Source code for atomica.excel

# -*- coding: utf-8 -*-
"""
Miscellaneous utility functions for Excel files

This module implements utility functions for Excel functionality that is common
to different kinds of spreadsheets used in Atomica (e.g. Databooks and Program Books).
For example, Excel formatting, and time-varying data entry tables, are implemented here.

"""

from xlsxwriter.utility import xl_rowcol_to_cell as xlrc
import sciris as sc
import io
import numpy as np
from .system import FrameworkSettings as FS
import pandas as pd
from .utils import format_duration, datetime_to_year
import xlsxwriter
from typing import Tuple
from openpyxl.utils import get_column_letter

__all__ = ["standard_formats", "apply_widths", "update_widths", "transfer_comments", "copy_sheet", "read_tables", "read_dataframes", "TimeDependentConnections", "TimeDependentValuesEntry", "cell_get_string", "cell_get_number", "validate_category"]

# Suppress known warning in Openpyxl
# Warnings are:
# - C:\ProgramData\Miniconda3\envs\atomica37\lib\site-packages\openpyxl\worksheet\_reader.py:300: UserWarning: Conditional Formatting extension is not supported and will be removed
#   warn(msg)
# - C:\ProgramData\Miniconda3\envs\atomica37\lib\site-packages\openpyxl\worksheet\_reader.py:300: UserWarning: Data Validation extension is not supported and will be removed
#   warn(msg)
# This means that conditional formatting and data valuation rules aren't being loaded, but since `data_only=True` these don't matter and can be safely ignored
import warnings

warnings.filterwarnings(action="ignore", category=UserWarning, module="openpyxl.worksheet", lineno=300)



[docs]
def standard_formats(workbook: xlsxwriter.Workbook) -> dict:
    """
    Add standard formatting to a workbook

    Returns a dictionary of format objects for use when writing the workbook

    :param workbook: an xlsxwriter workbook
    :return: A dictionary of ``xlsxwriter.workbook.Format`` instances
    """
    atomica_blue = "#98E0FA"
    optional_orange = "#FFA500"
    BG_COLOR = atomica_blue
    OPT_COLOR = optional_orange
    BORDER_COLOR = "white"

    formats = {}

    # Locked formats
    formats["bold"] = workbook.add_format({"bold": 1})
    formats["center"] = workbook.add_format({"align": "center"})
    formats["center_bold"] = workbook.add_format({"bold": 1, "align": "center"})
    formats["rc_title"] = {}
    formats["rc_title"]["right"] = {}
    formats["rc_title"]["right"]["T"] = workbook.add_format({"bold": 1, "align": "right", "text_wrap": True})
    formats["rc_title"]["right"]["F"] = workbook.add_format({"bold": 1, "align": "right", "text_wrap": False})
    formats["rc_title"]["left"] = {}
    formats["rc_title"]["left"]["T"] = workbook.add_format({"bold": 1, "align": "left", "text_wrap": True})
    formats["rc_title"]["left"]["F"] = workbook.add_format({"bold": 1, "align": "left", "text_wrap": False})

    # Unlocked formats
    formats["unlocked"] = workbook.add_format({"locked": 0, "bg_color": BG_COLOR, "border": 1, "border_color": BORDER_COLOR})
    formats["center_unlocked"] = workbook.add_format({"align": "center", "locked": 0, "bg_color": BG_COLOR, "border": 1, "border_color": BORDER_COLOR})
    formats["general"] = workbook.add_format({"locked": 0, "num_format": 0x00, "bg_color": BG_COLOR, "border": 1, "border_color": BORDER_COLOR})

    # Conditional formats
    formats["unlocked_boolean_true"] = workbook.add_format({"bg_color": OPT_COLOR})
    formats["unlocked_boolean_false"] = workbook.add_format({"bg_color": BG_COLOR})
    formats["not_required"] = workbook.add_format({"bg_color": "#EEEEEE", "border": 1, "border_color": "#CCCCCC"})
    formats["white_bg"] = workbook.add_format({"bg_color": "#FFFFFF", "border": 1, "border_color": "#CCCCCC"})
    formats["ignored"] = workbook.add_format({"pattern": 14})  # Hatched with diagonal lines - this represents a cell whose value will not be used in the model run (e.g., an assumption that also has time-specific points)
    formats["warning"] = workbook.add_format({"bg_color": "#FF0000"})
    formats["ignored_warning"] = workbook.add_format({"pattern": 14, "bg_color": "#FF0000"})  # hatched, with red background
    formats["ignored_not_required"] = workbook.add_format({"pattern": 14, "bg_color": "#EEEEEE", "border": 1, "border_color": "#CCCCCC"})  # hatched, with grey background

    return formats




[docs]
def apply_widths(worksheet: xlsxwriter.workbook.Worksheet, width_dict: dict) -> None:
    """
    Set column widths

    Given a dictionary of precomputed character counts (typically the maximum number of characters in
    any cell written to each column), set the width of the column in Excel accordingly. The Excel
    column width is slightly larger than the character count.

    :param worksheet: A Worksheet instance
    :param width_dict: A dictionary of character widths to use for each column (by index)
    """

    for idx, width in width_dict.items():
        worksheet.set_column(idx, idx, width * 1.1 + 1)




[docs]
def update_widths(width_dict: dict, column_index: int, contents: str) -> None:
    """
    Keep track of required width for a column

    ``width_dict`` is a dict that is keyed by column index e.g. 0,1,2
    and the value is the length of the longest contents seen for that column

    :param width_dict: Storage dictionary
    :param column_index: Index of the column value has been inserted in
    :param contents: Content, length of which is used to set width

    """

    if width_dict is None or contents is None or not sc.isstring(contents):
        return

    if len(contents) == 0:
        return

    if column_index not in width_dict:
        width_dict[column_index] = len(contents)
    else:
        width_dict[column_index] = max(width_dict[column_index], len(contents))




[docs]
def transfer_comments(target: sc.Spreadsheet, comment_source: sc.Spreadsheet) -> None:
    """
    Copy comments between spreadsheets

    This function copies comments from one spreadsheet to another. Under the hood,
    a new spreadsheet is created with values from the ``target`` Spreadsheet
    and cell-wise formatting from the ``comment_source`` Spreadsheet. If a cell exists in
    this spreadsheet and not in the source, it will be retained as-is. If more cells exist in
    the ``comment_source`` than in this spreadsheet, those cells will be dropped. If a sheet exists in
    the ``comment_source`` and not in the current workbook, it will be added

    :param target: The target spreadsheet to write comments into
    :param comment_source: The source spreadsheet containing comments

    """

    import openpyxl
    from openpyxl.comments import Comment

    assert isinstance(target, sc.Spreadsheet)
    assert isinstance(comment_source, sc.Spreadsheet)

    this_workbook = openpyxl.load_workbook(target.tofile(), data_only=False)  # This is the value source workbook
    old_workbook = openpyxl.load_workbook(comment_source.tofile(), data_only=False)  # A openpyxl workbook for the old content

    for sheet in this_workbook.worksheets:

        # If this sheet isn't in the old workbook, do nothing
        if sheet.title not in old_workbook.sheetnames:
            continue

        # Transfer comments
        for row in old_workbook[sheet.title].rows:
            for cell in row:
                if cell.comment:
                    sheet[cell.coordinate].comment = Comment(cell.comment.text, "")

    # Save the modified spreadsheet to a new buffer
    f = io.BytesIO()
    this_workbook.save(f)
    f.flush()
    f.seek(0)
    target.load(f)




[docs]
def copy_sheet(source: str, sheet_name: str, workbook: xlsxwriter.Workbook) -> None:
    """
    Copy a sheet into a Workbook

    This function allows values to be copied from a file into a Workbook. The
    main use case is to support custom sheets in databooks that are not otherwise parsed
    but which might need to be retained. In particular, the ``ProjectData`` class does
    not parse ignored sheets at all, because no guarantees can be made about the quantity and
    type of the content, and whether there are formulas etc. that would be lost. In some cases
    though, it may be necessary to edit the databook and preserve specific sheets. In general,
    this can be done by using the ``to_workbook()`` method and then manually performing whatever
    operations are necessary to preserve the content on the extra sheets. However, when the
    extra sheet content is extremely simple e.g. just a table of values, then this helper
    function can be used to facilitate copying the content.

    Warning - note that Excel functions, formatting, and comments will NOT be preserved.

    :param source: File name of the spreadsheet to read the source sheet from or an ``sc.Spreadsheet`` instance
    :param sheet_name: Name of the sheet to write to
    :param workbook: A Workbook instance to add the sheet to
    :return: None - the sheet will be added to the Workbook in-place
    """

    import openpyxl

    if sc.isstring(source):
        source = sc.Spreadsheet(source)

    src_workbook = openpyxl.load_workbook(source.tofile(), read_only=True, data_only=True)  # Load in read-only mode for performance, since we don't parse comments etc.
    src_worksheet = src_workbook[sheet_name]
    dst_worksheet = workbook.add_worksheet(sheet_name)

    for i, row in enumerate(src_worksheet.rows):
        for j, cell in enumerate(row):
            dst_worksheet.write(i, j, cell.value)

    src_workbook.close()




[docs]
def read_tables(worksheet) -> Tuple[list, list]:
    """
    Read tables from sheet

    :param worksheet: An openpyxl worksheet
    :return: A tuple containing - A list of tables (which is a list of rows, terminated by an empty row in the original spreadsheet),
             and a list of start row indices for each table read in
    """
    # This function takes in a openpyxl worksheet, and returns tables
    # A table consists of a block of rows with any #ignore rows skipped
    # This function will start at the top of the worksheet, read rows into a buffer
    # until it gets to the first entirely empty row
    # And then returns the contents of that buffer as a table. So a table is a list of openpyxl rows
    # This function continues until it has exhausted all of the rows in the sheet

    buffer = []
    tables = []
    start_rows = []
    start = None

    for i, row in enumerate(worksheet.rows):

        # Determine whether to skip the row, add it to the buffer, or flush buffer into a table
        flush = False
        for j, cell in enumerate(row):
            if cell.value:
                if cell.data_type == "s" and cell.value.startswith("#ignore"):
                    if j == 0:
                        break  # If #ignore is encountered in the first column, skip the row and continue parsing the table
                    else:
                        flush = True
                        break  # If #ignore is encoutered after preceding blank cells
                else:
                    # Read the row into the buffer and continue parsing the table
                    if not buffer:
                        start = i + 1  # Excel rows are indexed starting at 1
                    buffer.append(row)
                    break  # If the cell has a value in it, continue parsing the table
        else:
            if buffer:
                flush = True

        if flush:
            tables.append(buffer)  # Only append the buffer if it is not empty
            start_rows.append(start)
            buffer = []

    # After the last row, if the buffer has some un-flushed contents, then include it in the last table
    if buffer:
        tables.append(buffer)
        start_rows.append(start)

    return tables, start_rows




[docs]
def read_dataframes(worksheet, merge=False) -> list:
    """
    Read dataframes from sheet

    This function operates similarly to ``read_tables`` except it returns Dataframes instead of
    cells. This enables the dataframes to be constructed more quickly, at the expense of being
    able to track the cell references and row numbers. These are shown for databooks (via ``read_tables``)
    but not for frameworks (which go via ``read_dataframes``)


    :param worksheet: An openpyxl worksheet
    :param merge: If False (default) then blank rows will be used to split the dataframes. If True, only one
                  DataFrame will be returned
    :return: A list of DataFrames

    """

    if worksheet.max_row is None or worksheet.max_column is None:
        worksheet.calculate_dimension(force=True)

    content = np.empty((worksheet.max_row, worksheet.max_column), dtype="object")
    ignore = np.zeros((worksheet.max_row), dtype=bool)
    empty = np.zeros((worksheet.max_row), dtype=bool)  # True for index where a new table begins

    for i, row in enumerate(worksheet.rows):

        any_values = False  # Set True if this row contains any values
        for j, cell in enumerate(row):
            v = cell.value
            if cell.data_type == "s":
                if not any_values and v.startswith("#ignore"):
                    # If we encounter a #ignore and it's the first content in the row
                    if j == 0:
                        # If it's the first cell, ignore the row (i.e., do NOT treat it as a blank row)
                        ignore[i] = True  # Ignore the row
                    break
                elif v.startswith("#ignore"):
                    # Skip the rest of the row
                    content[i, j:] = None
                    break
                else:
                    v = v.strip()
                    any_values = any_values or bool(v)  # If it's a string type, call strip() before checking truthiness
            elif v is not None:
                any_values = True
            content[i, j] = v

        if not any_values:
            empty[i] = True

    tables = []
    if merge:
        ignore[empty] = True
        if all(ignore):
            return []
        tables.append(content[~ignore, :])
    else:
        # A change from False to True means that we need to start a new table
        # A True followed by a True doesn't start a new table but instead gets ignored
        content = content[~ignore, :]
        empty = empty[~ignore]

        # If there is no content at all, return immediately
        if all(empty):
            return []

        idx = []
        start = None

        for i in range(len(empty)):

            # Check for the start of a table
            if not empty[i] and start is None:
                start = i

            # Check for the end of a table
            if empty[i] and start is not None:
                # If we were reading a table and have reached an empty row
                idx.append((start, i))
                start = None
            elif i + 1 == len(empty) and start is not None:
                # If we were reading a table and have reached the end of the data
                idx.append((start, i + 1))
                start = None

        tables = []
        for start, stop in idx:
            tables.append(content[start:stop].copy())  # Use .copy() so that we don't retain any references to the original full array outside this function

    dfs = []
    for table in tables:
        df = pd.DataFrame(table)
        df.dropna(axis=1, how="all", inplace=True)
        df.columns = df.iloc[0]
        df = df[1:]
        dfs.append(df)
    return dfs




[docs]
class TimeDependentConnections:
    """
    Structure for reading/writing interactions

    A :class:`TimeDependentConnections` object is suitable when there are time dependent interactions between two quantities
    This class is used for both transfers and interactions. The content that it writes consists of

    - A connection matrix table that has Y/N selection of which interactions are present between two things
    - A set of pairwise connections specifying to, from, units, assumption, and time

    Interactions can have a diagonal, whereas transfers cannot (e.g. a population can infect itself but cannot transfer to itself).

    In Excel, a :class:`TimeDependentConnections` maps to three tables

    1. A table to enter the code name and full name
    2. An interactions matrix with Y/N indicating whether an interaction between two populations exists
    3. A set of rows for entering time varying data for each pair of populations

    :param code_name: the code name of this quantity e.g. 'aging'
    :param full_name: the full name of this quantity e.g. 'Aging'
    :param tvec: time values for the time-dependent rows
    :param pops: list of strings to use as the rows and columns - these are typically lists of population code names
    :param type: 'transfer' or 'interaction'. A transfer cannot have diagonal entries, and can have Number or Probability formats. An Interaction can have
                 diagonal entries and only has N.A. formats
    :param ts: Optionally specify a dict containing all of the non-empty TimeSeries objects used. The format is ``{(from_pop, to_pop):TimeSeries}``.
               An interaction can only be Y/N for clarity, if it is Y then a row is displayed for the TimeSeries. Actually, the Y/N can be
               decided in the first instance based on the provided TimeSeries i.e. if a TimeSeries is provided for an interaction, then the
               interaction must have been marked with Y
    :param pop_type: Specify pop_type, which is used by :meth:`ProjectData.add_pop` to determine which TDCs to add new populations to

    """

    def __init__(self, code_name: str, full_name: str, tvec: np.array, from_pops: list, to_pops: list, interpop_type: str, ts: dict = None, from_pop_type: str = None, to_pop_type: str = None):
        self.code_name = code_name
        self.full_name = full_name
        self.type = interpop_type
        self.from_pop_type = from_pop_type
        self.from_pops = from_pops
        self.to_pop_type = to_pop_type
        self.to_pops = to_pops
        self.tvec = tvec
        self.ts = ts if ts is not None else sc.odict()

        self.attributes = {}  #: Attributes associated with the table
        self.ts_attributes = {}  #: Attributes associated with each TimeSeries row. This is a dict-of-dicts with `ts_attributes[attribute]={(from_pop, to_pop):value}` e.g., `ts_attributes["Provenance"] = {('0-4', '5-14'): 'value'}`
        self.ts_attributes["Provenance"] = {}  # Include provenance attribute by default

        self.assumption_heading = "Constant"  #: Heading to use for assumption column
        self.write_units = None  #: Write a column for units (if None, units will be written if any of the TimeSeries have units)
        self.write_uncertainty = None  #: Write a column for units (if None, units will be written if any of the TimeSeries have uncertainty)
        self.write_assumption = None  #: Write a column for units (if None, units will be written if any of the TimeSeries have an assumption)

        if self.type == "transfer":
            self.enable_diagonal = False
            self.allowed_units = []
            self.allowed_units.append("%s (%s)" % (FS.QUANTITY_TYPE_NUMBER.title(), format_duration(1, pluralize=True)))
            self.allowed_units.append("%s (per %s)" % (FS.QUANTITY_TYPE_RATE.title(), format_duration(1, pluralize=False)))
            self.allowed_units.append("%s (%s)" % (FS.QUANTITY_TYPE_DURATION.title(), format_duration(1, pluralize=True)))
        elif self.type == "interaction":
            self.enable_diagonal = True
            self.allowed_units = [FS.DEFAULT_SYMBOL_INAPPLICABLE]
        else:
            raise Exception('Unknown TimeDependentConnections type - must be "transfer" or "interaction"')

    def __repr__(self):
        return '<TDC %s "%s">' % (self.type.title(), self.code_name)


[docs]
    @classmethod
    def from_tables(cls, tables: list, interaction_type):
        """
        Instantiate based on list of tables

        This method instantiates and initializes a new :class:`TimeDependentConnections` object from
        tables that have been read in using :func:`read_tables`. Note that the parent object
        such as :class:`ProjectData` is responsible for extracting the tables and passing them
        to this function. For instance, the transfers sheet might contain more than one set of
        tables, so it is the calling function's responsibility to split those tables up into
        the groups of three expected by this method.

        :param tables: A list of tables. A table here is a list of rows, and a row is a list of cells.
        :param interaction_type: A string identifying the interaction type - either 'transfer' or 'interaction'
        :return: A new :class:`TimeDependentConnections` instance

        """

        from .utils import TimeSeries  # Import here to avoid circular reference

        assert interaction_type in {"transfer", "interaction"}, "Unknown interaction type"

        # Read the TDC definition table (including attributes)
        code_name = None
        full_name = None
        from_pop_type = None
        to_pop_type = None

        # Start by reading the TDC header row specifying the name and pop types
        # This table also contains only a single row. Any subsequent rows will automatically be ignored
        attributes = {}
        for header_cell, value_cell in zip(tables[0][0], tables[0][1]):

            header = cell_get_string(header_cell, allow_empty=True)
            if header is None:
                continue
            elif header.startswith("#ignore"):
                break

            lowered_header = header.lower()
            if lowered_header == "abbreviation":
                code_name = cell_get_string(value_cell)
            elif lowered_header == "full name":
                full_name = cell_get_string(value_cell)
            elif lowered_header == "from population type":
                from_pop_type = cell_get_string(value_cell, True)
            elif lowered_header == "to population type":
                to_pop_type = cell_get_string(value_cell, True)
            else:
                attributes[header] = value_cell.value

        if interaction_type == "transfer":
            assert from_pop_type == to_pop_type, "Transfers can only occur between populations of the same type"

        if code_name is None:
            raise Exception("Code name/abbreviation missing")

        if full_name is None:
            raise Exception("Full name missing")

        # Read the pops from the Y/N table. The Y/N content of the table depends on the timeseries objects that
        # are present. That is, if the Y/N matrix contains a Y then a TimeSeries must be read in, and vice versa.
        # Therefore, we don't actually parse the matrix, and instead just read in all the TimeSeries instances
        # that are defined and infer the matrix that way. However, we do still need to parse the to_pops and from_pops
        # These both get parsed because for interactions across pop types they could be different. Blank cells in either
        # the row or the column signal the end of parsing those population lists

        # Read the to_pops from the first row of the table, until a blank cell is encountered
        to_pops = []
        for cell in tables[1][0][1:]:
            if cell.value is None or (cell.data_type == "s" and cell.value.startswith("#ignore")):
                break
            else:
                to_pops.append(cell.value)

        # Read the from_pops from the first column, until a blank cell is encountered. Note that a #ignore
        # can never be present in the first column, because this would cause the row to be skipped prior to this point
        from_pops = []
        for row in tables[1][1:]:
            if row[0].value is None:
                break
            else:
                from_pops.append(row[0].value)

        # Instantiate the TDC object based on the metadata from the first two tables
        tdc = cls(code_name, full_name, None, from_pops=from_pops, to_pops=to_pops, interpop_type=interaction_type, from_pop_type=from_pop_type, to_pop_type=to_pop_type)
        tdc.attributes = attributes

        known_headings = {"from population", "to population", "units", "uncertainty", "constant", "assumption"}
        headings, times, tdc.tvec = _parse_ts_header(tables[2][0], known_headings, skip_first=False)

        # Validate and process headings
        if not times and "constant" not in headings:
            raise Exception("Could not find an assumption or time-specific value - all tables must contain at least one of these values")
        tdc.write_units = True if "units" in headings else None
        tdc.write_uncertainty = True if "uncertainty" in headings else None
        tdc.write_assumption = True if "constant" in headings else None
        if "assumption" in headings:
            tdc.write_assumption = True
            tdc.assumption_heading = "Assumption"
        for heading in headings:
            if heading not in known_headings:
                tdc.ts_attributes[heading] = {}

        tdc.ts = sc.odict()
        for row in tables[2][1:]:
            if row[0].value != "...":
                assert row[0].value in from_pops, 'Population "%s" not found - should be contained in %s' % (row[0].value, from_pops)
                assert row[2].value in to_pops, 'Population "%s" not found - should be contained in %s' % (row[2].value, to_pops)
                vals = [x.value for x in row]
                from_pop = vals[0]
                to_pop = vals[2]

                if "units" in headings:
                    units = cell_get_string(row[headings["units"]], allow_empty=True)
                    if units is not None and units.lower().strip() in FS.STANDARD_UNITS:
                        units = units.lower().strip()  # Only lower and strip units if they are standard units
                else:
                    units = None
                ts = TimeSeries(units=units)

                if "uncertainty" in headings:
                    ts.sigma = cell_get_number(row[headings["uncertainty"]])
                else:
                    ts.sigma = None

                if "constant" in headings:
                    ts.assumption = cell_get_number(row[headings["constant"]])
                elif "assumption" in headings:
                    ts.assumption = cell_get_number(row[headings["assumption"]])
                else:
                    ts.assumption = None

                for attribute in tdc.ts_attributes:
                    if attribute in headings:
                        tdc.ts_attributes[attribute][(from_pop, to_pop)] = row[headings[attribute]].value

                for t, idx in times.items():
                    ts.insert(t, cell_get_number(row[idx]))  # If cell_get_number returns None, this gets handled accordingly by ts.insert()

                tdc.ts[(from_pop, to_pop)] = ts

        return tdc



[docs]
    def write(self, worksheet, start_row, formats, references: dict = None, widths: dict = None) -> int:
        """
        Write to cells in a worksheet

        :param worksheet: An xlsxwriter worksheet instance
        :param start_row: The first row in which to write values
        :param formats: Format dict for the opened workbook - typically the return value of :func:`standard_formats` when the workbook was opened
        :param references: References dict containing cell references for strings in the current workbook
        :param widths: ``dict`` storing column widths
        :return: The row index for the next available row for writing in the spreadsheet

        """

        assert self.assumption_heading in {"Constant", "Assumption"}, "Unsupported assumption heading"
        write_units = self.write_units if self.write_units is not None else any((ts.units is not None for ts in self.ts.values()))
        write_uncertainty = self.write_uncertainty if self.write_uncertainty is not None else any((ts.sigma is not None for ts in self.ts.values()))
        write_assumption = self.write_assumption if self.write_assumption is not None else any((ts.assumption is not None for ts in self.ts.values()))

        if not references:
            references = {x: x for x in self.from_pops + self.to_pops}  # Default null mapping for populations

        # First, write the name entry table
        current_row = start_row

        column = 0
        worksheet.write(current_row, column, "Abbreviation", formats["center_bold"])
        update_widths(widths, column, "Abbreviation")
        worksheet.write(current_row + 1, column, self.code_name)
        update_widths(widths, column, self.code_name)

        column += 1
        worksheet.write(current_row, column, "Full Name", formats["center_bold"])
        update_widths(widths, column, "Full Name")
        worksheet.write(current_row + 1, column, self.full_name)
        update_widths(widths, column, self.full_name)

        column += 1
        worksheet.write(current_row, column, "From population type", formats["center_bold"])
        update_widths(widths, column, "From population type")
        worksheet.write(current_row + 1, column, self.from_pop_type)
        update_widths(widths, column, self.from_pop_type)

        column += 1
        worksheet.write(current_row, column, "To population type", formats["center_bold"])
        update_widths(widths, column, "To population type")
        worksheet.write(current_row + 1, column, self.to_pop_type)
        update_widths(widths, column, self.to_pop_type)

        for attribute, value in self.attributes.items():
            column += 1
            worksheet.write(current_row, column, attribute, formats["center_bold"])
            update_widths(widths, column, attribute)
            worksheet.write(current_row + 1, column, value)
            update_widths(widths, column, value)

        # Then, write the Y/N matrix
        current_row += 3  # Leave a blank row below the matrix
        # Note - table_references are local to this TimeDependentConnections instance
        # For example, there could be two transfers, and each of them could potentially transfer between 0-4 and 5-14
        # so the worksheet might contain two references from 0-4 to 5-14 but they would be for different transfers and thus
        # the time-dependent rows would depend on different boolean table cells
        current_row, table_references, values_written = self._write_pop_matrix(worksheet, current_row, formats, references, boolean_choice=True, widths=widths)

        # Finally, write the time dependent part
        headings = []
        headings.append("From population")
        headings.append("")  # --->
        headings.append("To population")
        offset = len(headings)

        attribute_index = {}
        for attribute in self.ts_attributes:
            attribute_index[attribute] = offset
            headings.append(attribute)
            offset += 1

        if write_units:
            headings.append("Units")
            units_index = offset  # Column to write the units in
            offset += 1

        if write_uncertainty:
            headings.append("Uncertainty")
            uncertainty_index = offset  # Column to write the units in
            offset += 1

        if write_assumption:
            headings.append(self.assumption_heading)
            headings.append("")
            constant_index = offset
            offset += 1
            if len(self.tvec):
                offset += 1  # Additional offset for the 'OR' column

        headings += [float(x) for x in self.tvec]
        for i, entry in enumerate(headings):
            worksheet.write(current_row, i, entry, formats["center_bold"])
            update_widths(widths, i, entry)

        # Now, we will write a wrapper that gates the content
        # If the gating cell is 'Y', then the content will be displayed, otherwise not
        def gate_content(content, gating_cell):
            if content.startswith("="):  # If this is itself a reference
                return '=IF(%s="Y",%s,"...")' % (gating_cell, content[1:])
            else:
                return '=IF(%s="Y","%s","...")' % (gating_cell, content)

        for from_idx in range(0, len(self.from_pops)):
            for to_idx in range(0, len(self.to_pops)):
                current_row += 1
                from_pop = self.from_pops[from_idx]
                to_pop = self.to_pops[to_idx]
                entry_tuple = (from_pop, to_pop)
                entry_cell = table_references[entry_tuple]

                # Write hyperlink
                if values_written[entry_cell] != FS.DEFAULT_SYMBOL_INAPPLICABLE:
                    worksheet.write_url(entry_cell, "internal:%s!%s" % (worksheet.name, xlrc(current_row, 1)), cell_format=formats["center_unlocked"], string=values_written[entry_cell])
                    worksheet.write_url(xlrc(current_row, 1), "internal:%s!%s" % (worksheet.name, entry_cell), cell_format=formats["center_unlocked"])

                if entry_tuple in self.ts:
                    ts = self.ts[entry_tuple]
                    format = formats["not_required"]
                else:
                    ts = None
                    format = formats["unlocked"]

                if ts:
                    worksheet.write_formula(current_row, 0, gate_content(references[from_pop], entry_cell), formats["center_bold"], value=from_pop)
                    update_widths(widths, 0, from_pop)
                    worksheet.write_formula(current_row, 1, gate_content("--->", entry_cell), formats["center"], value="--->")
                    worksheet.write_formula(current_row, 2, gate_content(references[to_pop], entry_cell), formats["center_bold"], value=to_pop)
                    update_widths(widths, 2, to_pop)

                    # Write the attributes
                    for attribute in self.ts_attributes:
                        if isinstance(self.ts_attributes[attribute], dict):
                            if entry_tuple in self.ts_attributes[attribute]:
                                val = self.ts_attributes[attribute][entry_tuple]
                            else:
                                val = None
                        else:
                            val = self.ts_attributes[attribute]

                        if val is not None:
                            worksheet.write(current_row, attribute_index[attribute], val)
                            update_widths(widths, attribute_index[attribute], val)

                    if self.write_units:
                        worksheet.write(current_row, units_index, ts.units, format)
                        update_widths(widths, units_index, ts.units)
                        if self.allowed_units:
                            worksheet.data_validation(xlrc(current_row, units_index), {"validate": "list", "source": [x for x in self.allowed_units]})

                    if self.write_uncertainty:
                        worksheet.write(current_row, uncertainty_index, ts.sigma, formats["not_required"])

                    if self.write_assumption:
                        worksheet.write(current_row, constant_index, ts.assumption, format)
                        if len(self.tvec):
                            worksheet.write_formula(current_row, constant_index + 1, gate_content("OR", entry_cell), formats["center"], value="OR")
                            update_widths(widths, constant_index + 1, "OR")

                else:
                    worksheet.write_formula(current_row, 0, gate_content(references[from_pop], entry_cell), formats["center_bold"], value="...")
                    worksheet.write_formula(current_row, 1, gate_content("--->", entry_cell), formats["center"], value="...")
                    worksheet.write_formula(current_row, 2, gate_content(references[to_pop], entry_cell), formats["center_bold"], value="...")

                    if self.write_units:
                        worksheet.write_blank(current_row, units_index, "", format)
                        if self.allowed_units:
                            worksheet.data_validation(xlrc(current_row, units_index), {"validate": "list", "source": [x for x in self.allowed_units]})

                    if self.write_uncertainty:
                        worksheet.write_blank(current_row, uncertainty_index, "", formats["not_required"])

                    if self.write_assumption:
                        worksheet.write_blank(current_row, constant_index, "", format)
                        if len(self.tvec):
                            worksheet.write_formula(current_row, constant_index + 1, gate_content("OR", entry_cell), formats["center"], value="...")
                            update_widths(widths, constant_index + 1, "...")

                content = [None] * len(self.tvec)

                if ts:
                    for t, v in zip(ts.t, ts.vals):
                        idx = np.where(self.tvec == t)[0][0]
                        content[idx] = v

                for idx, v in enumerate(content):
                    if v is None:
                        worksheet.write_blank(current_row, offset + idx, v, format)
                    else:
                        worksheet.write(current_row, offset + idx, v, format)
                    widths[offset + idx] = max(widths[offset + idx], 7) if offset + idx in widths else 7

                if not content:
                    idx = 0

                if self.write_assumption and len(self.tvec):
                    # Conditional formatting for the assumption, depending on whether time-values were entered
                    fcn_empty_times = 'COUNTIF(%s:%s,"<>" & "")>0' % (xlrc(current_row, offset), xlrc(current_row, offset + idx))
                    worksheet.conditional_format(xlrc(current_row, constant_index), {"type": "formula", "criteria": "=" + fcn_empty_times, "format": formats["ignored"]})
                    worksheet.conditional_format(xlrc(current_row, constant_index), {"type": "formula", "criteria": "=AND(%s,NOT(ISBLANK(%s)))" % (fcn_empty_times, xlrc(current_row, constant_index)), "format": formats["ignored_warning"]})

                # Conditional formatting for the row - it has a white background if the gating cell is 'N'
                worksheet.conditional_format("%s:%s" % (xlrc(current_row, 3), xlrc(current_row, offset + idx)), {"type": "formula", "criteria": '=%s<>"Y"' % (entry_cell), "format": formats["white_bg"]})

        current_row += 2

        return current_row


    def _write_pop_matrix(self, worksheet, start_row, formats, references: dict = None, boolean_choice=False, widths: dict = None):
        """
        Write a square matrix to Excel

        This function writes the Y/N matrix

        - Transfer matrix
        - Interactions matrix

        If ``self.enable_diagonal`` is ``False`` then the diagonal will be forced to be ``'N.A.'``. If an entry
        is specified for an entry on the diagonal and ``enable_diagonal=False``, an error will be thrown

        :param worksheet: An xlsxwriter worksheet instance
        :param start_row: The first row in which to write values
        :param formats: Format dict for the opened workbook - typically the return value of :func:`standard_formats` when the workbook was opened
        :param references: Optionally supply dict with references, used to link population names in Excel
        :param boolean_choice: If True, values will be coerced to Y/N and an Excel validation will be added
        :param widths: ``dict`` storing column widths
        :return: Tuple with ``(next_row, table_references, values_written)``. The references are used for hyperlinking to the Excel matrix

        """

        entries = self.ts

        if not references:
            references = {x: x for x in self.from_pops + self.to_pops}  # This is a null-mapping that takes say 'adults'->'adults' thus simplifying the workflow. Otherwise, it's assumed a reference exists for every node

        table_references = {}
        values_written = {}

        # Write the headers
        for i, node in enumerate(self.to_pops):
            worksheet.write_formula(start_row, i + 1, references[node], formats["center_bold"], value=node)
            update_widths(widths, i + 1, node)
        for i, node in enumerate(self.from_pops):
            worksheet.write_formula(start_row + i + 1, 0, references[node], formats["center_bold"], value=node)
            update_widths(widths, 0, node)

        # Prepare the content - first replace the dict with one keyed by index. This is because we cannot apply formatting
        # after writing content, so have to do the writing in a single pass over the entire matrix
        if boolean_choice:
            content = np.full((len(self.from_pops), len(self.to_pops)), "N", dtype=object)  # This will also coerce the value to string in preparation for writing
        else:
            content = np.full((len(self.from_pops), len(self.to_pops)), "", dtype=object)  # This will also coerce the value to string in preparation for writing

        for interaction, value in entries.items():
            from_pop, to_pop = interaction
            if not self.enable_diagonal and from_pop == to_pop:
                raise Exception("Trying to write a diagonal entry to a table that is not allowed to contain diagonal terms")  # This is because data loss will occur if the user adds entries on the diagonal, then writes the table, and then reads it back in
            from_idx = self.from_pops.index(from_pop)
            to_idx = self.to_pops.index(to_pop)
            if boolean_choice:
                value = "Y" if value else "N"
            content[from_idx, to_idx] = value

        # Write the content
        for from_idx in range(0, len(self.from_pops)):
            for to_idx in range(0, len(self.to_pops)):
                row = start_row + 1 + from_idx
                col = to_idx + 1
                if not self.enable_diagonal and self.to_pops[to_idx] == self.from_pops[from_idx]:  # Disable the diagonal if it's linking the same two quantities and that's desired
                    val = FS.DEFAULT_SYMBOL_INAPPLICABLE
                    worksheet.write(row, col, val, formats["center"])
                    worksheet.data_validation(xlrc(row, col), {"validate": "list", "source": ["N.A."]})
                else:
                    val = content[from_idx, to_idx]
                    worksheet.write(row, col, content[from_idx, to_idx], formats["center_unlocked"])
                    if boolean_choice:
                        worksheet.data_validation(xlrc(row, col), {"validate": "list", "source": ["Y", "N"]})
                        worksheet.conditional_format(xlrc(row, col), {"type": "cell", "criteria": "equal to", "value": '"Y"', "format": formats["unlocked_boolean_true"]})
                        worksheet.conditional_format(xlrc(row, col), {"type": "cell", "criteria": "equal to", "value": '"N"', "format": formats["unlocked_boolean_false"]})
                table_references[(self.from_pops[from_idx], self.to_pops[to_idx])] = xlrc(row, col, True, True)  # Store reference to this interaction
                values_written[table_references[(self.from_pops[from_idx], self.to_pops[to_idx])]] = val

        next_row = start_row + 1 + len(self.from_pops) + 1
        return next_row, table_references, values_written




[docs]
class TimeDependentValuesEntry:
    """Table for time-dependent data entry

    This class is Databooks and Program books to enter potentially time-varying data.
    Conceptually, it maps a set of TimeSeries object to a single name and table in the
    spreadsheet. For example, a Characteristic might contain a TimeSeries for each population,
    and the resulting TimeDependentValuesEntry (TDVE) table would have a `name` matching the
    population, and TimeSeries for each population.

    The TDVE class optionally allows the specification of units, assumptions, and uncertainty,
    which each map to properties on the underlying TimeSeries objects. It also contains a
    time vector corresponding to the time values that appear or will appear in the spreadsheet.

    Note that the units are stored within the TimeSeries objects, which means that they can
    are able to differ across rows.

    :param name: The name/title for this table
    :param tvec: Specify the time values for this table. All TimeSeries in the ts dict should have corresponding time values
    :param ts: Optionally specify an odict() of TimeSeries objects populating the rows. Could be populated after
    :param allowed_units: Optionally specify a list of allowed units that will appear as a dropdown
    :param comment: Optionally specify descriptive text that will be added as a comment to the name cell

    """

    def __init__(self, name, tvec: np.array = None, ts=None, allowed_units: list = None, comment: str = None, pop_type: str = None, default_all: bool = False):

        if ts is None:
            ts = sc.odict()

        self.name = name  #: Name for th quantity printed in Excel
        self.comment = comment  #: A comment that will be added in Excel
        self.tvec = [] if tvec is None else tvec  #: time axis (e.g. np.arange(2000,2019)) - all TimeSeries time values must exactly match one of the values here
        self.ts = ts  # : dict of :class:`TimeSeries` objects
        self.allowed_units = [x.title() if x in FS.STANDARD_UNITS else x for x in allowed_units] if allowed_units is not None else None  # Otherwise, can be an odict with keys corresponding to ts - leave as None for no restriction
        self.pop_type = pop_type

        self.ts_attributes = {}  #: Dictionary containing extra attributes to write along with each TimeSeries object.
        self.ts_attributes["Provenance"] = {}  # Include provenance attribute by default

        #  Keys are attribute name, values can be either a scalar or a dict keyed by the same keys as self.ts. Compared to units, uncertainty etc.
        #  attributes are store in the TDVE rather than in the TimeSeries
        self.assumption_heading = "Constant"  #: Heading to use for assumption column

        self.write_units = None  #: Write a column for units (if None, units will be written if any of the TimeSeries have units)
        self.write_uncertainty = None  #: Write a column for uncertainty (if None, uncertainty will be written if any of the TimeSeries have uncertainty)
        self.write_assumption = None  #: Write a column for assumption/constant (if None, assumption will be written if any of the TimeSeries have an assumption)

        self.default_all = default_all  #: Record whether the framework specifies that this TDVE should default to having an 'All' row instead of population-specific rows (the user can manually modify further)

    def __repr__(self):
        output = sc.prepr(self)
        return output

    @property
    def has_data(self) -> bool:
        """
        Check whether all time series have data entered

        :return: True if all of the TimeSeries objects stored in the TDVE have data
        """

        return all([x.has_data for x in self.ts.values()])


[docs]
    @classmethod
    def from_rows(cls, rows: list):
        """
        Create new instance from Excel rows

        Given a set of openpyxl rows, instantiate a :class:`TimeDependentValuesEntry` object
        That is, the parent object e.g. :class:`ProjectData` is responsible for finding where the TDVE table is,
        and reading all of the rows associated with it (skipping ``#ignored`` rows) and then passing those rows,
        unparsed, to this function

        Headings for 'units', 'uncertainty', and 'assumption'/'constant' are optional and will be read in
        if they are present in the spreadsheet.

        :param rows: A list of rows
        :return: A new :class:`TimeDependentValuesEntry` instance

        """

        from .utils import TimeSeries  # Import here to avoid circular reference

        # Retrieve the name
        name = rows[0][0].value
        if name is None:
            raise Exception('The name of the table is missing. This can also happen if extra rows have been added without a "#ignore" entry in the first column')
        elif not sc.isstring(name):
            raise Exception("In cell %s of the spreadsheet, the name of the quantity assigned to this table needs to be a string" % rows[0][0].coordinate)
        name = name.strip()  # The name needs to be written back in a case sensitive form

        tdve = cls(name)
        known_headings = {"units", "uncertainty", "constant", "assumption"}
        headings, times, tdve.tvec = _parse_ts_header(rows[0], known_headings, skip_first=True)

        # Validate and process headings
        if not times and "constant" not in headings:
            raise Exception("Could not find an assumption or time-specific value - all tables must contain at least one of these values")
        tdve.write_units = True if "units" in headings else None
        tdve.write_uncertainty = True if "uncertainty" in headings else None
        tdve.write_assumption = True if "constant" in headings else None
        if "assumption" in headings:
            tdve.write_assumption = True
            tdve.assumption_heading = "Assumption"
        for heading in headings:
            if heading not in known_headings:
                # If it's not a known heading and it's a string, then it must be an attribute
                # Note that the way `headings` is populated by skipping i=0 ensures that the table name
                # is not interpreted as a heading
                tdve.ts_attributes[heading] = {}
        ts_entries = sc.odict()

        for row in rows[1:]:
            if row[0].value is None:
                continue

            if not row[0].data_type in {"s", "str"}:
                raise Exception("In cell %s of the spreadsheet, the name of the entry was expected to be a string, but it was not. The left-most column is expected to be a name. If you are certain the value is correct, add an single quote character at the start of the cell to ensure it remains as text" % row[0].coordinate)
            series_name = row[0].value.strip()

            if "units" in headings:
                units = cell_get_string(row[headings["units"]], allow_empty=True)
                if units is not None and units.lower().strip() in FS.STANDARD_UNITS:
                    units = units.lower().strip()  # Only lower and strip units if they are standard units
            else:
                units = None
            ts = TimeSeries(units=units)

            if "uncertainty" in headings:
                ts.sigma = cell_get_number(row[headings["uncertainty"]])
            else:
                ts.sigma = None

            if "constant" in headings:
                ts.assumption = cell_get_number(row[headings["constant"]])
            elif "assumption" in headings:
                ts.assumption = cell_get_number(row[headings["assumption"]])
            else:
                ts.assumption = None

            for attribute in tdve.ts_attributes:
                if attribute in headings:
                    # If it's a default attribute e.g. provenance, and it is missing from the databook, then don't populate it
                    tdve.ts_attributes[attribute][series_name] = row[headings[attribute]].value

            for t, idx in times.items():
                ts.insert(t, cell_get_number(row[idx]))  # If cell_get_number returns None, this gets handled accordingly by ts.insert()
            ts_entries[series_name] = ts

        tdve.ts = ts_entries
        return tdve



[docs]
    def write(self, worksheet, start_row, formats, references: dict = None, widths: dict = None) -> int:
        """
        Write to cells in a worksheet

        Note that the year columns are drawn from the ``tvec`` attribute. To suppress the year columns (e.g. for the user to enter only an assumption)
        then set ``tvec`` to an empty array/list.

        :param worksheet: An xlsxwriter worksheet instance
        :param start_row: The first row in which to write values
        :param formats: Format dict for the opened workbook - typically the return value of :func:`standard_formats` when the workbook was opened
        :param references: References dict containing cell references for strings in the current workbook
        :param widths: ``dict`` storing column widths
        :return: The row index for the next available row for writing in the spreadsheet

        """

        assert self.assumption_heading in {"Constant", "Assumption"}, "Unsupported assumption heading"

        write_units = self.write_units if self.write_units is not None else any((ts.units is not None for ts in self.ts.values()))
        write_uncertainty = self.write_uncertainty if self.write_uncertainty is not None else any((ts.sigma is not None for ts in self.ts.values()))
        write_assumption = self.write_assumption if self.write_assumption is not None else any((ts.assumption is not None for ts in self.ts.values()))

        if not references:
            references = dict()

        current_row = start_row

        # First, assemble and write the headings
        headings = []
        headings.append(self.name)
        offset = 1  # This is the column where the time values start (after the 'or')

        # Next allocate attributes
        attribute_index = {}
        for attribute in self.ts_attributes:
            attribute_index[attribute] = offset
            headings.append(attribute)
            offset += 1

        if write_units:
            headings.append("Units")
            units_index = offset  # Column to write the units in
            offset += 1

        if write_uncertainty:
            headings.append("Uncertainty")
            uncertainty_index = offset  # Column to write the units in
            offset += 1

        if write_assumption:
            headings.append(self.assumption_heading)
            headings.append("")
            constant_index = offset
            offset += 2

        headings += [float(x) for x in self.tvec]
        for i, entry in enumerate(headings):
            worksheet.write(current_row, i, entry, formats["center_bold"])
            update_widths(widths, i, entry)

        if not pd.isna(self.comment):
            worksheet.write_comment(xlrc(current_row, 0), self.comment)

        # Now, write the TimeSeries objects - self.ts is an odict and whatever pops are present will be written in whatever order they are in
        for row_name, row_ts in self.ts.items():
            current_row += 1

            # Write the name
            if row_name in references:
                worksheet.write_formula(current_row, 0, references[row_name], formats["center_bold"], value=row_name)
                update_widths(widths, 0, row_name)
            else:
                worksheet.write_string(current_row, 0, row_name, formats["center_bold"])
                update_widths(widths, 0, row_name)

            # Write the attributes
            for attribute in self.ts_attributes:
                if isinstance(self.ts_attributes[attribute], dict):
                    if row_name in self.ts_attributes[attribute]:
                        val = self.ts_attributes[attribute][row_name]
                    else:
                        val = None
                else:
                    val = self.ts_attributes[attribute]

                if val is not None:
                    worksheet.write(current_row, attribute_index[attribute], val)
                    update_widths(widths, attribute_index[attribute], val)

            # Write the units
            if write_units:
                if row_ts.units:
                    if row_ts.units.lower().strip() in FS.STANDARD_UNITS:  # Preserve case if nonstandard unit
                        unit = row_ts.units.title().strip()
                    else:
                        unit = row_ts.units.strip()
                    worksheet.write(current_row, units_index, unit)
                    update_widths(widths, units_index, unit)
                else:
                    worksheet.write(current_row, units_index, FS.DEFAULT_SYMBOL_INAPPLICABLE)

                if self.allowed_units and isinstance(self.allowed_units, dict) and row_name in self.allowed_units:  # Add dropdown selection if there is more than one valid choice for the units
                    allowed = self.allowed_units[row_name]
                elif self.allowed_units and not isinstance(self.allowed_units, dict):
                    allowed = self.allowed_units
                else:
                    allowed = None

                if allowed:
                    worksheet.data_validation(xlrc(current_row, units_index), {"validate": "list", "source": allowed})

            if write_uncertainty:
                if row_ts.sigma is None:
                    worksheet.write(current_row, uncertainty_index, row_ts.sigma, formats["not_required"])  # NB. For now, uncertainty is always optional
                else:
                    worksheet.write(current_row, uncertainty_index, row_ts.sigma, formats["not_required"])

            if row_ts.has_data:
                format = formats["not_required"]
            else:
                format = formats["unlocked"]

            if write_assumption:
                worksheet.write(current_row, constant_index, row_ts.assumption, format)
                if len(self.tvec):
                    worksheet.write(current_row, constant_index + 1, "OR", formats["center"])
                    update_widths(widths, constant_index + 1, "OR")

            # Write the time values if they are present
            if len(self.tvec):
                content = [None] * len(self.tvec)  # Initialize an empty entry for every time in the TDVE's tvec

                for t, v in zip(row_ts.t, row_ts.vals):
                    # If the TimeSeries contains data for that time point, then insert it now
                    idx = np.where(self.tvec == t)[0]
                    if len(idx):
                        content[idx[0]] = v

                for idx, v in enumerate(content):
                    if v is None:
                        worksheet.write_blank(current_row, offset + idx, v, format)
                    else:
                        worksheet.write(current_row, offset + idx, v, format)
                    widths[offset + idx] = max(widths[offset + idx], 7) if offset + idx in widths else 7

                if write_assumption:
                    # Conditional formatting for the assumption
                    # Do this here, because after the loop above, we have easy and clear access to the range of cells to include in the formula
                    fcn_empty_times = 'COUNTIF(%s:%s,"<>" & "")>0' % (xlrc(current_row, offset), xlrc(current_row, offset + len(content) - 1))
                    # Hatched out if the cell will be ignored
                    worksheet.conditional_format(xlrc(current_row, constant_index), {"type": "formula", "criteria": "=" + fcn_empty_times, "format": formats["ignored"]})
                    worksheet.conditional_format(xlrc(current_row, constant_index), {"type": "formula", "criteria": "=AND(%s,NOT(ISBLANK(%s)))" % (fcn_empty_times, xlrc(current_row, constant_index)), "format": formats["ignored_warning"]})

        return current_row + 2  # Add two so there is a blank line after this table




def _parse_ts_header(row: list, known_headings: list, skip_first=False) -> Tuple[dict, dict, np.array]:
    """
    Internal function to read TDVE/TDC timeseries tables

    :param row: A list of openpyxl Cell instances corresponding to the heading row (the one containing the
                 headers for 'constant', 'uncertainty', 'provenance' etc., and the optional year values
    :param known_headings: A list of known headings that should be lowered for further processing. These
                           would correspond to the standard/default headings
    :param skip_first: If True, the first heading cell will be skipped. This is required for TDVE tables
                       where the top left cell is the name of the variable and not a column heading
    :return: A tuple containing
                - A dictionary of heading values (from string cell types) with {heading: column_index}.
                  Any known headings will have had `str.lower()` called on them.
                - A dictionary of time values and the columns they appear in. Datetimes will be converted
                  into floats at this step (so for example a spreadsheet containing 1/1/2022 will have
                  that converted to 2022.0 at this step)
                - A numpy array with sorted time values, that can be used as a time vector

    """

    headings = {}
    times = {}
    for i, cell in enumerate(row):
        v = cell.value
        if v is None or (i == 0 and skip_first):
            continue
        elif cell.data_type in {"s", "str"}:
            v = v.strip()
            if v.startswith("#ignore"):
                break
            elif v.lower() in known_headings:
                if v.lower() in headings:
                    raise Exception(f"Duplicate heading in cell {cell.coordinate} - another cell in the same row already has the same header '{v}' (possibly in cell {get_column_letter(1+headings[v.lower()])}{cell.row})")
                headings[v.lower()] = i
            else:
                if v in headings:
                    raise Exception(f"Duplicate heading in cell {cell.coordinate} - another cell in the same row already has the same header '{v}' (possibly in cell {get_column_letter(1+headings[v.lower()])}{cell.row})")
                headings[v] = i
        elif cell.is_date:
            year = datetime_to_year(v)
            if year in times:
                raise Exception(f"Duplicate year in cell {cell.coordinate} - another cell in the same row already contains the same year ({year}) (possibly in cell {get_column_letter(1+times[year])}{cell.row})")
            times[datetime_to_year(v)] = i
        elif cell.data_type == "n":
            if v in times:
                raise Exception(f"Duplicate year in cell {cell.coordinate} - another cell in the same row already contains the same year ({v}) (possibly in cell {get_column_letter(1+times[v])}{cell.row})")
            times[v] = i
        else:
            raise Exception("Unknown data type in cell %s of the spreadsheet - quantity must be a string or a number" % cell.coordinate)
    return headings, times, np.array(sorted(times), dtype=float)



[docs]
def cell_get_string(cell, allow_empty=False) -> str:
    """
    Return string value from cell

    This function checks if a cell contains a string. If it does, the stripped value
    will be returned. Otherwise, an informative error will be raised

    Note that the string type is determined from the cell's value rather than
    the openpyxl cell data type.

    :param cell: An openpyxl cell
    :return: A string with the contents of the cell
    """
    if cell.value is None and allow_empty:
        return None
    elif not sc.isstring(cell.value):
        raise Exception("Cell %s needs to contain a string (i.e. not a number, date, or other cell type)" % cell.coordinate)
    else:
        return cell.value.strip()




[docs]
def cell_get_number(cell, dtype=float):
    """
    Return numeric value from cell

    This function is to guard against accidentally having the Excel cell contain a string
    instead of a number. If a string has been entered, an error will be raised. The added value
    from this function is that if the Excel cell type is empty but the value is
    empty or ``N.A.`` then the value will be treated as though the cell was correctly set to a
    numeric type but had been left empty.

    The output is cast to ``dtype`` which means that code that requires numeric input from Excel
    can use this input to guarantee that the resulting number is of the correct type, or ``None``.

    :param cell: An openpyxl cell
    :param dtype: If the cell is numeric, cast to this type (default is `float` but could be `int` for example)
    :return: A scalar instance of ``dtype`` (e.g. ``float``) or ``None`` if cell is empty or being treated as empty
    :raises: :class:`Exception` if the cell contains a string

    """

    if cell.value is None:
        return None
    elif cell.data_type == "n":  # Numeric type
        return dtype(cell.value)
    elif cell.data_type == "s":  # Only do relatively expensive string processing if it's actually a string type
        s = cell.value.lower().strip()
        if s == FS.DEFAULT_SYMBOL_INAPPLICABLE:
            return None
        elif not s.replace("-", ""):
            return None

    raise Exception("Cell %s needs to contain a number" % cell.coordinate)




[docs]
def validate_category(workbook, expected_category) -> None:
    """
    Check Atomica workbook type

    This function makes sure that a workbook has a particular category property
    stored within it, and displays an appropriate error message if not. If the
    category isn't present or doesn't start with 'atomica', just ignore it for
    robustness (instead, a parsing error will likely be raised)

    :param workbook: An openpyxl workbook
    :param category: The expected string category
    :raises: :class:`Exception` if the workbook category is not valid

    """

    category = workbook.properties.category
    if category and sc.isstring(category) and category.startswith("atomica:"):
        if category.strip() != expected_category.strip():
            expected_type = expected_category.split(":")[1].title()
            actual_type = category.split(":")[1].title()
            message = "Error loading %s - the provided file was a %s file" % (expected_type, actual_type)
            raise Exception(message)