Source code for atomica.cascade

"""
Utility functions for working with cascades

Cascades are defined in a :class:`ProjectFramework` object. This module
implements functions that are useful for working with the cascades, including

- Validation
- Plotting
- Value extraction

On the plotting side, the two key functions are

- :func:`plot_single_cascade` which makes a single cascade plot complete with
  shaded regions between bars, and conversion arrows
- :func:`plot_multi_cascade` which makes a scenario comparison type cascade
  plot with bars grouped by cascade stage (not possible with normal
  :func:`plotting.plot_bars`)

The plot takes in as arguments the cascade and populations. Users can specify
cascades as

- The name of a cascade in the Framework
- The index of a cascade in the Framework
- A list of comps/characs in stage order, with stage names matching the
  comps/characs
- An ordered dict with ``{stage:comps/characs}``, with a customized stage name.
  This is referred to in code as a **cascade dict**

The first two representations map to cascades defined in the framework, while
the last two representations relate to defining custom cascades on the fly.
They are therefore sanitized in two stages

- :func:`sanitize_cascade_inputs` turns cascade indices into names, and
  cascade lists into dicts. Returning string names for predefined cascades
  allows the name to be used in the title of plots
- :func:`get_cascade_outputs` turns cascade names into cascade dicts

The dictionary representation is always required when retrieving the values of
cascades. There are two types of value retrieval:

- :func:`get_cascade_vals` which returns values for each cascade stage from a
  model result
- :func:`get_cascade_data` which attempts to compute values for each cascade
  stage from a :class:`ProjectData` instance. This is used when plotting data
  points on the cascade plot. Compartments and characteristics are
  automatically summed as required. Data points will only be displayed if the
  data has values for all of the included quantities in the year being
  plotted.


"""

from .plotting import plot_legend
import matplotlib.pyplot as plt
import numpy as np
import textwrap
import sciris as sc
import matplotlib
from .utils import NDict, nested_loop
from .results import Result, Ensemble
from .system import logger
from .data import ProjectData
import functools

# Figure size passed as ``figsize`` when the cascade plotting functions in this module create a figure
default_figsize = (10, 4)
# NOTE(review): presumably an axes position as [left, bottom, width, height] in figure
# coordinates - it is not referenced in the visible part of this module, confirm usage
default_ax_position = [0.15, 0.2, 0.35, 0.7]

# Public names exported by this module (this controls ``from <module> import *``)
__all__ = [
    "InvalidCascade",
    "plot_cascade",
    "sanitize_cascade",
    "sanitize_pops",
    "validate_cascade",
    "plot_single_cascade_series",
    "plot_single_cascade",
    "plot_multi_cascade",
    "get_cascade_vals",
    "cascade_summary",
    "get_cascade_data",
    "CascadeEnsemble",
]



[docs]
class InvalidCascade(Exception):
    """
    Raised when a cascade fails validation

    The typical cause is that the requested stages are not correctly nested,
    i.e. a stage contains compartments that are missing from the stage before it.

    """




[docs]
def plot_cascade(results=None, cascade=None, pops=None, year=None, data=None, show_table: bool = None):
    """
    Dispatch to the single or multi cascade plot

    A common requirement (used on the FE) is to choose between
    :func:`plot_single_cascade` and :func:`plot_multi_cascade` depending on how
    many :class:`Result` instances and years were supplied. This function makes
    that choice: if more than one year or more than one result is given, the
    multi-cascade plot is produced, otherwise the single cascade plot is used.

    :param results: A single :class:`Result` instance, or list of instances
    :param cascade: A cascade specification supported by :func:`sanitize_cascade`
    :param pops: A population specification supported by :func:`sanitize_pops` - must correspond to a single aggregation
    :param year: A single year, or multiple years (can be a scalar, list, or array)
    :param data: A :class:`ProjectData` instance
    :param show_table: Forwarded to :func:`plot_multi_cascade` when a multi-cascade plot is generated
    :return: For the multi-cascade case, whatever :func:`plot_multi_cascade` returns. For the
        single cascade case, a tuple ``(fig, None)``

    """
    year = sc.promotetolist(year)
    results = sc.promotetolist(results)

    # Single result at (at most) a single year -> the detailed single cascade plot
    if len(year) <= 1 and len(results) <= 1:
        single_fig = plot_single_cascade(result=results[0], cascade=cascade, pops=pops, year=year, data=data)
        return (single_fig, None)  # There is no loss table for the single cascade plot

    # Otherwise, compare across results and/or years
    return plot_multi_cascade(results=results, cascade=cascade, pops=pops, year=year, data=data, show_table=show_table)




[docs]
def sanitize_cascade(framework, cascade, fallback_used: bool = False) -> tuple:
    """
    Normalize cascade inputs

    For convenience, users can specify cascades in one of several
    representations. To facilitate working with these representations on the
    backend, this function turns any valid representation into a dictionary
    mapping cascade stage names to a list of compartments/characs. It also
    returns the name of the cascade (if one is present) for use in plot
    titles, and the population type of the cascade.

    As an example of the cascade dictionary, suppose the spreadsheet had
    stages

    - Stage 1 - ``sus,vac,inf``
    - Stage 2 - ``vac,inf``

    Then example usage would be:

    >>> sanitize_cascade(framework,'main')[1]
    {'Stage 1':['sus','vac','inf'],'Stage 2':['vac','inf']}

    This function also validates the cascade, so it is not necessary to call :func:`validate_cascade` separately.

    :param framework: A :class:`ProjectFramework` instance
    :param cascade: Supported cascade representation. Could be

        - A string cascade name
        - An integer specifying the index of the cascade
        - ``None``, which maps to the first cascade in the framework
        - A ``list`` of cascade stages
        - A ``dict`` defining the cascade

        The first three input formats will result in the cascade name also
        being returned (otherwise it will be assigned ``None``)
    :param fallback_used: Passed through to :func:`validate_cascade` to adjust the wording of validation errors
    :return: A tuple with ``(cascade_name,cascade_dict,pop_type)`` - the cascade name
        is ``None`` if the cascade was specified as a ``list`` or ``dict``, and
        ``pop_type`` is the value returned by :func:`validate_cascade`
    :raises: ``TypeError`` if the cascade is not one of the supported representations

    """

    if cascade is None:
        cascade = 0  # Use the first cascade

    if isinstance(cascade, list):
        # Assemble cascade from comp/charac names using the display name as the stage name
        outputs = sc.odict()
        for name in cascade:
            spec = framework.get_variable(name)[0]
            outputs[spec["display name"]] = [spec.name]
        cascade = outputs
    elif isinstance(cascade, int):
        # Retrieve the cascade name based on index
        cascade = framework.cascades.keys()[cascade]

    if sc.isstring(cascade):
        cascade_name = cascade
        df = framework.cascades[cascade_name]
        cascade_dict = sc.odict()
        for _, stage in df.iterrows():
            cascade_dict[stage.iloc[0]] = [x.strip() for x in stage.iloc[1].split(",")]  # Split the name of the stage and the constituents
    elif isinstance(cascade, dict):
        cascade_name = None
        cascade_dict = cascade
    else:
        # Anything else would bounce between validate_cascade() and sanitize_cascade() until the recursion limit is hit
        raise TypeError("Unsupported cascade specification of type %s - must be None, an int, a string, a list, or a dict" % (type(cascade).__name__))

    # Pass the name along so that validation errors can refer to the cascade by name
    pop_type = validate_cascade(framework, cascade_dict, cascade_name=cascade_name, fallback_used=fallback_used)

    return cascade_name, cascade_dict, pop_type




[docs]
def sanitize_pops(pops, pop_source, pop_type) -> dict:
    """
    Convert a population specification into a single-key aggregation dict

    Accepted inputs for ``pops`` are

    - A list of population names (code names or full names, e.g. from the FE)
    - A single population name as a string
    - A dict with a single key, which is passed through as-is
    - One of ``None``, ``'all'``, ``'All'``, ``'aggregate'`` or ``'total'``, which
      selects every population of the requested type

    A cascade plot can only be made for one group of people at a time, so the
    result must always be exactly one aggregation.

    :param pops: The population representation to sanitize (list, dict,
        string)
    :param pop_source: Object to draw available populations from (a
        :class:`Result` or :class:`ProjectData`)
    :param pop_type: Population type to select. Requested populations must match this type
    :return: An odict with a single key that maps the aggregation label to a
        list of population code names

    """

    # Collect (code name, full name, population type) for every population in the source
    if isinstance(pop_source, Result):
        known = [(p.name, p.label, p.type) for p in pop_source.model.pops]
    elif isinstance(pop_source, ProjectData):
        known = [(code, spec["label"], spec["type"]) for code, spec in pop_source.pops.items()]
    else:
        raise Exception("Unrecognized source for pop names - must be a Result or a ProjectData instance")

    def lookup(requested):
        # Resolve a code name or full name to ``(code name, full name)``, enforcing the population type
        requested = requested.strip()
        for code, label, ptype in known:
            if code != requested and label != requested:
                continue
            if ptype != pop_type:
                raise Exception(f'Requested population "{code}" has type "{ptype}" but requested cascade is for "{pop_type}"')
            return code, label
        raise Exception('Name "%s" not found' % (requested))

    aggregate_keywords = (None, "all", "All", "aggregate", "total")

    if pops in aggregate_keywords:
        # Aggregate over every population that has the requested type
        matching = [entry[0] for entry in known if entry[2] == pop_type]
        pops = {"Entire population": matching}
        if not matching:
            raise Exception("No populations with the requested type were found")

    elif sc.isstring(pops) or isinstance(pops, list):
        # Normalize to a list of names, then resolve each one to its code name
        requested_names = sc.promotetolist(pops) if sc.isstring(pops) else pops
        code_names = [lookup(entry)[0] for entry in requested_names]

        if len(code_names) > 1:
            pops = {"Selected populations": code_names}
        else:
            only_code = code_names[0]
            pops = {lookup(only_code)[1]: [only_code]}  # Label a lone population with its full name

    assert isinstance(pops, dict)  # At this point, it should be a dictionary
    assert len(pops) == 1, "Aggregation must evaluate to only one output population"

    return sc.odict(pops)  # Make sure an odict gets returned rather than a dict




[docs]
def validate_cascade(framework, cascade, cascade_name=None, fallback_used: bool = False) -> str:
    """
    Check if a cascade is valid

    A cascade is invalid if any stage does not contain a compartment that appears in subsequent stages i.e.
    if the stages are not all nested. Also, all compartments referred to must exist in the same population type,
    otherwise it is not possible to define a population-specific cascade as it would intrinsically span populations.

    :param framework: A :class:`ProjectFramework` instance
    :param cascade: A cascade representation supported by :func:`sanitize_cascade`. Anything other than a
        ``dict`` is delegated to :func:`sanitize_cascade`, which calls back into this function
    :param cascade_name: Name of cascade to be printed in error messages
    :param fallback_used: If ``True``, then in the event that the cascade is not valid, the error message will reflect the fact that it was not a user-defined cascade
    :return: The population type if the cascade is valid
    :raises: ``InvalidCascade`` if the cascade contains no compartments or is not properly nested,
        ``Exception`` if the cascade spans more than one population type

    """

    if not isinstance(cascade, dict):
        _, _, pop_type = sanitize_cascade(framework, cascade, fallback_used=fallback_used)  # This will result in a call to validate_cascade()
        return pop_type

    # Expand every stage into the compartments it is made up of
    expanded = sc.odict()
    for stage, includes in cascade.items():
        expanded[stage] = framework.get_charac_includes(includes)

    stage_names = list(expanded.keys())
    stage_comps = list(expanded.values())

    # All compartments must belong to the same population type
    comps = framework.comps
    pop_types = {comps.at[comp, "population type"] for stage in stage_comps for comp in stage}

    if not pop_types:
        # Without any compartments there is no population type to return
        raise InvalidCascade("The cascade does not contain any compartments")

    if len(pop_types) > 1:
        if fallback_used:
            raise Exception("The framework defines multiple population types and has characteristics spanning population types. Therefore, a default fallback cascade cannot be automatically constructed. You will need to explicitly define a cascade in the framework file")
        elif sc.isstring(cascade_name):
            raise Exception('Cascade "%s" includes compartments from more than one population type' % (cascade_name))
        else:
            raise Exception("The requested cascade includes compartments from more than one population type")

    # Every stage must be a subset of the stage before it
    for i in range(len(stage_names) - 1):
        current_name, next_name = stage_names[i], stage_names[i + 1]
        current_comps, next_comps = stage_comps[i], stage_comps[i + 1]

        if set(next_comps) <= set(current_comps):
            continue

        message = ""
        if fallback_used:
            message += "The fallback cascade is not properly nested\n\n"
        elif sc.isstring(cascade_name):
            message += 'The cascade "%s" is not properly nested\n\n' % (cascade_name)
        else:
            message += "The requested cascade is not properly nested\n\n"

        message += 'Stage "%s" appears after stage "%s" so it must contain a subset of the compartments in "%s"\n\n' % (next_name, current_name, current_name)
        message += "After expansion of any characteristics, the compartments comprising these stages are:\n"
        message += '"%s" = %s\n' % (current_name, current_comps)
        message += '"%s" = %s\n' % (next_name, next_comps)
        message += '\nTo be valid, stage "%s" would need the following compartments added to it: %s' % (current_name, list(set(next_comps) - set(current_comps)))

        if fallback_used:
            if framework.cascades:
                message += "\n\nAlthough the framework fallback cascade was not valid, user-specified cascades do exist. The fallback cascade should only be used if user cascades are not present."
            else:
                message += "\n\nNote that the framework did not contain a cascade - in many cases, the characteristics do not form a valid cascade. You will likely need to explicitly define a cascade in the framework file"
        elif sc.isstring(cascade_name):
            # ``cascade`` is always a dict here, so the name is the only indication that this is a named cascade
            message += "\n\nTo fix this error, please modify the definition of the cascade in the framework file"

        raise InvalidCascade(message)

    return next(iter(pop_types))  # Exactly one population type remains at this point




[docs]
def plot_single_cascade_series(result=None, cascade=None, pops=None, data=None) -> list:
    """
    Plot stacked timeseries

    Plot a stacked timeseries of the cascade. Unlike a normal stacked plot, the shaded areas show losses
    so for example the overall height of the plot corresponds to the number of people in the first cascade stage.
    Thus instead of the cascade progressing from left to right, the cascade progresses from top to bottom.
    This way, the left-right axis can be used to show the change in cascade flow over time.


    :param result: A single result, or list of results. One figure will be generated for each result
    :param cascade: A cascade specification supported by :func:`sanitize_cascade`
    :param pops: A population specification supported by :func:`sanitize_pops` - must correspond to a single aggregation
    :param data: A :class:`ProjectData` instance. If provided, data points are scattered on top of the series
    :return: List of Figure objects for all figures that were generated

    """

    from .plotting import PlotData, plot_series  # Import here to avoid circular dependencies

    if isinstance(result, list):
        # Produce the series plot for each result in turn, and return all of the figures in one flat list
        figs = []
        for r in result:
            figs.extend(plot_single_cascade_series(r, cascade, pops, data))
        return figs

    assert isinstance(result, Result), "Input must be a single Result object"

    cascade_name, cascade_dict, pop_type = sanitize_cascade(result.framework, cascade)
    pops = sanitize_pops(pops, result, pop_type)
    d = PlotData(result, outputs=cascade_dict, pops=pops)
    d.set_colors(outputs=d.outputs)

    figs = plot_series(d, axis="outputs")  # 1 result, 1 pop, axis=outputs guarantees 1 plot
    ax = figs[0].axes[0]

    if data is not None:
        t = d.tvals()[0]
        cascade_data, _ = get_cascade_data(data, result.framework, cascade_dict, pops, t)
        for stage, vals in cascade_data.items():
            color = d[d.results[0], d.pops[0], stage].color  # Get the colour of this quantity
            flt = ~np.isnan(vals)
            if np.any(flt):  # Need to only plot real values, because NaNs show up in mpld3 even though they don't appear in the normal figure
                ax.scatter(t[flt], vals[flt], marker="o", s=40, linewidths=1, facecolors=color, color="k", zorder=100)

    return figs




[docs]
def plot_single_cascade(result=None, cascade=None, pops=None, year=None, data=None, title=False):
    """
    Plot cascade for a single result

    This is the fancy cascade plot, which only applies to a single result at a single time

    :param result: A single result, or list of results. One figure will be generated for each result
    :param cascade: A cascade specification supported by :func:`sanitize_cascade`
    :param pops: A population specification supported by :func:`sanitize_pops` - must correspond to a single aggregation
    :param year: A single year, can be a scalar or an iterable of length 1. If omitted, the last
        time point in the result is used
    :param data: A :class:`ProjectData` instance. If provided, data points are drawn on top of the bars
    :param title: If truthy, a title naming the cascade, population and year is added to the plot
    :return: Figure object containing the plot, or list of figures if a list of results was provided

    """

    # Handle lists first - everything below needs attributes of a single Result
    if isinstance(result, list):
        return [plot_single_cascade(r, cascade, pops, year, data, title) for r in result]

    assert isinstance(result, Result), "Input must be a single Result object"

    barcolor = (0.00, 0.15, 0.48)  # Cascade color -- array([0,38,122])/255.
    diffcolor = (0.85, 0.89, 1.00)  # (0.74, 0.82, 1.00) # Original: (0.93,0.93,0.93)
    losscolor = (0, 0, 0)  # (0.8,0.2,0.2)

    cascade_name, cascade_dict, pop_type = sanitize_cascade(result.framework, cascade)
    pops = sanitize_pops(pops, result, pop_type)

    # Treat None, an empty list/array, or a falsy scalar as "no year specified". Checking the size
    # explicitly avoids evaluating the truth value of an array
    if year is None or np.size(year) == 0 or (np.ndim(year) == 0 and not year):
        year = result.t[-1]  # Draw cascade for last year
    year = sc.promotetoarray(year)
    assert len(year) == 1

    cascade_vals, t = get_cascade_vals(result, cascade, pops, year)
    assert len(t) == 1, "Plot cascade requires time aggregation"
    cascade_array = np.hstack(list(cascade_vals.values()))

    cascade_data_array = None
    if data is not None:
        cascade_data, _ = get_cascade_data(data, result.framework, cascade, pops, year)
        cascade_data_array = np.hstack(list(cascade_data.values()))

    fig = plt.figure(figsize=default_figsize)
    ax = plt.gca()
    bar_x = np.arange(len(cascade_vals))
    h = plt.bar(bar_x, cascade_array, width=0.5, color=barcolor)
    if cascade_data_array is not None:
        non_nan = np.isfinite(cascade_data_array)
        if np.any(non_nan):
            plt.scatter(bar_x[non_nan], cascade_data_array[non_nan], s=40, c="#ff9900", marker="s", zorder=100)

    ax.set_xticks(bar_x)
    ax.set_xticklabels(["\n".join(textwrap.wrap(x, 15)) for x in cascade_vals.keys()])

    # Extend the axis limits to leave room for the loss labels underneath the bars and the value
    # labels above them, while keeping the original ticks
    ylim = ax.get_ylim()
    yticks = ax.get_yticks()
    data_yrange = ylim[1] - ylim[0]  # Plain scalar
    ax.set_ylim(-data_yrange * 0.2, data_yrange * 1.1)
    ax.set_yticks(yticks)
    for i, val in enumerate(cascade_array):
        plt.text(i, val * 1.01, "%s" % sc.sigfig(val, sigfigs=3, sep=True, keepints=True), verticalalignment="bottom", horizontalalignment="center", zorder=200)

    bars = h.get_children()
    conversion = cascade_array[1:] / cascade_array[0:-1]  # Fraction not lost
    conversion_text_height = cascade_array[-1] / 2

    # Shade the region between adjacent bars and annotate it with the conversion percentage
    for i in range(len(bars) - 1):
        left_bar = bars[i]
        right_bar = bars[i + 1]

        xy = np.array(
            [
                (left_bar.get_x() + left_bar.get_width(), 0),  # Bottom left corner
                (left_bar.get_x() + left_bar.get_width(), left_bar.get_y() + left_bar.get_height()),  # Top left corner
                (right_bar.get_x(), right_bar.get_y() + right_bar.get_height()),  # Top right corner
                (right_bar.get_x(), 0),  # Bottom right corner
            ]
        )

        p = matplotlib.patches.Polygon(xy, closed=True, facecolor=diffcolor)
        ax.add_patch(p)

        bbox_props = dict(boxstyle="rarrow", fc=(0.7, 0.7, 0.7), lw=1)

        ax.text(np.average(xy[1:3, 0]), conversion_text_height, "%s%%" % sc.sigfig(conversion[i] * 100, sigfigs=3, sep=True), ha="center", va="center", rotation=0, bbox=bbox_props)

    # Label each stage (except the last) with the number of people lost before the next stage
    loss = np.diff(cascade_array)
    for i, val in enumerate(loss):
        plt.text(i, -data_yrange * 0.02, "Loss: %s" % sc.sigfig(-val, sigfigs=3, sep=True), verticalalignment="top", horizontalalignment="center", color=losscolor)

    pop_label = list(pops.keys())[0]
    plt.ylabel("Number of people")
    if title:
        # ``year`` is a length-1 array here, so format its single element
        if sc.isstring(cascade) and not cascade.lower() == "cascade":
            plt.title("%s cascade for %s in %d" % (cascade, pop_label, year[0]))
        else:
            plt.title("Cascade for %s in %d" % (pop_label, year[0]))
    plt.tight_layout()

    return fig




[docs]
def plot_multi_cascade(results=None, cascade=None, pops=None, year=None, data=None, show_table=None):
    """
    Plot cascade for multiple results

    This is a cascade plot that handles multiple results and times
    Results are grouped by stage/output, which is not possible to do with plot_bars()

    :param results: A single result, or list of results. A single figure will be generated
    :param cascade: A cascade specification supported by :func:`sanitize_cascade`
    :param pops: A population specification supported by :func:`sanitize_pops` - must correspond to a single aggregation
    :param year: A scalar, or array of time points. Bars will be plotted for every time point. If
        omitted, the last time point of the first result is used
    :param data: A :class:`ProjectData` instance (currently not used)
    :param show_table: If ``True`` (the default when ``None``) then a table with loss values will be rendered in the figure
    :return: If ``show_table`` is ``True``, the Figure object containing the plot. Otherwise a tuple
        ``(fig, table)`` where ``table`` is a dict with keys ``'text'``, ``'rowlabels'`` and ``'collabels'``

    """

    if show_table is None:
        show_table = True

    # First, process the cascade into an odict of outputs for PlotData
    if isinstance(results, sc.odict):
        results = [result for _, result in results.items()]
    elif isinstance(results, Result):
        results = [results]
    elif isinstance(results, NDict):
        results = list(results)

    cascade_name, cascade_dict, pop_type = sanitize_cascade(results[0].framework, cascade)
    pops = sanitize_pops(pops, results[0], pop_type)

    # Treat None, an empty list/array, or a falsy scalar as "no year specified". Checking the size
    # explicitly means that an array containing several years can be passed in
    if year is None or np.size(year) == 0 or (np.ndim(year) == 0 and not year):
        year = results[0].t[-1]  # Draw cascade for last year
    year = sc.promotetoarray(year)

    # Bars only need to be labelled by whichever of result/year actually varies
    if len(results) > 1 and len(year) > 1:

        def label_fcn(result, t):
            return "%s (%s)" % (result.name, t)

    elif len(results) > 1:

        def label_fcn(result, t):
            return "%s" % (result.name)

    else:

        def label_fcn(result, t):
            return "%s" % (t)

    # Gather all of the cascade outputs and years
    cascade_vals = sc.odict()
    for result in results:
        for t in year:
            cascade_vals[label_fcn(result, t)] = get_cascade_vals(result, cascade, pops=pops, year=t)[0]

    # Determine the number of bars, per stage - based either on result or time point
    n_bars = len(cascade_vals)
    bar_width = 1.0  # This is the width of the bars
    bar_gap = 0.15  # This is the gap between bars within a block
    block_gap = 1.0  # This is the gap between blocks
    block_size = n_bars * (bar_width + bar_gap)
    x = np.arange(0, len(cascade_vals[0].keys())) * (block_size + block_gap)  # One block for each cascade stage
    colors = sc.gridcolors(n_bars)  # Default colors
    legend_entries = sc.odict()

    fig = plt.figure(figsize=default_figsize)

    # The loop variable is deliberately not called ``data`` so that the ``data`` argument is not overwritten
    for offset, (bar_label, stage_vals) in enumerate(cascade_vals.items()):
        legend_entries[bar_label] = colors[offset]
        vals = np.hstack(list(stage_vals.values()))
        plt.bar(x + offset * (bar_width + bar_gap), vals, color=colors[offset], width=bar_width)

    plot_legend(legend_entries, fig=fig)
    ax = fig.axes[0]
    ax.set_xticks(x + (block_size - bar_gap - bar_width) / 2)
    ax.set_xticklabels(["\n".join(textwrap.wrap(k, 15)) for k in cascade_vals[0].keys()])
    if show_table:
        ax.get_xaxis().set_ticks_position("top")  # The table goes underneath the axes

    # Make the loss table - one row per bar label, one column per cascade stage
    cell_text = []
    for stage_vals in cascade_vals.values():
        cascade_array = np.hstack(list(stage_vals.values()))
        loss = np.diff(cascade_array)
        loss_str = ["%s" % sc.sigfig(-val, sigfigs=3, sep=True) for val in loss]
        loss_str.append("-")  # No loss for final stage
        cell_text.append(loss_str)

    # Clean up formatting
    yticks = ax.get_yticks()
    ax.set_yticks(yticks[1:])  # Remove the first tick at 0 so it doesn't clash with table - TODO: improve table spacing so this isn't needed
    plt.ylabel("Number of people")
    if show_table:
        plt.subplots_adjust(top=0.8, right=0.75, left=0.2, bottom=0.25)
    else:
        plt.subplots_adjust(top=0.95, right=0.75, left=0.2, bottom=0.25)

    # Reset axes
    plt.tight_layout()

    # Add a table at the bottom of the axes
    row_labels = list(cascade_vals.keys())
    if show_table:
        plt.table(cellText=cell_text, rowLabels=row_labels, rowColours=None, colLabels=None, loc="bottom", cellLoc="center")
        return fig
    else:
        # Hand the table contents back to the caller instead of rendering them
        col_labels = [k for k in cascade_vals[0].keys()]
        table = {"text": cell_text, "rowlabels": row_labels, "collabels": col_labels}
        return fig, table




[docs]
def get_cascade_vals(result, cascade, pops=None, year=None) -> tuple:
    """
    Retrieve model values for each stage of a cascade

    The requested populations must evaluate to a single population or a single
    aggregation of populations.

    :param result: A single :class:`Result` instance
    :param cascade: A cascade representation supported by :func:`sanitize_cascade`
    :param pops: A population representation supported by :func:`sanitize_pops`
    :param year: Optionally specify a subset of years to retrieve values for.
        Can be a scalar, list, or array. If ``None``, no interpolation is
        performed and all time points in the result will be used
    :return: A tuple with ``(cascade_vals,t)`` where ``cascade_vals`` is the
        form ``{stage_name:np.array}`` and ``t`` is a ``np.array`` with the
        year values

    """

    from .plotting import PlotData  # Import here to avoid circular dependencies

    # Resolve the cascade into a stage dict, and the populations into a single aggregation
    _, stage_dict, pop_type = sanitize_cascade(result.framework, cascade)
    pops = sanitize_pops(pops, result, pop_type)

    d = PlotData(result, outputs=stage_dict, pops=pops)
    if year is not None:
        # Only the requested years are wanted
        d.interpolate(sc.promotetoarray(year))

    assert len(d.pops) == 1, "get_cascade_vals() cannot get results for multiple populations or population aggregations, only a single pop or single aggregation"

    cascade_vals = sc.odict()
    for result_name in d.results:
        for pop_name in d.pops:
            for stage_name in d.outputs:
                cascade_vals[stage_name] = d[(result_name, pop_name, stage_name)].vals  # NB. Might want to return the Series here to retain formatting, units etc.

    t = d.tvals()[0]  # nb. first entry in d.tvals() is time values, second entry is time labels
    return cascade_vals, t




[docs]
def cascade_summary(source_data, year: float, pops=None, cascade=0) -> None:
    """
    Print summary of cascade

    For every result, the value of each cascade stage in the requested year is
    printed, together with that value as a percentage of the first stage. If a
    :class:`CascadeEnsemble` is supplied, the uncertainty is printed as well.

    :param source_data: A :class:`Result`, a list of :class:`Result` instances with distinct names, or a :class:`CascadeEnsemble`
    :param year: A scalar year to print results in
    :param pops: If a :class:`Result` was passed in, this can be any valid population aggregation. If a :class:`CascadeEnsemble`
            was passed in, then this must match the name of one of the population aggregations stored in the Ensemble (i.e.
            it must be an item contained in `CascadeEnsemble.pops`)
    :param cascade: If a :class:`Result` was passed in, this argument specifies which cascade to use. If a :class:`CascadeEnsemble`
        was passed in, then this argument is ignored because the :class:`CascadeEnsemble` already uniquely specifies the cascade
    :return: None - the summary is printed

    """

    # A CascadeEnsemble is accepted so that uncertainty can be reported. It may contain more than one
    # Result, but because there is a one-to-one mapping between a cascade and a PlotData, it only ever
    # stores one cascade. A Result may contain several cascades, hence the 'cascade' argument

    if isinstance(source_data, CascadeEnsemble):
        # The cascade is pre-defined in the Ensemble, so 'cascade' is not used here
        vals, uncertainty, t = source_data.get_vals(pop=pops, years=year)
        for result_name in vals.keys():
            print("Result: %s - %g" % (result_name, year))
            stage_vals = vals[result_name]
            baseline = stage_vals[0][0]  # Value of the first stage
            for stage in stage_vals.keys():
                v = stage_vals[stage][0]
                u = uncertainty[result_name][stage][0]
                print("%s - %s ± %s (%s%% ± %s%%)" % (stage, np.round(v), sc.sigfig(u, 2), sc.sigfig(100 * v / baseline, 2), sc.sigfig(100 * u / baseline, 2)))
        return

    # Convert to list of results, check all names are unique
    source_data = sc.promotetolist(source_data)
    unique_names = {res.name for res in source_data}
    if len(unique_names) != len(source_data):
        raise Exception("If passing in multiple Results, they must have different names")

    for result in source_data:

        _, cascade_dict, _ = sanitize_cascade(result.framework, cascade)
        absolute, _ = get_cascade_vals(result, cascade_dict, pops=pops, year=year)

        # Normalize a copy by the first stage - go from the last stage to the first so that
        # the first stage is still unmodified while the other stages are being divided by it
        percentage = sc.dcp(absolute)
        for idx in range(len(percentage) - 1, -1, -1):
            percentage[idx] /= percentage[0]

        print("Result: %s - %g" % (result.name, year))
        for stage in absolute.keys():
            print("%s - %.0f (%s%%)" % (stage, np.round(absolute[stage][0]), sc.sigfig(percentage[stage][0] * 100, 2)))




[docs]
def get_cascade_data(data, framework, cascade, pops=None, year=None):
    """
    Get data values for a cascade

    This function is the counterpart to :func:`get_cascade_vals` but it
    returns values from data rather than values from a :class:`Result`. Note
    that the inputs and outputs are slightly different - this function still
    needs the framework so that it can sanitize the requested cascade. If
    ``year`` is specified, the output is guaranteed to be the same size as the
    input year array, the same as :func:`get_cascade_vals`. However, the
    :func:`get_cascade_vals` defaults to all time points in the simulation
    output, whereas this function defaults to all data years. Thus, if the
    year is omitted, the returned time points may be different between the two
    functions. To make a plot superimposing data and model output, the year
    should be specified explicitly to ensure that the years match up.

    Values are ``NaN`` for any requested year that has no exactly matching
    time point in the data. Because stage values are sums over populations and
    over stage constituents, a ``NaN`` in any contributing quantity makes the
    stage value ``NaN`` for that year.

    NB - In general, data probably will NOT exist
    Set the logging level to 'DEBUG' to have messages regarding this printed out

    :param data: A :class:`ProjectData` instance
    :param framework: A :class:`ProjectFramework` instance
    :param cascade: A cascade representation supported by
        :func:`sanitize_cascade`
    :param pops: Supported population representation. Can be 'all', or a pop
        name, or a list of pop names, or a dict with one key
    :param year: Optionally specify a subset of years to retrieve values for.
        Can be a scalar, list, or array. If ``None``, all time points in the
        :class:`ProjectData` instance will be used
    :return: A tuple with ``(cascade_vals,t)`` where ``cascade_vals`` is the
        form ``{stage_name:np.array}`` and ``t`` is a ``np.array`` with the
        year values

    """

    _, cascade_dict, pop_type = sanitize_cascade(framework, cascade)
    pops = sanitize_pops(pops, data, pop_type)[0]  # Get list representation since we don't care about the name of the aggregated pop

    if year is not None:
        t = sc.promotetoarray(year)  # Output times are guaranteed to match the requested years
    else:
        t = data.tvec  # Defaults to data's time vector

    # A stage may be specified as a single code name rather than a list of them. Normalize
    # every stage ONCE here, so that both the retrieval pass and the aggregation pass below
    # iterate over code names (and never over the characters of a string)
    stages = []
    for stage_name, stage_constituents in cascade_dict.items():
        if sc.isstring(stage_constituents):
            stage_constituents = [stage_constituents]
        stages.append((stage_name, stage_constituents))

    # Now, get the outputs in the given years. Each quantity is retrieved once and summed over populations
    data_values = dict()
    for _, stage_constituents in stages:
        for code_name in stage_constituents:
            if code_name in data_values:
                continue  # Already retrieved for an earlier stage

            # Start out as NaN - this is the fallback in case for some reason pops is empty
            pop_total = np.full(t.shape, np.nan)

            for pop_idx, pop in enumerate(pops):
                ts = data.get_ts(code_name, pop)  # The TimeSeries data for the required variable and population
                vals = np.full(t.shape, np.nan)  # Preallocate output values coming from this TimeSeries object

                if ts is not None:
                    for i, tval in enumerate(ts.t):
                        match = np.where(t == tval)[0]
                        if len(match):  # If a time point in the TimeSeries matches the requested time - then match[0] is the index in t
                            vals[match[0]] = ts.vals[i]
                    if np.any(np.isnan(vals)):
                        logger.debug("Data for %s (%s) did not contain values for some of the requested years", code_name, pop)
                else:
                    logger.debug("Data not present for %s (%s)", code_name, pop)

                # The first population replaces the NaN fallback, subsequent ones are accumulated
                pop_total = vals if pop_idx == 0 else pop_total + vals

            data_values[code_name] = pop_total

    # Now, data values contains all of the required quantities in all of the required years. Last step is to aggregate them
    cascade_data = sc.odict()
    for stage_name, stage_constituents in stages:
        for code_name in stage_constituents:
            if stage_name not in cascade_data:
                # Copy, so that the in-place accumulation below cannot modify the cached per-quantity
                # array - the same quantity may contribute to more than one stage
                cascade_data[stage_name] = data_values[code_name].copy()
            else:
                cascade_data[stage_name] += data_values[code_name]

    return cascade_data, t




[docs]
class CascadeEnsemble(Ensemble):
    """
    Ensemble for cascade plots

    This specialized Ensemble type is oriented to working with cascades. It has pre-defined mapping
    functions for retrieving cascade values and wrappers to plot cascade data.

    Conceptually, the idea is that using cascades with ensembles requires doing two things

    - Having a mapping function that generates PlotData instances where the outputs are
      cascade stages
    - Having a plotting function that makes bar plots where all of the bars for the same year/result
      are the same color (which rules out `Ensemble.plot_bars()`) where the bars are grouped by
      output (which rules out `plotting.plot_bars()`) and where the plot data is stored in `PlotData`
      instances rather than in `Result` object (which rules out `cascade.plot_multi_cascade`)

    This specialized Ensemble class implements both of the above steps

    - The constructor takes in the name of the cascade (or a cascade dict) and internally generates
      a suitable mapping function
    - `CascadeEnsemble.plot_multi_cascade` handles plotting multi-bar plots with error bars for cascades

    :param framework: A :class:`ProjectFramework` instance
    :param cascade: A cascade representation supported by :func:`sanitize_cascade`. However, if the cascade is a dict, then
                    it will not be sanitized. This allows advanced aggregations to be used. A CascadeEnsemble can only
                    store results for one cascade - to record multiple cascades, create further CascadeEnsemble instances
                    as required.
    :param years: Optionally interpolate results onto these years, to reduce storage requirements
    :param baseline_results: Optionally store baseline result obtained without uncertainty
    :param pops: A population aggregation dict. Can evaluate to more than one aggregated population

    """

    def __init__(self, framework, cascade, years=None, baseline_results=None, pops=None):
        """
        Build the cascade mapping function and initialize the underlying Ensemble

        See the class docstring for a description of the arguments.

        """

        years = sc.promotetoarray(years) if years is not None else None

        # A dict is used verbatim (it is not sanitized) and therefore has no cascade name.
        # Any other representation is resolved against the framework
        if isinstance(cascade, dict):
            cascade_name, cascade_dict = None, cascade
        else:
            cascade_name, cascade_dict, _ = sanitize_cascade(framework, cascade)

        # Bind the requested cascade, years, and pops to the module-level mapping function
        mapping = functools.partial(_cascade_ensemble_mapping, cascade_dict=cascade_dict, years=years, pops=pops)

        # Perform normal Ensemble initialization, falling back to a generic name if the cascade is unnamed
        super().__init__(name=cascade_name or "Cascade", mapping_function=mapping, baseline_results=baseline_results)


[docs]
    def get_vals(self, pop=None, years=None) -> tuple:
        """
        Return cascade values and uncertainty

        This method returns arrays of cascade values and uncertainties. Unlike :func:`get_cascade_vals`
        this method returns uncertainties and works for multiple Results (which can be stored in a single
        PlotData instance).

        This is implemented in `CascadeEnsemble` and not `Ensemble` for now because we make certain
        assumptions in `CascadeEnsemble` that are not valid more generally - specifically, that the outputs
        all correspond to a single set of cascade stages.

        The year must match a year contained in the CascadeEnsemble - the match is made by finding the
        year, rather than interpolation. This is because interpolation may have occurred when the
        Result was initially stored as a PlotData in the CascadeEnsemble - in that case, double interpolation
        may occur and provide incorrect results (e.g. if the simulation is interpolated onto two years, and then
        interpolated again as part of getting the values). To prevent this from happening, interpolation is not
        performed again here

        The returned values are the baseline values if the Ensemble has a baseline, otherwise they are
        the mean over the samples. The uncertainty is the standard deviation over the samples.

        :param pop: Any population aggregations should have been completed when the results were loaded into
                     the Ensemble. Thus, we only prompt for a single population name here. Defaults to the
                     first population in the Ensemble
        :param years: Select subset of years from the Ensemble. Must match items in ``self.tvec``
        :return: Tuple of ``(vals,uncertainty,t)`` where vals and uncertainty are doubly-nested dictionaries
            of the form ``vals[result_name][stage_name]=np.array`` with arrays the same size as ``t`` (which matches
            the input argument ``years`` if provided)

        """

        selected_pop = self.pops[0] if pop is None else pop

        # Requested years are located by exact lookup in the time vector - no interpolation
        if years is not None:
            available_years = list(self.tvec)
            years_idx = [available_years.index(yr) for yr in sc.promotetoarray(years)]
        else:
            years_idx = np.arange(len(self.tvec))

        vals = sc.odict()
        uncertainty = sc.odict()
        series_lookup = self._get_series()

        for result in self.results:

            result_vals = sc.odict()
            result_uncertainty = sc.odict()

            for stage in self.outputs:

                # One array per sample, each restricted to the requested years
                samples = [series.vals[years_idx] for series in series_lookup[result, selected_pop, stage]]
                result_uncertainty[stage] = np.vstack(samples).std(axis=0)

                # Prefer the baseline values, otherwise use the mean over samples
                if self.baseline:
                    result_vals[stage] = self.baseline[result, selected_pop, stage].vals[years_idx]
                else:
                    result_vals[stage] = np.mean(samples, axis=0)

            vals[result] = result_vals
            uncertainty[result] = result_uncertainty

        return (vals, uncertainty, self.tvec[years_idx].copy())



[docs]
    def plot_multi_cascade(self, pop=None, years=None):
        """
        Plot multi-cascade with uncertainties

        The multi-cascade with uncertainties differs from the
        normal plot_multi_cascade primarily in the fact that this plot is based around
        PlotData instances while plot_multi_cascade is a simplified routine that
        takes in results and calls get_cascade_vals. Thus, while this method assumes
        that the PlotData contains a properly nested cascade, it's not actually validated
        which allows more flexibility in terms of defining arbitrary quantities to
        include on the plot (like 'virtual' stages that are functions of cascade stages)

        Intended usage is for

        - One population/population aggregation
        - Multiple years OR multiple results, but not both

        Thus, the legend will either show result names for a single year, or years for a single result

        Population aggregation here is assumed to have been done at the time the Result was loaded
        into the Ensemble, so the pop argument here simply specifies which one of the already
        aggregated population groups should be used.

        Bar heights are the baseline values if the Ensemble has a baseline, otherwise the mean
        over the samples. Error bars show the standard deviation over the samples.

        Could be generalized further once applications are clearer

        :param pop: Name of the (already aggregated) population to plot. Defaults to the first
                    population in the Ensemble
        :param years: Year or years to plot, which must match items in ``self.tvec``. If ``None``,
                      the last year is used when there are multiple results, otherwise all years are used
        :return: The figure created by ``plt.subplots()``

        """

        if years is None:
            # Multiple results can only be shown for a single year, so default to the last one
            years = self.tvec[-1] if len(self.results) > 1 else self.tvec
        years = sc.promotetoarray(years)

        assert len(years) <= 1 or len(self.results) <= 1, "If multiple results are present, can only plot one year at a time"
        fig, ax = plt.subplots()

        if pop is None:
            pop = self.pops[0]  # Use first population

        # A bar group is a single year-result combination, and has one bar in every stage
        n_groups = len(years) * len(self.results)
        n_stages = len(self.outputs)
        series_lookup = self._get_series()

        bar_width = 1
        bar_gap = 0.1  # gap between adjacent bars within a stage
        stage_gap = 1  # gap between stages
        bar_pitch = bar_width + bar_gap  # distance between the positions of adjacent bars
        stage_width = n_groups * bar_pitch + bar_width + stage_gap

        stage_origins = np.arange(n_stages) * stage_width  # position of the first bar in each stage
        available_years = list(self.tvec)
        groups_drawn = 0  # Number of bar groups rendered so far, which sets the offset of the next group

        for year in years:
            year_idx = available_years.index(year)
            for result in self.results:

                # Rows are samples and columns are stages, with stages in the same order as self.outputs
                per_stage = [np.array([series.vals[year_idx] for series in series_lookup[result, pop, output]]) for output in self.outputs]
                samples = np.vstack(per_stage).T

                if self.baseline:
                    heights = np.array([self.baseline[result, pop, output].vals[year_idx] for output in self.outputs])
                else:
                    heights = np.mean(samples, axis=0)

                ax.bar(
                    stage_origins + groups_drawn * bar_pitch,
                    heights,
                    yerr=np.std(samples, axis=0),
                    capsize=10,
                    label="%s - %g" % (result, year),
                    width=bar_width,
                )
                groups_drawn += 1

        ax.legend()
        # Centre each stage's tick under its set of bars
        ax.set_xticks(stage_origins + (n_groups - 1) * bar_pitch / 2)
        ax.set_xticklabels(self.outputs)
        return fig




def _cascade_ensemble_mapping(results, cascade_dict, years, pops):
    """
    Mapping function returning the PlotData for a cascade

    The cascade, years, and pops are bound by :class:`CascadeEnsemble` using ``functools.partial``.
    This is a separate module-level function so that it can be pickled and parallelized.

    :param results: Results passed through to ``PlotData``
    :param cascade_dict: Cascade dict, used as the ``PlotData`` outputs
    :param years: If not ``None``, the ``PlotData`` is interpolated onto these years
    :param pops: Population specification passed through to ``PlotData``
    :return: A ``PlotData`` instance

    """
    from .plotting import PlotData

    plot_data = PlotData(results, outputs=cascade_dict, pops=pops)
    if years is None:
        return plot_data

    plot_data.interpolate(years)
    return plot_data