# Source code for leap.data_generation.reassessment_data

import pandas as pd
import numpy as np
import itertools
from leap.utils import get_data_path
from leap.logger import get_logger
from leap.data_generation.occurrence_calibration_data import get_asthma_occurrence_prediction

# Opt in to pandas' copy-on-write mode for every DataFrame used by this module.
pd.options.mode.copy_on_write = True

# Module-level logger. NOTE(review): 20 is presumably the numeric log level
# (``logging.INFO``) -- confirm against ``leap.logger.get_logger``.
logger = get_logger(__name__, 20)

# Default first calendar year of the generated data.
STARTING_YEAR = 1999
# Default ``stabilization_year`` handed to the asthma occurrence model.
STABILIZATION_YEAR = 2025
MIN_ASTHMA_AGE = 3  # Minimum age for asthma diagnosis
# Default ``max_asthma_age`` handed to the asthma occurrence model.
MAX_ASTHMA_AGE = 62
# Default maximum age (in years) included in the generated data.
MAX_AGE = 110
# Province codes for which reassessment data is generated.
PROVINCES = ["BC", "CA"]
# Last calendar year to generate for each province code.
MAX_YEARS = {
    "BC": 2043,
    "CA": 2066
}


def get_asthma_df(
    starting_year: int = STARTING_YEAR,
    max_year: int = 2065,
    min_age: int = MIN_ASTHMA_AGE,
    max_age: int = MAX_AGE,
    max_asthma_age: int = MAX_ASTHMA_AGE,
    stabilization_year: int = STABILIZATION_YEAR
) -> pd.DataFrame:
    """Loads the asthma prevalence / incidence predictions from Model 1.

    One row is produced for every combination of age, sex and year, and
    ``get_asthma_occurrence_prediction`` is evaluated once per row for the incidence and
    once per row for the prevalence.

    Args:
        starting_year: The starting year for the dataframe.
        max_year: The ending year for the dataframe (inclusive).
        min_age: The minimum age for asthma prediction.
        max_age: The maximum age for asthma prediction (inclusive).
        max_asthma_age: The maximum age for which the asthma prevalence / incidence
            model can accurately make predictions.
        stabilization_year: The year when asthma stabilization occurs.

    Returns:
        A DataFrame containing asthma occurrence predictions.
        Columns:

        * ``age (int)``: age in years, range ``[min_age, max_age]``.
        * ``sex (str)``: one of ``"M"`` or ``"F"``.
        * ``year (int)``: calendar year, range ``[starting_year, max_year]``.
        * ``incidence (float)``: predicted asthma incidence for the given age, sex, and year.
          For rows at age ``MIN_ASTHMA_AGE`` this is set equal to the prevalence.
        * ``prevalence (float)``: predicted asthma prevalence for the given age, sex, and year.

    """
    df_asthma = pd.DataFrame(
        list(itertools.product(
            range(min_age, max_age + 1),
            ["F", "M"],
            range(starting_year, max_year + 1)
        )),
        columns=["age", "sex", "year"]
    )

    def predict(occurrence_type: str) -> pd.Series:
        # Evaluate the occurrence model row by row for the requested quantity.
        return df_asthma.apply(
            lambda row: get_asthma_occurrence_prediction(
                row["age"], row["sex"], row["year"], occurrence_type,
                max_asthma_age, stabilization_year
            ),
            axis=1
        )

    df_asthma["incidence"] = predict("incidence")
    df_asthma["prevalence"] = predict("prevalence")

    # At the minimum diagnosis age the incidence is replaced by the prevalence
    # (presumably because nobody can have been diagnosed before that age, so every
    # prevalent case is a new case -- confirm with the model authors).
    # Vectorized: keep the incidence everywhere except on the minimum-age rows.
    df_asthma["incidence"] = df_asthma["incidence"].where(
        df_asthma["age"] != MIN_ASTHMA_AGE, df_asthma["prevalence"]
    )
    return df_asthma


def calculate_reassessment_probability(
    prevalence_past: float,
    prevalence_current: float,
    incidence_current: float
) -> float:
    """Calculates the reassessment probability based on asthma prevalence and incidence.

    The probability ``prob`` is the solution of::

        prevalence_current = prob * prevalence_past
                             + incidence_current * (1 - prevalence_past)

    clamped to the interval ``[0, 1]``.

    Args:
        prevalence_past: The prevalence of asthma from the previous year.
        prevalence_current: The prevalence of asthma in the current year.
        incidence_current: The incidence of asthma in the current year.

    Returns:
        The probability that someone diagnosed with asthma will maintain their diagnosis in the
        current year, always as a ``float`` in ``[0, 1]``. ``0.0`` is returned when the
        result is undefined, i.e. when ``prevalence_past`` is zero or when any input is NaN.
    """

    # With no past prevalence the equation above has no unique solution and the division
    # would raise ZeroDivisionError for plain Python floats.
    # NOTE(review): 0.0 mirrors how NaN results were already clamped to 0 -- confirm that
    # this is the desired convention.
    if prevalence_past == 0:
        return 0.0

    prob = (prevalence_current - incidence_current * (1 - prevalence_past)) / prevalence_past

    # NaN compares False against everything, so this branch covers both negative and NaN
    # results (the original min/max clamp also mapped NaN to 0).
    if not prob > 0:
        return 0.0
    return float(min(prob, 1.0))

def get_reassessment_data(
    df_asthma: pd.DataFrame,
    province: str = "CA",
    starting_year: int = STARTING_YEAR,
    max_year: int = 2065,
    max_age: int = MAX_AGE
) -> pd.DataFrame:
    """Generates reassessment data for asthma prevalence and incidence.

    For every year after ``starting_year``, each (age, sex) cohort of the current year is
    paired with the same cohort one year earlier (one year younger), and the reassessment
    probability is computed from the past prevalence and the current prevalence / incidence.

    Args:
        df_asthma: A dataframe containing asthma prevalence and incidence predictions from
            Occurrence Model 1. The dataframe should have the following columns:

            * ``age (int)``: age in years, range ``[3, max_age]``.
            * ``sex (str)``: one of ``"M"`` or ``"F"``.
            * ``year (int)``: calendar year, range ``[starting_year, max_year]``.
            * ``incidence (float)``: predicted asthma incidence for the given age, sex, and year.
            * ``prevalence (float)``: predicted asthma prevalence for the given age, sex, and year.

        province: The 2-letter province code, e.g. ``"CA"``.
        starting_year: The starting year for the data.
        max_year: The ending year for the data.
        max_age: The maximum age for asthma prediction.

    Returns:
        A DataFrame containing the reassessment data.
        Columns:

        * ``year (int)``: calendar year, range ``[starting_year + 1, max_year]``.
        * ``province (str)``: the 2-letter province code, e.g. ``"CA"``.
        * ``age (int)``: age in years, range ``[4, max_age]``.
        * ``sex (str)``: one of ``"M"`` or ``"F"``.
        * ``prob (float)``: the probability that someone diagnosed with asthma will
          maintain their asthma diagnosis in the given year. Range: ``[0, 1]``.

    Raises:
        KeyError: If ``df_asthma`` has no rows for one of the required years.
    """

    output_columns = ["year", "province", "age", "sex", "prob"]
    df_asthma_grouped = df_asthma.groupby(["year"])

    # Per-year results are collected and concatenated once at the end; repeatedly
    # concatenating onto an (initially empty) frame is quadratic and relies on pandas'
    # deprecated dtype handling for empty frames.
    yearly_frames = []

    for year in range(starting_year + 1, max_year + 1):

        # Previous year's prevalence, re-keyed by the age each cohort has in the current year.
        df_past = df_asthma_grouped.get_group((year - 1,))
        df_past = df_past.loc[df_past["age"] < max_age, ["age", "sex", "prevalence"]]
        df_past = df_past.assign(age=df_past["age"] + 1).rename(
            columns={"prevalence": "prevalence_past"}
        )

        # Current year's prevalence and incidence; age 3 has no previous-year cohort.
        df_current = df_asthma_grouped.get_group((year,))
        df_current = df_current.loc[
            df_current["age"] > 3, ["year", "age", "sex", "incidence", "prevalence"]
        ]

        # Inner join: a probability can only be computed when both years are available.
        # (An outer join would emit rows with missing year/age and a bogus probability.)
        # ``sort=True`` keeps the rows ordered by (age, sex).
        df = pd.merge(df_past, df_current, on=["age", "sex"], how="inner", sort=True)

        df["prob"] = [
            calculate_reassessment_probability(past, current, incidence)
            for past, current, incidence in zip(
                df["prevalence_past"], df["prevalence"], df["incidence"]
            )
        ]
        df["province"] = province
        yearly_frames.append(df[output_columns])

    if not yearly_frames:
        # No year in range: return an empty frame with the documented columns.
        return pd.DataFrame({
            "year": np.array([], dtype=int),
            "province": [],
            "age": np.array([], dtype=int),
            "sex": [],
            "prob": []
        })

    return pd.concat(yearly_frames, axis=0)


def generate_reassessment_data() -> None:
    """Generate reassessment data for asthma prevalence and incidence across different provinces.

    For each province in ``PROVINCES`` the asthma occurrence predictions are computed up to
    that province's entry in ``MAX_YEARS``, converted into reassessment probabilities, and
    the combined table is written to ``processed_data/asthma_reassessment.csv`` (resolved
    through ``get_data_path``) without the index column.
    """

    output_columns = ["year", "province", "age", "sex", "prob"]

    # Collect one frame per province and concatenate once, rather than growing an
    # initially empty frame (which relies on pandas' deprecated empty-frame dtype handling).
    province_frames = []

    for province in PROVINCES:
        max_year = MAX_YEARS[province]
        df_asthma = get_asthma_df(
            starting_year=STARTING_YEAR,
            max_year=max_year,
            min_age=MIN_ASTHMA_AGE,
            max_age=MAX_AGE,
            max_asthma_age=MAX_ASTHMA_AGE,
            stabilization_year=STABILIZATION_YEAR
        )
        province_frames.append(
            get_reassessment_data(
                df_asthma=df_asthma,
                province=province,
                # Passed explicitly so both calls agree on the first year.
                starting_year=STARTING_YEAR,
                max_year=max_year,
                max_age=MAX_AGE
            )
        )

    if province_frames:
        df_reassessment = pd.concat(province_frames, axis=0, ignore_index=True)
        df_reassessment = df_reassessment[output_columns]
    else:
        # No provinces configured: still write a header-only CSV.
        df_reassessment = pd.DataFrame({
            "year": np.array([], dtype=int),
            "province": [],
            "age": np.array([], dtype=int),
            "sex": [],
            "prob": []
        })

    df_reassessment.to_csv(get_data_path("processed_data/asthma_reassessment.csv"), index=False)


# Regenerate the reassessment CSV when this module is executed as a script.
if __name__ == "__main__":
    generate_reassessment_data()