# Source code for larch.model.troubleshooting

from __future__ import annotations

import logging
from typing import Literal

import numpy as np
import pandas as pd
import xarray as xr

from ..util import dictx
from .basemodel import BaseModel
from .numbamodel import NumbaModel as Model
from .possible_overspec import (
    PossibleOverspecificationError,
    compute_possible_overspecification,
)

# Module-level logger (named after this module); the checks in this file use it
# to report the data and specification problems they find.
logger = logging.getLogger(__name__)


def doctor(
    model: Model,
    *,
    repair_ch_av: Literal["?", "+", "-", "!"] | None = "?",
    repair_ch_zq: Literal["?", "-", "!"] | None = None,
    repair_av_zq: Literal["?", "-", "!"] | None = None,
    repair_noch_nzwt: Literal["?", "-", "*", "+", "!"] | None = "?",
    repair_nan_wt: Literal["?", True, "!"] | None = "?",
    repair_nan_data_co: Literal["?", True, "!"] | None = "?",
    check_low_variance_data_co: Literal["?", "!"] | None = None,
    check_overspec: Literal["?", "!", None] = None,
    repair_nan_utility: Literal["?", True, "!"] | None = "?",
    verbose: int = 3,
    warning_stacklevel: int = 2,
) -> tuple[Model, dictx]:
    """
    Diagnose data problems with a model.

    The doctor will check for common data problems that can cause numerical
    instability in a model.  The doctor will return a collection of the
    problems found, and optionally repair them.  Any check whose setting is
    None is skipped entirely.

    Parameters
    ----------
    model : larch.Model
        The model to diagnose.
    repair_ch_av : {'?', '+', '-', '!'}, default '?'
        How to repair the data if some observations are chosen but not
        available.  The plus ('+') will make the conflicting alternatives
        available, overriding the availability status.  The minus ('-') will
        make them not chosen (possibly leaving no chosen alternative).  A
        question mark ('?') effects no repair, and simply emits a warning
        without interrupting program execution.  An exclamation mark will
        raise an error if there are any conflicts.
    repair_ch_zq : {'?', '-', '!'}, default None
        How to repair the data if some observations are chosen but have zero
        quantity.  The minus ('-') will make alternatives with zero quantity
        not chosen (possibly leaving no chosen alternative).  A question mark
        ('?') effects no repair, and simply emits a warning.  An exclamation
        mark ('!') will raise an error if there are any conflicts.
    repair_av_zq : {'?', '-', '!'}, default None
        How to repair the data if some observations are available but have
        zero quantity.  The minus ('-') will make alternatives with zero
        quantity not available (possibly leaving no available alternatives).
        A question mark ('?') effects no repair, and simply emits a warning.
        An exclamation mark ('!') will raise an error if there are any
        conflicts.
    repair_noch_nzwt : {'?', '-', '*', '!'}, default '?'
        How to repair the data if some observations have no choice but have
        some weight.  Minus ('-') will make the weight zero when there is no
        choice.  Star ('*') will also make the weight zero, plus autoscale
        all remaining weights.  A question mark ('?') effects no repair, and
        simply emits a warning.  An exclamation mark ('!') will raise an
        error if there are any such cases.  Note that plus ('+') is *not* a
        repair for this check: the underlying check raises a ValueError for
        it whenever a problem is found, because a missing choice cannot be
        resolved by assuming one.
    repair_nan_wt : {'?', '!', True}, default '?'
        How to repair the data if some weight values are NaN.  Any true value
        other than "?" or "!" will make NaN values in weight zero.  The
        question mark simply emits a warning if there are NaN values found,
        while the exclamation mark will raise an error.
    repair_nan_data_co : {'?', '!', True}, default '?'
        How to repair the data if some data_co values are NaN.  Any true
        value other than "?" or "!" will make NaN values in data_co zero.
        The question mark simply emits a warning if there are NaN values
        found, while the exclamation mark will raise an error.
    check_low_variance_data_co : {'?', '!'}, default None
        Check if any data_co columns have very low variance.  No repairs are
        available for this check.  The question mark simply emits a warning
        if there are issues found, while the exclamation mark will raise an
        error.
    check_overspec : {'?', '!'}, default None
        Check model for possible over-specification.  No repairs are
        available for this check.  A question mark ('?') simply emits a
        warning if a possible over-specification is found.  An exclamation
        mark ('!') will raise an error if possible over-specification is
        found.  This is considered a "deep" check, and will only be run if
        there are no known data problems found by the other checks.
    repair_nan_utility : {'?', '!', True}, default '?'
        How to repair the data if some utility values are NaN at current
        parameters.  Any true value other than "?" or "!" will take
        alternatives with NaN values in utility, and make them unavailable.
        The question mark simply emits a warning if there are NaN values
        found, while the exclamation mark will raise an error.  This is
        considered a "deep" check, and will only be run if there are no known
        data problems found by the other checks.
    verbose : int, default 3
        The number of example rows to list for each problem.
    warning_stacklevel : int, default 2
        The stacklevel for warnings; it is offset by 2 before being handed to
        the logger, to account for the frames added inside this function.

    Returns
    -------
    model : larch.Model
        The model with revised dataset attached.
    problems : dict
        A dictionary of problems found, with the key being the name of the
        check that found the problem and the value being the diagnosis
        DataFrame returned by that check.

    Raises
    ------
    TypeError
        If the model is not a Model instance.
    ValueError
        If a repair is set to '!' and there are any conflicts found, if
        `repair_ch_zq` or `repair_av_zq` is not one of its allowed settings,
        if `repair_noch_nzwt` is '+' and a problem is found, or if the model
        has no data loaded.
    PossibleOverspecificationError
        If `check_overspec` is '!' and a possible over-specification is found.
    """
    problems = dictx()
    if not isinstance(model, Model):
        raise TypeError("the doctor requires a Model instance to diagnose")

    def apply_repair(repair, repair_func):
        # Only `model` is rebound here; `problems` is mutated in place and
        # `verbose` is read-only, so neither needs a nonlocal declaration.
        nonlocal model
        if repair is None:
            return
        check_name = repair_func.__name__
        logger.info(f"checking for {check_name}")
        model, diagnosis = repair_func(model, repair=repair, verbose=verbose)
        if diagnosis is not None:
            logger.warning(
                f"problem: {check_name} has ({len(diagnosis)} issues)",
                stacklevel=warning_stacklevel + 2,
            )
            problems[check_name] = diagnosis

    apply_repair(repair_ch_av, chosen_but_not_available)
    apply_repair(repair_noch_nzwt, nothing_chosen_but_nonzero_weight)
    apply_repair(repair_nan_data_co, nan_data_co)
    apply_repair(repair_nan_wt, nan_weight)
    apply_repair(check_low_variance_data_co, low_variance_data_co)
    apply_repair(repair_ch_zq, chosen_but_zero_quantity)
    apply_repair(repair_av_zq, available_but_zero_quantity)

    if not problems:
        # Following are deep checks, which require actually evaluating the model.
        # We run these after the data checks above, so we can skip this work if there
        # are already known data problems.
        apply_repair(repair_nan_utility, nan_utility)
        apply_repair(check_overspec, overspecification)

    return model, problems
def chosen_but_not_available(
    model: Model, repair: Literal["?", "+", "-", "!"] = "?", verbose: int = 3
) -> tuple[Model, pd.DataFrame | None]:
    """
    Look for observations where an unavailable alternative is marked chosen.

    An alternative counts as conflicting when its "av" value equals zero
    while its "ch" value is positive.  Unavailable alternatives have their
    utility set to negative infinity, so a single such conflict nominally
    drives the model log-likelihood to negative infinity regardless of the
    parameter values (some compute engines, e.g. JAX, may clip extreme values
    and not actually report negative infinity).

    Parameters
    ----------
    model : larch.Model
        The model to check.
    repair : {'?', '+', '-', '!'}, default '?'
        What to do when conflicts are found.  Plus ('+') marks the
        conflicting alternatives as available.  Minus ('-') marks them as not
        chosen (possibly leaving a case with no chosen alternative).  An
        exclamation mark ('!') raises an error.  A question mark ('?'), or
        any other value, changes nothing and just returns the diagnosis.
    verbose : int, default 3
        The number of example rows to list for each problem.

    Returns
    -------
    model : larch.Model
        The same model, with its dataset edited in place if a repair ran.
    diagnosis : pd.DataFrame or None
        The number of bad instances, by alternative, and some example rows;
        None when there are no conflicts.

    Raises
    ------
    ValueError
        If no data is loaded, or if the repair is set to '!' and there are
        any conflicts found.
    """
    dataset = model.dataset
    if dataset is None:
        raise ValueError("data not loaded")
    assert isinstance(dataset, xr.Dataset)

    conflict = (dataset["av"] == 0) & (dataset["ch"] > 0)
    conflict_count = conflict.sum(dataset.dc.CASEID)

    # Nothing to report or repair when no conflict exists anywhere.
    if not (conflict_count.sum() > 0):
        return model, None

    case_pos, alt_pos = np.where(conflict)

    diagnosis = pd.DataFrame(conflict_count[conflict_count > 0], columns=["n"])
    diagnosis.insert(0, "altid", None)

    # The frame has one positional row per alternative with conflicts, so the
    # output row counter only advances for those alternatives.
    out_row = 0
    for alt_index, alt_label in enumerate(conflict.coords[dataset.dc.ALTID]):
        if not (conflict_count[alt_index] > 0):
            continue
        examples = case_pos[alt_pos == alt_index][:verbose]
        diagnosis.loc[out_row, "example rows"] = ", ".join(str(j) for j in examples)
        # Unwrap zero-dimensional coordinate values into plain scalars.
        if isinstance(alt_label, xr.DataArray) and alt_label.ndim == 0:
            alt_label = alt_label.item()
        diagnosis.loc[out_row, "altid"] = alt_label
        out_row += 1

    if repair == "!":
        raise ValueError(
            "some observed choices are not available (try `repair_ch_av`)"
        )
    if repair == "+":
        model.dataset["av"].data[
            conflict.values[:, : model.dataset["av"].shape[1]]
        ] = 1
    elif repair == "-":
        model.dataset["ch"].data[
            conflict.values[:, : model.dataset["ch"].shape[1]]
        ] = 0

    return model, diagnosis
def chosen_but_zero_quantity(
    model: Model, repair: Literal["?", "-", "!"] | None = None, verbose: int = 3
):
    """
    Look for observations where a zero-quantity alternative is marked chosen.

    Alternatives that have zero quantity have utility values that end up as
    negative infinity, regardless of whether the alternative would otherwise
    be available.  This is generally a data problem rather than a consequence
    of the current parameter values.  A single chosen alternative with zero
    quantity nominally drives the model log-likelihood to negative infinity
    (some compute engines, e.g. JAX, may clip extreme values and not actually
    report negative infinity).

    Parameters
    ----------
    model : BaseModel
        The model to check.
    repair : {'?', '-', '!'}
        What to do when conflicts are found.  Minus ('-') logs a warning and
        makes alternatives with zero quantity not chosen (possibly leaving no
        chosen alternative).  A question mark ('?') effects no repair and
        only logs a warning.  An exclamation mark ('!') raises an error.
        None effects no repair and logs nothing.
    verbose : int, default 3
        The number of example rows to list for each problem.

    Returns
    -------
    model : BaseModel
        The same model, with its dataset edited in place if a repair ran.
    diagnosis : pd.DataFrame or None
        The number of bad instances, by alternative, and some example rows;
        None when there are no conflicts or the model has no quantity terms.

    Raises
    ------
    ValueError
        If the repair setting is not one of the allowed values, if no data is
        loaded, or if the repair is set to '!' and there are any conflicts
        found.
    """
    allowed_settings = ("?", "-", "!", None)
    if repair not in allowed_settings:
        raise ValueError(f'invalid repair setting "{repair}"')

    if not model.quantity_ca:
        # no quantities, so no problem
        return model, None

    quantities = model.quantity()
    is_zero_q = np.asarray(quantities[:, : model.graph.n_elementals()] == 0)

    dataset = model.dataset
    if dataset is None:
        raise ValueError("data not loaded")
    assert isinstance(dataset, xr.Dataset)

    conflict = (dataset["ch"] > 0) & is_zero_q
    conflict_by_alt = conflict.sum(dataset.dc.CASEID)

    if not (conflict.sum() > 0):
        return model, None

    case_pos, alt_pos = np.where(conflict)
    flagged = conflict_by_alt[conflict_by_alt > 0]
    diagnosis = flagged.to_pandas().rename("n").to_frame()
    for alt_index, alt_label in enumerate(dataset.dc.altids()):
        if conflict_by_alt[alt_index] > 0:
            examples = case_pos[alt_pos == alt_index][:verbose]
            diagnosis.loc[alt_label, "example rows"] = ", ".join(
                str(j) for j in examples
            )

    msg = "chosen_but_zero_quantity: some observed choices have zero quantity.\n"
    # Prefer a pretty table when `tabulate` is installed; otherwise fall back
    # to the plain pandas rendering.
    try:
        from tabulate import tabulate
    except ImportError:
        msg += diagnosis.to_string()
    else:
        msg += tabulate(diagnosis, headers="keys", tablefmt="fancy_outline")

    if repair == "!":
        msg += "\nTry `repair_ch_zq` to resolve."
        raise ValueError(msg)
    if repair == "?":
        logger.warning(msg)
    elif repair == "-":
        repair_msg = msg.replace(
            "some observed choices", "zeroing out observed choices that"
        )
        logger.warning(repair_msg)
        model.dataset["ch"].values[conflict] = 0

    return model, diagnosis
def available_but_zero_quantity(
    model: Model, repair: Literal["?", "-", "!"] | None = None, verbose: int = 3
):
    """
    Look for observations where a zero-quantity alternative is available.

    Alternatives that have zero quantity have utility values that end up as
    negative infinity, regardless of whether the alternative would otherwise
    be available.  This is generally a data problem rather than a consequence
    of the current parameter values.  If even one observation is available
    but has zero quantity, the first derivative of the model log-likelihood
    may be incalculable, and parameter estimation may fail.

    Parameters
    ----------
    model : BaseModel
        The model to check.
    repair : {'?', '-', '!'}
        What to do when conflicts are found.  Minus ('-') logs a warning and
        makes alternatives with zero quantity not available (possibly leaving
        no available alternatives).  A question mark ('?') effects no repair
        and only logs a warning.  An exclamation mark ('!') raises an error.
        None effects no repair and logs nothing.
    verbose : int, default 3
        The number of example rows to list for each problem.

    Returns
    -------
    model : BaseModel
        The same model, with its dataset edited in place if a repair ran.
    diagnosis : pd.DataFrame or None
        The number of bad instances, by alternative, and some example rows;
        None when there are no conflicts or the model has no quantity terms.

    Raises
    ------
    ValueError
        If the repair setting is not one of the allowed values, if no data is
        loaded, or if the repair is set to '!' and there are any conflicts
        found.
    """
    allowed_settings = ("?", "-", "!", None)
    if repair not in allowed_settings:
        raise ValueError(f'invalid repair setting "{repair}"')

    if not model.quantity_ca:
        # no quantities, so no problem
        return model, None

    quantities = model.quantity()
    is_zero_q = np.asarray(quantities[:, : model.graph.n_elementals()] == 0)

    dataset = model.dataset
    if dataset is None:
        raise ValueError("data not loaded")
    assert isinstance(dataset, xr.Dataset)

    conflict = (dataset["av"] > 0) & is_zero_q
    conflict_by_alt = conflict.sum(dataset.dc.CASEID)

    if not (conflict.sum() > 0):
        return model, None

    case_pos, alt_pos = np.where(conflict)
    flagged = conflict_by_alt[conflict_by_alt > 0]
    diagnosis = flagged.to_pandas().rename("n").to_frame()
    for alt_index, alt_label in enumerate(dataset.dc.altids()):
        if conflict_by_alt[alt_index] > 0:
            examples = case_pos[alt_pos == alt_index][:verbose]
            diagnosis.loc[alt_label, "example rows"] = ", ".join(
                str(j) for j in examples
            )

    msg = "available_but_zero_quantity: some available choices have zero quantity.\n"
    # Prefer a pretty table when `tabulate` is installed; otherwise fall back
    # to the plain pandas rendering.
    try:
        from tabulate import tabulate
    except ImportError:
        msg += diagnosis.to_string()
    else:
        msg += tabulate(diagnosis, headers="keys", tablefmt="fancy_outline")

    if repair == "!":
        msg += "\nTry `repair_av_zq` to resolve."
        raise ValueError(msg)
    if repair == "?":
        logger.warning(msg)
    elif repair == "-":
        repair_msg = msg.replace(
            "some available choices", "zeroing out available choices that"
        )
        logger.warning(repair_msg)
        model.dataset["av"].values[conflict] = 0

    return model, diagnosis
def nothing_chosen_but_nonzero_weight(
    model, repair: Literal["?", "-", "*", "!"] = "?", verbose=3
):
    """
    Look for cases that carry positive weight but have no chosen alternative.

    The check only runs when the dataset holds both "wt" and "ch" variables.
    A case is flagged when its "ch" values sum to zero across alternatives
    while its "wt" value is positive.

    Parameters
    ----------
    model : BaseModel
        The model to check.
    repair : {'?', '-', '*', '!'}
        What to do when flagged cases are found.  Minus ('-') sets the weight
        to zero for every case with no choice.  Star ('*') does the same and
        then calls the dataset's weight autoscaling.  A question mark ('?')
        effects no repair and only logs a warning.  An exclamation mark ('!')
        raises an error.  Plus ('+') is rejected with an error, because a
        missing choice cannot be resolved by assuming one.
    verbose : int, default 3
        The number of example rows to list for each problem.

    Returns
    -------
    model : BaseModel
        The same model, with its dataset edited in place if a repair ran.
    diagnosis : pd.DataFrame or None
        A one-row frame with the number of flagged cases and some example
        rows; None when nothing is flagged.

    Raises
    ------
    ValueError
        If no data is loaded, or if the repair is set to '!' or '+' and there
        are any flagged cases.
    """
    diagnosis = None
    dataset = model.dataset
    if dataset is None:
        raise ValueError("data not loaded")
    assert isinstance(dataset, xr.Dataset)

    if not ("wt" in dataset and "ch" in dataset):
        return model, diagnosis

    no_choice = dataset["ch"].sum(dataset.dc.ALTID) == 0
    flagged = no_choice & (dataset["wt"] > 0)
    if not (flagged.sum() > 0):
        return model, diagnosis

    flagged_rows = np.where(flagged)[0]
    row_label = "nothing_chosen_some_weight"
    diagnosis = pd.DataFrame([flagged.sum()], columns=["n"], index=[row_label])
    diagnosis.loc[row_label, "example rows"] = ", ".join(
        str(j) for j in flagged_rows[:verbose]
    )

    msg = "nothing_chosen_but_nonzero_weight: some cases have no choice but non-zero weight.\n"
    # Prefer a pretty table when `tabulate` is installed; otherwise fall back
    # to the plain pandas rendering.
    try:
        from tabulate import tabulate
    except ImportError:
        msg += diagnosis.to_string()
    else:
        msg += tabulate(diagnosis, headers="keys", tablefmt="fancy_outline")

    if repair == "!":
        msg += "\nTry `repair_noch_nzwt` to resolve."
        raise ValueError(msg)
    if repair == "+":
        raise ValueError(
            "cannot resolve nothing_chosen_but_nonzero_weight by assuming some choice"
        )
    if repair == "?":
        logger.warning(msg)
    elif repair in ("-", "*"):
        # Both repairs zero the weight of every case with no choice; only the
        # star variant rescales the weights afterwards.
        model.dataset["wt"].data[no_choice] = 0
        if repair == "*":
            model.dataset.dc.autoscale_weights()

    return model, diagnosis
def nan_data_co(
    model: Model, repair: Literal["?", True, "!"] = "?", verbose: int = 3
) -> tuple[Model, pd.DataFrame | None]:
    """
    Look for NaN values in the "co" data of the model's dataset.

    The check is skipped when the dataset has no "co" variable.

    Parameters
    ----------
    model : larch.Model
        The model to check.
    repair : {"?", "!", True}
        Whether to repair the data.  Any true value other than "?" or "!"
        will replace NaN values in "co" with zero.  The question mark only
        logs a warning if NaN values are found, while the exclamation mark
        raises an error.
    verbose : int, default 3
        The maximum number of affected columns to include in the diagnosis.

    Returns
    -------
    model : larch.Model
        The same model, with its dataset edited in place if a repair ran.
    diagnosis : pd.DataFrame or None
        NaN counts (column "n_nan") for up to `verbose` affected columns;
        None when no NaN values are found.

    Raises
    ------
    ValueError
        If no data is loaded, or if the repair is set to '!' and there are
        any NaN values found.
    """
    dataset = model.dataset
    if dataset is None:
        raise ValueError("data not loaded")
    assert isinstance(dataset, xr.Dataset)

    diagnosis = None
    if "co" not in dataset:
        return model, diagnosis

    nan_counts = np.isnan(dataset["co"]).sum(dataset.dc.CASEID)
    if nan_counts.sum():
        affected = nan_counts[nan_counts > 0]
        # NOTE(review): `.iloc` is not part of core xarray's DataArray API;
        # presumably an accessor registered elsewhere provides it -- confirm.
        diagnosis = affected.iloc[:verbose].to_pandas().rename("n_nan").to_frame()
        n = int(nan_counts.sum())
        if repair == "!":
            raise ValueError(
                f"nan_data_co: {n} instances have NaN values, try `repair_nan_data_co`"
            )
        if repair == "?":
            logger.warning(f"nan_data_co: {n} instances have NaN values")

    wants_fill = bool(repair) and repair != "?"
    if wants_fill:
        dataset["co"] = dataset["co"].fillna(0)

    return model, diagnosis
def nan_weight(
    model: Model, repair: Literal["?", True, "!"] = "?", verbose: int = 3
) -> tuple[Model, pd.DataFrame | None]:
    """
    Look for NaN values in the "wt" (weight) data of the model's dataset.

    The check is skipped when the dataset has no "wt" variable.

    Parameters
    ----------
    model : larch.Model
        The model to check.
    repair : {"?", "!", True}
        Whether to repair the data.  Any true value other than "?" or "!"
        will replace NaN values in "wt" with zero.  The question mark only
        logs a warning if NaN values are found, while the exclamation mark
        raises an error.
    verbose : int, default 3
        The maximum number of NaN weight entries to include in the diagnosis.

    Returns
    -------
    model : larch.Model
        The same model, with its dataset edited in place if a repair ran.
    diagnosis : pd.DataFrame or None
        Up to `verbose` of the NaN weight entries; None when no NaN values
        are found.

    Raises
    ------
    ValueError
        If no data is loaded, or if the repair is set to '!' and there are
        any NaN values found.
    """
    dataset = model.dataset
    if dataset is None:
        raise ValueError("data not loaded")
    assert isinstance(dataset, xr.Dataset)

    diagnosis = None
    if "wt" not in dataset:
        return model, diagnosis

    nan_wt = int(np.isnan(dataset["wt"]).sum())
    if nan_wt:
        nan_entries = dataset["wt"][np.isnan(dataset["wt"])]
        # NOTE(review): `.iloc` is not part of core xarray's DataArray API;
        # presumably an accessor registered elsewhere provides it -- confirm.
        diagnosis = nan_entries.iloc[:verbose].to_pandas().to_frame()
        if repair == "!":
            raise ValueError(
                f"nan_weight: {nan_wt} instances have NaN values, try `repair_nan_wt`"
            )
        if repair == "?":
            logger.warning(f"nan_weight: {nan_wt} instances have NaN values")

    wants_fill = bool(repair) and repair != "?"
    if wants_fill:
        dataset["wt"] = dataset["wt"].fillna(0)

    return model, diagnosis
def low_variance_data_co(
    model: Model,
    repair: Literal["?", "!"] = "?",
    verbose: int = 3,
    *,
    threshold: float = 1e-3,
):
    """
    Check if any data_co columns have very low variance.

    The variance of each "co" variable is computed across cases, and every
    column whose variance is strictly below `threshold` is reported.  The
    check is skipped when the dataset has no "co" variable.

    Parameters
    ----------
    model : larch.Model
        The model to check.
    repair : {"?", "!"}
        No repairs are available for this check.  The question mark only
        logs a warning if there are issues found, while the exclamation mark
        raises an error.
    verbose : int, default 3
        Accepted for a uniform signature across the checks; currently unused,
        so every low-variance column is listed.
    threshold : float, default 1e-3
        Columns with a variance strictly less than this value are flagged.

    Returns
    -------
    model : larch.Model
        The model, unchanged.
    diagnosis : pd.DataFrame or None
        The variance of each flagged column (column "variance"); None when
        no column is flagged.

    Raises
    ------
    ValueError
        If no data is loaded, or if the repair is set to '!' and there are
        any low variance columns found.
    """
    dataset = model.dataset
    if dataset is None:
        raise ValueError("data not loaded")
    assert isinstance(dataset, xr.Dataset)

    diagnosis = None
    if "co" in dataset:
        variance = dataset["co"].var(dataset.dc.CASEID).to_pandas().rename("variance")
        # NaN variances compare False here, so they are never flagged.
        is_low = variance < threshold
        if is_low.any():
            diagnosis = variance[is_low].to_frame()
            if repair == "?":
                logger.warning(
                    f"low_variance_data_co: {len(diagnosis)} columns have low variance"
                )
            elif repair == "!":
                raise ValueError(
                    f"low_variance_data_co: {len(diagnosis)} columns have low variance"
                )
    return model, diagnosis
def nan_utility(model: Model, repair: Literal["?", True, "!"] = "?", verbose: int = 3):
    """
    Look for available alternatives whose utility is NaN at current parameters.

    The model's utility is evaluated and trimmed to as many columns as the
    "av" data has; an entry is flagged when the utility is NaN and the
    matching "av" value is positive.

    Parameters
    ----------
    model : larch.Model
        The model to check.
    repair : {'?', '!', True}
        Whether to repair the data.  Any true value other than "?" or "!"
        will take alternatives with NaN values in utility, and make them
        unavailable.  The question mark only logs a warning if NaN values
        are found, while the exclamation mark raises an error.
    verbose : int, default 3
        Accepted for a uniform signature across the checks; not used here.

    Returns
    -------
    model : larch.Model
        The same model, with its dataset edited in place if a repair ran.
    diagnosis : pd.DataFrame or None
        The count of flagged entries per alternative (column "n"); None when
        nothing is flagged.

    Raises
    ------
    ValueError
        If no data is loaded, or if the repair is set to '!' and there are
        any NaN values found.
    """
    dataset = model.dataset
    if dataset is None:
        raise ValueError("data not loaded")
    assert isinstance(dataset, xr.Dataset)

    alt_count = dataset["av"].shape[1]
    utility = model.utility()[:, :alt_count]
    flagged = np.isnan(utility) & (dataset["av"] > 0)
    n_flagged = int(flagged.sum())

    if not n_flagged:
        return model, None

    diagnosis = flagged.sum(dataset.dc.CASEID).to_pandas().rename("n").to_frame()
    summary = f"nan_utility: {n_flagged} available alternatives have NaN utility values"

    if repair == "?":
        logger.warning(summary)
    elif repair == "!":
        raise ValueError(
            summary
            + ",\ntry using the Model.doctor `repair_nan_utility` "
            + "argument, set it to True to make them unavailable"
        )
    elif repair:
        model.dataset["av"].data[flagged] = 0

    return model, diagnosis
def overspecification(
    model: BaseModel, repair: Literal["?", "!"] = "?", verbose: int = 3
):
    """
    Check model for possible over-specification.

    The (negated) Hessian of the log likelihood at the current parameter
    values has the rows and columns of held-fast parameters zeroed out, and
    is then handed to `compute_possible_overspecification`.  When problems
    are reported, they are also stored on the model as
    `model._possible_overspecification`.

    Parameters
    ----------
    model : larch.Model
        The model to check.
    repair : {'?', '!'}
        No automatic repairs are available for this check.  A question mark
        ('?') simply emits a warning if a possible over-specification is
        found.  An exclamation mark ('!') will raise an error if possible
        over-specification is found.
    verbose : int, default 3
        This is ignored for the overspecification check; all possible
        problems are listed.

    Returns
    -------
    model : larch.Model
        The model that was checked.
    diagnosis : pd.DataFrame or None
        A dataframe of possible over-specification problems in the model, or
        None when no problem is reported.  The index of this dataframe is a
        multi-index with the first level being the problem number, the second
        level being the eigenvalue, and the third level being the parameter
        name[s] of the non-zero elements of each problematic eigenvector.
        The column holds the non-zero eigenvector values.  If the only
        reported problems are "LinAlgError" records (no eigenvectors are
        available), the frame instead has one row per record, with
        "LinAlgError" in the eigenvalue level, the record's detail in the
        parameter level, and NaN as the eigenvector value.

    Raises
    ------
    PossibleOverspecificationError
        If the repair is set to '!' and a possible over-specification is
        found.
    """
    pvals = model.pvals
    locks = np.asarray(model.pholdfast.astype(bool))
    if model.compute_engine == "jax":
        _se, hess, _inv_hess = model.jax_param_cov(pvals)
    else:
        hess = -model.d2_loglike(pvals)
    # Work on a copy so that zeroing the held-fast rows and columns cannot
    # modify an array owned by the model.
    hess = np.asarray(hess).copy()
    hess[locks, :] = 0
    hess[:, locks] = 0

    overspec = compute_possible_overspecification(hess, model.pholdfast)
    if not overspec:
        return model, None

    eigen_problems = []  # (eigenvalue, eigenvector Series) pairs
    solver_failures = []  # (marker, detail) pairs from "LinAlgError" records
    possible_overspecification = []
    msg = "Model is possibly over-specified (hessian is nearly singular)."
    msg += "\nLook for problems in these parameters or groups of parameters:"
    for eigval, ox, eigenvec in overspec:
        # Test for `str` first so a numeric eigenvalue is never compared
        # against the marker string.
        if isinstance(eigval, str) and eigval == "LinAlgError":
            possible_overspecification.append((eigval, [ox], [""]))
            solver_failures.append((eigval, ox))
            # Previously these records were silently left out of the message.
            msg += f"\n- {eigval}: {ox}"
        else:
            paramset = list(np.asarray(model.pnames)[ox])
            possible_overspecification.append((eigval, paramset, eigenvec[ox]))
            eigen_problems.append(
                (
                    eigval,
                    pd.Series(eigenvec[ox], index=paramset, name="eigenvector"),
                )
            )
            msg += f"\n- Eigenvalue: {eigval}"
            max_len_param = max((len(p) for p in paramset), default=0)
            for p, z in zip(paramset, eigenvec[ox]):
                msg += f"\n {p:{max_len_param}s}: {z}"
    model._possible_overspecification = possible_overspecification

    if repair == "!":
        raise PossibleOverspecificationError(msg)
    elif repair == "?":
        logger.warning(msg)

    index_names = ["problem", "eigenvalue", "parameter"]
    if eigen_problems:
        diagnosis = pd.concat(
            [series for _eigval, series in eigen_problems],
            keys=[(n, eigval) for n, (eigval, _series) in enumerate(eigen_problems)],
            names=index_names,
        )
        if isinstance(diagnosis, pd.Series):
            diagnosis = diagnosis.to_frame()
    else:
        # Only "LinAlgError" records were reported.  `pd.concat` raises on an
        # empty list, so build the frame from those records directly.
        failure_index = pd.MultiIndex.from_tuples(
            [
                (n, marker, str(detail))
                for n, (marker, detail) in enumerate(solver_failures)
            ],
            names=index_names,
        )
        diagnosis = pd.DataFrame({"eigenvector": np.nan}, index=failure_index)
    return model, diagnosis