Source code for hapc.hazard

"""Discrete-time logistic hazard via HAPC (``family="logit-hazard"``).

Provides :func:`hazard_hapc`, the Python counterpart of R ``hazard.hapc()``.
It is a thin wrapper around :func:`hapc.cv_hapc` with ``family="binomial"``:
right-censored survival data ``(X, T, Delta)`` are expanded into a
person-period table (one row per subject-per-interval-at-risk) whose binary
response is the discrete hazard indicator, with the visit time prepended as the
first HAL covariate.

Data contract
-------------
For subject ``i`` let ``T_event_i`` be the latent event time and ``C_i`` the
censoring time. The user supplies only the observed quantities

    T_i     = min(T_event_i, C_i)        # observed time
    Delta_i = 1(T_event_i <= C_i)        # event indicator

together with baseline covariates ``X``. Times are assumed *discrete* (interval
/ visit indices). Each subject contributes one person-period row for every grid
time ``g <= T_i``; the hazard label is ``1`` only at the event interval
(``g == T_i`` and ``Delta_i == 1``) and ``0`` otherwise (including the last
interval of censored subjects).

``norm="sv"`` is **not implemented** for this family and raises
``NotImplementedError``.
"""

from typing import NamedTuple, Optional

import numpy as np

from .core import _C, design_hapc
from .cv import CVResult, cv_hapc
from .single import hapc as _hapc


[docs] class HazardResult(NamedTuple): """Output of :func:`hazard_hapc`. Attributes ---------- hazard : np.ndarray, shape (N,) Estimated discrete hazard for each person-period row (aligned with ``ids``/``times``); the cross-validated predictions at the winning λ. ids : np.ndarray, shape (N,) Subject index (0-based) for each person-period row. times : np.ndarray, shape (N,) Grid time for each person-period row. Y : np.ndarray, shape (N,) Binary hazard label for each person-period row. time_grid : np.ndarray, shape (K,) The discrete time grid used. lambdas : np.ndarray, shape (L,) CV λ grid. risk : np.ndarray, shape (L,) Mean cross-validated logistic deviance per λ. best_lambda : float Deviance-minimising λ. interior : bool ``True`` when ``best_lambda`` is strictly inside the grid (not at either endpoint) — a basic check that the grid brackets the optimum. cv : CVResult The full underlying :func:`hapc.cv_hapc` result. predict_hazard : np.ndarray or None, shape (m, K) Hazard surface for new subjects (only when ``predict`` is supplied). predict_survival : np.ndarray or None, shape (m, K) Survival curves ``S(t|x) = prod_{g<=t}(1 - hazard(g|x))`` for new subjects (only when ``predict`` is supplied). """ hazard: np.ndarray ids: np.ndarray times: np.ndarray Y: np.ndarray time_grid: np.ndarray lambdas: np.ndarray risk: np.ndarray best_lambda: float interior: bool cv: CVResult predict_hazard: Optional[np.ndarray] predict_survival: Optional[np.ndarray]
def _infer_time_grid(T: np.ndarray) -> np.ndarray: """Default discrete grid: ``min:max`` for integer times, else unique values.""" T = np.asarray(T, dtype=np.float64).ravel() if np.all(np.abs(T - np.round(T)) < 1e-9): return np.arange(int(round(T.min())), int(round(T.max())) + 1, dtype=np.float64) return np.unique(T) def _person_period(X: np.ndarray, T: np.ndarray, Delta: np.ndarray, grid: np.ndarray): """Expand ``(X, T, Delta)`` into the person-period design. Returns ``ids``, ``times``, the ``[time, X]`` design matrix ``Xpp`` and the binary hazard response ``Y``. Subject ``i`` contributes one row per grid time ``g <= T_i``; ``Y = 1`` only at the event interval. """ n = X.shape[0] ids, times, Xrows, Y = [], [], [], [] for i in range(n): gi = grid[grid <= T[i]] if gi.size == 0: gi = grid[:1] k = gi.size ids.append(np.full(k, i, dtype=np.int64)) times.append(gi) Xrows.append(np.repeat(X[i][None, :], k, axis=0)) Y.append(((gi == T[i]) & (Delta[i] == 1)).astype(np.float64)) ids = np.concatenate(ids) times = np.concatenate(times) Xpp = np.column_stack([times, np.vstack(Xrows)]) Yv = np.concatenate(Y) return ids, times, Xpp, Yv
[docs] def hazard_hapc(X: np.ndarray, T: np.ndarray, Delta: np.ndarray, norm: str = "1", max_degree: int = 1, npcs: Optional[int] = None, time_grid: Optional[np.ndarray] = None, log_lambda_min: float = -4, log_lambda_max: float = -1, grid_length: int = 15, nfolds: int = 5, predict: Optional[np.ndarray] = None, center: bool = True, verbose: bool = False, max_iter: int = 5000, tol: float = 1e-3, step_factor: float = 0.8) -> HazardResult: """Discrete-time logistic hazard HAPC fit (``family="logit-hazard"``). Convenience wrapper around :func:`hapc.cv_hapc` with ``family="binomial"``. The right-censored survival data ``(X, T, Delta)`` are expanded into a person-period table (one row per subject-per-interval-at-risk) whose binary response is the discrete hazard indicator, the visit time is prepended as the first HAL covariate, and the regularisation parameter ``lambda`` is chosen by cross-validated logistic deviance. R counterpart: ``hapc::hazard.hapc()``. Parameters ---------- X : np.ndarray, shape (n, p) Baseline covariates, one row per subject. T : np.ndarray, shape (n,) Observed times ``T_i = min(T_event_i, C_i)``. Assumed discrete. Delta : np.ndarray, shape (n,) Event indicators ``Delta_i in {0,1}`` (1 = event, 0 = right-censored). norm : {"1", "2"}, default "1" ``"1"`` = logistic LASSO, ``"2"`` = logistic ridge. ``"sv"`` raises ``NotImplementedError``. max_degree : int, default 1 HAL interaction order over ``[time, X]``. npcs : int, optional Number of principal components. Defaults to the number of person-period rows (capped internally as in :func:`hapc.cv_hapc`). time_grid : np.ndarray, optional Discrete time grid (risk-set grid). Defaults to ``min(T):max(T)`` when ``T`` is integer-valued, else ``np.unique(T)``. Subjects are assumed at risk from ``min(time_grid)`` onward. log_lambda_min, log_lambda_max, grid_length : float / int Log-λ CV grid (defaults ``-4``, ``-1``, ``15``). nfolds : int, default 5 Number of CV folds. predict : np.ndarray, optional, shape (m, p) Baseline covariates for *new subjects*. When supplied the model (refit at the CV-selected λ) is evaluated on the full time grid for each new subject, returning the hazard surface and the implied survival curves. center, verbose, max_iter, tol, step_factor : Passed through to :func:`hapc.cv_hapc` / :func:`hapc.hapc`. Returns ------- HazardResult See :class:`HazardResult`. ``hazard`` holds the cross-validated discrete hazard for each person-period row; ``predict_hazard`` / ``predict_survival`` are populated when ``predict`` is supplied. Notes ----- **Model.** The discrete hazard is the conditional event probability in interval ``t`` given survival up to ``t``, lambda(t | x) = P(T_event = t | T_event >= t, X = x), modelled on the logit scale by a Highly Adaptive Principal Components fit ``f`` of the augmented covariate ``(t, x)``: logit lambda(t | x) = f(t, x). The HAL basis spans indicator (and, for ``max_degree > 1``, interaction) tensor products in ``(t, x)``, so the time effect, the covariate effects and their interactions are estimated nonparametrically; the L1/L2 penalty (``norm``) controls smoothness and is tuned by cross-validation. **Person-period likelihood.** Under independent right-censoring the observed-data likelihood factorises over the at-risk intervals, prod_i prod_{t <= T_i} lambda(t | x_i) ** Y_it * (1 - lambda(t | x_i)) ** (1 - Y_it), with ``Y_it = 1(T_event_i = t)``. This is exactly the Bernoulli (logistic) likelihood of the expanded person-period table, so fitting a binomial HAPC model to ``Y_it`` against ``(t, x_i)`` estimates the discrete hazard (Cox 1972; Brown 1975; Allison 1982). **Survival.** The conditional survival function follows from the estimated hazard by the product-limit relation S(t | x) = P(T_event > t | x) = prod_{s <= t} (1 - lambda(s | x)), returned in ``predict_survival`` for new subjects when ``predict`` is given. References ---------- Cox, D. R. (1972). Regression models and life-tables. *JRSS B*, 34(2), 187-220. Brown, C. C. (1975). On the use of indicator variables for studying the time-dependence of parameters in a response-time model. *Biometrics*, 31(4), 863-872. Allison, P. D. (1982). Discrete-time methods for the analysis of event histories. *Sociological Methodology*, 13, 61-98. Singer, J. D. and Willett, J. B. (2003). *Applied Longitudinal Data Analysis*. Oxford University Press. Benkeser, D. and van der Laan, M. (2016). The Highly Adaptive Lasso estimator. *IEEE DSAA*, 689-696. Examples -------- >>> import numpy as np >>> from hapc import hazard_hapc >>> rng = np.random.default_rng(0) >>> n = 200 >>> X = np.column_stack([rng.uniform(size=n), rng.integers(0, 2, n)]).astype(float) >>> grid = np.arange(1, 7) >>> def haz(t, x): return 1 / (1 + np.exp(-(-2.6 + 0.3 * t + 1.3 * x[0] - 0.9 * x[1]))) >>> Tev = np.full(n, grid.max()) >>> for i in range(n): ... for t in grid: ... if rng.random() < haz(t, X[i]): ... Tev[i] = t; break >>> C = rng.choice(grid, n) >>> Tobs = np.minimum(Tev, C); Delta = (Tev <= C).astype(float) >>> fit = hazard_hapc(X, Tobs, Delta, norm="1", max_degree=2, time_grid=grid) >>> bool(fit.best_lambda > 0) True """ norm = str(norm) if norm == "sv": raise NotImplementedError( "family='logit-hazard' (discrete-time logistic hazard) is not " "implemented for norm='sv'; use norm='1' or norm='2'." ) if norm not in {"1", "2"}: raise ValueError( f"family='logit-hazard' supports norm in {{'1','2'}}; got '{norm}'" ) X = _C(X) T = np.asarray(T, dtype=np.float64).ravel() Delta = np.asarray(Delta, dtype=np.float64).ravel() n, p = X.shape if T.size != n: raise ValueError(f"T must have {n} elements, got {T.size}") if Delta.size != n: raise ValueError(f"Delta must have {n} elements, got {Delta.size}") if not np.all(np.isin(Delta, (0.0, 1.0))): raise ValueError("Delta must be a 0/1 event indicator.") if time_grid is None: grid = _infer_time_grid(T) else: grid = np.unique(np.asarray(time_grid, dtype=np.float64).ravel()) if grid.size < 2: raise ValueError(f"Need at least two distinct time points; got {grid.size}.") if T.max() > grid.max(): raise ValueError("Some observed times exceed max(time_grid); supply a " "wider 'time_grid'.") ids, times, Xpp, Yv = _person_period(X, T, Delta, grid) if npcs is None: # Default npcs = numerical rank of the person-period HAL design. The # expansion creates many duplicate rows (discrete time x repeated # baseline covariates), so the kernel is rank-deficient; keeping the # null-space directions makes the 1/d prediction reconstruction blow up # (notably for norm="2"). Dropping the near-zero singular values is both # numerically safe and a sensible amount of regularisation. des0 = design_hapc(Xpp, int(max_degree), int(Xpp.shape[0]), center=center) d0 = np.asarray(des0.d, dtype=np.float64) npcs = int(max(1, np.sum(d0 > 1e-7 * d0[0]))) cv = cv_hapc( Xpp, Yv, family="binomial", norm=norm, max_degree=max_degree, npcs=npcs, log_lambda_min=log_lambda_min, log_lambda_max=log_lambda_max, grid_length=grid_length, nfolds=nfolds, predict=Xpp, center=center, verbose=verbose, max_iter=max_iter, tol=tol, step_factor=step_factor, ) hazard_train = np.asarray(cv.predictions, dtype=np.float64).ravel() lambdas = np.asarray(cv.lambdas, dtype=np.float64) risk = np.asarray(cv.mses, dtype=np.float64) best_lambda = float(cv.best_lambda) best_idx = int(np.argmin(risk)) interior = bool(0 < best_idx < lambdas.size - 1) predict_hazard = predict_survival = None if predict is not None: Xnew = _C(predict) if Xnew.shape[1] != p: raise ValueError("predict must have the same number of columns as X.") m, K = Xnew.shape[0], grid.size # Full-grid expansion: row order = (subject 0: all times), (subject 1: ...) new_times = np.tile(grid, m) new_X = np.repeat(Xnew, K, axis=0) newXpp = np.column_stack([new_times, new_X]) fit = _hapc( Xpp, Yv, family="binomial", norm=norm, max_degree=max_degree, npcs=npcs, lambda_=best_lambda, predict=newXpp, center=center, verbose=verbose, max_iter=max_iter, tol=tol, step_factor=step_factor, ) haz = np.asarray(fit.probabilities, dtype=np.float64).reshape(m, K) predict_hazard = haz predict_survival = np.cumprod(1.0 - haz, axis=1) return HazardResult( hazard=hazard_train, ids=ids, times=times, Y=Yv, time_grid=grid, lambdas=lambdas, risk=risk, best_lambda=best_lambda, interior=interior, cv=cv, predict_hazard=predict_hazard, predict_survival=predict_survival, )
__all__ = ["HazardResult", "hazard_hapc"]