classifier

classifier ¶

sklearn-compatible classifier for ngboost-lightning.

LightningBoostClassifier ¶

LightningBoostClassifier(
    dist: type[Distribution] = Bernoulli,
    n_estimators: int = 500,
    learning_rate: float = 0.01,
    minibatch_frac: float = 1.0,
    col_sample: float = 1.0,
    natural_gradient: bool = True,
    tol: float = 0.0001,
    random_state: int | None = None,
    verbose: bool = True,
    verbose_eval: int = 100,
    num_leaves: int = 31,
    max_depth: int = -1,
    min_child_samples: int = 20,
    subsample: float = 1.0,
    colsample_bytree: float = 1.0,
    reg_alpha: float = 0.0,
    reg_lambda: float = 0.0,
    lgbm_params: dict[str, Any] | None = None,
    scoring_rule: ScoringRule | None = None,
    validation_fraction: float | None = None,
)

Bases: BaseEstimator, ClassifierMixin

Natural gradient boosting classifier powered by LightGBM.

Outputs full probability distributions over classes by boosting the parameters of a categorical distribution using the natural gradient of the log-likelihood.

Internally trains K-1 independent LightGBM boosters (one per logit parameter), faithfully replicating the NGBoost algorithm with LightGBM's histogram-based splitting for speed.

PARAMETER	DESCRIPTION
`dist`	Distribution class to use. Must be a subclass of `Categorical` (created via `k_categorical`). Defaults to `Bernoulli` (binary classification, K=2). For multiclass, use `k_categorical(K)` with the appropriate K. TYPE: `type[Distribution]` DEFAULT: `Bernoulli`
`n_estimators`	Number of boosting iterations. TYPE: `int` DEFAULT: `500`
`learning_rate`	Outer learning rate applied to each boosting step. TYPE: `float` DEFAULT: `0.01`
`minibatch_frac`	Fraction of training rows to subsample each iteration for gradient computation (NGBoost-style minibatch). 1.0 means no subsampling. TYPE: `float` DEFAULT: `1.0`
`col_sample`	Fraction of columns to subsample each boosting iteration. 1.0 means no column subsampling. All parameter boosters see the same feature subset each iteration. TYPE: `float` DEFAULT: `1.0`
`natural_gradient`	Whether to use the natural gradient (True) or the ordinary gradient (False). TYPE: `bool` DEFAULT: `True`
`tol`	Convergence tolerance. Training stops when the mean gradient norm falls below this value. TYPE: `float` DEFAULT: `0.0001`
`random_state`	Seed for reproducibility. Used for minibatch sampling and, when `validation_fraction` is set, for the automatic train/validation split. TYPE: `int \| None` DEFAULT: `None`
`verbose`	Whether to log training progress. TYPE: `bool` DEFAULT: `True`
`verbose_eval`	Log progress every this many iterations. TYPE: `int` DEFAULT: `100`
`num_leaves`	Maximum number of leaves per tree. TYPE: `int` DEFAULT: `31`
`max_depth`	Maximum tree depth. -1 means no limit. TYPE: `int` DEFAULT: `-1`
`min_child_samples`	Minimum number of samples in a leaf. TYPE: `int` DEFAULT: `20`
`subsample`	LightGBM-level row subsampling ratio per tree. TYPE: `float` DEFAULT: `1.0`
`colsample_bytree`	Column subsampling ratio per tree. TYPE: `float` DEFAULT: `1.0`
`reg_alpha`	L1 regularization on leaf weights. TYPE: `float` DEFAULT: `0.0`
`reg_lambda`	L2 regularization on leaf weights. TYPE: `float` DEFAULT: `0.0`
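`lgbm_params`	Additional parameters passed to each LightGBM Booster. TYPE: `dict[str, Any] \| None` DEFAULT: `None`
`scoring_rule`	Scoring rule forwarded to the boosting engine. `CRPScore` is not supported for classification; passing it makes `fit()` raise `ValueError`. TYPE: `ScoringRule \| None` DEFAULT: `None`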
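`validation_fraction`	Fraction of training data to hold out as validation for early stopping. If set and `X_val`/`y_val` are not provided to `fit()`, the training data is automatically split (stratified on `y`, seeded by `random_state`); if `early_stopping_rounds` is not passed to `fit()` in that case, it defaults to 20. Providing both `validation_fraction` and explicit `X_val`/`y_val` raises `ValueError`. Defaults to `None` (no auto-split). TYPE: `float \| None` DEFAULT: `None`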

ATTRIBUTE	DESCRIPTION
`engine_`	The fitted `NGBEngine` instance.
`classes_`	Array of unique class labels seen during fit.
`n_classes_`	Number of classes.
`n_features_in_`	Number of features seen during `fit`.
`n_estimators_`	Actual number of boosting iterations.
`init_params_`	Initial distribution parameters from `dist.fit(y)`.
`scalings_`	Line search scale factor per iteration.
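`train_loss_`	Training NLL per iteration.
`boosters_`	The fitted boosters, copied from the engine.
`val_loss_`	Validation loss history copied from the engine. Only set when the engine exposes one.
`best_val_loss_itr_`	The engine's `best_val_loss_itr_` value (`int \| None`). Only set together with `val_loss_`.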

Examples:

>>> from ngboost_lightning import LightningBoostClassifier
>>> clf = LightningBoostClassifier(n_estimators=100, learning_rate=0.05)
>>> clf.fit(X_train, y_train)
>>> probs = clf.predict_proba(X_test)
>>> labels = clf.predict(X_test)

Initialize the classifier. See class docstring for parameters.

Source code in ngboost_lightning/classifier.py

def __init__(
    self,
    dist: type[Distribution] = Bernoulli,
    n_estimators: int = 500,
    learning_rate: float = 0.01,
    minibatch_frac: float = 1.0,
    col_sample: float = 1.0,
    natural_gradient: bool = True,
    tol: float = 1e-4,
    random_state: int | None = None,
    verbose: bool = True,
    verbose_eval: int = 100,
    # LightGBM tree parameters exposed as first-class keyword arguments
    num_leaves: int = 31,
    max_depth: int = -1,
    min_child_samples: int = 20,
    subsample: float = 1.0,
    colsample_bytree: float = 1.0,
    reg_alpha: float = 0.0,
    reg_lambda: float = 0.0,
    # Escape hatch: extra LightGBM parameters not surfaced above
    lgbm_params: dict[str, Any] | None = None,
    # Scoring rule handed to the boosting engine at fit time
    scoring_rule: ScoringRule | None = None,
    # Fraction of the training data fit() holds out automatically
    validation_fraction: float | None = None,
) -> None:
    """Store the hyperparameters on the instance without modifying them.

    Every argument is assigned verbatim to an attribute of the same
    name. No validation is performed and no derived state is computed
    here; the values are only consumed later by ``fit``. See the class
    docstring for the meaning of each parameter.
    """
    # Plain one-to-one assignments only: each constructor argument maps to
    # an identically named attribute.
    self.dist = dist
    self.n_estimators = n_estimators
    self.learning_rate = learning_rate
    self.minibatch_frac = minibatch_frac
    self.col_sample = col_sample
    self.natural_gradient = natural_gradient
    self.tol = tol
    self.random_state = random_state
    self.verbose = verbose
    self.verbose_eval = verbose_eval
    self.num_leaves = num_leaves
    self.max_depth = max_depth
    self.min_child_samples = min_child_samples
    self.subsample = subsample
    self.colsample_bytree = colsample_bytree
    self.reg_alpha = reg_alpha
    self.reg_lambda = reg_lambda
    self.lgbm_params = lgbm_params
    self.scoring_rule = scoring_rule
    self.validation_fraction = validation_fraction

feature_importances_ `property` ¶

feature_importances_: NDArray[floating]

Feature importances per distribution parameter.

RETURNS	DESCRIPTION
`NDArray[floating]`	Importance array, shape `[n_params, n_features]`. Each row sums to 1.0 and corresponds to one logit parameter.

fit ¶

fit(
    X: NDArray[floating],
    y: NDArray[floating],
    X_val: NDArray[floating] | None = None,
    y_val: NDArray[floating] | None = None,
    early_stopping_rounds: int | None = None,
    sample_weight: NDArray[floating] | None = None,
    val_sample_weight: NDArray[floating] | None = None,
    train_loss_monitor: Callable[
        [Distribution, NDArray[floating]], float
    ]
    | None = None,
    val_loss_monitor: Callable[
        [Distribution, NDArray[floating]], float
    ]
    | None = None,
) -> Self

Fit the natural gradient boosting classifier.

PARAMETER	DESCRIPTION
`X`	Training features, shape `[n_samples, n_features]`. TYPE: `NDArray[floating]`
`y`	Training class labels, shape `[n_samples]`. TYPE: `NDArray[floating]`
`X_val`	Validation features for early stopping. TYPE: `NDArray[floating] \| None` DEFAULT: `None`
`y_val`	Validation class labels for early stopping. TYPE: `NDArray[floating] \| None` DEFAULT: `None`
`early_stopping_rounds`	Stop if validation loss hasn't improved for this many consecutive iterations. TYPE: `int \| None` DEFAULT: `None`
`sample_weight`	Per-sample training weights, shape `[n_samples]`. If `None`, all samples are weighted equally. TYPE: `NDArray[floating] \| None` DEFAULT: `None`
`val_sample_weight`	Per-sample validation weights, shape `[n_val_samples]`. Required when both `sample_weight` and validation data are provided. TYPE: `NDArray[floating] \| None` DEFAULT: `None`
`train_loss_monitor`	Custom callable for computing training loss. Signature: `(pred_dist, y) -> float`. Replaces the default scoring-rule-based training loss for recording only (gradients still use the scoring rule). TYPE: `Callable[[Distribution, NDArray[floating]], float] \| None` DEFAULT: `None`
`val_loss_monitor`	Custom callable for computing validation loss. Signature: `(pred_dist, y) -> float`. Replaces the default scoring-rule-based validation loss for both recording and early stopping decisions. TYPE: `Callable[[Distribution, NDArray[floating]], float] \| None` DEFAULT: `None`

RETURNS	DESCRIPTION
`Self`	The fitted estimator.

RAISES	DESCRIPTION
`ValueError`	If the number of classes in y does not match the distribution's K, or if a LightGBM parameter appears in both a surfaced kwarg and `lgbm_params`, or if weight/validation arguments are inconsistent, or if both `validation_fraction` and explicit `X_val`/`y_val` are provided, or if `scoring_rule` is a `CRPScore` instance.
`TypeError`	If `dist` is not a subclass of `Categorical`.

Source code in ngboost_lightning/classifier.py

def fit(
    self,
    X: NDArray[np.floating],
    y: NDArray[np.floating],
    X_val: NDArray[np.floating] | None = None,
    y_val: NDArray[np.floating] | None = None,
    early_stopping_rounds: int | None = None,
    sample_weight: NDArray[np.floating] | None = None,
    val_sample_weight: NDArray[np.floating] | None = None,
    train_loss_monitor: Callable[[Distribution, NDArray[np.floating]], float]
    | None = None,
    val_loss_monitor: Callable[[Distribution, NDArray[np.floating]], float]
    | None = None,
) -> Self:
    """Fit the natural gradient boosting classifier.

    When ``validation_fraction`` is set, the validated training data
    (and ``sample_weight``, if given) is split with a stratified
    ``train_test_split`` seeded by ``random_state``; the held-out part
    becomes the validation set and ``early_stopping_rounds`` defaults to
    20 if it was not passed.

    Args:
        X: Training features, shape ``[n_samples, n_features]``.
        y: Training class labels, shape ``[n_samples]``.
        X_val: Validation features for early stopping.
        y_val: Validation class labels for early stopping.
        early_stopping_rounds: Stop if validation loss hasn't improved
            for this many consecutive iterations.
        sample_weight: Per-sample training weights, shape ``[n_samples]``.
            If ``None``, all samples are weighted equally.
        val_sample_weight: Per-sample validation weights,
            shape ``[n_val_samples]``. Required when both
            ``sample_weight`` and validation data are provided.
        train_loss_monitor: Custom callable for computing training loss.
            Signature: ``(pred_dist, y) -> float``. Replaces the
            default scoring-rule-based training loss for recording
            only (gradients still use the scoring rule).
        val_loss_monitor: Custom callable for computing validation loss.
            Signature: ``(pred_dist, y) -> float``. Replaces the
            default scoring-rule-based validation loss for both
            recording and early stopping decisions.

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If the number of classes in y does not match the
            distribution's K, or if a LightGBM parameter appears in
            both a surfaced kwarg and ``lgbm_params``, or if
            weight/validation arguments are inconsistent, or if
            ``validation_fraction`` is set while ``X_val`` and/or
            ``y_val`` is also passed, or if ``scoring_rule`` is a
            ``CRPScore`` instance.
        TypeError: If ``dist`` is not a subclass of ``Categorical``.
    """
    X_checked, y_checked = validate_data(self, X, y)

    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight, dtype=np.float64)
    if val_sample_weight is not None:
        val_sample_weight = np.asarray(val_sample_weight, dtype=np.float64)

    # Reject the combination as soon as *either* explicit validation
    # argument is present: the auto split below would otherwise silently
    # overwrite a lone X_val or y_val.
    has_any_explicit_val = X_val is not None or y_val is not None
    if self.validation_fraction is not None and has_any_explicit_val:
        msg = (
            "validation_fraction and explicit X_val/y_val cannot both "
            "be provided. Use one or the other."
        )
        raise ValueError(msg)

    # Auto validation split. It happens before label encoding, so the
    # encoder below is fitted on the training portion only.
    # NOTE(review): the stratified split is what is expected to keep
    # every class present in the training portion - confirm for very
    # rare classes / large validation fractions.
    if self.validation_fraction is not None:
        split_arrays = [X_checked, y_checked]
        if sample_weight is not None:
            split_arrays.append(sample_weight)

        splits = train_test_split(
            *split_arrays,
            test_size=self.validation_fraction,
            random_state=self.random_state,
            stratify=y_checked,
        )
        if sample_weight is not None:
            X_checked, X_val, y_checked, y_val, sample_weight, val_sample_weight = (
                splits
            )
        else:
            X_checked, X_val, y_checked, y_val = splits

        # An auto split only makes sense with early stopping enabled.
        if early_stopping_rounds is None:
            early_stopping_rounds = 20

    # Encode labels to 0..K-1 (stored as float64 for the engine)
    self._label_encoder = LabelEncoder()
    y_encoded = self._label_encoder.fit_transform(y_checked).astype(np.float64)
    self.classes_ = self._label_encoder.classes_
    self.n_classes_: int = len(self.classes_)

    # Validate distribution K matches n_classes
    if not issubclass(self.dist, Categorical):
        msg = (
            f"dist must be a Categorical subclass (from k_categorical), "
            f"got {self.dist}"
        )
        raise TypeError(msg)

    expected_k = getattr(self.dist, "K", None)
    if expected_k is not None and expected_k != self.n_classes_:
        msg = (
            f"Distribution expects K={expected_k} classes but y contains "
            f"{self.n_classes_} classes. Use dist=k_categorical({self.n_classes_})."
        )
        raise ValueError(msg)

    # Validation labels go through the encoder fitted on the training
    # labels so both sets share one integer coding.
    if X_val is not None and y_val is not None:
        X_val = check_array(X_val, dtype=np.float64)
        y_val_encoded = self._label_encoder.transform(np.asarray(y_val)).astype(
            np.float64
        )
    else:
        y_val_encoded = None

    merged_lgbm = build_lgbm_params(self, self.lgbm_params)

    if isinstance(self.scoring_rule, CRPScore):
        msg = (
            "CRPScore is not supported for classification. "
            "CRPS is only defined for continuous distributions."
        )
        raise ValueError(msg)

    self.engine_ = NGBEngine(
        dist=self.dist,
        n_estimators=self.n_estimators,
        learning_rate=self.learning_rate,
        minibatch_frac=self.minibatch_frac,
        col_sample=self.col_sample,
        natural_gradient=self.natural_gradient,
        tol=self.tol,
        random_state=self.random_state,
        verbose=self.verbose,
        verbose_eval=self.verbose_eval,
        lgbm_params=merged_lgbm,
        scoring_rule=self.scoring_rule,
    )
    self.engine_.fit(
        X_checked,
        y_encoded,
        X_val=X_val,
        y_val=y_val_encoded,
        early_stopping_rounds=early_stopping_rounds,
        sample_weight=sample_weight,
        val_sample_weight=val_sample_weight,
        train_loss_monitor=train_loss_monitor,
        val_loss_monitor=val_loss_monitor,
    )

    # Copy fitted attributes from engine
    self.init_params_: NDArray[np.floating] = self.engine_.init_params_
    self.scalings_: list[float] = self.engine_.scalings_
    self.train_loss_: list[float] = self.engine_.train_loss_
    self.n_estimators_: int = self.engine_.n_estimators_
    self.boosters_ = self.engine_.boosters_

    if hasattr(self.engine_, "val_loss_"):
        self.val_loss_: list[float] = self.engine_.val_loss_
        self.best_val_loss_itr_: int | None = self.engine_.best_val_loss_itr_
    else:
        # Refit without validation history: drop values left over from a
        # previous fit so they cannot be mistaken for this run's results.
        for stale_attr in ("val_loss_", "best_val_loss_itr_"):
            if hasattr(self, stale_attr):
                delattr(self, stale_attr)

    return self

predict ¶

predict(X: NDArray[floating]) -> NDArray[integer]

Predict class labels.

PARAMETER	DESCRIPTION
`X`	Features, shape `[n_samples, n_features]`. TYPE: `NDArray[floating]`

RETURNS	DESCRIPTION
`NDArray[integer]`	Predicted class labels, shape `[n_samples]`.

Source code in ngboost_lightning/classifier.py

def predict(self, X: NDArray[np.floating]) -> NDArray[np.integer]:
    """Return the most probable class label for every sample.

    Args:
        X: Features, shape ``[n_samples, n_features]``.

    Returns:
        Predicted class labels, shape ``[n_samples]``, taken from
        ``classes_``.
    """
    check_is_fitted(self)
    class_probs = self.predict_proba(X)
    # Column index of the largest probability in each row, mapped back to
    # the original label values.
    winners = np.argmax(class_probs, axis=1)
    labels: NDArray[np.integer] = self.classes_[winners]
    return labels

predict_proba ¶

predict_proba(X: NDArray[floating]) -> NDArray[floating]

Predict class probabilities.

PARAMETER	DESCRIPTION
`X`	Features, shape `[n_samples, n_features]`. TYPE: `NDArray[floating]`

RETURNS	DESCRIPTION
`NDArray[floating]`	Probability matrix, shape `[n_samples, n_classes]`. Each row sums to 1.

Source code in ngboost_lightning/classifier.py

def predict_proba(self, X: NDArray[np.floating]) -> NDArray[np.floating]:
    """Predict class probabilities.

    Args:
        X: Features, shape ``[n_samples, n_features]``.

    Returns:
        Probability matrix, shape ``[n_samples, n_classes]``.
        Each row sums to 1.

    Raises:
        TypeError: If the engine returns a distribution that is not a
            ``Categorical`` instance.
    """
    check_is_fitted(self)
    X_checked = check_array(X, dtype=np.float64)
    dist = self.engine_.pred_dist(X_checked)
    # Explicit check rather than ``assert``: assertions are removed when
    # Python runs with -O, and ``pred_dist`` reports the same condition
    # with a TypeError.
    if not isinstance(dist, Categorical):
        msg = f"Expected Categorical distribution, got {type(dist)}"
        raise TypeError(msg)
    result: NDArray[np.floating] = dist.probs
    return result

pred_dist ¶

pred_dist(X: NDArray[floating]) -> Categorical

Predict the full conditional distribution.

PARAMETER	DESCRIPTION
`X`	Features, shape `[n_samples, n_features]`. TYPE: `NDArray[floating]`

RETURNS	DESCRIPTION
`Categorical`	A Categorical distribution instance for all samples.

Source code in ngboost_lightning/classifier.py

def pred_dist(self, X: NDArray[np.floating]) -> Categorical:
    """Return the predicted conditional distribution for ``X``.

    Args:
        X: Features, shape ``[n_samples, n_features]``.

    Returns:
        A Categorical distribution instance for all samples.

    Raises:
        TypeError: If the engine returns something other than a
            ``Categorical`` instance.
    """
    check_is_fitted(self)
    features = check_array(X, dtype=np.float64)
    dist = self.engine_.pred_dist(features)
    # Happy path first; anything that is not a Categorical is rejected.
    if isinstance(dist, Categorical):
        return dist
    msg = f"Expected Categorical distribution, got {type(dist)}"
    raise TypeError(msg)

staged_predict ¶

staged_predict(
    X: NDArray[floating],
) -> Generator[NDArray[integer]]

Yield class label predictions after each boosting iteration.

PARAMETER	DESCRIPTION
`X`	Features, shape `[n_samples, n_features]`. TYPE: `NDArray[floating]`

YIELDS	DESCRIPTION
`Generator[NDArray[integer]]`	Predicted class labels at iteration i, shape `[n_samples]`.

Source code in ngboost_lightning/classifier.py

def staged_predict(self, X: NDArray[np.floating]) -> Generator[NDArray[np.integer]]:
    """Generate class label predictions, one array per boosting stage.

    Args:
        X: Features, shape ``[n_samples, n_features]``.

    Yields:
        Predicted class labels at iteration *i*, shape ``[n_samples]``.
    """
    for stage_probs in self.staged_predict_proba(X):
        # Most probable column per row, translated to the original labels.
        winners = np.argmax(stage_probs, axis=1)
        yield self.classes_[winners]

staged_predict_proba ¶

staged_predict_proba(
    X: NDArray[floating],
) -> Generator[NDArray[floating]]

Yield class probabilities after each boosting iteration.

PARAMETER	DESCRIPTION
`X`	Features, shape `[n_samples, n_features]`. TYPE: `NDArray[floating]`

YIELDS	DESCRIPTION
`Generator[NDArray[floating]]`	Probability matrix at iteration i, shape `[n_samples, n_classes]`.

Source code in ngboost_lightning/classifier.py

def staged_predict_proba(
    self, X: NDArray[np.floating]
) -> Generator[NDArray[np.floating]]:
    """Generate class probabilities, one matrix per boosting stage.

    Args:
        X: Features, shape ``[n_samples, n_features]``.

    Yields:
        Probability matrix at iteration *i*,
        shape ``[n_samples, n_classes]``.
    """
    # Each staged distribution exposes its class probabilities as ``probs``.
    for stage_dist in self.staged_pred_dist(X):
        yield stage_dist.probs

staged_pred_dist ¶

staged_pred_dist(
    X: NDArray[floating],
) -> Generator[Categorical]

Yield the full conditional distribution after each iteration.

PARAMETER	DESCRIPTION
`X`	Features, shape `[n_samples, n_features]`. TYPE: `NDArray[floating]`

YIELDS	DESCRIPTION
`Generator[Categorical]`	Categorical distribution at iteration i.

Source code in ngboost_lightning/classifier.py

def staged_pred_dist(self, X: NDArray[np.floating]) -> Generator[Categorical]:
    """Yield the full conditional distribution after each iteration.

    This is a generator, so the fitted-state check and input validation
    run when the first item is requested, not when the method is called.

    Args:
        X: Features, shape ``[n_samples, n_features]``.

    Yields:
        Categorical distribution at iteration *i*.

    Raises:
        TypeError: If the engine yields a distribution that is not a
            ``Categorical`` instance.
    """
    check_is_fitted(self)
    X_checked = check_array(X, dtype=np.float64)
    for dist in self.engine_.staged_pred_dist(X_checked):
        # Explicit check rather than ``assert``: assertions are removed
        # when Python runs with -O, and ``pred_dist`` reports the same
        # condition with a TypeError.
        if not isinstance(dist, Categorical):
            msg = f"Expected Categorical distribution, got {type(dist)}"
            raise TypeError(msg)
        yield dist

score ¶

score(X: NDArray[floating], y: NDArray[floating]) -> float

Negative mean NLL (higher is better).

Follows the same convention as LightningBoostRegressor: returns -mean(NLL) so that higher values indicate better fit. This is consistent with probabilistic scoring but differs from sklearn's ClassifierMixin.score() which returns accuracy.

PARAMETER	DESCRIPTION
`X`	Features, shape `[n_samples, n_features]`. TYPE: `NDArray[floating]`
`y`	True class labels, shape `[n_samples]`. TYPE: `NDArray[floating]`

RETURNS	DESCRIPTION
`float`	`-mean(NLL)` as a float. Higher indicates a better fit.

Source code in ngboost_lightning/classifier.py

def score(self, X: NDArray[np.floating], y: NDArray[np.floating]) -> float:
    """Return the negated total score of the predictions (higher is better).

    Mirrors the convention of ``LightningBoostRegressor``: the result is
    ``-mean(NLL)``, so larger values mean a better fit. Note that this
    is a probabilistic score and therefore differs from sklearn's
    ``ClassifierMixin.score()``, which returns accuracy.

    Args:
        X: Features, shape ``[n_samples, n_features]``.
        y: True class labels, shape ``[n_samples]``.

    Returns:
        ``-mean(NLL)`` as a float. Higher indicates a better fit.
    """
    check_is_fitted(self)
    features = check_array(X, dtype=np.float64)
    # Map labels to the integer coding learned in fit, as float64.
    targets = self._label_encoder.transform(np.asarray(y)).astype(np.float64)
    total_score = self.engine_.scoring_rule.total_score
    return -total_score(self.engine_.pred_dist(features), targets)

classifier

classifier ¶

LightningBoostClassifier ¶

feature_importances_ property ¶

fit ¶

predict ¶

predict_proba ¶

pred_dist ¶

staged_predict ¶

staged_predict_proba ¶

staged_pred_dist ¶

score ¶

feature_importances_ `property` ¶