Module hipe4ml.model_handler

Module containing the class used to wrap models from different ML libraries and expose them through a common set of methods

"""
Module containing the class used to wrap models from different
ML libraries and expose them through a common set of methods
"""
from copy import deepcopy
import inspect
import pickle

import numpy as np
import optuna
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.model_selection import cross_val_score

import hipe4ml.tree_handler


class ModelHandler:
    """
    Class used to wrap models from different ML libraries and expose
    them through a common set of methods. LightGBM, XGBoost (through
    their sklearn interface) and sklearn models are currently supported.

    Parameters
    -------------------------------------------------
    input_model: XGBoost, LightGBM or sklearn model

    training_columns: list
        Contains the name of the features used for the training.
        Example: ['dEdx', 'pT', 'ct']

    model_params: dict
        Model hyper-parameter values. For
        example (XGBoost): max_depth, learning_rate,
        n_estimators, gamma, min_child_weight, ...

    task_type: str
        Task type of the model: 'classification' or 'regression'
    """

    def __init__(self, input_model=None, training_columns=None, model_params=None, task_type='classification'):
        self.model = input_model
        self.training_columns = training_columns
        self.model_params = model_params
        self._n_classes = None
        self._task_type = task_type
        if self._task_type not in ['classification', 'regression']:
            raise ValueError(
                "Task type must be either 'classification' or 'regression'")

        if self.model is not None:
            self.model_string = inspect.getmodule(
                self.model).__name__.partition('.')[0]

            if self.model_string not in ["xgboost", "lightgbm", "sklearn"]:
                raise ValueError(
                    "Model must be either 'xgboost', 'lightgbm' or 'sklearn'")

            if self.model_params is None:
                self.model_params = self.model.get_params()
            else:
                self.model.set_params(**self.model_params)

    def set_model_params(self, model_params):
        """
        Set the model (hyper-)parameters

        Parameters
        ------------------------------------
        model_params: dict
            Model hyper-parameter values. For
            example (XGBoost): max_depth, learning_rate,
            n_estimators, gamma, min_child_weight, ...
        """
        self.model_params = model_params
        self.model.set_params(**self.model_params)

    def get_model_params(self):
        """
        Get the model (hyper-)parameters

        Returns
        ------------------------------------
        out: dict
            Model hyper-parameter values. For
            example (XGBoost): max_depth, learning_rate,
            n_estimators, gamma, min_child_weight, ...
        """
        return self.model.get_params()

    def set_training_columns(self, training_columns):
        """
        Set the features used for the training process

        Parameters
        ------------------------------------
        training_columns: list
            Contains the name of the features used for the training.
            Example: ['dEdx', 'pT', 'ct']
        """
        self.training_columns = training_columns

    def get_training_columns(self):
        """
        Get the features used for the training process

        Returns
        ------------------------------------
        out: list
            Names of the features used for the training.
            Example: ['dEdx', 'pT', 'ct']
        """

        return self.training_columns

    def get_original_model(self):
        """
        Get the original unwrapped model

        Returns
        ---------------------------
        out: XGBoost, LGBM or sklearn model
        """
        return self.model

    def get_model_module(self):
        """
        Get the string containing the name
        of the model module

        Returns
        ---------------------------
        out: str
            Name of the model module
        """
        return self.model_string

    def get_n_classes(self):
        """
        Get the number of classes

        Returns
        ---------------------------
        out: int
            Number of classes
        """
        return self._n_classes

    def get_task_type(self):
        """
        Get the task type of the model

        Returns
        ---------------------------
        out: str
            Task type of the model: 'classification' or 'regression'
        """
        return self._task_type

    def fit(self, x_train, y_train, **kwargs):
        """
        Fit Model

        Parameters
        ---------------------------
        x_train: array-like, sparse matrix
            Training data

        y_train: array-like, sparse matrix
            Target data

        **kwargs:
            Extra kwargs passed on to model.fit() method
        """
        if self._task_type == 'classification':
            n_classes = len(np.unique(y_train))
            self._n_classes = n_classes
        if self.training_columns is None:
            self.training_columns = list(x_train.columns)

        self.model.fit(x_train[self.training_columns], y_train, **kwargs)

    def predict(self, x_test, output_margin=True, **kwargs):
        """
        Return model prediction for the array x_test

        Parameters
        --------------------------------------
        x_test: hipe4ml tree_handler, array-like, sparse matrix
            The input sample.

        output_margin: bool
            Whether to output the raw untransformed margin value. If False,
            model probabilities are returned. Not used when task_type is 'regression'.

        **kwargs:
            Extra kwargs passed on to the following model prediction function:
            if (task_type == 'classification')
            - predict() (XGBoost and LGBM) or decision_function() (sklearn) if output_margin==True
            - predict_proba() if output_margin==False
            if (task_type == 'regression')
            - predict()

        Returns
        ---------------------------------------
        out: numpy array
            Model predictions
        """
        if isinstance(x_test, hipe4ml.tree_handler.TreeHandler):
            x_test = x_test.get_data_frame()

        x_test = x_test[self.training_columns]

        # regression
        if self._task_type == 'regression':
            return self.model.predict(x_test, **kwargs)

        # classification
        if output_margin:
            if self.model_string == 'xgboost':
                return self.model.predict(x_test, output_margin=True, **kwargs)
            if self.model_string == 'lightgbm':
                return self.model.predict(x_test, raw_score=True, **kwargs)
            if self.model_string == 'sklearn':
                if not hasattr(self.model, 'decision_function'):
                    raise ValueError(
                        "This Model does not support a decision_function(): use output_margin=False")
                return self.model.decision_function(x_test, **kwargs).ravel()

        pred = self.model.predict_proba(x_test, **kwargs)
        # in case of binary classification return only the scores of
        # the signal class
        if pred.shape[1] <= 2:
            pred = pred[:, 1]
        return pred

    def train_test_model(self, data, return_prediction=False, output_margin=False, average='macro',
                         multi_class_opt='raise', **kwargs):
        """
        Perform the training and the testing of the model. The model performance is estimated
        using the ROC AUC metric for classification and the MSE for regression.

        Parameters
        ----------------------------------------------
        data: list
            Contains respectively: training
            set dataframe, training label array,
            test set dataframe, test label array

        return_prediction: bool
            If True, model predictions on the test set are
            returned

        output_margin: bool
            Whether to output the raw untransformed margin value. If False,
            model probabilities are returned. Not used when task_type is 'regression'.

        average: string
            Option for the average of ROC AUC scores used only in case of multi-classification.
            You can choose between 'macro' and 'weighted'. For more information see
            https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score

        multi_class_opt: string
            Option to compute ROC AUC scores used only in case of multi-classification.
            The one-vs-one 'ovo' and one-vs-rest 'ovr' approaches are available

        **kwargs: dict
            Extra kwargs passed on to the model fit method

        Returns
        ---------------------------------------
        out: numpy array or None
            If return_prediction==True, model predictions on the test set are
            returned

        """

        # get number of classes
        n_classes = len(np.unique(data[1]))
        self._n_classes = n_classes
        print('==============================')
        print(f"Training {self.model_string} model for {self._task_type}")
        if self._task_type == 'classification':
            print('Number of detected classes:', n_classes)

        # final training with the optimized hyperparams
        print('Training the model: ...')
        self.fit(data[0], data[1], **kwargs)
        print('Training the model: Done!')
        print('Testing the model: ...')
        y_pred = self.predict(data[2], output_margin=output_margin)
        if self._task_type == 'classification':
            roc_score = roc_auc_score(
                data[3], y_pred, average=average, multi_class=multi_class_opt)
            print(f'ROC_AUC_score: {roc_score:.6f}')
        else:
            mse_score = mean_squared_error(data[3], y_pred)
            print(f'Mean squared error: {mse_score:.6f}')
        print('Testing the model: Done!')
        print('==============================')
        if return_prediction:
            return y_pred
        return None

    def optimize_params_optuna(self, data, hyperparams_ranges, cross_val_scoring, nfold=5, direction='maximize',
                               optuna_sampler=None, resume_study=None, save_study=None, **kwargs):
        """
        Perform hyperparameter optimization of ModelHandler using the Optuna module.
        The model hyperparameters are automatically set as the ones that provided the
        best result during the optimization.

        Parameters
        ------------------------------------------------------
        data: list
            Contains respectively: training
            set dataframe, training label array,
            test set dataframe, test label array

        hyperparams_ranges: dict
            Hyperparameter ranges (as tuples or lists). If a parameter is not
            given as a tuple or a list, it is considered constant.
            Important: the numeric type of each parameter must be preserved
            when passing the ranges.
            For example:
            hyperparams_ranges = {
                'max_depth': (10, 100),
                'learning_rate': (0.01, 0.03),
                'n_jobs': 8
            }

        cross_val_scoring: string, callable or None
            Score metrics used for the cross-validation.
            A string (see sklearn model evaluation documentation:
            https://scikit-learn.org/stable/modules/model_evaluation.html)
            or a scorer callable object / function with signature scorer(estimator, X, y)
            which should return only a single value.
            In binary classification 'roc_auc' is suggested.
            In multi-classification one among 'roc_auc_ovr', 'roc_auc_ovo',
            'roc_auc_ovr_weighted' and 'roc_auc_ovo_weighted' is suggested.
            For more information see
            https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

        nfold: int
            Number of folds used to calculate the cross-validation error

        direction: str
            The direction of optimization. Either 'maximize' or 'minimize'.
            (e.g. for the metric 'roc_auc' the direction is 'maximize')

        optuna_sampler: optuna.samplers.BaseSampler
            Sampler to be used for the Optuna optimization.
            If None, the default TPESampler is used. For more information see:
            https://optuna.readthedocs.io/en/stable/reference/samplers.html

        resume_study: str
            A string indicating the filename of the study to be resumed.
            If None, the study is not resumed.

        save_study: str
            A string indicating the filename of the study. If None,
            the study is not saved into a file.

        **kwargs: dict
            Optuna study parameters

        Returns
        ------------------------------------------------------

        study: optuna.study.Study
            The optuna object which stores the whole study. See Optuna's documentation for more details:
            https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study
        """

        n_classes = len(np.unique(data[1]))
        self._n_classes = n_classes
        if self.training_columns is None:
            self.training_columns = list(data[0].columns)

        x_train, y_train, _, _ = data

        def __get_int_or_uniform(hyperparams_ranges, trial):

            params = {}

            for key in hyperparams_ranges:
                # parameters not passed as a tuple or list are kept constant,
                # as documented in the docstring above
                if not isinstance(hyperparams_ranges[key], (tuple, list)):
                    params[key] = hyperparams_ranges[key]
                elif isinstance(hyperparams_ranges[key][0], int):
                    params[key] = trial.suggest_int(
                        key, hyperparams_ranges[key][0], hyperparams_ranges[key][1])
                elif isinstance(hyperparams_ranges[key][0], float):
                    # note: suggest_uniform is deprecated in recent Optuna
                    # releases; suggest_float is its drop-in replacement
                    params[key] = trial.suggest_uniform(
                        key, hyperparams_ranges[key][0], hyperparams_ranges[key][1])

            return params

        def __objective(trial):

            params = __get_int_or_uniform(hyperparams_ranges, trial)
            model_copy = deepcopy(self.model)
            model_copy.set_params(**{**self.model_params, **params})
            return np.mean(cross_val_score(model_copy, x_train[self.training_columns], y_train,
                                           cv=nfold, scoring=cross_val_scoring, n_jobs=1))
        if resume_study:
            with open(resume_study, 'rb') as resume_study_file:
                study = pickle.load(resume_study_file)
        else:
            study = optuna.create_study(
                direction=direction, sampler=optuna_sampler)

        study.optimize(__objective, **kwargs)

        if save_study:
            with open(save_study, 'wb') as study_file:
                pickle.dump(study, study_file)

        print(f"Number of finished trials: {len(study.trials)}")
        print("Best trial:")
        best_trial = study.best_trial

        print(f"Value: {best_trial.value}")
        print("Params: ")
        for key, value in best_trial.params.items():
            print(f"    {key}: {value}")

        self.set_model_params({**self.model_params, **best_trial.params})

        return study

    def dump_original_model(self, filename, xgb_format=False):
        """
        Save the trained model into a pickle
        file. XGBoost models can alternatively
        be saved into a .model file

        Parameters
        -----------------------------------------------------
        filename: str
            Name of the file in which the model is saved

        xgb_format : bool
            If True saves the xgboost model into a .model file
        """
        if xgb_format is False:
            with open(filename, "wb") as output_file:
                pickle.dump(self.model, output_file)
        else:
            if self.model_string == 'xgboost':
                self.model.save_model(filename)
            else:
                print("File not saved: only xgboost models support the .model extension")

    def dump_model_handler(self, filename):
        """
        Save the model handler into a pickle file

        Parameters
        -----------------------------------------------------
        filename: str
            Name of the file in which the model is saved
        """
        with open(filename, "wb") as output_file:
            pickle.dump(self, output_file)

    def load_model_handler(self, filename):
        """
        Load a model handler saved into a pickle file

        Parameters
        -----------------------------------------------------
        filename: str
            Name of the file in which the model is saved
        """
        with open(filename, "rb") as input_file:
            loaded_model = pickle.load(input_file)
            self.model = loaded_model.get_original_model()
            self.training_columns = loaded_model.get_training_columns()
            self.model_params = loaded_model.get_model_params()
            self.model.set_params(**self.model_params)
            self.model_string = loaded_model.get_model_module()
            self._n_classes = loaded_model.get_n_classes()
            self._task_type = loaded_model.get_task_type()

Classes

class ModelHandler (input_model=None, training_columns=None, model_params=None, task_type='classification')

Class used to wrap models from different ML libraries and expose them through a common set of methods. LightGBM, XGBoost (through their sklearn interface) and sklearn models are currently supported.

Parameters

input_model : XGBoost, LightGBM or sklearn model
The model to be wrapped by the handler
training_columns : list
Contains the name of the features used for the training. Example: ['dEdx', 'pT', 'ct']
model_params : dict
Model hyper-parameter values. For example (XGBoost): max_depth, learning_rate, n_estimators, gamma, min_child_weight, …
task_type : str
Task type of the model: 'classification' or 'regression'
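
A minimal usage sketch (not part of the original page): it assumes scikit-learn is installed and wraps a GradientBoostingClassifier; any XGBoost or LightGBM model exposing the sklearn interface can be wrapped in the same way.

# sketch: wrap a scikit-learn classifier in a ModelHandler
from sklearn.ensemble import GradientBoostingClassifier
from hipe4ml.model_handler import ModelHandler

clf = GradientBoostingClassifier()
hdl = ModelHandler(clf, training_columns=['dEdx', 'pT', 'ct'],
                   model_params={'n_estimators': 100, 'max_depth': 3})
print(hdl.get_model_module())   # 'sklearn'
print(hdl.get_task_type())      # 'classification'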
Methods

def dump_model_handler(self, filename)

Save the model handler into a pickle file

Parameters

filename : str
Name of the file in which the model is saved
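
A short sketch (the filename is illustrative): the whole handler, including training columns and parameters, goes into a single pickle file.

# sketch: persist a configured handler to disk
from sklearn.ensemble import RandomForestClassifier
from hipe4ml.model_handler import ModelHandler

hdl = ModelHandler(RandomForestClassifier(), training_columns=['dEdx', 'pT'])
hdl.dump_model_handler('model_handler.pkl')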
def dump_original_model(self, filename, xgb_format=False)

Save the trained model into a pickle file. XGBoost models can alternatively be saved into a .model file

Parameters

filename : str
Name of the file in which the model is saved
xgb_format : bool
If True saves the xgboost model into a .model file
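
A hedged sketch assuming xgboost is installed (filenames and data are illustrative); xgb_format=True only has an effect for XGBoost models, which must be trained before the native save can be performed.

# sketch: save the bare model, as a pickle and in native XGBoost format
import numpy as np
import pandas as pd
import xgboost as xgb
from hipe4ml.model_handler import ModelHandler

x = pd.DataFrame(np.random.rand(100, 2), columns=['dEdx', 'pT'])
y = np.random.randint(0, 2, 100)

hdl = ModelHandler(xgb.XGBClassifier())
hdl.fit(x, y)
hdl.dump_original_model('model.pkl')                     # generic pickle
hdl.dump_original_model('model.model', xgb_format=True)  # native XGBoost file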
def fit(self, x_train, y_train, **kwargs)

Fit Model

Parameters

x_train : array-like, sparse matrix
Training data
y_train : array-like, sparse matrix
Target data

**kwargs
Extra kwargs passed on to model.fit() method
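
A minimal sketch with synthetic data (not from the original page). Note that the implementation selects feature columns by name, so x_train is expected to be a pandas DataFrame; if training_columns was never set, all of its columns are used.

# sketch: fit on a DataFrame; all columns are used as features here
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from hipe4ml.model_handler import ModelHandler

x_train = pd.DataFrame(np.random.rand(200, 3), columns=['dEdx', 'pT', 'ct'])
y_train = np.random.randint(0, 2, 200)

hdl = ModelHandler(GradientBoostingClassifier())
hdl.fit(x_train, y_train)
print(hdl.get_n_classes())   # 2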

def get_model_module(self)

Get the string containing the name of the model module

Returns

out : str
Name of the model module
def get_model_params(self)

Get the model (hyper-)parameters

Returns

out : dict
Model hyper-parameter values. For example (XGBoost): max_depth, learning_rate, n_estimators, gamma, min_child_weight, …
def get_n_classes(self)

Get the number of classes

Returns

out : int
Number of classes
def get_original_model(self)

Get the original unwrapped model

Returns

out : XGBoost, LGBM or sklearn model
 
def get_task_type(self)

Get the task type of the model

Returns

out : str
Task type of the model: 'classification' or 'regression'
def get_training_columns(self)

Get the features used for the training process

Returns

out : list
Names of the features used for the training. Example: ['dEdx', 'pT', 'ct']
def load_model_handler(self, filename)

Load a model handler saved into a pickle file

Parameters

filename : str
Name of the file in which the model is saved
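
A round-trip sketch (filename illustrative): an empty handler is filled from a file previously written by dump_model_handler().

# sketch: dump a handler, then load it back into a fresh instance
from sklearn.ensemble import RandomForestClassifier
from hipe4ml.model_handler import ModelHandler

ModelHandler(RandomForestClassifier()).dump_model_handler('handler.pkl')

hdl = ModelHandler()
hdl.load_model_handler('handler.pkl')
print(hdl.get_model_module())   # 'sklearn'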
def optimize_params_optuna(self, data, hyperparams_ranges, cross_val_scoring, nfold=5, direction='maximize', optuna_sampler=None, resume_study=None, save_study=None, **kwargs)

Perform hyperparameter optimization of ModelHandler using the Optuna module. The model hyperparameters are automatically set as the ones that provided the best result during the optimization.

Parameters

data : list
Contains respectively: training set dataframe, training label array, test set dataframe, test label array
hyperparams_ranges : dict
Hyperparameter ranges (as tuples or lists). If a parameter is not given as a tuple or a list, it is considered constant. Important: the numeric type of each parameter must be preserved when passing the ranges. For example: hyperparams_ranges = {'max_depth': (10, 100), 'learning_rate': (0.01, 0.03), 'n_jobs': 8}
cross_val_scoring : string, callable or None
Score metrics used for the cross-validation. A string (see sklearn model evaluation documentation: https://scikit-learn.org/stable/modules/model_evaluation.html) or a scorer callable object / function with signature scorer(estimator, X, y) which should return only a single value. In binary classification 'roc_auc' is suggested. In multi-classification one among 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted' and 'roc_auc_ovo_weighted' is suggested. For more information see https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
nfold : int
Number of folds used to calculate the cross-validation error
direction : str
The direction of optimization. Either 'maximize' or 'minimize'. (e.g. for the metric 'roc_auc' the direction is 'maximize')
optuna_sampler : optuna.samplers.BaseSampler
Sampler to be used for the Optuna optimization. If None, the default TPESampler is used. For more information see: https://optuna.readthedocs.io/en/stable/reference/samplers.html
resume_study : str
A string indicating the filename of the study to be resumed. If None, the study is not resumed.
save_study : str
A string indicating the filename of the study. If None, the study is not saved into a file.
**kwargs : dict
Optuna study parameters

Returns

study : optuna.study.Study
The optuna object which stores the whole study. See Optuna's documentation for more details: https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study
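
A hedged sketch with synthetic data: n_trials is not a named argument of this method, it is forwarded to study.optimize() through **kwargs.

# sketch: optimize two hyperparameters over 10 trials
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from hipe4ml.model_handler import ModelHandler

x = pd.DataFrame(np.random.rand(300, 2), columns=['dEdx', 'pT'])
y = np.random.randint(0, 2, 300)
data = [x[:200], y[:200], x[200:], y[200:]]

hdl = ModelHandler(GradientBoostingClassifier())
ranges = {'max_depth': (2, 6), 'learning_rate': (0.05, 0.2)}  # int and float ranges
study = hdl.optimize_params_optuna(data, ranges, 'roc_auc', nfold=3, n_trials=10)
print(study.best_params)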
def predict(self, x_test, output_margin=True, **kwargs)

Return model prediction for the array x_test

Parameters

x_test : hipe4ml tree_handler, array-like, sparse matrix
The input sample.
output_margin : bool
Whether to output the raw untransformed margin value. If False, model probabilities are returned. Not used when task_type is 'regression'.

**kwargs
Extra kwargs passed on to the following model prediction function:
if task_type == 'classification': predict() (XGBoost and LGBM) or decision_function() (sklearn) if output_margin==True; predict_proba() if output_margin==False
if task_type == 'regression': predict()

Returns

out : numpy array
Model predictions
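
A minimal sketch for a binary sklearn classifier (synthetic data): with output_margin=True the raw decision_function() values are returned, with output_margin=False the probability of the signal class.

# sketch: raw margins vs. signal-class probabilities
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from hipe4ml.model_handler import ModelHandler

x = pd.DataFrame(np.random.rand(100, 2), columns=['dEdx', 'pT'])
y = np.random.randint(0, 2, 100)

hdl = ModelHandler(GradientBoostingClassifier())
hdl.fit(x, y)
margins = hdl.predict(x, output_margin=True)   # decision_function() output
probas = hdl.predict(x, output_margin=False)   # P(signal) per candidate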
def set_model_params(self, model_params)

Set the model (hyper-)parameters

Parameters

model_params : dict
Model hyper-parameter values. For example (XGBoost): max_depth, learning_rate, n_estimators, gamma, min_child_weight, …
def set_training_columns(self, training_columns)

Set the features used for the training process

Parameters

training_columns : list
Contains the name of the features used for the training. Example: ['dEdx', 'pT', 'ct']
def train_test_model(self, data, return_prediction=False, output_margin=False, average='macro', multi_class_opt='raise', **kwargs)

Perform the training and the testing of the model. The model performance is estimated using the ROC AUC metric for classification and the MSE for regression.

Parameters

data : list
Contains respectively: training set dataframe, training label array, test set dataframe, test label array
return_prediction : bool
If True, model predictions on the test set are returned
output_margin : bool
Whether to output the raw untransformed margin value. If False, model probabilities are returned. Not used when task_type is 'regression'.
average : string
Option for the average of ROC AUC scores used only in case of multi-classification. You can choose between 'macro' and 'weighted'. For more information see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score
multi_class_opt : string
Option to compute ROC AUC scores used only in case of multi-classification. The one-vs-one 'ovo' and one-vs-rest 'ovr' approaches are available
**kwargs : dict
Extra kwargs passed on to the model fit method

Returns

out : numpy array or None
If return_prediction==True, model predictions on the test set are returned
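
A complete sketch with synthetic data: data holds the four pieces in the documented order, and the test-set ROC AUC is printed during the call.

# sketch: train and test in one call, keeping the test-set predictions
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from hipe4ml.model_handler import ModelHandler

x = pd.DataFrame(np.random.rand(300, 2), columns=['dEdx', 'pT'])
y = np.random.randint(0, 2, 300)
data = [x[:200], y[:200], x[200:], y[200:]]

hdl = ModelHandler(GradientBoostingClassifier())
y_pred = hdl.train_test_model(data, return_prediction=True)  # signal probabilities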