# Source code for mamut.preprocessing.handlers

from typing import List, Literal, Tuple, Union

import numpy as np
import pandas as pd
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from scipy.stats import skew
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier, IsolationForest
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer

import mamut.preprocessing.settings as settings



[docs]
def handle_outliers(
    X: pd.DataFrame,
    y: pd.Series,
    feature_names: List[str],
    contamination: float = 0.01,
    random_state: int = 42,
) -> Tuple[pd.DataFrame, pd.Series, IsolationForest]:
    """
    Handles outliers in the dataset using IsolationForest.

    The forest is fitted on the ``feature_names`` columns only; rows it labels
    as inliers are kept in both ``X`` and ``y``. The inputs are not modified.

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        y: pd.Series
            Target array, aligned row-by-row with ``X``.
        feature_names: List[str]
            Names of the columns of ``X`` used to detect outliers.
        contamination: float
            The proportion of outliers in the data.
        random_state: int
            Seed for reproducibility.

    Returns:
        X_filtered: pd.DataFrame
            Feature matrix with outliers removed (all columns are kept).
        y_filtered: pd.Series
            Target array with outliers removed.
        transformer: IsolationForest
            Fitted IsolationForest model.
    """
    # Work on copies so the caller's objects are left untouched.
    X, y = X.copy(), y.copy()
    iso_forest = IsolationForest(contamination=contamination, random_state=random_state)
    # fit_predict labels every row; the rows labelled 1 are the ones kept.
    inlier_mask = iso_forest.fit_predict(X[feature_names]) == 1

    return X[inlier_mask], y[inlier_mask], iso_forest




[docs]
def handle_imbalanced(
    X: pd.DataFrame,
    y: pd.Series,
    strategy: Literal["SMOTE", "undersample", "combine"],
    random_state: int = 42,
) -> Tuple[pd.DataFrame, pd.Series, object]:
    """
    Balances an imbalanced dataset using techniques from imbalanced-learn.

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        y: pd.Series
            Target array.
        strategy: Literal["SMOTE", "undersample", "combine"]
            Resampling method to use. Options:
            - 'SMOTE': Synthetic Minority Oversampling Technique.
            - 'undersample': Random undersampling of majority class.
            - 'combine': SMOTE with Tomek links.
        random_state: int
            Seed for reproducibility.

    Returns:
        X_resampled: pd.DataFrame
            Feature matrix after resampling.
        y_resampled: pd.Series
            Target array after resampling.
        transformer: object
            Fitted resampling method instance, or ``None`` when a SMOTE-based
            strategy was requested but the smallest class has fewer than two
            samples; in that case ``X`` and ``y`` are returned unchanged.

    Raises:
        ValueError
            If ``strategy`` is not a key of ``settings.resampler_mapping``.
    """
    if strategy not in settings.resampler_mapping:
        raise ValueError(
            f"Invalid resampling strategy, choose from {settings.resampler_mapping.keys()}."
        )

    if strategy in {"SMOTE", "combine"}:
        # SMOTE needs k_neighbors other samples of the same class, so cap the
        # neighbourhood at (size of the smallest class - 1).
        y_series = y if isinstance(y, pd.Series) else pd.Series(y)
        minority_count = y_series.value_counts().min()
        k_neighbors = min(5, minority_count - 1)
        if k_neighbors < 1:
            # Not enough minority samples to interpolate from: skip resampling.
            return X, y, None
        # value_counts() yields NumPy integers; hand SMOTE a plain int.
        smote = SMOTE(random_state=random_state, k_neighbors=int(k_neighbors))
        if strategy == "SMOTE":
            resampler = smote
        else:
            resampler = SMOTETomek(random_state=random_state, smote=smote)
    else:
        resampler = settings.resampler_mapping[strategy](random_state=random_state)

    X_resampled, y_resampled = resampler.fit_resample(X, y)

    return X_resampled, y_resampled, resampler




[docs]
def handle_skewed(
    X: pd.DataFrame, feature_names: List[str], threshold: float = 1
) -> Tuple[pd.DataFrame, PowerTransformer, List[str], Union[np.ndarray, List[float]]]:
    """
    Handles skewed features in the dataset using PowerTransformer.

    A feature is considered skewed when the absolute value of its sample
    skewness is strictly greater than ``threshold``. All skewed features are
    transformed together with a Yeo-Johnson transform (no standardisation).

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        feature_names: List[str]
            Names of the features to check for skewness.
        threshold: float
            Threshold for skewness.

    Returns:
        X_transformed: pd.DataFrame
            Copy of the feature matrix with skewed features transformed.
        transformer: PowerTransformer
            PowerTransformer instance. It is fitted only when at least one
            skewed feature was found; otherwise it is returned unfitted.
        skewed_feature_names: List[str]
            Names of the skewed features that were transformed.
        lambdas: np.ndarray or List[float]
            The transformer's ``lambdas_`` for the transformed features, or an
            empty list when no feature was transformed.
    """
    X = X.copy()
    pt = PowerTransformer(method="yeo-johnson", standardize=False)

    # NOTE(review): with scipy's default nan_policy a column containing NaN
    # presumably yields a NaN skewness and is then never selected here --
    # confirm that imputation runs before this step.
    skewed_feature_names = [
        feature for feature in feature_names if abs(skew(X[feature])) > threshold
    ]

    lambdas = []
    if skewed_feature_names:
        X[skewed_feature_names] = pt.fit_transform(X[skewed_feature_names])
        lambdas = pt.lambdas_

    return X, pt, skewed_feature_names, lambdas




[docs]
def handle_missing_numeric(
    X: pd.DataFrame,
    feature_names: List[str],
    strategy: Literal["iterative", "knn", "mean", "median", "constant"],
) -> Tuple[pd.DataFrame, object]:
    """
    Handles missing numeric values in the dataset using specified imputation strategy.

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        feature_names: List[str]
            Names of the numeric features in the dataset.
        strategy: Literal["iterative", "knn", "mean", "median", "constant"]
            Imputation strategy to use; must be a key of
            ``settings.imputer_mapping``.

    Returns:
        X_imputed: pd.DataFrame
            Copy of the feature matrix with missing numeric values imputed.
        imputer: object
            Fitted imputer model.

    Raises:
        ValueError
            If ``strategy`` is not a key of ``settings.imputer_mapping``.
    """
    if strategy not in settings.imputer_mapping:
        raise ValueError(
            f"Invalid imputation strategy, choose from {settings.imputer_mapping.keys()}."
        )

    X = X.copy()
    # The mapping holds imputer factories; instantiate with their defaults.
    imputer = settings.imputer_mapping[strategy]()
    imputer.fit(X[feature_names])
    X[feature_names] = imputer.transform(X[feature_names])

    return X, imputer




[docs]
def handle_missing_categorical(
    X: pd.DataFrame,
    feature_names: List[str],
    strategy: Literal["most_frequent", "constant"],
) -> Tuple[pd.DataFrame, SimpleImputer]:
    """
    Handles missing categorical values in the dataset using specified imputation strategy.

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        feature_names: List[str]
            Names of the categorical features in the dataset.
        strategy: Literal["most_frequent", "constant"]
            Imputation strategy to use. It is passed straight to
            ``SimpleImputer``; no validation is performed here.

    Returns:
        X_imputed: pd.DataFrame
            Copy of the feature matrix with missing categorical values imputed.
        imputer: SimpleImputer
            Fitted SimpleImputer model.
    """
    X = X.copy()
    imputer = SimpleImputer(strategy=strategy)
    imputer.fit(X[feature_names])
    X[feature_names] = imputer.transform(X[feature_names])

    return X, imputer




[docs]
def handle_categorical(
    X: pd.DataFrame, feature_names: List[str]
) -> Tuple[pd.DataFrame, OneHotEncoder, np.ndarray]:
    """
    Handles categorical features in the dataset using OneHotEncoder.

    The encoder is configured with ``drop="first"``, ``handle_unknown="ignore"``
    and dense output. The original categorical columns are removed and the
    encoded columns are joined back on the original index.

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        feature_names: List[str]
            Names of the categorical features in the dataset.

    Returns:
        X_encoded: pd.DataFrame
            Copy of the feature matrix with categorical features encoded.
        encoder: OneHotEncoder
            Fitted OneHotEncoder model.
        ohe_feature_names: np.ndarray
            Names of the one-hot encoded features, as returned by
            ``encoder.get_feature_names_out``.
    """
    X = X.copy()
    encoder = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)
    encoder.fit(X[feature_names])
    encoded_features = encoder.transform(X[feature_names])
    ohe_feature_names = encoder.get_feature_names_out(feature_names)
    # Reuse X's index so the join below aligns rows even for a non-default index.
    encoded_features_df = pd.DataFrame(
        encoded_features,
        columns=ohe_feature_names,
        index=X.index,
    )
    X = X.drop(columns=feature_names).join(encoded_features_df)

    return X, encoder, ohe_feature_names




[docs]
def handle_scaling(
    X: pd.DataFrame, feature_names: List[str], strategy: Literal["standard", "robust"]
) -> Tuple[pd.DataFrame, object]:
    """
    Handles scaling of features in the dataset using specified scaling strategy.

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        feature_names: List[str]
            Names of the features to be scaled.
        strategy: Literal["standard", "robust"]
            Scaling strategy to use; must be a key of
            ``settings.scaler_mapping``.

    Returns:
        X_scaled: pd.DataFrame
            Copy of the feature matrix with scaled features.
        scaler: object
            Fitted scaler instance.

    Raises:
        ValueError
            If ``strategy`` is not a key of ``settings.scaler_mapping``.
    """
    # Validate against the same mapping that the error message advertises and
    # that the lookup below uses, so the three can never disagree.
    if strategy not in settings.scaler_mapping:
        raise ValueError(
            f"Invalid scaling strategy, choose from {settings.scaler_mapping.keys()}."
        )

    X = X.copy()
    scaler = settings.scaler_mapping[strategy]()
    scaler.fit(X[feature_names])
    X[feature_names] = scaler.transform(X[feature_names])

    return X, scaler




[docs]
def handle_selection(
    X: pd.DataFrame, y: pd.Series, threshold: float = 0.05, random_state: int = 42
) -> Tuple[pd.DataFrame, SelectFromModel, List[str], np.ndarray]:
    """
    Handles feature selection using ExtraTreesClassifier.

    An ExtraTreesClassifier is fitted inside a SelectFromModel wrapper, and
    ``threshold`` is passed to that wrapper to decide which columns of ``X``
    are kept.

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        y: pd.Series
            Target array.
        threshold: float
            Threshold for feature selection.
        random_state: int
            Seed for reproducibility.

    Returns:
        X_selected: pd.DataFrame
            Feature matrix with selected features, on the original index.
        selector: SelectFromModel
            Fitted SelectFromModel instance.
        selected_features: List[str]
            Names of the selected features.
        feature_importances: np.ndarray
            Feature importances from the ExtraTreesClassifier, one per column
            of the input ``X`` (not only the selected ones).
    """
    # X is only read below, so no defensive copy is needed.
    selector = SelectFromModel(
        ExtraTreesClassifier(random_state=random_state), threshold=threshold
    )
    selector.fit(X, y)
    X_selected = selector.transform(X)
    selected_features = X.columns[selector.get_support()].tolist()
    X_selected_df = pd.DataFrame(X_selected, columns=selected_features, index=X.index)

    feature_importances = selector.estimator_.feature_importances_

    return X_selected_df, selector, selected_features, feature_importances




[docs]
def handle_extraction(
    X: pd.DataFrame, threshold: float = 0.95, random_state: int = 42
) -> Tuple[np.ndarray, PCA, np.ndarray]:
    """
    Handles feature extraction using PCA.

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        threshold: float
            Value passed to PCA as ``n_components``.
        random_state: int
            Seed for reproducibility.

    Returns:
        X_extracted: np.ndarray
            Feature matrix after PCA transformation.
        extractor: PCA
            Fitted PCA instance.
        loadings: np.ndarray
            The fitted PCA's ``components_``.
    """
    # X is only read below, so no defensive copy is needed.
    extractor = PCA(
        n_components=threshold, svd_solver="full", random_state=random_state
    )
    extractor.fit(X)
    X_extracted = extractor.transform(X)
    loadings = extractor.components_

    return X_extracted, extractor, loadings