Source code for mamut.preprocessing.handlers

from typing import List, Literal

import numpy as np
import pandas as pd
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from scipy.stats import skew
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier, IsolationForest
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer

import mamut.preprocessing.settings as settings


def handle_outliers(
    X: pd.DataFrame,
    y: pd.Series,
    feature_names: List[str],
    contamination: float = 0.01,
    random_state: int = 42,
) -> (pd.DataFrame, pd.Series, IsolationForest):
    """
    Removes the rows that an IsolationForest flags as outliers.

    The forest is fitted only on the columns listed in ``feature_names``;
    the returned frame still contains every column of ``X``.

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        y: pd.Series
            Target array, row-aligned with ``X``.
        feature_names: List[str]
            Columns of ``X`` used to detect outliers.
        contamination: float
            The proportion of outliers in the data.
        random_state: int
            Seed for reproducibility.

    Returns:
        X_filtered: pd.DataFrame
            Feature matrix with outliers removed.
        y_filtered: pd.Series
            Target array with outliers removed.
        transformer: IsolationForest
            Fitted IsolationForest model.
    """
    X = X.copy()
    y = y.copy()

    detector = IsolationForest(
        contamination=contamination, random_state=random_state
    )
    labels = detector.fit_predict(X[feature_names])

    # Rows labelled 1 are kept; everything else is treated as an outlier.
    keep = labels == 1
    X_filtered = X[keep]
    y_filtered = y[keep]
    return X_filtered, y_filtered, detector
def handle_imbalanced(
    X: pd.DataFrame,
    y: pd.Series,
    strategy: Literal["SMOTE", "undersample", "combine"],
    random_state=42,
) -> (pd.DataFrame, pd.Series, object):
    """
    Rebalances the classes of a dataset with an imbalanced-learn resampler.

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        y: pd.Series
            Target array.
        strategy: Literal["SMOTE", "undersample", "combine"]
            Resampling method to use. Options:
            - 'SMOTE': Synthetic Minority Oversampling Technique.
            - 'undersample': Random undersampling of majority class.
            - 'combine': SMOTE with Tomek links.
        random_state: int
            Seed for reproducibility.

    Returns:
        X_resampled: pd.DataFrame
            Feature matrix after resampling.
        y_resampled: pd.Series
            Target array after resampling.
        transformer: object
            Fitted resampling method instance, or None when a SMOTE-based
            strategy was requested but the smallest class is too small to
            resample (the inputs are then returned untouched).

    Raises:
        ValueError
            If ``strategy`` is not a key of ``settings.resampler_mapping``.
    """
    if strategy not in settings.resampler_mapping.keys():
        raise ValueError(
            f"Invalid resampling strategy, choose from {settings.resampler_mapping.keys()}."
        )

    target = y if isinstance(y, pd.Series) else pd.Series(y)
    smallest_class_size = target.value_counts().min()

    if strategy not in {"SMOTE", "combine"}:
        # Every other strategy is built straight from the settings registry.
        resampler = settings.resampler_mapping[strategy](random_state=random_state)
    else:
        # SMOTE needs at least one neighbour inside the minority class, so the
        # neighbour count is capped at (minority size - 1), with 5 as the
        # upper bound.
        n_neighbors = min(5, smallest_class_size - 1)
        if n_neighbors < 1:
            return X, y, None

        oversampler = SMOTE(random_state=random_state, k_neighbors=n_neighbors)
        resampler = (
            oversampler
            if strategy == "SMOTE"
            else SMOTETomek(random_state=random_state, smote=oversampler)
        )

    X_resampled, y_resampled = resampler.fit_resample(X, y)
    return X_resampled, y_resampled, resampler
def handle_skewed(
    X: pd.DataFrame, feature_names: List[str], threshold: float = 1
) -> (pd.DataFrame, PowerTransformer, List[str], np.ndarray):
    """
    Handles skewed features in the dataset using PowerTransformer.

    A feature is treated as skewed when the absolute value of its sample
    skewness is strictly greater than ``threshold``. All skewed features are
    transformed together with a single Yeo-Johnson PowerTransformer
    (``standardize=False``).

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        feature_names: List[str]
            Names of the features in the dataset to be checked for skewness.
        threshold: float
            Threshold for skewness.

    Returns:
        X_transformed: pd.DataFrame
            Copy of the feature matrix with skewed features transformed.
        transformer: PowerTransformer
            The PowerTransformer instance. It is fitted only when at least
            one skewed feature was found.
        skewed_feature_names: List[str]
            Names of the skewed features that were transformed.
        lambdas: np.ndarray
            Lambda values for the transformed features (``pt.lambdas_``), or
            an empty list when no feature exceeded the threshold.
    """
    X = X.copy()
    pt = PowerTransformer(method="yeo-johnson", standardize=False)

    # A NaN skewness never compares greater than the threshold, so such
    # features are left untouched.
    skewed_feature_names = [
        feature
        for feature in feature_names
        if abs(skew(X[feature])) > threshold
    ]

    lambdas = []
    if skewed_feature_names:
        X[skewed_feature_names] = pt.fit_transform(X[skewed_feature_names])
        lambdas = pt.lambdas_

    return X, pt, skewed_feature_names, lambdas
def handle_missing_numeric(
    X: pd.DataFrame,
    feature_names: List[str],
    strategy: Literal["iterative", "knn", "mean", "median", "constant"],
) -> (pd.DataFrame, object):
    """
    Fills missing values of numeric features with the chosen imputer.

    The imputer class is looked up in ``settings.imputer_mapping`` and
    instantiated with its default arguments.

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        feature_names: List[str]
            Names of the numeric features in the dataset.
        strategy: Literal["iterative", "knn", "mean", "median", "constant"]
            Imputation strategy to use.

    Returns:
        X_imputed: pd.DataFrame
            Copy of the feature matrix with missing numeric values imputed.
        imputer: object
            Fitted imputer model.

    Raises:
        ValueError
            If ``strategy`` is not a key of ``settings.imputer_mapping``.
    """
    if strategy not in settings.imputer_mapping.keys():
        raise ValueError(
            f"Invalid imputation strategy, choose from {settings.imputer_mapping.keys()}."
        )

    imputer_factory = settings.imputer_mapping[strategy]

    X = X.copy()
    imputer = imputer_factory()

    numeric_block = X[feature_names]
    imputer.fit(numeric_block)
    # Transform the original (still un-imputed) columns and write them back.
    X[feature_names] = imputer.transform(X[feature_names])
    return X, imputer
def handle_missing_categorical(
    X: pd.DataFrame,
    feature_names: List[str],
    strategy: Literal["most_frequent", "constant"],
) -> (pd.DataFrame, SimpleImputer):
    """
    Fills missing values of categorical features with a SimpleImputer.

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        feature_names: List[str]
            Names of the categorical features in the dataset.
        strategy: Literal["most_frequent", "constant"]
            Imputation strategy, forwarded unchanged to SimpleImputer.

    Returns:
        X_imputed: pd.DataFrame
            Copy of the feature matrix with missing categorical values
            imputed.
        imputer: SimpleImputer
            Fitted SimpleImputer model.
    """
    imputer = SimpleImputer(strategy=strategy)

    X = X.copy()
    categorical_block = X[feature_names]
    imputer.fit(categorical_block)

    # Only the listed columns are overwritten; all others stay as they were.
    X[feature_names] = imputer.transform(X[feature_names])
    return X, imputer
def handle_categorical(
    X: pd.DataFrame, feature_names: List[str]
) -> (pd.DataFrame, OneHotEncoder, np.ndarray):
    """
    Handles categorical features in the dataset using OneHotEncoder.

    The encoder is configured with ``drop="first"``,
    ``handle_unknown="ignore"`` and dense output. The original categorical
    columns are removed and the encoded columns are appended after the
    remaining ones.

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        feature_names: List[str]
            Names of the categorical features in the dataset.

    Returns:
        X_encoded: pd.DataFrame
            Feature matrix with categorical features encoded. It has the same
            rows and index as ``X``.
        encoder: OneHotEncoder
            Fitted OneHotEncoder model.
        ohe_feature_names: np.ndarray
            Names of the one-hot encoded features, as returned by
            ``encoder.get_feature_names_out``.
    """
    X = X.copy()

    encoder = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)
    encoder.fit(X[feature_names])
    encoded_features = encoder.transform(X[feature_names])
    ohe_feature_names = encoder.get_feature_names_out(feature_names)

    encoded_features_df = pd.DataFrame(
        encoded_features,
        columns=ohe_feature_names,
        index=X.index,
    )

    # Both frames carry the very same index, so a column-wise concat lines
    # them up row by row. An index join would instead match labels and
    # multiply rows whenever X.index contains duplicate labels.
    X = pd.concat([X.drop(columns=feature_names), encoded_features_df], axis=1)
    return X, encoder, ohe_feature_names
def handle_scaling(
    X: pd.DataFrame, feature_names: List[str], strategy: Literal["standard", "robust"]
) -> (pd.DataFrame, object):
    """
    Handles scaling of features in the dataset using specified scaling strategy.

    The scaler class is looked up in ``settings.scaler_mapping`` and
    instantiated with its default arguments. The strategy is validated
    against that same mapping, so the accepted values always match the ones
    listed in the error message.

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        feature_names: List[str]
            Names of the features to be scaled.
        strategy: Literal["standard", "robust"]
            Scaling strategy to use.

    Returns:
        X_scaled: pd.DataFrame
            Copy of the feature matrix with scaled features.
        scaler: object
            Fitted scaler instance.

    Raises:
        ValueError
            If ``strategy`` is not a key of ``settings.scaler_mapping``.
    """
    if strategy not in settings.scaler_mapping:
        raise ValueError(
            f"Invalid scaling strategy, choose from {settings.scaler_mapping.keys()}."
        )

    X = X.copy()
    scaler = settings.scaler_mapping[strategy]()
    scaler.fit(X[feature_names])
    X[feature_names] = scaler.transform(X[feature_names])
    return X, scaler
def handle_selection(
    X: pd.DataFrame, y: pd.Series, threshold: float = 0.05, random_state: int = 42
) -> (pd.DataFrame, SelectFromModel, List[str], np.ndarray):
    """
    Handles feature selection using ExtraTreesClassifier.

    An ExtraTreesClassifier is wrapped in SelectFromModel and fitted on the
    whole feature matrix; ``threshold`` is passed to SelectFromModel as the
    importance cut-off.

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        y: pd.Series
            Target array.
        threshold: float
            Threshold for feature selection.
        random_state: int
            Seed for reproducibility.

    Returns:
        X_selected: pd.DataFrame
            Feature matrix with selected features, keeping the index of ``X``.
        selector: SelectFromModel
            Fitted SelectFromModel instance.
        selected_features: List[str]
            Names of the selected features.
        feature_importances: np.ndarray
            Feature importances from the ExtraTreesClassifier, one per column
            of ``X``.
    """
    X = X.copy()

    selector = SelectFromModel(
        ExtraTreesClassifier(random_state=random_state), threshold=threshold
    )
    selector.fit(X, y)

    # The support mask tells which of X's columns survived the threshold.
    selected_features = X.columns[selector.get_support()].tolist()
    X_selected_df = pd.DataFrame(
        selector.transform(X), columns=selected_features, index=X.index
    )

    feature_importances = selector.estimator_.feature_importances_
    return X_selected_df, selector, selected_features, feature_importances
def handle_extraction(
    X: pd.DataFrame, threshold: float = 0.95, random_state: int = 42
) -> (np.ndarray, PCA, np.ndarray):
    """
    Handles feature extraction using PCA.

    ``threshold`` is passed to PCA as ``n_components`` together with
    ``svd_solver="full"``.

    Parameters:
        X: pd.DataFrame
            Feature matrix.
        threshold: float
            Threshold for PCA (forwarded as ``n_components``).
        random_state: int
            Seed for reproducibility.

    Returns:
        X_extracted: np.ndarray
            Feature matrix after PCA transformation.
        extractor: PCA
            Fitted PCA instance.
        loadings: np.ndarray
            Loadings of the PCA components (``extractor.components_``).
    """
    X = X.copy()

    extractor = PCA(
        n_components=threshold, svd_solver="full", random_state=random_state
    )
    extractor.fit(X)

    X_extracted = extractor.transform(X)
    loadings = extractor.components_
    return X_extracted, extractor, loadings