Source code for tree_lab.preprocessing

from sklearn.preprocessing import *
import pandas as pd


class DataPreprocessor:
    """
    A class for preprocessing tabular data.

    The object works on a private copy of the frame given to the
    constructor. Each transformation updates that copy and returns it, so
    calls can be applied one after another.

    Parameters:
    - data (pandas.DataFrame): the input data for preprocessing.
    """

    def __init__(self, data: pd.DataFrame):
        """
        Initializes the DataPreprocessor object.

        Parameters:
        - data (pandas.DataFrame): The input data for preprocessing.
        """
        # Copy so that the caller's frame is never modified by this object.
        self.data = data.copy()

    def normalize_data(self, numeric_columns, scaler_type: str = "normal") -> pd.DataFrame:
        """
        Normalizes the numeric columns of the stored data using the
        specified scaler type. The stored data is updated and returned.

        Parameters:
        - numeric_columns (list): list of column names containing numeric
          data to be normalized. An empty list leaves the data unchanged.
        - scaler_type (str): the type of scaler to be used.
          Options: 'normal' (default, StandardScaler), 'minmax'
          (MinMaxScaler), 'max_absolute' (MaxAbsScaler). Any other value
          falls back to MinMaxScaler.

        Returns:
        pandas.DataFrame: the stored data with the given columns normalized.

        Raises:
        ValueError, if one of the specified columns is not numeric or
        contains NA values.
        """
        # Nothing to scale: avoid fitting a scaler on a zero-column frame.
        if len(numeric_columns) == 0:
            return self.data

        numeric_data = self.data[numeric_columns]
        for col in numeric_columns:
            # Values that cannot be parsed as numbers (and existing NA
            # values) become NaN under errors="coerce", so a single NaN
            # means the column is not safe to scale.
            coerced = pd.to_numeric(numeric_data[col], errors="coerce")
            if not coerced.notna().all():
                raise ValueError(
                    f"'{col}' is not a numeric column! "
                    f"It is either categorical or contains n/a values! "
                    f"Only numeric columns can be normalized!"
                )

        # The scaler classes are provided by the module-level
        # `from sklearn.preprocessing import *`.
        if scaler_type == "normal":
            scaler = StandardScaler()
        elif scaler_type == "max_absolute":
            scaler = MaxAbsScaler()
        else:
            # 'minmax' and, for backward compatibility, any unrecognized
            # scaler_type.
            scaler = MinMaxScaler()

        self.data[numeric_columns] = scaler.fit_transform(numeric_data)
        return self.data

    @staticmethod
    def _is_categorical(series) -> bool:
        """Return True if the column's dtype is object, string or category."""
        dtype = series.dtype
        return bool(
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    def onehot_encode(self, columns, keep_original: bool = True) -> pd.DataFrame:
        """
        Performs one-hot encoding on specified columns of the stored data.
        The stored data is replaced by the encoded frame, which is returned.

        Parameters:
        - columns (list): list of column names containing categorical data
          to be one-hot encoded.
        - keep_original (bool): if True, keeps the original columns in
          addition to the one-hot encoded columns (they are appended after
          the encoded columns).

        Returns:
        pandas.DataFrame: the one-hot encoded data.

        Raises:
        ValueError, if one of the specified columns is not of a categorical
        dtype (object, string or category).
        """
        # Validate every column before touching the stored data, so a
        # failure leaves it unchanged.
        for col in columns:
            if not self._is_categorical(self.data[col]):
                raise ValueError(
                    f"'{col}' is not a categorical column! "
                    f"Only columns of dtype object, string or category "
                    f"can be one-hot encoded!"
                )

        encoded = pd.get_dummies(self.data, columns=columns)
        if keep_original:
            # get_dummies drops the encoded source columns; add them back.
            encoded = pd.concat([encoded, self.data[columns]], axis=1)

        self.data = encoded
        return self.data

    def display(self) -> pd.DataFrame:
        """
        Returns the current state of the data.

        Returns:
        pandas.DataFrame: the stored data (the same object, not a copy).
        """
        return self.data