Source code for tree_lab.preprocessing
from sklearn.preprocessing import *
import pandas as pd
class DataPreprocessor:
    """
    A class for preprocessing tabular data.

    The instance works on a private copy of the frame passed to the
    constructor; every transformation updates that copy in place and
    returns it.

    Parameters:
    - data: pandas.DataFrame
    """

    def __init__(self, data):
        """
        Initializes the DataPreprocessor object.

        Parameters:
        - data (pandas.DataFrame): The input data for preprocessing.
          It is copied, so the caller's frame is never modified.
        """
        self.data = data.copy()

    def normalize_data(self, numeric_columns, scaler_type="normal"):
        """
        Normalizes the numeric columns of the input data using the specified
        scaler type.

        Parameters:
        - numeric_columns (list): list of column names containing numeric
          data to be normalized. An empty list is a no-op.
        - scaler_type (str): the type of scaler to be used. Options:
          'normal' (default, StandardScaler), 'minmax' (MinMaxScaler),
          'max_absolute' (MaxAbsScaler). Any other value falls back to
          MinMaxScaler.

        Returns:
        pandas.DataFrame: the stored data. If any requested column is not
        fully numeric (it cannot be coerced to numbers or it contains NA
        values), a message is printed and the data is returned unchanged;
        no exception is raised.
        """
        # Nothing to scale: fitting a scaler on a zero-column frame would
        # fail inside scikit-learn, so return early instead.
        if len(numeric_columns) == 0:
            return self.data

        numeric_data = self.data[numeric_columns]
        for col in numeric_columns:
            # Values that cannot be parsed become NaN under 'coerce', so one
            # notnull() check catches both non-numeric and missing entries.
            coerced = pd.to_numeric(numeric_data[col], errors="coerce")
            if not coerced.notnull().all():
                print(
                    f"'{col}' is not a numeric column! "
                    f"It is either categorical or contains n/a values! "
                    f"Only numeric columns can be normalized!")
                return self.data

        # Scaler classes are provided by the module-level
        # `from sklearn.preprocessing import *`.
        scalers = {
            "normal": StandardScaler,
            "minmax": MinMaxScaler,
            "max_absolute": MaxAbsScaler,
        }
        # Unknown scaler names deliberately keep the historical fallback.
        scaler = scalers.get(scaler_type, MinMaxScaler)()
        self.data[numeric_columns] = scaler.fit_transform(numeric_data)
        return self.data

    @staticmethod
    def _is_categorical_like(series):
        """Return True if the column holds object, string or category data."""
        dtype = series.dtype
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    def onehot_encode(self, columns, keep_original=True):
        """
        Performs one-hot encoding on specified columns of the input data.

        Parameters:
        - columns (list): list of column names containing categorical data
          to be one-hot encoded. Columns of dtype 'object', pandas string
          dtype and 'category' are accepted.
        - keep_original (bool): if True, keeps the original columns in
          addition to the one-hot encoded columns (they are appended after
          the encoded columns).

        Returns:
        pandas.DataFrame: the one-hot encoded data. If any requested column
        is not categorical-like, a message is printed and the data is
        returned unchanged; no exception is raised.
        """
        for col in columns:
            if not self._is_categorical_like(self.data[col]):
                print("One of the inputted columns is maybe not an Object!")
                return self.data

        onehot_data = pd.get_dummies(self.data, columns=columns)
        if keep_original:
            # get_dummies drops the encoded source columns; add them back.
            onehot_data = pd.concat([onehot_data, self.data[columns]], axis=1)
        self.data = onehot_data
        return self.data

    def display(self):
        """
        It displays the current state of the data.

        Returns:
        pandas.DataFrame: the stored frame itself (not a copy).
        """
        return self.data