Source code for tree_lab.Cleaning
class TreeDataCleaner:
    """
    Cleaning helper that wraps a private copy of a tree dataset.

    Every method operates on ``self.data``; the DataFrame passed to the
    constructor is never modified.

    Parameters:
    - data: pandas.DataFrame
    """

    def __init__(self, data):
        """
        Initializes a TreeDataCleaner object.

        Parameters:
        - data: pandas.DataFrame. It is copied so that later cleaning steps
          do not alter the caller's object.
        """
        self.data = data.copy()

    def detect_na(self):
        """
        Reports columns containing null values and removes duplicate rows.

        Prints the names of the columns that hold at least one null value
        (nothing is printed when there are none), then replaces ``self.data``
        with a version in which fully duplicated rows have been dropped.
        """
        # Boolean Series indexed by column name: True where the column has a null.
        has_null = self.data.isnull().any()
        null_columns = has_null[has_null].index.tolist()
        if null_columns:
            print("Columns with null values:")
            print(null_columns)
        # Remove duplicate rows in the dataset
        self.data = self.data.drop_duplicates()

    def impute_na(self):
        """
        Imputes missing values in the 'EMF' column by filling them with the mean of the column.
        Fills any remaining null values in the dataset with 0.

        Raises:
            KeyError: if the dataset has no 'EMF' column.
        """
        # Looking the column up first keeps the KeyError for a missing 'EMF'
        # column (a dict passed to fillna would silently ignore an unknown key).
        emf_mean = self.data['EMF'].mean()
        # Assign the result back instead of calling
        # ``self.data['EMF'].fillna(..., inplace=True)``: that chained in-place
        # call acts on a temporary under pandas Copy-on-Write, which would leave
        # 'EMF' untouched and let the blanket fill below turn its gaps into 0.
        self.data = self.data.fillna({'EMF': emf_mean})
        self.data = self.data.fillna(0)

    def modify_status(self):
        """
        Modifies the 'Alive' column by replacing 'X' with 1.
        Renames the 'Event' column to 'Dead'.

        Returns:
            pandas.DataFrame: The modified DataFrame.
        """
        self.data['Alive'] = self.data['Alive'].replace('X', 1)
        self.data = self.data.rename(columns={'Event': 'Dead'})
        return self.data

    def del_cols(self, columns_to_delete):
        """
        Function to allow users to delete specified columns.

        Parameters:
        - columns_to_delete: a list of column names to be deleted. A single
          column name given as a string is also accepted.

        Columns that are not present are reported with a printed message and
        otherwise ignored.

        Example:
            cleaner.del_cols(['Age', 'Height'])
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(columns_to_delete, str):
            columns_to_delete = [columns_to_delete]
        for column_name in columns_to_delete:
            if column_name in self.data.columns:
                del self.data[column_name]
            else:
                print(f"Column '{column_name}' either already deleted or not found in the dataset.")

    def display(self):
        """
        It displays the current state of the data.

        Returns:
            pandas.DataFrame: the cleaner's working DataFrame (not a copy).
        """
        return self.data