"""Visualization helpers of the tree_lab package (module ``tree_lab.Visualization``)."""

import pandas as pd
import matplotlib.pyplot as plot
import warnings
import polars.selectors as cs
import seaborn as sns
import polars as pl
from tree_lab import importing as imp

# Ignore all warnings: this installs a global "ignore" filter, so every
# warning raised after this module is imported is silenced, not only the
# ones coming from this file.
warnings.filterwarnings("ignore")

# Load the dataset once, at import time, through tree_lab.importing.
# NOTE(review): `data` is not used by any function visible in this file;
# confirm whether external code relies on `Visualization.data` before
# touching it.
data = imp.import_data()


# visualisation
def summarize(df, col, kind="Frequency and Relative frequency", dec=2):
    """
    Summarizes the columns selected from the dataframe by printing, and
    returning, the frequency and/or relative frequency of their categories

    Parameter:
        - df: a pandas dataframe
        - col: the columns of the dataframe that the user wants to summarize
          (an iterable of column names; a single column name given as a
          string is also accepted)
        - kind: a string specifying if the frequency and/or the relative
          frequencies should be displayed. The default is "Frequency and
          Relative frequency", but it is also possible to choose "Frequency", or
          "Relative frequency"
        - dec: number of decimals used to round the relative frequencies,
          which are expressed as percentages of the total number of rows
          of df. The default is 2

    Returns:
        a dictionary mapping every selected column name to its frequency
        table (a pandas dataframe). When kind is not one of the supported
        values, "Something went wrong!" is printed and the table is None
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(col, str):
        col = [col]

    # Value columns shown for every supported kind, in display order.
    columns_by_kind = {
        "Frequency": ['Frequency'],
        "Relative frequency": ['Relative frequency'],
        "Frequency and Relative frequency": ['Frequency',
                                             'Relative frequency'],
    }
    wanted = columns_by_kind.get(kind)

    # Create frequency tables for each specified column
    frequency_tables = {}
    for column_name in col:
        value_counts = df[column_name].value_counts()
        # Percentages are relative to all rows of df, rounded to dec digits.
        percentages = ((value_counts / len(df)) * 100).round(dec)

        if wanted is None:
            frequency_table_df = None
            print("Something went wrong!")
        else:
            measures = {'Frequency': value_counts,
                        'Relative frequency': percentages}
            frequency_table_df = pd.DataFrame(
                {name: measures[name] for name in wanted})
            # Turn the category index into a regular, explicitly named column.
            frequency_table_df.reset_index(inplace=True)
            frequency_table_df.columns = [column_name, *wanted]

        frequency_tables[column_name] = frequency_table_df

    # Display the frequency tables
    for column_name, frequency_table_df in frequency_tables.items():
        print(f"\nSummary for {column_name}:\n")
        print(frequency_table_df)

    return frequency_tables


def compute_stats(dataframe, selected_columns):
    """
    Computes mean, standard deviation, minimum, maximum and median for the
    specified columns

    Parameters:
        - dataframe: the dataframe
        - selected_columns: list containing the columns for which we wish
          to have the statistics. Only 'Light_ISF', 'AMF', 'EMF',
          'Phenolics', 'Lignin' and 'NSC' are accepted

    Returns:
         a dataframe containing the statistics of the columns specified in
         input (one row per column, one column per statistic). If any of the
         requested columns is not accepted, an error message is printed and
         None is returned
    """
    valid_columns = ('Light_ISF', 'AMF', 'EMF', 'Phenolics', 'Lignin', 'NSC')

    # Stop at the first unsupported column: report it and return nothing.
    for column in selected_columns:
        if column not in valid_columns:
            print(f"Error: '{column}' is not one of the specified columns.")
            return None

    # Select the columns once and derive every statistic from that subset.
    subset = dataframe[selected_columns]
    return pd.DataFrame({
        'Mean': subset.mean(),
        'Standard Deviation': subset.std(),
        'Minimum': subset.min(),
        'Maximum': subset.max(),
        'Median': subset.median(),
    })


def _status_counts_long(df, group_col, labels):
    """Sum 'Alive'/'Dead' per group and return a long (group, Status, Count) table."""
    totals = df[[group_col, 'Alive', 'Dead']].groupby(group_col).sum()

    # Replace the group index by an ordinary column holding the display
    # labels. groupby sorts its keys, so the labels must be listed in that
    # same sorted order.
    # NOTE(review): the labels are hard-coded by the caller; a dataframe with
    # a different number of groups makes insert() raise ValueError - confirm
    # whether the labels should be taken from the data instead.
    totals = totals.reset_index(drop=True)
    totals.insert(0, group_col, labels)

    # Long format: one row per (group, status) pair, as seaborn expects.
    return totals.melt(id_vars=group_col, value_vars=['Alive', 'Dead'],
                       var_name="Status", value_name="Count")


def _show_status_bars(counts, group_col, palette, x_label, title):
    """Draw, annotate and show a grouped bar chart of Count by group and Status."""
    sns.set_theme(style="whitegrid")

    g = sns.catplot(
        data=counts, kind="bar",
        x=group_col, y="Count", hue="Status",
        errorbar="sd", palette=palette, alpha=.6, height=6
    )
    g.despine(left=True)
    g.set_axis_labels(x_label, "Count")
    plot.subplots_adjust(top=0.9)  # leave space for the title
    g.fig.suptitle(title)

    # Write the count above every bar.
    for p in g.ax.patches:
        g.ax.annotate(f'{p.get_height():.0f}',
                      (p.get_x() + p.get_width() / 2., p.get_height()),
                      ha='center', va='center', xytext=(0, 10),
                      textcoords='offset points')
    plot.show()


def bar_plot(df, kind):
    """
    Generate different types of bar charts based on the specified kind parameter.

    Parameters:
        - df (pandas DataFrame): Input DataFrame containing relevant data.
        - kind (str): Type of bar chart to generate. Options:
          "Species_vs_Status", "Species_vs_field", "Light level vs status".

    Returns:
        The plots are displayed using the 'plot.show()' method.

    Raises:
        ValueError: if kind is not one of the supported options.

    Notes:
        - For "Species_vs_Status", the function generates a bar plot showing the
          count of alive and dead instances for each species (columns
          'Species', 'Alive' and 'Dead' of df).
        - For "Species_vs_field", the function creates a stacked bar chart
          representing the count of each species in different fields (columns
          'Species' and 'Plot' of df).
        - For "Light level vs status", a bar plot is generated to display the
          count of alive and dead instances for each light level category
          (columns 'Light_Cat', 'Alive' and 'Dead' of df).

    The function utilizes seaborn and matplotlib for visualization
    """

    if kind == "Species_vs_Status":
        species_names = ["Acer saccharum", "Prunus serotina", "Quercus alba",
                         "Quercus rubra"]
        counts = _status_counts_long(df, 'Species', species_names)
        _show_status_bars(counts, "Species", palette="dark",
                          x_label="Species",
                          title="Bar Chart: Species vs Status")

    elif kind == "Species_vs_field":
        # Rows: fields ('Plot'); columns: species; cells: number of records.
        contingency_table = pd.crosstab(df['Plot'], df['Species'])

        # Plotting the bar chart
        ax = contingency_table.plot(kind='bar', stacked=True, figsize=(10, 6))
        ax.set_xlabel('Field')
        ax.set_ylabel('Count')
        ax.set_title('Count of Species for Each Field')
        plot.legend(title='Species', bbox_to_anchor=(1.05, 1), loc='upper left')
        plot.tight_layout()  # Adjust layout to make space for the legend
        plot.show()

    elif kind == "Light level vs status":
        light_levels = ["High", "Low", "Med"]
        counts = _status_counts_long(df, 'Light_Cat', light_levels)
        _show_status_bars(counts, "Light_Cat", palette="pink",
                          x_label="Light level",
                          title="Bar Chart: light level vs status")

    else:
        # Fail loudly instead of silently drawing nothing on a typo.
        raise ValueError(
            f"Unknown kind '{kind}'. Expected one of: 'Species_vs_Status', "
            "'Species_vs_field', 'Light level vs status'.")


def scatter_plot(df, column_x, column_y, hue_column, title):
    """
    This function creates a scatter plot for the specified columns in the DataFrame

    Parameters:
        - df: a dataframe
        - column_x: a string specifying the name of a numerical variable of df
        - column_y: a string specifying the name of a numerical variable of df
        - hue_column: allows to assign a categorical variable to the data points
          and represent it using different colours (it also selects the
          marker style)
        - title: a string specifying the title of the plot

    Returns:
        The plots are displayed using the 'plot.show()' method.

    Raises:
        ValueError: if column_x or column_y does not contain numerical data.
    """

    # Input validation comes first, so a rejected call leaves the global
    # seaborn theme untouched.
    for column in (column_x, column_y):
        if not pd.api.types.is_numeric_dtype(df[column]):
            raise ValueError(
                f"The column '{column}' must contain numerical data.")

    sns.set_theme(style="whitegrid")

    # Create the scatter plot with hue and style based on the specified column
    plot.figure(figsize=(10, 6))
    sns.scatterplot(x=column_x, y=column_y, hue=hue_column, style=hue_column,
                    data=df, palette="viridis", markers=True)

    # Set labels and title
    plot.xlabel(column_x)
    plot.ylabel(column_y)
    plot.title(title)

    # Add legend, placed outside the axes on the right
    plot.legend(title=hue_column, bbox_to_anchor=(1.05, 1), loc='upper left')

    # Adjust layout to make space for the legend
    plot.tight_layout()

    # Show the plot
    plot.show()