import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from itertools import product
from htmltools import TagList, tags
from shiny import ui
def overall_quality_fx(avg_prop_good):
"""
Determines the overall quality level based on the average proportion of 'good' data.
Parameters
----------
avg_prop_good : float
The average proportion (percentage) of 'good' quality data across all metrics.
Returns
-------
str
A string representing the overall quality level.
Possible values are "Outstanding", "Good", "Requires Improvement", or "Inadequate" with corresponding colours for background and text.
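
    Examples
    --------
    An illustrative call; 85.5 falls in the 80-90 band, so the "Good" triple is returned:

    >>> overall_quality_fx(85.5)
    ['Good', '#b2df8a', '#33a02c']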
"""
if avg_prop_good > 90:
return ["Outstanding", "#a6cee3", "#1f78b4"] # Light blue for Outstanding
elif avg_prop_good >= 80:
return ["Good", "#b2df8a", "#33a02c"] # Green for Good
elif avg_prop_good >= 60:
return ["Requires Improvement", "#fdbf6f", "#ff7f00"] # Amber for Requires Improvement
else:
return ["Inadequate", "#fb9a99", "#e31a1c"] # Red for Inadequate
class DonutChartGenerator:
"""
A class for generating donut charts to visualise data quality metrics.
Attributes
----------
data : pandas.DataFrame
The data containing quality metrics to be visualised.
Methods
-------
plot_donut_charts()
Generates a subplot of donut charts for each quality metric in the data.
Returns
-------
plotly.graph_objs._figure.Figure
A Plotly Figure object containing the subplot of donut charts.
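
    Examples
    --------
    A minimal sketch, assuming `result_df` is the output of `MetricCalculator`
    with `Colour_Good` and `Colour_Bad` columns already added:

    >>> fig = DonutChartGenerator(result_df).plot_donut_charts()
    >>> fig.show()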
"""
def __init__(self, data):
# """
# Initialises the DonutChartGenerator with data.
# Parameters
# ----------
# data : pandas.DataFrame
# The data containing quality metrics to be visualised.
# """
self.data = data
def plot_donut_charts(self):
# """
# Generates and returns a subplot of donut charts for each unique quality metric in the data.
# Returns
# -------
# plotly.graph_objs._figure.Figure
# A Plotly Figure object containing the subplot of donut charts.
# """
        # One donut per unique metric, laid out in a single row of 'domain' subplots
        metrics = self.data['Metric'].unique()
fig = make_subplots(
rows=1,
cols=len(metrics),
specs=[[{'type': 'domain'}] * len(metrics)],
subplot_titles=metrics
)
        for i, metric in enumerate(metrics, start=1):
            # Keep only rows for this metric where no data is missing
            metric_data = self.data[(self.data['Metric'] == metric) & (self.data['Prop_NA'] == 0)]
            avg_prop_good = round(metric_data['Prop_Good'].mean(), 2)
            avg_prop_bad = round(metric_data['Prop_Bad'].mean(), 2)
            # Take the bad/good colour pair from the first row for this metric
            marker_vals = dict(colors=metric_data[['Colour_Bad', 'Colour_Good']].iloc[0].tolist())
fig.add_trace(
go.Pie(
                    labels=['Bad', 'Good'],
                    values=[avg_prop_bad, avg_prop_good],
                    title=dict(text=f"{avg_prop_good}%", font=dict(size=22)),
hole=0.6,
textposition="none",
showlegend=False,
marker=marker_vals
),
row=1, col=i
)
        fig.update_traces(
            showlegend=False,
            hoverinfo="label+value"
        )
        # Make subplot titles bold, uppercase, and larger
        for ann in fig.layout.annotations:
            ann.font = dict(size=16, color="black", family="Arial")
            ann.text = f"<b>{ann.text.upper()}</b>"
return fig
class BarPlotGenerator:
"""
A class for generating bar plots to visualise data quality metrics for a chosen metric.
Attributes
----------
data : pandas.DataFrame
The data containing quality metrics to be visualised.
chosen_metric : str
The metric for which to generate the bar plot.
Methods
-------
plot_bar()
Generates a bar plot for the chosen metric.
Returns
-------
plotly.graph_objs._figure.Figure
A Plotly Figure object containing the bar plot.
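
    Examples
    --------
    A minimal sketch, assuming `result_df` additionally carries a `Colour_NA` column:

    >>> fig = BarPlotGenerator(result_df, 'completeness').plot_bar()
    >>> fig.show()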
"""
def __init__(self, data, chosen_metric):
# """
# Initialises the BarPlotGenerator with data and the chosen metric.
# Parameters
# ----------
# data : pandas.DataFrame
# The data containing quality metrics to be visualised.
# chosen_metric : str
# The metric for which to generate the bar plot.
# """
self.data = data
self.chosen_metric = chosen_metric
def plot_bar(self):
# """
# Generates and returns a bar plot for the chosen quality metric in the data.
# Returns
# -------
# plotly.graph_objs._figure.Figure
# A Plotly Figure object containing the bar plot.
# """
        # Keep rows for the chosen metric whose field is not entirely missing
        metric_data = self.data[(self.data['Metric'] == self.chosen_metric) & (self.data['Prop_NA'] != 100)].copy()
# Sort the filtered data
metric_data.sort_values(by=['Prop_Good', 'Field'], inplace=True)
        # Build the figure: one horizontal bar trace per category, stacked
        fig = go.Figure()
        for name, prop_col, colour_col in [
            ('Good', 'Prop_Good', 'Colour_Good'),
            ('Bad', 'Prop_Bad', 'Colour_Bad'),
            ('NA', 'Prop_NA', 'Colour_NA'),
        ]:
            fig.add_trace(
                go.Bar(
                    name=name,
                    x=metric_data[prop_col],
                    y=metric_data['Field'],
                    orientation='h',
                    marker_color=metric_data[colour_col]
                )
            )
        fig.update_layout(barmode='stack')
        fig.update_traces(
            showlegend=False,
            hoverinfo="name+x"
        )
return fig
class MetricCalculator:
"""
A class designed to calculate and compile data quality metrics from a provided dataset.
Attributes
----------
data : pandas.DataFrame
The input dataset containing various quality metrics and fields.
result : pandas.DataFrame
A DataFrame initialised to store the calculated metrics, including counts and proportions of good, bad, and N/A data.
Methods
-------
calculate_metrics()
Calculates aggregate metrics for each field and metric combination present in the input data, updating the `result` attribute.
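
    Examples
    --------
    An illustrative run on a single flag column following the
    '<metric>_count_|_<field>' naming convention (1 marks a failing record):

    >>> flags = pd.DataFrame({'completeness_count_|_age': [0, 1, 0, 0]})
    >>> calc = MetricCalculator(flags)
    >>> calc.calculate_metrics()
    >>> calc.result[['Field', 'Metric', 'Prop_Bad', 'Prop_Good']].values.tolist()
    [['age', 'completeness', 25.0, 75.0]]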
"""
def __init__(self, data):
# """
# Initialises the MetricCalculator with the given dataset.
# Parameters
# ----------
# data : pandas.DataFrame
# The input dataset from which data quality metrics will be calculated.
# """
self.data = data
self.result = pd.DataFrame()
def calculate_metrics(self):
# """
# Processes the input dataset to calculate aggregate metrics for each unique field and metric combination.
# This method populates the `result` DataFrame with each field-metric combination's total count, and the proportion of good, bad, and N/A data. It relies on the naming convention in the dataset columns to identify and separate fields and metrics.
# The resulting DataFrame is structured to provide a comprehensive overview of data quality across multiple dimensions.
# """
        # Extract unique fields and metrics from the dataset's column names
        fields = set()
        metrics = set()
        for column in self.data.columns:
            parts = column.split('_count_|_')
            if len(parts) != 2:
                # Ignore columns that do not follow the '<metric>_count_|_<field>' convention
                continue
            metric, field = parts
            fields.add(field)
            metrics.add(metric)
# Initialise the result DataFrame with combinations of fields and metrics
field_metric_pairs = list(product(fields, metrics))
field_list, metric_list = zip(*field_metric_pairs)
self.result['Field'] = field_list
self.result['Metric'] = metric_list
self.result['Count'] = 0
self.result['Prop_Bad'] = 0.0
self.result['Prop_Good'] = 0.0
self.result['Prop_NA'] = 0.0
# Fill in data quality metrics for each field-metric pair
        for index, row in self.result.iterrows():
            field = row['Field']
            metric = row['Metric']
            column_name = f'{metric}_count_|_{field}'
            if column_name in self.data.columns:
                # Number of rows where this check could not be evaluated
                na_count = len(self.data) - self.data[column_name].count()
                if na_count == len(self.data):
                    self.result.at[index, column_name] = np.nan
                else:
                    count = self.data[column_name].sum()
                    prop_bad = (count / len(self.data)) * 100
                    self.result.at[index, column_name] = count  # transient column, dropped below
                    self.result.at[index, 'Count'] += count
                    self.result.at[index, 'Prop_Bad'] += prop_bad
                    self.result.at[index, 'Prop_Good'] += 100 - prop_bad
                    self.result.at[index, 'Prop_NA'] = (na_count / len(self.data)) * 100
            else:
                # No matching column: the whole field is treated as missing for this metric
                self.result.at[index, 'Prop_NA'] = 100.0
        # Finalise the result DataFrame structure
        self.result = self.result[['Field', 'Metric', 'Count', 'Prop_Bad', 'Prop_Good', 'Prop_NA']]
        self.result.sort_values(by=['Field', 'Metric'], inplace=True)
        self.result.drop_duplicates(subset=['Field', 'Metric'], inplace=True)
        self.result[['Prop_Bad', 'Prop_Good', 'Prop_NA']] = self.result[['Prop_Bad', 'Prop_Good', 'Prop_NA']].round(2)
def col_bad(row):
"""
Assigns a color code to a data quality metric indicating a "bad" quality status.
Parameters
----------
row : pandas.Series
A row from a DataFrame, expected to contain a 'Metric' column specifying the data quality metric.
Returns
-------
str
A hexadecimal color code associated with the "bad" quality status of the specified metric.
Notes
-----
The function maps different data quality metrics to specific color codes, enhancing visual distinction in graphical representations.
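
    Examples
    --------
    A sketch of the intended row-wise use on the `MetricCalculator` output:

    >>> result['Colour_Bad'] = result.apply(col_bad, axis=1)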
"""
    # Colour mappings for metrics indicating a "bad" status
    metric_color_map = {
        'completeness': '#a6cee3',
        'consistency': '#fb9a99',
        'timeliness': '#fdbf6f',
        'uniqueness': '#cab2d6',
        'validity': '#F49FA0',
        'accuracy': '#fb9a99'
    }
    # Fall back to the completeness "bad" colour if the metric is not recognised
    default_color = '#a6cee3'
return metric_color_map.get(row['Metric'], default_color)
def col_good(row):
"""
Assigns a color code to a data quality metric indicating a "good" quality status.
Parameters
----------
row : pandas.Series
A row from a DataFrame, expected to contain a 'Metric' column specifying the data quality metric.
Returns
-------
str
A hexadecimal color code associated with the "good" quality status of the specified metric.
Notes
-----
Similar to `col_bad`, this function provides a way to visually differentiate between various data quality metrics in graphical representations by mapping them to specific color codes for "good" quality status.
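
    Examples
    --------
    Applied row-wise alongside `col_bad`:

    >>> result['Colour_Good'] = result.apply(col_good, axis=1)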
"""
    # Colour mappings for metrics indicating a "good" status
    metric_color_map = {
        'completeness': '#1f78b4',
        'consistency': '#e31a1c',
        'timeliness': '#ff7f00',
        'uniqueness': '#6a3d9a',
        'validity': '#b15928',
        'accuracy': '#e31a1c'
    }
    # Fall back to the completeness "good" colour if the metric is not recognised
    default_color = '#1f78b4'
return metric_color_map.get(row['Metric'], default_color)
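

# End-to-end sketch of how the pieces above fit together (illustrative only:
# `raw_df` and the Colour_NA grey are assumptions, not defined in this module):
#
#     calc = MetricCalculator(raw_df)  # raw_df holds '<metric>_count_|_<field>' flag columns
#     calc.calculate_metrics()
#     result = calc.result
#     result['Colour_Bad'] = result.apply(col_bad, axis=1)
#     result['Colour_Good'] = result.apply(col_good, axis=1)
#     result['Colour_NA'] = '#d9d9d9'  # hypothetical neutral grey for missing data
#     donuts = DonutChartGenerator(result).plot_donut_charts()
#     bars = BarPlotGenerator(result, 'completeness').plot_bar()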
about_text = TagList(
tags.h3("Welcome to the Data Quality Profiling Tool"),
tags.p(
"""
        This is the front-end to a data quality
        profiling tool built in Python.
It provides a suite of data quality tests across six dimensions,
including """,
tags.strong("Completeness"), ", ",
tags.strong("Validity"), ", ",
tags.strong("Uniqueness"), ", ",
tags.strong("Timeliness"), ", ",
tags.strong("Consistency"), " and " ,
tags.strong("Accuracy"),".",
style="""
text-align: justify;
word-break:break-word;
hyphens: auto;
""",
),
)
key_features_text = TagList(
tags.h4("Key Features"),
tags.strong("1) Comprehensive DQ Checks:"),
"Dive deep into your data with checks across six critical dimensions of data quality.",
tags.strong("2) Custom Test Parameters: "),
"Tailor data quality checks to meet the unique needs of your dataset with customisable test parameters.",
tags.strong("3) Aggregated Results Overview: "),
"Gain a bird's-eye view of your data's quality through aggregated summaries and detailed error reporting.",
tags.strong("4) Dynamic Test Configuration: "),
"Easily configure and modify tests to adapt to your evolving data quality requirements.",
tags.strong("5) Interactive Results Analysis: "),
"Explore error details with interactive reports that make pinpointing issues straightforward.",
)
get_started_text = TagList(
tags.h4("Get Started"),
tags.strong("1) Upload Your Dataset:"),
"Begin by uploading a csv of the dataset you wish to analyse.",
tags.strong("2) Set Your Test Parameters: "),
"Customise your data quality checks by setting parameters tailored to your dataset's specific needs. You can do this by initialising a test parameter template based on your input dataset. ",
tags.strong("3) Run Data Quality Checks: "),
"Execute a comprehensive suite of tests across your dataset with just a click.",
tags.strong("4) Analyse Results: "),
"View aggregated summaries, explore detailed error reports, and make informed decisions to improve your data quality.",
)
error_input_df_text = TagList(
ui.markdown(
"""
No input dataset found. Please choose a **.csv** or **.xlsx** file.
"""
)
)
error_test_params_text = TagList(
ui.markdown(
"""
        No test parameters found.
        Please choose your test parameters either by initialising them
        via the **"Initialise Parameters"** button or by uploading a
        **.csv** or **.xlsx** test parameters file via the
        **"Upload Parameters"** button.
"""
)
)
error_metric_variable_choice_text = TagList(
ui.markdown(
"""
No errors were found for this combination of DQ metric and chosen variable.
"""
)
)