Source code for app

# ================
# === Packages ===
# ================
import sys
import pandas as pd

# === shiny packages === 
from shiny import App, reactive, render, ui
from shiny.types import FileInfo
from shiny import reactive
import asyncio

from shinywidgets import output_widget, render_widget, register_widget

# Conditional import of shinywidgets
if 'shiny' in sys.modules:
    from ipydatagrid import DataGrid, TextRenderer
    from shinywidgets import output_widget, render_widget, register_widget

from datetime import datetime

# === custom modules ===
from DQMaRC import DataQuality
from DQMaRC.UtilitiesDQMaRC import (
    MetricCalculator, 
    BarPlotGenerator, 
    DonutChartGenerator, 
    overall_quality_fx, col_good, col_bad,
    about_text, key_features_text, get_started_text, 
    error_input_df_text
)


# ==================================================================
# =============================== UI ===============================
# ==================================================================

app_ui = ui.page_fillable(
        ui.tags.head(
            # Custom CSS rules
            ui.tags.style("""
                .btn.action-button {
                    background-color: #a6cee3 !important; /* Light blue background */
                    color: #252525 !important; /* White text color */
                }
                .btn-file {
                    background-color: #a6cee3 !important; /* Light blue background */
                    color: #252525 !important; /* White text color */
                }
                /* Additional styles for hover effect */
                .btn-file:hover {
                    background-color: #95c8d8 !important; /* Slightly darker light blue on hover */
                }
                .shiny-download-link, .btn-default.shiny-download-link {
                    background-color: #a6cee3 !important; /* Light blue background */
                    color: #252525 !important; /* White text color */
                }
                /* Additional styles for hover effect */
                .shiny-download-link:hover, .btn-default.shiny-download-link:hover {
                    background-color: #95c8d8 !important; /* Slightly darker light blue on hover */
                }

            """)
        ),

    # Title
    ui.p(
        ui.strong(ui.h1(
            "DQMaRC: Data Quality Profiling",
            )
        ), 
    ),

    ui.navset_tab(

        # ==== tab 1. Welcome Page & Instructions ====
        ui.nav_panel(
            "1. Welcome",
            ui.card(
                about_text,
            ),
            # DQ Dims Explanatory Text
            ui.layout_column_wrap(
                ui.card(
                    key_features_text,
                ),
                ui.card(
                    get_started_text,
                ),
                width= 1 / 2
                ),
            ),

        # ==== tab 2. Data Upload ====
        ui.nav_panel(
            "2. Data Upload",
            ui.layout_column_wrap(
                ui.card(
                    ui.help_text("Upload your data here"),

                    # upload data 
                    ui.input_file(
                        id = "upload_data_file", 
                        label = "", 
                        button_label = "Browse",
                        accept=[".csv", ".xlsx"]
                        ),
                    ),
            ui.card(
                ui.help_text("Input Data Shape"),
                ui.output_ui("input_data_shape")),
            ),
            # view uploaded dataset
            ui.card(
                ui.output_data_frame("view_input_data"),
            )
        ),

        # ==== tab 3. Test Params =====
        ui.nav_panel(
            "3. Test Parameters",
            # test params action buttons
            ui.card(
                ui.layout_column_wrap(

                    #initialise test params
                    ui.input_action_button(
                        id = "initialise_params",
                        label = "Initialise Parameters",
                        class_="custom-class"
                    ),
                    
                    # upload test params
                    ui.input_file(
                        id = "upload_test_params_file", 
                        label = "", 
                        button_label = "Upload Parameters",
                        accept=[".csv", ".xls"]
                    ),

                    # download test params
                    ui.download_button(
                        id = "download_params",
                        label = "Download Parameters",
                        class_="custom-class"
                    ),
                )
            ),
              
            # test params table
            ui.card(
                output_widget("render_test_params"),
                # max_height="600px",
                class_="my-custom-card-class" # optional for future custom CSS classes
                ),
            ),
                
        # ==== tab 4. DQ Dashboard ====
        ui.nav_panel(
            "4. DQ Dashboard",

            ui.card(
                ui.input_action_button(id = "run_parameters", label = "Set and Run Parameters")
            ),
            
            ui.card(
                ui.panel_conditional(
                "input.choose_metric",

                # overall quality label
                ui.card(
                    ui.h2(ui.output_ui("overall_quality_label"))
                ),

                # donut
                ui.card(
                    output_widget("donutchart")
                ),

                # barplot + error records
                ui.layout_column_wrap(
                    # barplot
                    ui.card(
                        output_widget("barplot_chart"), full_screen=True),
                    
                    # DQ metric explanatory text
                    ui.card(ui.layout_column_wrap(
                        # choose a metric
                        ui.input_select(
                        id = "choose_metric",
                        label = "Choose a Dimension",
                        selected = None,
                        choices = []
                        ),
                        # choose a variable
                        ui.input_select(
                        id = "choose_variable",
                        label = "Choose a Variable",
                        selected = None,
                        choices = []
                        ), 
                        width = 1 / 2
                    ),
                        # filtered erroneous records
                        output_widget("error_records_datagrid"),
                        full_screen=True
                    ),
                    width = 1 / 2
                ),

                # Download results
                ui.card(
                    ui.layout_column_wrap(

                        # download full raw results
                        ui.download_button(
                            id = "download_full_results",
                            label = "Download Full Results",
                        ),

                        # download summary results
                        ui.download_button(
                            id = "download_summary_results",
                            label = "Download Summary Results",
                        )
                    ),
                ) # download results layout column wrap
            )
            ) # panel conditional
        ) # nav panel
    ) # navset tab
) # ui

# ==================================================================
# ============================= SERVER =============================
# ==================================================================


[docs]
def server(input, output, session):
    """
    Defines the server logic for handling user inputs, processing data,
    and generating outputs for DQMaRC.

    Functions
    ---------
    handle_source_data_upload()
        Handles the upload of the source data file.
    show_error_input_df_text(df_input, id_num)
        Displays an error message if no dataset is found.
    input_data()
        Renders the uploaded DataFrame for viewing.
    input_data_shape_fx()
        Displays the shape of the uploaded DataFrame.
    initialise_test_params()
        Initialises test parameters based on the uploaded data.
    handle_test_params_upload()
        Handles the upload of test parameters file.
    initialise_DQ_tool()
        Initialises the DataQuality tool with the uploaded data.
    render_test_params()
        Renders the test parameters UI for editing.
    run_metrics()
        Runs the data quality metrics based on the test parameters.
    download_full_results()
        Downloads the full results of the DQ checks.
    download_summary_results()
        Downloads the summary results of the DQ checks.
    overall_quality_label_fx()
        Displays the overall quality label based on the summary results.
    donutchart_fx()
        Generates donut charts for the DQ metrics.
    barplot_chart_fx()
        Generates bar charts for the selected DQ metric.
    update_metric_choices()
        Updates the metric choices based on the summary results.
    update_variable_choices()
        Updates the variable choices based on the selected metric.
    display_error_records_fx()
        Displays the error records based on the selected metric and variable.
    """
    # global variables
    input_df_global = reactive.Value() # needed to set and run params
    test_params_global = reactive.Value() # needed to set and run params
    test_params_global_updated = reactive.Value() # updated test params global
    summary_results_global = reactive.Value()
    source_df_raw_global = reactive.Value()

    # ==================================
    # ========== Input Data ============
    # ==================================
    # Handle the upload of source data file
    @reactive.Effect
    @reactive.event(input.upload_data_file)
    def handle_source_data_upload():
        source_data_file = input.upload_data_file()
        if source_data_file:
            df_source_data = pd.read_csv(source_data_file[0]["datapath"], sep=",", na_filter=False)
            input_df_global.set(df_source_data)
            ui.notification_show("Input data successfully uploaded.", type='message')
        else:
            ui.notification_show("Please upload an input dataset CSV file.", type='error')

    #===================================
    def show_error_input_df_text(df_input, id_num):
        @output(id="{}".format(id_num))
        @render.text
        def error_input_df_fx():
            if df_input.empty: 
                # Display an error message if no dataset is found
                return error_input_df_text
            else:
                pass

    #===================================
    @output(id="view_input_data")
    @render.data_frame
    def input_data():
        """
        Renders the DataFrame created from the uploaded file for viewing, or an error message if no data is available.
        """
        df = input_df_global()
        show_error_input_df_text(df, "error_input_df")
        return df

    #===================================
    @output(id="input_data_shape")
    @render.text
    def input_data_shape_fx():
        """
        Displays the shape of the uploaded DataFrame.
        """
        df = input_df_global()
        if df.empty:
            return error_input_df_text
        else:
            out_shape_text = f"Rows: {df.shape[0]}, Columns: {df.shape[1]}"
            return out_shape_text

    # ==================================
    # ========= TEST PARAMS ============
    # ==================================
    # you can have one of two test params csv/df's: default or pre-defined (upload)
    # test_params (TP) = None else default else uploaded_TP
    # Initialise or Upload Test Params
    @reactive.Effect
    @reactive.event(input.initialise_params)
    def initialise_test_params():
        df_input = input_df_global()
        if df_input is not None and not df_input.empty:
            dq_tool = DataQuality(df_input)
            test_params_df = dq_tool.get_test_params()
            test_params_global.set(test_params_df)
            test_params_global_updated.set(test_params_df)
            ui.notification_show("Test parameters initialised.", type='message')
        else:
            ui.notification_show("Please upload an input dataset before initialising parameters.", type='error')

    # Handle the upload of test parameters file
    @reactive.Effect
    @reactive.event(input.upload_test_params_file)
    def handle_test_params_upload():
        file = input.upload_test_params_file()
        if file:
            df = pd.read_csv(file[0]["datapath"])
            test_params_global.set(df)
            test_params_global_updated.set(df)
            ui.notification_show("Test parameters file uploaded successfully.", type='message')
        else:
            ui.notification_show("Please select a test parameters file to upload.", type='error')

    # initialise DQ tool
    @reactive.Calc
    @reactive.event(input.initialise_params, input.run_parameters)
    def initialise_DQ_tool():
        df = input_df_global()
        if df is None or df.empty:
            # Show notification if no data is uploaded
            ui.notification_show(
                ui.tags.div("Please upload an input dataset"),
                type='error', 
                duration=5  # Adjust duration as needed
            )
        else:
            try:
                dq = DataQuality(df)
                return dq
            except Exception as e:
                print("!! error in initialise DQ tool")
    
    # render editable test parameters 
    @render_widget
    @reactive.event(input.initialise_params, input.upload_test_params_file)
    def render_test_params():
        """
        Renders the test parameters UI allowing for editing and interaction.
        """
        try:
            test_params = test_params_global()
            test_params = DataGrid(
                test_params, 
                editable=True, 
                header_visibility='column',
                base_column_size = 180,
                base_column_header_size = 30,
                header_renderer=TextRenderer(background_color='lightblue'),
                grid_style={"header_background_color": "lightblue"}  
                )
            register_widget("params_table", test_params)

            # Create a reactive value for tracking cell changes
            cell_changes = reactive.Value()

            def on_cell_changed(cell):
                cell_changes.set(str(cell))
                test_params_global_updated.set(test_params.get_visible_data()) 

            # register cell change callback
            test_params.on_cell_change(on_cell_changed)

            # download test parameters
            @render.download(filename=f"{datetime.now().strftime('%Y-%m-%d %H.%M.%S')}_test_params.csv", encoding='UTF-8-sig')
            def download_params():
                yield test_params.get_visible_data().to_csv(index=False)

            return test_params
        
        except Exception as e:
            ui.notification_show(
                ui.tags.div("Error in render_test_params."),
                type='error', 
                duration=5  # Adjust duration as needed
            )

    # ======================================
    # ====== set & run current params ======
    # this is for checking test params output in raw table output position
    # Set and Run Test Parameters
    @reactive.Effect
    @reactive.event(input.run_parameters)
    async def run_metrics():
        df_input = input_df_global()
        # Call the function once and use the result multiple times
        updated_params = test_params_global_updated()

        # Check if updated_params is None or empty, and use a default if so
        df_params = updated_params if updated_params is not None and not updated_params.empty else test_params_global()

        if df_input is None or df_input.empty:
            ui.notification_show("Please upload an input dataset before running tests.", type='error')

        if df_params is None or df_params.empty:
            ui.notification_show("Please initialise or upload test parameters before running them.", type='error')

        try:
            # Proceed with running metrics...
            with ui.Progress(min=1, max=8) as progress:  # Adjust the max value based on the number of steps
                progress.set(message="(1/8) Initialising test parameters.")
                await asyncio.sleep(0.5)  # Wait for 1 second

                # Simulate processing steps with progress updates
                progress.inc(message="(2/8) Test parameters loaded.")
                await asyncio.sleep(0.5)  # Wait for 1 second

                dq = initialise_DQ_tool()
                progress.inc(message="(3/8) Data Quality tool initialised.")
                await asyncio.sleep(0.5)  # Wait for 1 second

                dq.set_test_params(df_params)
                progress.inc(message="(4/8) Test parameters set.")
                await asyncio.sleep(0.5)  # Wait for 1 second
                
                # Simulate the metrics computation step
                dq.run_all_metrics()
                progress.inc(message="(5/8) Running metrics.")
                await asyncio.sleep(0.5)  # Wait for 1 second
                
                # raw results joined to source/input df
                raw = dq.raw_results()
                source_df_raw = input_df_global().join(raw)
                source_df_raw_global.set(source_df_raw)

                # prepare results for DQ dashboard, first by subsetting final sums/counts of DQ dimensions
                raw_subset = raw.filter(regex='completeness|validity|consistency|uniqueness_count|accuracy|timeliness')
                calculator = MetricCalculator(raw_subset)
                progress.inc(message="(6/8) Calculating results.")
                await asyncio.sleep(0.5)  # Wait for 1 second
                
                # Simulate the calculation step, calculate aggregates
                calculator.calculate_metrics()
                progress.inc(message="(7/8) Finalising calculation.")
                await asyncio.sleep(1)  # Wait for 1 second
                
                summary_results = calculator.result
                summary_results['Colour_Good'] = summary_results.apply(col_good, axis=1)
                summary_results['Colour_Bad'] = summary_results.apply(col_bad, axis=1)
                summary_results['Colour_NA'] = '#B2C3C6'
                summary_results_global.set(summary_results)

                progress.inc(message="(8/8) Results ready.")
                await asyncio.sleep(2)  # Wait for 1 second
                
                # update set parameters input action button to "Rerun..."
                ui.update_action_button("run_parameters", label="Re-run Test Parameters")

                # Finish progress
                progress.close()
            # Simulate a long-running operation
            await asyncio.sleep(2)
            ui.notification_show("Metrics computation completed successfully.", type='message')
            # Update UI elements or global state as necessary
        except Exception as e:
            ui.notification_show(f"An error occurred while running metrics: {e}", type='error')

    # ======================================
    # ======= DOWNLOAD Full Results ========
    @render.download(
            filename=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}_full_results.csv"
    )
    def download_full_results():
        yield source_df_raw_global().to_csv(index=False)

    # ======================================
    # ====== DOWNLOAD Summary Results ======
    @render.download(
            filename=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}_summary_results.csv"
    )
    def download_summary_results():
        yield summary_results_global()[['Field', 'Metric', 'Count','Prop_Bad','Prop_Good','Prop_NA']].to_csv(index=False)

    # ===================================
        
    # overall DQ label
    @output(id="overall_quality_label")
    @render.ui
    def overall_quality_label_fx():
        data = summary_results_global()
        if not data.empty:
            data1 = data[data['Prop_NA'] == 0]
            avg_prop_good = data1['Prop_Good'].mean()
            overall_quality_level = overall_quality_fx(avg_prop_good)[0]
            background_color = overall_quality_fx(avg_prop_good)[1]
            text_colour = overall_quality_fx(avg_prop_good)[2]
            overall_quality_text = f"Overall Quality: {overall_quality_level}"

            return ui.tags.div(
                overall_quality_text,
                style=f"""
                    background-color: {background_color}; 
                    padding: 10px; 
                    border-radius: 5px; 
                    color: {text_colour}; 
                    border: 2px solid {text_colour};
                    text-align: center;"""
            )
        else:
            # Fallback content if data is empty
            return ui.tags.div("No data available", style="text-align: center;")

    #===================================
    # Donut charts
    @output(id="donutchart")
    @render_widget
    def donutchart_fx():
        data = summary_results_global()
        if data is not None:
            return DonutChartGenerator(data).plot_donut_charts()
  
    #===================================
    # bar chart
    @output(id="barplot_chart")
    @render_widget
    def barplot_chart_fx():
        data = summary_results_global()
        if data is not None:
            metric = input.choose_metric()
            return BarPlotGenerator(data, metric).plot_bar()

# =============================
    @reactive.Effect
    def update_metric_choices():
        summary_results = summary_results_global()
        if summary_results is not None and not summary_results.empty:
            metrics = summary_results['Metric'].unique().tolist()
            ui.update_select("choose_metric", choices=metrics)

    @reactive.Effect
    def update_variable_choices():
        summary_results = summary_results_global()
        selected_metric = input.choose_metric()
        if summary_results is not None and not summary_results.empty and selected_metric:
            # Filter summary_results to exclude rows where Prop_NA == 100 for the selected metric
            valid_fields = summary_results[(summary_results['Metric'] == selected_metric) & (summary_results['Prop_NA'] != 100)]['Field'].unique().tolist()
            ui.update_select("choose_variable", choices=valid_fields)

    def get_unique_variables_message(test_params):
        unique_variables = test_params[test_params["Uniqueness_Rows"] == True]["Field"].to_list()
        if unique_variables:
            return pd.DataFrame({"Variables Included": unique_variables})
        return pd.DataFrame({"Message": ["No unique variables found"]})
    
    def get_consistency_comparison(full_results, test_params, selected_variable, error_column):
        # Check if the selected variable exists in the test_params table
        if selected_variable not in test_params["Field"].values:
            return pd.DataFrame({"Message": [f"Variable '{selected_variable}' not found in test parameters"]})
        
        test_param_row = test_params[test_params["Field"] == selected_variable].iloc[0]
        
        # Determine which comparison approach to use
        if test_param_row["Consistency_Compare"] == True:
            compare_variable = test_param_row["Consistency_Compare_Field"]
        elif test_param_row["Consistency_Date_Relations"] == True:
            compare_variable = test_param_row["Consistency_Date_Relations_Field"]
        elif test_param_row["Timeliness_Date_Diff"] == True:
            compare_variable = test_param_row["Timeliness_Date_Diff_Field"]
        else:
            return pd.DataFrame({"Message": ["Neither Consistency_Compare nor Consistency_Date_Relations is set to True for selected variable"]})
        
        # Check for NA values in compare_variable
        if pd.isna(compare_variable):
            return pd.DataFrame({"Message": ["Comparison variable not found or is NA"]})
        
        # Ensure selected_variable and, if applicable, compare_variable exist in the full_results
        # if selected_variable not in full_results.columns or (compare_variable not in full_results.columns and compare_variable != "Consistency_Date_Relations"):
        if selected_variable not in full_results.columns or compare_variable not in full_results.columns:
            return pd.DataFrame({"Message": ["Selected or comparison variable not found in data"]})
        
        # Ensure the error column exists in full_results
        if error_column not in full_results.columns:
            return pd.DataFrame({"Message": ["Error column not found in data"]})
              
        # Proceed with comparison for other cases
        error_records = full_results[full_results[error_column] > 0]
        relevant_data = error_records[[selected_variable, compare_variable, error_column]]
        if error_records.empty:
            return pd.DataFrame({"Message": ["No errors found"]})

        if relevant_data.empty:
            return pd.DataFrame({"Message": ["No relevant data found for consistency comparison"]})
        
        aggregated_data = relevant_data.groupby([selected_variable, compare_variable]).size().reset_index(name='Count')
        if aggregated_data.empty:
            return pd.DataFrame({"Message": ["No consistency errors found between variables"]})
        
        aggregated_data_sorted = aggregated_data.sort_values(by="Count", ascending=False)
        return aggregated_data_sorted

    def get_error_records(full_results, selected_variable, error_column):
        error_records = full_results[full_results[error_column] > 0]
        if error_records.empty:
            return pd.DataFrame({"Message": ["No errors found"]})
        aggregated_errors = error_records.groupby(selected_variable)[error_column].agg('count').reset_index()
        aggregated_errors_renamed = aggregated_errors.rename(columns={error_column: "Count"})
        return aggregated_errors_renamed.sort_values(by="Count", ascending=False)

    def display_filtered_errors():
        full_results = source_df_raw_global()
        test_params = test_params_global()
        selected_metric = input.choose_metric()
        selected_variable = input.choose_variable()
        error_column = f"{selected_metric.lower()}_count_|_{selected_variable}"

        if selected_metric == "uniqueness":
            return get_unique_variables_message(test_params)

        elif selected_metric == "consistency" or selected_metric == "timeliness":
            return get_consistency_comparison(full_results, test_params, selected_variable, error_column)

        if full_results is not None and not full_results.empty and selected_metric and selected_variable:
            if error_column in full_results.columns:
                return get_error_records(full_results, selected_variable, error_column)
            return pd.DataFrame({"Message": ["Error column not found"]})

        return pd.DataFrame({"Message": ["Please select both metric and variable"]})

    @output(id="error_records_datagrid")
    @render_widget
    def display_error_records_fx():
        data = display_filtered_errors()
        data_datagrid = DataGrid(
            data, 
            header_visibility='all',
            base_column_header_size = 50,
            auto_fit_columns = True,
            selection_mode = 'cell',
            header_renderer=TextRenderer(background_color='lightblue'),
            grid_style={"header_background_color": "lightblue"}
            )
        return data_datagrid    


app = App(app_ui, server)