import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from itertools import product
from htmltools import TagList, tags
from shiny import ui
def overall_quality_fx(avg_prop_good):
"""
Determines the overall quality level based on the average proportion of 'good' data.
Parameters
----------
avg_prop_good : float
The average proportion (percentage) of 'good' quality data across all metrics.
Returns
-------
str
A string representing the overall quality level.
Possible values are "Outstanding", "Good", "Requires Improvement", or "Inadequate" with corresponding colours for background and text.
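
    Examples
    --------
    An illustrative call; 85.5 falls in the 80-90 band, so the "Good" triple is returned:

    >>> overall_quality_fx(85.5)
    ['Good', '#b2df8a', '#33a02c']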
"""
if avg_prop_good > 90:
return ["Outstanding", "#a6cee3", "#1f78b4"] # Light blue for Outstanding
elif avg_prop_good >= 80:
return ["Good", "#b2df8a", "#33a02c"] # Green for Good
elif avg_prop_good >= 60:
return ["Requires Improvement", "#fdbf6f", "#ff7f00"] # Amber for Requires Improvement
else:
return ["Inadequate", "#fb9a99", "#e31a1c"] # Red for Inadequate
class DonutChartGenerator:
"""
A class for generating donut charts to visualise data quality metrics.
Attributes
----------
data : pandas.DataFrame
The data containing quality metrics to be visualised.
Methods
-------
plot_donut_charts()
Generates a subplot of donut charts for each quality metric in the data.
Returns
-------
plotly.graph_objs._figure.Figure
A Plotly Figure object containing the subplot of donut charts.
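
    Examples
    --------
    A minimal sketch, assuming `result_df` is the output of `MetricCalculator`
    with `Colour_Good` and `Colour_Bad` columns already added:

    >>> fig = DonutChartGenerator(result_df).plot_donut_charts()
    >>> fig.show()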
"""
def __init__(self, data):
# """
# Initialises the DonutChartGenerator with data.
# Parameters
# ----------
# data : pandas.DataFrame
# The data containing quality metrics to be visualised.
# """
self.data = data
def plot_donut_charts(self):
# """
# Generates and returns a subplot of donut charts for each unique quality metric in the data.
# Returns
# -------
# plotly.graph_objs._figure.Figure
# A Plotly Figure object containing the subplot of donut charts.
# """
        # One donut per unique metric, laid out in a single row of 'domain' subplots
        metrics = self.data['Metric'].unique()
fig = make_subplots(
rows=1,
cols=len(metrics),
specs=[[{'type': 'domain'}] * len(metrics)],
subplot_titles=metrics
)
        for i, metric in enumerate(metrics, start=1):
            # Keep only rows for this metric where no data is missing
            metric_data = self.data[(self.data['Metric'] == metric) & (self.data['Prop_NA'] == 0)]
            avg_prop_good = round(metric_data['Prop_Good'].mean(), 2)
            avg_prop_bad = round(metric_data['Prop_Bad'].mean(), 2)
            # Take the bad/good colour pair from the first row for this metric
            marker_vals = dict(colors=metric_data[['Colour_Bad', 'Colour_Good']].iloc[0].tolist())
fig.add_trace(
go.Pie(
                    labels=['Bad', 'Good'],
                    values=[avg_prop_bad, avg_prop_good],
                    title=dict(text=f"{avg_prop_good}%", font=dict(size=22)),
hole=0.6,
textposition="none",
showlegend=False,
marker=marker_vals
),
row=1, col=i
)
        fig.update_traces(
            showlegend=False,
            hoverinfo="label+value"
        )
        # Make subplot titles bold, uppercase, and larger
        for ann in fig.layout.annotations:
            ann.font = dict(size=16, color="black", family="Arial")
            ann.text = f"<b>{ann.text.upper()}</b>"
return fig
class BarPlotGenerator:
"""
A class for generating bar plots to visualise data quality metrics for a chosen metric.
Attributes
----------
data : pandas.DataFrame
The data containing quality metrics to be visualised.
chosen_metric : str
The metric for which to generate the bar plot.
Methods
-------
plot_bar()
Generates a bar plot for the chosen metric.
Returns
-------
plotly.graph_objs._figure.Figure
A Plotly Figure object containing the bar plot.
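
    Examples
    --------
    A minimal sketch, assuming `result_df` additionally carries a `Colour_NA` column:

    >>> fig = BarPlotGenerator(result_df, 'completeness').plot_bar()
    >>> fig.show()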
"""
def __init__(self, data, chosen_metric):
# """
# Initialises the BarPlotGenerator with data and the chosen metric.
# Parameters
# ----------
# data : pandas.DataFrame
# The data containing quality metrics to be visualised.
# chosen_metric : str
# The metric for which to generate the bar plot.
# """
self.data = data
self.chosen_metric = chosen_metric
def plot_bar(self):
# """
# Generates and returns a bar plot for the chosen quality metric in the data.
# Returns
# -------
# plotly.graph_objs._figure.Figure
# A Plotly Figure object containing the bar plot.
# """
        # Keep rows for the chosen metric whose field is not entirely missing
        metric_data = self.data[(self.data['Metric'] == self.chosen_metric) & (self.data['Prop_NA'] != 100)].copy()
# Sort the filtered data
metric_data.sort_values(by=['Prop_Good', 'Field'], inplace=True)
        # Build the figure: one horizontal bar trace per category, stacked
        fig = go.Figure()
        for name, prop_col, colour_col in [
            ('Good', 'Prop_Good', 'Colour_Good'),
            ('Bad', 'Prop_Bad', 'Colour_Bad'),
            ('NA', 'Prop_NA', 'Colour_NA'),
        ]:
            fig.add_trace(
                go.Bar(
                    name=name,
                    x=metric_data[prop_col],
                    y=metric_data['Field'],
                    orientation='h',
                    marker_color=metric_data[colour_col]
                )
            )
        fig.update_layout(barmode='stack')
        fig.update_traces(
            showlegend=False,
            hoverinfo="name+x"
        )
return fig
class MetricCalculator:
"""
A class designed to calculate and compile data quality metrics from a provided dataset.
Attributes
----------
data : pandas.DataFrame
The input dataset containing various quality metrics and fields.
result : pandas.DataFrame
A DataFrame initialised to store the calculated metrics, including counts and proportions of good, bad, and N/A data.
Methods
-------
calculate_metrics()
Calculates aggregate metrics for each field and metric combination present in the input data, updating the `result` attribute.
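
    Examples
    --------
    An illustrative run on a single flag column following the
    '<metric>_count_|_<field>' naming convention (1 marks a failing record):

    >>> flags = pd.DataFrame({'completeness_count_|_age': [0, 1, 0, 0]})
    >>> calc = MetricCalculator(flags)
    >>> calc.calculate_metrics()
    >>> calc.result[['Field', 'Metric', 'Prop_Bad', 'Prop_Good']].values.tolist()
    [['age', 'completeness', 25.0, 75.0]]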
"""
def __init__(self, data):
# """
# Initialises the MetricCalculator with the given dataset.
# Parameters
# ----------
# data : pandas.DataFrame
# The input dataset from which data quality metrics will be calculated.
# """
self.data = data
self.result = pd.DataFrame()
def calculate_metrics(self):
# """
# Processes the input dataset to calculate aggregate metrics for each unique field and metric combination.
# This method populates the `result` DataFrame with each field-metric combination's total count, and the proportion of good, bad, and N/A data. It relies on the naming convention in the dataset columns to identify and separate fields and metrics.
# The resulting DataFrame is structured to provide a comprehensive overview of data quality across multiple dimensions.
# """
        # Extract unique fields and metrics from the dataset's column names
        fields = set()
        metrics = set()
        for column in self.data.columns:
            parts = column.split('_count_|_')
            if len(parts) != 2:
                # Ignore columns that do not follow the '<metric>_count_|_<field>' convention
                continue
            metric, field = parts
            fields.add(field)
            metrics.add(metric)
# Initialise the result DataFrame with combinations of fields and metrics
field_metric_pairs = list(product(fields, metrics))
field_list, metric_list = zip(*field_metric_pairs)
self.result['Field'] = field_list
self.result['Metric'] = metric_list
self.result['Count'] = 0
self.result['Prop_Bad'] = 0.0
self.result['Prop_Good'] = 0.0
self.result['Prop_NA'] = 0.0
# Fill in data quality metrics for each field-metric pair
        for index, row in self.result.iterrows():
            field = row['Field']
            metric = row['Metric']
            column_name = f'{metric}_count_|_{field}'
            if column_name in self.data.columns:
                # Number of rows where this check could not be evaluated
                na_count = len(self.data) - self.data[column_name].count()
                if na_count == len(self.data):
                    self.result.at[index, column_name] = np.nan
                else:
                    count = self.data[column_name].sum()
                    prop_bad = (count / len(self.data)) * 100
                    self.result.at[index, column_name] = count  # transient column, dropped below
                    self.result.at[index, 'Count'] += count
                    self.result.at[index, 'Prop_Bad'] += prop_bad
                    self.result.at[index, 'Prop_Good'] += 100 - prop_bad
                    self.result.at[index, 'Prop_NA'] = (na_count / len(self.data)) * 100
            else:
                # No matching column: the whole field is treated as missing for this metric
                self.result.at[index, 'Prop_NA'] = 100.0
        # Finalise the result DataFrame structure
        self.result = self.result[['Field', 'Metric', 'Count', 'Prop_Bad', 'Prop_Good', 'Prop_NA']]
        self.result.sort_values(by=['Field', 'Metric'], inplace=True)
        self.result.drop_duplicates(subset=['Field', 'Metric'], inplace=True)
        self.result[['Prop_Bad', 'Prop_Good', 'Prop_NA']] = self.result[['Prop_Bad', 'Prop_Good', 'Prop_NA']].round(2)
def col_bad(row):
"""
Assigns a color code to a data quality metric indicating a "bad" quality status.
Parameters
----------
row : pandas.Series
A row from a DataFrame, expected to contain a 'Metric' column specifying the data quality metric.
Returns
-------
str
A hexadecimal color code associated with the "bad" quality status of the specified metric.
Notes
-----
The function maps different data quality metrics to specific color codes, enhancing visual distinction in graphical representations.
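
    Examples
    --------
    A sketch of the intended row-wise use on the `MetricCalculator` output:

    >>> result['Colour_Bad'] = result.apply(col_bad, axis=1)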
"""
    # Colour mappings for metrics indicating a "bad" status
    metric_color_map = {
        'completeness': '#a6cee3',
        'consistency': '#fb9a99',
        'timeliness': '#fdbf6f',
        'uniqueness': '#cab2d6',
        'validity': '#F49FA0',
        'accuracy': '#fb9a99'
    }
    # Fall back to the completeness "bad" colour if the metric is not recognised
    default_color = '#a6cee3'
return metric_color_map.get(row['Metric'], default_color)
def col_good(row):
"""
Assigns a color code to a data quality metric indicating a "good" quality status.
Parameters
----------
row : pandas.Series
A row from a DataFrame, expected to contain a 'Metric' column specifying the data quality metric.
Returns
-------
str
A hexadecimal color code associated with the "good" quality status of the specified metric.
Notes
-----
Similar to `col_bad`, this function provides a way to visually differentiate between various data quality metrics in graphical representations by mapping them to specific color codes for "good" quality status.
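
    Examples
    --------
    Applied row-wise alongside `col_bad`:

    >>> result['Colour_Good'] = result.apply(col_good, axis=1)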
"""
    # Colour mappings for metrics indicating a "good" status
    metric_color_map = {
        'completeness': '#1f78b4',
        'consistency': '#e31a1c',
        'timeliness': '#ff7f00',
        'uniqueness': '#6a3d9a',
        'validity': '#b15928',
        'accuracy': '#e31a1c'
    }
    # Fall back to the completeness "good" colour if the metric is not recognised
    default_color = '#1f78b4'
return metric_color_map.get(row['Metric'], default_color)
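

# End-to-end sketch of how the pieces above fit together (illustrative only:
# `raw_df` and the Colour_NA grey are assumptions, not defined in this module):
#
#     calc = MetricCalculator(raw_df)  # raw_df holds '<metric>_count_|_<field>' flag columns
#     calc.calculate_metrics()
#     result = calc.result
#     result['Colour_Bad'] = result.apply(col_bad, axis=1)
#     result['Colour_Good'] = result.apply(col_good, axis=1)
#     result['Colour_NA'] = '#d9d9d9'  # hypothetical neutral grey for missing data
#     donuts = DonutChartGenerator(result).plot_donut_charts()
#     bars = BarPlotGenerator(result, 'completeness').plot_bar()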
about_text = TagList(
tags.h3("Welcome to the Data Quality Profiling Tool"),
tags.p(
"""
        This is the front-end to a data quality
        profiling tool built in Python.
It provides a suite of data quality tests across six dimensions,
including """,
tags.strong("Completeness"), ", ",
tags.strong("Validity"), ", ",
tags.strong("Uniqueness"), ", ",
tags.strong("Timeliness"), ", ",
tags.strong("Consistency"), " and " ,
tags.strong("Accuracy"),".",
style="""
text-align: justify;
word-break:break-word;
hyphens: auto;
""",
),
)
key_features_text = TagList(
tags.h4("Key Features"),
tags.strong("1) Comprehensive DQ Checks:"),
"Dive deep into your data with checks across six critical dimensions of data quality.",
tags.strong("2) Custom Test Parameters: "),
"Tailor data quality checks to meet the unique needs of your dataset with customisable test parameters.",
tags.strong("3) Aggregated Results Overview: "),
"Gain a bird's-eye view of your data's quality through aggregated summaries and detailed error reporting.",
tags.strong("4) Dynamic Test Configuration: "),
"Easily configure and modify tests to adapt to your evolving data quality requirements.",
tags.strong("5) Interactive Results Analysis: "),
"Explore error details with interactive reports that make pinpointing issues straightforward.",
)
get_started_text = TagList(
tags.h4("Get Started"),
tags.strong("1) Upload Your Dataset:"),
"Begin by uploading a csv of the dataset you wish to analyse.",
tags.strong("2) Set Your Test Parameters: "),
"Customise your data quality checks by setting parameters tailored to your dataset's specific needs. You can do this by initialising a test parameter template based on your input dataset. ",
tags.strong("3) Run Data Quality Checks: "),
"Execute a comprehensive suite of tests across your dataset with just a click.",
tags.strong("4) Analyse Results: "),
"View aggregated summaries, explore detailed error reports, and make informed decisions to improve your data quality.",
)
error_input_df_text = TagList(
ui.markdown(
"""
No input dataset found. Please choose a **.csv** or **.xlsx** file.
"""
)
)
error_test_params_text = TagList(
ui.markdown(
"""
        No test parameters found.
        Please choose your test parameters either by initialising them
        via the **"Initialise Parameters"** button or by uploading a
        **.csv** or **.xlsx** test parameters file via the
        **"Upload Parameters"** button.
"""
)
)
error_metric_variable_choice_text = TagList(
ui.markdown(
"""
No errors were found for this combination of DQ metric and chosen variable.
"""
)
)