# Source code for DQMaRC.Completeness
import pandas as pd
import numpy as np
from .Dimension import Dimension
# [docs]  (Sphinx viewcode link artefact, not part of the program)
class Completeness(Dimension):
    """
    Assess the completeness aspect of data quality within a dataset.

    This subclass of Dimension focuses on identifying and quantifying missing
    or incomplete data points within a given dataset. It uses predefined tests
    to determine the presence of null values, empty strings, and encoded
    missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to be evaluated, imported via pandas' read_csv() function.
    test_params : pandas.DataFrame
        The test parameters that are either initialised by the Data Quality
        (DQ) tool or uploaded via pandas' read_csv() function.

    Attributes
    ----------
    tests : dict
        A dictionary mapping test names to their relevant information and
        methods. It includes tests for null values, empty strings, and encoded
        missing values.

    Methods
    -------
    test_null(test)
        Flags NULL values in specified columns of the dataset.
    test_empty(test)
        Identifies empty strings in specified columns of the dataset.
    test_na_strings(test)
        Detects strings that represent missing values, as defined in the test
        parameters, in specified columns of the dataset.
    """

    def __init__(self, df, test_params):
        """
        Initialise the Completeness object with a dataset and test parameters.

        Parameters
        ----------
        df : pandas.DataFrame
            The dataset to be evaluated.
        test_params : pandas.DataFrame
            The parameters defining how tests should be conducted.
        """
        # Initialise the parent class with the input dataset and test
        # parameters. NOTE(review): the test methods below read ``self.df``
        # and ``self.test_params``; presumably the parent stores them -- confirm.
        super().__init__(df, test_params)

        # Dictionary of tests specific to completeness. 'arg1' names the
        # test_params column that holds the pipe-separated missing-value codes.
        self.tests = {
            'Completeness_NULL': {'method': self.test_null, 'default': True},
            'Completeness_Empty': {'method': self.test_empty, 'default': True},
            'Completeness_Encoded': {
                'method': self.test_na_strings,
                'default': False,
                'arg1': 'Completeness_Encoded_Mapping',
            },
        }

    def test_null(self, test):
        """
        Execute a test for NULL values in the dataset.

        A value counts as NULL when pandas considers it missing, or when its
        whitespace-stripped string form is one of "NULL", "Null" or "None".

        Parameters
        ----------
        test : dict
            The test configuration.
        """
        null_strings = {"NULL", "Null", "None"}

        def func(col, extra_args=None):
            # Vectorised equivalent of checking each cell with pd.isna() and a
            # string comparison; mirrors the approach used by test_empty and
            # avoids an ambiguous-truth-value error on list-like cell values.
            column = self.df[col]
            as_text = column.astype(str).str.strip()
            return column.isna() | as_text.isin(null_strings)

        self.run_metric(test, func)

    def test_empty(self, test):
        """
        Execute a test for empty strings in the dataset.

        Parameters
        ----------
        test : dict
            The test configuration.
        """
        def func(col, extra_args=None):
            """
            Flag values that are empty strings once surrounding whitespace is removed.

            Parameters
            ----------
            col : str
                The name of the column to check for empty strings after
                stripping whitespace.
            extra_args : optional
                Accepted but not used by this check.

            Returns
            -------
            pandas.Series
                A Series where True indicates the cell was empty (after
                whitespace removal), and False indicates it contained other
                characters.
            """
            # Strip leading and trailing whitespace and then check if the
            # strings are empty.
            return self.df[col].astype(str).str.strip().isin([''])

        self.run_metric(test, func)

    def test_na_strings(self, test):
        """
        Execute a test for encoded missing values in the dataset.

        Encoded missing values are special strings that denote missing data.
        They are read, per field, from the test_params column named by this
        test's 'arg1' entry, as a single pipe-separated string.

        Parameters
        ----------
        test : dict
            The test configuration, including the encoding used to represent
            missing data.
        """
        def func(col, extra_args=None):
            # Look up the mapping cell for this field; .item() still raises
            # ValueError if the field is absent or listed more than once.
            field_rows = self.test_params[self.test_params['Field'] == col]
            raw_codes = field_rows[self.tests[test]['arg1']].item()

            # No mapping configured for this field: nothing can be an encoded
            # missing value (previously this crashed on float NaN).
            if pd.isna(raw_codes):
                return pd.Series(False, index=self.df.index, dtype=bool)

            # A lone numeric code (e.g. 999) may have been parsed as a number
            # rather than text; render integer-valued floats without the
            # trailing ".0" so the code reads as it was presumably written.
            if isinstance(raw_codes, float) and raw_codes.is_integer():
                raw_codes = int(raw_codes)

            # Split the encoded missing values on a single pipe and check
            # which stringified column values are in that list.
            codes = str(raw_codes).split('|')
            return self.df[col].apply(str).isin(codes)

        self.run_metric(test, func)