# Source code for DQMaRC.Completeness

import pandas as pd
import numpy as np
from .Dimension import Dimension


# [docs]  (documentation-viewer link artifact; not part of the module)
class Completeness(Dimension):
    """
    A subclass of Dimension to assess the completeness aspect of data quality within a dataset.

    This class focuses on identifying and quantifying missing or incomplete data points within a given dataset.
    It uses predefined tests to determine the presence of null values, empty strings, and encoded missing values.

    Parameters
    ----------
        df : pandas.DataFrame
            The dataset to be evaluated, imported via pandas' read_csv() function.
        test_params : pandas.DataFrame
            The test parameters that are either initialised by the Data Quality (DQ) tool or uploaded via pandas' read_csv() function.

    Attributes
    ----------
        tests : dict
            A dictionary mapping test names to their relevant information and methods. It includes tests for null values, empty strings, and encoded missing values.

    Methods
    -------
        test_null(test)
            Flags NULL values (missing values and the literal strings "NULL", "Null", "None") in specified columns of the dataset.

            Parameters
            ----------
            test : str
                The name of the test, i.e. a key of ``tests``.

        test_empty(test)
            Identifies empty strings in specified columns of the dataset.

            Parameters
            ----------
            test : str
                The name of the test, i.e. a key of ``tests``.

        test_na_strings(test)
            Detects strings that represent missing values, as defined in the test parameters, in specified columns of the dataset.

            Parameters
            ----------
            test : str
                The name of the test, i.e. a key of ``tests``; its 'arg1' entry names the test_params column holding the encoding used to represent missing data.

    """

    # Literal strings (compared after stripping surrounding whitespace, case-sensitive)
    # that test_null treats as NULL in addition to genuinely missing values.
    _NULL_STRINGS = ("NULL", "Null", "None")

    def __init__(self, df, test_params):
        """
        Initialise the Completeness object with a dataset and test parameters.

        Parameters
        ----------
        df : pandas.DataFrame
            The dataset to be evaluated.
        test_params : pandas.DataFrame
            The parameters defining how tests should be conducted.
        """
        # Initialise the parent class with the input dataset and test parameters
        super().__init__(df, test_params)
        # Dictionary of tests specific to completeness
        self.tests = {
            'Completeness_NULL': {'method': self.test_null, 'default': True},
            'Completeness_Empty': {'method': self.test_empty, 'default': True},
            'Completeness_Encoded': {'method': self.test_na_strings, 'default': False, 'arg1': 'Completeness_Encoded_Mapping'}
            }

    def test_null(self, test):
        """
        Execute the test for NULL values in the dataset.

        A value counts as NULL when pandas regards it as missing, or when its
        string form, stripped of surrounding whitespace, is one of
        ``_NULL_STRINGS``.

        Parameters
        ----------
        test : str
            The test name, forwarded unchanged to ``run_metric`` (provided
            outside this class; presumably a key of ``self.tests``).
        """

        def func(col, extra_args=None):
            """Return a boolean Series that is True where ``self.df[col]`` is NULL."""
            column = self.df[col]
            # Vectorised, like test_empty: avoids a per-row Python lambda and does not
            # raise on non-scalar cells, where ``pd.isna(x) or ...`` would be ambiguous.
            as_text = column.astype(str).str.strip()
            return column.isna() | as_text.isin(self._NULL_STRINGS)

        self.run_metric(test, func)

    def test_empty(self, test):
        """
        Execute the test for empty strings in the dataset.

        Parameters
        ----------
        test : str
            The test name, forwarded unchanged to ``run_metric`` (provided
            outside this class; presumably a key of ``self.tests``).
        """

        def func(col, extra_args=None):
            """
            Returns a pandas Series indicating whether each value in the specified column,
            after stripping any leading and trailing whitespace, is an empty string.

            Parameters:
            col : str
                The name of the column to check for empty strings after stripping whitespace.

            Returns:
            pandas.Series
                A Series where True indicates the cell was empty (after whitespace removal),
                and False indicates it contained other characters.
            """
            # Strip leading and trailing whitespace and then check if the strings are empty
            return self.df[col].astype(str).str.strip().isin([''])

        self.run_metric(test, func)

    def _encoded_missing_codes(self, test, col):
        """
        Return the list of strings that encode "missing" for column ``col``.

        The codes are read from the ``test_params`` row whose 'Field' equals
        ``col``, in the column named by ``self.tests[test]['arg1']``, and are
        split on a single pipe character ('|'). Codes are not stripped, so a
        code may deliberately contain (or consist of) whitespace.

        Parameters
        ----------
        test : str
            Key of ``self.tests`` whose 'arg1' entry names the mapping column.
        col : str
            Name of the dataset column (value of 'Field' in ``test_params``).

        Returns
        -------
        list of str
            The configured codes; empty when the mapping cell is missing.

        Raises
        ------
        ValueError
            If ``test_params`` does not contain exactly one row for ``col``.
        """
        mapping_column = self.tests[test]['arg1']
        is_field = self.test_params['Field'] == col
        matches = self.test_params.loc[is_field, mapping_column]
        if len(matches) != 1:
            raise ValueError(
                f"Expected exactly one test_params row with Field == {col!r}, "
                f"found {len(matches)}."
            )
        raw = matches.iloc[0]
        # A blank mapping cell is read back as NaN: no codes are configured for this field.
        if pd.isna(raw):
            return []
        # str() guards against a mapping column that was parsed as numeric
        # (e.g. a lone code such as 999); such a code is compared by its string form.
        return str(raw).split('|')

    def test_na_strings(self, test):
        """
        Execute the test for encoded missing values (e.g., special strings that denote missing data) in the dataset.

        Parameters
        ----------
        test : str
            Key of ``self.tests``; its 'arg1' entry names the ``test_params``
            column that holds the pipe-separated encodings of missing data.
        """

        def func(col, extra_args=None):
            """Return a boolean Series that is True where ``str(value)`` is a configured code."""
            # Compare on the per-element str() form, exactly as written in the data.
            values_as_text = self.df[col].apply(str)
            return values_as_text.isin(self._encoded_missing_codes(test, col))

        self.run_metric(test, func)