import re
import datetime
import textwrap

from common.Config import Config
import pandas as pd
import numpy as np


class DataDictionary:
    """
    Ordered collection of data dictionary entries.

    Each entry is a plain dict with the keys "Type", "Parameter",
    "Description", "Source", "ValidValues" and "InferredLogic". Entries are
    kept in the order they were added.
    """

    def __init__(self):
        """
        Initialize the DataDictionary instance with an empty list of entries.
        """
        self.entries = []

    def add_entry(self, Type, Parameter, Description, Source, ValidValues, InferredLogic=None):
        """
        Add an entry to the data dictionary.

        Args:
            Type (str): The type of the entry (e.g., "Numeric", "Categorical").
            Parameter (str): The parameter name.
            Description (str): A brief description of the parameter.
            Source (str): The source of the parameter.
            ValidValues (str): Valid values or scoring method for the parameter.
            InferredLogic (str, optional): Inferred logic for the parameter.
        """
        self.entries.append(
            {
                "Type": Type,
                "Parameter": Parameter,
                "Description": Description,
                "Source": Source,
                "ValidValues": ValidValues,
                "InferredLogic": InferredLogic,
            }
        )

    def get_types(self):
        """
        Extract all types defined for the data dictionary, preserving insertion order.

        Entries whose "Type" is missing or None are skipped.

        Returns:
            list: A list of all unique types in the dictionary, preserving order.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order.
        candidates = (entry.get("Type") for entry in self.entries)
        return list(dict.fromkeys(t for t in candidates if t is not None))

    def get_parameters(self, type="All"):
        """
        Extract parameters of a particular type from the data dictionary,
        preserving insertion order.

        Args:
            type (str): Type of entries to return (defaults to "All", which
                matches every entry).

        Returns:
            list: A list of all unique parameters matching the specified type,
                preserving order.
        """
        # NOTE: the parameter name shadows the builtin `type`; it is kept
        # because callers may pass it by keyword.
        matching = (
            entry["Parameter"]
            for entry in self.entries
            if type == "All" or entry["Type"] == type
        )
        return list(dict.fromkeys(matching))

    def get_columns(self):
        """
        Generate a list of column names in the format type_parameter.

        Entries with a falsy Type or Parameter (None, empty string) are skipped.
        Duplicates are not removed.

        Returns:
            list: A list of column names preserving order.
        """
        return [
            f"{entry['Type']}_{entry['Parameter']}"
            for entry in self.entries
            if entry["Type"] and entry["Parameter"]  # both must be present
        ]

    def filter_entries(self, Source=None, Type=None, Parameter=None):
        """
        Filter entries based on Source, Type, or Parameter.

        A criterion left as None is not applied; the given criteria are
        combined with AND.

        Args:
            Source (str, optional): The source to filter by.
            Type (str, optional): The type to filter by.
            Parameter (str, optional): The parameter to filter by.

        Returns:
            list: A list of entries matching the filter criteria.
        """
        return [
            entry
            for entry in self.entries
            if (Source is None or entry["Source"] == Source)
            and (Type is None or entry["Type"] == Type)
            and (Parameter is None or entry["Parameter"] == Parameter)
        ]

    @staticmethod
    def _none_if_missing(value):
        """Return None for a missing cell (NaN/NaT/None), else the value unchanged."""
        return None if pd.isna(value) else value

    @staticmethod
    def generate_dictionary(data_dictionary_file):
        """
        Static method to generate a DataDictionary instance from an Excel (.xlsx) file.

        The sheet must provide the columns 'Type', 'Parameter', 'Description',
        'Source' and 'Scoring method'; 'Inferred_Logic' is optional. Blank
        cells are stored as None rather than pandas' float NaN, so the None /
        truthiness checks in get_types() and get_columns() treat them as
        missing instead of producing a "nan" type or "nan_<parameter>" column.

        Args:
            data_dictionary_file (str): The path to the Excel file containing
                data dictionary entries.

        Returns:
            DataDictionary: A populated DataDictionary instance.

        Raises:
            KeyError: If one of the required columns is absent from the sheet.
        """
        df = pd.read_excel(data_dictionary_file)
        clean = DataDictionary._none_if_missing

        data_dictionary = DataDictionary()
        for _, row in df.iterrows():
            data_dictionary.add_entry(
                Type=clean(row['Type']),
                Parameter=clean(row['Parameter']),
                Description=clean(row['Description']),
                Source=clean(row['Source']),
                ValidValues=clean(row['Scoring method']),
                # Series.get yields None when the optional column is absent.
                InferredLogic=clean(row.get('Inferred_Logic')),
            )
        return data_dictionary

    def __repr__(self):
        return f"DataDictionary({len(self.entries)} entries)"