import datetime
import re
import textwrap

import numpy as np
import pandas as pd

from Config import Config


class DataDictionary:
    """
    Ordered collection of data dictionary entries.

    Each entry is stored as a plain dict in ``self.entries``. Callers must
    supply the "Type", "Parameter" and "Description" keys when adding an
    entry, but any key whose value is blank (None, "" or NaN) is dropped
    before storage. Stored entries may therefore lack any key, which is why
    every reader in this class uses ``dict.get`` instead of indexing.
    """

    # Keys every entry passed to add_entry() must contain.
    _REQUIRED_KEYS = ("Type", "Parameter", "Description")

    def __init__(self):
        """
        Initialize the DataDictionary instance with an empty list of entries.
        """
        self.entries = []

    @staticmethod
    def _is_blank(value):
        """
        Tell whether a value counts as "no value" for storage purposes.

        Args:
            value: Any entry value.

        Returns:
            bool: True for None, the empty string, or a float NaN.
        """
        if value is None:
            return True
        if isinstance(value, float):
            # generate_dictionary() feeds pandas cells in here, and pandas
            # uses float NaN for empty cells. NaN is truthy and unequal to
            # "", so it needs its own check.
            return bool(np.isnan(value))
        return isinstance(value, str) and value == ""

    def add_entry(self, entry: dict) -> None:
        """
        Add an entry to the data dictionary.

        Blank values (None, empty string, NaN) are filtered out before the
        entry is stored. The required-key check only verifies that the keys
        are present in ``entry``; a required key with a blank value is
        accepted and then dropped like any other blank value.

        Args:
            entry (dict): Entry with at least the keys "Type", "Parameter"
                and "Description".

        Raises:
            ValueError: If any required key is absent from ``entry``.
        """
        missing = {key for key in self._REQUIRED_KEYS if key not in entry}
        if missing:
            raise ValueError(f"Missing required fields in entry: {missing}")

        self.entries.append(
            {
                key: value
                for key, value in entry.items()
                if not self._is_blank(value)
            }
        )

    def get_types(self) -> list:
        """
        Extract all types defined for the data dictionary.

        Returns:
            list: The unique "Type" values in first-seen order. Entries
                without a type are skipped.
        """
        types = (entry.get("Type") for entry in self.entries)
        # dict.fromkeys de-duplicates while keeping insertion order.
        return list(dict.fromkeys(t for t in types if t is not None))

    def get_parameters(self, type="All") -> list:
        """
        Extract parameters of a particular type from the data dictionary.

        Args:
            type (str): Type of entries to return (defaults to "All", which
                selects every entry). The name shadows the builtin but is
                kept because callers may pass it by keyword.

        Returns:
            list: The unique "Parameter" values of the selected entries in
                first-seen order. Entries without a parameter are skipped.
        """
        parameters = (
            entry.get("Parameter")
            for entry in self.entries
            if type == "All" or entry.get("Type") == type
        )
        return list(dict.fromkeys(p for p in parameters if p is not None))

    def get_columns(self) -> list:
        """
        Generate a list of column names in the format type_parameter.

        Returns:
            list: One "<Type>_<Parameter>" name per entry that has both
                values, in entry order. Duplicates are not removed.
        """
        columns = []
        for entry in self.entries:
            entry_type = entry.get("Type")
            parameter = entry.get("Parameter")
            # Either key may be absent because add_entry() drops blank
            # values; such entries produce no column.
            if entry_type and parameter:
                columns.append(f"{entry_type}_{parameter}")
        return columns

    def filter_entries(self, Source=None, Type=None, Parameter=None) -> list:
        """
        Filter entries based on Source, Type, or Parameter.

        Args:
            Source (str, optional): The source to filter by.
            Type (str, optional): The type to filter by.
            Parameter (str, optional): The parameter to filter by.

        Returns:
            list: The entries matching every criterion that is not None.
                An entry that lacks a filtered key never matches.
        """
        criteria = {
            key: wanted
            for key, wanted in (
                ("Source", Source),
                ("Type", Type),
                ("Parameter", Parameter),
            )
            if wanted is not None
        }
        return [
            entry
            for entry in self.entries
            if all(entry.get(key) == wanted for key, wanted in criteria.items())
        ]

    @staticmethod
    def generate_dictionary(data_dictionary_file):
        """
        Generate a DataDictionary instance from an Excel (.xlsx) file.

        The sheet must have "Type", "Parameter" and "Description" columns.
        The "Source", "Scoring_Method" and "Inferred_Logic" columns are
        optional and are stored under the keys "Source", "ValidValues" and
        "InferredLogic" respectively.

        Args:
            data_dictionary_file (str): The path to the Excel file containing
                data dictionary entries.

        Returns:
            DataDictionary: A populated DataDictionary instance.
        """
        df = pd.read_excel(data_dictionary_file)

        data_dictionary = DataDictionary()
        for _, row in df.iterrows():
            # Empty cells come through as NaN; add_entry() drops them.
            data_dictionary.add_entry({
                "Type": row["Type"],
                "Parameter": row["Parameter"],
                "Description": row["Description"],
                "Source": row.get("Source"),
                "ValidValues": row.get("Scoring_Method"),
                "InferredLogic": row.get("Inferred_Logic"),
            })

        return data_dictionary

    def __repr__(self):
        return f"DataDictionary({len(self.entries)} entries)"