# WWW1M / common /DataDictionary.py
# GauravGoel11's picture
# initial commit
# f71ff04 verified
import re
import datetime
import textwrap
from common.Config import Config
import pandas as pd
import numpy as np
class DataDictionary:
    """
    Ordered collection of data dictionary entries.

    Each entry is a plain dict with the keys "Type", "Parameter",
    "Description", "Source", "ValidValues" and "InferredLogic". Entries are
    stored in insertion order in ``self.entries``.
    """

    def __init__(self):
        """
        Initialize the DataDictionary instance with an empty list of entries.
        """
        self.entries = []

    def add_entry(self, Type, Parameter, Description, Source, ValidValues, InferredLogic=None):
        """
        Add an entry to the data dictionary.

        Values are stored exactly as given; no validation or normalization
        is performed here.

        Args:
            Type (str): The type of the entry (e.g., "Numeric", "Categorical").
            Parameter (str): The parameter name.
            Description (str): A brief description of the parameter.
            Source (str): The source of the parameter.
            ValidValues (str): Valid values or scoring method for the parameter.
            InferredLogic (str, optional): Inferred logic for the parameter.
        """
        self.entries.append(
            {
                "Type": Type,
                "Parameter": Parameter,
                "Description": Description,
                "Source": Source,
                "ValidValues": ValidValues,
                "InferredLogic": InferredLogic,
            }
        )

    def get_types(self):
        """
        Extract all types defined for the data dictionary, preserving insertion order.

        Entries whose "Type" is None (or absent) are ignored.

        Returns:
            list: A list of all unique types in the dictionary, preserving order.
        """
        types = (entry.get("Type") for entry in self.entries)
        # dict.fromkeys de-duplicates while keeping first-seen order.
        return list(dict.fromkeys(t for t in types if t is not None))

    def get_parameters(self, type="All"):
        """
        Extract parameters of a particular type from the data dictionary, preserving insertion order.

        Args:
            type (str): Type of entries to return (defaults to "All", which
                selects every entry).

        Returns:
            list: A list of all unique parameters matching the specified type, preserving order.
        """
        # NOTE: the parameter name shadows the builtin ``type``; it is kept
        # because callers may pass it by keyword.
        return list(
            dict.fromkeys(
                entry["Parameter"]
                for entry in self.entries
                if type == "All" or entry["Type"] == type
            )
        )

    def get_columns(self):
        """
        Generate a list of column names in the format type_parameter.

        Entries with a falsy Type or Parameter (e.g. None or "") are skipped.
        Duplicates are not removed.

        Returns:
            list: A list of column names preserving order.
        """
        return [
            f"{entry['Type']}_{entry['Parameter']}"
            for entry in self.entries
            if entry["Type"] and entry["Parameter"]
        ]

    def filter_entries(self, Source=None, Type=None, Parameter=None):
        """
        Filter entries based on Source, Type, or Parameter.

        A criterion left as None is not applied; all supplied criteria must
        match (logical AND).

        Args:
            Source (str, optional): The source to filter by.
            Type (str, optional): The type to filter by.
            Parameter (str, optional): The parameter to filter by.

        Returns:
            list: A list of entries matching the filter criteria.
        """
        return [
            entry
            for entry in self.entries
            if (Source is None or entry["Source"] == Source)
            and (Type is None or entry["Type"] == Type)
            and (Parameter is None or entry["Parameter"] == Parameter)
        ]

    @staticmethod
    def _none_if_missing(value):
        """
        Return None for a missing scalar (NaN/NaT/NA/None), else the value unchanged.
        """
        # is_scalar guards against pd.isna returning an array for list-likes.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value

    @classmethod
    def from_dataframe(cls, df):
        """
        Build a DataDictionary from a pandas DataFrame, one entry per row.

        The frame must provide the columns "Type", "Parameter",
        "Description", "Source" and "Scoring method"; "Inferred_Logic" is
        optional. Missing cell values (NaN and similar) are stored as None so
        that they are treated as absent by get_types and get_columns instead
        of surfacing as "nan".

        Args:
            df (pandas.DataFrame): Table of data dictionary rows.

        Returns:
            DataDictionary: A populated instance of this class.

        Raises:
            KeyError: If a row lacks one of the required columns.
        """
        data_dictionary = cls()
        clean = cls._none_if_missing
        for _, row in df.iterrows():
            data_dictionary.add_entry(
                Type=clean(row["Type"]),
                Parameter=clean(row["Parameter"]),
                Description=clean(row["Description"]),
                Source=clean(row["Source"]),
                ValidValues=clean(row["Scoring method"]),
                # Optional column: Series.get yields None when it is absent.
                InferredLogic=clean(row.get("Inferred_Logic")),
            )
        return data_dictionary

    @staticmethod
    def generate_dictionary(data_dictionary_file):
        """
        Static method to generate a DataDictionary instance from an Excel (.xlsx) file.

        The sheet is read with pandas.read_excel and converted row by row;
        blank cells are stored as None rather than NaN.

        Args:
            data_dictionary_file (str): The path to the Excel file containing data dictionary entries.

        Returns:
            DataDictionary: A populated DataDictionary instance.
        """
        df = pd.read_excel(data_dictionary_file)
        return DataDictionary.from_dataframe(df)

    def __repr__(self):
        """
        Return a short summary showing how many entries are stored.
        """
        return f"DataDictionary({len(self.entries)} entries)"