# student_sample_panel/common/DataDictionary.py
# elaineaishophouse's picture
# Upload 15 files
# 441d880 verified
import re
import datetime
import textwrap
from Config import Config
import pandas as pd
import numpy as np
class DataDictionary:
    """
    Insertion-ordered, in-memory collection of data dictionary entries.

    Every entry is a plain dict stored through ``add_entry``. The keys "Type",
    "Parameter" and "Description" must be supplied, but any key whose value is
    blank (None, "" or float NaN) is dropped before storage. A stored entry may
    therefore lack any key, so all readers in this class look keys up with
    ``dict.get`` instead of indexing.
    """

    def __init__(self):
        """
        Initialize the DataDictionary instance with an empty list of entries.
        """
        self.entries = []

    @staticmethod
    def _is_blank(value):
        """
        Tell whether a value counts as "not provided".

        Returns:
            bool: True for None, the empty string, and float NaN.
        """
        if value is None:
            return True
        if isinstance(value, float):
            # NaN is the only float that is not equal to itself. pandas uses
            # it for empty spreadsheet cells, so it must be treated as missing.
            return value != value
        return isinstance(value, str) and value == ""

    def add_entry(self, entry):
        """
        Add an entry to the data dictionary.

        Blank values (None, "" or float NaN) are removed before the entry is
        stored, so the stored dict may have fewer keys than the one passed in.
        Only the presence of the required keys is checked, not their values.

        Args:
            entry (dict): Mapping that contains at least the keys "Type",
                "Parameter" and "Description".
        Raises:
            ValueError: If any of the required keys is absent from ``entry``.
        """
        required_keys = {"Type", "Parameter", "Description"}
        missing = required_keys - entry.keys()
        if missing:
            raise ValueError(f"Missing required fields in entry: {missing}")

        clean_entry = {
            key: value
            for key, value in entry.items()
            if not self._is_blank(value)
        }
        self.entries.append(clean_entry)

    def get_types(self):
        """
        Extract all types defined for the data dictionary, preserving insertion order.

        Entries without a "Type" are ignored.

        Returns:
            list: A list of all unique types in the dictionary, preserving order.
        """
        seen = set()
        ordered_types = []
        for entry in self.entries:
            entry_type = entry.get("Type")
            if entry_type is None or entry_type in seen:
                continue
            seen.add(entry_type)
            ordered_types.append(entry_type)
        return ordered_types

    def get_parameters(self, type="All"):
        """
        Extract parameters of a particular type from the data dictionary, preserving insertion order.

        Entries that were stored without a "Type" only match ``type="All"``;
        they are skipped (not an error) when a specific type is requested.

        Args:
            type (str): Type of entries to return (defaults to "All").
        Returns:
            list: A list of all unique parameters matching the specified type, preserving order.
        """
        # NOTE: the parameter name shadows the builtin ``type``; it is kept
        # because callers may pass it by keyword.
        seen = set()
        ordered_parameters = []
        for entry in self.entries:
            if type != "All" and entry.get("Type") != type:
                continue
            parameter = entry.get("Parameter")
            if parameter is None or parameter in seen:
                continue
            seen.add(parameter)
            ordered_parameters.append(parameter)
        return ordered_parameters

    def get_columns(self):
        """
        Generate a list of column names in the format type_parameter.

        Entries whose "Type" or "Parameter" is missing or falsy are skipped.

        Returns:
            list: A list of column names preserving order.
        """
        columns = []
        for entry in self.entries:
            entry_type = entry.get("Type")
            parameter = entry.get("Parameter")
            if entry_type and parameter:  # Ensure both Type and Parameter exist
                columns.append(f"{entry_type}_{parameter}")
        return columns

    def filter_entries(self, Source=None, Type=None, Parameter=None):
        """
        Filter entries based on Source, Type, or Parameter.

        A criterion left as None is not applied. An entry that lacks the key
        for an applied criterion does not match.

        Args:
            Source (str, optional): The source to filter by.
            Type (str, optional): The type to filter by.
            Parameter (str, optional): The parameter to filter by.
        Returns:
            list: A list of entries matching the filter criteria.
        """
        requested = {"Source": Source, "Type": Type, "Parameter": Parameter}
        active = {key: value for key, value in requested.items() if value is not None}
        return [
            entry for entry in self.entries
            if all(entry.get(key) == value for key, value in active.items())
        ]

    @staticmethod
    def generate_dictionary(data_dictionary_file):
        """
        Static method to generate a DataDictionary instance from an Excel (.xlsx) file.

        The sheet must have the columns "Type", "Parameter" and "Description".
        The columns "Source", "Scoring_Method" and "Inferred_Logic" are read
        when present and stored under the entry keys "Source", "ValidValues"
        and "InferredLogic" respectively. Empty cells are dropped from the
        entry by ``add_entry``.

        Args:
            data_dictionary_file (str): The path to the Excel file containing data dictionary entries.
        Returns:
            DataDictionary: A populated DataDictionary instance.
        Raises:
            KeyError: If one of the three mandatory columns is absent.
        """
        import pandas as pd  # Local import keeps this method self-contained

        df = pd.read_excel(data_dictionary_file)
        data_dictionary = DataDictionary()
        for _, row in df.iterrows():
            data_dictionary.add_entry({
                "Type": row["Type"],
                "Parameter": row["Parameter"],
                "Description": row["Description"],
                # Series.get returns None when the optional column is absent.
                "Source": row.get("Source"),
                "ValidValues": row.get("Scoring_Method"),
                "InferredLogic": row.get("Inferred_Logic"),
            })
        return data_dictionary

    def __repr__(self):
        """Return a short summary showing how many entries are stored."""
        return f"DataDictionary({len(self.entries)} entries)"