student_sample_panel / common /DataDictionary.py
elaineaishophouse's picture
Upload 15 files
441d880 verified
raw
history blame
4.39 kB
import re
import datetime
import textwrap
from Config import Config
import pandas as pd
import numpy as np
class DataDictionary:
    """
    In-memory, insertion-ordered collection of data dictionary entries.

    Every entry is a plain dict submitted with the keys "Type", "Parameter"
    and "Description"; any other keys (for example "Source") are optional.
    Keys whose value is blank (None, empty string or NaN) are dropped when the
    entry is stored, so the accessors below treat every key as optional.
    """

    def __init__(self):
        """
        Initialize the DataDictionary instance with an empty list of entries.
        """
        self.entries = []

    @staticmethod
    def _is_blank(value):
        """
        Tell whether a field value should be treated as "not provided".

        Args:
            value: Any field value taken from an entry.

        Returns:
            bool: True for None, the empty string, or a float NaN (which is how
            pandas represents an empty spreadsheet cell); False otherwise.
        """
        if value is None:
            return True
        if isinstance(value, str):
            return value == ""
        if isinstance(value, float):
            # NaN is the only float that is not equal to itself.
            return value != value
        return False

    def add_entry(self, entry):
        """
        Add an entry to the data dictionary.

        Args:
            entry (dict): Mapping that must contain the keys "Type",
                "Parameter" and "Description". Extra keys are stored as-is.

        Raises:
            ValueError: If any of the required keys is absent from ``entry``.

        Only the presence of the required keys is checked. Keys whose value is
        blank (None, "" or NaN) are removed before the entry is stored, so a
        stored entry may lack a key that was submitted.
        """
        required_keys = {"Type", "Parameter", "Description"}
        missing = required_keys - entry.keys()
        if missing:
            raise ValueError(f"Missing required fields in entry: {missing}")

        clean_entry = {
            key: value
            for key, value in entry.items()
            if not self._is_blank(value)
        }
        self.entries.append(clean_entry)

    def get_types(self):
        """
        Extract all types defined in the data dictionary.

        Returns:
            list: Unique "Type" values in first-seen order. Entries without a
            stored "Type" are skipped.
        """
        seen = set()
        ordered_types = []
        for entry in self.entries:
            entry_type = entry.get("Type")
            if entry_type is None or entry_type in seen:
                continue
            seen.add(entry_type)
            ordered_types.append(entry_type)
        return ordered_types

    def get_parameters(self, type="All"):
        """
        Extract parameters of a particular type from the data dictionary.

        Args:
            type (str): Type of entries to return (defaults to "All", which
                selects every entry regardless of its type).

        Returns:
            list: Unique "Parameter" values of the selected entries in
            first-seen order. Entries without a stored "Parameter" are skipped,
            and entries without a stored "Type" are only selected by "All".
        """
        seen = set()
        ordered_parameters = []
        for entry in self.entries:
            selected = type == "All" or ("Type" in entry and entry["Type"] == type)
            if not selected:
                continue
            # add_entry may have dropped a blank "Parameter", so do not index.
            parameter = entry.get("Parameter")
            if parameter is None or parameter in seen:
                continue
            seen.add(parameter)
            ordered_parameters.append(parameter)
        return ordered_parameters

    def get_columns(self):
        """
        Generate a list of column names in the format ``<Type>_<Parameter>``.

        Returns:
            list: One column name per entry that has both a truthy "Type" and a
            truthy "Parameter", in insertion order. Duplicates are not removed.
        """
        columns = []
        for entry in self.entries:
            entry_type = entry.get("Type")
            parameter = entry.get("Parameter")
            if entry_type and parameter:  # Ensure both Type and Parameter exist
                columns.append(f"{entry_type}_{parameter}")
        return columns

    def filter_entries(self, Source=None, Type=None, Parameter=None):
        """
        Filter entries based on Source, Type, or Parameter.

        Args:
            Source (str, optional): The source to filter by.
            Type (str, optional): The type to filter by.
            Parameter (str, optional): The parameter to filter by.

        Returns:
            list: The stored entries that match every criterion that is not
            None. An entry that lacks a key being filtered on does not match.
        """
        criteria = {
            key: wanted
            for key, wanted in (
                ("Source", Source),
                ("Type", Type),
                ("Parameter", Parameter),
            )
            if wanted is not None
        }
        # `key in entry` guards against keys dropped by add_entry (e.g. a blank
        # "Source"), which would otherwise raise KeyError here.
        return [
            entry
            for entry in self.entries
            if all(
                key in entry and entry[key] == wanted
                for key, wanted in criteria.items()
            )
        ]

    @staticmethod
    def generate_dictionary(data_dictionary_file):
        """
        Generate a DataDictionary instance from an Excel (.xlsx) file.

        Args:
            data_dictionary_file (str): The path to the Excel file containing
                data dictionary entries.

        Returns:
            DataDictionary: A populated DataDictionary instance with one entry
            per spreadsheet row.

        Raises:
            KeyError: If the sheet lacks a "Type", "Parameter" or "Description"
                column. The "Source", "Scoring_Method" and "Inferred_Logic"
                columns are optional.

        Empty cells are read by pandas as NaN; add_entry drops those values so
        they never appear in the stored entries.
        """
        import pandas as pd  # Ensure pandas is imported

        df = pd.read_excel(data_dictionary_file)
        data_dictionary = DataDictionary()
        for _, row in df.iterrows():
            data_dictionary.add_entry({
                "Type": row["Type"],
                "Parameter": row["Parameter"],
                "Description": row["Description"],
                "Source": row.get("Source"),
                # NOTE(review): "ValidValues" is filled from the
                # "Scoring_Method" column - confirm this mapping is intended.
                "ValidValues": row.get("Scoring_Method"),
                "InferredLogic": row.get("Inferred_Logic"),
            })
        return data_dictionary

    def __repr__(self):
        """Return a short summary showing how many entries are stored."""
        return f"DataDictionary({len(self.entries)} entries)"