Spaces:

nat232
/

student_sample_panel

Build error

File size: 4,390 Bytes

441d880

import re
import datetime
import textwrap

from Config import Config

import pandas as pd
import numpy as np

class DataDictionary:
    """
    In-memory collection of data dictionary entries.

    Each entry is a plain dict. The keys "Type", "Parameter" and "Description"
    must be present when an entry is added; any key whose value is missing
    (None, empty string, or float NaN) is dropped before the entry is stored.
    Because of that, stored entries may lack any key, and every reader method
    below looks fields up with ``dict.get`` rather than indexing.
    """

    def __init__(self):
        """
        Initialize the DataDictionary instance with an empty list of entries.
        """
        self.entries = []

    @staticmethod
    def _is_missing(value):
        """
        Return True when a field value should be treated as absent.

        Covers None, the empty string, and float NaN (which is what pandas
        yields for blank spreadsheet cells). NaN is detected through
        self-inequality so no extra import is required; numpy.float64 is a
        float subclass and is covered by the isinstance check.
        """
        if value is None:
            return True
        if isinstance(value, float):
            return value != value  # only NaN is unequal to itself
        return isinstance(value, str) and value == ""

    def add_entry(self, entry):
        """
        Add an entry to the data dictionary.

        Args:
        entry (dict): Mapping that must contain the keys "Type", "Parameter"
            and "Description". Other keys are stored as given.

        Raises:
        ValueError: If any required key is absent from ``entry``.

        Note:
        Only key presence is validated. Keys whose value is None, an empty
        string or NaN are removed from the stored copy, so a stored entry may
        end up without one of the required keys.
        """
        required_keys = {"Type", "Parameter", "Description"}
        missing = required_keys - entry.keys()
        if missing:
            raise ValueError(f"Missing required fields in entry: {missing}")

        # Store a cleaned copy so the caller's dict is never mutated.
        clean_entry = {
            key: value
            for key, value in entry.items()
            if not self._is_missing(value)
        }
        self.entries.append(clean_entry)

    def get_types(self):
        """
        Extract all types defined for the data dictionary, preserving insertion order.

        Returns:
        list: A list of all unique types in the dictionary, preserving order.
        """
        seen = set()
        ordered_types = []
        for entry in self.entries:
            entry_type = entry.get("Type")
            if entry_type is None or entry_type in seen:
                continue
            seen.add(entry_type)
            ordered_types.append(entry_type)
        return ordered_types

    def get_parameters(self, type="All"):
        """
        Extract parameters of a particular type from the data dictionary, preserving insertion order.

        Args:
        type (str): Type of entries to return (defaults to "All"). The name
            shadows the builtin but is kept so keyword callers keep working.

        Returns:
        list: A list of all unique parameters matching the specified type, preserving order.
        """
        seen = set()
        ordered_parameters = []

        for entry in self.entries:
            # .get(): the "Type"/"Parameter" keys may have been stripped by
            # add_entry when their values were blank.
            if type != "All" and entry.get("Type") != type:
                continue
            parameter = entry.get("Parameter")
            if parameter is None or parameter in seen:
                continue
            seen.add(parameter)
            ordered_parameters.append(parameter)

        return ordered_parameters

    def get_columns(self):
        """
        Generate a list of column names in the format type_parameter.

        Entries lacking a truthy "Type" or "Parameter" are skipped.

        Returns:
        list: A list of column names preserving order.
        """
        columns = []
        for entry in self.entries:
            entry_type = entry.get("Type")
            parameter = entry.get("Parameter")
            if entry_type and parameter:  # Ensure both Type and Parameter exist
                columns.append(f"{entry_type}_{parameter}")
        return columns

    def filter_entries(self, Source=None, Type=None, Parameter=None):
        """
        Filter entries based on Source, Type, or Parameter.

        A criterion left as None is not applied. An entry that lacks the key
        for an applied criterion simply does not match.

        Args:
        Source (str, optional): The source to filter by.
        Type (str, optional): The type to filter by.
        Parameter (str, optional): The parameter to filter by.

        Returns:
        list: A list of entries matching the filter criteria.
        """
        return [
            entry for entry in self.entries
            if (Source is None or entry.get("Source") == Source) and
               (Type is None or entry.get("Type") == Type) and
               (Parameter is None or entry.get("Parameter") == Parameter)
        ]

    @staticmethod
    def generate_dictionary(data_dictionary_file):
        """
        Static method to generate a DataDictionary instance from an Excel (.xlsx) file.

        The sheet must have "Type", "Parameter" and "Description" columns.
        "Source", "Scoring_Method" and "Inferred_Logic" are read when present.
        Blank cells are dropped from the resulting entries by add_entry.

        Args:
        data_dictionary_file (str): The path to the Excel file containing data dictionary entries.

        Returns:
        DataDictionary: A populated DataDictionary instance.

        Raises:
        KeyError: If a required column is absent from the sheet.
        """
        import pandas as pd  # Ensure pandas is imported
        df = pd.read_excel(data_dictionary_file)

        data_dictionary = DataDictionary()

        for _, row in df.iterrows():
            data_dictionary.add_entry({
                "Type": row["Type"],
                "Parameter": row["Parameter"],
                "Description": row["Description"],
                "Source": row.get("Source"),
                # NOTE(review): "ValidValues" is filled from the
                # "Scoring_Method" column -- confirm this mapping is intended.
                "ValidValues": row.get("Scoring_Method"),
                "InferredLogic": row.get("Inferred_Logic"),
            })

        return data_dictionary

    def __repr__(self):
        return f"DataDictionary({len(self.entries)} entries)"