Spaces:

VEDAGI1
/

Medica_DecisionSupportAI

Sleeping

File size: 5,014 Bytes

# data_registry.py
import pandas as pd
import numpy as np
from typing import Dict, Any, List, Optional
import os

class DataRegistry:
    """In-memory registry of CSV datasets with descriptive metadata.

    Every registered file is keyed by its base name in three parallel
    mappings:

    * ``data`` -- the loaded ``DataFrame`` (column names normalized).
    * ``metadata`` -- type, column list, shape and a three-row sample.
    * ``healthcare_metadata`` -- optional classification derived from the
      column names (facilities, bed capacity, patient data).
    """

    def __init__(self):
        # All three mappings share the same key: the file's base name.
        self.data: Dict[str, pd.DataFrame] = {}
        self.metadata: Dict[str, Dict[str, Any]] = {}
        self.healthcare_metadata: Dict[str, Dict[str, Any]] = {}

    def add_path(self, path: str) -> bool:
        """Load a CSV file and register it under its base name.

        Column names are stripped, lower-cased, and have spaces and hyphens
        replaced by underscores. Registration is all-or-nothing: if any step
        fails, the registry is left exactly as it was before the call.

        Args:
            path: Location of the file to load. Only names ending in
                ``.csv`` (compared case-insensitively) are accepted.

        Returns:
            True if the file was loaded and registered; False if it is not a
            CSV file or an error occurred (the error is printed).
        """
        try:
            file_name = os.path.basename(path)

            # Compare the extension case-insensitively so "DATA.CSV" loads.
            if not file_name.lower().endswith('.csv'):
                return False

            df = pd.read_csv(path)

            # Standardize column names
            df.columns = [
                col.strip().lower().replace(' ', '_').replace('-', '_')
                for col in df.columns
            ]

            # Run the healthcare analysis first: it may append derived
            # columns, and the basic metadata below must describe the frame
            # exactly as callers will receive it from get().
            healthcare_meta = self._build_healthcare_metadata(df)

            basic_meta = {
                'type': 'csv',
                'columns': list(df.columns),
                'shape': df.shape,
                'sample': df.head(3).to_dict('records'),
            }
        except Exception as e:
            print(f"Error adding {path}: {e}")
            return False

        # Commit only after every step succeeded, so a failure never leaves
        # a half-registered file behind.
        self.data[file_name] = df
        self.metadata[file_name] = basic_meta
        self._store_healthcare_metadata(file_name, healthcare_meta)
        return True

    @staticmethod
    def _build_healthcare_metadata(df: pd.DataFrame) -> Dict[str, Any]:
        """Classify *df* by its column names and collect related statistics.

        The checks run in a fixed order (facilities, bed capacity, patient
        data); when several match, the last match determines ``data_type``.

        Side effect: when both ``beds_current`` and ``beds_prev`` are
        present, ``bed_change`` and ``percent_change`` columns are added to
        *df* in place.

        Returns:
            The healthcare metadata dict; empty when no check matched.
        """
        healthcare_meta: Dict[str, Any] = {}
        columns = df.columns

        # Check for healthcare facility data
        if any(col in columns for col in ('facility_name', 'facility_type', 'odhf_facility_type')):
            healthcare_meta['data_type'] = 'healthcare_facilities'
            if 'facility_type' in columns:
                healthcare_meta['facility_types'] = df['facility_type'].value_counts().to_dict()
            if 'city' in columns:
                healthcare_meta['cities'] = df['city'].value_counts().head(10).to_dict()

        # Check for bed capacity data
        if any(col in columns for col in ('beds_current', 'beds_prev', 'bed_count')):
            healthcare_meta['data_type'] = 'bed_capacity'
            if 'zone' in columns:
                healthcare_meta['zones'] = df['zone'].unique().tolist()
            if 'teaching_status' in columns:
                healthcare_meta['teaching_status_counts'] = df['teaching_status'].value_counts().to_dict()

            # Calculate derived metrics
            if 'beds_current' in columns and 'beds_prev' in columns:
                bed_change = df['beds_current'] - df['beds_prev']
                # A percentage change from a zero baseline is undefined; mask
                # zeros to NaN so the result is NaN instead of +/-inf.
                baseline = df['beds_prev'].where(df['beds_prev'] != 0)
                df['bed_change'] = bed_change
                df['percent_change'] = (bed_change / baseline) * 100
                healthcare_meta['has_derived_metrics'] = True

        # Check for patient data (with privacy warning)
        if any(col in columns for col in ('patient_id', 'patient_name', 'mrn')):
            healthcare_meta['data_type'] = 'patient_data'
            healthcare_meta['privacy_warning'] = "This file contains patient identifiers. Ensure proper handling."

        return healthcare_meta

    def _store_healthcare_metadata(self, file_name: str, healthcare_meta: Dict[str, Any]) -> None:
        """Record *healthcare_meta* for *file_name*, or drop a stale entry.

        An empty result removes any entry left over from an earlier
        registration under the same name, so lookups never report outdated
        classifications.
        """
        if healthcare_meta:
            self.healthcare_metadata[file_name] = healthcare_meta
        else:
            self.healthcare_metadata.pop(file_name, None)

    def _extract_healthcare_metadata(self, file_name: str, df: pd.DataFrame):
        """Extract healthcare-specific metadata from *df* and store it.

        May add ``bed_change`` / ``percent_change`` columns to *df* in place
        (see ``_build_healthcare_metadata``).
        """
        self._store_healthcare_metadata(file_name, self._build_healthcare_metadata(df))

    def get_healthcare_metadata(self, name: str) -> Dict[str, Any]:
        """Return the healthcare metadata for *name*, or ``{}`` if none."""
        return self.healthcare_metadata.get(name, {})

    def get_data_type(self, name: str) -> str:
        """Return the healthcare data type of *name*, or ``'unknown'``."""
        meta = self.get_healthcare_metadata(name)
        return meta.get('data_type', 'unknown')

    def names(self) -> List[str]:
        """Return the names of all registered files."""
        return list(self.data)

    def get(self, name) -> Optional[pd.DataFrame]:
        """Return the DataFrame registered as *name*, or None if absent."""
        return self.data.get(name)

    def summarize_for_prompt(self) -> str:
        """Generate a plain-text summary of all data for prompt inclusion.

        Each file contributes its name, type, columns and shape, followed by
        its healthcare metadata (minus the privacy warning) and a blank
        separator line.
        """
        if not self.data:
            return "No data files registered."

        summary_parts = []
        for file_name in self.names():
            meta = self.metadata.get(file_name, {})
            health_meta = self.get_healthcare_metadata(file_name)

            summary_parts.append(f"File: {file_name}")
            summary_parts.append(f"Type: {meta.get('type', 'unknown')}")
            summary_parts.append(f"Columns: {', '.join(meta.get('columns', []))}")
            summary_parts.append(f"Shape: {meta.get('shape', 'unknown')}")

            if health_meta:
                summary_parts.append("Healthcare Context:")
                for key, value in health_meta.items():
                    if key != 'privacy_warning':  # Don't include warnings in prompt
                        summary_parts.append(f"  {key}: {value}")

            summary_parts.append("")

        return "\n".join(summary_parts)

    def clear(self) -> None:
        """Remove every registered file and all associated metadata."""
        self.data.clear()
        self.metadata.clear()
        self.healthcare_metadata.clear()