Spaces:

VEDAGI1
/

Medica_DecisionSupportAI

Build error

File size: 8,207 Bytes

# data_registry.py
import pandas as pd
import numpy as np
from typing import Dict, Any, List, Optional, Union
import os
import json

class DataRegistry:
    """In-memory registry of tabular data files and their metadata.

    Entries are keyed by the file's base name, so registering two paths
    that share a base name replaces the earlier entry.

    Attributes:
        data: file name -> DataFrame with standardized column names,
            including any columns derived by the registry.
        metadata: file name -> dict with 'type', 'columns', 'shape' and
            'sample' keys.
        healthcare_metadata: file name -> healthcare-specific summary dict
            (only present when something healthcare-related was detected).
        derived_columns: file name -> set of column names the registry
            added on top of the file's own columns.
    """

    # Substrings used to map arbitrary column names onto standard roles.
    # Matching is a plain substring test, so these are loose heuristics.
    _COLUMN_PATTERNS = {
        'facility_name': ['facility', 'name', 'hospital', 'site', 'location'],
        'facility_type': ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'],
        'beds_current': ['current', '2023', '2024', 'beds_current', 'staffed_beds', 'capacity'],
        'beds_prev': ['prev', 'previous', '2022', 'beds_prev', 'previous_beds'],
        'zone': ['zone', 'region', 'area', 'district'],
        'province': ['province', 'state', 'territory'],
        'city': ['city', 'municipality', 'town'],
        'teaching_status': ['teaching', 'status', 'type', 'hospital_type'],
    }

    # Characters replaced by '_' when standardizing column names.
    _COLUMN_NAME_TABLE = str.maketrans({' ': '_', '-': '_', '.': '_'})

    def __init__(self):
        self.data = {}
        self.metadata = {}
        self.healthcare_metadata = {}
        self.derived_columns = {}  # Track derived columns per file

    # ------------------------------------------------------------------
    # Loading
    # ------------------------------------------------------------------
    @classmethod
    def _standardize_column_name(cls, col: Any) -> str:
        """Return *col* as a stripped, lower-case, underscore-separated string.

        Non-string labels (e.g. integer headers) are converted with ``str``
        first so they do not abort the whole load.
        """
        return str(col).strip().lower().translate(cls._COLUMN_NAME_TABLE)

    @staticmethod
    def _read_frame(path: str, file_ext: str) -> Optional[pd.DataFrame]:
        """Read *path* according to *file_ext*; return None if unsupported."""
        if file_ext == '.csv':
            return pd.read_csv(path)
        if file_ext in ('.xlsx', '.xls'):
            return pd.read_excel(path)
        if file_ext == '.json':
            # JSON text is UTF-8 by specification; do not depend on the
            # platform's default encoding.
            with open(path, 'r', encoding='utf-8') as f:
                return pd.json_normalize(json.load(f))
        if file_ext == '.parquet':
            return pd.read_parquet(path)
        return None

    def add_path(self, path: str) -> bool:
        """Add a data file to the registry with dynamic processing.

        Supported extensions: .csv, .xlsx, .xls, .json, .parquet.

        The stored frame is the processed one, so every column listed in
        ``metadata[...]['columns']`` and ``derived_columns`` is actually
        present in ``get(file_name)``.

        Args:
            path: Location of the file to load.

        Returns:
            True when the file was registered; False when the extension is
            unsupported or loading/processing failed (a message is printed
            and the registry is left as it was before the call).
        """
        # This method is the error boundary of the registry: callers rely on
        # a bool result rather than exceptions, hence the broad handlers.
        try:
            file_name = os.path.basename(path)
            file_ext = os.path.splitext(file_name)[1].lower()
            df = self._read_frame(path, file_ext)
            if df is None:
                print(f"Unsupported file type: {file_ext}")
                return False
            df.columns = [self._standardize_column_name(col) for col in df.columns]
        except Exception as e:
            print(f"Error adding {path}: {e}")
            return False

        # Remember what was registered under this name so a failure below
        # cannot leave a half-updated entry behind.
        stores = (self.data, self.metadata, self.healthcare_metadata, self.derived_columns)
        previous = [(store, file_name in store, store.get(file_name)) for store in stores]

        try:
            self.derived_columns[file_name] = set()

            # Derive columns first, then store, so the stored frame matches
            # the advertised column list.
            self._process_healthcare_data(file_name, df)
            self.data[file_name] = df

            self.metadata[file_name] = {
                'type': file_ext,
                'columns': list(df.columns),
                'shape': df.shape,
                'sample': df.head(3).to_dict('records'),
            }

            self._extract_healthcare_metadata(file_name, df)
            return True
        except Exception as e:
            for store, existed, old_value in previous:
                if existed:
                    store[file_name] = old_value
                else:
                    store.pop(file_name, None)
            print(f"Error adding {path}: {e}")
            return False

    # ------------------------------------------------------------------
    # Healthcare-specific processing
    # ------------------------------------------------------------------
    def _resolve_column(self, df: pd.DataFrame, standard_col: str, exclude=()) -> Optional[str]:
        """Return the column of *df* that plays the role *standard_col*.

        A column named exactly *standard_col* wins; otherwise the first
        column containing any of the role's patterns is used. Columns listed
        in *exclude* are never returned. Returns None when nothing matches.
        """
        if standard_col in df.columns and standard_col not in exclude:
            return standard_col
        patterns = self._COLUMN_PATTERNS[standard_col]
        for col in df.columns:
            if col in exclude:
                continue
            if any(pattern in str(col) for pattern in patterns):
                return col
        return None

    @staticmethod
    def _as_numeric(series: pd.Series) -> Optional[pd.Series]:
        """Coerce *series* to numbers; None if it holds only non-numeric values."""
        numeric = pd.to_numeric(series, errors='coerce')
        if series.notna().any() and not numeric.notna().any():
            return None
        return numeric

    def _process_healthcare_data(self, file_name: str, df: pd.DataFrame):
        """Dynamically process healthcare data based on available columns.

        Mutates *df* in place and records every added column in
        ``derived_columns[file_name]``:

        * ``bed_change`` / ``percent_change`` when both a current and a
          previous bed-count column are found and hold numeric data;
          ``percent_change`` is 0 where the previous count is 0.
        * ``facility_type`` as an alias when a facility-type-like column
          exists under another name (an existing ``facility_type`` column is
          never overwritten).
        """
        derived = self.derived_columns.setdefault(file_name, set())

        # Resolve all roles before adding anything to the frame.
        prev_col = self._resolve_column(df, 'beds_prev')
        # Exclude the previous-period column so a name such as
        # 'previous_capacity' cannot serve as both operands.
        current_col = self._resolve_column(df, 'beds_current', exclude=(prev_col,))
        type_col = self._resolve_column(df, 'facility_type')

        if current_col is not None and prev_col is not None:
            current = self._as_numeric(df[current_col])
            prev = self._as_numeric(df[prev_col])
            # Skip the metrics (rather than fail the whole file) when the
            # heuristic matched a text column.
            if current is not None and prev is not None:
                change = current - prev
                df['bed_change'] = change
                derived.add('bed_change')

                # Zeros become NaN for the division, then are reported as 0.
                nonzero_prev = prev.where(prev != 0)
                df['percent_change'] = (change / nonzero_prev * 100).mask(prev == 0, 0)
                derived.add('percent_change')

        if type_col is not None and type_col != 'facility_type':
            df['facility_type'] = df[type_col]
            derived.add('facility_type')

    def _extract_healthcare_metadata(self, file_name: str, df: pd.DataFrame):
        """Extract healthcare-specific metadata dynamically.

        The result is stored in ``healthcare_metadata[file_name]``; when
        nothing is detected any earlier entry for the name is removed. If
        both facility-like and bed-like columns exist, 'data_type' ends up
        as 'bed_data' because the bed check runs last.
        """
        columns = [str(col) for col in df.columns]
        has_facility_cols = any(
            pattern in col for col in columns for pattern in ('facility', 'name', 'site')
        )
        has_bed_cols = any(
            pattern in col for col in columns for pattern in ('bed', 'capacity')
        )

        healthcare_meta = {}

        if has_facility_cols:
            healthcare_meta['data_type'] = 'facility_data'
            if 'facility_type' in df.columns:
                healthcare_meta['facility_types'] = df['facility_type'].value_counts().to_dict()
            if 'city' in df.columns:
                healthcare_meta['cities'] = df['city'].value_counts().head(10).to_dict()

        if has_bed_cols:
            healthcare_meta['data_type'] = 'bed_data'
            if 'zone' in df.columns:
                healthcare_meta['zones'] = df['zone'].unique().tolist()
            if 'teaching_status' in df.columns:
                healthcare_meta['teaching_status_counts'] = df['teaching_status'].value_counts().to_dict()

            # Check for derived metrics
            if 'bed_change' in df.columns:
                healthcare_meta['has_derived_metrics'] = True

        if healthcare_meta:
            self.healthcare_metadata[file_name] = healthcare_meta
        else:
            # Do not keep a summary that belonged to an earlier version of
            # the file registered under the same name.
            self.healthcare_metadata.pop(file_name, None)

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------
    def get_derived_columns(self, file_name: str) -> set:
        """Get derived columns for a file (empty set if the file is unknown)."""
        return self.derived_columns.get(file_name, set())

    def find_column(self, file_name: str, patterns: List[str]) -> Optional[str]:
        """Find a column matching any of the given patterns.

        Matching is a case-insensitive substring test; the first matching
        column in frame order is returned. Returns None when the file is not
        registered or no column matches.
        """
        df = self.get(file_name)
        if df is None:
            return None

        lowered = [pattern.lower() for pattern in patterns]
        for col in df.columns:
            col_lower = str(col).lower()
            if any(pattern in col_lower for pattern in lowered):
                return col
        return None

    def get_data_by_type(self, data_type: str) -> List[str]:
        """Get all files whose healthcare 'data_type' equals *data_type*."""
        return [
            file_name for file_name, meta in self.healthcare_metadata.items()
            if meta.get('data_type') == data_type
        ]

    def names(self) -> List[str]:
        """Return the names of all registered files, in insertion order."""
        return list(self.data)

    def get(self, name: str) -> Optional[pd.DataFrame]:
        """Return the DataFrame registered as *name*, or None if absent."""
        return self.data.get(name)

    def summarize_for_prompt(self) -> str:
        """Generate a summary of all data for prompt inclusion.

        Returns one text section per file (name, type, columns, shape and
        any healthcare context), separated by blank lines. The
        'privacy_warning' healthcare key is deliberately left out.
        """
        if not self.data:
            return "No data files registered."

        summary_parts = []
        for file_name in self.names():
            meta = self.metadata.get(file_name, {})
            health_meta = self.get_healthcare_metadata(file_name)

            summary_parts.extend([
                f"File: {file_name}",
                f"Type: {meta.get('type', 'unknown')}",
                f"Columns: {', '.join(meta.get('columns', []))}",
                f"Shape: {meta.get('shape', 'unknown')}",
            ])

            if health_meta:
                summary_parts.append("Healthcare Context:")
                summary_parts.extend(
                    f"  {key}: {value}"
                    for key, value in health_meta.items()
                    if key != 'privacy_warning'
                )

            summary_parts.append("")

        return "\n".join(summary_parts)

    def get_healthcare_metadata(self, name: str) -> Dict[str, Any]:
        """Get healthcare-specific metadata for a file (empty dict if none)."""
        return self.healthcare_metadata.get(name, {})

    def clear(self):
        """Remove every registered file and all associated metadata."""
        self.data.clear()
        self.metadata.clear()
        self.healthcare_metadata.clear()
        self.derived_columns.clear()