File size: 5,014 Bytes
8af0435
aff5a07
8af0435
 
 
aff5a07
 
 
8af0435
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5613174
8af0435
 
 
5613174
8af0435
 
 
 
 
 
 
5613174
8af0435
 
 
 
 
 
 
 
 
 
 
 
 
5613174
8af0435
 
 
 
5613174
8af0435
 
5613174
8af0435
 
 
5613174
8af0435
 
 
 
5613174
8af0435
 
5613174
8af0435
 
5613174
8af0435
 
 
 
5613174
8af0435
 
 
 
5613174
8af0435
 
 
 
5613174
8af0435
 
 
 
 
5613174
8af0435
5613174
8af0435
5613174
8af0435
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# data_registry.py
import pandas as pd
import numpy as np
from typing import Dict, Any, List, Optional
import os

class DataRegistry:
    """In-memory registry of CSV datasets plus healthcare-oriented metadata.

    Three dictionaries are maintained, all keyed by the file's base name
    (``os.path.basename`` of the path passed to :meth:`add_path`):

    * ``data`` -- the loaded ``pandas.DataFrame``.
    * ``metadata`` -- generic info: ``type``, ``columns``, ``shape``, ``sample``.
    * ``healthcare_metadata`` -- domain hints derived from column names; only
      present for files in which at least one known column was found.
    """

    def __init__(self):
        self.data: Dict[str, pd.DataFrame] = {}
        self.metadata: Dict[str, Dict[str, Any]] = {}
        self.healthcare_metadata: Dict[str, Dict[str, Any]] = {}

    def add_path(self, path: str) -> bool:
        """Add a data file to the registry with healthcare-specific handling.

        Only CSV files are supported; the extension check is case-insensitive.
        Column names are normalized (stripped, lower-cased, spaces and hyphens
        replaced by underscores) before anything else looks at them.

        The registry is updated only after every step has succeeded, so a
        failure never leaves a half-registered file behind.

        Args:
            path: Path of the file to load.

        Returns:
            True if the file was loaded and registered; False if the extension
            is unsupported or loading/processing failed (the error is printed).
        """
        try:
            file_name = os.path.basename(path)

            if not file_name.lower().endswith('.csv'):
                return False

            df = pd.read_csv(path)

            # Standardize column names
            df.columns = [
                col.strip().lower().replace(' ', '_').replace('-', '_')
                for col in df.columns
            ]

            # Run the healthcare pass first: it may append derived columns to
            # df, and the basic metadata below must describe the final frame.
            healthcare_meta = self._build_healthcare_metadata(df)

            # NOTE(review): 'sample' keeps raw rows, which includes patient
            # identifiers when the file has them -- confirm consumers are
            # allowed to see these values.
            basic_meta = {
                'type': 'csv',
                'columns': list(df.columns),
                'shape': df.shape,
                'sample': df.head(3).to_dict('records'),
            }
        except Exception as e:
            # Best-effort boundary: callers rely on the boolean result.
            print(f"Error adding {path}: {e}")
            return False

        # Commit phase: plain dict writes only.
        self.data[file_name] = df
        self.metadata[file_name] = basic_meta
        self._store_healthcare_metadata(file_name, healthcare_meta)
        return True

    def _extract_healthcare_metadata(self, file_name: str, df: pd.DataFrame) -> None:
        """Extract healthcare-specific metadata from the dataframe.

        Stores the result under ``file_name``; when nothing healthcare-related
        is detected, any entry previously stored under that name is removed.
        May add ``bed_change`` and ``percent_change`` columns to ``df``.
        """
        self._store_healthcare_metadata(file_name, self._build_healthcare_metadata(df))

    def _store_healthcare_metadata(self, file_name: str, healthcare_meta: Dict[str, Any]) -> None:
        """Record ``healthcare_meta`` for ``file_name``, or drop a stale entry if it is empty."""
        if healthcare_meta:
            self.healthcare_metadata[file_name] = healthcare_meta
        else:
            # Re-registering a name with non-healthcare content must not keep
            # reporting the metadata of the file it replaced.
            self.healthcare_metadata.pop(file_name, None)

    @staticmethod
    def _build_healthcare_metadata(df: pd.DataFrame) -> Dict[str, Any]:
        """Derive healthcare metadata from ``df``'s (already normalized) column names.

        The checks run in a fixed order and each one overwrites ``data_type``,
        so when several match the last one wins: patient data over bed
        capacity over healthcare facilities.

        Side effect: when both ``beds_current`` and ``beds_prev`` exist, the
        columns ``bed_change`` and ``percent_change`` are added to ``df``.

        Returns:
            The metadata dict; empty when no known column is present.
        """
        columns = df.columns
        healthcare_meta: Dict[str, Any] = {}

        # Check for healthcare facility data
        if any(col in columns for col in ('facility_name', 'facility_type', 'odhf_facility_type')):
            healthcare_meta['data_type'] = 'healthcare_facilities'
            if 'facility_type' in columns:
                healthcare_meta['facility_types'] = df['facility_type'].value_counts().to_dict()
            if 'city' in columns:
                healthcare_meta['cities'] = df['city'].value_counts().head(10).to_dict()

        # Check for bed capacity data
        if any(col in columns for col in ('beds_current', 'beds_prev', 'bed_count')):
            healthcare_meta['data_type'] = 'bed_capacity'
            if 'zone' in columns:
                healthcare_meta['zones'] = df['zone'].unique().tolist()
            if 'teaching_status' in columns:
                healthcare_meta['teaching_status_counts'] = df['teaching_status'].value_counts().to_dict()

            # Calculate derived metrics
            if 'beds_current' in columns and 'beds_prev' in columns:
                beds_prev = df['beds_prev']
                df['bed_change'] = df['beds_current'] - beds_prev
                # A percentage relative to zero beds is undefined; mask the
                # zero baseline so the result is NaN instead of +/-inf.
                df['percent_change'] = (df['bed_change'] / beds_prev.where(beds_prev != 0)) * 100
                healthcare_meta['has_derived_metrics'] = True

        # Check for patient data (with privacy warning)
        if any(col in columns for col in ('patient_id', 'patient_name', 'mrn')):
            healthcare_meta['data_type'] = 'patient_data'
            healthcare_meta['privacy_warning'] = "This file contains patient identifiers. Ensure proper handling."

        return healthcare_meta

    def get_healthcare_metadata(self, name: str) -> Dict[str, Any]:
        """Get healthcare-specific metadata for a file ({} if none is stored)."""
        return self.healthcare_metadata.get(name, {})

    def get_data_type(self, name: str) -> str:
        """Get the healthcare data type of a file, or 'unknown' if none was detected."""
        meta = self.get_healthcare_metadata(name)
        return meta.get('data_type', 'unknown')

    def names(self):
        """Return the registered file names, in insertion order."""
        return list(self.data.keys())

    def get(self, name):
        """Return the DataFrame registered under ``name``, or None if absent."""
        return self.data.get(name)

    def summarize_for_prompt(self) -> str:
        """Generate a summary of all data for prompt inclusion.

        For each registered file the summary lists its name, type, columns and
        shape, followed by its healthcare metadata (if any) and a blank
        separator line. The ``privacy_warning`` entry is deliberately omitted.

        Returns:
            The newline-joined summary, or "No data files registered." when
            the registry is empty.
        """
        if not self.data:
            return "No data files registered."

        summary_parts = []
        for file_name in self.names():
            meta = self.metadata.get(file_name, {})
            health_meta = self.get_healthcare_metadata(file_name)

            summary_parts.append(f"File: {file_name}")
            summary_parts.append(f"Type: {meta.get('type', 'unknown')}")
            summary_parts.append(f"Columns: {', '.join(meta.get('columns', []))}")
            summary_parts.append(f"Shape: {meta.get('shape', 'unknown')}")

            if health_meta:
                summary_parts.append("Healthcare Context:")
                for key, value in health_meta.items():
                    if key != 'privacy_warning':  # Don't include warnings in prompt
                        summary_parts.append(f"  {key}: {value}")

            summary_parts.append("")

        return "\n".join(summary_parts)

    def clear(self):
        """Remove every registered file and all associated metadata."""
        self.data.clear()
        self.metadata.clear()
        self.healthcare_metadata.clear()