Medica_DecisionSupportAI / data_registry.py
Rajan Sharma
Update data_registry.py
8af0435 verified
raw
history blame
5.01 kB
# data_registry.py
import pandas as pd
import numpy as np
from typing import Dict, Any, List, Optional
import os
class DataRegistry:
    """In-memory registry of tabular data files with healthcare-aware metadata.

    Every registered file is keyed by its base name and tracked in three
    parallel dictionaries:

    * ``data`` -- the loaded ``pandas.DataFrame`` (column names normalised).
    * ``metadata`` -- generic facts: type, columns, shape and sample rows.
    * ``healthcare_metadata`` -- healthcare-specific context; a file only has
      an entry here when recognised healthcare columns were found.
    """

    def __init__(self):
        """Create an empty registry."""
        self.data: Dict[str, pd.DataFrame] = {}
        self.metadata: Dict[str, Dict[str, Any]] = {}
        self.healthcare_metadata: Dict[str, Dict[str, Any]] = {}

    def add_path(self, path: str) -> bool:
        """Add a data file to the registry with healthcare-specific handling.

        Only CSV files are supported; the extension check is case-insensitive.
        The file is registered under its base name, so registering two files
        with the same base name replaces the earlier one.

        Registration is all-or-nothing: if reading the file or deriving its
        metadata fails, the registry is left exactly as it was.

        Args:
            path: Location of the file to load.

        Returns:
            True when the file was loaded and registered, False when the file
            is not a CSV or any error occurred (the error is printed).
        """
        try:
            file_name = os.path.basename(path)
            if not file_name.lower().endswith('.csv'):
                return False

            df = pd.read_csv(path)
            # Standardize column names; str() guards against non-string labels.
            df.columns = [
                str(col).strip().lower().replace(' ', '_').replace('-', '_')
                for col in df.columns
            ]

            # Healthcare extraction runs first for two reasons: it may add
            # derived columns that the generic metadata below must reflect,
            # and if it raises nothing has been registered yet.
            self._extract_healthcare_metadata(file_name, df)

            # NOTE: the sample holds raw rows, so for patient files it will
            # contain patient identifiers.
            metadata = {
                'type': 'csv',
                'columns': list(df.columns),
                'shape': df.shape,
                'sample': df.head(3).to_dict('records'),
            }
            self.data[file_name] = df
            self.metadata[file_name] = metadata
            return True
        except Exception as e:
            # Deliberately broad: loading is best-effort and callers rely on
            # the boolean result rather than on exceptions.
            print(f"Error adding {path}: {e}")
            return False

    def _extract_healthcare_metadata(self, file_name: str, df: pd.DataFrame) -> None:
        """Extract healthcare-specific metadata from the dataframe.

        Recognises facility, bed-capacity and patient columns. When several
        categories match, the later check wins for ``data_type`` (patient
        over bed capacity over facilities).

        Side effects:
            * ``df`` is modified in place: ``bed_change`` and
              ``percent_change`` columns are added when both ``beds_current``
              and ``beds_prev`` are present.
            * ``self.healthcare_metadata[file_name]`` is set when anything
              was recognised, and removed otherwise so that a re-registered
              file never inherits context from its predecessor.

        Args:
            file_name: Registry key the metadata is stored under.
            df: Dataframe with already-normalised column names.
        """
        healthcare_meta: Dict[str, Any] = {}
        columns = df.columns

        # Check for healthcare facility data
        if any(col in columns for col in ('facility_name', 'facility_type', 'odhf_facility_type')):
            healthcare_meta['data_type'] = 'healthcare_facilities'
            if 'facility_type' in columns:
                healthcare_meta['facility_types'] = df['facility_type'].value_counts().to_dict()
            if 'city' in columns:
                healthcare_meta['cities'] = df['city'].value_counts().head(10).to_dict()

        # Check for bed capacity data
        if any(col in columns for col in ('beds_current', 'beds_prev', 'bed_count')):
            healthcare_meta['data_type'] = 'bed_capacity'
            if 'zone' in columns:
                healthcare_meta['zones'] = df['zone'].unique().tolist()
            if 'teaching_status' in columns:
                healthcare_meta['teaching_status_counts'] = df['teaching_status'].value_counts().to_dict()
            # Calculate derived metrics
            if 'beds_current' in columns and 'beds_prev' in columns:
                df['bed_change'] = df['beds_current'] - df['beds_prev']
                # Mask a zero baseline so the percentage is NaN, not +/-inf.
                baseline = df['beds_prev'].where(df['beds_prev'] != 0)
                df['percent_change'] = (df['bed_change'] / baseline) * 100
                healthcare_meta['has_derived_metrics'] = True

        # Check for patient data (with privacy warning)
        if any(col in columns for col in ('patient_id', 'patient_name', 'mrn')):
            healthcare_meta['data_type'] = 'patient_data'
            healthcare_meta['privacy_warning'] = "This file contains patient identifiers. Ensure proper handling."

        if healthcare_meta:
            self.healthcare_metadata[file_name] = healthcare_meta
        else:
            # Drop context left over from an earlier file with the same name.
            self.healthcare_metadata.pop(file_name, None)

    def get_healthcare_metadata(self, name: str) -> Dict[str, Any]:
        """Get healthcare-specific metadata for a file.

        Returns an empty dict when the file is unknown or had no recognised
        healthcare columns.
        """
        return self.healthcare_metadata.get(name, {})

    def get_data_type(self, name: str) -> str:
        """Get the healthcare data type of a file, or 'unknown'."""
        meta = self.get_healthcare_metadata(name)
        return meta.get('data_type', 'unknown')

    def names(self) -> List[str]:
        """Return the names of all registered files, in insertion order."""
        return list(self.data)

    def get(self, name: str) -> Optional[pd.DataFrame]:
        """Return the dataframe registered under ``name``, or None."""
        return self.data.get(name)

    def summarize_for_prompt(self) -> str:
        """Generate a summary of all data for prompt inclusion.

        Each file contributes its name, type, columns and shape, followed by
        its healthcare context when present. Privacy warnings are omitted on
        purpose. Files are separated by a blank line.
        """
        if not self.data:
            return "No data files registered."

        summary_parts: List[str] = []
        for file_name in self.names():
            meta = self.metadata.get(file_name, {})
            health_meta = self.get_healthcare_metadata(file_name)
            summary_parts.append(f"File: {file_name}")
            summary_parts.append(f"Type: {meta.get('type', 'unknown')}")
            summary_parts.append(f"Columns: {', '.join(meta.get('columns', []))}")
            summary_parts.append(f"Shape: {meta.get('shape', 'unknown')}")
            if health_meta:
                summary_parts.append("Healthcare Context:")
                for key, value in health_meta.items():
                    if key != 'privacy_warning':  # Don't include warnings in prompt
                        summary_parts.append(f" {key}: {value}")
            summary_parts.append("")
        return "\n".join(summary_parts)

    def clear(self) -> None:
        """Remove every registered file and all associated metadata."""
        self.data.clear()
        self.metadata.clear()
        self.healthcare_metadata.clear()