Spaces:
Sleeping
Sleeping
File size: 5,014 Bytes
8af0435 aff5a07 8af0435 aff5a07 8af0435 5613174 8af0435 5613174 8af0435 5613174 8af0435 5613174 8af0435 5613174 8af0435 5613174 8af0435 5613174 8af0435 5613174 8af0435 5613174 8af0435 5613174 8af0435 5613174 8af0435 5613174 8af0435 5613174 8af0435 5613174 8af0435 5613174 8af0435 5613174 8af0435 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
# data_registry.py
import pandas as pd
import numpy as np
from typing import Dict, Any, List, Optional
import os
class DataRegistry:
    """In-memory registry of CSV files with healthcare-oriented metadata.

    Each registered file is keyed by its base file name and tracked in three
    parallel dictionaries: the loaded DataFrame, basic structural metadata,
    and (when recognised columns are present) healthcare-specific metadata.
    """

    def __init__(self):
        # file name -> DataFrame with normalised column names
        self.data = {}
        # file name -> {'type', 'columns', 'shape', 'sample'}
        self.metadata = {}
        # file name -> healthcare metadata; only present when detected
        self.healthcare_metadata = {}

    @staticmethod
    def _normalize_column(col: Any) -> str:
        """Return *col* trimmed, lower-cased, with spaces/hyphens as underscores."""
        return str(col).strip().lower().replace(' ', '_').replace('-', '_')

    def add_path(self, path: str) -> bool:
        """Add a data file to the registry with healthcare-specific handling.

        Only CSV files are supported; the extension check is case-insensitive.
        Registration is all-or-nothing: the registry is only modified once the
        file has been read and all metadata has been built successfully.

        Args:
            path: Location of the file to load. The base name becomes the
                registry key.

        Returns:
            True when the file was registered, False when it is not a CSV or
            when reading/processing failed (the error is printed).
        """
        try:
            file_name = os.path.basename(path)
            if not file_name.lower().endswith('.csv'):
                return False
            df = pd.read_csv(path)
            # Standardize column names
            df.columns = [self._normalize_column(col) for col in df.columns]
            # Healthcare extraction may append derived columns, so it has to
            # run before the basic metadata snapshot is taken; otherwise
            # 'columns'/'shape'/'sample' would not match the stored frame.
            healthcare_meta = self._build_healthcare_metadata(df)
            basic_meta = {
                'type': 'csv',
                'columns': list(df.columns),
                'shape': df.shape,
                'sample': df.head(3).to_dict('records')
            }
        except Exception as e:
            # Boundary method: report the failure and signal it via the
            # return value instead of propagating.
            print(f"Error adding {path}: {e}")
            return False

        # Commit only after everything succeeded so a failure never leaves a
        # half-registered file behind.
        self.data[file_name] = df
        self.metadata[file_name] = basic_meta
        self._store_healthcare_metadata(file_name, healthcare_meta)
        return True

    def _build_healthcare_metadata(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Inspect *df* and return its healthcare metadata (possibly empty).

        Side effect: when both 'beds_current' and 'beds_prev' exist, the
        derived columns 'bed_change' and 'percent_change' are added to *df*.

        When several categories match, the later check wins for 'data_type':
        patient data overrides bed capacity, which overrides facilities.
        """
        columns = set(df.columns)
        healthcare_meta: Dict[str, Any] = {}

        # Check for healthcare facility data
        if columns & {'facility_name', 'facility_type', 'odhf_facility_type'}:
            healthcare_meta['data_type'] = 'healthcare_facilities'
            if 'facility_type' in columns:
                healthcare_meta['facility_types'] = df['facility_type'].value_counts().to_dict()
            if 'city' in columns:
                healthcare_meta['cities'] = df['city'].value_counts().head(10).to_dict()

        # Check for bed capacity data
        if columns & {'beds_current', 'beds_prev', 'bed_count'}:
            healthcare_meta['data_type'] = 'bed_capacity'
            if 'zone' in columns:
                healthcare_meta['zones'] = df['zone'].unique().tolist()
            if 'teaching_status' in columns:
                healthcare_meta['teaching_status_counts'] = df['teaching_status'].value_counts().to_dict()
            # Calculate derived metrics
            if 'beds_current' in columns and 'beds_prev' in columns:
                # Coerce so stray non-numeric cells become NaN rather than
                # aborting the whole registration.
                current = pd.to_numeric(df['beds_current'], errors='coerce')
                previous = pd.to_numeric(df['beds_prev'], errors='coerce')
                df['bed_change'] = current - previous
                # A zero baseline has no meaningful percentage change; use
                # NaN instead of letting the division produce +/-inf.
                df['percent_change'] = (df['bed_change'] / previous.replace(0, np.nan)) * 100
                healthcare_meta['has_derived_metrics'] = True

        # Check for patient data (with privacy warning)
        if columns & {'patient_id', 'patient_name', 'mrn'}:
            healthcare_meta['data_type'] = 'patient_data'
            healthcare_meta['privacy_warning'] = "This file contains patient identifiers. Ensure proper handling."

        return healthcare_meta

    def _store_healthcare_metadata(self, file_name: str, healthcare_meta: Dict[str, Any]) -> None:
        """Record *healthcare_meta* for *file_name*, dropping any stale entry."""
        if healthcare_meta:
            self.healthcare_metadata[file_name] = healthcare_meta
        else:
            # Re-registering a name with non-healthcare content must not keep
            # the metadata of the previous file.
            self.healthcare_metadata.pop(file_name, None)

    def _extract_healthcare_metadata(self, file_name: str, df: pd.DataFrame):
        """Extract healthcare-specific metadata from the dataframe.

        Builds the metadata for *df* (adding derived bed columns to it when
        applicable) and stores it under *file_name*.
        """
        self._store_healthcare_metadata(file_name, self._build_healthcare_metadata(df))

    def get_healthcare_metadata(self, name: str) -> Dict[str, Any]:
        """Get healthcare-specific metadata for a file ({} when none exists)."""
        return self.healthcare_metadata.get(name, {})

    def get_data_type(self, name: str) -> str:
        """Get the healthcare data type of a file, or 'unknown'."""
        meta = self.get_healthcare_metadata(name)
        return meta.get('data_type', 'unknown')

    def names(self) -> List[str]:
        """Return the registered file names in insertion order."""
        return list(self.data)

    def get(self, name: str) -> Optional[pd.DataFrame]:
        """Return the DataFrame registered under *name*, or None."""
        return self.data.get(name)

    def summarize_for_prompt(self) -> str:
        """Generate a summary of all data for prompt inclusion.

        Each file contributes its name, type, columns and shape, followed by
        its healthcare context (minus the privacy warning) and a blank line.
        """
        if not self.data:
            return "No data files registered."
        summary_parts = []
        for file_name in self.names():
            meta = self.metadata.get(file_name, {})
            health_meta = self.get_healthcare_metadata(file_name)
            summary_parts.append(f"File: {file_name}")
            summary_parts.append(f"Type: {meta.get('type', 'unknown')}")
            summary_parts.append(f"Columns: {', '.join(meta.get('columns', []))}")
            summary_parts.append(f"Shape: {meta.get('shape', 'unknown')}")
            if health_meta:
                summary_parts.append("Healthcare Context:")
                for key, value in health_meta.items():
                    if key != 'privacy_warning':  # Don't include warnings in prompt
                        summary_parts.append(f" {key}: {value}")
            summary_parts.append("")
        return "\n".join(summary_parts)

    def clear(self):
        """Remove every registered file and all associated metadata."""
        self.data.clear()
        self.metadata.clear()
        self.healthcare_metadata.clear()