Rajan Sharma committed on
Commit
4073913
·
verified ·
1 Parent(s): fa74f5a

Update data_registry.py

Browse files
Files changed (1) hide show
  1. data_registry.py +124 -48
data_registry.py CHANGED
@@ -1,88 +1,159 @@
1
  # data_registry.py
2
  import pandas as pd
3
  import numpy as np
4
- from typing import Dict, Any, List, Optional
5
  import os
 
6
 
7
  class DataRegistry:
8
  def __init__(self):
9
  self.data = {}
10
  self.metadata = {}
11
  self.healthcare_metadata = {}
 
12
 
13
  def add_path(self, path: str) -> bool:
14
- """Add a data file to the registry with healthcare-specific handling."""
15
  try:
16
  file_name = os.path.basename(path)
 
17
 
18
- if file_name.endswith('.csv'):
 
19
  df = pd.read_csv(path)
20
-
21
- # Standardize column names
22
- df.columns = [col.strip().lower().replace(' ', '_').replace('-', '_') for col in df.columns]
23
-
24
- self.data[file_name] = df
25
-
26
- # Basic metadata
27
- self.metadata[file_name] = {
28
- 'type': 'csv',
29
- 'columns': list(df.columns),
30
- 'shape': df.shape,
31
- 'sample': df.head(3).to_dict('records')
32
- }
33
-
34
- # Healthcare-specific metadata extraction
35
- self._extract_healthcare_metadata(file_name, df)
36
-
37
- return True
38
- return False
39
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  except Exception as e:
41
  print(f"Error adding {path}: {e}")
42
  return False
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  def _extract_healthcare_metadata(self, file_name: str, df: pd.DataFrame):
45
- """Extract healthcare-specific metadata from the dataframe."""
46
  healthcare_meta = {}
47
 
48
- # Check for healthcare facility data
49
- if any(col in df.columns for col in ['facility_name', 'facility_type', 'odhf_facility_type']):
50
- healthcare_meta['data_type'] = 'healthcare_facilities'
 
 
 
51
  if 'facility_type' in df.columns:
52
  healthcare_meta['facility_types'] = df['facility_type'].value_counts().to_dict()
53
  if 'city' in df.columns:
54
  healthcare_meta['cities'] = df['city'].value_counts().head(10).to_dict()
55
 
56
- # Check for bed capacity data
57
- if any(col in df.columns for col in ['beds_current', 'beds_prev', 'bed_count']):
58
- healthcare_meta['data_type'] = 'bed_capacity'
59
  if 'zone' in df.columns:
60
  healthcare_meta['zones'] = df['zone'].unique().tolist()
61
  if 'teaching_status' in df.columns:
62
  healthcare_meta['teaching_status_counts'] = df['teaching_status'].value_counts().to_dict()
63
 
64
- # Calculate derived metrics
65
- if 'beds_current' in df.columns and 'beds_prev' in df.columns:
66
- df['bed_change'] = df['beds_current'] - df['beds_prev']
67
- df['percent_change'] = (df['bed_change'] / df['beds_prev']) * 100
68
  healthcare_meta['has_derived_metrics'] = True
69
 
70
- # Check for patient data (with privacy warning)
71
- if any(col in df.columns for col in ['patient_id', 'patient_name', 'mrn']):
72
- healthcare_meta['data_type'] = 'patient_data'
73
- healthcare_meta['privacy_warning'] = "This file contains patient identifiers. Ensure proper handling."
74
-
75
  if healthcare_meta:
76
  self.healthcare_metadata[file_name] = healthcare_meta
77
 
78
- def get_healthcare_metadata(self, name: str) -> Dict[str, Any]:
79
- """Get healthcare-specific metadata for a file."""
80
- return self.healthcare_metadata.get(name, {})
81
 
82
- def get_data_type(self, name: str) -> str:
83
- """Get the healthcare data type of a file."""
84
- meta = self.get_healthcare_metadata(name)
85
- return meta.get('data_type', 'unknown')
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  def names(self):
88
  return list(self.data.keys())
@@ -108,14 +179,19 @@ class DataRegistry:
108
  if health_meta:
109
  summary_parts.append("Healthcare Context:")
110
  for key, value in health_meta.items():
111
- if key != 'privacy_warning': # Don't include warnings in prompt
112
  summary_parts.append(f" {key}: {value}")
113
 
114
  summary_parts.append("")
115
 
116
  return "\n".join(summary_parts)
117
 
 
 
 
 
118
  def clear(self):
119
  self.data.clear()
120
  self.metadata.clear()
121
- self.healthcare_metadata.clear()
 
 
1
  # data_registry.py
2
  import pandas as pd
3
  import numpy as np
4
+ from typing import Dict, Any, List, Optional, Union
5
  import os
6
+ import json
7
 
8
  class DataRegistry:
9
  def __init__(self):
10
  self.data = {}
11
  self.metadata = {}
12
  self.healthcare_metadata = {}
13
+ self.derived_columns = {} # Track derived columns per file
14
 
15
  def add_path(self, path: str) -> bool:
16
+ """Add a data file to the registry with dynamic processing."""
17
  try:
18
  file_name = os.path.basename(path)
19
+ file_ext = os.path.splitext(file_name)[1].lower()
20
 
21
+ # Read file based on extension
22
+ if file_ext == '.csv':
23
  df = pd.read_csv(path)
24
+ elif file_ext in ['.xlsx', '.xls']:
25
+ df = pd.read_excel(path)
26
+ elif file_ext == '.json':
27
+ with open(path, 'r') as f:
28
+ data = json.load(f)
29
+ df = pd.json_normalize(data)
30
+ elif file_ext in ['.parquet']:
31
+ df = pd.read_parquet(path)
32
+ else:
33
+ print(f"Unsupported file type: {file_ext}")
34
+ return False
35
+
36
+ # Standardize column names
37
+ df.columns = [col.strip().lower().replace(' ', '_').replace('-', '_').replace('.', '_') for col in df.columns]
38
+
39
+ # Store original dataframe
40
+ self.data[file_name] = df.copy()
41
+
42
+ # Initialize derived columns tracking
43
+ self.derived_columns[file_name] = set()
44
+
45
+ # Process healthcare data dynamically
46
+ self._process_healthcare_data(file_name, df)
47
+
48
+ # Basic metadata
49
+ self.metadata[file_name] = {
50
+ 'type': file_ext,
51
+ 'columns': list(df.columns),
52
+ 'shape': df.shape,
53
+ 'sample': df.head(3).to_dict('records')
54
+ }
55
+
56
+ # Healthcare-specific metadata extraction
57
+ self._extract_healthcare_metadata(file_name, df)
58
+
59
+ return True
60
  except Exception as e:
61
  print(f"Error adding {path}: {e}")
62
  return False
63
 
64
+ def _process_healthcare_data(self, file_name: str, df: pd.DataFrame):
65
+ """Dynamically process healthcare data based on available columns."""
66
+ # Dynamic column pattern matching
67
+ column_patterns = {
68
+ 'facility_name': ['facility', 'name', 'hospital', 'site', 'location'],
69
+ 'facility_type': ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'],
70
+ 'beds_current': ['current', '2023', '2024', 'beds_current', 'staffed_beds', 'capacity'],
71
+ 'beds_prev': ['prev', 'previous', '2022', 'beds_prev', 'previous_beds'],
72
+ 'zone': ['zone', 'region', 'area', 'district'],
73
+ 'province': ['province', 'state', 'territory'],
74
+ 'city': ['city', 'municipality', 'town'],
75
+ 'teaching_status': ['teaching', 'status', 'type', 'hospital_type']
76
+ }
77
+
78
+ # Map actual columns to standard names
79
+ column_map = {}
80
+ for standard_col, patterns in column_patterns.items():
81
+ for col in df.columns:
82
+ if any(pattern in col for pattern in patterns):
83
+ column_map[standard_col] = col
84
+ break
85
+
86
+ # Create derived columns if we have the necessary base columns
87
+ if 'beds_current' in column_map and 'beds_prev' in column_map:
88
+ current_col = column_map['beds_current']
89
+ prev_col = column_map['beds_prev']
90
+
91
+ # Calculate bed change
92
+ df['bed_change'] = df[current_col] - df[prev_col]
93
+ self.derived_columns[file_name].add('bed_change')
94
+
95
+ # Calculate percentage change (avoid division by zero)
96
+ df['percent_change'] = df.apply(
97
+ lambda row: (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0,
98
+ axis=1
99
+ )
100
+ self.derived_columns[file_name].add('percent_change')
101
+
102
+ # If we have facility_type but not in standard form, map it
103
+ if 'facility_type' in column_map and column_map['facility_type'] != 'facility_type':
104
+ df['facility_type'] = df[column_map['facility_type']]
105
+ self.derived_columns[file_name].add('facility_type')
106
+
107
  def _extract_healthcare_metadata(self, file_name: str, df: pd.DataFrame):
108
+ """Extract healthcare-specific metadata dynamically."""
109
  healthcare_meta = {}
110
 
111
+ # Detect data type based on columns
112
+ facility_cols = [col for col in df.columns if any(pattern in col for pattern in ['facility', 'name', 'site'])]
113
+ bed_cols = [col for col in df.columns if any(pattern in col for pattern in ['bed', 'capacity'])]
114
+
115
+ if facility_cols:
116
+ healthcare_meta['data_type'] = 'facility_data'
117
  if 'facility_type' in df.columns:
118
  healthcare_meta['facility_types'] = df['facility_type'].value_counts().to_dict()
119
  if 'city' in df.columns:
120
  healthcare_meta['cities'] = df['city'].value_counts().head(10).to_dict()
121
 
122
+ if bed_cols:
123
+ healthcare_meta['data_type'] = 'bed_data'
 
124
  if 'zone' in df.columns:
125
  healthcare_meta['zones'] = df['zone'].unique().tolist()
126
  if 'teaching_status' in df.columns:
127
  healthcare_meta['teaching_status_counts'] = df['teaching_status'].value_counts().to_dict()
128
 
129
+ # Check for derived metrics
130
+ if 'bed_change' in df.columns:
 
 
131
  healthcare_meta['has_derived_metrics'] = True
132
 
 
 
 
 
 
133
  if healthcare_meta:
134
  self.healthcare_metadata[file_name] = healthcare_meta
135
 
136
+ def get_derived_columns(self, file_name: str) -> set:
137
+ """Get derived columns for a file."""
138
+ return self.derived_columns.get(file_name, set())
139
 
140
+ def find_column(self, file_name: str, patterns: List[str]) -> Optional[str]:
141
+ """Find a column matching any of the given patterns."""
142
+ df = self.get(file_name)
143
+ if df is None:
144
+ return None
145
+
146
+ for col in df.columns:
147
+ if any(pattern.lower() in col.lower() for pattern in patterns):
148
+ return col
149
+ return None
150
+
151
+ def get_data_by_type(self, data_type: str) -> List[str]:
152
+ """Get all files of a specific data type."""
153
+ return [
154
+ file_name for file_name, meta in self.healthcare_metadata.items()
155
+ if meta.get('data_type') == data_type
156
+ ]
157
 
158
  def names(self):
159
  return list(self.data.keys())
 
179
  if health_meta:
180
  summary_parts.append("Healthcare Context:")
181
  for key, value in health_meta.items():
182
+ if key != 'privacy_warning':
183
  summary_parts.append(f" {key}: {value}")
184
 
185
  summary_parts.append("")
186
 
187
  return "\n".join(summary_parts)
188
 
189
+ def get_healthcare_metadata(self, name: str) -> Dict[str, Any]:
190
+ """Get healthcare-specific metadata for a file."""
191
+ return self.healthcare_metadata.get(name, {})
192
+
193
  def clear(self):
194
  self.data.clear()
195
  self.metadata.clear()
196
+ self.healthcare_metadata.clear()
197
+ self.derived_columns.clear()