Spaces:

VEDAGI1
/

Medica_DecisionSupportAI

Sleeping

App Files Files Community

Rajan Sharma committed on Sep 23

Commit

ef7ab85

verified ·

1 Parent(s): b1ec06f

Update data_registry.py

Browse files

Files changed (1) hide show

data_registry.py +67 -163

data_registry.py CHANGED Viewed

@@ -1,197 +1,101 @@
 # data_registry.py
 import pandas as pd
-import numpy as np
-from typing import Dict, Any, List, Optional, Union
 import os
-import json
 class DataRegistry:
     def __init__(self):
         self.data = {}
         self.metadata = {}
-        self.healthcare_metadata = {}
-        self.derived_columns = {}  # Track derived columns per file
-    def add_path(self, path: str) -> bool:
-        """Add a data file to the registry with dynamic processing."""
         try:
-            file_name = os.path.basename(path)
-            file_ext = os.path.splitext(file_name)[1].lower()
-            # Read file based on extension
             if file_ext == '.csv':
-                df = pd.read_csv(path)
             elif file_ext in ['.xlsx', '.xls']:
-                df = pd.read_excel(path)
             elif file_ext == '.json':
-                with open(path, 'r') as f:
-                    data = json.load(f)
-                df = pd.json_normalize(data)
-            elif file_ext in ['.parquet']:
-                df = pd.read_parquet(path)
             else:
-                print(f"Unsupported file type: {file_ext}")
                 return False
-            # Standardize column names
-            df.columns = [col.strip().lower().replace(' ', '_').replace('-', '_').replace('.', '_') for col in df.columns]
-            # Store original dataframe
-            self.data[file_name] = df.copy()
-            # Initialize derived columns tracking
-            self.derived_columns[file_name] = set()
-            # Process healthcare data dynamically
-            self._process_healthcare_data(file_name, df)
-            # Basic metadata
-            self.metadata[file_name] = {
-                'type': file_ext,
-                'columns': list(df.columns),
-                'shape': df.shape,
-                'sample': df.head(3).to_dict('records')
             }
-            # Healthcare-specific metadata extraction
-            self._extract_healthcare_metadata(file_name, df)
             return True
         except Exception as e:
-            print(f"Error adding {path}: {e}")
             return False
-    def _process_healthcare_data(self, file_name: str, df: pd.DataFrame):
-        """Dynamically process healthcare data based on available columns."""
-        # Dynamic column pattern matching
-        column_patterns = {
-            'facility_name': ['facility', 'name', 'hospital', 'site', 'location'],
-            'facility_type': ['type', 'category', 'class', 'facility_type', 'odhf_facility_type'],
-            'beds_current': ['current', '2023', '2024', 'beds_current', 'staffed_beds', 'capacity'],
-            'beds_prev': ['prev', 'previous', '2022', 'beds_prev', 'previous_beds'],
-            'zone': ['zone', 'region', 'area', 'district'],
-            'province': ['province', 'state', 'territory'],
-            'city': ['city', 'municipality', 'town'],
-            'teaching_status': ['teaching', 'status', 'type', 'hospital_type']
-        }
-        # Map actual columns to standard names
-        column_map = {}
-        for standard_col, patterns in column_patterns.items():
-            for col in df.columns:
-                if any(pattern in col for pattern in patterns):
-                    column_map[standard_col] = col
-                    break
-        # Create derived columns if we have the necessary base columns
-        if 'beds_current' in column_map and 'beds_prev' in column_map:
-            current_col = column_map['beds_current']
-            prev_col = column_map['beds_prev']
-            # Calculate bed change
-            df['bed_change'] = df[current_col] - df[prev_col]
-            self.derived_columns[file_name].add('bed_change')
-            # Calculate percentage change (avoid division by zero)
-            df['percent_change'] = df.apply(
-                lambda row: (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0,
-                axis=1
-            )
-            self.derived_columns[file_name].add('percent_change')
-        # If we have facility_type but not in standard form, map it
-        if 'facility_type' in column_map and column_map['facility_type'] != 'facility_type':
-            df['facility_type'] = df[column_map['facility_type']]
-            self.derived_columns[file_name].add('facility_type')
-    def _extract_healthcare_metadata(self, file_name: str, df: pd.DataFrame):
-        """Extract healthcare-specific metadata dynamically."""
-        healthcare_meta = {}
-        # Detect data type based on columns
-        facility_cols = [col for col in df.columns if any(pattern in col for pattern in ['facility', 'name', 'site'])]
-        bed_cols = [col for col in df.columns if any(pattern in col for pattern in ['bed', 'capacity'])]
-        if facility_cols:
-            healthcare_meta['data_type'] = 'facility_data'
-            if 'facility_type' in df.columns:
-                healthcare_meta['facility_types'] = df['facility_type'].value_counts().to_dict()
-            if 'city' in df.columns:
-                healthcare_meta['cities'] = df['city'].value_counts().head(10).to_dict()
-        if bed_cols:
-            healthcare_meta['data_type'] = 'bed_data'
-            if 'zone' in df.columns:
-                healthcare_meta['zones'] = df['zone'].unique().tolist()
-            if 'teaching_status' in df.columns:
-                healthcare_meta['teaching_status_counts'] = df['teaching_status'].value_counts().to_dict()
-            # Check for derived metrics
-            if 'bed_change' in df.columns:
-                healthcare_meta['has_derived_metrics'] = True
-        if healthcare_meta:
-            self.healthcare_metadata[file_name] = healthcare_meta
-    def get_derived_columns(self, file_name: str) -> set:
-        """Get derived columns for a file."""
-        return self.derived_columns.get(file_name, set())
-    def find_column(self, file_name: str, patterns: List[str]) -> Optional[str]:
-        """Find a column matching any of the given patterns."""
-        df = self.get(file_name)
-        if df is None:
-            return None
-        for col in df.columns:
-            if any(pattern.lower() in col.lower() for pattern in patterns):
-                return col
-        return None
     def get_data_by_type(self, data_type: str) -> List[str]:
-        """Get all files of a specific data type."""
-        return [
-            file_name for file_name, meta in self.healthcare_metadata.items()
-            if meta.get('data_type') == data_type
-        ]
-    def names(self):
-        return list(self.data.keys())
-    def get(self, name):
-        return self.data.get(name)
-    def summarize_for_prompt(self) -> str:
-        """Generate a summary of all data for prompt inclusion."""
-        if not self.data:
-            return "No data files registered."
-        summary_parts = []
-        for file_name in self.names():
-            meta = self.metadata.get(file_name, {})
-            health_meta = self.get_healthcare_metadata(file_name)
-            summary_parts.append(f"File: {file_name}")
-            summary_parts.append(f"Type: {meta.get('type', 'unknown')}")
-            summary_parts.append(f"Columns: {', '.join(meta.get('columns', []))}")
-            summary_parts.append(f"Shape: {meta.get('shape', 'unknown')}")
-            if health_meta:
-                summary_parts.append("Healthcare Context:")
-                for key, value in health_meta.items():
-                    if key != 'privacy_warning':
-                        summary_parts.append(f"  {key}: {value}")
-            summary_parts.append("")
-        return "\n".join(summary_parts)
-    def get_healthcare_metadata(self, name: str) -> Dict[str, Any]:
-        """Get healthcare-specific metadata for a file."""
-        return self.healthcare_metadata.get(name, {})
     def clear(self):
         self.data.clear()
-        self.metadata.clear()
-        self.healthcare_metadata.clear()
-        self.derived_columns.clear()

 # data_registry.py
 import pandas as pd
 import os
+from typing import Dict, List, Any, Optional, Union
+import logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 class DataRegistry:
+    """In-memory registry of tabular datasets keyed by file basename.
+
+    ``data`` maps each registered basename to its loaded DataFrame and
+    ``metadata`` maps the same key to a dict describing that frame.
+    """
     def __init__(self):
         self.data = {}
         self.metadata = {}
+    def add_path(self, file_path: str) -> bool:
+        """Load a file into the registry and report whether it succeeded.
+
+        The reader is chosen from the lower-cased file extension: ``.csv``,
+        ``.xlsx``/``.xls`` and ``.json`` are handled. The frame is stored
+        under the file's basename, so a later file with the same basename
+        replaces the earlier entry.
+
+        Args:
+            file_path: Path of the file to load.
+
+        Returns:
+            True when the file was loaded and recorded. False when the
+            extension is not handled or when loading raised; the error is
+            logged rather than propagated.
+        """
         try:
+            file_ext = os.path.splitext(file_path)[1].lower()
             if file_ext == '.csv':
+                df = pd.read_csv(file_path)
             elif file_ext in ['.xlsx', '.xls']:
+                df = pd.read_excel(file_path)
             elif file_ext == '.json':
+                df = pd.read_json(file_path)
             else:
+                logger.warning(f"Unsupported file type: {file_ext}")
                 return False
+            # Store with filename as key
+            filename = os.path.basename(file_path)
+            self.data[filename] = df
+            # Store metadata; sample_data holds the first three rows in
+            # column-oriented form ({column: {index: value}})
+            self.metadata[filename] = {
+                "path": file_path,
+                "type": file_ext,
+                "shape": df.shape,
+                "columns": list(df.columns),
+                "data_types": df.dtypes.to_dict(),
+                "null_counts": df.isnull().sum().to_dict(),
+                "sample_data": df.head(3).to_dict()
             }
+            logger.info(f"Successfully loaded {filename} with shape {df.shape}")
             return True
         except Exception as e:
+            # Best-effort loader: any failure is logged and reported as False
+            logger.error(f"Error loading {file_path}: {str(e)}")
             return False
+    def get(self, name: str) -> Optional[pd.DataFrame]:
+        """Return the DataFrame registered under ``name``, or None if absent."""
+        return self.data.get(name)
+    def names(self) -> List[str]:
+        """Return the registered dataset names in insertion order."""
+        return list(self.data.keys())
     def get_data_by_type(self, data_type: str) -> List[str]:
+        """Return registered names that contain ``data_type``, ignoring case.
+
+        The match is a substring test against the dataset name only; the
+        stored metadata values are not inspected.
+        """
+        needle = data_type.lower()
+        return [name for name in self.metadata if needle in name.lower()]
+    def get_data_summary(self) -> Dict[str, Any]:
+        """Return the metadata of all loaded datasets.
+
+        This is the registry's own ``metadata`` dict, not a copy, so
+        changes made by the caller are visible to the registry.
+        """
+        return self.metadata
+    def find_related_datasets(self, keywords: List[str]) -> List[Dict[str, Any]]:
+        """Find datasets whose column names or text cells mention a keyword.
+
+        Matching is a case-insensitive, literal substring test. Column
+        labels are compared via ``str(label)``; cell contents are searched
+        only in object-dtype columns.
+
+        Args:
+            keywords: Terms to look for. Empty entries are ignored.
+
+        Returns:
+            One dict per matching dataset with the keys ``name``,
+            ``matching_columns`` (labels that matched) and
+            ``has_matching_data`` (True when a cell matched). An empty
+            list when no usable keyword is given.
+        """
+        import re  # local import: the module header does not import re
+        terms = [str(kw).lower() for kw in keywords if kw]
+        if not terms:
+            # An empty pattern would match every cell, so report nothing
+            return []
+        # Escape so keywords such as "c++" match literally, not as regex
+        pattern = '|'.join(re.escape(term) for term in terms)
+        related = []
+        for name in self.names():
+            df = self.get(name)
+            if df is None:
+                continue
+            # Check column names (labels may be non-string, e.g. integers)
+            col_matches = [col for col in df.columns if any(term in str(col).lower() for term in terms)]
+            # Check data content
+            data_matches = False
+            for col in df.select_dtypes(include=['object']).columns:
+                try:
+                    hits = df[col].str.contains(pattern, case=False, na=False)
+                except AttributeError:
+                    # Object column without string values: nothing to search
+                    continue
+                # Series.any() already yields a single bool; wrapping it in
+                # the builtin any() raised TypeError
+                if hits.any():
+                    data_matches = True
+                    break
+            if col_matches or data_matches:
+                related.append({
+                    "name": name,
+                    "matching_columns": col_matches,
+                    "has_matching_data": data_matches
+                })
+        return related
     def clear(self):
+        """Remove every dataset and its metadata from the registry."""
         self.data.clear()
+        self.metadata.clear()