Medica_DecisionSupportAI / data_registry.py
Rajan Sharma
Update data_registry.py
ef7ab85 verified
raw
history blame
3.5 kB
# data_registry.py
import pandas as pd
import os
from typing import Dict, List, Any, Optional, Union
import logging
# Configure the root logger at INFO level as soon as this module is imported.
# NOTE(review): this is an import-time side effect; basicConfig() does nothing
# when the root logger already has handlers configured by the application.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by DataRegistry for its load/diagnostic messages.
logger = logging.getLogger(__name__)
class DataRegistry:
    """In-memory registry of tabular datasets keyed by file name.

    Every registered file is loaded into a pandas DataFrame and stored in
    ``self.data``; a descriptive metadata dict for it is stored under the
    same key in ``self.metadata``.
    """

    def __init__(self):
        # file name -> loaded DataFrame
        self.data: Dict[str, pd.DataFrame] = {}
        # file name -> metadata dict built by add_path()
        self.metadata: Dict[str, Dict[str, Any]] = {}

    def add_path(self, file_path: str) -> bool:
        """Load a file into the registry and report whether it succeeded.

        The reader is chosen from the (lower-cased) file extension: ``.csv``,
        ``.xlsx``/``.xls`` or ``.json``. The dataset is registered under the
        file's base name, so loading another file with the same base name
        replaces the earlier entry.

        Args:
            file_path: Path of the file to load.

        Returns:
            True when the file was loaded and registered; False when the
            extension is unsupported or loading raised (the error is logged,
            never propagated).
        """
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            if file_ext == '.csv':
                df = pd.read_csv(file_path)
            elif file_ext in ['.xlsx', '.xls']:
                df = pd.read_excel(file_path)
            elif file_ext == '.json':
                df = pd.read_json(file_path)
            else:
                logger.warning("Unsupported file type: %s", file_ext)
                return False

            # Store with filename as key
            filename = os.path.basename(file_path)
            self.data[filename] = df

            # Store metadata
            self.metadata[filename] = {
                "path": file_path,
                "type": file_ext,
                "shape": df.shape,
                "columns": list(df.columns),
                "data_types": df.dtypes.to_dict(),
                "null_counts": df.isnull().sum().to_dict(),
                "sample_data": df.head(3).to_dict(),
            }
            logger.info("Successfully loaded %s with shape %s", filename, df.shape)
            return True
        except Exception as e:
            # Deliberate best-effort boundary: any reader/parsing failure is
            # logged and reported through the boolean return value.
            logger.error("Error loading %s: %s", file_path, e)
            return False

    def get(self, name: str) -> Optional[pd.DataFrame]:
        """Return the dataset registered under ``name``, or None if absent."""
        return self.data.get(name)

    def names(self) -> List[str]:
        """Return the names of all registered datasets, in insertion order."""
        return list(self.data.keys())

    def get_data_by_type(self, data_type: str) -> List[str]:
        """Return dataset names that contain ``data_type``.

        The match is a case-insensitive substring test against the dataset
        name (its file name), not against the stored file extension.
        """
        wanted = data_type.lower()
        return [name for name in self.metadata if wanted in name.lower()]

    def get_data_summary(self) -> Dict[str, Any]:
        """Return the metadata of all loaded datasets.

        The returned dict is the registry's own ``metadata`` mapping, not a
        copy, so mutating it mutates the registry.
        """
        return self.metadata

    @staticmethod
    def _series_mentions(series: pd.Series, keywords: List[str]) -> bool:
        """Return True if any text cell of ``series`` contains a keyword."""
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text:
            return False
        try:
            text = series.str
        except AttributeError:
            # Column holds no string values at all, so nothing to search.
            return False
        # regex=False: keywords are literal text, so characters such as
        # "(" or "+" can neither raise nor act as pattern syntax.
        return any(
            bool(text.contains(kw, case=False, na=False, regex=False).any())
            for kw in keywords
        )

    def find_related_datasets(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Find datasets that mention any keyword in column names or text data.

        Matching is a case-insensitive, literal substring test. Empty or
        None keywords are ignored; if no usable keyword remains the result
        is empty.

        Args:
            keywords: Words or phrases to look for.

        Returns:
            One dict per matching dataset with the keys ``"name"``,
            ``"matching_columns"`` (column labels whose name contains a
            keyword) and ``"has_matching_data"`` (True when a text cell
            contains a keyword).
        """
        needles = [str(kw).lower() for kw in keywords if kw]
        if not needles:
            # An empty pattern would otherwise match every string.
            return []

        related = []
        for name in self.names():
            df = self.get(name)
            if df is None:
                continue

            # Check column names; str() keeps non-string labels from failing.
            col_matches = [
                col for col in df.columns
                if any(kw in str(col).lower() for kw in needles)
            ]

            # Check data content; items() yields one Series per column even
            # when column labels are duplicated.
            data_matches = any(
                self._series_mentions(series, needles)
                for _, series in df.items()
            )

            if col_matches or data_matches:
                related.append({
                    "name": name,
                    "matching_columns": col_matches,
                    "has_matching_data": data_matches,
                })
        return related

    def clear(self):
        """Remove every dataset and its metadata from the registry."""
        self.data.clear()
        self.metadata.clear()