Spaces:
Sleeping
Sleeping
# data_registry.py
"""Module setup for the dataset registry: imports and logging.

Exposes a module-level ``logger`` used by :class:`DataRegistry`.
"""
import logging
import os
from typing import Any, Dict, List, Optional, Union

import pandas as pd

# basicConfig is a no-op if the root logger is already configured,
# so importing this module alongside an app's own logging setup is safe.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class DataRegistry:
    """In-memory registry of tabular datasets loaded from disk.

    Datasets are keyed by their file basename.  Alongside each DataFrame
    the registry keeps a metadata dict (path, extension, shape, columns,
    dtypes, null counts, sample rows) for quick inspection.
    """

    def __init__(self):
        # basename -> pd.DataFrame
        self.data: Dict[str, pd.DataFrame] = {}
        # basename -> metadata dict built in add_path()
        self.metadata: Dict[str, Dict[str, Any]] = {}

    def add_path(self, file_path: str) -> bool:
        """Load a file into the registry and return success status.

        Supports ``.csv``, ``.xlsx``/``.xls`` and ``.json``.  Returns
        ``False`` (never raises) on unsupported extensions or read
        errors; failures are logged.
        """
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            if file_ext == '.csv':
                df = pd.read_csv(file_path)
            elif file_ext in ('.xlsx', '.xls'):
                df = pd.read_excel(file_path)
            elif file_ext == '.json':
                df = pd.read_json(file_path)
            else:
                logger.warning("Unsupported file type: %s", file_ext)
                return False
            # Store with filename (basename) as key.  Two files with the
            # same basename collide; warn so the overwrite is visible.
            filename = os.path.basename(file_path)
            if filename in self.data:
                logger.warning("Replacing existing dataset %r", filename)
            self.data[filename] = df
            self.metadata[filename] = {
                "path": file_path,
                "type": file_ext,
                "shape": df.shape,
                "columns": list(df.columns),
                "data_types": df.dtypes.to_dict(),
                "null_counts": df.isnull().sum().to_dict(),
                "sample_data": df.head(3).to_dict(),
            }
            # BUG FIX: the original message logged a literal "(unknown)"
            # placeholder instead of the loaded filename.
            logger.info("Successfully loaded %s with shape %s", filename, df.shape)
            return True
        except Exception as e:
            logger.error("Error loading %s: %s", file_path, e)
            return False

    def get(self, name: str) -> Optional[pd.DataFrame]:
        """Return the dataset registered under *name*, or ``None``."""
        return self.data.get(name)

    def names(self) -> List[str]:
        """Return the names (basenames) of all registered datasets."""
        return list(self.data.keys())

    def get_data_by_type(self, data_type: str) -> List[str]:
        """Return dataset names containing *data_type* (case-insensitive).

        Note: despite the name, this matches against the dataset *name*
        (which includes the file extension), not against the dtype.
        """
        pattern = data_type.lower()
        # Original iterated .items() but never used the metadata value.
        return [name for name in self.metadata if pattern in name.lower()]

    def get_data_summary(self) -> Dict[str, Any]:
        """Return the metadata mapping for all loaded datasets.

        NOTE: returns the live internal dict — callers should treat it
        as read-only.
        """
        return self.metadata

    def find_related_datasets(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Find datasets whose column names or string cell values contain
        any of *keywords* (case-insensitive substring match).

        Returns a list of dicts with keys ``name``, ``matching_columns``
        and ``has_matching_data``.
        """
        # Lower the keywords once: the original compared raw keywords
        # against lowered column names, silently missing uppercase input.
        lowered = [kw.lower() for kw in keywords]
        related = []
        for name in self.names():
            df = self.get(name)
            if df is None:
                continue
            # Column-name matches.
            col_matches = [
                col for col in df.columns
                if any(kw in str(col).lower() for kw in lowered)
            ]
            # Cell-value matches in string (object-dtype) columns.
            # BUG FIX: the original wrapped the scalar result of
            # Series.any() in the builtin any(), raising
            # "TypeError: 'bool' object is not iterable".  Matching
            # per-keyword with regex=False also avoids treating keyword
            # characters (e.g. '+', '|') as regex metacharacters.
            data_matches = False
            for col in df.select_dtypes(include=['object']).columns:
                cells = df[col].astype(str)
                if any(
                    cells.str.contains(kw, case=False, na=False, regex=False).any()
                    for kw in lowered
                ):
                    data_matches = True
                    break
            if col_matches or data_matches:
                related.append({
                    "name": name,
                    "matching_columns": col_matches,
                    "has_matching_data": data_matches,
                })
        return related

    def clear(self):
        """Drop all datasets and their metadata."""
        self.data.clear()
        self.metadata.clear()