# data_registry.py
"""In-memory registry of tabular datasets loaded from CSV/Excel/JSON files."""
import logging
import os
import re
from typing import Any, Dict, List, Optional

import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DataRegistry:
    """Load, store and query pandas DataFrames keyed by source filename."""

    def __init__(self):
        # filename -> loaded DataFrame
        self.data: Dict[str, pd.DataFrame] = {}
        # filename -> descriptive metadata (path, type, shape, columns, ...)
        self.metadata: Dict[str, Dict[str, Any]] = {}

    def add_path(self, file_path: str) -> bool:
        """Load a CSV/Excel/JSON file into the registry.

        Returns True on success, False for unsupported extensions or read
        errors (failures are logged, never raised to the caller).
        """
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            if file_ext == '.csv':
                df = pd.read_csv(file_path)
            elif file_ext in ['.xlsx', '.xls']:
                df = pd.read_excel(file_path)
            elif file_ext == '.json':
                df = pd.read_json(file_path)
            else:
                logger.warning(f"Unsupported file type: {file_ext}")
                return False

            # Store with the basename as key; a later file with the same
            # basename silently overwrites the earlier entry.
            filename = os.path.basename(file_path)
            self.data[filename] = df

            # Descriptive metadata for summaries and lookups.
            self.metadata[filename] = {
                "path": file_path,
                "type": file_ext,
                "shape": df.shape,
                "columns": list(df.columns),
                "data_types": df.dtypes.to_dict(),
                "null_counts": df.isnull().sum().to_dict(),
                "sample_data": df.head(3).to_dict()
            }

            # BUG FIX: message previously printed the literal "(unknown)"
            # instead of the loaded file's name.
            logger.info(f"Successfully loaded {filename} with shape {df.shape}")
            return True
        except Exception as e:
            logger.error(f"Error loading {file_path}: {str(e)}")
            return False

    def get(self, name: str) -> Optional[pd.DataFrame]:
        """Return the dataset stored under *name*, or None if absent."""
        return self.data.get(name)

    def names(self) -> List[str]:
        """Return the names of all loaded datasets."""
        return list(self.data.keys())

    def get_data_by_type(self, data_type: str) -> List[str]:
        """Return dataset names containing *data_type* (case-insensitive).

        NOTE(review): despite the method name, this matches against the
        dataset *name*, not the stored file-type metadata — preserved for
        backward compatibility.
        """
        needle = data_type.lower()
        return [name for name in self.metadata if needle in name.lower()]

    def get_data_summary(self) -> Dict[str, Any]:
        """Return the metadata mapping for all loaded datasets."""
        return self.metadata

    def find_related_datasets(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Find datasets whose column names or string data contain any keyword.

        Matching is case-insensitive for both column names and data.
        Returns a list of dicts with keys "name", "matching_columns" and
        "has_matching_data".
        """
        related = []
        # Lower-case once (BUG FIX: the original compared the raw keyword
        # against lowered column names, so upper-case keywords never
        # matched).  Escape the keywords so str.contains treats regex
        # metacharacters literally.
        lowered = [kw.lower() for kw in keywords]
        pattern = '|'.join(re.escape(kw) for kw in keywords)
        for name in self.names():
            df = self.get(name)
            if df is None:
                continue

            # Column-name matches; str() guards non-string column labels.
            col_matches = [col for col in df.columns
                           if any(kw in str(col).lower() for kw in lowered)]

            # Data-content matches, checked only on object-dtype columns.
            data_matches = False
            for col in df.select_dtypes(include=['object']).columns:
                try:
                    mask = df[col].str.contains(pattern, case=False, na=False)
                    if mask.any():
                        data_matches = True
                        break
                except Exception as e:
                    # Column may not support the .str accessor; skip it.
                    logger.debug(f"Error checking column {col} for keywords: {str(e)}")
                    continue

            if col_matches or data_matches:
                related.append({
                    "name": name,
                    "matching_columns": col_matches,
                    "has_matching_data": data_matches
                })
        return related

    def clear(self):
        """Remove all datasets and metadata."""
        self.data.clear()
        self.metadata.clear()