# data_registry.py
"""In-memory registry of tabular datasets loaded from CSV/Excel/JSON files."""
import logging
import os
import re
from typing import Any, Dict, List, Optional

import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DataRegistry:
    """Load, store and query pandas DataFrames keyed by source filename."""

    def __init__(self):
        # filename -> loaded DataFrame
        self.data: Dict[str, pd.DataFrame] = {}
        # filename -> descriptive metadata (path, type, shape, columns, ...)
        self.metadata: Dict[str, Dict[str, Any]] = {}

    def add_path(self, file_path: str) -> bool:
        """Load a CSV/Excel/JSON file into the registry.

        Returns True on success, False for unsupported extensions or read
        errors (failures are logged, never raised to the caller).
        """
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            if file_ext == '.csv':
                df = pd.read_csv(file_path)
            elif file_ext in ['.xlsx', '.xls']:
                df = pd.read_excel(file_path)
            elif file_ext == '.json':
                df = pd.read_json(file_path)
            else:
                logger.warning(f"Unsupported file type: {file_ext}")
                return False

            # Store with the basename as key; a later file with the same
            # basename silently overwrites the earlier entry.
            filename = os.path.basename(file_path)
            self.data[filename] = df

            # Descriptive metadata for summaries and lookups.
            self.metadata[filename] = {
                "path": file_path,
                "type": file_ext,
                "shape": df.shape,
                "columns": list(df.columns),
                "data_types": df.dtypes.to_dict(),
                "null_counts": df.isnull().sum().to_dict(),
                "sample_data": df.head(3).to_dict()
            }

            # BUG FIX: message previously printed the literal "(unknown)"
            # instead of the loaded file's name.
            logger.info(f"Successfully loaded {filename} with shape {df.shape}")
            return True
        except Exception as e:
            logger.error(f"Error loading {file_path}: {str(e)}")
            return False

    def get(self, name: str) -> Optional[pd.DataFrame]:
        """Return the dataset stored under *name*, or None if absent."""
        return self.data.get(name)

    def names(self) -> List[str]:
        """Return the names of all loaded datasets."""
        return list(self.data.keys())

    def get_data_by_type(self, data_type: str) -> List[str]:
        """Return dataset names containing *data_type* (case-insensitive).

        NOTE(review): despite the method name, this matches against the
        dataset *name*, not the stored file-type metadata — preserved for
        backward compatibility.
        """
        needle = data_type.lower()
        return [name for name in self.metadata if needle in name.lower()]

    def get_data_summary(self) -> Dict[str, Any]:
        """Return the metadata mapping for all loaded datasets."""
        return self.metadata

    def find_related_datasets(self, keywords: List[str]) -> List[Dict[str, Any]]:
        """Find datasets whose column names or string data contain any keyword.

        Matching is case-insensitive for both column names and data.
        Returns a list of dicts with keys "name", "matching_columns" and
        "has_matching_data".
        """
        related = []
        # Lower-case once (BUG FIX: the original compared the raw keyword
        # against lowered column names, so upper-case keywords never
        # matched).  Escape the keywords so str.contains treats regex
        # metacharacters literally.
        lowered = [kw.lower() for kw in keywords]
        pattern = '|'.join(re.escape(kw) for kw in keywords)
        for name in self.names():
            df = self.get(name)
            if df is None:
                continue

            # Column-name matches; str() guards non-string column labels.
            col_matches = [col for col in df.columns
                           if any(kw in str(col).lower() for kw in lowered)]

            # Data-content matches, checked only on object-dtype columns.
            data_matches = False
            for col in df.select_dtypes(include=['object']).columns:
                try:
                    mask = df[col].str.contains(pattern, case=False, na=False)
                    if mask.any():
                        data_matches = True
                        break
                except Exception as e:
                    # Column may not support the .str accessor; skip it.
                    logger.debug(f"Error checking column {col} for keywords: {str(e)}")
                    continue

            if col_matches or data_matches:
                related.append({
                    "name": name,
                    "matching_columns": col_matches,
                    "has_matching_data": data_matches
                })
        return related

    def clear(self):
        """Remove all datasets and metadata."""
        self.data.clear()
        self.metadata.clear()