"""Semantic matching utilities for QualiVec."""

import numpy as np
import pandas as pd
from typing import Dict, Any, List, Tuple, Optional
from sklearn.metrics.pairwise import cosine_similarity


class SemanticMatcher:
    """Handles semantic matching for QualiVec."""
    
    def __init__(self, 
                 threshold: float = 0.7,
                 verbose: bool = True):
        """Initialize the semantic matcher.
        
        Args:
            threshold: Cosine similarity threshold for matching.
            verbose: Whether to print status messages.
        """
        if not 0 <= threshold <= 1:
            raise ValueError("Threshold must be between 0 and 1.")
            
        self.threshold = threshold
        self.verbose = verbose
    
    def match(self, 
              query_embeddings: np.ndarray, 
              reference_data: Dict[str, Any],
              return_similarities: bool = False) -> pd.DataFrame:
        """Match query embeddings against reference vectors.
        
        Args:
            query_embeddings: Embeddings of the query texts.
            reference_data: Dictionary with reference vector information.
            return_similarities: Whether to return all similarity scores.
            
        Returns:
            DataFrame with matching results.
        """
        if self.verbose:
            print(f"Matching {len(query_embeddings)} queries against {len(reference_data['embeddings'])} reference vectors")
            print(f"Using cosine similarity threshold: {self.threshold}")
        
        # Calculate cosine similarity
        similarities = cosine_similarity(query_embeddings, reference_data['embeddings'])
        
        # Find best matches
        best_match_indices = np.argmax(similarities, axis=1)
        best_match_scores = np.max(similarities, axis=1)
        
        # Apply threshold
        matches_mask = best_match_scores >= self.threshold
        
        # Create results
        classes = np.array(reference_data['classes'])[best_match_indices]
        nodes = np.array(reference_data['nodes'])[best_match_indices]
        
        # Apply threshold (set to "Other" if below threshold)
        classes = np.where(matches_mask, classes, "Other")
        nodes = np.where(matches_mask, nodes, "")
        
        # Create result DataFrame
        results = pd.DataFrame({
            "predicted_class": classes,
            "matched_node": nodes,
            "similarity_score": best_match_scores
        })
        
        if return_similarities:
            results["all_similarities"] = list(similarities)
        
        if self.verbose:
            print(f"Matching complete: {matches_mask.sum()} matches above threshold ({matches_mask.mean():.1%})")
            print(f"Class distribution:\n{results['predicted_class'].value_counts().head(10)}")
        
        return results
    
    def classify_corpus(self,
                        corpus_embeddings: np.ndarray,
                        reference_data: Dict[str, Any],
                        corpus_df: pd.DataFrame) -> pd.DataFrame:
        """Classify an entire corpus using semantic matching.
        
        Args:
            corpus_embeddings: Embeddings of the corpus texts.
            reference_data: Dictionary with reference vector information.
            corpus_df: DataFrame containing the original corpus.
            
        Returns:
            DataFrame with classification results.
        """
        # Perform matching
        match_results = self.match(corpus_embeddings, reference_data)
        
        # Combine with original corpus
        result_df = pd.concat([corpus_df.reset_index(drop=True), 
                              match_results.reset_index(drop=True)], axis=1)
        
        if self.verbose:
            print(f"Classified {len(result_df)} documents")
            print(f"Class distribution:\n{result_df['predicted_class'].value_counts().head(10)}")
        
        return result_df