Spaces:
Sleeping
Sleeping
| """Semantic matching utilities for QualiVec.""" | |
| import numpy as np | |
| import pandas as pd | |
| from typing import Dict, Any, List, Tuple, Optional | |
| from sklearn.metrics.pairwise import cosine_similarity | |
class SemanticMatcher:
    """Classify embeddings by their most similar reference vector.

    Every query embedding is compared against every reference embedding using
    cosine similarity. The best-scoring reference supplies the predicted class
    and node; a query whose best score is below ``threshold`` is labelled
    ``"Other"`` with an empty node.
    """

    def __init__(self,
                 threshold: float = 0.7,
                 verbose: bool = True):
        """Initialize the semantic matcher.

        Args:
            threshold: Minimum cosine similarity for a best match to count.
            verbose: Whether to print status messages.

        Raises:
            ValueError: If ``threshold`` is outside ``[0, 1]`` (NaN included,
                since the chained comparison is false for NaN).
        """
        if not 0 <= threshold <= 1:
            raise ValueError("Threshold must be between 0 and 1.")

        self.threshold = threshold
        self.verbose = verbose

    @staticmethod
    def _row_count(rows: Any) -> int:
        """Return the number of rows of an array-like or a plain sequence."""
        shape = getattr(rows, "shape", None)
        if shape is not None and len(shape) > 0:
            return int(shape[0])
        return len(rows)

    def match(self,
              query_embeddings: np.ndarray,
              reference_data: Dict[str, Any],
              return_similarities: bool = False) -> pd.DataFrame:
        """Match query embeddings against reference vectors.

        Args:
            query_embeddings: Embeddings of the query texts, one row per query.
            reference_data: Dictionary with the keys ``'embeddings'``,
                ``'classes'`` and ``'nodes'``; the three entries must have the
                same length, entry ``i`` of each describing reference ``i``.
            return_similarities: Whether to add an ``all_similarities`` column
                holding each query's full row of similarity scores.

        Returns:
            DataFrame with one row per query and the columns
            ``predicted_class``, ``matched_node`` and ``similarity_score``
            (plus ``all_similarities`` when requested). An empty query batch
            yields an empty DataFrame with the same columns.

        Raises:
            ValueError: If there are no reference vectors, or if ``'classes'``
                or ``'nodes'`` do not have one entry per reference vector.
        """
        reference_embeddings = reference_data['embeddings']
        n_queries = self._row_count(query_embeddings)
        n_references = self._row_count(reference_embeddings)

        if n_references == 0:
            raise ValueError(
                "reference_data['embeddings'] must contain at least one vector."
            )

        reference_classes = np.array(reference_data['classes'])
        reference_nodes = np.array(reference_data['nodes'])
        # Labels are looked up by reference index, so a length mismatch would
        # either fail late with an IndexError or silently attach wrong labels.
        if (len(reference_classes) != n_references
                or len(reference_nodes) != n_references):
            raise ValueError(
                f"reference_data is inconsistent: {n_references} embeddings, "
                f"{len(reference_classes)} classes, {len(reference_nodes)} nodes."
            )

        if self.verbose:
            print(f"Matching {n_queries} queries against {n_references} reference vectors")
            print(f"Using cosine similarity threshold: {self.threshold}")

        if n_queries == 0:
            # The similarity routine rejects empty input; an empty score matrix
            # lets the remaining steps produce a correctly shaped empty result.
            similarities = np.empty((0, n_references), dtype=float)
        else:
            similarities = cosine_similarity(query_embeddings, reference_embeddings)

        # Best reference per query, and whether it clears the threshold.
        best_match_indices = np.argmax(similarities, axis=1)
        best_match_scores = np.max(similarities, axis=1)
        matches_mask = best_match_scores >= self.threshold

        # Below-threshold queries fall back to "Other" with no node.
        classes = np.where(matches_mask, reference_classes[best_match_indices], "Other")
        nodes = np.where(matches_mask, reference_nodes[best_match_indices], "")

        results = pd.DataFrame({
            "predicted_class": classes,
            "matched_node": nodes,
            "similarity_score": best_match_scores,
        })

        if return_similarities:
            results["all_similarities"] = list(similarities)

        if self.verbose:
            # Avoid the mean of an empty mask (NaN plus a RuntimeWarning).
            match_rate = matches_mask.mean() if n_queries else 0.0
            print(f"Matching complete: {matches_mask.sum()} matches above threshold ({match_rate:.1%})")
            print(f"Class distribution:\n{results['predicted_class'].value_counts().head(10)}")

        return results

    def classify_corpus(self,
                        corpus_embeddings: np.ndarray,
                        reference_data: Dict[str, Any],
                        corpus_df: pd.DataFrame) -> pd.DataFrame:
        """Classify an entire corpus using semantic matching.

        Args:
            corpus_embeddings: Embeddings of the corpus texts; row ``i`` must
                correspond to row ``i`` of ``corpus_df``.
            reference_data: Dictionary with reference vector information, as
                accepted by :meth:`match`.
            corpus_df: DataFrame containing the original corpus.

        Returns:
            ``corpus_df`` (with its index reset to ``0..n-1``) with the match
            result columns appended on the right.

        Raises:
            ValueError: If the number of embeddings differs from the number of
                rows in ``corpus_df``, or if :meth:`match` rejects the input.
        """
        n_embeddings = self._row_count(corpus_embeddings)
        n_documents = len(corpus_df)
        # A column-wise concat of frames of different lengths pads with NaN
        # instead of failing, which would hide the misalignment.
        if n_embeddings != n_documents:
            raise ValueError(
                f"corpus_embeddings has {n_embeddings} rows but corpus_df has "
                f"{n_documents} rows; they must align one-to-one."
            )

        match_results = self.match(corpus_embeddings, reference_data)

        # Both sides get a fresh positional index so rows pair up by position.
        result_df = pd.concat([corpus_df.reset_index(drop=True),
                               match_results.reset_index(drop=True)], axis=1)

        if self.verbose:
            print(f"Classified {len(result_df)} documents")
            print(f"Class distribution:\n{result_df['predicted_class'].value_counts().head(10)}")

        return result_df