# File: bootstrap/src/qualivec/matching.py
# Hosting-page metadata: uploaded by akhil-vaidya ("Upload 26 files"), commit f133a92 (verified).
"""Semantic matching utilities for QualiVec."""
import numpy as np
import pandas as pd
from typing import Dict, Any, List, Tuple, Optional
from sklearn.metrics.pairwise import cosine_similarity
class SemanticMatcher:
    """Assign query embeddings to labelled reference vectors by cosine similarity.

    Each query is compared against every reference vector; the reference with
    the highest cosine similarity supplies the predicted class and node. When
    that best similarity falls below ``threshold`` the query is labelled
    ``"Other"`` with an empty node instead.
    """

    def __init__(self,
                 threshold: float = 0.7,
                 verbose: bool = True):
        """Initialize the semantic matcher.

        Args:
            threshold: Minimum cosine similarity (inclusive) for a best match
                to be accepted. Must lie in ``[0, 1]``.
            verbose: Whether to print status messages.

        Raises:
            ValueError: If ``threshold`` is outside ``[0, 1]``.
        """
        if not 0 <= threshold <= 1:
            raise ValueError("Threshold must be between 0 and 1.")

        self.threshold = threshold
        self.verbose = verbose

    @staticmethod
    def _row_count(rows: Any) -> int:
        """Return the number of rows of an array-like or plain sequence."""
        # Prefer ``shape`` so objects that define it but reject ``len()``
        # (e.g. SciPy sparse matrices) can still be counted.
        shape = getattr(rows, "shape", None)
        if shape:
            return int(shape[0])
        return len(rows)

    def match(self,
              query_embeddings: np.ndarray,
              reference_data: Dict[str, Any],
              return_similarities: bool = False) -> pd.DataFrame:
        """Match query embeddings against reference vectors.

        Args:
            query_embeddings: Embeddings of the query texts, one row per query.
            reference_data: Dictionary with the keys ``'embeddings'`` (one row
                per reference vector), ``'classes'`` and ``'nodes'`` (one entry
                per reference vector, in the same order as the embeddings).
            return_similarities: Whether to add an ``all_similarities`` column
                holding each query's full row of similarity scores.

        Returns:
            DataFrame with one row per query and the columns
            ``predicted_class``, ``matched_node`` and ``similarity_score``
            (plus ``all_similarities`` when requested). ``similarity_score``
            is always the best raw score, even for rows labelled ``"Other"``.
            An empty query set yields an empty DataFrame with these columns.

        Raises:
            ValueError: If there are no reference embeddings, or if
                ``'classes'`` / ``'nodes'`` do not have exactly one entry per
                reference embedding.
        """
        reference_embeddings = reference_data['embeddings']
        n_queries = self._row_count(query_embeddings)
        n_references = self._row_count(reference_embeddings)

        # Fail early with a clear message: a misaligned label list would
        # otherwise raise an IndexError later, or silently go unnoticed
        # whenever the winning indices happen to be in range.
        if n_references == 0:
            raise ValueError(
                "reference_data['embeddings'] is empty; nothing to match against."
            )
        for key in ('classes', 'nodes'):
            n_labels = len(reference_data[key])
            if n_labels != n_references:
                raise ValueError(
                    f"reference_data['{key}'] has {n_labels} entries but "
                    f"there are {n_references} reference embeddings."
                )

        if self.verbose:
            print(f"Matching {n_queries} queries against {n_references} reference vectors")
            print(f"Using cosine similarity threshold: {self.threshold}")

        # Nothing to score: return a correctly-shaped empty frame rather than
        # letting the similarity computation reject a zero-row input.
        if n_queries == 0:
            empty = pd.DataFrame({
                "predicted_class": np.array([], dtype=object),
                "matched_node": np.array([], dtype=object),
                "similarity_score": np.array([], dtype=float),
            })
            if return_similarities:
                empty["all_similarities"] = np.array([], dtype=object)
            return empty

        # Calculate cosine similarity: shape (n_queries, n_references).
        similarities = cosine_similarity(query_embeddings, reference_embeddings)

        # Find best matches (np.argmax keeps the first index on ties).
        best_match_indices = np.argmax(similarities, axis=1)
        best_match_scores = np.max(similarities, axis=1)

        # A best match only counts when it reaches the threshold (inclusive).
        matches_mask = best_match_scores >= self.threshold

        # Look up the label and node of each query's best reference vector.
        classes = np.array(reference_data['classes'])[best_match_indices]
        nodes = np.array(reference_data['nodes'])[best_match_indices]

        # Apply threshold (set to "Other" / empty node if below threshold).
        classes = np.where(matches_mask, classes, "Other")
        nodes = np.where(matches_mask, nodes, "")

        results = pd.DataFrame({
            "predicted_class": classes,
            "matched_node": nodes,
            "similarity_score": best_match_scores
        })

        if return_similarities:
            # One 1-D array of per-reference scores per query row.
            results["all_similarities"] = list(similarities)

        if self.verbose:
            print(f"Matching complete: {matches_mask.sum()} matches above threshold ({matches_mask.mean():.1%})")
            print(f"Class distribution:\n{results['predicted_class'].value_counts().head(10)}")

        return results

    def classify_corpus(self,
                        corpus_embeddings: np.ndarray,
                        reference_data: Dict[str, Any],
                        corpus_df: pd.DataFrame) -> pd.DataFrame:
        """Classify an entire corpus using semantic matching.

        Args:
            corpus_embeddings: Embeddings of the corpus texts, one row per
                row of ``corpus_df`` and in the same order.
            reference_data: Dictionary with reference vector information
                (see :meth:`match`).
            corpus_df: DataFrame containing the original corpus.

        Returns:
            ``corpus_df`` (with a fresh positional index) extended by the
            columns produced by :meth:`match`.

        Raises:
            ValueError: If ``corpus_df`` and ``corpus_embeddings`` do not have
                the same number of rows, or if :meth:`match` rejects
                ``reference_data``.
        """
        n_embeddings = self._row_count(corpus_embeddings)
        n_documents = len(corpus_df)
        # The column-wise concat below would pad the shorter side with NaN,
        # silently pairing documents with no (or the wrong) prediction.
        if n_documents != n_embeddings:
            raise ValueError(
                f"corpus_df has {n_documents} rows but corpus_embeddings has "
                f"{n_embeddings}; they must align one-to-one."
            )

        # Perform matching
        match_results = self.match(corpus_embeddings, reference_data)

        # Combine with original corpus; both indexes are reset so rows are
        # paired by position rather than by whatever index corpus_df carried.
        result_df = pd.concat([corpus_df.reset_index(drop=True),
                               match_results.reset_index(drop=True)], axis=1)

        if self.verbose:
            print(f"Classified {len(result_df)} documents")
            print(f"Class distribution:\n{result_df['predicted_class'].value_counts().head(10)}")

        return result_df