# File size: 1,645 Bytes
# 7e85729 (revision id left by the file viewer)
import pandas as pd
from typing import List, Tuple, Dict, Any
from langchain_community.vectorstores import faiss
def multi_column(db: faiss.FAISS, df: pd.DataFrame, qc_pairs: Dict[str, str], threshold: float) -> List[Tuple[int, float, Dict[str, Any]]]:
    """Perform semantic search across multiple columns and return aggregated results.

    Each ``(column, query)`` pair is searched separately, restricted to
    documents whose ``column`` metadata equals that column. Hits whose score is
    below ``threshold`` are discarded. For every remaining row, the scores are
    averaged over the columns in which that row survived the threshold (a row
    that matched a single column is averaged over that one score only).

    Args:
        db: FAISS vector database for search. Every document is read through
            its ``column`` and ``row`` metadata entries.
        df: Original DataFrame containing the data; ``row`` metadata values
            are used as labels for ``df.loc``.
        qc_pairs: Dictionary mapping columns to query fragments.
        threshold: Minimum similarity threshold to include a result
            (a hit is kept when ``score >= threshold``).

    Returns:
        List[Tuple[int, float, Dict[str, Any]]]: List of tuples
        (row_id, avg_score, row_dict), sorted by ``avg_score`` descending.
        Empty when the index holds no vectors or nothing passes the threshold.

    Raises:
        KeyError: If a hit lacks ``row`` metadata or its row label is not
            present in ``df``.
    """
    total = db.index.ntotal
    if total == 0:
        # Nothing is indexed, so there is nothing to search; avoid a k=0 query.
        return []

    # row id -> scores of that row, one entry per column where it passed.
    # A dict keeps first-seen order, so ties in the final sort are stable.
    row_scores: Dict[Any, List[float]] = {}
    for column, query in qc_pairs.items():
        hits = db.similarity_search_with_score(
            query,
            k=total,
            filter={'column': column},
            # With a metadata filter the store first fetches `fetch_k`
            # candidates (default 20) and only then filters them. Fetch the
            # whole index so no matching row is dropped before filtering.
            fetch_k=total,
            # NOTE(review): confirm the store honours a per-call
            # distance_strategy; it may only be configurable at construction.
            # The `>=` test below assumes higher scores mean better matches.
            distance_strategy=faiss.DistanceStrategy.COSINE,
        )
        # If one row yields several hits for this column, the last hit wins.
        column_scores = {
            doc.metadata['row']: score
            for doc, score in hits
            if score >= threshold
        }
        for rid, score in column_scores.items():
            row_scores.setdefault(rid, []).append(score)

    results = [
        (rid, sum(scores) / len(scores), df.loc[rid].to_dict())
        for rid, scores in row_scores.items()
    ]
    results.sort(key=lambda item: item[1], reverse=True)
    return results
|