| import pandas as pd |
| from typing import List, Tuple, Dict, Any |
| from langchain_community.vectorstores import faiss |
|
|
|
|
def multi_column(db: faiss.FAISS, df: pd.DataFrame, qc_pairs: Dict[str, str], threshold: float) -> List[Tuple[int, float, Dict[str, Any]]]:
    """Run one filtered semantic search per column and rank rows by mean score.

    For every ``(column, query)`` pair the vector store is searched with a
    metadata filter ``{'column': column}``. Hits whose score is at least
    ``threshold`` are kept, keyed by ``doc.metadata['row']``. Each row's final
    score is the average over the columns in which it passed the threshold
    (columns where it did not pass are not counted as zero).

    Args:
        db: FAISS vector store whose documents carry ``'column'`` and ``'row'``
            metadata entries.
        df: DataFrame the rows are looked up in via ``df.loc[row_id]``.
        qc_pairs: Mapping of column name to the query text for that column.
        threshold: Minimum score a hit needs to be kept (``score >= threshold``).

    Returns:
        Tuples ``(row_id, avg_score, row_dict)`` sorted by ``avg_score`` in
        descending order. Rows with equal averages keep first-seen order.
        Empty when ``qc_pairs`` or the index is empty.

    NOTE(review): ``similarity_search_with_score`` may return distances (lower
    is better) depending on how ``db`` was built; the ``>=`` comparison and the
    descending sort assume higher scores are better -- confirm against the
    index configuration.
    """
    if not qc_pairs:
        return []

    total_vectors = db.index.ntotal
    if total_vectors == 0:
        # Searching with k=0 is rejected by FAISS; nothing can match anyway.
        return []

    # row id -> scores collected from each column in which the row qualified
    scores_by_row: Dict[Any, List[float]] = {}
    for column, query in qc_pairs.items():
        hits = db.similarity_search_with_score(
            query,
            k=total_vectors,
            filter={'column': column},
            # With a filter, only `fetch_k` candidates (default 20) are fetched
            # before filtering; widen it so no row of this column is dropped.
            fetch_k=total_vectors,
            # NOTE(review): this kwarg appears to be ignored by the search call
            # (the strategy is fixed when the index is built) -- confirm.
            distance_strategy=faiss.DistanceStrategy.COSINE,
        )
        # Dict keeps one score per row for this column (last hit wins).
        column_scores = {
            doc.metadata['row']: score
            for doc, score in hits
            if score >= threshold
        }
        for row_id, score in column_scores.items():
            scores_by_row.setdefault(row_id, []).append(score)

    results = [
        (row_id, sum(scores) / len(scores), df.loc[row_id].to_dict())
        for row_id, scores in scores_by_row.items()
    ]
    results.sort(key=lambda item: item[1], reverse=True)
    return results
|
|