# Antientropy / qa_tool.py
# Jose-Maria Segui
# Add fuzzy matching for QA and LLM retry logic
# 98e8860
import os
import csv
import json
import numpy as np
from typing import List, Dict, Any, Optional
from langchain_core.tools import tool
from sentence_transformers import SentenceTransformer
# Try to import thefuzz for fuzzy string matching.
# thefuzz is optional: when it is missing, both names are bound to None so
# that later code can test either one without risking a NameError.
try:
    from thefuzz import fuzz, process
except ImportError:
    fuzz = None
    process = None
    print("Warning: thefuzz not installed. Fuzzy matching disabled.")
# Load the embedding model used for local (vector) search.
# A lightweight model is chosen to avoid a heavy download.
# The name stays None whenever loading fails, so callers can test it.
embedding_model = None
try:
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
except Exception as load_error:
    print(f"Warning: Could not load embedding model: {load_error}")
# Function to load CSV data
def load_qa_database(csv_path: str = "supabase_docs.csv") -> List[Dict]:
    """Load Q&A rows from a CSV file.

    Only rows that actually carry a ``content`` value are kept. Loading is
    best-effort: a missing file or a read/parse error is reported on stdout
    and whatever rows were collected up to that point are returned.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A list of row dicts keyed by the CSV header names; empty when the
        file does not exist.
    """
    qa_data: List[Dict] = []
    if not os.path.exists(csv_path):
        print(f"Warning: {csv_path} not found.")
        return qa_data
    try:
        # Increase CSV field size limit
        csv.field_size_limit(1000000)
        # newline='' hands raw line endings to the csv module, which is what
        # it requires to keep newlines embedded in quoted fields intact.
        with open(csv_path, 'r', encoding='utf-8', newline='') as f:
            for row in csv.DictReader(f):
                # DictReader fills columns missing from a short row with
                # None; such rows have no usable content, so skip them.
                if row.get('content') is not None:
                    qa_data.append(row)
    except Exception as e:
        # Deliberately broad: a bad data file must not abort the caller.
        print(f"Error loading CSV: {e}")
    return qa_data
# Global QA database: the rows returned by load_qa_database() for its default
# CSV path, populated once by this module-level statement.
QA_DATABASE: List[Dict] = load_qa_database()
def get_embedding(text: str) -> Optional[np.ndarray]:
    """Encode ``text`` with the module-level embedding model.

    Args:
        text: The string to embed.

    Returns:
        Whatever ``embedding_model.encode(text)`` returns, or None when no
        embedding model is available.
    """
    if not embedding_model:
        return None
    return embedding_model.encode(text)
def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, with the same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``, or 0.0 when either vector has
        zero norm. The ratio is undefined in that case and would otherwise
        come out as NaN together with a NumPy runtime warning.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator
@tool
def search_known_qa(query: str) -> str:
    """
    Search for a similar question in the known Q&A database and return the answer if found.
    Use this tool FIRST to check if the question has already been solved.
    Args:
        query: The question to search for.
    """
    # NOTE(review): the docstring wording above is kept unchanged because it
    # is presumably surfaced by @tool as the tool description - confirm before
    # editing it.
    #
    # Three strategies are tried in order; the first one that clears its
    # threshold wins: fuzzy title match, embedding similarity, keyword overlap.
    if not QA_DATABASE:
        return "No known Q&A database available."

    # A row may hold None for 'content' (csv.DictReader's fill value for short
    # rows), so normalise to '' before calling any str method on it.
    contents = [item.get('content') or '' for item in QA_DATABASE]
    # The first line of each entry is treated as its question.
    questions = [text.split('\n')[0] for text in contents]

    # Method 1: Fuzzy Matching (Fast & Effective for near-duplicates)
    if fuzz:
        # Match against the questions only, to avoid noise from the answers.
        best_match = process.extractOne(query, questions, scorer=fuzz.token_set_ratio)
        if best_match:
            match_text, score = best_match[0], best_match[1]
            if score > 85:  # High confidence threshold
                # Resolve the hit by exact question equality. A prefix test
                # could pick a different entry whose text merely starts with
                # the matched question.
                try:
                    match_index = questions.index(match_text)
                except ValueError:
                    match_index = None
                if match_index is not None:
                    return f"Found matching question (Confidence: {score}%):\n\n{contents[match_index]}"

    # Method 2: Vector Search (Semantic matching)
    if embedding_model:
        try:
            query_vec = get_embedding(query)
            if query_vec is None:
                return "Error generating query embedding."
            best_score = -1.0
            best_content = ""
            for item, text in zip(QA_DATABASE, contents):
                emb_str = item.get('embedding')
                if not emb_str:
                    continue
                try:
                    # Stored embeddings are JSON-encoded vectors.
                    vec = json.loads(emb_str)
                    score = cosine_similarity(query_vec, vec)
                    if score > best_score:
                        best_score = score
                        best_content = text
                except (ValueError, TypeError):
                    # Malformed or shape-mismatched embedding: skip this row.
                    continue
            # Lower threshold slightly
            if best_score > 0.5:
                return f"Found semantically similar question (Score: {best_score:.2f}):\n\n{best_content}"
        except Exception as e:
            # Best-effort: fall through to the keyword search on any failure.
            print(f"Vector search error: {e}")

    # Fallback: Simple Keyword Search
    # Remove common stop words
    stop_words = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'of', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'about'}
    query_terms = set(query.lower().split()) - stop_words
    if not query_terms:
        return "Query too short for keyword search."

    matches = []
    for text in contents:
        lowered = text.lower()
        # Score = number of distinct query terms found as substrings.
        score = sum(1 for term in query_terms if term in lowered)
        if score > 0:
            matches.append((score, text))
    # Stable sort: among equal scores the earliest database entry stays first.
    matches.sort(key=lambda x: x[0], reverse=True)
    if matches and matches[0][0] >= len(query_terms) * 0.5:  # At least 50% term match
        return f"Found potential match (Keyword match):\n\n{matches[0][1]}"
    return "No similar questions found in database."