Spaces:

jomasego
/

Antientropy

Sleeping

Antientropy / qa_tool.py

Jose-Maria Segui

Add fuzzy matching for QA and LLM retry logic

98e8860 17 days ago

4.64 kB

	import os
	import csv
	import json
	import numpy as np
	from typing import List, Dict, Any, Optional
	from langchain_core.tools import tool
	from sentence_transformers import SentenceTransformer

	# thefuzz is an optional dependency: when it is missing, fuzzy matching is
	# disabled and callers are expected to test `fuzz` before using it.
	try:
	    from thefuzz import fuzz, process
	except ImportError:
	    # Bind BOTH names so neither is left undefined after a failed import.
	    fuzz = None
	    process = None
	    print("Warning: thefuzz not installed. Fuzzy matching disabled.")

	# Sentence-embedding model used for the local semantic search.
	# A lightweight checkpoint is chosen to keep the download small.
	try:
	    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
	except Exception as load_error:
	    # Any failure to load leaves the model unset instead of aborting import.
	    embedding_model = None
	    print(f"Warning: Could not load embedding model: {load_error}")

	def load_qa_database(csv_path: str = "supabase_docs.csv") -> List[Dict]:
	    """Load the Q&A rows from a CSV file.

	    Only rows that actually carry a ``content`` value are kept.

	    Args:
	        csv_path: Path of the CSV file to read.

	    Returns:
	        One dict per kept row, keyed by the CSV header names. The list is
	        empty when the file does not exist; if reading fails part-way, the
	        rows collected so far are returned and the error is printed.
	    """
	    qa_data = []
	    if not os.path.exists(csv_path):
	        print(f"Warning: {csv_path} not found.")
	        return qa_data

	    try:
	        # Raise the csv module's field size limit so very long cells
	        # do not abort parsing.
	        csv.field_size_limit(1000000)

	        # utf-8-sig reads plain UTF-8 unchanged but strips a leading BOM,
	        # which would otherwise be glued to the first header name.
	        with open(csv_path, 'r', encoding='utf-8-sig') as f:
	            reader = csv.DictReader(f)
	            for row in reader:
	                # DictReader pads short rows with None, so the key being
	                # present does not mean the row has content.
	                if row.get('content') is not None:
	                    qa_data.append(row)
	    except Exception as e:
	        print(f"Error loading CSV: {e}")

	    return qa_data

	# Global QA database: the rows returned by load_qa_database() for its
	# default CSV path, loaded once when this module is imported.
	QA_DATABASE: List[Dict] = load_qa_database()

	def get_embedding(text: str) -> Optional[np.ndarray]:
	    """Encode *text* with the module-level embedding model.

	    Args:
	        text: The text to embed.

	    Returns:
	        Whatever ``embedding_model.encode(text)`` returns, or None when no
	        embedding model is available.
	    """
	    # Compare against None explicitly: a plain truthiness test would depend
	    # on the model object's own __bool__/__len__.
	    if embedding_model is None:
	        return None
	    return embedding_model.encode(text)

	def cosine_similarity(a, b):
	    """Return the cosine similarity of the vectors *a* and *b*.

	    Args:
	        a: First vector (array or numeric sequence).
	        b: Second vector, same length as *a*.

	    Returns:
	        dot(a, b) / (|a| * |b|), or 0.0 when either vector has zero
	        magnitude (the ratio would otherwise be 0/0, i.e. NaN).
	    """
	    denominator = np.linalg.norm(a) * np.linalg.norm(b)
	    if denominator == 0:
	        return 0.0
	    return np.dot(a, b) / denominator

	def _fuzzy_lookup(query: str) -> Optional[str]:
	    """Return a formatted answer if a stored question fuzzily matches *query*."""
	    if not fuzz:
	        return None

	    # Match against the first line of each entry only, to avoid noise from
	    # the answer text. A missing content value is treated as empty.
	    questions = [(item.get('content') or '').split('\n')[0] for item in QA_DATABASE]
	    best_match = process.extractOne(query, questions, scorer=fuzz.token_set_ratio)
	    if not best_match:
	        return None

	    match_text, score = best_match[0], best_match[1]
	    if score <= 85:  # High confidence threshold
	        return None

	    # Look the entry up by the matched question itself; a prefix test on the
	    # content could pick a different entry that merely starts the same way.
	    item = QA_DATABASE[questions.index(match_text)]
	    return f"Found matching question (Confidence: {score}%):\n\n{item['content']}"


	def _vector_lookup(query: str) -> Optional[str]:
	    """Return a formatted answer if a stored embedding is close to *query*."""
	    if embedding_model is None:
	        return None

	    try:
	        query_vec = get_embedding(query)
	        if query_vec is None:
	            return "Error generating query embedding."

	        best_score = -1.0
	        best_content = ""

	        for item in QA_DATABASE:
	            emb_str = item.get('embedding')
	            if not emb_str:
	                continue
	            try:
	                vec = json.loads(emb_str)
	                score = cosine_similarity(query_vec, vec)
	                if score > best_score:
	                    best_score = score
	                    best_content = item.get('content') or ''
	            except (ValueError, TypeError):
	                # Malformed or mismatched embedding: skip this row only.
	                continue

	        # Lower threshold slightly
	        if best_score > 0.5:
	            return f"Found semantically similar question (Score: {best_score:.2f}):\n\n{best_content}"

	    except Exception as e:
	        print(f"Vector search error: {e}")

	    return None


	def _keyword_lookup(query: str) -> str:
	    """Return the entry sharing the most query terms, or a not-found message."""
	    # Remove common stop words
	    stop_words = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'of', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'about'}
	    # Trim surrounding punctuation so a term such as "supabase?" can still
	    # be found inside the stored text.
	    punctuation = '?!.,;:"\'()[]{}'
	    stripped_terms = {term.strip(punctuation) for term in query.lower().split()}
	    query_terms = {term for term in stripped_terms if term} - stop_words

	    if not query_terms:
	        return "Query too short for keyword search."

	    best_score = 0
	    best_content = None
	    for item in QA_DATABASE:
	        raw_content = item.get('content') or ''
	        content = raw_content.lower()
	        score = sum(1 for term in query_terms if term in content)
	        # Strictly greater keeps the earliest entry among equal scores.
	        if score > best_score:
	            best_score = score
	            best_content = raw_content

	    if best_content is not None and best_score >= len(query_terms) * 0.5:  # At least 50% term match
	        return f"Found potential match (Keyword match):\n\n{best_content}"

	    return "No similar questions found in database."


	# NOTE: the docstring below is kept verbatim; presumably the @tool decorator
	# publishes it as the tool's description, so its wording is runtime-visible.
	@tool
	def search_known_qa(query: str) -> str:
	    """
	    Search for a similar question in the known Q&A database and return the answer if found.
	    Use this tool FIRST to check if the question has already been solved.

	    Args:
	        query: The question to search for.
	    """
	    if not QA_DATABASE:
	        return "No known Q&A database available."

	    # Method 1: Fuzzy Matching (fast and effective for near-duplicates),
	    # then Method 2: Vector Search (semantic matching).
	    for lookup in (_fuzzy_lookup, _vector_lookup):
	        result = lookup(query)
	        if result is not None:
	            return result

	    # Fallback: Simple Keyword Search
	    return _keyword_lookup(query)