import os
import csv
import json
import numpy as np
from typing import List, Dict, Any, Optional
from langchain_core.tools import tool
from sentence_transformers import SentenceTransformer

# Try to import thefuzz for fuzzy string matching.
try:
    from thefuzz import fuzz, process
except ImportError:
    # BUG FIX: `process` was previously left undefined on ImportError;
    # define both names so the module state is consistent.
    fuzz = None
    process = None
    print("Warning: thefuzz not installed. Fuzzy matching disabled.")

# Load embedding model for local search.
# We use a lightweight model to avoid a heavy download.
try:
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
except Exception as e:
    print(f"Warning: Could not load embedding model: {e}")
    embedding_model = None


def load_qa_database(csv_path: str = "supabase_docs.csv") -> List[Dict]:
    """Load the Q&A CSV into a list of row dicts.

    Only rows that contain a 'content' column are kept. Returns an empty
    list (after printing a warning) if the file is missing or unreadable.

    Args:
        csv_path: Path to the CSV file of Q&A entries.
    """
    qa_data: List[Dict] = []
    if not os.path.exists(csv_path):
        print(f"Warning: {csv_path} not found.")
        return qa_data
    try:
        # Raise the CSV field size limit: doc contents can exceed the default.
        csv.field_size_limit(1000000)
        with open(csv_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                if 'content' in row:
                    qa_data.append(row)
    except Exception as e:
        print(f"Error loading CSV: {e}")
    return qa_data


# Global QA database, loaded once at import time.
QA_DATABASE = load_qa_database()


def get_embedding(text: str) -> Optional[np.ndarray]:
    """Embed *text* with the local model; None if the model failed to load."""
    if embedding_model:
        return embedding_model.encode(text)
    return None


def cosine_similarity(a, b) -> float:
    """Cosine similarity of two vectors.

    Returns 0.0 when either vector has zero norm (BUG FIX: previously a
    zero vector caused a division-by-zero / NaN result).
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return float(np.dot(a, b) / denom)


@tool
def search_known_qa(query: str) -> str:
    """
    Search for a similar question in the known Q&A database and return the answer if found.
    Use this tool FIRST to check if the question has already been solved.

    Args:
        query: The question to search for.
    """
    if not QA_DATABASE:
        return "No known Q&A database available."

    # The first line of each entry's content is treated as the question text.
    questions = [item.get('content', '').split('\n')[0] for item in QA_DATABASE]

    # Method 1: Fuzzy Matching (fast & effective for near-duplicates)
    if fuzz:
        # Extract just the questions for matching to avoid noise from answers.
        best_match = process.extractOne(query, questions, scorer=fuzz.token_set_ratio)
        if best_match:
            match_text, score = best_match[0], best_match[1]
            if score > 85:  # High confidence threshold
                # Recover the full item whose content begins with the matched question.
                for item in QA_DATABASE:
                    if item.get('content', '').startswith(match_text):
                        return f"Found matching question (Confidence: {score}%):\n\n{item['content']}"

    # Method 2: Vector Search (semantic matching)
    if embedding_model:
        try:
            query_vec = get_embedding(query)
            if query_vec is None:
                return "Error generating query embedding."
            best_score = -1.0
            best_content = ""
            for item in QA_DATABASE:
                emb_str = item.get('embedding')
                if emb_str:
                    try:
                        vec = json.loads(emb_str)
                        score = cosine_similarity(query_vec, vec)
                        if score > best_score:
                            best_score = score
                            best_content = item.get('content', '')
                    except (ValueError, TypeError):
                        # BUG FIX: was a bare `except:` that swallowed everything
                        # (including KeyboardInterrupt). ValueError covers
                        # json.JSONDecodeError and shape mismatches in np.dot;
                        # TypeError covers non-numeric embedding payloads.
                        continue
            # Lower threshold slightly to favor recall over precision.
            if best_score > 0.5:
                return f"Found semantically similar question (Score: {best_score:.2f}):\n\n{best_content}"
        except Exception as e:
            print(f"Vector search error: {e}")

    # Fallback: simple keyword search over raw content.
    matches = []
    query_terms = set(query.lower().split())
    # Remove common stop words so trivial words don't dominate the score.
    stop_words = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'of', 'in', 'on',
                  'at', 'to', 'for', 'with', 'by', 'about'}
    query_terms = query_terms - stop_words
    if not query_terms:
        return "Query too short for keyword search."
    for item in QA_DATABASE:
        content = item.get('content', '').lower()
        score = sum(1 for term in query_terms if term in content)
        if score > 0:
            matches.append((score, item['content']))
    matches.sort(key=lambda x: x[0], reverse=True)
    if matches and matches[0][0] >= len(query_terms) * 0.5:  # At least 50% term match
        return f"Found potential match (Keyword match):\n\n{matches[0][1]}"

    return "No similar questions found in database."