# NOTE(review): "Spaces: Sleeping" banner captured from a Hugging Face Spaces
# page during export; kept as a comment so the module remains importable.
import csv
import json
import os
from typing import Any, Dict, List, Optional

import numpy as np
from langchain_core.tools import tool
from sentence_transformers import SentenceTransformer
# Optional dependency: thefuzz provides fuzzy string matching.
# When it is not installed, set BOTH names to None so guarded call sites
# can detect the feature's absence without raising NameError.
try:
    from thefuzz import fuzz, process
except ImportError:
    fuzz = None
    process = None  # was previously left undefined on import failure
    print("Warning: thefuzz not installed. Fuzzy matching disabled.")
# Load the embedding model used for local semantic search.
# 'all-MiniLM-L6-v2' is a lightweight model, chosen to avoid a heavy download.
# On any failure (no network, missing weights, etc.) fall back to None so
# callers can skip vector search gracefully.
try:
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
except Exception as e:
    print(f"Warning: Could not load embedding model: {e}")
    embedding_model = None
# Loader for the Q&A CSV export.
def load_qa_database(csv_path: str = "supabase_docs.csv") -> List[Dict]:
    """Read *csv_path* and return its rows (as dicts) that have a 'content' column.

    Returns an empty list when the file is missing; on a mid-read error the
    rows collected so far are returned.
    """
    if not os.path.exists(csv_path):
        print(f"Warning: {csv_path} not found.")
        return []
    rows: List[Dict] = []
    try:
        # Some entries hold long answers; raise the per-field cap before reading.
        csv.field_size_limit(1000000)
        with open(csv_path, 'r', encoding='utf-8') as handle:
            for record in csv.DictReader(handle):
                if 'content' in record:
                    rows.append(record)
    except Exception as e:
        print(f"Error loading CSV: {e}")
    return rows
# Module-level Q&A database, loaded once at import time from the default CSV.
QA_DATABASE = load_qa_database()
def get_embedding(text: str) -> Optional[np.ndarray]:
    """Encode *text* with the module-level model; None when no model is loaded."""
    return embedding_model.encode(text) if embedding_model else None
def cosine_similarity(a, b) -> float:
    """Return the cosine similarity of two 1-D array-likes.

    Returns 0.0 when either vector has zero norm — the original divided by
    zero there, producing NaN (and a RuntimeWarning) for all-zero embeddings.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b) / denom)
def search_known_qa(query: str) -> str:
    """
    Search for a similar question in the known Q&A database and return the answer if found.
    Use this tool FIRST to check if the question has already been solved.
    Args:
        query: The question to search for.
    """
    if not QA_DATABASE:
        return "No known Q&A database available."

    # The first line of each entry's 'content' is treated as the question text.
    questions = [item.get('content', '').split('\n')[0] for item in QA_DATABASE]

    # Method 1: Fuzzy matching (fast & effective for near-duplicates).
    if fuzz:
        # Match against questions only, to avoid noise from answer bodies.
        best_match = process.extractOne(query, questions, scorer=fuzz.token_set_ratio)
        if best_match:
            match_text, score = best_match[0], best_match[1]
            if score > 85:  # High-confidence threshold
                # Map the matched question back to its entry by list index.
                # (The previous startswith() scan could bind to the wrong
                # entry when two contents share a prefix.)
                idx = questions.index(match_text)
                return f"Found matching question (Confidence: {score}%):\n\n{QA_DATABASE[idx]['content']}"

    # Method 2: Vector search (semantic matching).
    if embedding_model:
        try:
            query_vec = get_embedding(query)
            if query_vec is None:
                return "Error generating query embedding."
            best_score = -1.0
            best_content = ""
            for item in QA_DATABASE:
                emb_str = item.get('embedding')
                if not emb_str:
                    continue
                try:
                    # Embeddings are stored as JSON-encoded lists in the CSV.
                    vec = json.loads(emb_str)
                    score = cosine_similarity(query_vec, vec)
                except (ValueError, TypeError):
                    # Skip rows with malformed embedding JSON; the original
                    # bare `except:` silently swallowed every exception.
                    continue
                if score > best_score:
                    best_score = score
                    best_content = item.get('content', '')
            # Deliberately permissive threshold.
            if best_score > 0.5:
                return f"Found semantically similar question (Score: {best_score:.2f}):\n\n{best_content}"
        except Exception as e:
            print(f"Vector search error: {e}")

    # Fallback: simple keyword-overlap search.
    query_terms = set(query.lower().split())
    # Remove common stop words so scoring reflects meaningful terms.
    stop_words = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'of', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'about'}
    query_terms -= stop_words
    if not query_terms:
        return "Query too short for keyword search."
    matches = []
    for item in QA_DATABASE:
        content = item.get('content', '').lower()
        score = sum(1 for term in query_terms if term in content)
        if score > 0:
            matches.append((score, item['content']))
    if matches:
        # max() replaces sort-then-take-first: O(n) and ties resolve to the
        # same (earliest) entry as a stable descending sort did.
        best = max(matches, key=lambda m: m[0])
        if best[0] >= len(query_terms) * 0.5:  # At least 50% term match
            return f"Found potential match (Keyword match):\n\n{best[1]}"
    return "No similar questions found in database."