import os
import csv
import json
import numpy as np
from typing import List, Dict, Any, Optional
from langchain_core.tools import tool
from sentence_transformers import SentenceTransformer

# Try to import thefuzz for fuzzy string matching.
try:
    from thefuzz import fuzz, process
except ImportError:
    # BUG FIX: `process` was previously left undefined on ImportError;
    # define both names so the module state is consistent.
    fuzz = None
    process = None
    print("Warning: thefuzz not installed. Fuzzy matching disabled.")

# Load embedding model for local search.
# We use a lightweight model to avoid a heavy download.
try:
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
except Exception as e:
    print(f"Warning: Could not load embedding model: {e}")
    embedding_model = None


def load_qa_database(csv_path: str = "supabase_docs.csv") -> List[Dict]:
    """Load the Q&A CSV into a list of row dicts.

    Only rows that contain a 'content' column are kept. Returns an empty
    list (after printing a warning) if the file is missing or unreadable.

    Args:
        csv_path: Path to the CSV file of Q&A entries.
    """
    qa_data: List[Dict] = []
    if not os.path.exists(csv_path):
        print(f"Warning: {csv_path} not found.")
        return qa_data
    try:
        # Raise the CSV field size limit: doc contents can exceed the default.
        csv.field_size_limit(1000000)
        with open(csv_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                if 'content' in row:
                    qa_data.append(row)
    except Exception as e:
        print(f"Error loading CSV: {e}")
    return qa_data


# Global QA database, loaded once at import time.
QA_DATABASE = load_qa_database()


def get_embedding(text: str) -> Optional[np.ndarray]:
    """Embed *text* with the local model; None if the model failed to load."""
    if embedding_model:
        return embedding_model.encode(text)
    return None


def cosine_similarity(a, b) -> float:
    """Cosine similarity of two vectors.

    Returns 0.0 when either vector has zero norm (BUG FIX: previously a
    zero vector caused a division-by-zero / NaN result).
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return float(np.dot(a, b) / denom)


@tool
def search_known_qa(query: str) -> str:
    """
    Search for a similar question in the known Q&A database and return the answer if found.
    Use this tool FIRST to check if the question has already been solved.

    Args:
        query: The question to search for.
    """
    if not QA_DATABASE:
        return "No known Q&A database available."

    # The first line of each entry's content is treated as the question text.
    questions = [item.get('content', '').split('\n')[0] for item in QA_DATABASE]

    # Method 1: Fuzzy Matching (fast & effective for near-duplicates)
    if fuzz:
        # Extract just the questions for matching to avoid noise from answers.
        best_match = process.extractOne(query, questions, scorer=fuzz.token_set_ratio)
        if best_match:
            match_text, score = best_match[0], best_match[1]
            if score > 85:  # High confidence threshold
                # Recover the full item whose content begins with the matched question.
                for item in QA_DATABASE:
                    if item.get('content', '').startswith(match_text):
                        return f"Found matching question (Confidence: {score}%):\n\n{item['content']}"

    # Method 2: Vector Search (semantic matching)
    if embedding_model:
        try:
            query_vec = get_embedding(query)
            if query_vec is None:
                return "Error generating query embedding."
            best_score = -1.0
            best_content = ""
            for item in QA_DATABASE:
                emb_str = item.get('embedding')
                if emb_str:
                    try:
                        vec = json.loads(emb_str)
                        score = cosine_similarity(query_vec, vec)
                        if score > best_score:
                            best_score = score
                            best_content = item.get('content', '')
                    except (ValueError, TypeError):
                        # BUG FIX: was a bare `except:` that swallowed everything
                        # (including KeyboardInterrupt). ValueError covers
                        # json.JSONDecodeError and shape mismatches in np.dot;
                        # TypeError covers non-numeric embedding payloads.
                        continue
            # Lower threshold slightly to favor recall over precision.
            if best_score > 0.5:
                return f"Found semantically similar question (Score: {best_score:.2f}):\n\n{best_content}"
        except Exception as e:
            print(f"Vector search error: {e}")

    # Fallback: simple keyword search over raw content.
    matches = []
    query_terms = set(query.lower().split())
    # Remove common stop words so trivial words don't dominate the score.
    stop_words = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'of', 'in', 'on',
                  'at', 'to', 'for', 'with', 'by', 'about'}
    query_terms = query_terms - stop_words
    if not query_terms:
        return "Query too short for keyword search."
    for item in QA_DATABASE:
        content = item.get('content', '').lower()
        score = sum(1 for term in query_terms if term in content)
        if score > 0:
            matches.append((score, item['content']))
    matches.sort(key=lambda x: x[0], reverse=True)
    if matches and matches[0][0] >= len(query_terms) * 0.5:  # At least 50% term match
        return f"Found potential match (Keyword match):\n\n{matches[0][1]}"

    return "No similar questions found in database."