"""
QUOOTA AGENT V3 - BILINGUAL RETRIEVAL + ACADEMIC CITATIONS
Query ES → traduce EN → busca en ambos → combina → respuesta ES
"""

import faiss
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import os

# --- Resource locations and embedding model ---------------------------------
FAISS_PATH = "quoota_complete_cpu.faiss"
METADATA_PATH = "quoota_metadata_enriched.pkl"
CONFIG_PATH = "retrieval_config.pkl"
MODEL_NAME = "intfloat/multilingual-e5-large"

# --- Keyword lists used by the conflict-type heuristic ----------------------
# Words that hint at a fact/figure-driven ("data") conflict.
DATA_KW = [
    "porcentaje",
    "plazo",
    "número",
    "numero",
    "fecha",
    "euros",
    "dias",
]
# Words that hint at an interpersonal ("ego") conflict.
EGO_KW = [
    "ignorado",
    "interrumpe",
    "no valora",
    "menosprecia",
    "furioso",
    "agotado",
    "siempre",
    "nunca",
    "constantemente",
]

# --- Lazily populated module-level singletons (filled in by _init) ----------
_model = None
_index = None
_docs = None
_config = None

def _init():
    """Load the model, FAISS index, metadata and config on first use.

    Each resource is stored in its module-level singleton and is only loaded
    when that singleton is still ``None``. If a previous call failed partway
    (for example a missing index file), the next call therefore retries the
    resources that are still missing instead of returning early with a
    half-initialized module.

    Raises:
        Whatever the underlying loaders raise (e.g. ``OSError`` from
        ``open`` or errors from ``pickle.load``); nothing is caught here.
    """
    global _model, _index, _docs, _config

    # Identity checks only: the loaded objects may define their own ``==``.
    if all(res is not None for res in (_model, _index, _docs, _config)):
        return

    print("\n🚀 QUOOTA AGENT V3 - BILINGUAL")

    if _model is None:
        print("🧠 Cargando modelo...")
        _model = SentenceTransformer(MODEL_NAME)

    if _index is None:
        print("📊 Cargando FAISS...")
        _index = faiss.read_index(FAISS_PATH)

    # SECURITY: pickle.load executes arbitrary code embedded in the file.
    # Both pickles must come from a trusted source.
    if _docs is None:
        print("📚 Cargando metadata...")
        with open(METADATA_PATH, 'rb') as f:
            _docs = pickle.load(f)

    if _config is None:
        print("⚙️  Cargando config...")
        with open(CONFIG_PATH, 'rb') as f:
            _config = pickle.load(f)

    print(f"✅ Listo: {_index.ntotal:,} vectores\n")

def _translate(text_es):
    """Translate Spanish text to English with an LLM (workplace context).

    This is a best-effort step: the original text is returned unchanged when
    the input is blank, when ``HF_TOKEN`` is not set, when ``huggingface_hub``
    is not installed, when the remote call fails, or when the model returns
    an empty answer.

    Args:
        text_es: Spanish text to translate.

    Returns:
        The English translation, or ``text_es`` itself on any fallback path.
    """
    # Nothing to translate; avoid a pointless remote call.
    if not text_es or not text_es.strip():
        return text_es

    token = os.getenv("HF_TOKEN")
    if not token:
        return text_es

    try:
        from huggingface_hub import InferenceClient
    except ImportError:
        # Optional dependency: without it retrieval runs on the original text.
        return text_es

    msgs = [
        {"role": "system", "content": "Translate this Spanish text about a WORKPLACE conflict to English. Only the translation, nothing else. Use workplace terminology: 'coworker' not 'roommate', 'manager' not 'landlord', etc."},
        {"role": "user", "content": text_es}
    ]

    try:
        client = InferenceClient(token=token)
        pieces = []
        for m in client.chat_completion(
            messages=msgs,
            model="meta-llama/Llama-3.3-70B-Instruct",
            max_tokens=300,
            temperature=0.1,
            stream=True
        ):
            # Streamed chunks may carry no choices or an empty delta.
            if m.choices and m.choices[0].delta.content:
                pieces.append(m.choices[0].delta.content)
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C and SystemExit still
        # propagate, and report the failure instead of hiding it.
        print(f"⚠️  Traducción fallida ({type(exc).__name__}: {exc}); se usa el texto original")
        return text_es

    # An empty reply would later be embedded as an empty query; fall back.
    return "".join(pieces).strip() or text_es

def _strip_accents(text):
    """Return *text* with combining diacritics removed (``días`` -> ``dias``)."""
    import unicodedata

    decomposed = unicodedata.normalize("NFD", text)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


def _detect_conflict(query):
    """Classify a query as an "ego" or "data" conflict by keyword counting.

    Matching is case-insensitive and accent-insensitive, so correctly spelled
    words such as ``días`` or ``número`` match the unaccented keywords. The
    keywords are folded the same way and de-duplicated, so spelling variants
    of one word (``número`` / ``numero``) count once.

    Args:
        query: User query text.

    Returns:
        Tuple ``(conflict_type, confidence)`` where ``conflict_type`` is one
        of ``"ego"``, ``"data"``, ``"mixed"`` or ``"unknown"`` and
        ``confidence`` is an int in the range 0-100.
    """
    q = _strip_accents(query.lower())

    data_keywords = {_strip_accents(kw.lower()) for kw in DATA_KW}
    ego_keywords = {_strip_accents(kw.lower()) for kw in EGO_KW}

    # Substring matching, one hit per distinct keyword.
    data = sum(1 for kw in data_keywords if kw in q)
    ego = sum(1 for kw in ego_keywords if kw in q)

    if ego > data:
        return "ego", min(ego * 20, 100)
    if data > ego:
        return "data", min(data * 20, 100)
    # From here on both counts are equal.
    if ego > 0:
        return "mixed", min((ego + data) * 15, 100)
    return "unknown", 20

# Ordered (filename markers, display name) pairs; the first match wins.
_AUTHOR_MARKERS = (
    (("Brown", "Levinson"), "Brown & Levinson"),
    (("Patterson",), "Patterson et al."),
    (("Stone",), "Stone, Patton & Heen"),
    (("Calsamiglia",), "Helena Calsamiglia"),
    (("Fisher", "Ury"), "Fisher, Ury & Patton"),
    (("Covey",), "Stephen Covey"),
    (("Carnegie",), "Dale Carnegie"),
    (("Sinek",), "Simon Sinek"),
    (("Rosenberg",), "Marshall Rosenberg"),
    (("Lencioni",), "Patrick Lencioni"),
)


def _guess_author(src):
    """Infer an author display name from case-sensitive filename markers."""
    for markers, name in _AUTHOR_MARKERS:
        if any(marker in src for marker in markers):
            return name
    return "Autor desconocido"


def _logical_work_name(src):
    """Turn a source filename into a normalized logical-work identifier."""
    name = src.lower()
    for ext in ('.epub', '.pdf', '.txt'):
        name = name.replace(ext, '')
    name = name.replace(' ', '_').replace('-', '_')
    return name.replace('(', '').replace(')', '')


def _merge_hits(hit_lists):
    """Interleave per-language FAISS hits rank by rank, without duplicates.

    Args:
        hit_lists: Sequence of ``(lang, distances, ids)`` where ``distances``
            and ``ids`` are the single result row for that language.

    Returns:
        List of dicts with keys ``idx`` (int), ``sim`` (float) and ``lang``.
    """
    merged = []
    seen = set()
    depth = max((len(ids) for _, _, ids in hit_lists), default=0)
    for rank in range(depth):
        for lang, dists, ids in hit_lists:
            if rank >= len(ids):
                continue
            idx = int(ids[rank])
            # FAISS pads missing neighbours with label -1; indexing the
            # documents with -1 would silently pick the last document.
            if idx < 0 or idx in seen:
                continue
            seen.add(idx)
            merged.append({
                'idx': idx,
                # NOTE(review): this mapping assumes the index returns a
                # distance (smaller = closer) — confirm the index metric.
                'sim': float(1.0 / (1.0 + dists[rank])),
                'lang': lang
            })
    return merged


def analyze_case(query, k=5):
    """
    Main entry point for app.py: retrieve and format sources for a query.

    The query is classified, translated to English, embedded in both
    languages and searched in the FAISS index (30 candidates per language).
    The merged candidates are re-scored with similarity, category weight and
    tag overlap, and the top ``k`` are rendered as a citation text.

    Args:
        query: User query (string).
        k: Number of chunks to return (int, default=5). Values below 1
            yield no sources.

    Returns:
        dict with two keys:
            - "citations": str, the formatted citation text
            - "metadata": dict with
                - conflict_type: str ("ego"|"data"|"mixed"|"unknown")
                - confidence: int (0-100)
                - num_sources: int
                - unique_tags: list[str] (first-seen order)
                - sources_raw: list[str]
                - logical_works_used: list[str] (at most 5)
                - categories_used: list[str]
                - faiss_weights_applied: dict (category -> weight)
                - citations: str (same text as the top-level key)
    """
    _init()

    print(f"\n{'='*80}")
    print(f"🔍 QUERY: {query[:70]}...")

    # 1. Detect the conflict type
    conflict_type, confidence = _detect_conflict(query)
    print(f"📊 Conflicto: {conflict_type.upper()} ({confidence}%)")

    # 2. Translate
    print(f"🌐 Traduciendo...")
    query_en = _translate(query)
    print(f"📝 EN: {query_en[:70]}...")

    # 3. Embed both versions
    # NOTE(review): the multilingual-e5 model card describes "query: " /
    # "passage: " input prefixes; none is added here — confirm this matches
    # how the index was built.
    print(f"🧠 Vectorizando ES + EN...")

    def _embed(text):
        vec = _model.encode([text], convert_to_numpy=True).astype('float32')
        return vec / np.linalg.norm(vec)

    emb_es = _embed(query)
    # When translation fell back to the original text the second embedding
    # and search would only reproduce the first one, so they are skipped.
    translated = query_en != query
    emb_en = _embed(query_en) if translated else emb_es

    # 4. Double FAISS search
    per_lang = 30
    print(f"📚 FAISS bilingüe ({per_lang}+{per_lang})...")
    dist_es, idx_es = _index.search(emb_es, per_lang)
    hit_lists = [('es', dist_es[0], idx_es[0])]
    if translated:
        dist_en, idx_en = _index.search(emb_en, per_lang)
        hit_lists.append(('en', dist_en[0], idx_en[0]))

    # 5. Merge (no duplicates, no padding entries)
    results = _merge_hits(hit_lists)
    print(f"🔀 Combinados: {len(results)} únicos")

    # 6. Hybrid scoring
    weights = _config["category_weights"]
    # NOTE(review): every non-"ego" type (including "mixed" and "unknown")
    # uses the data-oriented tags — confirm this is intended.
    relevant = {"ego", "cortesia"} if conflict_type == "ego" else {"datos", "negociacion"}

    for r in results:
        doc = _docs[r['idx']]
        cat = doc.get('category', '')

        w = weights.get(cat, {}).get(conflict_type, 1.0)
        tag_boost = len(set(doc.get('tags', [])) & relevant) * 0.2

        r['score'] = 0.5 * r['sim'] + 0.3 * w + 0.2 * min(tag_boost, 1.0)
        r['weight'] = w  # keep the weight that was actually applied

    results.sort(key=lambda x: x['score'], reverse=True)

    # 7. Top K (a negative k would otherwise slice from the end)
    top = results[:max(k, 0)]

    # Debug
    print(f"\n📊 TOP {k}:")
    for i, r in enumerate(top, 1):
        doc = _docs[r['idx']]
        # .get with defaults, as everywhere else, so a document lacking a
        # field cannot abort the request from a debug print.
        dbg_cat = doc.get('category', 'unknown')
        dbg_src = doc.get('source', 'unknown')
        print(f"{i}. {r['score']:.3f} | {r['lang']} | {dbg_cat[:20]:20s} | {dbg_src[:50]}")
    print("="*80 + "\n")

    # 8. Metadata
    sources = []
    categories = []
    all_tags = []
    logical = []
    weights_applied = {}

    for r in top:
        doc = _docs[r['idx']]

        src = doc.get('source', 'unknown')
        sources.append(src)

        cat = doc.get('category', 'unknown')
        if cat not in categories:
            categories.append(cat)
            # Report the weight used during scoring, so the reported and
            # applied values cannot diverge for documents without a category.
            weights_applied[cat] = r['weight']

        all_tags.extend(doc.get('tags', []))

        log = _logical_work_name(src)
        if log not in logical:
            logical.append(log)

    # Order-preserving de-duplication keeps the output deterministic.
    unique_tags = list(dict.fromkeys(all_tags))

    # ============================================================================
    # ACADEMIC CITATIONS
    # ============================================================================

    rule = "─────────────────────────────────────────────────────────────────────────────"
    citations_parts = []

    # HEADER: retrieval settings that were applied
    citations_parts.append("="*80)
    citations_parts.append("📊 SISTEMA DE RETRIEVAL APLICADO")
    citations_parts.append("="*80)
    citations_parts.append(f"\n🔍 CONFLICTO DETECTADO: {conflict_type.upper()} (confidence: {confidence}%)")
    citations_parts.append(f"🌐 QUERY BILINGÜE: ES + EN")
    # Report the number of chunks actually returned, which can be below k.
    citations_parts.append(f"📚 CHUNKS RECUPERADOS: {len(top)} (de {_index.ntotal:,} vectores totales)")

    citations_parts.append(f"\n⚖️  PESOS METODOLÓGICOS APLICADOS:")
    for cat, weight in sorted(weights_applied.items(), key=lambda x: x[1], reverse=True):
        citations_parts.append(f"   • {cat}: {weight}x")

    citations_parts.append(f"\n🏷️  TAGS RELEVANTES DETECTADOS: {', '.join(unique_tags)}")

    citations_parts.append(f"\n{'='*80}")
    citations_parts.append(f"📖 FUENTES ACADÉMICAS (Top {k} por scoring híbrido)")
    citations_parts.append(f"{'='*80}\n")

    # One entry per source
    for i, r in enumerate(top, 1):
        doc = _docs[r['idx']]

        src = doc.get('source', 'unknown')
        cat = doc.get('category', 'unknown')
        tags = ', '.join(doc.get('tags', []))
        content_type = doc.get('content_type', 'unknown')
        text = doc.get('text', '')
        author = _guess_author(src)

        citations_parts.append(f"[FUENTE #{i}] {author}")
        citations_parts.append(rule)
        citations_parts.append(f"📁 Archivo: {src}")
        citations_parts.append(f"📂 Categoría: {cat}")
        citations_parts.append(f"🏷️  Tags: {tags}")
        citations_parts.append(f"📊 Tipo contenido: {content_type}")
        citations_parts.append(f"🔢 SCORING HÍBRIDO:")
        citations_parts.append(f"   • Similitud semántica: {r['sim']:.3f}")
        citations_parts.append(f"   • Peso metodológico: {r['weight']:.2f}x")
        citations_parts.append(f"   • Score final: {r['score']:.3f}")
        citations_parts.append(f"🌐 Idioma query: {r['lang'].upper()}")
        citations_parts.append(f"\n📝 CONTENIDO:")
        citations_parts.append(f"{text}\n")
        citations_parts.append(f"{'='*80}\n")

    # FOOTER: instructions for the downstream LLM
    citations_parts.append("📌 INSTRUCCIONES PARA ANÁLISIS:")
    citations_parts.append(rule)
    citations_parts.append("1. CITA EXPLÍCITAMENTE a los autores mencionados arriba")
    citations_parts.append("2. PRIORIZA fuentes con mayor peso metodológico")
    citations_parts.append("3. USA terminología técnica de las fuentes (ej: 'negative face', 'FTA')")
    citations_parts.append("4. EXPLICA conceptos técnicos de forma accesible")
    citations_parts.append("5. INTEGRA mínimo 2-3 fuentes en tu análisis")
    citations_parts.append(f"{'='*80}")

    citations = "\n".join(citations_parts)

    # Shape expected by app.py
    return {
        "citations": citations,
        "metadata": {
            "conflict_type": conflict_type,
            "confidence": confidence,
            "num_sources": len(top),
            "unique_tags": unique_tags,
            "sources_raw": sources,
            "logical_works_used": logical[:5],
            "categories_used": categories,
            "faiss_weights_applied": weights_applied,
            "citations": citations
        }
    }