File size: 13,754 Bytes
d328280
2544a4a
ba35d66
d328280
 
2288c06
 
 
 
47efbce
2288c06
42262f7
a55be36
47efbce
 
 
2288c06
5482a7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72ddd68
dff3b47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42262f7
 
 
 
 
2288c06
42262f7
 
 
47efbce
42262f7
 
2288c06
42262f7
47efbce
ba35d66
42262f7
a55be36
ba35d66
42262f7
72ddd68
ba35d66
42262f7
 
 
ba35d66
42262f7
 
 
ba35d66
42262f7
 
56eee01
42262f7
 
a55be36
42262f7
 
 
47efbce
42262f7
 
 
56eee01
42262f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5482a7e
42262f7
 
 
 
2288c06
42262f7
 
 
 
 
 
 
 
 
f97dca9
42262f7
 
5482a7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42262f7
 
 
 
 
 
5482a7e
42262f7
ba35d66
42262f7
5482a7e
ba35d66
42262f7
 
 
5482a7e
 
 
 
 
 
 
 
ba35d66
42262f7
 
 
 
 
 
5482a7e
ba35d66
42262f7
 
 
5482a7e
42262f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba35d66
42262f7
5482a7e
42262f7
 
 
 
 
d328280
42262f7
a55be36
42262f7
dff3b47
42262f7
d328280
42262f7
dff3b47
2288c06
42262f7
 
f97dca9
42262f7
f97dca9
42262f7
 
 
 
 
 
 
 
 
 
2544a4a
42262f7
 
 
d328280
42262f7
 
d328280
42262f7
 
 
2544a4a
a55be36
42262f7
d328280
42262f7
 
 
 
ef557bf
2544a4a
ba35d66
2544a4a
 
 
 
 
ba35d66
2544a4a
ba35d66
 
 
2544a4a
ba35d66
2544a4a
 
 
dff3b47
 
 
2544a4a
 
ba35d66
2544a4a
 
 
 
 
 
 
dff3b47
 
2544a4a
 
 
ba35d66
2544a4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba35d66
2544a4a
ba35d66
 
2544a4a
ba35d66
 
 
 
 
 
 
2544a4a
 
 
ba35d66
2544a4a
ba35d66
 
 
 
 
2544a4a
 
 
42262f7
 
2544a4a
dd0536b
 
 
 
dff3b47
dd0536b
 
 
2544a4a
 
dd0536b
42262f7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
"""
QUOOTA AGENT V3 - BILINGUAL RETRIEVAL + ACADEMIC CITATIONS
Query ES β†’ translate EN β†’ search both β†’ combine β†’ response ES/EN
"""

import faiss
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import os

# Configuration: on-disk artifacts and the embedding model identifier.
FAISS_PATH = "quoota_complete_cpu.faiss"       # FAISS vector index (CPU build)
METADATA_PATH = "quoota_metadata_enriched.pkl"  # pickled chunk metadata (list of dicts)
CONFIG_PATH = "retrieval_config.pkl"            # pickled retrieval config (category weights)
MODEL_NAME = "intfloat/multilingual-e5-large"   # multilingual embedding model (ES + EN)

# Keywords - BILINGUAL (ES + EN) for conflict detection.
# Matching in _detect_conflict is substring-based and each matching entry
# adds one point, so entries must be unique: a duplicate keyword would
# double-count a single mention and inflate the confidence score.
DATA_KW = [
    # Spanish
    "porcentaje", "plazo", "nΓΊmero", "numero", "fecha", "euros", "dias",
    "cifra", "monto", "tiempo", "presupuesto",
    # English
    # FIX: "deadline" was previously listed here AND in the Spanish section,
    # so one mention counted twice (+40 instead of +20 confidence).
    "deadline", "percentage", "number", "date", "budget", "timeline",
    "metric", "kpi", "target", "quota", "figure", "report", "data"
]

EGO_KW = [
    # Spanish
    "ignorado", "interrumpe", "no valora", "menosprecia", "furioso", 
    "agotado", "siempre", "nunca", "constantemente", "excluye", "critica",
    "menosprecio", "injusto", "invisible", "frustrado", "enojado",
    # English
    # NOTE(review): "interrupt" is a substring of "interrupts", and "credit"
    # of "takes credit", so one mention of the longer form scores twice.
    # Left as-is in case the extra weight is intentional — confirm with owner.
    "ignored", "interrupts", "interrupt", "undervalues", "unfair", 
    "exhausted", "burned out", "always", "never", "constantly",
    "excludes", "criticizes", "belittles", "invisible", "frustrated",
    "angry", "disrespected", "unappreciated", "overlooked", "dismissed",
    "credit", "takes credit", "my idea", "not recognized"
]

# Tag translation mapping (ES β†’ EN)
TAG_TRANSLATIONS = {
    "negociacion": "negotiation",
    "liderazgo": "leadership",
    "feedback": "feedback",
    "ego": "ego",
    "cortesia": "politeness",
    "datos": "data",
    "conflicto": "conflict",
    "comunicacion": "communication",
    "gestion": "management",
    "equipo": "team",
    "jerarquia": "hierarchy",
    "autoridad": "authority",
    "confianza": "trust",
    "respeto": "respect",
    "reconocimiento": "recognition",
    "autonomia": "autonomy",
    "presion": "pressure",
    "estres": "stress",
    "motivacion": "motivation",
    "decision": "decision",
    "cambio": "change",
    "cultura": "culture",
    "valores": "values",
    "etica": "ethics",
    "responsabilidad": "responsibility",
    "rendicion_cuentas": "accountability",
    "colaboracion": "collaboration",
    "competencia": "competition",
    "poder": "power",
    "influencia": "influence"
}

def _translate_tags(tags_es):
    """Translate Spanish tags to English"""
    return [TAG_TRANSLATIONS.get(tag.lower(), tag) for tag in tags_es]

# Lazily-initialized module-level singletons, populated once by _init().
_model = None   # SentenceTransformer embedding model
_index = None   # FAISS index loaded from FAISS_PATH
_docs = None    # chunk metadata loaded from METADATA_PATH (indexed by FAISS id)
_config = None  # retrieval config loaded from CONFIG_PATH (holds "category_weights")

def _init():
    """Load model, FAISS index, metadata and config once (module singletons).

    Idempotent: returns immediately on subsequent calls, keyed off _model
    already being set.
    """
    global _model, _index, _docs, _config
    
    if _model is not None:
        return
    
    print("\nπŸš€ QUOOTA AGENT V3 - BILINGUAL")
    
    print("🧠 Loading model...")
    # Single multilingual model encodes both the ES and EN query variants.
    _model = SentenceTransformer(MODEL_NAME)
    
    print("πŸ“Š Loading FAISS...")
    _index = faiss.read_index(FAISS_PATH)
    
    print("πŸ“š Loading metadata...")
    with open(METADATA_PATH, 'rb') as f:
        _docs = pickle.load(f)
    
    print("βš™οΈ  Loading config...")
    with open(CONFIG_PATH, 'rb') as f:
        _config = pickle.load(f)
    
    print(f"βœ… Ready: {_index.ntotal:,} vectors\n")

def _translate(text_es):
    """Traduce ES β†’ EN con LLM (contexto LABORAL)"""
    try:
        from huggingface_hub import InferenceClient
        
        token = os.getenv("HF_TOKEN")
        if not token:
            return text_es
        
        client = InferenceClient(token=token)
        
        msgs = [
            {"role": "system", "content": "Translate this Spanish text about a WORKPLACE conflict to English. Only the translation, nothing else. Use workplace terminology: 'coworker' not 'roommate', 'manager' not 'landlord', etc."},
            {"role": "user", "content": text_es}
        ]
        
        resp = ""
        for m in client.chat_completion(
            messages=msgs,
            model="meta-llama/Llama-3.3-70B-Instruct",
            max_tokens=300,
            temperature=0.1,
            stream=True
        ):
            if m.choices and m.choices[0].delta.content:
                resp += m.choices[0].delta.content
        
        return resp.strip()
    
    except:
        return text_es

def _detect_conflict(query):
    """Classify the conflict type via bilingual keyword substring counts.

    Returns:
        (conflict_type, confidence) where conflict_type is one of
        "ego" | "data" | "mixed" | "unknown" and confidence is 0-100.
    """
    lowered = query.lower()
    
    # Substring matches against each keyword list (booleans sum as 0/1).
    data_hits = sum(kw in lowered for kw in DATA_KW)
    ego_hits = sum(kw in lowered for kw in EGO_KW)
    
    if ego_hits > data_hits:
        return "ego", min(ego_hits * 20, 100)
    if data_hits > ego_hits:
        return "data", min(data_hits * 20, 100)
    if ego_hits and data_hits:
        # Exact tie with hits on both sides -> mixed conflict.
        return "mixed", min((ego_hits + data_hits) * 15, 100)
    return "unknown", 20

def analyze_case(query, k=5):
    """
    Main entry point (called from app.py).
    
    Pipeline: detect conflict type on the ES query, translate to EN,
    re-detect on the EN text, embed both, run a bilingual FAISS search,
    merge and hybrid-score the hits, then build an academic-citation block.
    
    Args:
        query: User query (string, Spanish expected)
        k: Number of chunks to return (int, default=5)
    
    Returns:
        dict with two keys:
            - citations: str (formatted academic citation text)
            - metadata: dict with:
                - conflict_type: str ("ego"|"data"|"mixed"|"unknown")
                - confidence: int (0-100)
                - num_sources: int
                - unique_tags: list[str] (English tag names)
                - sources_raw: list[str]
                - logical_works_used: list[str] (at most 5)
                - categories_used: list[str]
                - faiss_weights_applied: dict
                - citations: str (same text as the top-level key)
    """
    _init()
    
    print(f"\n{'='*80}")
    print(f"πŸ” QUERY: {query[:70]}...")
    
    # 1. Detect conflict on ORIGINAL query (bilingual keywords)
    conflict_type, confidence = _detect_conflict(query)
    print(f"πŸ“Š Conflict: {conflict_type.upper()} ({confidence}%)")
    
    # 2. Translate
    print(f"🌐 Translating...")
    query_en = _translate(query)
    print(f"πŸ“ EN: {query_en[:70]}...")
    
    # 3. Also detect on translated query; keep whichever detection is
    #    more confident.
    conflict_type_en, confidence_en = _detect_conflict(query_en)
    if confidence_en > confidence:
        conflict_type = conflict_type_en
        confidence = confidence_en
        print(f"πŸ“Š Conflict (EN): {conflict_type.upper()} ({confidence}%)")
    
    # 4. Vectorize both (L2-normalized embeddings)
    print(f"🧠 Vectorizing ES + EN...")
    emb_es = _model.encode([query], convert_to_numpy=True).astype('float32')
    emb_es = emb_es / np.linalg.norm(emb_es)
    
    emb_en = _model.encode([query_en], convert_to_numpy=True).astype('float32')
    emb_en = emb_en / np.linalg.norm(emb_en)
    
    # 5. Bilingual FAISS search: 30 nearest neighbors per language
    print(f"πŸ“š Bilingual FAISS (30+30)...")
    dist_es, idx_es = _index.search(emb_es, 30)
    dist_en, idx_en = _index.search(emb_en, 30)
    
    # 6. Combine, interleaving ES and EN hits rank-by-rank so the first
    #    occurrence of a duplicate index wins (no duplicates kept).
    results = []
    seen = set()
    
    for i in range(30):
        if i < len(idx_es[0]):
            idx = idx_es[0][i]
            if idx not in seen:
                results.append({
                    'idx': idx,
                    # Distance -> similarity in (0, 1]; smaller distance = higher sim.
                    'sim': 1.0 / (1.0 + dist_es[0][i]),
                    'lang': 'es'
                })
                seen.add(idx)
        
        if i < len(idx_en[0]):
            idx = idx_en[0][i]
            if idx not in seen:
                results.append({
                    'idx': idx,
                    'sim': 1.0 / (1.0 + dist_en[0][i]),
                    'lang': 'en'
                })
                seen.add(idx)
    
    print(f"πŸ”€ Combined: {len(results)} unique")
    
    # 7. Hybrid scoring: 0.5*similarity + 0.3*category weight + 0.2*tag boost
    weights = _config["category_weights"]
    
    for r in results:
        doc = _docs[r['idx']]
        cat = doc.get('category', '')
        
        # Per-category weight for the detected conflict type (default 1.0).
        w = weights.get(cat, {}).get(conflict_type, 1.0)
        
        # Boost docs whose tags match the conflict type (+0.2 per match, capped).
        tags = doc.get('tags', [])
        relevant = ["ego", "cortesia", "politeness"] if conflict_type == "ego" else ["datos", "negociacion", "negotiation", "data"]
        tag_boost = len(set(tags) & set(relevant)) * 0.2
        
        r['score'] = 0.5 * r['sim'] + 0.3 * w + 0.2 * min(tag_boost, 1.0)
        r['weight'] = w
    
    results.sort(key=lambda x: x['score'], reverse=True)
    
    top = results[:k]
    
    print(f"\nπŸ“Š TOP {k}:")
    for i, r in enumerate(top, 1):
        doc = _docs[r['idx']]
        print(f"{i}. {r['score']:.3f} | {r['lang']} | {doc['category'][:20]:20s} | {doc['source'][:50]}")
    print("="*80 + "\n")
    
    # 8. Collect metadata from the top-k hits
    sources = []
    categories = []
    all_tags = []
    logical = []
    weights_applied = {}
    
    for r in top:
        doc = _docs[r['idx']]
        
        src = doc.get('source', 'unknown')
        sources.append(src)
        
        cat = doc.get('category', 'unknown')
        if cat not in categories:
            categories.append(cat)
            weights_applied[cat] = weights.get(cat, {}).get(conflict_type, 1.0)
        
        all_tags.extend(doc.get('tags', []))
        
        # Normalize the filename into a snake_case "logical work" id.
        log = src.lower().replace('.epub', '').replace('.pdf', '').replace('.txt', '')
        log = log.replace(' ', '_').replace('-', '_').replace('(', '').replace(')', '')
        if log not in logical:
            logical.append(log)
    
    # ============================================================================
    # IMPROVED ACADEMIC CITATIONS (ENGLISH LABELS)
    # ============================================================================
    
    citations_parts = []
    
    citations_parts.append("="*80)
    citations_parts.append("πŸ“Š RETRIEVAL SYSTEM APPLIED")
    citations_parts.append("="*80)
    citations_parts.append(f"\nπŸ” CONFLICT DETECTED: {conflict_type.upper()} (confidence: {confidence}%)")
    citations_parts.append(f"🌐 QUERY LANGUAGE: ES + EN (Bilingual)")
    citations_parts.append(f"πŸ“š CHUNKS RETRIEVED: {k} (of {_index.ntotal:,} total vectors)")
    
    citations_parts.append(f"\nβš–οΈ  METHODOLOGICAL WEIGHTS APPLIED:")
    for cat, weight in sorted(weights_applied.items(), key=lambda x: x[1], reverse=True):
        citations_parts.append(f"   β€’ {cat}: {weight}x")
    
    # Translate tags for display
    translated_tags = _translate_tags(list(set(all_tags)))
    citations_parts.append(f"\n🏷️  RELEVANT TAGS DETECTED: {', '.join(translated_tags)}")
    
    citations_parts.append(f"\n{'='*80}")
    citations_parts.append(f"πŸ“– ACADEMIC SOURCES (Top {k} by hybrid scoring)")
    citations_parts.append(f"{'='*80}\n")
    
    for i, r in enumerate(top, 1):
        doc = _docs[r['idx']]
        
        src = doc.get('source', 'unknown')
        cat = doc.get('category', 'unknown')
        # Translate tags for each source
        tags = ', '.join(_translate_tags(doc.get('tags', [])))
        content_type = doc.get('content_type', 'unknown')
        text = doc.get('text', '')
        
        # Map filename substrings to citable author names.
        author = "Unknown Author"
        if "Brown" in src or "Levinson" in src:
            author = "Brown & Levinson"
        elif "Patterson" in src:
            author = "Patterson et al."
        elif "Stone" in src:
            author = "Stone, Patton & Heen"
        elif "Calsamiglia" in src:
            author = "Helena Calsamiglia"
        elif "Fisher" in src or "Ury" in src:
            author = "Fisher, Ury & Patton"
        elif "Covey" in src:
            author = "Stephen Covey"
        elif "Carnegie" in src:
            author = "Dale Carnegie"
        elif "Sinek" in src:
            author = "Simon Sinek"
        elif "Rosenberg" in src:
            author = "Marshall Rosenberg"
        elif "Lencioni" in src:
            author = "Patrick Lencioni"
        
        citations_parts.append(f"[SOURCE #{i}] {author}")
        citations_parts.append(f"─────────────────────────────────────────────────────────────────────────────")
        citations_parts.append(f"πŸ“ File: {src}")
        citations_parts.append(f"πŸ“‚ Category: {cat}")
        citations_parts.append(f"🏷️  Tags: {tags}")
        citations_parts.append(f"πŸ“Š Content Type: {content_type}")
        citations_parts.append(f"πŸ”’ HYBRID SCORING:")
        citations_parts.append(f"   β€’ Semantic Similarity: {r['sim']:.3f}")
        citations_parts.append(f"   β€’ Methodological Weight: {r['weight']:.2f}x")
        citations_parts.append(f"   β€’ Final Score: {r['score']:.3f}")
        citations_parts.append(f"🌐 Query Language: {r['lang'].upper()}")
        citations_parts.append(f"\nπŸ“ CONTENT:")
        citations_parts.append(f"{text}\n")
        citations_parts.append(f"{'='*80}\n")
    
    citations_parts.append("πŸ“Œ INSTRUCTIONS FOR ANALYSIS:")
    citations_parts.append("─────────────────────────────────────────────────────────────────────────────")
    citations_parts.append("1. CITE authors mentioned above EXPLICITLY")
    citations_parts.append("2. PRIORITIZE sources with higher methodological weight")
    citations_parts.append("3. USE technical terminology from sources (e.g., 'negative face', 'FTA')")
    citations_parts.append("4. EXPLAIN technical concepts in accessible language")
    citations_parts.append("5. INTEGRATE minimum 2-3 sources in your analysis")
    citations_parts.append(f"{'='*80}")
    
    citations = "\n".join(citations_parts)
    
    # NOTE(review): "citations" is returned both at the top level and inside
    # "metadata" — presumably for caller convenience; confirm before removing.
    return {
        "citations": citations,
        "metadata": {
            "conflict_type": conflict_type,
            "confidence": confidence,
            "num_sources": len(top),
            "unique_tags": translated_tags,
            "sources_raw": sources,
            "logical_works_used": logical[:5],
            "categories_used": categories,
            "faiss_weights_applied": weights_applied,
            "citations": citations
        }
    }