Spaces:
Sleeping
Sleeping
"""
QUOOTA AGENT V3 - BILINGUAL RETRIEVAL + ACADEMIC CITATIONS
Query ES → translate to EN → search both languages → merge → answer in ES
"""
| import faiss | |
| import pickle | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| import os | |
| # ConfiguraciΓ³n | |
| FAISS_PATH = "quoota_complete_cpu.faiss" | |
| METADATA_PATH = "quoota_metadata_enriched.pkl" | |
| CONFIG_PATH = "retrieval_config.pkl" | |
| MODEL_NAME = "intfloat/multilingual-e5-large" | |
| # Keywords | |
| DATA_KW = ["porcentaje", "plazo", "nΓΊmero", "numero", "fecha", "euros", "dias"] | |
| EGO_KW = ["ignorado", "interrumpe", "no valora", "menosprecia", "furioso", | |
| "agotado", "siempre", "nunca", "constantemente"] | |
| # Singleton | |
| _model = None | |
| _index = None | |
| _docs = None | |
| _config = None | |
def _load_pickle(path):
    """Unpickle a trusted local artifact file."""
    with open(path, 'rb') as f:
        return pickle.load(f)


def _init():
    """Load the model, FAISS index, metadata and config once (module cache)."""
    global _model, _index, _docs, _config
    if _model is not None:
        return  # resources already loaded
    print("\nπ QUOOTA AGENT V3 - BILINGUAL")
    print("π§ Cargando modelo...")
    _model = SentenceTransformer(MODEL_NAME)
    print("π Cargando FAISS...")
    _index = faiss.read_index(FAISS_PATH)
    print("π Cargando metadata...")
    _docs = _load_pickle(METADATA_PATH)
    print("βοΈ Cargando config...")
    _config = _load_pickle(CONFIG_PATH)
    print(f"β Listo: {_index.ntotal:,} vectores\n")
| def _translate(text_es): | |
| """Traduce ES β EN con LLM (contexto LABORAL)""" | |
| try: | |
| from huggingface_hub import InferenceClient | |
| token = os.getenv("HF_TOKEN") | |
| if not token: | |
| return text_es | |
| client = InferenceClient(token=token) | |
| msgs = [ | |
| {"role": "system", "content": "Translate this Spanish text about a WORKPLACE conflict to English. Only the translation, nothing else. Use workplace terminology: 'coworker' not 'roommate', 'manager' not 'landlord', etc."}, | |
| {"role": "user", "content": text_es} | |
| ] | |
| resp = "" | |
| for m in client.chat_completion( | |
| messages=msgs, | |
| model="meta-llama/Llama-3.3-70B-Instruct", | |
| max_tokens=300, | |
| temperature=0.1, | |
| stream=True | |
| ): | |
| if m.choices and m.choices[0].delta.content: | |
| resp += m.choices[0].delta.content | |
| return resp.strip() | |
| except: | |
| return text_es | |
def _detect_conflict(query):
    """Classify the conflict type of *query* from Spanish keyword hits.

    Returns a (type, confidence) tuple where type is one of
    "ego" | "data" | "mixed" | "unknown" and confidence is an int 0-100.
    """
    text = query.lower()
    data_hits = sum(kw in text for kw in DATA_KW)
    ego_hits = sum(kw in text for kw in EGO_KW)
    if ego_hits > data_hits:
        return "ego", min(ego_hits * 20, 100)
    if data_hits > ego_hits:
        return "data", min(data_hits * 20, 100)
    # Tie with at least one hit on each side => mixed signal.
    if ego_hits and data_hits:
        return "mixed", min((ego_hits + data_hits) * 15, 100)
    return "unknown", 20
# Author attribution from source filenames: ordered (substrings, author)
# pairs; the first pattern with any matching substring wins (same priority
# order as the original elif chain).
_AUTHOR_MAP = [
    (("Brown", "Levinson"), "Brown & Levinson"),
    (("Patterson",), "Patterson et al."),
    (("Stone",), "Stone, Patton & Heen"),
    (("Calsamiglia",), "Helena Calsamiglia"),
    (("Fisher", "Ury"), "Fisher, Ury & Patton"),
    (("Covey",), "Stephen Covey"),
    (("Carnegie",), "Dale Carnegie"),
    (("Sinek",), "Simon Sinek"),
    (("Rosenberg",), "Marshall Rosenberg"),
    (("Lencioni",), "Patrick Lencioni"),
]


def _author_for(src):
    """Return a best-effort author attribution for a source filename."""
    for needles, author in _AUTHOR_MAP:
        if any(n in src for n in needles):
            return author
    return "Autor desconocido"


def _merge_bilingual(dist_es, idx_es, dist_en, idx_en):
    """Interleave ES/EN FAISS hits into one deduplicated result list.

    Each hit becomes {'idx', 'sim', 'lang'} with sim = 1 / (1 + distance).
    FAISS pads short result lists with index -1; those entries are dropped
    (previously they were kept and later read _docs[-1] by accident).
    """
    results = []
    seen = set()
    for i in range(max(len(idx_es[0]), len(idx_en[0]))):
        for dists, idxs, lang in ((dist_es, idx_es, 'es'), (dist_en, idx_en, 'en')):
            if i >= len(idxs[0]):
                continue
            idx = idxs[0][i]
            if idx < 0 or idx in seen:  # -1 = FAISS padding
                continue
            results.append({'idx': idx,
                            'sim': 1.0 / (1.0 + dists[0][i]),
                            'lang': lang})
            seen.add(idx)
    return results


def analyze_case(query, k=5):
    """
    Main entry point used by app.py.

    Args:
        query: User query in Spanish (string).
        k: Number of chunks to return (int, default=5).

    Returns:
        dict with:
            - citations: str (academically formatted citation block)
            - metadata: dict with conflict_type ("ego"|"data"|"mixed"|"unknown"),
              confidence (0-100), num_sources, unique_tags, sources_raw,
              logical_works_used, categories_used, faiss_weights_applied,
              plus a duplicate "citations" entry kept for backward
              compatibility with app.py.
    """
    _init()
    print(f"\n{'='*80}")
    print(f"π QUERY: {query[:70]}...")

    # 1. Detect conflict type from Spanish keywords
    conflict_type, confidence = _detect_conflict(query)
    print(f"π Conflicto: {conflict_type.upper()} ({confidence}%)")

    # 2. Translate the query for the English half of the corpus
    print(f"π Traduciendo...")
    query_en = _translate(query)
    print(f"π EN: {query_en[:70]}...")

    # 3. Embed both variants (L2-normalized)
    print(f"π§ Vectorizando ES + EN...")
    emb_es = _model.encode([query], convert_to_numpy=True).astype('float32')
    emb_es = emb_es / np.linalg.norm(emb_es)
    emb_en = _model.encode([query_en], convert_to_numpy=True).astype('float32')
    emb_en = emb_en / np.linalg.norm(emb_en)

    # 4. Double FAISS search (30 candidates per language)
    print(f"π FAISS bilingüe (30+30)...")
    dist_es, idx_es = _index.search(emb_es, 30)
    dist_en, idx_en = _index.search(emb_en, 30)

    # 5. Merge without duplicates
    results = _merge_bilingual(dist_es, idx_es, dist_en, idx_en)
    print(f"π Combinados: {len(results)} únicos")

    # 6. Hybrid scoring: 50% semantic sim + 30% category weight + 20% tag boost
    weights = _config["category_weights"]
    # Relevant-tag set is invariant across results: hoisted out of the loop.
    relevant = {"ego", "cortesia"} if conflict_type == "ego" else {"datos", "negociacion"}
    for r in results:
        doc = _docs[r['idx']]
        cat = doc.get('category', '')
        w = weights.get(cat, {}).get(conflict_type, 1.0)
        tag_boost = len(set(doc.get('tags', [])) & relevant) * 0.2
        r['score'] = 0.5 * r['sim'] + 0.3 * w + 0.2 * min(tag_boost, 1.0)
        r['weight'] = w  # keep the applied weight for the citation block
    results.sort(key=lambda x: x['score'], reverse=True)

    # 7. Top K + debug dump
    top = results[:k]
    print(f"\nπ TOP {k}:")
    for i, r in enumerate(top, 1):
        doc = _docs[r['idx']]
        print(f"{i}. {r['score']:.3f} | {r['lang']} | {doc['category'][:20]:20s} | {doc['source'][:50]}")
    print("="*80 + "\n")

    # 8. Aggregate metadata over the selected chunks
    sources = []
    categories = []
    all_tags = []
    logical = []
    weights_applied = {}
    for r in top:
        doc = _docs[r['idx']]
        src = doc.get('source', 'unknown')
        sources.append(src)
        cat = doc.get('category', 'unknown')
        if cat not in categories:
            categories.append(cat)
        weights_applied[cat] = weights.get(cat, {}).get(conflict_type, 1.0)
        all_tags.extend(doc.get('tags', []))
        # Normalize the filename into a "logical work" identifier
        log = src.lower().replace('.epub', '').replace('.pdf', '').replace('.txt', '')
        log = log.replace(' ', '_').replace('-', '_').replace('(', '').replace(')', '')
        if log not in logical:
            logical.append(log)

    # ------------------------------------------------------------------
    # Academic citation block fed to the downstream LLM
    # ------------------------------------------------------------------
    parts = []
    parts.append("=" * 80)
    parts.append("π SISTEMA DE RETRIEVAL APLICADO")
    parts.append("=" * 80)
    parts.append(f"\nπ CONFLICTO DETECTADO: {conflict_type.upper()} (confidence: {confidence}%)")
    parts.append(f"π QUERY BILINGÜE: ES + EN")
    parts.append(f"π CHUNKS RECUPERADOS: {k} (de {_index.ntotal:,} vectores totales)")
    parts.append(f"\nβοΈ PESOS METODOLÓGICOS APLICADOS:")
    for cat, weight in sorted(weights_applied.items(), key=lambda x: x[1], reverse=True):
        parts.append(f"  • {cat}: {weight}x")
    parts.append(f"\nπ·οΈ TAGS RELEVANTES DETECTADOS: {', '.join(list(set(all_tags)))}")
    parts.append(f"\n{'='*80}")
    parts.append(f"π FUENTES ACADÉMICAS (Top {k} por scoring híbrido)")
    parts.append(f"{'='*80}\n")

    rule = "─" * 77
    for i, r in enumerate(top, 1):
        doc = _docs[r['idx']]
        src = doc.get('source', 'unknown')
        cat = doc.get('category', 'unknown')
        tags = ', '.join(doc.get('tags', []))
        content_type = doc.get('content_type', 'unknown')
        text = doc.get('text', '')
        parts.append(f"[FUENTE #{i}] {_author_for(src)}")
        parts.append(rule)
        parts.append(f"π Archivo: {src}")
        parts.append(f"π Categoría: {cat}")
        parts.append(f"π·οΈ Tags: {tags}")
        parts.append(f"π Tipo contenido: {content_type}")
        parts.append(f"π’ SCORING HÍBRIDO:")
        parts.append(f"  • Similitud semántica: {r['sim']:.3f}")
        parts.append(f"  • Peso metodológico: {r['weight']:.2f}x")
        parts.append(f"  • Score final: {r['score']:.3f}")
        parts.append(f"π Idioma query: {r['lang'].upper()}")
        parts.append(f"\nπ CONTENIDO:")
        parts.append(f"{text}\n")
        parts.append(f"{'='*80}\n")

    # Footer: instructions for the downstream LLM
    parts.append("π INSTRUCCIONES PARA ANÁLISIS:")
    parts.append(rule)
    parts.append("1. CITA EXPLÍCITAMENTE a los autores mencionados arriba")
    parts.append("2. PRIORIZA fuentes con mayor peso metodológico")
    parts.append("3. USA terminología técnica de las fuentes (ej: 'negative face', 'FTA')")
    parts.append("4. EXPLICA conceptos técnicos de forma accesible")
    parts.append("5. INTEGRA mínimo 2-3 fuentes en tu análisis")
    parts.append(f"{'='*80}")
    citations = "\n".join(parts)

    # Shape expected by app.py (the duplicated "citations" key is deliberate)
    return {
        "citations": citations,
        "metadata": {
            "conflict_type": conflict_type,
            "confidence": confidence,
            "num_sources": len(top),
            "unique_tags": list(set(all_tags)),
            "sources_raw": sources,
            "logical_works_used": logical[:5],
            "categories_used": categories,
            "faiss_weights_applied": weights_applied,
            "citations": citations
        }
    }