# Quoota_RAG / agent.py — revision 2544a4a (verified), uploaded by Malaji71
"""
QUOOTA AGENT V3 - BILINGUAL RETRIEVAL + ACADEMIC CITATIONS
Query ES β†’ traduce EN β†’ busca en ambos β†’ combina β†’ respuesta ES
"""
import faiss
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import os
# Configuration: on-disk retrieval artifacts and the embedding model name.
FAISS_PATH = "quoota_complete_cpu.faiss"
METADATA_PATH = "quoota_metadata_enriched.pkl"
CONFIG_PATH = "retrieval_config.pkl"
MODEL_NAME = "intfloat/multilingual-e5-large"
# Spanish keyword lists used to classify a query as data-driven vs ego-driven.
DATA_KW = ["porcentaje", "plazo", "nΓΊmero", "numero", "fecha", "euros", "dias"]
EGO_KW = ["ignorado", "interrumpe", "no valora", "menosprecia", "furioso",
"agotado", "siempre", "nunca", "constantemente"]
# Lazily-initialized module-level singletons, populated once by _init().
_model = None   # SentenceTransformer instance
_index = None   # FAISS index
_docs = None    # sequence of chunk metadata dicts
_config = None  # retrieval config dict (must contain "category_weights")
def _init():
    """Load the embedding model, FAISS index, metadata and config exactly once.

    Subsequent calls return immediately: all resources are cached in the
    module-level singletons (_model, _index, _docs, _config).
    """
    global _model, _index, _docs, _config
    if _model is not None:
        return  # already initialized — nothing to do
    print("\nπŸš€ QUOOTA AGENT V3 - BILINGUAL")
    print("🧠 Cargando modelo...")
    _model = SentenceTransformer(MODEL_NAME)
    print("πŸ“Š Cargando FAISS...")
    _index = faiss.read_index(FAISS_PATH)
    print("πŸ“š Cargando metadata...")
    with open(METADATA_PATH, 'rb') as handle:
        _docs = pickle.load(handle)
    print("βš™οΈ Cargando config...")
    with open(CONFIG_PATH, 'rb') as handle:
        _config = pickle.load(handle)
    print(f"βœ… Listo: {_index.ntotal:,} vectores\n")
def _translate(text_es):
"""Traduce ES β†’ EN con LLM (contexto LABORAL)"""
try:
from huggingface_hub import InferenceClient
token = os.getenv("HF_TOKEN")
if not token:
return text_es
client = InferenceClient(token=token)
msgs = [
{"role": "system", "content": "Translate this Spanish text about a WORKPLACE conflict to English. Only the translation, nothing else. Use workplace terminology: 'coworker' not 'roommate', 'manager' not 'landlord', etc."},
{"role": "user", "content": text_es}
]
resp = ""
for m in client.chat_completion(
messages=msgs,
model="meta-llama/Llama-3.3-70B-Instruct",
max_tokens=300,
temperature=0.1,
stream=True
):
if m.choices and m.choices[0].delta.content:
resp += m.choices[0].delta.content
return resp.strip()
except:
return text_es
def _detect_conflict(query):
"""Detecta tipo conflicto"""
q = query.lower()
data = sum(1 for k in DATA_KW if k in q)
ego = sum(1 for k in EGO_KW if k in q)
if ego > data:
return "ego", min(ego * 20, 100)
elif data > ego:
return "data", min(data * 20, 100)
elif ego > 0 and data > 0:
return "mixed", min((ego + data) * 15, 100)
else:
return "unknown", 20
def analyze_case(query, k=5):
    """Run the full bilingual retrieval pipeline for a user query.

    Pipeline: detect conflict type → translate the query ES→EN (best-effort)
    → embed both variants → dual FAISS search → merge/deduplicate hits →
    hybrid scoring (semantic similarity + category weight + tag boost) →
    assemble an academically formatted citations block for the downstream LLM.

    Args:
        query: User query (Spanish string).
        k: Number of top chunks to keep (int, default=5).

    Returns:
        dict with two keys:
            - "citations": str — formatted academic citations text.
            - "metadata": dict containing:
                conflict_type: str ("ego"|"data"|"mixed"|"unknown")
                confidence: int (0-100)
                num_sources: int
                unique_tags: list[str]
                sources_raw: list[str]
                logical_works_used: list[str] (at most 5)
                categories_used: list[str]
                faiss_weights_applied: dict
                citations: str (same text as the top-level "citations")
    """
    _init()
    print(f"\n{'='*80}")
    print(f"πŸ” QUERY: {query[:70]}...")
    # 1. Detect the conflict type from keyword hits
    conflict_type, confidence = _detect_conflict(query)
    print(f"πŸ“Š Conflicto: {conflict_type.upper()} ({confidence}%)")
    # 2. Translate the query to English (falls back to Spanish on failure)
    print(f"🌐 Traduciendo...")
    query_en = _translate(query)
    print(f"πŸ“ EN: {query_en[:70]}...")
    # 3. Embed both language variants and L2-normalize each vector
    print(f"🧠 Vectorizando ES + EN...")
    emb_es = _model.encode([query], convert_to_numpy=True).astype('float32')
    emb_es = emb_es / np.linalg.norm(emb_es)
    emb_en = _model.encode([query_en], convert_to_numpy=True).astype('float32')
    emb_en = emb_en / np.linalg.norm(emb_en)
    # 4. Dual FAISS search: 30 hits per language
    print(f"πŸ“š FAISS bilingΓΌe (30+30)...")
    dist_es, idx_es = _index.search(emb_es, 30)
    dist_en, idx_en = _index.search(emb_en, 30)
    # 5. Interleave ES/EN hits rank by rank, dropping duplicate indices
    results = []
    seen = set()
    for i in range(30):
        # ES hit at rank i
        if i < len(idx_es[0]):
            idx = idx_es[0][i]
            if idx not in seen:
                results.append({
                    'idx': idx,
                    # distance → similarity in (0, 1]
                    'sim': 1.0 / (1.0 + dist_es[0][i]),
                    'lang': 'es'
                })
                seen.add(idx)
        # EN hit at rank i
        if i < len(idx_en[0]):
            idx = idx_en[0][i]
            if idx not in seen:
                results.append({
                    'idx': idx,
                    'sim': 1.0 / (1.0 + dist_en[0][i]),
                    'lang': 'en'
                })
                seen.add(idx)
    print(f"πŸ”€ Combinados: {len(results)} ΓΊnicos")
    # 6. Hybrid scoring: 0.5*similarity + 0.3*category weight + 0.2*tag boost
    weights = _config["category_weights"]
    for r in results:
        doc = _docs[r['idx']]
        cat = doc.get('category', '')
        # Per-category weight for the detected conflict type (1.0 if absent)
        w = weights.get(cat, {}).get(conflict_type, 1.0)
        tags = doc.get('tags', [])
        relevant = ["ego", "cortesia"] if conflict_type == "ego" else ["datos", "negociacion"]
        # 0.2 per matching relevant tag, capped at 1.0 below
        tag_boost = len(set(tags) & set(relevant)) * 0.2
        r['score'] = 0.5 * r['sim'] + 0.3 * w + 0.2 * min(tag_boost, 1.0)
        r['weight'] = w  # keep the applied weight for the citations block
    # Sort best-first by hybrid score
    results.sort(key=lambda x: x['score'], reverse=True)
    # 7. Keep the top-k chunks
    top = results[:k]
    # Debug dump of the winners
    print(f"\nπŸ“Š TOP {k}:")
    for i, r in enumerate(top, 1):
        doc = _docs[r['idx']]
        print(f"{i}. {r['score']:.3f} | {r['lang']} | {doc['category'][:20]:20s} | {doc['source'][:50]}")
    print("="*80 + "\n")
    # 8. Collect metadata for the response payload
    sources = []
    categories = []
    all_tags = []
    logical = []
    weights_applied = {}
    for r in top:
        doc = _docs[r['idx']]
        src = doc.get('source', 'unknown')
        sources.append(src)
        cat = doc.get('category', 'unknown')
        if cat not in categories:
            categories.append(cat)
        # Record the category weight that was actually applied
        weights_applied[cat] = weights.get(cat, {}).get(conflict_type, 1.0)
        all_tags.extend(doc.get('tags', []))
        # Normalize the source filename into a deduplicated "logical work" slug
        log = src.lower().replace('.epub', '').replace('.pdf', '').replace('.txt', '')
        log = log.replace(' ', '_').replace('-', '_').replace('(', '').replace(')', '')
        if log not in logical:
            logical.append(log)
    # ============================================================================
    # Improved academic citations block
    # ============================================================================
    citations_parts = []
    # Header: summary of the retrieval system settings applied
    citations_parts.append("="*80)
    citations_parts.append("πŸ“Š SISTEMA DE RETRIEVAL APLICADO")
    citations_parts.append("="*80)
    citations_parts.append(f"\nπŸ” CONFLICTO DETECTADO: {conflict_type.upper()} (confidence: {confidence}%)")
    citations_parts.append(f"🌐 QUERY BILINGÜE: ES + EN")
    citations_parts.append(f"πŸ“š CHUNKS RECUPERADOS: {k} (de {_index.ntotal:,} vectores totales)")
    citations_parts.append(f"\nβš–οΈ PESOS METODOLΓ“GICOS APLICADOS:")
    for cat, weight in sorted(weights_applied.items(), key=lambda x: x[1], reverse=True):
        citations_parts.append(f" β€’ {cat}: {weight}x")
    citations_parts.append(f"\n🏷️ TAGS RELEVANTES DETECTADOS: {', '.join(list(set(all_tags)))}")
    citations_parts.append(f"\n{'='*80}")
    citations_parts.append(f"πŸ“– FUENTES ACADΓ‰MICAS (Top {k} por scoring hΓ­brido)")
    citations_parts.append(f"{'='*80}\n")
    # One academically formatted entry per retrieved source
    for i, r in enumerate(top, 1):
        doc = _docs[r['idx']]
        src = doc.get('source', 'unknown')
        cat = doc.get('category', 'unknown')
        tags = ', '.join(doc.get('tags', []))
        content_type = doc.get('content_type', 'unknown')
        text = doc.get('text', '')
        # Heuristic author attribution from known substrings of the filename
        author = "Autor desconocido"
        if "Brown" in src or "Levinson" in src:
            author = "Brown & Levinson"
        elif "Patterson" in src:
            author = "Patterson et al."
        elif "Stone" in src:
            author = "Stone, Patton & Heen"
        elif "Calsamiglia" in src:
            author = "Helena Calsamiglia"
        elif "Fisher" in src or "Ury" in src:
            author = "Fisher, Ury & Patton"
        elif "Covey" in src:
            author = "Stephen Covey"
        elif "Carnegie" in src:
            author = "Dale Carnegie"
        elif "Sinek" in src:
            author = "Simon Sinek"
        elif "Rosenberg" in src:
            author = "Marshall Rosenberg"
        elif "Lencioni" in src:
            author = "Patrick Lencioni"
        citations_parts.append(f"[FUENTE #{i}] {author}")
        citations_parts.append(f"─────────────────────────────────────────────────────────────────────────────")
        citations_parts.append(f"πŸ“ Archivo: {src}")
        citations_parts.append(f"πŸ“‚ CategorΓ­a: {cat}")
        citations_parts.append(f"🏷️ Tags: {tags}")
        citations_parts.append(f"πŸ“Š Tipo contenido: {content_type}")
        citations_parts.append(f"πŸ”’ SCORING HÍBRIDO:")
        citations_parts.append(f" β€’ Similitud semΓ‘ntica: {r['sim']:.3f}")
        citations_parts.append(f" β€’ Peso metodolΓ³gico: {r['weight']:.2f}x")
        citations_parts.append(f" β€’ Score final: {r['score']:.3f}")
        citations_parts.append(f"🌐 Idioma query: {r['lang'].upper()}")
        citations_parts.append(f"\nπŸ“ CONTENIDO:")
        citations_parts.append(f"{text}\n")
        citations_parts.append(f"{'='*80}\n")
    # Footer: analysis instructions addressed to the downstream LLM
    citations_parts.append("πŸ“Œ INSTRUCCIONES PARA ANÁLISIS:")
    citations_parts.append("─────────────────────────────────────────────────────────────────────────────")
    citations_parts.append("1. CITA EXPLÍCITAMENTE a los autores mencionados arriba")
    citations_parts.append("2. PRIORIZA fuentes con mayor peso metodolΓ³gico")
    citations_parts.append("3. USA terminologΓ­a tΓ©cnica de las fuentes (ej: 'negative face', 'FTA')")
    citations_parts.append("4. EXPLICA conceptos tΓ©cnicos de forma accesible")
    citations_parts.append("5. INTEGRA mΓ­nimo 2-3 fuentes en tu anΓ‘lisis")
    citations_parts.append(f"{'='*80}")
    citations = "\n".join(citations_parts)
    # Return in the shape expected by app.py: citations string plus a
    # metadata dict (which also duplicates the citations text).
    return {
        "citations": citations,
        "metadata": {
            "conflict_type": conflict_type,
            "confidence": confidence,
            "num_sources": len(top),
            "unique_tags": list(set(all_tags)),
            "sources_raw": sources,
            "logical_works_used": logical[:5],
            "categories_used": categories,
            "faiss_weights_applied": weights_applied,
            "citations": citations
        }
    }