Spaces:
Sleeping
Sleeping
File size: 13,754 Bytes
d328280 2544a4a ba35d66 d328280 2288c06 47efbce 2288c06 42262f7 a55be36 47efbce 2288c06 5482a7e 72ddd68 dff3b47 42262f7 2288c06 42262f7 47efbce 42262f7 2288c06 42262f7 47efbce ba35d66 42262f7 a55be36 ba35d66 42262f7 72ddd68 ba35d66 42262f7 ba35d66 42262f7 ba35d66 42262f7 56eee01 42262f7 a55be36 42262f7 47efbce 42262f7 56eee01 42262f7 5482a7e 42262f7 2288c06 42262f7 f97dca9 42262f7 5482a7e 42262f7 5482a7e 42262f7 ba35d66 42262f7 5482a7e ba35d66 42262f7 5482a7e ba35d66 42262f7 5482a7e ba35d66 42262f7 5482a7e 42262f7 ba35d66 42262f7 5482a7e 42262f7 d328280 42262f7 a55be36 42262f7 dff3b47 42262f7 d328280 42262f7 dff3b47 2288c06 42262f7 f97dca9 42262f7 f97dca9 42262f7 2544a4a 42262f7 d328280 42262f7 d328280 42262f7 2544a4a a55be36 42262f7 d328280 42262f7 ef557bf 2544a4a ba35d66 2544a4a ba35d66 2544a4a ba35d66 2544a4a ba35d66 2544a4a dff3b47 2544a4a ba35d66 2544a4a dff3b47 2544a4a ba35d66 2544a4a ba35d66 2544a4a ba35d66 2544a4a ba35d66 2544a4a ba35d66 2544a4a ba35d66 2544a4a 42262f7 2544a4a dd0536b dff3b47 dd0536b 2544a4a dd0536b 42262f7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 
"""
QUOOTA AGENT V3 - BILINGUAL RETRIEVAL + ACADEMIC CITATIONS
Query ES → translate EN → search both → combine → response ES/EN
"""
import faiss
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import os
# ConfiguraciΓ³n
FAISS_PATH = "quoota_complete_cpu.faiss"
METADATA_PATH = "quoota_metadata_enriched.pkl"
CONFIG_PATH = "retrieval_config.pkl"
MODEL_NAME = "intfloat/multilingual-e5-large"
# Keywords - BILINGUAL (ES + EN) for conflict detection
DATA_KW = [
# Spanish
"porcentaje", "plazo", "nΓΊmero", "numero", "fecha", "euros", "dias",
"deadline", "cifra", "monto", "tiempo", "presupuesto",
# English
"deadline", "percentage", "number", "date", "budget", "timeline",
"metric", "kpi", "target", "quota", "figure", "report", "data"
]
EGO_KW = [
# Spanish
"ignorado", "interrumpe", "no valora", "menosprecia", "furioso",
"agotado", "siempre", "nunca", "constantemente", "excluye", "critica",
"menosprecio", "injusto", "invisible", "frustrado", "enojado",
# English
"ignored", "interrupts", "interrupt", "undervalues", "unfair",
"exhausted", "burned out", "always", "never", "constantly",
"excludes", "criticizes", "belittles", "invisible", "frustrated",
"angry", "disrespected", "unappreciated", "overlooked", "dismissed",
"credit", "takes credit", "my idea", "not recognized"
]
# Tag translation mapping (ES β EN)
TAG_TRANSLATIONS = {
"negociacion": "negotiation",
"liderazgo": "leadership",
"feedback": "feedback",
"ego": "ego",
"cortesia": "politeness",
"datos": "data",
"conflicto": "conflict",
"comunicacion": "communication",
"gestion": "management",
"equipo": "team",
"jerarquia": "hierarchy",
"autoridad": "authority",
"confianza": "trust",
"respeto": "respect",
"reconocimiento": "recognition",
"autonomia": "autonomy",
"presion": "pressure",
"estres": "stress",
"motivacion": "motivation",
"decision": "decision",
"cambio": "change",
"cultura": "culture",
"valores": "values",
"etica": "ethics",
"responsabilidad": "responsibility",
"rendicion_cuentas": "accountability",
"colaboracion": "collaboration",
"competencia": "competition",
"poder": "power",
"influencia": "influence"
}
def _translate_tags(tags_es):
"""Translate Spanish tags to English"""
return [TAG_TRANSLATIONS.get(tag.lower(), tag) for tag in tags_es]
# Singleton
_model = None
_index = None
_docs = None
_config = None
def _init():
"""Inicializa recursos una sola vez"""
global _model, _index, _docs, _config
if _model is not None:
return
print("\nπ QUOOTA AGENT V3 - BILINGUAL")
print("π§ Loading model...")
_model = SentenceTransformer(MODEL_NAME)
print("π Loading FAISS...")
_index = faiss.read_index(FAISS_PATH)
print("π Loading metadata...")
with open(METADATA_PATH, 'rb') as f:
_docs = pickle.load(f)
print("βοΈ Loading config...")
with open(CONFIG_PATH, 'rb') as f:
_config = pickle.load(f)
print(f"β
Ready: {_index.ntotal:,} vectors\n")
def _translate(text_es):
"""Traduce ES β EN con LLM (contexto LABORAL)"""
try:
from huggingface_hub import InferenceClient
token = os.getenv("HF_TOKEN")
if not token:
return text_es
client = InferenceClient(token=token)
msgs = [
{"role": "system", "content": "Translate this Spanish text about a WORKPLACE conflict to English. Only the translation, nothing else. Use workplace terminology: 'coworker' not 'roommate', 'manager' not 'landlord', etc."},
{"role": "user", "content": text_es}
]
resp = ""
for m in client.chat_completion(
messages=msgs,
model="meta-llama/Llama-3.3-70B-Instruct",
max_tokens=300,
temperature=0.1,
stream=True
):
if m.choices and m.choices[0].delta.content:
resp += m.choices[0].delta.content
return resp.strip()
except:
return text_es
def _detect_conflict(query):
"""Detecta tipo conflicto (bilingual keywords)"""
q = query.lower()
data = sum(1 for k in DATA_KW if k in q)
ego = sum(1 for k in EGO_KW if k in q)
if ego > data:
return "ego", min(ego * 20, 100)
elif data > ego:
return "data", min(data * 20, 100)
elif ego > 0 and data > 0:
return "mixed", min((ego + data) * 15, 100)
else:
return "unknown", 20
def analyze_case(query, k=5):
"""
FunciΓ³n principal para app.py
Args:
query: Consulta del usuario (string)
k: NΓΊmero de chunks a retornar (int, default=5)
Returns:
dict con:
- conflict_type: str ("ego"|"data"|"mixed"|"unknown")
- confidence: int (0-100)
- num_sources: int
- unique_tags: list[str]
- sources_raw: list[str]
- logical_works_used: list[str]
- categories_used: list[str]
- faiss_weights_applied: dict
- citations: str (texto formateado ACADΓMICO)
"""
_init()
print(f"\n{'='*80}")
print(f"π QUERY: {query[:70]}...")
# 1. Detect conflict on ORIGINAL query (bilingual keywords)
conflict_type, confidence = _detect_conflict(query)
print(f"π Conflict: {conflict_type.upper()} ({confidence}%)")
# 2. Translate
print(f"π Translating...")
query_en = _translate(query)
print(f"π EN: {query_en[:70]}...")
# 3. Also detect on translated query for better accuracy
conflict_type_en, confidence_en = _detect_conflict(query_en)
if confidence_en > confidence:
conflict_type = conflict_type_en
confidence = confidence_en
print(f"π Conflict (EN): {conflict_type.upper()} ({confidence}%)")
# 4. Vectorize both
print(f"π§ Vectorizing ES + EN...")
emb_es = _model.encode([query], convert_to_numpy=True).astype('float32')
emb_es = emb_es / np.linalg.norm(emb_es)
emb_en = _model.encode([query_en], convert_to_numpy=True).astype('float32')
emb_en = emb_en / np.linalg.norm(emb_en)
# 5. Bilingual FAISS search
print(f"π Bilingual FAISS (30+30)...")
dist_es, idx_es = _index.search(emb_es, 30)
dist_en, idx_en = _index.search(emb_en, 30)
# 6. Combine (no duplicates)
results = []
seen = set()
for i in range(30):
if i < len(idx_es[0]):
idx = idx_es[0][i]
if idx not in seen:
results.append({
'idx': idx,
'sim': 1.0 / (1.0 + dist_es[0][i]),
'lang': 'es'
})
seen.add(idx)
if i < len(idx_en[0]):
idx = idx_en[0][i]
if idx not in seen:
results.append({
'idx': idx,
'sim': 1.0 / (1.0 + dist_en[0][i]),
'lang': 'en'
})
seen.add(idx)
print(f"π Combined: {len(results)} unique")
# 7. Hybrid scoring
weights = _config["category_weights"]
for r in results:
doc = _docs[r['idx']]
cat = doc.get('category', '')
w = weights.get(cat, {}).get(conflict_type, 1.0)
tags = doc.get('tags', [])
relevant = ["ego", "cortesia", "politeness"] if conflict_type == "ego" else ["datos", "negociacion", "negotiation", "data"]
tag_boost = len(set(tags) & set(relevant)) * 0.2
r['score'] = 0.5 * r['sim'] + 0.3 * w + 0.2 * min(tag_boost, 1.0)
r['weight'] = w
results.sort(key=lambda x: x['score'], reverse=True)
top = results[:k]
print(f"\nπ TOP {k}:")
for i, r in enumerate(top, 1):
doc = _docs[r['idx']]
print(f"{i}. {r['score']:.3f} | {r['lang']} | {doc['category'][:20]:20s} | {doc['source'][:50]}")
print("="*80 + "\n")
# 8. Metadata
sources = []
categories = []
all_tags = []
logical = []
weights_applied = {}
for r in top:
doc = _docs[r['idx']]
src = doc.get('source', 'unknown')
sources.append(src)
cat = doc.get('category', 'unknown')
if cat not in categories:
categories.append(cat)
weights_applied[cat] = weights.get(cat, {}).get(conflict_type, 1.0)
all_tags.extend(doc.get('tags', []))
log = src.lower().replace('.epub', '').replace('.pdf', '').replace('.txt', '')
log = log.replace(' ', '_').replace('-', '_').replace('(', '').replace(')', '')
if log not in logical:
logical.append(log)
# ============================================================================
# CITATIONS ACADΓMICAS MEJORADAS (ENGLISH LABELS)
# ============================================================================
citations_parts = []
citations_parts.append("="*80)
citations_parts.append("π RETRIEVAL SYSTEM APPLIED")
citations_parts.append("="*80)
citations_parts.append(f"\nπ CONFLICT DETECTED: {conflict_type.upper()} (confidence: {confidence}%)")
citations_parts.append(f"π QUERY LANGUAGE: ES + EN (Bilingual)")
citations_parts.append(f"π CHUNKS RETRIEVED: {k} (of {_index.ntotal:,} total vectors)")
citations_parts.append(f"\nβοΈ METHODOLOGICAL WEIGHTS APPLIED:")
for cat, weight in sorted(weights_applied.items(), key=lambda x: x[1], reverse=True):
citations_parts.append(f" β’ {cat}: {weight}x")
# Translate tags for display
translated_tags = _translate_tags(list(set(all_tags)))
citations_parts.append(f"\nπ·οΈ RELEVANT TAGS DETECTED: {', '.join(translated_tags)}")
citations_parts.append(f"\n{'='*80}")
citations_parts.append(f"π ACADEMIC SOURCES (Top {k} by hybrid scoring)")
citations_parts.append(f"{'='*80}\n")
for i, r in enumerate(top, 1):
doc = _docs[r['idx']]
src = doc.get('source', 'unknown')
cat = doc.get('category', 'unknown')
# Translate tags for each source
tags = ', '.join(_translate_tags(doc.get('tags', [])))
content_type = doc.get('content_type', 'unknown')
text = doc.get('text', '')
author = "Unknown Author"
if "Brown" in src or "Levinson" in src:
author = "Brown & Levinson"
elif "Patterson" in src:
author = "Patterson et al."
elif "Stone" in src:
author = "Stone, Patton & Heen"
elif "Calsamiglia" in src:
author = "Helena Calsamiglia"
elif "Fisher" in src or "Ury" in src:
author = "Fisher, Ury & Patton"
elif "Covey" in src:
author = "Stephen Covey"
elif "Carnegie" in src:
author = "Dale Carnegie"
elif "Sinek" in src:
author = "Simon Sinek"
elif "Rosenberg" in src:
author = "Marshall Rosenberg"
elif "Lencioni" in src:
author = "Patrick Lencioni"
citations_parts.append(f"[SOURCE #{i}] {author}")
citations_parts.append(f"βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ")
citations_parts.append(f"π File: {src}")
citations_parts.append(f"π Category: {cat}")
citations_parts.append(f"π·οΈ Tags: {tags}")
citations_parts.append(f"π Content Type: {content_type}")
citations_parts.append(f"π’ HYBRID SCORING:")
citations_parts.append(f" β’ Semantic Similarity: {r['sim']:.3f}")
citations_parts.append(f" β’ Methodological Weight: {r['weight']:.2f}x")
citations_parts.append(f" β’ Final Score: {r['score']:.3f}")
citations_parts.append(f"π Query Language: {r['lang'].upper()}")
citations_parts.append(f"\nπ CONTENT:")
citations_parts.append(f"{text}\n")
citations_parts.append(f"{'='*80}\n")
citations_parts.append("π INSTRUCTIONS FOR ANALYSIS:")
citations_parts.append("βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ")
citations_parts.append("1. CITE authors mentioned above EXPLICITLY")
citations_parts.append("2. PRIORITIZE sources with higher methodological weight")
citations_parts.append("3. USE technical terminology from sources (e.g., 'negative face', 'FTA')")
citations_parts.append("4. EXPLAIN technical concepts in accessible language")
citations_parts.append("5. INTEGRATE minimum 2-3 sources in your analysis")
citations_parts.append(f"{'='*80}")
citations = "\n".join(citations_parts)
return {
"citations": citations,
"metadata": {
"conflict_type": conflict_type,
"confidence": confidence,
"num_sources": len(top),
"unique_tags": translated_tags,
"sources_raw": sources,
"logical_works_used": logical[:5],
"categories_used": categories,
"faiss_weights_applied": weights_applied,
"citations": citations
}
} |