File size: 13,754 Bytes
d328280
2544a4a
ba35d66
d328280
 
2288c06
 
 
 
47efbce
2288c06
42262f7
a55be36
47efbce
 
 
2288c06
5482a7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72ddd68
dff3b47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42262f7
 
 
 
 
2288c06
42262f7
 
 
47efbce
42262f7
 
2288c06
42262f7
47efbce
ba35d66
42262f7
a55be36
ba35d66
42262f7
72ddd68
ba35d66
42262f7
 
 
ba35d66
42262f7
 
 
ba35d66
42262f7
 
56eee01
42262f7
 
a55be36
42262f7
 
 
47efbce
42262f7
 
 
56eee01
42262f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5482a7e
42262f7
 
 
 
2288c06
42262f7
 
 
 
 
 
 
 
 
f97dca9
42262f7
 
5482a7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42262f7
 
 
 
 
 
5482a7e
42262f7
ba35d66
42262f7
5482a7e
ba35d66
42262f7
 
 
5482a7e
 
 
 
 
 
 
 
ba35d66
42262f7
 
 
 
 
 
5482a7e
ba35d66
42262f7
 
 
5482a7e
42262f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba35d66
42262f7
5482a7e
42262f7
 
 
 
 
d328280
42262f7
a55be36
42262f7
dff3b47
42262f7
d328280
42262f7
dff3b47
2288c06
42262f7
 
f97dca9
42262f7
f97dca9
42262f7
 
 
 
 
 
 
 
 
 
2544a4a
42262f7
 
 
d328280
42262f7
 
d328280
42262f7
 
 
2544a4a
a55be36
42262f7
d328280
42262f7
 
 
 
ef557bf
2544a4a
ba35d66
2544a4a
 
 
 
 
ba35d66
2544a4a
ba35d66
 
 
2544a4a
ba35d66
2544a4a
 
 
dff3b47
 
 
2544a4a
 
ba35d66
2544a4a
 
 
 
 
 
 
dff3b47
 
2544a4a
 
 
ba35d66
2544a4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba35d66
2544a4a
ba35d66
 
2544a4a
ba35d66
 
 
 
 
 
 
2544a4a
 
 
ba35d66
2544a4a
ba35d66
 
 
 
 
2544a4a
 
 
42262f7
 
2544a4a
dd0536b
 
 
 
dff3b47
dd0536b
 
 
2544a4a
 
dd0536b
42262f7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
"""
QUOOTA AGENT V3 - BILINGUAL RETRIEVAL + ACADEMIC CITATIONS
Query ES β†’ translate EN β†’ search both β†’ combine β†’ response ES/EN
"""

import faiss
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import os

# Configuration: on-disk artifacts and the embedding model identifier.
FAISS_PATH = "quoota_complete_cpu.faiss"       # FAISS vector index (CPU build)
METADATA_PATH = "quoota_metadata_enriched.pkl"  # pickled chunk metadata (list of dicts)
CONFIG_PATH = "retrieval_config.pkl"            # pickled retrieval config (category weights)
MODEL_NAME = "intfloat/multilingual-e5-large"   # multilingual embedding model (ES + EN)

# Keywords - BILINGUAL (ES + EN) for conflict detection.
# Matching in _detect_conflict is substring-based and each matching entry
# adds one point, so entries must be unique: a duplicate keyword would
# double-count a single mention and inflate the confidence score.
DATA_KW = [
    # Spanish
    "porcentaje", "plazo", "nΓΊmero", "numero", "fecha", "euros", "dias",
    "cifra", "monto", "tiempo", "presupuesto",
    # English
    # FIX: "deadline" was previously listed here AND in the Spanish section,
    # so one mention counted twice (+40 instead of +20 confidence).
    "deadline", "percentage", "number", "date", "budget", "timeline",
    "metric", "kpi", "target", "quota", "figure", "report", "data"
]

EGO_KW = [
    # Spanish
    "ignorado", "interrumpe", "no valora", "menosprecia", "furioso", 
    "agotado", "siempre", "nunca", "constantemente", "excluye", "critica",
    "menosprecio", "injusto", "invisible", "frustrado", "enojado",
    # English
    # NOTE(review): "interrupt" is a substring of "interrupts", and "credit"
    # of "takes credit", so one mention of the longer form scores twice.
    # Left as-is in case the extra weight is intentional — confirm with owner.
    "ignored", "interrupts", "interrupt", "undervalues", "unfair", 
    "exhausted", "burned out", "always", "never", "constantly",
    "excludes", "criticizes", "belittles", "invisible", "frustrated",
    "angry", "disrespected", "unappreciated", "overlooked", "dismissed",
    "credit", "takes credit", "my idea", "not recognized"
]

# Tag translation mapping (ES β†’ EN)
TAG_TRANSLATIONS = {
    "negociacion": "negotiation",
    "liderazgo": "leadership",
    "feedback": "feedback",
    "ego": "ego",
    "cortesia": "politeness",
    "datos": "data",
    "conflicto": "conflict",
    "comunicacion": "communication",
    "gestion": "management",
    "equipo": "team",
    "jerarquia": "hierarchy",
    "autoridad": "authority",
    "confianza": "trust",
    "respeto": "respect",
    "reconocimiento": "recognition",
    "autonomia": "autonomy",
    "presion": "pressure",
    "estres": "stress",
    "motivacion": "motivation",
    "decision": "decision",
    "cambio": "change",
    "cultura": "culture",
    "valores": "values",
    "etica": "ethics",
    "responsabilidad": "responsibility",
    "rendicion_cuentas": "accountability",
    "colaboracion": "collaboration",
    "competencia": "competition",
    "poder": "power",
    "influencia": "influence"
}

def _translate_tags(tags_es):
    """Translate Spanish tags to English"""
    return [TAG_TRANSLATIONS.get(tag.lower(), tag) for tag in tags_es]

# Lazily-initialized module-level singletons, populated once by _init().
_model = None   # SentenceTransformer embedding model
_index = None   # FAISS index loaded from FAISS_PATH
_docs = None    # chunk metadata loaded from METADATA_PATH (indexed by FAISS id)
_config = None  # retrieval config loaded from CONFIG_PATH (holds "category_weights")

def _init():
    """Load model, FAISS index, metadata and config once (module singletons).

    Idempotent: returns immediately on subsequent calls, keyed off _model
    already being set.
    """
    global _model, _index, _docs, _config
    
    if _model is not None:
        return
    
    print("\nπŸš€ QUOOTA AGENT V3 - BILINGUAL")
    
    print("🧠 Loading model...")
    # Single multilingual model encodes both the ES and EN query variants.
    _model = SentenceTransformer(MODEL_NAME)
    
    print("πŸ“Š Loading FAISS...")
    _index = faiss.read_index(FAISS_PATH)
    
    print("πŸ“š Loading metadata...")
    with open(METADATA_PATH, 'rb') as f:
        _docs = pickle.load(f)
    
    print("βš™οΈ  Loading config...")
    with open(CONFIG_PATH, 'rb') as f:
        _config = pickle.load(f)
    
    print(f"βœ… Ready: {_index.ntotal:,} vectors\n")

def _translate(text_es):
    """Traduce ES β†’ EN con LLM (contexto LABORAL)"""
    try:
        from huggingface_hub import InferenceClient
        
        token = os.getenv("HF_TOKEN")
        if not token:
            return text_es
        
        client = InferenceClient(token=token)
        
        msgs = [
            {"role": "system", "content": "Translate this Spanish text about a WORKPLACE conflict to English. Only the translation, nothing else. Use workplace terminology: 'coworker' not 'roommate', 'manager' not 'landlord', etc."},
            {"role": "user", "content": text_es}
        ]
        
        resp = ""
        for m in client.chat_completion(
            messages=msgs,
            model="meta-llama/Llama-3.3-70B-Instruct",
            max_tokens=300,
            temperature=0.1,
            stream=True
        ):
            if m.choices and m.choices[0].delta.content:
                resp += m.choices[0].delta.content
        
        return resp.strip()
    
    except:
        return text_es

def _detect_conflict(query):
    """Classify the conflict type via bilingual keyword substring counts.

    Returns:
        (conflict_type, confidence) where conflict_type is one of
        "ego" | "data" | "mixed" | "unknown" and confidence is 0-100.
    """
    lowered = query.lower()
    
    # Substring matches against each keyword list (booleans sum as 0/1).
    data_hits = sum(kw in lowered for kw in DATA_KW)
    ego_hits = sum(kw in lowered for kw in EGO_KW)
    
    if ego_hits > data_hits:
        return "ego", min(ego_hits * 20, 100)
    if data_hits > ego_hits:
        return "data", min(data_hits * 20, 100)
    if ego_hits and data_hits:
        # Exact tie with hits on both sides -> mixed conflict.
        return "mixed", min((ego_hits + data_hits) * 15, 100)
    return "unknown", 20

def analyze_case(query, k=5):
    """
    Main entry point (called from app.py).
    
    Pipeline: detect conflict type on the ES query, translate to EN,
    re-detect on the EN text, embed both, run a bilingual FAISS search,
    merge and hybrid-score the hits, then build an academic-citation block.
    
    Args:
        query: User query (string, Spanish expected)
        k: Number of chunks to return (int, default=5)
    
    Returns:
        dict with two keys:
            - citations: str (formatted academic citation text)
            - metadata: dict with:
                - conflict_type: str ("ego"|"data"|"mixed"|"unknown")
                - confidence: int (0-100)
                - num_sources: int
                - unique_tags: list[str] (English tag names)
                - sources_raw: list[str]
                - logical_works_used: list[str] (at most 5)
                - categories_used: list[str]
                - faiss_weights_applied: dict
                - citations: str (same text as the top-level key)
    """
    _init()
    
    print(f"\n{'='*80}")
    print(f"πŸ” QUERY: {query[:70]}...")
    
    # 1. Detect conflict on ORIGINAL query (bilingual keywords)
    conflict_type, confidence = _detect_conflict(query)
    print(f"πŸ“Š Conflict: {conflict_type.upper()} ({confidence}%)")
    
    # 2. Translate
    print(f"🌐 Translating...")
    query_en = _translate(query)
    print(f"πŸ“ EN: {query_en[:70]}...")
    
    # 3. Also detect on translated query; keep whichever detection is
    #    more confident.
    conflict_type_en, confidence_en = _detect_conflict(query_en)
    if confidence_en > confidence:
        conflict_type = conflict_type_en
        confidence = confidence_en
        print(f"πŸ“Š Conflict (EN): {conflict_type.upper()} ({confidence}%)")
    
    # 4. Vectorize both (L2-normalized embeddings)
    print(f"🧠 Vectorizing ES + EN...")
    emb_es = _model.encode([query], convert_to_numpy=True).astype('float32')
    emb_es = emb_es / np.linalg.norm(emb_es)
    
    emb_en = _model.encode([query_en], convert_to_numpy=True).astype('float32')
    emb_en = emb_en / np.linalg.norm(emb_en)
    
    # 5. Bilingual FAISS search: 30 nearest neighbors per language
    print(f"πŸ“š Bilingual FAISS (30+30)...")
    dist_es, idx_es = _index.search(emb_es, 30)
    dist_en, idx_en = _index.search(emb_en, 30)
    
    # 6. Combine, interleaving ES and EN hits rank-by-rank so the first
    #    occurrence of a duplicate index wins (no duplicates kept).
    results = []
    seen = set()
    
    for i in range(30):
        if i < len(idx_es[0]):
            idx = idx_es[0][i]
            if idx not in seen:
                results.append({
                    'idx': idx,
                    # Distance -> similarity in (0, 1]; smaller distance = higher sim.
                    'sim': 1.0 / (1.0 + dist_es[0][i]),
                    'lang': 'es'
                })
                seen.add(idx)
        
        if i < len(idx_en[0]):
            idx = idx_en[0][i]
            if idx not in seen:
                results.append({
                    'idx': idx,
                    'sim': 1.0 / (1.0 + dist_en[0][i]),
                    'lang': 'en'
                })
                seen.add(idx)
    
    print(f"πŸ”€ Combined: {len(results)} unique")
    
    # 7. Hybrid scoring: 0.5*similarity + 0.3*category weight + 0.2*tag boost
    weights = _config["category_weights"]
    
    for r in results:
        doc = _docs[r['idx']]
        cat = doc.get('category', '')
        
        # Per-category weight for the detected conflict type (default 1.0).
        w = weights.get(cat, {}).get(conflict_type, 1.0)
        
        # Boost docs whose tags match the conflict type (+0.2 per match, capped).
        tags = doc.get('tags', [])
        relevant = ["ego", "cortesia", "politeness"] if conflict_type == "ego" else ["datos", "negociacion", "negotiation", "data"]
        tag_boost = len(set(tags) & set(relevant)) * 0.2
        
        r['score'] = 0.5 * r['sim'] + 0.3 * w + 0.2 * min(tag_boost, 1.0)
        r['weight'] = w
    
    results.sort(key=lambda x: x['score'], reverse=True)
    
    top = results[:k]
    
    print(f"\nπŸ“Š TOP {k}:")
    for i, r in enumerate(top, 1):
        doc = _docs[r['idx']]
        print(f"{i}. {r['score']:.3f} | {r['lang']} | {doc['category'][:20]:20s} | {doc['source'][:50]}")
    print("="*80 + "\n")
    
    # 8. Collect metadata from the top-k hits
    sources = []
    categories = []
    all_tags = []
    logical = []
    weights_applied = {}
    
    for r in top:
        doc = _docs[r['idx']]
        
        src = doc.get('source', 'unknown')
        sources.append(src)
        
        cat = doc.get('category', 'unknown')
        if cat not in categories:
            categories.append(cat)
            weights_applied[cat] = weights.get(cat, {}).get(conflict_type, 1.0)
        
        all_tags.extend(doc.get('tags', []))
        
        # Normalize the filename into a snake_case "logical work" id.
        log = src.lower().replace('.epub', '').replace('.pdf', '').replace('.txt', '')
        log = log.replace(' ', '_').replace('-', '_').replace('(', '').replace(')', '')
        if log not in logical:
            logical.append(log)
    
    # ============================================================================
    # IMPROVED ACADEMIC CITATIONS (ENGLISH LABELS)
    # ============================================================================
    
    citations_parts = []
    
    citations_parts.append("="*80)
    citations_parts.append("πŸ“Š RETRIEVAL SYSTEM APPLIED")
    citations_parts.append("="*80)
    citations_parts.append(f"\nπŸ” CONFLICT DETECTED: {conflict_type.upper()} (confidence: {confidence}%)")
    citations_parts.append(f"🌐 QUERY LANGUAGE: ES + EN (Bilingual)")
    citations_parts.append(f"πŸ“š CHUNKS RETRIEVED: {k} (of {_index.ntotal:,} total vectors)")
    
    citations_parts.append(f"\nβš–οΈ  METHODOLOGICAL WEIGHTS APPLIED:")
    for cat, weight in sorted(weights_applied.items(), key=lambda x: x[1], reverse=True):
        citations_parts.append(f"   β€’ {cat}: {weight}x")
    
    # Translate tags for display
    translated_tags = _translate_tags(list(set(all_tags)))
    citations_parts.append(f"\n🏷️  RELEVANT TAGS DETECTED: {', '.join(translated_tags)}")
    
    citations_parts.append(f"\n{'='*80}")
    citations_parts.append(f"πŸ“– ACADEMIC SOURCES (Top {k} by hybrid scoring)")
    citations_parts.append(f"{'='*80}\n")
    
    for i, r in enumerate(top, 1):
        doc = _docs[r['idx']]
        
        src = doc.get('source', 'unknown')
        cat = doc.get('category', 'unknown')
        # Translate tags for each source
        tags = ', '.join(_translate_tags(doc.get('tags', [])))
        content_type = doc.get('content_type', 'unknown')
        text = doc.get('text', '')
        
        # Map filename substrings to citable author names.
        author = "Unknown Author"
        if "Brown" in src or "Levinson" in src:
            author = "Brown & Levinson"
        elif "Patterson" in src:
            author = "Patterson et al."
        elif "Stone" in src:
            author = "Stone, Patton & Heen"
        elif "Calsamiglia" in src:
            author = "Helena Calsamiglia"
        elif "Fisher" in src or "Ury" in src:
            author = "Fisher, Ury & Patton"
        elif "Covey" in src:
            author = "Stephen Covey"
        elif "Carnegie" in src:
            author = "Dale Carnegie"
        elif "Sinek" in src:
            author = "Simon Sinek"
        elif "Rosenberg" in src:
            author = "Marshall Rosenberg"
        elif "Lencioni" in src:
            author = "Patrick Lencioni"
        
        citations_parts.append(f"[SOURCE #{i}] {author}")
        citations_parts.append(f"─────────────────────────────────────────────────────────────────────────────")
        citations_parts.append(f"πŸ“ File: {src}")
        citations_parts.append(f"πŸ“‚ Category: {cat}")
        citations_parts.append(f"🏷️  Tags: {tags}")
        citations_parts.append(f"πŸ“Š Content Type: {content_type}")
        citations_parts.append(f"πŸ”’ HYBRID SCORING:")
        citations_parts.append(f"   β€’ Semantic Similarity: {r['sim']:.3f}")
        citations_parts.append(f"   β€’ Methodological Weight: {r['weight']:.2f}x")
        citations_parts.append(f"   β€’ Final Score: {r['score']:.3f}")
        citations_parts.append(f"🌐 Query Language: {r['lang'].upper()}")
        citations_parts.append(f"\nπŸ“ CONTENT:")
        citations_parts.append(f"{text}\n")
        citations_parts.append(f"{'='*80}\n")
    
    citations_parts.append("πŸ“Œ INSTRUCTIONS FOR ANALYSIS:")
    citations_parts.append("─────────────────────────────────────────────────────────────────────────────")
    citations_parts.append("1. CITE authors mentioned above EXPLICITLY")
    citations_parts.append("2. PRIORITIZE sources with higher methodological weight")
    citations_parts.append("3. USE technical terminology from sources (e.g., 'negative face', 'FTA')")
    citations_parts.append("4. EXPLAIN technical concepts in accessible language")
    citations_parts.append("5. INTEGRATE minimum 2-3 sources in your analysis")
    citations_parts.append(f"{'='*80}")
    
    citations = "\n".join(citations_parts)
    
    # NOTE(review): "citations" is returned both at the top level and inside
    # "metadata" — presumably for caller convenience; confirm before removing.
    return {
        "citations": citations,
        "metadata": {
            "conflict_type": conflict_type,
            "confidence": confidence,
            "num_sources": len(top),
            "unique_tags": translated_tags,
            "sources_raw": sources,
            "logical_works_used": logical[:5],
            "categories_used": categories,
            "faiss_weights_applied": weights_applied,
            "citations": citations
        }
    }