"""
🧠 Agent 2 (Interpretation) - Gemini 2.0 EDITION
-------------------------------------------------
This version of the AgentBrain handles all deep sociolinguistic logic,
prompt engineering, and dataset searching.
"""
import os
import glob
import pandas as pd
import json
import re
import concurrent.futures
from rapidfuzz import process, fuzz
from src.rag_manager import SociolinguisticRAG
class AgentInterpretation:
    """Interpretation agent: dataset lookup, persona matching and LLM prompting.

    The agent combines three sources when interpreting an utterance:

    * a CSV knowledge base loaded from ``config.DATASET_DIR`` (``self.df``),
    * JSON profiles loaded from ``config.PROFILES_DIR``
      (``self.lab_profile`` / ``self.active_profiles_list``),
    * an LLM reached through ``gemini_manager.generate_fast(prompt)``, an
      awaitable whose result exposes a ``.text`` attribute.

    ``config`` must provide ``BASE_DIR``, ``PROFILES_DIR`` and ``DATASET_DIR``.
    """

    # Columns kept from every knowledge-base CSV.
    # NOTE(review): other methods read Tone_Category, Linguistic_Context,
    # Pragmatic_Analysis and Syntax_Pattern via ``.get`` with defaults; those
    # columns are dropped here, so the defaults are always used. Confirm that
    # this is intentional before widening the selection.
    _KB_COLUMNS = ("Utterance", "Clarification", "Dialect")

    # (output field, accepted input keys in priority order, default value)
    _FIELD_ALIASES = (
        ("Dialect", ("Dialect", "dialect"), "Unknown"),
        ("Clarification", ("Clarification", "clarification"), "---"),
        ("Tone", ("Tone", "tone"), "---"),
        ("Context", ("Context", "context"), "---"),
        (
            "Pragmatic Analysis",
            ("Pragmatic Analysis", "Pragmatic_Analysis", "Pragmatics", "pragmatics"),
            "---",
        ),
    )

    def __init__(self, config, gemini_manager_instance=None):
        """Build the agent, load RAG engines and the CSV knowledge base.

        Args:
            config: Settings object exposing ``BASE_DIR``, ``PROFILES_DIR``
                and ``DATASET_DIR``.
            gemini_manager_instance: Optional LLM manager. When ``None`` the
                generative methods fall back to canned or local answers.
        """
        self.config = config
        self.gemini_manager = gemini_manager_instance
        # Alias kept for backward compatibility with older callers.
        self.gemini_manager_instance = gemini_manager_instance

        # State read by other methods. Initialised here so those methods are
        # safe to call before any profile or dataset has been loaded.
        self.lab_profile = {}
        self.active_profiles_list = []
        self.df = pd.DataFrame(columns=list(self._KB_COLUMNS))
        self.lookup_list = []

        # One RAG engine per "<dialect> Persona.json" file.
        self.profiles_dir = os.path.join(self.config.BASE_DIR, "lab_profiles")
        self.rag_engines = {}
        self._initialize_rag_databases()

        # Persistent pool used by detect_and_analyze for parallel LLM calls.
        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=10)
        print("🧠 Agent 2 (Interpretation) Online: Persistent Pool & RAG Engines Ready.")

        self.refresh_knowledge_base()

    def _safe_generate(self, prompt):
        """Run the async ``generate_fast`` call to completion synchronously.

        A private event loop is created for the call so it can be used from a
        plain worker thread. The loop is closed and un-registered afterwards
        so that a closed loop is never left installed as the thread's current
        loop.

        Args:
            prompt: Prompt text forwarded to ``gemini_manager.generate_fast``.

        Returns:
            Whatever ``generate_fast`` resolves to.
        """
        import asyncio

        loop = asyncio.new_event_loop()
        try:
            asyncio.set_event_loop(loop)
            return loop.run_until_complete(self.gemini_manager.generate_fast(prompt))
        finally:
            asyncio.set_event_loop(None)
            loop.close()

    @staticmethod
    def _flatten_persona_rules(data):
        """Flatten a persona JSON object into a list of one-line rule strings."""
        rules = []
        if "lexicon" in data:
            rules.extend(
                f"Lexicon: '{word}' means {meaning}."
                for word, meaning in data["lexicon"].items()
            )
        if "pragmatics" in data:
            rules.extend(f"Pragmatics: {rule}" for rule in data["pragmatics"])
        if "syntax" in data:
            rules.extend(f"Syntax: {rule}" for rule in data["syntax"])
        return rules

    def _initialize_rag_databases(self):
        """Create one RAG engine per ``*Persona.json`` file in ``profiles_dir``.

        A file that cannot be read or parsed is reported and skipped; it never
        prevents the remaining personas from loading.
        """
        if not os.path.isdir(self.profiles_dir):
            return
        for filename in os.listdir(self.profiles_dir):
            if not filename.endswith("Persona.json"):
                continue
            dialect_name = filename.replace(" Persona.json", "")
            filepath = os.path.join(self.profiles_dir, filename)
            try:
                with open(filepath, "r", encoding="utf-8") as f:
                    data = json.load(f)
                rag = SociolinguisticRAG()
                rag.load_persona_rules(dialect_name, self._flatten_persona_rules(data))
                self.rag_engines[dialect_name] = rag
            except Exception as e:  # best-effort: one bad persona must not abort startup
                print(f"⚠️ Failed to load RAG for {filename}: {e}")

    def _profile_path(self, filename):
        """Return the path of ``filename`` inside ``config.PROFILES_DIR``.

        Only the base name is used, so a name containing directory components
        (for example ``../x.json``) cannot escape the profiles directory.
        """
        return os.path.join(self.config.PROFILES_DIR, os.path.basename(filename))

    def get_available_profiles(self):
        """Return the base names of all ``*.json`` files in ``PROFILES_DIR``."""
        pattern = os.path.join(self.config.PROFILES_DIR, "*.json")
        return [os.path.basename(path) for path in glob.glob(pattern)]

    def load_profile_by_name(self, filename):
        """Load one profile and make it the active ``lab_profile``.

        Returns:
            The parsed profile, or ``{}`` when the file is missing or is not
            valid JSON (``lab_profile`` is left unchanged in that case).
        """
        path = self._profile_path(filename)
        try:
            with open(path, "r", encoding="utf-8") as f:
                self.lab_profile = json.load(f)
        except (OSError, ValueError):  # missing/unreadable file or invalid JSON
            return {}
        return self.lab_profile

    def save_specific_profile(self, filename, json_str):
        """Validate ``json_str`` and write it to ``PROFILES_DIR/filename``.

        The JSON is parsed *before* the file is opened, so invalid input never
        truncates an existing profile.

        Returns:
            A human-readable status string starting with ✅ or ❌.
        """
        if not filename.endswith(".json"):
            filename += ".json"
        path = self._profile_path(filename)
        try:
            payload = json.loads(json_str)
            with open(path, "w", encoding="utf-8") as f:
                json.dump(payload, f, indent=2)
        except Exception as e:
            return f"❌ Error: {e}"
        return "✅ Saved locally (HF Sync pending)"

    def get_current_profile_text(self):
        """Return the active ``lab_profile`` serialised as indented JSON."""
        return json.dumps(self.lab_profile, indent=2)

    def load_all_profiles_simultaneously(self):
        """Load every JSON profile in ``PROFILES_DIR`` into ``active_profiles_list``.

        Files that cannot be read, are not valid JSON, or do not contain a
        JSON object are skipped.

        Returns:
            A status string listing how many profiles were loaded and their names.
        """
        names = []
        self.active_profiles_list = []
        for f_path in glob.glob(os.path.join(self.config.PROFILES_DIR, "*.json")):
            try:
                with open(f_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except (OSError, ValueError):
                continue
            if not isinstance(data, dict):
                # Later code calls profile.get(...); anything else would crash it.
                continue
            self.active_profiles_list.append(data)
            names.append(data.get("lab_name", os.path.basename(f_path)))
        return f"✅ Loaded {len(names)} profiles simultaneously: {', '.join(names)}"

    def refresh_knowledge_base(self):
        """(Re)load all CSV datasets from ``DATASET_DIR`` into ``self.df``.

        Each CSV must contain the Utterance, Clarification and Dialect columns;
        files that fail to load are reported and skipped. When nothing loads,
        ``self.df`` becomes an empty frame and ``self.lookup_list`` an empty list.
        """
        frames = []
        for filename in glob.glob(os.path.join(self.config.DATASET_DIR, "*.csv")):
            try:
                temp_df = pd.read_csv(
                    filename, encoding="utf-8-sig", on_bad_lines="skip", low_memory=True
                )
                frames.append(temp_df[list(self._KB_COLUMNS)])
            except Exception as e:  # best-effort: skip unreadable/malformed datasets
                print(f"⚠️ Skipping dataset {filename}: {e}")
        if frames:
            self.df = pd.concat(frames, ignore_index=True)
            self.lookup_list = self.df["Utterance"].tolist()
        else:
            self.df = pd.DataFrame(columns=list(self._KB_COLUMNS))
            self.lookup_list = []

    def normalize_keys(self, data_list):
        """Map loosely-keyed result dicts onto the canonical result schema.

        For each field the first accepted key holding a truthy value wins;
        otherwise the field's default is used. ``"Pragmatics"`` is accepted
        because that is the key the ``generate_unknown_analysis`` prompt asks
        the model to emit. Items that are not dicts are ignored.

        Args:
            data_list: Iterable of dicts (for example parsed LLM output).

        Returns:
            A list of dicts with the keys Dialect, Clarification, Tone,
            Context, Pragmatic Analysis and Source.
        """
        cleaned_list = []
        for item in data_list:
            if not isinstance(item, dict):
                continue
            new_item = {}
            for field, aliases, default in self._FIELD_ALIASES:
                new_item[field] = next(
                    (item[key] for key in aliases if item.get(key)), default
                )
            new_item["Source"] = item.get("Source", "✨ AI Generated")
            cleaned_list.append(new_item)
        return cleaned_list

    # ==========================================
    # MODULAR LOGIC (MOVED FROM UX_AGENT)
    # ==========================================
    @staticmethod
    def _fallback_starter(topic_str):
        """Return the canned conversation starter used when the LLM is unavailable."""
        if "You are Echo" in topic_str:
            return json.dumps({"text": "Haha, I hear you! What else is going on?"})
        return json.dumps({"text": f"Let's talk about {topic_str}. What comes to mind?"})

    def generate_conversation_starter(self, topic_str):
        """Generate a one-sentence conversation starter as a JSON string.

        When ``topic_str`` contains ``"You are Echo"`` it is treated as a
        role-play prompt and wrapped in stay-in-character instructions.

        Returns:
            ``json.dumps({"text": ...})``. A canned message is used when no LLM
            manager is configured or the LLM call fails.
        """
        if not self.gemini_manager:
            return self._fallback_starter(topic_str)
        try:
            if "You are Echo" in topic_str:
                prompt = f"""
{topic_str}
CRITICAL ROLEPLAY INSTRUCTIONS FOR THIS TURN:
1. You are Echo, a human peer and friend.
2. NEVER break character. NEVER say you are an AI, a computer program, or a language model.
3. Play along naturally. If they ask for favors or money (e.g., begging/joking), play along like a real person.
4. Keep your response to ONE short, highly conversational sentence. Do not be overly formal.
5. On login, you will switch the user's English dialect.
"""
            else:
                prompt = f"Generate a very short, engaging, single-sentence conversation starter or question about: {topic_str}."
            response = self._safe_generate(prompt)
            clean_text = response.text.replace("```json", "").replace("```", "").strip()
            return json.dumps({"text": clean_text})
        except Exception as e:
            print(f"⚠️ Mission Gen Error: {e}")
            return self._fallback_starter(topic_str)

    def search_local_dataset(self, text):
        """Tier 1: return a local dataset match only if it is nearly identical.

        Uses a fuzzy ``WRatio`` score of at least 90. The matched row is taken
        by the index rapidfuzz reports, so it is always the row that produced
        the match.

        Returns:
            A result dict, or ``None`` when there is no close match or the
            lookup fails.
        """
        try:
            if self.df.empty:
                return None
            # Rows without an utterance cannot match; dropping them also keeps
            # NaN from turning into the literal candidate string "nan".
            candidates = self.df[self.df["Utterance"].notna()]
            utterances = candidates["Utterance"].astype(str).tolist()
            match = process.extractOne(text, utterances, scorer=fuzz.WRatio)
            if match and match[1] >= 90:
                row = candidates.iloc[match[2]]

                def clean_val(val, default=""):
                    return default if pd.isna(val) else str(val)

                return {
                    "Source": "🗄️ Local Dataset",
                    "Speaker": "User",
                    "dialect": clean_val(row.get("Dialect"), "Unknown"),
                    "clarification": clean_val(row.get("Clarification")),
                    "tone": clean_val(row.get("Tone_Category"), "Neutral / Conversational"),
                    "context": clean_val(row.get("Linguistic_Context")),
                    "pragmatics": clean_val(row.get("Pragmatic_Analysis"), "Verified via local Regex"),
                }
        except Exception as e:
            print(f"Dataset Search Error: {e}")
        return None

    def search_personas(self, text):
        """Tier 2: look for the active persona's slang terms in ``text``.

        Terms are matched case-insensitively on word boundaries; the first
        term found produces the result.

        Returns:
            A result dict, or ``None`` when no persona is loaded or nothing matches.
        """
        try:
            profile = self.lab_profile
            if not profile:
                return None
            # NOTE(review): this method historically read "jargons" while the
            # other methods of this class read "jargon". Both are accepted,
            # with "jargons" taking priority — confirm the intended schema.
            slang_map = profile.get("jargons") or profile.get("jargon") or {}
            text_lower = text.lower()
            for slang, meaning in slang_map.items():
                if not slang:
                    continue  # an empty term would match any text
                pattern = r'\b' + re.escape(str(slang).lower()) + r'\b'
                if not re.search(pattern, text_lower):
                    continue
                rules = profile.get("pragmatic_rules")
                rule = rules[0] if rules else ""
                return {
                    "Source": "🎭 Persona Injection",
                    "Speaker": "User",
                    "dialect": profile.get("dialect_name", "Unknown Persona"),
                    "clarification": f"Contains slang '{slang}' -> {meaning}",
                    "tone": "Casual / Slang",
                    "context": profile.get("cultural_context", "Inferred from active Persona"),
                    "pragmatics": f"Rule triggered: {rule}",
                }
        except Exception as e:
            print(f"Persona Search Error: {e}")
        return None

    # ==========================================
    # DEEP GENERATIVE AI LOGIC
    # ==========================================
    def analyze_dialect_single(self, text, language_code):
        """Ask the LLM for a single culturally-aware interpretation of ``text``.

        If a RAG engine exists for ``language_code``, its top-3 retrieved
        rules are injected into the prompt.

        Returns:
            The parsed JSON object from the model with a ``Source`` key added,
            or an error dict of the same shape when the call or parsing fails.
        """
        print(f" -> 🧠 Preparing Sociolinguistic Prompt for: {language_code}")
        relevant_rules = "General dialect rules apply."
        if language_code in self.rag_engines:
            relevant_rules = self.rag_engines[language_code].retrieve_context(text, k=3)
            print(f" -> 🎯 RAG Context Retrieved:\n{relevant_rules}")
        prompt = f"""Analyze the following utterance: '{text}'
Language/Dialect Context: {language_code}
CRITICAL CONTEXT (Apply ONLY these retrieved sociolinguistic rules to your analysis):
{relevant_rules}
CRITICAL SOCIOLINGUISTIC INSTRUCTIONS:
1. DIRECT MEANING: Provide the interpretation directly. Do not start with "This means..." or "The user is saying...".
2. PRESERVE CULTURAL NUANCE: Capture the exact social intent (e.g., humor, banter, sarcasm, respect, solidarity). Do not sterilize the translation into robotic, literal Standard English. Preserve the 'flavor' of the interaction.
3. LINGUISTIC EQUALITY: Treat this as a valid, deeply rule-governed language system.
4. FORBIDDEN WORDS: You must NEVER use "incorrect", "broken", "grammar error", "non-standard", or "learner".
Return ONLY a valid JSON object with exactly these keys. Do not use markdown formatting blocks:
{{
"dialect": "{language_code}",
"clarification": "[Direct Meaning, capturing the original intent]",
"tone": "[e.g., Playful Banter, Respectful, Sarcastic, Urgent, Casual]",
"context": "[The typical cultural or situational setting for this phrase]",
"pragmatics": "[The underlying social function, e.g., 'Establishing solidarity', 'Softening a request', 'Playful teasing']"
}}"""
        try:
            print(" -> 🧠 Awaiting API Response...")
            response = self._safe_generate(prompt)
            print(" -> 🧠 Parsing JSON Response...")
            clean_text = response.text.replace("```json", "").replace("```", "").strip()
            # Tolerate chatter around the object: take the outermost {...} span.
            match = re.search(r'\{.*\}', clean_text, re.DOTALL)
            if match:
                res = json.loads(match.group(0))
                res["Source"] = "🧠 Groq AI Engine (Cultural Context)"
                return res
        except Exception as e:
            import traceback

            print("\n🚨 CRITICAL GEMINI ERROR 🚨")
            traceback.print_exc()
            return {"Source": "🧠 AI Engine Error", "dialect": language_code, "clarification": f"AI Gen Failed: {e}", "tone": "Neutral", "context": "", "pragmatics": ""}
        print(" -> ❌ AI returned invalid format.")
        return {"Source": "🧠 AI Engine Error", "dialect": language_code, "clarification": "AI format failed", "tone": "Neutral", "context": "", "pragmatics": ""}

    def generate_unknown_analysis(self, text):
        """Ask the LLM for up to three interpretations of an unmatched utterance.

        Up to 50 jargon keys from the active profiles are passed as context.

        Returns:
            ``[]`` when no LLM manager is configured; otherwise a list of
            normalised result dicts, or a single "Analysis Failed" entry when
            the call or parsing fails.
        """
        if not self.gemini_manager:
            return []
        all_jargon_keys = []
        for p in self.active_profiles_list:
            jargon = p.get("jargon", {})
            if isinstance(jargon, dict):
                all_jargon_keys.extend(jargon)
        # dict.fromkeys de-duplicates while keeping a stable order, so the
        # prompt does not change between otherwise identical calls.
        jargon_keys = list(dict.fromkeys(all_jargon_keys))[:50]
        prompt = f"""
Analyze utterance: "{text}"
Context/Jargon Keys: {jargon_keys}
Task: Provide 3 distinct interpretations (Casual, Formal, or Cultural).
CRITICAL INSTRUCTION: Treat the input as valid, meaningful Dialectal English. Do NOT label it as "incorrect".
Output Strictly JSON: [ {{ "Dialect": "General", "Clarification": "...", "Tone": "...", "Context": "...", "Pragmatics": "..." }} ]
"""
        try:
            response = self._safe_generate(prompt)
            clean_text = re.sub(r"```json|```", "", response.text).strip()
            data = json.loads(clean_text)
            if isinstance(data, dict):
                data = [data]  # model returned one object instead of a list
            return self.normalize_keys(data)
        except Exception as e:
            print(f"⚠️ Unknown-analysis generation failed: {e}")
            return [{"Dialect": "Unknown", "Clarification": "Analysis Failed", "Tone": "---", "Context": "---", "Pragmatic Analysis": "Error"}]

    def adapt_with_ai(self, full_text, db_row):
        """Adapt a dataset entry's meaning to the user's full sentence.

        Args:
            full_text: What the user actually said.
            db_row: Mapping-like dataset row (dict or pandas Series) with at
                least ``Utterance`` and ``Clarification``.

        Returns:
            ``(clarification, pragmatics)``. Falls back to the row's own
            values when the LLM is unavailable or its answer is unusable;
            ``"---"`` is used when the row has no ``Pragmatic_Analysis``.
        """
        # The knowledge base does not necessarily carry Pragmatic_Analysis,
        # so it must be read with a default rather than indexed.
        fallback = (db_row["Clarification"], db_row.get("Pragmatic_Analysis", "---"))
        if not self.gemini_manager:
            return fallback
        prompt = f"""
Ref Term: "{db_row['Utterance']}" = "{db_row['Clarification']}"
User said: "{full_text}"
Task: Adapt meaning to full sentence. Treat as valid dialect.
Output JSON: {{ "clarification": "...", "pragmatics": "..." }}
"""
        try:
            response = self._safe_generate(prompt)
            clean_json = re.search(r"\{.*\}", response.text, re.DOTALL)
            if clean_json:
                data = json.loads(clean_json.group(0))
                return data.get("clarification", db_row["Clarification"]), data.get("pragmatics", "AI Adapted Analysis")
        except Exception as e:
            print(f"⚠️ AI adaptation failed: {e}")
        return fallback

    def detect_and_analyze(self, text, threshold=60, ai_timeout=4.5):
        """Interpret ``text`` using the dataset, active profiles and the LLM.

        Steps:
            1. Exact regex/substring hits in the dataset and jargon hits in the
               active profiles become immediate results.
            2. The top fuzzy dataset matches (score >= ``threshold``) are
               adapted to the sentence by the LLM in parallel.
            3. A generic LLM analysis runs in parallel and is appended only if
               fewer than three results were found.

        Args:
            text: The user's utterance.
            threshold: Minimum fuzzy score for a dataset row to be a candidate.
            ai_timeout: Seconds to wait for the parallel LLM calls.

        Returns:
            At most three result dicts; a single "AI Timeout" entry if nothing
            was produced.
        """
        clean_text = text.lower().strip()
        seen_indices = set()
        immediate_results = []
        partial_candidates = []

        if not self.df.empty:
            for index, row in self.df.iterrows():
                match_type = None
                pattern = row.get("Syntax_Pattern", "")
                # Only real strings are patterns; a missing cell is NaN, and
                # str(NaN) == "nan" would otherwise be used as a regex.
                if isinstance(pattern, str) and len(pattern) > 2:
                    try:
                        if re.search(pattern, clean_text, re.IGNORECASE):
                            match_type = "Regex_Match"
                    except re.error:
                        pass  # malformed pattern in the dataset; try substring instead
                if not match_type:
                    db_str = str(row["Utterance"]).strip().lower()
                    if len(db_str) > 3 and db_str in clean_text:
                        match_type = "Exact_Substring"
                if match_type:
                    seen_indices.add(index)
                    immediate_results.append({
                        "Source": f"💎 Database ({match_type})", "Dialect": row["Dialect"],
                        "Clarification": row["Clarification"], "Tone": row.get("Tone_Category", "---"),
                        "Context": row.get("Linguistic_Context", "---"), "Pragmatic Analysis": row.get("Pragmatic_Analysis", "---")
                    })

            if not self.lookup_list:
                self.lookup_list = []
            matches = process.extract(clean_text, self.lookup_list, scorer=fuzz.token_set_ratio, limit=5)
            for _match_str, score, index in matches:
                if score >= threshold and index not in seen_indices and index < len(self.df):
                    seen_indices.add(index)
                    partial_candidates.append({"row": self.df.iloc[index], "match_len": score, "type": f"Fuzzy ({score}%)"})

        for profile in self.active_profiles_list:
            jargon_dict = profile.get("jargon", {})
            for term, definition in jargon_dict.items():
                # An empty term is a substring of every string; skip it.
                if term and str(term).lower() in clean_text:
                    immediate_results.append({
                        "Source": f"📜 Profile Rule ({term})", "Dialect": profile.get("lab_name", "Profile"),
                        "Clarification": definition, "Tone": "Detected Jargon",
                        "Context": f"Found in {profile.get('lab_name')} Profile", "Pragmatic Analysis": "Direct Profile Match"
                    })

        final_results = list(immediate_results)
        partial_candidates.sort(key=lambda x: x["match_len"], reverse=True)
        top_candidates = partial_candidates[:3]

        # Fire all LLM calls at once and wait for them together.
        fallback_future = self.executor.submit(self.generate_unknown_analysis, text)
        db_futures = {
            self.executor.submit(self.adapt_with_ai, text, cand["row"]): cand
            for cand in top_candidates
        }
        done, not_done = concurrent.futures.wait(
            [*db_futures, fallback_future],
            timeout=ai_timeout,
            return_when=concurrent.futures.ALL_COMPLETED,
        )
        for pending in not_done:
            pending.cancel()  # drops calls still queued; running ones finish on their own

        for future, cand in db_futures.items():
            if future not in done:
                continue
            try:
                clar, prag = future.result()
            except Exception as e:
                print(f"⚠️ DB + AI candidate dropped: {e}")
                continue
            row = cand["row"]
            final_results.append({
                "Source": f"💎 DB + AI ({cand['type']})", "Dialect": row["Dialect"],
                "Clarification": clar, "Tone": row.get("Tone_Category", "---"),
                "Context": row.get("Linguistic_Context", "---"), "Pragmatic Analysis": prag
            })

        if len(final_results) < 3 and fallback_future in done:
            try:
                final_results += self.normalize_keys(fallback_future.result())
            except Exception as e:
                print(f"⚠️ Fallback analysis dropped: {e}")

        if not final_results:
            final_results.append({"Source": "⚠️ AI Timeout", "Dialect": "---", "Clarification": "System Busy", "Tone": "---", "Context": "---", "Pragmatic Analysis": "---"})
        return final_results[:3]