GenAICoder committed on
Commit
f17af50
·
verified ·
1 Parent(s): ea7c609

Upload deep_dive_agentic (1).py

Browse files
Files changed (1) hide show
  1. analytics/deep_dive_agentic (1).py +551 -0
analytics/deep_dive_agentic (1).py ADDED
@@ -0,0 +1,551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # deep_dive_agentic.py
2
+
3
+ """
4
+ Agentic analytical code generation + execution engine using Hugging Face
5
+
6
+ FLOW:
7
+ User Question
8
+
9
+ LLM generates pandas code
10
+
11
+ Python executes code safely
12
+
13
+ LLM interprets results
14
+
15
+ Return code + interpretation
16
+
17
+ Environment:
18
+ export HUGGINGFACE_API_TOKEN=...
19
+
20
+ FIXES APPLIED (v2):
21
+ - FIX 1: exec() now uses a single merged namespace dict so result variables
22
+ are reliably written back (Python bug with separate globals/locals).
23
+ - FIX 2: Smart result detection — scans namespace for any new DataFrame/Series
24
+ instead of relying on hardcoded variable names (result_1, final_result).
25
+ - FIX 3: _fix_pandas_compatibility() is now actually called before exec().
26
+ """
27
+
28
+ # ---------------------------------------------------
29
+ # IMPORTS
30
+ # ---------------------------------------------------
31
+
32
+ import pandas as pd
33
+ import json
34
+ import os
35
+ import re
36
+
37
+ try:
38
+ from huggingface_hub import InferenceClient
39
+ except ImportError as exc:
40
+ raise ImportError(
41
+ "huggingface_hub is required. Install with `pip install huggingface-hub`."
42
+ ) from exc
43
+
44
+ from analytics.performance_analysis import generate_metric_view
45
+
46
+ # ---------------------------------------------------
47
+ # HF CONFIG
48
+ # ---------------------------------------------------
49
+
50
# Model identifier sent with every chat-completion request; override it with
# the HF_MODEL_ID environment variable.
HF_MODEL_ID = os.environ.get("HF_MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")
# API token read once at import time; None when HUGGINGFACE_API_TOKEN is unset
# (the missing-token case is reported by _get_hf_client).
HF_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")
52
+
53
+
54
+ # ---------------------------------------------------
55
+ # HELPER: GET INFERENCE CLIENT
56
+ # ---------------------------------------------------
57
+
58
def _get_hf_client():
    """
    Build an InferenceClient authenticated with the module-level HF_TOKEN.

    Raises:
        RuntimeError: if HF_TOKEN is unset or empty.
    """
    if HF_TOKEN:
        return InferenceClient(token=HF_TOKEN)

    raise RuntimeError(
        "HUGGINGFACE_API_TOKEN is required. Set it in your environment."
    )
64
+
65
+
66
+ # ---------------------------------------------------
67
+ # HELPER: EXTRACT JSON FROM LLM RESPONSE
68
+ # ---------------------------------------------------
69
+
70
+ def _extract_json(text: str):
71
+ match = re.search(r"\{.*\}", text, re.S)
72
+ if not match:
73
+ return None
74
+ payload = match.group(0)
75
+ try:
76
+ return json.loads(payload)
77
+ except json.JSONDecodeError:
78
+ try:
79
+ cleaned = re.sub(r"[\n\r]+", " ", payload)
80
+ cleaned = re.sub(r"(['\"])?([a-zA-Z0-9_]+)(['\"])?\s*:\s*", r'"\2": ', cleaned)
81
+ return json.loads(cleaned)
82
+ except Exception:
83
+ return None
84
+
85
+
86
+ # ---------------------------------------------------
87
+ # HELPER: FIX COMMON PANDAS COMPATIBILITY ISSUES
88
+ # ---------------------------------------------------
89
+
90
+ def _fix_pandas_compatibility(code: str) -> str:
91
+ """
92
+ Fix common pandas API compatibility issues in generated code.
93
+ Handles version differences between pandas versions.
94
+ """
95
+ # Fix: .reset_index(name=...) -> .reset_index(names=[...])
96
+ code = re.sub(
97
+ r"\.reset_index\(name=(['\"])([^'\"]+)\1\)",
98
+ r".reset_index(names=[\1\2\1])",
99
+ code
100
+ )
101
+
102
+ # Fix: .reset_index(name= with variable
103
+ code = re.sub(
104
+ r"\.reset_index\(name=([a-zA-Z_][a-zA-Z0-9_]*)\)",
105
+ r".reset_index(names=[\1])",
106
+ code
107
+ )
108
+
109
+ # Fix: df.append() deprecated in newer pandas -> pd.concat()
110
+ code = re.sub(
111
+ r"(\w+)\.append\((\w+),\s*ignore_index=True\)",
112
+ r"pd.concat([\1, \2], ignore_index=True)",
113
+ code
114
+ )
115
+
116
+ return code
117
+
118
+
119
+ # ---------------------------------------------------
120
+ # STEP 1: CODE GENERATION
121
+ # ---------------------------------------------------
122
+
123
def generate_analysis_requirements(
    question: str,
    acq: pd.DataFrame,
    perf: pd.DataFrame,
    master_df: pd.DataFrame
):
    """
    Ask the LLM to break a question into 1-3 analytics requirements.

    Each requirement is expected to carry a title, a description and
    executable pandas code. The prompt describes a fixed column schema; the
    DataFrame arguments are accepted for interface symmetry with the other
    pipeline steps but are not read here.

    Args:
        question: The user's analytical question.
        acq: Acquisition data (not inspected by this function).
        perf: Performance data (not inspected by this function).
        master_df: Merged data (not inspected by this function).

    Returns:
        dict with keys:
          - "success": bool
          - "requirements": list of requirement dicts (empty on failure)
          - "error": None on success, otherwise a message
          - "raw_response": the model text, present only on failure

    Raises:
        RuntimeError: if the Hugging Face token is not configured.
        Any exception raised by the inference client call propagates.
    """

    client = _get_hf_client()

    # Build detailed column descriptions
    acq_cols = {
        "account_id": "unique account identifier",
        "booking_date": "when account was originated",
        "booking_vintage": "year-month of origination (YYYY-MM)",
        "fico_band": "FICO score bracket (e.g., 700-750, 750-800)",
        "sourcing_channel": "acquisition channel (e.g., Online, Branch, Broker)",
        "city_tier": "city classification (Tier-1, Tier-2, Tier-3)",
        "occupation_type": "borrower occupation category",
        "credit_limit": "approved credit line amount"
    }

    perf_cols = {
        "account_id": "unique account identifier",
        "reporting_month": "month of performance observation (YYYY-MM)",
        "mob": "months on books (age of account in months)",
        "dpd": "days past due (0, 30, 60, 90+)",
        "balance": "current outstanding balance",
        "ncl_amount": "net charge-off amount (dollars)",
        "payment": "payment amount in period"
    }

    prompt = (
        "You are a senior retail credit risk analyst with 15+ years of portfolio management experience.\n\n"
        "Your task:\n"
        "1. Analyze the user's analytical question deeply\n"
        "2. Determine 1-3 specific analytics requirements needed to fully answer the question\n"
        "3. For EACH requirement, generate executable pandas code\n"
        "4. Return ONLY valid JSON, no other text\n\n"

        "AVAILABLE DATA:\n"
        "- acq: acquisition data with columns: " + ", ".join(acq_cols.keys()) + "\n"
        "- perf: performance data with columns: " + ", ".join(perf_cols.keys()) + "\n"
        "- master_df: merged acq+perf, includes all above columns\n\n"

        "COLUMN DESCRIPTIONS:\n"
        "Acquisition (acq):\n"
        + "\n".join([f" - {k}: {v}" for k, v in acq_cols.items()]) + "\n\n"
        "Performance (perf):\n"
        + "\n".join([f" - {k}: {v}" for k, v in perf_cols.items()]) + "\n\n"

        "Available Risk Metrics via generate_metric_view(df, metric_name, group_col):\n"
        " - 30+@3 (30+ dpd at 3 months)\n"
        " - 30+@6 (30+ dpd at 6 months)\n"
        " - 60+@6 (60+ dpd at 6 months)\n"
        " - Yr1 NCL (Year 1 net charge-off rate)\n\n"

        "CODE GENERATION RULES:\n"
        "- Generate pandas code ONLY\n"
        "- IMPORTANT: Always store your final result in a variable named exactly 'result_1', 'result_2', or 'result_3' matching the sequence number\n"
        "- Use meaningful intermediate variable names (e.g., vintage_analysis, segment_summary)\n"
        "- Focus on GROUP BY aggregations for insights\n"
        "- Calculate rates as dollars/total (percentage)\n"
        "- Sort by risk metrics (descending) to identify worst segments\n"
        "- Add brief comments for clarity\n"
        "- NO markdown, NO explanations outside JSON\n\n"

        "JSON STRUCTURE:\n"
        "{\n"
        ' "requirements": [\n'
        ' {\n'
        ' "sequence": 1,\n'
        ' "title": "Analysis title",\n'
        ' "description": "What this code does and why it matters",\n'
        ' "code": "pandas code here — must assign final result to result_1"\n'
        " }\n"
        " ]\n"
        "}\n\n"

        "User Question:\n" + question
    )

    messages = [
        {
            "role": "system",
            "content": (
                "You are a senior credit risk analyst who generates pandas code for portfolio analytics. "
                "Return ONLY valid JSON. Always store the final result in result_1, result_2, or result_3."
            )
        },
        {"role": "user", "content": prompt}
    ]

    response = client.chat.completions.create(
        model=HF_MODEL_ID,
        messages=messages,
        max_tokens=2048,
        temperature=0.1,
        top_p=0.95
    )

    response_text = (
        response.choices[0].message.content
        if hasattr(response, "choices")
        else str(response)
    )
    # The message content can be None (e.g. an empty completion); normalise it
    # so the parsing and slicing below never operate on a non-string.
    if response_text is None:
        response_text = ""

    # Extract JSON
    spec = _extract_json(response_text)

    if not spec:
        return {
            "success": False,
            "requirements": [],
            "error": f"Failed to parse JSON from LLM response: {response_text[:200]}",
            "raw_response": response_text
        }

    # Keep only well-formed entries: downstream code calls .get() on each
    # requirement and slices the list, so anything else would crash later.
    requirements = spec.get("requirements", [])
    if not isinstance(requirements, list):
        requirements = []
    requirements = [req for req in requirements if isinstance(req, dict)]

    if not requirements:
        return {
            "success": False,
            "requirements": [],
            "error": f"LLM returned no requirements. Response keys: {list(spec.keys())}",
            "raw_response": response_text[:300]
        }

    print(f"[DEBUG] Generated {len(requirements)} requirements for question: {question[:80]}")
    for i, req in enumerate(requirements, 1):
        print(f" Req {i}: {req.get('title')}")

    return {
        "success": True,
        "requirements": requirements,
        "error": None
    }
263
+
264
+
265
+ # ---------------------------------------------------
266
+ # STEP 2: CODE EXECUTION (LOOPED)
267
+ # ---------------------------------------------------
268
+
269
def execute_requirement_code(
    code: str,
    acq: pd.DataFrame,
    perf: pd.DataFrame,
    master_df: pd.DataFrame,
    requirement_num: int
):
    """
    Execute generated pandas code for a single requirement and locate its result.

    The code runs via exec() in one shared namespace that exposes ``pd``,
    ``generate_metric_view``, ``acq``, ``perf`` and ``master_df``.

    SECURITY NOTE: this is NOT a sandbox. The generated code runs in-process
    with the full set of builtins (imports, file and network access included).
    Only use it where executing model-written code is acceptable.

    Result detection, in priority order:
      1. ``result_<requirement_num>``, then ``final_result``, then ``result``.
      2. The most recently created new variable that is a DataFrame/Series.
      3. The first (alphabetically) new public variable that is not None,
         not a module and not callable.
    If nothing matches, a placeholder message string is returned as the result.

    Args:
        code: Python source produced by the LLM.
        acq: Acquisition data made available to the code as ``acq``.
        perf: Performance data made available to the code as ``perf``.
        master_df: Merged data made available to the code as ``master_df``.
        requirement_num: 1-based sequence number used for ``result_<n>``.

    Returns:
        dict with keys "success" (bool), "result" (any, None on failure) and
        "error" (None on success, otherwise the exception message).
    """
    import traceback
    import types

    # Missing or blank code (e.g. a JSON null "code" field) is reported as a
    # failed requirement instead of crashing or pretending it succeeded.
    if not isinstance(code, str) or not code.strip():
        print(f"[DEBUG] Req {requirement_num} FAILED: no executable code provided")
        return {
            "success": False,
            "result": None,
            "error": "No executable code was provided for this requirement."
        }

    # A single dict serves as both globals and locals so that every assignment
    # made by the generated code is visible here after exec() returns.
    namespace = {
        "pd": pd,
        "generate_metric_view": generate_metric_view,
        "__builtins__": __builtins__,
        # Data available to generated code
        "acq": acq,
        "perf": perf,
        "master_df": master_df,
    }

    # Snapshot of keys before exec so we can detect newly created variables
    keys_before = set(namespace.keys())

    try:
        # Apply the compatibility patches inside the try so that any failure
        # is reported through the normal error dict.
        code = _fix_pandas_compatibility(code)

        print(f"[DEBUG] Executing requirement {requirement_num}...")
        print(f"[DEBUG] Code preview: {code[:120].strip()}...")

        # WARNING: executes model-generated code with full builtins (see docstring).
        exec(code, namespace)

        result = None

        # --- Tier 1: expected named result variables ---
        for key in (f"result_{requirement_num}", "final_result", "result"):
            if namespace.get(key) is not None:
                result = namespace[key]
                print(f"[DEBUG] Found result in expected variable: '{key}'")
                break

        # Dicts preserve insertion order, so this lists new variables in the
        # order the generated code first created them.
        new_keys = [key for key in namespace if key not in keys_before]

        # --- Tier 2: newest DataFrame/Series created during exec ---
        # Walking from the most recently created name makes the choice
        # deterministic and favours the final table over intermediates.
        if result is None:
            for key in reversed(new_keys):
                val = namespace[key]
                if isinstance(val, (pd.DataFrame, pd.Series)):
                    result = val
                    print(f"[DEBUG] Found result by scanning new DataFrame/Series: '{key}'")
                    break

        # --- Tier 3: any new non-None, non-private data variable ---
        if result is None:
            for key in sorted(new_keys):  # sorted for determinism
                if key.startswith("_"):
                    continue
                val = namespace[key]
                # Imported modules and helper functions/classes are never results.
                if val is None or isinstance(val, types.ModuleType) or callable(val):
                    continue
                result = val
                print(f"[DEBUG] Fallback: found result in new variable: '{key}'")
                break

        if result is None:
            result = "Code executed successfully but no result variable was found in namespace."

        print(f"[DEBUG] Req {requirement_num} success. Result type: {type(result).__name__}")
        return {
            "success": True,
            "result": result,
            "error": None
        }

    except Exception as e:
        # Generated code can fail in arbitrary ways; report instead of raising
        # so the remaining requirements still run.
        tb = traceback.format_exc()
        print(f"[DEBUG] Req {requirement_num} FAILED: {str(e)}")
        print(f"[DEBUG] Traceback:\n{tb}")
        return {
            "success": False,
            "result": None,
            "error": str(e)
        }
371
+
372
+
373
def execute_all_requirements(
    requirements: list,
    acq: pd.DataFrame,
    perf: pd.DataFrame,
    master_df: pd.DataFrame
):
    """
    Run every requirement in order and collect outcomes plus a text summary.

    Returns:
        (all_results, context_text): a list with one record per requirement
        (sequence, title, description, code, success, execution_success,
        result, error) and a string concatenating each result or error.
    """

    print(f"[DEBUG] Starting execution of {len(requirements)} requirements")

    records = []
    context_chunks = []

    for seq, requirement in enumerate(requirements, 1):
        source = requirement.get("code", "")
        summary = requirement.get("description", "")
        heading = requirement.get("title", f"Analysis {seq}")

        outcome = execute_requirement_code(source, acq, perf, master_df, seq)
        succeeded = outcome["success"]

        record = {
            "sequence": seq,
            "title": heading,
            "description": summary,
            "code": source,
            # "success" is what app.py checks via res.get("success")
            # "execution_success" kept for backward compatibility
            "success": succeeded,
            "execution_success": succeeded,
            "result": outcome["result"],
            "error": outcome.get("error"),
        }
        records.append(record)

        # Build context for interpretation
        if succeeded:
            rendered = str(outcome["result"])
            context_chunks.append(f"\nAnalysis {seq} ({heading}):\n{rendered}\n")
        else:
            failure = outcome["error"]
            context_chunks.append(f"\nAnalysis {seq} ({heading}) FAILED:\n{failure}\n")

    return records, "".join(context_chunks)
414
+
415
+
416
+ # ---------------------------------------------------
417
+ # STEP 3: RESULT INTERPRETATION
418
+ # ---------------------------------------------------
419
+
420
def interpret_all_results(
    question: str,
    all_results: list,
    context_text: str
):
    """
    Have the LLM write a holistic interpretation of all analysis results.

    Each entry of ``all_results`` is rendered (title, description, then its
    result or execution error) into the prompt. ``context_text`` is accepted
    but not used in the prompt.

    Returns:
        The model's interpretation text.
    """

    succeeded_count = len([entry for entry in all_results if entry.get("success")])
    print(f"[DEBUG] Interpreting results for {len(all_results)} analyses")
    print(f"[DEBUG] Successful executions: {succeeded_count}")

    client = _get_hf_client()

    # Format all analyses
    rule = "=" * 60
    sections = []
    for entry in all_results:
        sections.append(f"\n{rule}\n")
        sections.append(f"Analysis {entry['sequence']}: {entry['title']}\n")
        sections.append(f"Description: {entry['description']}\n")
        sections.append(f"{rule}\n")
        if entry["success"]:
            rendered = str(entry["result"])
            sections.append(f"Result:\n{rendered}\n")
        else:
            sections.append(f"Execution Error: {entry['error']}\n")
    analyses_text = "".join(sections)

    prompt_parts = [
        "You are a senior retail credit risk analyst with 15+ years of portfolio management experience.\n\n",
        "Your task:\n",
        "Synthesize the analytical results and provide comprehensive risk insights.\n\n",
        "Focus on:\n",
        "- Key findings and patterns across all analyses\n",
        "- Risk deterioration or improvement trends\n",
        "- Vintage/segment concentration issues and implications\n",
        "- Root causes of observed patterns\n",
        "- Unusual trends, anomalies, or red flags\n",
        "- Actionable recommendations for portfolio management\n",
        "- Comparative risk assessment (which segments/vintages are most/least risky)\n\n",
        "Guidelines:\n",
        "- Be analytical and specific (not generic)\n",
        "- Focus on business implications, not just statistics\n",
        "- Avoid repeating raw tables; interpret the meaning\n",
        "- Provide 3-5 key insights\n",
        "- Suggest next investigative steps if needed\n\n",
        "User's Original Question:\n",
        question,
        "\n\n",
        "Analyses Performed:\n",
        analyses_text,
        "\n\n",
        "Provide your senior analyst interpretation:",
    ]
    prompt = "".join(prompt_parts)

    system_message = {
        "role": "system",
        "content": "You are a senior credit risk analyst providing executive insights from portfolio analytics."
    }
    user_message = {"role": "user", "content": prompt}

    response = client.chat.completions.create(
        model=HF_MODEL_ID,
        messages=[system_message, user_message],
        max_tokens=1024,
        temperature=0.3,
        top_p=0.95
    )

    # Fall back to the raw object's string form when no choices are exposed.
    if hasattr(response, "choices"):
        return response.choices[0].message.content
    return str(response)
496
+
497
+
498
+ # ---------------------------------------------------
499
+ # MASTER ORCHESTRATOR FUNCTION
500
+ # ---------------------------------------------------
501
+
502
def run_deep_dive_analysis(
    question: str,
    acq: pd.DataFrame,
    perf: pd.DataFrame,
    master_df: pd.DataFrame
):
    """
    Orchestrate the full deep-dive pipeline for one question.

    Steps:
      1. Ask the LLM for structured requirements (with code).
      2. Keep at most the first three requirements.
      3. Execute each requirement's code in sequence.
      4. Ask the LLM to interpret the combined results.

    Returns:
        dict with keys "success", "question", "requirements", "all_results",
        "interpretation" and "error". When requirement generation fails the
        lists are empty and "error" carries the reason.
    """

    print(f"\n[DEEP DIVE START] Question: {question}")
    print(f"[DEBUG] Data shapes - Acq: {acq.shape}, Perf: {perf.shape}, Master: {master_df.shape}")

    # Step 1: Generate requirements
    print("[DEBUG] Step 1: Generating requirements...")
    generated = generate_analysis_requirements(question, acq, perf, master_df)

    if not generated["success"]:
        reason = generated["error"]
        return dict(
            success=False,
            question=question,
            requirements=[],
            all_results=[],
            interpretation=f"Failed to generate requirements: {reason}",
            error=reason,
        )

    # Cap at 3
    selected = generated["requirements"][:3]

    # Step 2 & 3: Execute all requirements
    print(f"[DEBUG] Step 2-3: Executing {len(selected)} requirements...")
    executed, context = execute_all_requirements(selected, acq, perf, master_df)

    # Step 4: Interpret results
    print("[DEBUG] Step 4: Interpreting all results...")
    narrative = interpret_all_results(question, executed, context)
    print("[DEEP DIVE END] Analysis complete\n")

    return dict(
        success=True,
        question=question,
        requirements=selected,
        all_results=executed,
        interpretation=narrative,
        error=None,
    )