GenAICoder committed on
Commit
cb983ab
·
verified ·
1 Parent(s): 1b8bef5

Create deep_dive_agentic.py

Browse files
Files changed (1) hide show
  1. analytics/deep_dive_agentic.py +599 -0
analytics/deep_dive_agentic.py ADDED
@@ -0,0 +1,599 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # deep_dive_agentic.py
2
+
3
+ """
4
+ Agentic analytical code generation + execution engine using Hugging Face
5
+
6
+ FLOW:
7
+ User Question
8
+
9
+ LLM generates pandas code
10
+
11
+ Python executes code safely
12
+
13
+ LLM interprets results
14
+
15
+ Return code + interpretation
16
+
17
+ Environment:
18
+ export HUGGINGFACE_API_TOKEN=...
19
+ """
20
+
21
+ # ---------------------------------------------------
22
+ # IMPORTS
23
+ # ---------------------------------------------------
24
+
25
+ import pandas as pd
26
+ import json
27
+ import os
28
+ import re
29
+
30
+ try:
31
+ from huggingface_hub import InferenceClient
32
+ except ImportError as exc:
33
+ raise ImportError(
34
+ "huggingface_hub is required. Install with `pip install huggingface-hub`."
35
+ ) from exc
36
+
37
+ from analytics.performance_analysis import generate_metric_view
38
+
39
+ # ---------------------------------------------------
40
+ # HF CONFIG
41
+ # ---------------------------------------------------
42
+
43
# Model identifier passed to the chat-completion calls below; can be
# overridden through the HF_MODEL_ID environment variable.
HF_MODEL_ID = os.getenv("HF_MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")
# API token, read once when the module is imported (None when unset).
HF_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
45
+
46
+
47
+ # ---------------------------------------------------
48
+ # HELPER: GET INFERENCE CLIENT
49
+ # ---------------------------------------------------
50
+
51
def _get_hf_client():
    """
    Build an ``InferenceClient`` authenticated with the module-level token.

    Raises:
        RuntimeError: when ``HF_TOKEN`` is empty or unset.
    """
    if HF_TOKEN:
        return InferenceClient(token=HF_TOKEN)

    raise RuntimeError(
        "HUGGINGFACE_API_TOKEN is required. Set it in your environment."
    )
57
+
58
+
59
+ # ---------------------------------------------------
60
+ # HELPER: EXTRACT JSON FROM LLM RESPONSE
61
+ # ---------------------------------------------------
62
+
63
+ def _extract_json(text: str):
64
+
65
+ # Remove markdown fences
66
+ text = text.replace("```json", "")
67
+ text = text.replace("```", "")
68
+
69
+ match = re.search(r"\{.*\}", text, re.S)
70
+
71
+ if not match:
72
+ return None
73
+
74
+ payload = match.group(0)
75
+
76
+ try:
77
+
78
+ return json.loads(payload)
79
+
80
+ except json.JSONDecodeError:
81
+
82
+ try:
83
+
84
+ cleaned = re.sub(
85
+ r"[\n\r]+",
86
+ " ",
87
+ payload
88
+ )
89
+
90
+ cleaned = re.sub(
91
+ r"(['\"])?([a-zA-Z0-9_]+)(['\"])?\s*:\s*",
92
+ r'"\2": ',
93
+ cleaned
94
+ )
95
+
96
+ return json.loads(cleaned)
97
+
98
+ except Exception:
99
+
100
+ return None
101
+
102
+
103
+ # ---------------------------------------------------
104
+ # HELPER: FIX COMMON PANDAS COMPATIBILITY ISSUES
105
+ # ---------------------------------------------------
106
+
107
+ def _fix_pandas_compatibility(code: str):
108
+ """
109
+ Fix common pandas API compatibility issues in generated code.
110
+ Handles version differences between pandas versions.
111
+ """
112
+ # Fix: .reset_index(name=...) -> .reset_index(names=[...])
113
+ code = re.sub(
114
+ r"\.reset_index\(name=(['\"])([^'\"]+)\1\)",
115
+ r".reset_index(names=[\1\2\1])",
116
+ code
117
+ )
118
+
119
+ # Fix: .reset_index(name= with variable
120
+ code = re.sub(
121
+ r"\.reset_index\(name=([a-zA-Z_][a-zA-Z0-9_]*)\)",
122
+ r".reset_index(names=[\1])",
123
+ code
124
+ )
125
+
126
+ return code
127
+
128
+
129
+ # ---------------------------------------------------
130
+ # STEP 1: CODE GENERATION
131
+ # ---------------------------------------------------
132
+
133
def generate_analysis_requirements(question: str, acq: pd.DataFrame, perf: pd.DataFrame, master_df: pd.DataFrame):
    """
    Ask the LLM to break a question into 1-3 analytics requirements.

    Each requirement is expected to carry a title, a description and pandas
    code written against the dataframe names ``acq``, ``perf`` and
    ``master_df``.

    Args:
        question: The user's analytical question; appended to the prompt.
        acq, perf, master_df: Accepted for interface symmetry with the other
            pipeline steps. They are not read here; the prompt uses the
            hard-coded column descriptions below.

    Returns:
        A dict with keys ``success`` (bool), ``requirements`` (list of dicts)
        and ``error`` (str or None). Failure results also carry
        ``raw_response`` with the model output.

    Raises:
        RuntimeError: propagated from ``_get_hf_client`` when no token is set.
    """

    client = _get_hf_client()

    # Build detailed column descriptions
    acq_cols = {
        "account_id": "unique account identifier",
        "booking_date": "when account was originated",
        "booking_vintage": "year-month of origination (YYYY-MM)",
        "fico_band": "FICO score bracket (e.g., 700-750, 750-800)",
        "sourcing_channel": "acquisition channel (e.g., Online, Branch, Broker)",
        "city_tier": "city classification (Tier-1, Tier-2, Tier-3)",
        "occupation_type": "borrower occupation category",
        "credit_limit": "approved credit line amount"
    }

    perf_cols = {
        "account_id": "unique account identifier",
        "reporting_month": "month of performance observation (YYYY-MM)",
        "mob": "months on books (age of account in months)",
        "dpd": "days past due (0, 30, 60, 90+)",
        "balance": "current outstanding balance",
        "ncl_amount": "net charge-off amount (dollars)",
        "payment": "payment amount in period"
    }

    acq_column_list = ", ".join(acq_cols)
    perf_column_list = ", ".join(perf_cols)
    acq_column_details = "\n".join(f" - {k}: {v}" for k, v in acq_cols.items())
    perf_column_details = "\n".join(f" - {k}: {v}" for k, v in perf_cols.items())

    prompt = (
        "You are a senior retail credit risk analyst with 15+ years of portfolio management experience.\n\n"
        "Your task:\n"
        "1. Analyze the user's analytical question deeply\n"
        "2. Determine 1-3 specific analytics requirements needed to fully answer the question\n"
        "3. For EACH requirement, generate executable pandas code\n"
        "4. Return ONLY valid JSON, no other text\n\n"

        "AVAILABLE DATA:\n"
        "- acq: acquisition data with columns: " + acq_column_list + "\n"
        "- perf: performance data with columns: " + perf_column_list + "\n"
        "- master_df: merged acq+perf, includes all above columns\n\n"

        "COLUMN DESCRIPTIONS:\n"
        "Acquisition (acq):\n"
        + acq_column_details + "\n\n"
        "Performance (perf):\n"
        + perf_column_details + "\n\n"

        "Available Risk Metrics via generate_metric_view(df, metric_name, group_col):\n"
        " - 30+@3 (30+ dpd at 3 months)\n"
        " - 30+@6 (30+ dpd at 6 months)\n"
        " - 60+@6 (60+ dpd at 6 months)\n"
        " - Yr1 NCL (Year 1 net charge-off rate)\n\n"

        "CODE GENERATION RULES:\n"
        "- Generate pandas code ONLY\n"
        "- Use ONLY these dataframe names exactly:\n"
        " - acq\n"
        " - perf\n"
        " - master_df\n\n"

        "- NEVER:\n"
        " - use pd.read_csv\n"
        " - read files\n"
        " - import libraries\n"
        " - use open()\n"
        " - create fake dataframe names\n\n"

        "- Use meaningful variable names\n"
        "- Store final outputs in variables like:\n"
        " result_1, result_2, result_3\n\n"

        "- Focus on GROUP BY aggregations\n"
        "- Calculate rates as dollars/total (percentage)\n"
        "- Sort by risk metrics descending\n"
        "- Add brief comments for clarity\n"
        "- Return executable python only inside JSON\n"
        "- NO markdown\n"
        "- NO explanations outside JSON\n\n"

        "JSON STRUCTURE:\n"
        "{\n"
        ' "requirements": [\n'
        ' {\n'
        ' "sequence": 1,\n'
        ' "title": "Analysis title",\n'
        ' "description": "What this code does and why it matters",\n'
        ' "code": "pandas code here"\n'
        " }\n"
        " ]\n"
        "}\n\n"

        "User Question:\n" + question
    )

    messages = [
        {"role": "system", "content": "You are a senior credit risk analyst who generates pandas code for portfolio analytics. Return ONLY valid JSON."},
        {"role": "user", "content": prompt}
    ]

    response = client.chat.completions.create(
        model=HF_MODEL_ID,
        messages=messages,
        max_tokens=2048,
        temperature=0.1,
        top_p=0.95
    )

    if hasattr(response, "choices"):
        response_text = response.choices[0].message.content
    else:
        response_text = str(response)

    # Guard against a missing (None) message content so the slicing and
    # parsing below cannot raise.
    if not isinstance(response_text, str):
        response_text = "" if response_text is None else str(response_text)

    # Extract JSON
    spec = _extract_json(response_text) if response_text else None

    if not spec or not isinstance(spec, dict):
        return {
            "success": False,
            "requirements": [],
            "error": f"Failed to parse JSON from LLM response: {response_text[:200]}",
            "raw_response": response_text
        }

    requirements = spec.get("requirements", [])

    # Downstream code slices the list and calls .get() on each entry, so
    # normalise the shape: a lone object becomes a one-item list and
    # anything that is not a dict is dropped.
    if isinstance(requirements, dict):
        requirements = [requirements]
    if not isinstance(requirements, list):
        requirements = []
    requirements = [req for req in requirements if isinstance(req, dict)]

    if not requirements:
        return {
            "success": False,
            "requirements": [],
            "error": f"LLM returned no requirements. Response keys: {list(spec.keys())}",
            "raw_response": response_text[:300]
        }

    print(f"[DEBUG] Generated {len(requirements)} requirements for question: {question[:80]}")
    for i, req in enumerate(requirements, 1):
        print(f" Req {i}: {req.get('title')}")

    return {
        "success": True,
        "requirements": requirements,
        "error": None
    }
276
+
277
+
278
+ # ---------------------------------------------------
279
+ # STEP 2: CODE EXECUTION (LOOPED)
280
+ # ---------------------------------------------------
281
+
282
+ # ---------------------------------------------------
283
+ # STEP 2: CODE EXECUTION (UPDATED + SAFER)
284
+ # ---------------------------------------------------
285
+
286
def validate_generated_code(code: str):
    """
    Reject generated code that contains a blocked operation.

    Matching is case-insensitive and anchored on word boundaries, so module
    names are only flagged when they appear as standalone identifiers.
    Ordinary variables that merely end in a blocked name (``loss_ratios.``,
    ``pos.``) are accepted, while ``from os import ...`` and variable
    whitespace (``os .system``, ``open (``) are caught.

    This is a best-effort blocklist, not a sandbox: it cannot detect every
    way of reaching the filesystem or the interpreter.

    Args:
        code: Source text of the generated snippet.

    Returns:
        ``(True, None)`` when nothing is blocked, otherwise
        ``(False, label)`` where ``label`` names the first rule that matched,
        in the order listed below.
    """
    # (label reported to the caller, regex that detects it)
    blocked_patterns = (
        ("import os", r"\b(?:import|from)\s+os\b"),
        ("import subprocess", r"\b(?:import|from)\s+subprocess\b"),
        ("import shutil", r"\b(?:import|from)\s+shutil\b"),
        ("open(", r"\bopen\s*\("),
        ("__import__", r"__import__"),
        ("exec(", r"\bexec\s*\("),
        ("eval(", r"\beval\s*\("),
        ("pd.read_csv", r"\bpd\s*\.\s*read_csv"),
        ("to_csv", r"to_csv"),
        ("to_excel", r"to_excel"),
        ("os.", r"\bos\s*\."),
        ("sys.", r"\bsys\s*\."),
        ("subprocess.", r"\bsubprocess\s*\."),
        ("shutil.", r"\bshutil\s*\."),
    )

    for label, regex in blocked_patterns:
        if re.search(regex, code, re.IGNORECASE):
            return False, label

    return True, None
315
+
316
+
317
def execute_requirement_code(code: str, acq: pd.DataFrame, perf: pd.DataFrame, master_df: pd.DataFrame, requirement_num: int):
    """
    Execute the generated pandas code for a single requirement.

    The code is stripped of markdown fences, passed through
    ``_fix_pandas_compatibility`` and ``validate_generated_code``, then run
    with ``exec`` in a scope exposing ``pd``, ``generate_metric_view``,
    ``acq``, ``perf`` and ``master_df``. The result is read from the first of
    ``result_<requirement_num>``, ``final_result`` or ``result`` that the
    code defined.

    Args:
        code: Generated source text. A non-string or blank value is reported
            as a failure instead of raising.
        acq, perf, master_df: Dataframes exposed to the code under those
            names. They are passed by reference, so generated code can
            mutate them.
        requirement_num: 1-based index, used for logging and for the
            ``result_<n>`` variable lookup.

    Returns:
        A dict with ``success`` (bool), ``result`` and ``error``. DataFrame
        and Series results are truncated to their first 20 rows. When the
        code runs but defines no result variable, ``result`` is the string
        ``"No result generated"`` and ``success`` is still True.
    """

    print(f"\n[DEBUG] Requirement {requirement_num} RAW CODE:\n")
    print(code)

    # A null or blank "code" field would otherwise raise on .replace() or
    # silently "succeed" with nothing executed.
    if not isinstance(code, str) or not code.strip():
        return {
            "success": False,
            "result": None,
            "error": "No executable code was provided for this requirement"
        }

    # ---------------------------------------------------
    # CLEAN CODE
    # ---------------------------------------------------

    code = code.replace("```python", "").replace("```", "")
    code = _fix_pandas_compatibility(code)

    # ---------------------------------------------------
    # VALIDATE CODE
    # ---------------------------------------------------

    is_valid, blocked_pattern = validate_generated_code(code)

    if not is_valid:
        print(
            f"[DEBUG] Blocked unsafe pattern: {blocked_pattern}"
        )
        return {
            "success": False,
            "result": None,
            "error": f"Blocked unsafe code pattern: {blocked_pattern}"
        }

    # ---------------------------------------------------
    # UNIFIED EXECUTION SCOPE
    # ---------------------------------------------------

    # SECURITY NOTE(review): the snippet is model-generated and runs with the
    # full set of builtins; the blocklist above is the only barrier. Consider
    # a restricted builtins mapping or an isolated process.
    execution_scope = {
        "pd": pd,
        "generate_metric_view": generate_metric_view,
        "acq": acq,
        "perf": perf,
        "master_df": master_df,
        "__builtins__": __builtins__
    }

    # ---------------------------------------------------
    # EXECUTE
    # ---------------------------------------------------

    try:
        print(
            f"[DEBUG] Executing requirement {requirement_num}..."
        )

        exec(code, execution_scope)

        # ---------------------------------------------------
        # FIND RESULT VARIABLE
        # ---------------------------------------------------

        possible_result_keys = (
            f"result_{requirement_num}",
            "final_result",
            "result",
        )
        result_key = next(
            (key for key in possible_result_keys if key in execution_scope),
            None,
        )

        if result_key:
            result = execution_scope[result_key]
        else:
            result = "No result generated"

        print(
            f"[DEBUG] Req {requirement_num} success. "
            f"Result type: {type(result).__name__}"
        )

        # ---------------------------------------------------
        # LIMIT HUGE OUTPUTS
        # ---------------------------------------------------

        # Series can be as large as DataFrames, so cap both.
        if isinstance(result, (pd.DataFrame, pd.Series)):
            preview = result.head(20)
        else:
            preview = result

        return {
            "success": True,
            "result": preview,
            "error": None
        }

    except Exception as e:
        # Deliberate catch-all: any failure inside generated code is reported
        # back to the caller rather than aborting the pipeline.
        print(
            f"[DEBUG] Req {requirement_num} FAILED:"
        )
        print(str(e))

        return {
            "success": False,
            "result": None,
            "error": str(e)
        }
440
+
441
+
442
+
443
+
444
+
445
def execute_all_requirements(requirements: list, acq: pd.DataFrame, perf: pd.DataFrame, master_df: pd.DataFrame):
    """
    Run every requirement in order and collect the outcomes.

    Returns:
        A ``(all_results, context_text)`` tuple. ``all_results`` holds one
        record per requirement (sequence, title, description, code,
        execution_success, result, error). ``context_text`` is a plain-text
        digest of each outcome, with failures marked ``FAILED``.
    """

    print(f"[DEBUG] Starting execution of {len(requirements)} requirements")

    all_results = []
    context_chunks = []

    for seq, requirement in enumerate(requirements, start=1):
        code = requirement.get("code", "")
        description = requirement.get("description", "")
        title = requirement.get("title", f"Analysis {seq}")

        outcome = execute_requirement_code(code, acq, perf, master_df, seq)

        record = {
            "sequence": seq,
            "title": title,
            "description": description,
            "code": code,
            "execution_success": outcome["success"],
            "result": outcome["result"],
            "error": outcome.get("error")
        }
        all_results.append(record)

        # Build context for interpretation
        if not outcome["success"]:
            context_chunks.append(
                f"\nAnalysis {seq} ({title}) FAILED:\n{outcome['error']}\n"
            )
        else:
            context_chunks.append(
                f"\nAnalysis {seq} ({title}):\n{str(outcome['result'])}\n"
            )

    return all_results, "".join(context_chunks)
478
+
479
+
480
+ # ---------------------------------------------------
481
+ # STEP 3: RESULT INTERPRETATION
482
+ # ---------------------------------------------------
483
+
484
def interpret_all_results(question: str, all_results: list, context_text: str):
    """
    Ask the LLM for a holistic interpretation of the executed analyses.

    Args:
        question: The user's original question, echoed in the prompt.
        all_results: Records holding the keys ``sequence``, ``title``,
            ``description``, ``execution_success``, ``result`` and ``error``.
        context_text: Accepted for interface compatibility. It is not used;
            the prompt is rendered from ``all_results``.

    Returns:
        The model's reply text, or ``str(response)`` when the response object
        has no ``choices`` attribute.

    Raises:
        RuntimeError: propagated from ``_get_hf_client`` when no token is set.
    """

    print(f"[DEBUG] Interpreting results for {len(all_results)} analyses")
    # Records store the flag under "execution_success"; counting "success"
    # would always report 0.
    print(f"[DEBUG] Successful executions: {sum(1 for r in all_results if r.get('execution_success'))}")

    client = _get_hf_client()

    # Format all analyses
    separator = f"{'='*60}\n"
    sections = []
    for res in all_results:
        if res['execution_success']:
            outcome_line = f"Result:\n{str(res['result'])}\n"
        else:
            outcome_line = f"Execution Error: {res['error']}\n"

        sections.append(
            "\n" + separator
            + f"Analysis {res['sequence']}: {res['title']}\n"
            + f"Description: {res['description']}\n"
            + separator
            + outcome_line
        )
    analyses_text = "".join(sections)

    prompt = (
        "You are a senior retail credit risk analyst with 15+ years of portfolio management experience.\n\n"
        "Your task:\n"
        "Synthesize the analytical results and provide comprehensive risk insights.\n\n"

        "Focus on:\n"
        "- Key findings and patterns across all analyses\n"
        "- Risk deterioration or improvement trends\n"
        "- Vintage/segment concentration issues and implications\n"
        "- Root causes of observed patterns\n"
        "- Unusual trends, anomalies, or red flags\n"
        "- Actionable recommendations for portfolio management\n"
        "- Comparative risk assessment (which segments/vintages are most/least risky)\n\n"

        "Guidelines:\n"
        "- Be analytical and specific (not generic)\n"
        "- Focus on business implications, not just statistics\n"
        "- Avoid repeating raw tables; interpret the meaning\n"
        "- Provide 3-5 key insights\n"
        "- Suggest next investigative steps if needed\n\n"

        "User's Original Question:\n" + question + "\n\n"

        "Analyses Performed:\n" + analyses_text + "\n\n"

        "Provide your senior analyst interpretation:"
    )

    messages = [
        {"role": "system", "content": "You are a senior credit risk analyst providing executive insights from portfolio analytics."},
        {"role": "user", "content": prompt}
    ]

    response = client.chat.completions.create(
        model=HF_MODEL_ID,
        messages=messages,
        max_tokens=1024,
        temperature=0.3,
        top_p=0.95
    )

    interpretation = response.choices[0].message.content if hasattr(response, 'choices') else str(response)
    return interpretation
549
+
550
+
551
+ # ---------------------------------------------------
552
+ # MASTER ORCHESTRATOR FUNCTION
553
+ # ---------------------------------------------------
554
+
555
def run_deep_dive_analysis(question: str, acq: pd.DataFrame, perf: pd.DataFrame, master_df: pd.DataFrame):
    """
    Orchestrate the full deep-dive pipeline for one question.

    Steps:
        1. Ask the LLM for structured requirements (with code).
        2. Keep at most the first three requirements.
        3. Execute each requirement's code in sequence.
        4. Ask the LLM to interpret the combined results.

    Returns:
        A dict with ``success``, ``question``, ``requirements``,
        ``all_results``, ``interpretation`` and ``error``. When requirement
        generation fails, the lists are empty and ``interpretation`` carries
        the failure message.
    """

    print(f"\n[DEEP DIVE START] Question: {question}")
    print(f"[DEBUG] Data shapes - Acq: {acq.shape}, Perf: {perf.shape}, Master: {master_df.shape}")

    # Step 1: Generate requirements
    print("[DEBUG] Step 1: Generating requirements...")
    generation = generate_analysis_requirements(question, acq, perf, master_df)

    if generation["success"]:
        capped_requirements = generation["requirements"][:3]  # Cap at 3

        # Step 2 & 3: Execute all requirements
        print(f"[DEBUG] Step 2-3: Executing {len(capped_requirements)} requirements...")
        all_results, context_text = execute_all_requirements(
            capped_requirements, acq, perf, master_df
        )

        # Step 4: Interpret results
        print("[DEBUG] Step 4: Interpreting all results...")
        interpretation = interpret_all_results(question, all_results, context_text)
        print("[DEEP DIVE END] Analysis complete\n")

        return {
            "success": True,
            "question": question,
            "requirements": capped_requirements,
            "all_results": all_results,
            "interpretation": interpretation,
            "error": None
        }

    # Requirement generation failed: surface the reason and stop early.
    return {
        "success": False,
        "question": question,
        "requirements": [],
        "all_results": [],
        "interpretation": f"Failed to generate requirements: {generation['error']}",
        "error": generation["error"]
    }