GenAICoder committed on
Commit
8b3f63b
·
verified ·
1 Parent(s): 8af1065

Create deep_dive_agentic.py

Browse files
Files changed (1) hide show
  1. analytics/deep_dive_agentic.py +377 -0
analytics/deep_dive_agentic.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # deep_dive_agentic.py
2
+
3
+ """
4
+ Agentic analytical code generation + execution engine using Hugging Face
5
+
6
+ FLOW:
7
+ User Question
8
+
9
+ LLM generates pandas code
10
+
11
+ Python executes code safely
12
+
13
+ LLM interprets results
14
+
15
+ Return code + interpretation
16
+
17
+ Environment:
18
+ export HUGGINGFACE_API_TOKEN=...
19
+ """
20
+
21
+ # ---------------------------------------------------
22
+ # IMPORTS
23
+ # ---------------------------------------------------
24
+
25
+ import pandas as pd
26
+ import json
27
+ import os
28
+ import re
29
+
30
+ try:
31
+ from huggingface_hub import InferenceClient
32
+ except ImportError as exc:
33
+ raise ImportError(
34
+ "huggingface_hub is required. Install with `pip install huggingface-hub`."
35
+ ) from exc
36
+
37
+ from analytics.performance_analysis import generate_metric_view
38
+
39
+ # ---------------------------------------------------
40
+ # HF CONFIG
41
+ # ---------------------------------------------------
42
+
43
# Model queried for both code generation and interpretation; override it by
# setting the HF_MODEL_ID environment variable.
HF_MODEL_ID = os.getenv("HF_MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")

# API token, read once when this module is imported; None when
# HUGGINGFACE_API_TOKEN is not set in the environment.
HF_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
45
+
46
+
47
+ # ---------------------------------------------------
48
+ # HELPER: GET INFERENCE CLIENT
49
+ # ---------------------------------------------------
50
+
51
def _get_hf_client():
    """
    Build an ``InferenceClient`` authenticated with the module-level HF_TOKEN.

    Raises:
        RuntimeError: if HF_TOKEN is empty or unset.
    """
    if HF_TOKEN:
        return InferenceClient(token=HF_TOKEN)

    raise RuntimeError(
        "HUGGINGFACE_API_TOKEN is required. Set it in your environment."
    )
57
+
58
+
59
+ # ---------------------------------------------------
60
+ # HELPER: EXTRACT JSON FROM LLM RESPONSE
61
+ # ---------------------------------------------------
62
+
63
+ def _extract_json(text: str):
64
+ match = re.search(r"\{.*\}", text, re.S)
65
+ if not match:
66
+ return None
67
+ payload = match.group(0)
68
+ try:
69
+ return json.loads(payload)
70
+ except json.JSONDecodeError:
71
+ try:
72
+ cleaned = re.sub(r"[\n\r]+", " ", payload)
73
+ cleaned = re.sub(r"(['\"])?([a-zA-Z0-9_]+)(['\"])?\s*:\s*", r'"\2": ', cleaned)
74
+ return json.loads(cleaned)
75
+ except Exception:
76
+ return None
77
+
78
+
79
+ # ---------------------------------------------------
80
+ # STEP 1: CODE GENERATION
81
+ # ---------------------------------------------------
82
+
83
def generate_analysis_requirements(question: str, acq: pd.DataFrame, perf: pd.DataFrame, master_df: pd.DataFrame):
    """
    Ask the LLM to break a question into 1-3 structured analytics requirements.
    Each requirement includes a description and executable pandas code.

    Args:
        question: The user's analytical question; appended verbatim to the prompt.
        acq, perf, master_df: Accepted for interface compatibility. They are not
            read here; the schema sent to the model is the hard-coded column
            list below, not the frames' actual columns.

    Returns:
        On success: {"success": True, "requirements": [dict, ...], "error": None}.
        On failure: {"success": False, "requirements": [], "error": str,
        "raw_response": str}. Failure covers a response with no decodable JSON
        object and a "requirements" value that is missing, not a list, or
        contains no dict entries.

    Raises:
        RuntimeError: propagated from _get_hf_client() when no token is set.
        Errors raised by the inference call itself are not caught here.
    """

    client = _get_hf_client()

    # Build detailed column descriptions
    acq_cols = {
        "account_id": "unique account identifier",
        "booking_date": "when account was originated",
        "booking_vintage": "year-month of origination (YYYY-MM)",
        "fico_band": "FICO score bracket (e.g., 700-750, 750-800)",
        "sourcing_channel": "acquisition channel (e.g., Online, Branch, Broker)",
        "city_tier": "city classification (Tier-1, Tier-2, Tier-3)",
        "occupation_type": "borrower occupation category",
        "credit_limit": "approved credit line amount"
    }

    perf_cols = {
        "account_id": "unique account identifier",
        "reporting_month": "month of performance observation (YYYY-MM)",
        "mob": "months on books (age of account in months)",
        "dpd": "days past due (0, 30, 60, 90+)",
        "balance": "current outstanding balance",
        "ncl_amount": "net charge-off amount (dollars)",
        "payment": "payment amount in period"
    }

    prompt = (
        "You are a senior retail credit risk analyst with 15+ years of portfolio management experience.\n\n"
        "Your task:\n"
        "1. Analyze the user's analytical question deeply\n"
        "2. Determine 1-3 specific analytics requirements needed to fully answer the question\n"
        "3. For EACH requirement, generate executable pandas code\n"
        "4. Return ONLY valid JSON, no other text\n\n"

        "AVAILABLE DATA:\n"
        "- acq: acquisition data with columns: " + ", ".join(acq_cols.keys()) + "\n"
        "- perf: performance data with columns: " + ", ".join(perf_cols.keys()) + "\n"
        "- master_df: merged acq+perf, includes all above columns\n\n"

        "COLUMN DESCRIPTIONS:\n"
        "Acquisition (acq):\n"
        + "\n".join([f" - {k}: {v}" for k, v in acq_cols.items()]) + "\n\n"
        "Performance (perf):\n"
        + "\n".join([f" - {k}: {v}" for k, v in perf_cols.items()]) + "\n\n"

        "Available Risk Metrics via generate_metric_view(df, metric_name, group_col):\n"
        " - 30+@3 (30+ dpd at 3 months)\n"
        " - 30+@6 (30+ dpd at 6 months)\n"
        " - 60+@6 (60+ dpd at 6 months)\n"
        " - Yr1 NCL (Year 1 net charge-off rate)\n\n"

        "CODE GENERATION RULES:\n"
        "- Generate pandas code ONLY\n"
        "- Use meaningful variable names (e.g., vintage_analysis, segment_summary)\n"
        "- Store analysis in a variable (e.g., result_1, result_2, result_3)\n"
        "- Focus on GROUP BY aggregations for insights\n"
        "- Calculate rates as dollars/total (percentage)\n"
        "- Sort by risk metrics (descending) to identify worst segments\n"
        "- Add brief comments for clarity\n"
        "- NO markdown, NO explanations outside JSON\n\n"

        "JSON STRUCTURE:\n"
        "{\n"
        ' "requirements": [\n'
        ' {\n'
        ' "sequence": 1,\n'
        ' "title": "Analysis title",\n'
        ' "description": "What this code does and why it matters",\n'
        ' "code": "pandas code here"\n'
        " }\n"
        " ]\n"
        "}\n\n"

        "User Question:\n" + question
    )

    messages = [
        {"role": "system", "content": "You are a senior credit risk analyst who generates pandas code for portfolio analytics. Return ONLY valid JSON."},
        {"role": "user", "content": prompt}
    ]

    response = client.chat.completions.create(
        model=HF_MODEL_ID,
        messages=messages,
        max_tokens=2048,
        temperature=0.1,
        top_p=0.95
    )

    if hasattr(response, 'choices'):
        # The message content may come back as None (NOTE(review): confirm
        # against the client's response schema); normalise to "" so the
        # parser receives a string and the failure path is taken cleanly.
        response_text = response.choices[0].message.content or ""
    else:
        response_text = str(response)

    # Extract JSON
    spec = _extract_json(response_text)

    # Only a non-empty list of dict entries is usable downstream: the
    # orchestrator slices this value and reads each entry with .get().
    requirements = spec.get("requirements") if isinstance(spec, dict) else None
    if isinstance(requirements, list):
        requirements = [entry for entry in requirements if isinstance(entry, dict)]
    else:
        requirements = []

    if not requirements:
        return {
            "success": False,
            "requirements": [],
            "error": "Failed to parse requirements from LLM response",
            "raw_response": response_text
        }

    return {
        "success": True,
        "requirements": requirements,
        "error": None
    }
196
+
197
+
198
+ # ---------------------------------------------------
199
+ # STEP 2: CODE EXECUTION (LOOPED)
200
+ # ---------------------------------------------------
201
+
202
def execute_requirement_code(code: str, acq: pd.DataFrame, perf: pd.DataFrame, master_df: pd.DataFrame, requirement_num: int):
    """
    Execute generated pandas code for a single requirement and collect its result.

    SECURITY: ``code`` is LLM-generated text run with ``exec`` inside this
    process, with full builtins and no sandbox. Failures are reported rather
    than raised, but this is NOT isolation: the code can import modules, touch
    the filesystem, and mutate the DataFrames passed in. Run it only in an
    environment where that is acceptable.

    Args:
        code: Python source to run.
        acq, perf, master_df: Exposed to the code under those names, along
            with ``pd`` and ``generate_metric_view``.
        requirement_num: 1-based index used to look up ``result_<n>``.

    Returns:
        {"success": True, "result": value, "error": None} when the code runs.
        ``value`` is the first of ``result_<n>``, ``final_result``, ``result``
        that the code defined, or the string "No result generated" when it
        defined none of them.
        {"success": False, "result": None, "error": str} when the code raises.
    """

    namespace = {
        "pd": pd,
        "acq": acq,
        "perf": perf,
        "master_df": master_df,
        "generate_metric_view": generate_metric_view
    }

    try:
        # A single dict serves as both globals and locals. With separate
        # dicts, exec runs the code as if it were a class body, so functions
        # and lambdas defined by the code resolve names against the (empty)
        # globals and fail with NameError on pd / acq / perf / master_df.
        exec(code, namespace)
    except Exception as e:
        return {
            "success": False,
            "result": None,
            "error": str(e)
        }

    # Look for result variables in priority order.
    for candidate in (f"result_{requirement_num}", "final_result", "result"):
        if candidate in namespace:
            return {
                "success": True,
                "result": namespace[candidate],
                "error": None
            }

    return {
        "success": True,
        "result": "No result generated",
        "error": None
    }
232
+
233
+
234
def execute_all_requirements(requirements: list, acq: pd.DataFrame, perf: pd.DataFrame, master_df: pd.DataFrame):
    """
    Run every requirement's code in order and gather the outcomes.

    Returns:
        A pair ``(records, context)``: ``records`` holds one dict per
        requirement (sequence, title, description, code, execution_success,
        result, error) and ``context`` is the concatenated plain-text summary
        of each outcome.
    """

    records = []
    context_chunks = []

    for position, requirement in enumerate(requirements, start=1):
        snippet = requirement.get("code", "")
        summary = requirement.get("description", "")
        heading = requirement.get("title", f"Analysis {position}")

        outcome = execute_requirement_code(snippet, acq, perf, master_df, position)

        records.append({
            "sequence": position,
            "title": heading,
            "description": summary,
            "code": snippet,
            "execution_success": outcome["success"],
            "result": outcome["result"],
            "error": outcome.get("error")
        })

        # One text chunk per requirement, successful or not.
        if outcome["success"]:
            chunk = f"\nAnalysis {position} ({heading}):\n{str(outcome['result'])}\n"
        else:
            chunk = f"\nAnalysis {position} ({heading}) FAILED:\n{outcome['error']}\n"
        context_chunks.append(chunk)

    return records, "".join(context_chunks)
266
+
267
+
268
+ # ---------------------------------------------------
269
+ # STEP 3: RESULT INTERPRETATION
270
+ # ---------------------------------------------------
271
+
272
def interpret_all_results(question: str, all_results: list, context_text: str):
    """
    Have the LLM, prompted as a senior risk analyst, interpret every result together.

    ``all_results`` entries are read for their sequence, title, description,
    execution_success, result and error keys. ``context_text`` is accepted
    but not used when building the prompt.

    Returns:
        The message content of the first completion choice, or ``str(response)``
        when the response has no ``choices`` attribute.
    """

    client = _get_hf_client()

    # Render each analysis as a banner followed by its result or its error.
    sections = []
    for entry in all_results:
        banner = (
            f"\n{'='*60}\n"
            + f"Analysis {entry['sequence']}: {entry['title']}\n"
            + f"Description: {entry['description']}\n"
            + f"{'='*60}\n"
        )
        if entry['execution_success']:
            outcome = f"Result:\n{str(entry['result'])}\n"
        else:
            outcome = f"Execution Error: {entry['error']}\n"
        sections.append(banner + outcome)
    analyses_text = "".join(sections)

    briefing = (
        "You are a senior retail credit risk analyst with 15+ years of portfolio management experience.\n\n"
        "Your task:\n"
        "Synthesize the analytical results and provide comprehensive risk insights.\n\n"

        "Focus on:\n"
        "- Key findings and patterns across all analyses\n"
        "- Risk deterioration or improvement trends\n"
        "- Vintage/segment concentration issues and implications\n"
        "- Root causes of observed patterns\n"
        "- Unusual trends, anomalies, or red flags\n"
        "- Actionable recommendations for portfolio management\n"
        "- Comparative risk assessment (which segments/vintages are most/least risky)\n\n"

        "Guidelines:\n"
        "- Be analytical and specific (not generic)\n"
        "- Focus on business implications, not just statistics\n"
        "- Avoid repeating raw tables; interpret the meaning\n"
        "- Provide 3-5 key insights\n"
        "- Suggest next investigative steps if needed\n\n"

        "User's Original Question:\n"
    )
    prompt = (
        briefing
        + question
        + "\n\n"
        + "Analyses Performed:\n"
        + analyses_text
        + "\n\n"
        + "Provide your senior analyst interpretation:"
    )

    system_message = {"role": "system", "content": "You are a senior credit risk analyst providing executive insights from portfolio analytics."}
    user_message = {"role": "user", "content": prompt}

    response = client.chat.completions.create(
        model=HF_MODEL_ID,
        messages=[system_message, user_message],
        max_tokens=1024,
        temperature=0.3,
        top_p=0.95
    )

    if hasattr(response, 'choices'):
        return response.choices[0].message.content
    return str(response)
334
+
335
+
336
+ # ---------------------------------------------------
337
+ # MASTER ORCHESTRATOR FUNCTION
338
+ # ---------------------------------------------------
339
+
340
def run_deep_dive_analysis(question: str, acq: pd.DataFrame, perf: pd.DataFrame, master_df: pd.DataFrame):
    """
    Orchestrate the full deep-dive pipeline for one question:
    1. Ask the LLM for structured requirements (with code)
    2. Keep at most the first three requirements
    3. Execute each requirement's code in order
    4. Ask the LLM to interpret the combined results

    Returns:
        A dict with keys success, question, requirements, all_results,
        interpretation and error. When requirement generation fails, the
        lists are empty and ``interpretation`` carries the failure message.
    """

    plan = generate_analysis_requirements(question, acq, perf, master_df)

    if plan["success"]:
        selected = plan["requirements"][:3]  # Cap at 3

        executed, context = execute_all_requirements(selected, acq, perf, master_df)
        narrative = interpret_all_results(question, executed, context)

        return {
            "success": True,
            "question": question,
            "requirements": selected,
            "all_results": executed,
            "interpretation": narrative,
            "error": None
        }

    # Requirement generation failed: report it without running anything.
    reason = plan["error"]
    return {
        "success": False,
        "question": question,
        "requirements": [],
        "all_results": [],
        "interpretation": f"Failed to generate requirements: {reason}",
        "error": reason
    }