DOMMETI committed on
Commit
78f270c
·
verified ·
1 Parent(s): 8e7badb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +256 -27
app.py CHANGED
@@ -1,45 +1,274 @@
1
  import gradio as gr
2
- import os
3
  import pandas as pd
 
 
 
 
4
  from huggingface_hub import InferenceClient
5
 
6
- # ===============================
7
- # LLM CLIENT SETUP
8
- # ===============================
9
- HF_TOKEN = os.getenv("HF")
10
- client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
11
 
12
- def analyze_excel(message, history, file):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  """
14
- Problem Statement:
15
- 1. Add your own HF token in the settings to get the LLM working.
16
- 2. Update requirements.txt, app.py as needed.
17
- 3. Develop a robust "Text-to-Code" analytical workflow.
18
-
19
- Requirements:
20
- a.Code Generation : Transform natural language user queries into executable, sandboxed Python code (specifically using pandas).
21
- b.Execution : Securely execute the generated code on the Hugging Face Space server against the uploaded dataset.
22
- c.Synthesis : Capture the raw output of the code execution and feed it back to the LLM to generate a natural language insight.
23
  """
24
- if file is None:
25
- return "Please upload an Excel file to begin."
26
-
27
- # The function needs a return here to avoid a NoneType error in Gradio
28
- return "File received! Candidate: Implement the Planner-Action-Synthesis logic here."
29
-
30
- # ===============================
31
- # UI CONFIGURATION
32
- # ===============================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
34
  gr.Markdown("# πŸ“Š Technical Assessment: Data Analysis Agent")
35
  gr.Markdown("### Objective: Build a Text-to-Code workflow using Qwen 2.5")
36
-
37
  with gr.Row():
38
  excel_file = gr.File(
39
  label="1. Upload Dataset (.xlsx)",
40
  file_types=[".xlsx"]
41
  )
42
-
43
  gr.ChatInterface(
44
  fn=analyze_excel,
45
  additional_inputs=[excel_file],
 
1
  import gradio as gr
 
2
  import pandas as pd
3
+ import traceback
4
+ import sys
5
+ import io
6
+ import re
7
  from huggingface_hub import InferenceClient
8
 
9
# ─────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────
# Model used for both code generation and insight synthesis.
MODEL_ID = "Qwen/Qwen2.5-72B-Instruct"
# Token is resolved implicitly (HF_TOKEN secret in Space settings / local env).
# NOTE(review): confirm the Space actually defines HF_TOKEN — anonymous calls
# to this model will be rate-limited or rejected.
client = InferenceClient(MODEL_ID)
14
 
15
# ─────────────────────────────────────────────
# STEP 1 — LOAD EXCEL
# ─────────────────────────────────────────────
def load_excel(file) -> pd.DataFrame:
    """Load an .xlsx upload into a DataFrame (first sheet only).

    Accepts either a Gradio file object (which exposes the temp path via
    ``.name``) or a plain path string. Column labels are stripped of
    surrounding whitespace; non-string labels (e.g. integer headers) are
    left untouched — the previous ``df.columns.str.strip()`` raised
    AttributeError on them.

    Raises whatever pandas raises for unreadable/corrupt files; the caller
    (`analyze_excel`) converts that into a chat error message.
    """
    path = getattr(file, "name", file)  # gradio File object or raw path string
    xl = pd.ExcelFile(path)
    # Multi-sheet workbooks: use the first sheet by default.
    df = xl.parse(xl.sheet_names[0])
    # Clean column names without assuming every label is a string.
    df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]
    return df
25
+
26
+
27
+ def get_df_info(df: pd.DataFrame) -> str:
28
+ """Build a compact dataset description for the LLM prompt."""
29
+ return f"""Columns & dtypes:
30
+ {df.dtypes.to_string()}
31
+
32
+ Shape: {df.shape[0]} rows x {df.shape[1]} columns
33
+
34
+ Sample (first 5 rows):
35
+ {df.head(5).to_string(index=False)}
36
+
37
+ Numeric summary:
38
+ {df.describe().to_string()}
39
+ """
40
+
41
+
42
# ─────────────────────────────────────────────
# STEP 2 — CODE GENERATION via Qwen 2.5
# ─────────────────────────────────────────────
# System prompt for the code-generation step: constrains the model to emit
# bare executable pandas code operating on the pre-loaded `df`, with the
# answer bound to `result` (the contract that `safe_execute` relies on).
CODE_GEN_SYSTEM = """You are an expert Python data analyst.
Given a dataset description and a user question, generate ONLY executable Python/pandas code.

STRICT RULES:
- The DataFrame is already loaded as variable `df`.
- Only use pandas (pd) and Python built-ins. Do NOT import anything else.
- Store your final answer in a variable called `result`.
- `result` must be a string, number, Series, or DataFrame.
- Do NOT wrap output in markdown code fences.
- Do NOT add explanations or comments — code only.
"""
56
+
57
def generate_code(question: str, df_info: str, history: list) -> str:
    """Ask Qwen 2.5 to generate pandas code answering *question* about the dataset.

    `history` is expected in Gradio "messages" format (dicts with
    role/content); the last few turns are replayed for conversational context.
    Returns the raw code string with any accidental markdown fences removed.
    """
    # System prompt + up to the last 6 chat turns for context.
    convo = [{"role": "system", "content": CODE_GEN_SYSTEM}]
    convo.extend(
        {"role": turn["role"], "content": turn["content"]}
        for turn in history[-6:]
        if turn["role"] in ("user", "assistant")
    )

    user_prompt = f"""Dataset info:
{df_info}

Question: {question}

Write the pandas code now:"""
    convo.append({"role": "user", "content": user_prompt})

    reply = client.chat_completion(
        messages=convo,
        max_tokens=600,
        temperature=0.1,
    )
    raw = reply.choices[0].message.content.strip()

    # Models sometimes fence the code anyway — strip leading/trailing fences.
    raw = re.sub(r"^```(?:python)?", "", raw, flags=re.MULTILINE).strip()
    return re.sub(r"```$", "", raw, flags=re.MULTILINE).strip()
87
+
88
+
89
# ─────────────────────────────────────────────
# STEP 3 — SANDBOXED EXECUTION
# ─────────────────────────────────────────────
# NOTE(review): substring blacklisting is best-effort hardening, NOT a real
# sandbox — determined code can still escape a restricted exec(). For truly
# untrusted multi-tenant input, run the snippet in a subprocess/container.
BLACKLIST = [
    "import os", "import sys", "subprocess", "open(",
    "__import__", "shutil", "socket", "requests",
    "eval(", "exec(", "globals(", "locals(",
    "getattr(", "setattr(", "compile(", "__builtins__",
]

def safe_execute(code: str, df: pd.DataFrame):
    """Execute LLM-generated *code* in a restricted namespace.

    The code sees only `pd`, a copy of `df`, and a whitelist of builtins.
    Returns the value bound to `result`, or captured stdout when `result`
    was never set (falling back to a fixed "no output" message).

    Raises:
        PermissionError: if the code matches a blacklisted pattern.
        Exception: whatever the executed code itself raises.
    """
    # Collapse whitespace runs so e.g. "import   os" cannot dodge the check.
    flattened = re.sub(r"\s+", " ", code)
    for pattern in BLACKLIST:
        if pattern in code or pattern in flattened:
            raise PermissionError(f"Blocked unsafe pattern: `{pattern}`")

    safe_builtins = {
        "len": len, "range": range, "print": print,
        "str": str, "int": int, "float": float,
        "list": list, "dict": dict, "tuple": tuple,
        "sum": sum, "min": min, "max": max, "round": round,
        "enumerate": enumerate, "zip": zip, "sorted": sorted,
        "isinstance": isinstance, "type": type, "abs": abs,
        "bool": bool, "set": set, "map": map, "filter": filter,
    }
    namespace = {
        "__builtins__": safe_builtins,
        "pd": pd,
        "df": df.copy(),  # copy so generated code can't mutate the caller's frame
        "result": None,
    }

    # Capture print() output as a fallback result.
    old_stdout = sys.stdout
    sys.stdout = buf = io.StringIO()
    try:
        exec(code, namespace)  # deliberate exec; namespace is restricted above
    finally:
        sys.stdout = old_stdout  # always restore, even if the code raises

    result = namespace.get("result")
    if result is None:
        result = buf.getvalue().strip() or "Code ran but produced no output."
    return result
131
+
132
+
133
def format_result(result) -> str:
    """Render an execution result of any supported type as display text."""
    if isinstance(result, pd.DataFrame):
        if result.empty:
            return "Empty DataFrame returned."
        return result.to_string(index=False)
    if isinstance(result, pd.Series):
        return result.to_string()
    return str(result)
141
+
142
+
143
# ─────────────────────────────────────────────
# STEP 4 — INSIGHT SYNTHESIS via Qwen 2.5
# ─────────────────────────────────────────────
# System prompt for the synthesis step: converts raw execution output into
# a short business-facing summary, deliberately hiding implementation detail.
SYNTHESIS_SYSTEM = """You are a friendly, concise data analyst.
Given a user's question and raw output from Python execution,
write a clear natural-language insight in 2-4 sentences.
- Highlight key numbers or trends.
- Do NOT mention code, pandas, or DataFrames.
- Speak directly to the business insight.
"""
153
+
154
def synthesize_insight(question: str, raw_output: str) -> str:
    """Turn raw execution output into a short plain-English insight via Qwen 2.5."""
    # Truncate long raw output so the prompt stays within context limits.
    user_prompt = f"""Question: {question}

Execution result:
{raw_output[:3000]}

Write the insight:"""
    chat = [
        {"role": "system", "content": SYNTHESIS_SYSTEM},
        {"role": "user", "content": user_prompt},
    ]
    reply = client.chat_completion(
        messages=chat,
        max_tokens=350,
        temperature=0.4,
    )
    return reply.choices[0].message.content.strip()
170
+
171
+
172
# ─────────────────────────────────────────────
# MAIN CHAT HANDLER
# ─────────────────────────────────────────────
def analyze_excel(message: str, history: list, excel_file):
    """
    Full 3-step pipeline:
    user question → code generation → sandboxed execution → insight synthesis
    Supports streaming (yield) for live status updates in ChatInterface.

    Parameters:
        message: the user's natural-language question.
        history: prior chat turns, forwarded to generate_code for context.
        excel_file: the gr.File upload (None when nothing is uploaded yet).

    Yields markdown strings: interim status messages, then one final answer
    (insight + collapsible generated code + raw output) or an error message.
    """
    # Guard: file not uploaded
    if excel_file is None:
        yield "⚠️ Please upload an Excel (.xlsx) file first using the upload box above."
        return

    # Load dataset; any pandas/engine failure becomes a chat error message.
    try:
        df = load_excel(excel_file)
        df_info = get_df_info(df)
    except Exception as e:
        yield f"❌ Failed to read the Excel file: {e}"
        return

    # ── Step 1: Generate Code ─────────────────────────────────────────────
    yield "🔍 Generating pandas code for your question..."
    try:
        code = generate_code(message, df_info, history)
    except Exception as e:
        yield f"❌ Code generation failed: {e}"
        return

    # ── Step 2: Execute Code ──────────────────────────────────────────────
    yield "⚙️ Executing code on your dataset..."
    exec_error = None
    try:
        raw_result = safe_execute(code, df)
        raw_str = format_result(raw_result)
    except PermissionError as pe:
        # Raised by safe_execute's blacklist check.
        exec_error = str(pe)
        raw_str = exec_error
    except Exception as e:
        # Any runtime failure inside the generated code.
        exec_error = f"{type(e).__name__}: {e}"
        raw_str = exec_error

    # ── Step 3: Synthesize Insight ────────────────────────────────────────
    # On execution failure: show the error plus the generated code, skip synthesis.
    if exec_error:
        yield f"""⚠️ **Execution Error**

```
{exec_error}
```

<details>
<summary>🐍 Generated Code (for debugging)</summary>

```python
{code}
```
</details>"""
        return

    yield "💡 Synthesizing insight..."
    try:
        insight = synthesize_insight(message, raw_str)
    except Exception as e:
        # Synthesis failure is non-fatal — fall back to an inline note.
        insight = f"_(Could not generate insight: {e})_"

    # ── Final formatted response ──────────────────────────────────────────
    yield f"""{insight}

---
<details>
<summary>🐍 View Generated Code</summary>

```python
{code}
```
</details>

<details>
<summary>📤 View Raw Output</summary>

```
{raw_str[:2000]}
```
</details>"""
257
+
258
+
259
+ # ─────────────────────────────────────────────
260
+ # GRADIO UI
261
+ # ─────────────────────────────────────────────
262
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
263
  gr.Markdown("# πŸ“Š Technical Assessment: Data Analysis Agent")
264
  gr.Markdown("### Objective: Build a Text-to-Code workflow using Qwen 2.5")
265
+
266
  with gr.Row():
267
  excel_file = gr.File(
268
  label="1. Upload Dataset (.xlsx)",
269
  file_types=[".xlsx"]
270
  )
271
+
272
  gr.ChatInterface(
273
  fn=analyze_excel,
274
  additional_inputs=[excel_file],