Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,45 +1,274 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import os
|
| 3 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
from huggingface_hub import InferenceClient
|
| 5 |
|
| 6 |
-
#
|
| 7 |
-
#
|
| 8 |
-
#
|
| 9 |
-
|
| 10 |
-
client = InferenceClient(
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
"""
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
3. Develop a robust "Text-to-Code" analytical workflow.
|
| 18 |
-
|
| 19 |
-
Requirements:
|
| 20 |
-
a.Code Generation : Transform natural language user queries into executable, sandboxed Python code (specifically using pandas).
|
| 21 |
-
b.Execution : Securely execute the generated code on the Hugging Face Space server against the uploaded dataset.
|
| 22 |
-
c.Synthesis : Capture the raw output of the code execution and feed it back to the LLM to generate a natural language insight.
|
| 23 |
"""
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 34 |
gr.Markdown("# π Technical Assessment: Data Analysis Agent")
|
| 35 |
gr.Markdown("### Objective: Build a Text-to-Code workflow using Qwen 2.5")
|
| 36 |
-
|
| 37 |
with gr.Row():
|
| 38 |
excel_file = gr.File(
|
| 39 |
label="1. Upload Dataset (.xlsx)",
|
| 40 |
file_types=[".xlsx"]
|
| 41 |
)
|
| 42 |
-
|
| 43 |
gr.ChatInterface(
|
| 44 |
fn=analyze_excel,
|
| 45 |
additional_inputs=[excel_file],
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
+
import traceback
|
| 4 |
+
import sys
|
| 5 |
+
import io
|
| 6 |
+
import re
|
| 7 |
from huggingface_hub import InferenceClient
|
| 8 |
|
| 9 |
+
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 10 |
+
# CONFIG
|
| 11 |
+
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 12 |
+
MODEL_ID = "Qwen/Qwen2.5-72B-Instruct"
|
| 13 |
+
client = InferenceClient(MODEL_ID) # uses HF_TOKEN secret from Space settings
|
| 14 |
|
| 15 |
+
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 16 |
+
# STEP 1 — LOAD EXCEL
|
| 17 |
+
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 18 |
+
def load_excel(file) -> pd.DataFrame:
    """Load the first sheet of an .xlsx upload into a DataFrame.

    Args:
        file: Either an object exposing the on-disk path as ``.name`` or the
            path itself (``str`` / path-like).

    Returns:
        The first worksheet as a DataFrame, with surrounding whitespace
        removed from string column headers. Non-string headers (numbers,
        dates) are left untouched.

    Raises:
        Whatever pandas raises for an unreadable or non-Excel file; the
        caller is expected to report it.
    """
    # Accept both a file wrapper (has .name) and a bare path.
    path = getattr(file, "name", file)

    # The context manager closes the workbook handle even if parsing fails.
    with pd.ExcelFile(path) as workbook:
        # Multi-sheet files: only the first sheet is analysed.
        df = workbook.parse(workbook.sheet_names[0])

    # `.str.strip()` breaks on numeric/date headers, so strip only real strings.
    df.columns = [
        col.strip() if isinstance(col, str) else col for col in df.columns
    ]
    return df
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def get_df_info(df: pd.DataFrame) -> str:
    """Describe *df* as plain text suitable for embedding in an LLM prompt.

    The report has four sections separated by blank lines: column dtypes,
    the row/column counts, the first five rows (index omitted), and the
    output of ``df.describe()``. The text ends with a single newline.
    """
    row_count, column_count = df.shape
    sections = (
        "Columns & dtypes:\n" + df.dtypes.to_string(),
        f"Shape: {row_count} rows x {column_count} columns",
        "Sample (first 5 rows):\n" + df.head(5).to_string(index=False),
        "Numeric summary:\n" + df.describe().to_string(),
    )
    return "\n\n".join(sections) + "\n"
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 43 |
+
# STEP 2 — CODE GENERATION via Qwen 2.5
|
| 44 |
+
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 45 |
+
# System prompt for the code-generation call. The rules mirror what
# safe_execute() provides: a preloaded `df`, pandas as `pd`, and a `result`
# variable that is read back after execution.
CODE_GEN_SYSTEM = """You are an expert Python data analyst.
Given a dataset description and a user question, generate ONLY executable Python/pandas code.

STRICT RULES:
- The DataFrame is already loaded as variable `df`.
- Only use pandas (pd) and Python built-ins. Do NOT import anything else.
- Store your final answer in a variable called `result`.
- `result` must be a string, number, Series, or DataFrame.
- Do NOT wrap output in markdown code fences.
- Do NOT add explanations or comments — code only.
"""

# A complete fenced block: opening fence with optional language tag, body, closing fence.
_FENCED_BLOCK_RE = re.compile(r"```[ \t]*[\w+-]*[ \t]*\r?\n(.*?)```", re.DOTALL)


def _extract_code(text) -> str:
    """Return only the code portion of a model reply.

    If the reply contains one or more complete fenced blocks, their bodies are
    joined and any prose around them is discarded. Otherwise stray or
    unterminated fences are stripped and the rest is returned.
    """
    text = (text or "").strip()  # content may be None on an empty completion
    blocks = [body.strip() for body in _FENCED_BLOCK_RE.findall(text)]
    if blocks:
        return "\n".join(blocks).strip()
    text = re.sub(r"^```(?:python)?", "", text, flags=re.MULTILINE).strip()
    return re.sub(r"```$", "", text, flags=re.MULTILINE).strip()


def _history_to_messages(history, limit: int = 6) -> list:
    """Convert chat history into role/content dicts, keeping the last *limit* messages.

    Accepts both history shapes: a list of ``{"role", "content"}`` dicts or a
    list of ``(user, assistant)`` pairs. Entries with other roles or
    non-string content are skipped.
    """
    flattened = []
    for entry in history or []:
        if isinstance(entry, dict):
            flattened.append((entry.get("role"), entry.get("content")))
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            flattened.append(("user", entry[0]))
            flattened.append(("assistant", entry[1]))
    return [
        {"role": role, "content": content}
        for role, content in flattened[-limit:]
        if role in ("user", "assistant") and isinstance(content, str)
    ]


def generate_code(question: str, df_info: str, history: list) -> str:
    """Ask the chat model for pandas code that answers *question*.

    Args:
        question: The user's natural-language question.
        df_info: Text description of the dataset (see ``get_df_info``).
        history: Prior chat turns; only the most recent six messages
            (three question/answer pairs) are forwarded as context.

    Returns:
        The generated code with markdown fences and surrounding prose removed.

    Raises:
        Any exception raised by the inference client call.
    """
    messages = [{"role": "system", "content": CODE_GEN_SYSTEM}]
    messages.extend(_history_to_messages(history))
    messages.append({
        "role": "user",
        "content": (
            f"Dataset info:\n{df_info}\n\n"
            f"Question: {question}\n\n"
            "Write the pandas code now:"
        ),
    })

    response = client.chat_completion(
        messages=messages,
        max_tokens=600,
        temperature=0.1,  # low temperature: favour deterministic code
    )
    return _extract_code(response.choices[0].message.content)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 90 |
+
# STEP 3 — SANDBOXED EXECUTION
|
| 91 |
+
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 92 |
+
# Substrings that cause generated code to be rejected before it runs.
# NOTE(security): substring screening is a speed bump, not a sandbox. It can
# be bypassed and can produce false positives (e.g. a column literally named
# "requests"). Real isolation needs a separate process or container.
BLACKLIST = [
    "import os", "import sys", "subprocess", "open(",
    "__import__", "shutil", "socket", "requests",
    "eval(", "exec(", "globals(", "locals(",
    # Introspection hooks commonly used to climb out of a restricted namespace.
    "__class__", "__bases__", "__mro__", "__subclasses__",
    "__globals__", "__builtins__",
]


def _restricted_import(name, module_globals=None, module_locals=None,
                       fromlist=(), level=0):
    """Import hook for executed code: only top-level ``pandas`` is permitted."""
    if level == 0 and name == "pandas":
        return pd
    raise ImportError(f"Import of '{name}' is not allowed here.")


def safe_execute(code: str, df: pd.DataFrame):
    """Run generated *code* against a copy of *df* in a restricted namespace.

    The code sees ``pd``, ``df`` (a copy, so the caller's frame is never
    mutated), ``result`` (initially ``None``) and a small set of builtins.
    ``import pandas`` / ``import pandas as pd`` is tolerated because models
    often emit it despite instructions; every other import fails.

    Args:
        code: Python source to execute.
        df: Dataset made available to the code as ``df``.

    Returns:
        The value the code assigned to ``result``. If ``result`` is still
        ``None``, the captured stdout text, or a fixed placeholder message
        when nothing was printed either.

    Raises:
        PermissionError: *code* contains a blacklisted pattern.
        Exception: anything raised by the executed code itself.

    NOTE(security): this executes model-generated code with ``exec``. The
    blacklist and trimmed builtins reduce accidents but do not provide real
    isolation (pandas itself can read and write files).
    """
    for pattern in BLACKLIST:
        if pattern in code:
            raise PermissionError(f"Blocked unsafe pattern: `{pattern}`")

    safe_builtins = {
        "len": len, "range": range, "print": print,
        "str": str, "int": int, "float": float,
        "list": list, "dict": dict, "tuple": tuple,
        "sum": sum, "min": min, "max": max, "round": round,
        "enumerate": enumerate, "zip": zip, "sorted": sorted,
        "isinstance": isinstance, "type": type, "abs": abs,
        "bool": bool, "set": set, "map": map, "filter": filter,
        "any": any, "all": all,
        # Without an import hook every `import` statement dies with
        # "__import__ not found", including a harmless `import pandas as pd`.
        "__import__": _restricted_import,
    }
    namespace = {
        "__builtins__": safe_builtins,
        "pd": pd,
        "df": df.copy(),
        "result": None,
    }

    # Capture anything printed (print(), df.info(), ...) as a fallback answer.
    captured = io.StringIO()
    previous_stdout = sys.stdout
    sys.stdout = captured
    try:
        exec(code, namespace)
    finally:
        sys.stdout = previous_stdout  # always restore, even if the code raised

    result = namespace.get("result")
    if result is None:
        result = captured.getvalue().strip() or "Code ran but produced no output."
    return result
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def format_result(result) -> str:
    """Render an execution result as readable text.

    Args:
        result: A DataFrame, a Series, or any other object.

    Returns:
        - DataFrame: its ``to_string()`` rendering, or a fixed message when
          empty. The index is omitted only when it is an unnamed integer
          index (plain row numbers). Named, string, date or multi-level
          indexes are kept because they carry the group labels of
          groupby / pivot / describe results.
        - Series: ``to_string()`` including its index.
        - Anything else: ``str(result)``.
    """
    if isinstance(result, pd.DataFrame):
        if result.empty:
            return "Empty DataFrame returned."
        index = result.index
        # dtype.kind "i"/"u" = signed/unsigned integers; a MultiIndex reports "O".
        is_row_numbers = index.name is None and index.dtype.kind in ("i", "u")
        return result.to_string(index=not is_row_numbers)
    if isinstance(result, pd.Series):
        return result.to_string()
    return str(result)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 144 |
+
# STEP 4 — INSIGHT SYNTHESIS via Qwen 2.5
|
| 145 |
+
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 146 |
+
# System prompt for the second model call, which turns raw execution output
# into a short plain-language answer.
SYNTHESIS_SYSTEM = (
    "You are a friendly, concise data analyst.\n"
    "Given a user's question and raw output from Python execution,\n"
    "write a clear natural-language insight in 2-4 sentences.\n"
    "- Highlight key numbers or trends.\n"
    "- Do NOT mention code, pandas, or DataFrames.\n"
    "- Speak directly to the business insight.\n"
)


def synthesize_insight(question: str, raw_output: str) -> str:
    """Turn raw execution output into a short natural-language insight.

    Args:
        question: The user's original question.
        raw_output: Text rendering of the execution result; only the first
            3000 characters are sent to the model.

    Returns:
        The model's reply with surrounding whitespace removed.
    """
    clipped_output = raw_output[:3000]
    user_prompt = (
        f"Question: {question}\n"
        "\n"
        "Execution result:\n"
        f"{clipped_output}\n"
        "\n"
        "Write the insight:"
    )
    conversation = [
        {"role": "system", "content": SYNTHESIS_SYSTEM},
        {"role": "user", "content": user_prompt},
    ]
    reply = client.chat_completion(
        messages=conversation,
        max_tokens=350,
        temperature=0.4,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 173 |
+
# MAIN CHAT HANDLER
|
| 174 |
+
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 175 |
+
def _collapsible(summary: str, body: str, lang: str = "") -> str:
    """Wrap *body* in a fenced block inside a collapsible <details> section."""
    return (
        f"<details>\n<summary>{summary}</summary>\n\n"
        f"```{lang}\n{body}\n```\n</details>"
    )


def analyze_excel(message: str, history: list, excel_file):
    """Chat handler: answer *message* about the uploaded workbook.

    Pipeline: load dataset -> generate pandas code -> execute it ->
    synthesize a natural-language insight. This is a generator; it yields
    short status strings while working and finally the formatted answer
    (or an error report), giving live status updates in ChatInterface.

    Args:
        message: The user's question.
        history: Prior chat turns, forwarded to code generation for context.
        excel_file: The uploaded file, or ``None`` if nothing was uploaded.

    Yields:
        Markdown strings: progress notes, then the final response.
    """
    # Guard: nothing to analyse without an upload.
    if excel_file is None:
        yield "⚠️ Please upload an Excel (.xlsx) file first using the upload box above."
        return

    # Load the dataset and build its description for the prompt.
    try:
        df = load_excel(excel_file)
        df_info = get_df_info(df)
    except Exception as e:
        yield f"❌ Failed to read the Excel file: {e}"
        return

    # Step 1: generate code.
    yield "🔍 Generating pandas code for your question..."
    try:
        code = generate_code(message, df_info, history)
    except Exception as e:
        yield f"❌ Code generation failed: {e}"
        return

    # Step 2: execute code.
    yield "⚙️ Executing code on your dataset..."
    try:
        raw_str = format_result(safe_execute(code, df))
    except PermissionError as err:
        # Blocked by the pattern screen: the message already names the pattern.
        exec_error = str(err)
    except Exception as err:
        exec_error = f"{type(err).__name__}: {err}"
    else:
        exec_error = None

    # Compare against None so an error with an empty message is still reported.
    if exec_error is not None:
        yield (
            "⚠️ **Execution Error**\n\n"
            f"```\n{exec_error}\n```\n\n"
            + _collapsible("🐞 Generated Code (for debugging)", code, "python")
        )
        return

    # Step 3: synthesize insight. A failure here is non-fatal: the code and
    # raw output are still shown with a note in place of the insight.
    yield "💡 Synthesizing insight..."
    try:
        insight = synthesize_insight(message, raw_str)
    except Exception as e:
        insight = f"_(Could not generate insight: {e})_"

    # Final response: insight first, supporting detail folded away.
    yield (
        f"{insight}\n\n---\n"
        + _collapsible("🐍 View Generated Code", code, "python")
        + "\n\n"
        + _collapsible("📤 View Raw Output", raw_str[:2000])
    )
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 260 |
+
# GRADIO UI
|
| 261 |
+
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 262 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 263 |
gr.Markdown("# π Technical Assessment: Data Analysis Agent")
|
| 264 |
gr.Markdown("### Objective: Build a Text-to-Code workflow using Qwen 2.5")
|
| 265 |
+
|
| 266 |
with gr.Row():
|
| 267 |
excel_file = gr.File(
|
| 268 |
label="1. Upload Dataset (.xlsx)",
|
| 269 |
file_types=[".xlsx"]
|
| 270 |
)
|
| 271 |
+
|
| 272 |
gr.ChatInterface(
|
| 273 |
fn=analyze_excel,
|
| 274 |
additional_inputs=[excel_file],
|