DOMMETI's picture
Update app.py
125ed6a verified
import gradio as gr
import pandas as pd
import traceback
import sys
import io
import re
import os
from huggingface_hub import InferenceClient
# ─────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────
MODEL_ID = "Qwen/Qwen2.5-72B-Instruct"
HF_TOKEN = os.environ.get("hf_token") # set this in Space Settings β†’ Secrets
client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)
# ─────────────────────────────────────────────
# STEP 1 β€” LOAD EXCEL
# ─────────────────────────────────────────────
def load_excel(file) -> pd.DataFrame:
"""Load xlsx into a DataFrame, handling multi-sheet files."""
xl = pd.ExcelFile(file.name)
# Use first sheet by default
df = xl.parse(xl.sheet_names[0])
df.columns = df.columns.str.strip() # clean column names
return df
def get_df_info(df: pd.DataFrame) -> str:
"""Build a compact dataset description for the LLM prompt."""
return f"""Columns & dtypes:
{df.dtypes.to_string()}
Shape: {df.shape[0]} rows x {df.shape[1]} columns
Sample (first 5 rows):
{df.head(5).to_string(index=False)}
Numeric summary:
{df.describe().to_string()}
"""
# ─────────────────────────────────────────────
# STEP 2 β€” CODE GENERATION via Qwen 2.5
# ─────────────────────────────────────────────
CODE_GEN_SYSTEM = """You are an expert Python data analyst.
Given a dataset description and a user question, generate ONLY executable Python/pandas code.
STRICT RULES:
- The DataFrame is already loaded as variable `df`.
- Only use pandas (pd) and Python built-ins. Do NOT import anything else.
- Store your final answer in a variable called `result`.
- `result` must be a string, number, Series, or DataFrame.
- Do NOT wrap output in markdown code fences.
- Do NOT add explanations or comments β€” code only.
"""
def generate_code(question: str, df_info: str, history: list) -> str:
"""Ask Qwen 2.5 to generate pandas code for the question."""
messages = [{"role": "system", "content": CODE_GEN_SYSTEM}]
# Add prior turns for conversation context (last 3 Q&A pairs)
for msg in history[-6:]:
if msg["role"] in ("user", "assistant"):
messages.append({"role": msg["role"], "content": msg["content"]})
messages.append({
"role": "user",
"content": f"""Dataset info:
{df_info}
Question: {question}
Write the pandas code now:"""
})
response = client.chat_completion(
messages=messages,
max_tokens=600,
temperature=0.1,
)
code = response.choices[0].message.content.strip()
# Strip accidental markdown fences
code = re.sub(r"^```(?:python)?", "", code, flags=re.MULTILINE).strip()
code = re.sub(r"```$", "", code, flags=re.MULTILINE).strip()
return code
# ─────────────────────────────────────────────
# STEP 3 β€” SANDBOXED EXECUTION
# ─────────────────────────────────────────────
BLACKLIST = [
"import os", "import sys", "subprocess", "open(",
"__import__", "shutil", "socket", "requests",
"eval(", "exec(", "globals(", "locals(",
]
def safe_execute(code: str, df: pd.DataFrame):
"""Execute code in a restricted namespace. Returns result or raises."""
for pattern in BLACKLIST:
if pattern in code:
raise PermissionError(f"Blocked unsafe pattern: `{pattern}`")
safe_builtins = {
"len": len, "range": range, "print": print,
"str": str, "int": int, "float": float,
"list": list, "dict": dict, "tuple": tuple,
"sum": sum, "min": min, "max": max, "round": round,
"enumerate": enumerate, "zip": zip, "sorted": sorted,
"isinstance": isinstance, "type": type, "abs": abs,
"bool": bool, "set": set, "map": map, "filter": filter,
}
namespace = {
"__builtins__": safe_builtins,
"pd": pd,
"df": df.copy(),
"result": None,
}
old_stdout = sys.stdout
sys.stdout = buf = io.StringIO()
try:
exec(code, namespace)
finally:
sys.stdout = old_stdout
result = namespace.get("result")
if result is None:
result = buf.getvalue().strip() or "Code ran but produced no output."
return result
def format_result(result) -> str:
"""Convert any result type to a readable string."""
if isinstance(result, pd.DataFrame):
return result.to_string(index=False) if not result.empty else "Empty DataFrame returned."
elif isinstance(result, pd.Series):
return result.to_string()
else:
return str(result)
# ─────────────────────────────────────────────
# STEP 4 β€” INSIGHT SYNTHESIS via Qwen 2.5
# ─────────────────────────────────────────────
SYNTHESIS_SYSTEM = """You are a friendly, concise data analyst.
Given a user's question and raw output from Python execution,
write a clear natural-language insight in 2-4 sentences.
- Highlight key numbers or trends.
- Do NOT mention code, pandas, or DataFrames.
- Speak directly to the business insight.
"""
def synthesize_insight(question: str, raw_output: str) -> str:
"""Ask Qwen 2.5 to turn raw output into a plain-English insight."""
response = client.chat_completion(
messages=[
{"role": "system", "content": SYNTHESIS_SYSTEM},
{"role": "user", "content": f"""Question: {question}
Execution result:
{raw_output[:3000]}
Write the insight:"""},
],
max_tokens=350,
temperature=0.4,
)
return response.choices[0].message.content.strip()
# ─────────────────────────────────────────────
# MAIN CHAT HANDLER
# ─────────────────────────────────────────────
def analyze_excel(message: str, history: list, excel_file):
"""
Full 3-step pipeline:
user question β†’ code generation β†’ sandboxed execution β†’ insight synthesis
Supports streaming (yield) for live status updates in ChatInterface.
"""
# Guard: file not uploaded
if excel_file is None:
yield "⚠️ Please upload an Excel (.xlsx) file first using the upload box above."
return
# Load dataset
try:
df = load_excel(excel_file)
df_info = get_df_info(df)
except Exception as e:
yield f"❌ Failed to read the Excel file: {e}"
return
# ── Step 1: Generate Code ─────────────────────────────────────────────
yield "πŸ” Generating pandas code for your question..."
try:
code = generate_code(message, df_info, history)
except Exception as e:
yield f"❌ Code generation failed: {e}"
return
# ── Step 2: Execute Code ──────────────────────────────────────────────
yield "βš™οΈ Executing code on your dataset..."
exec_error = None
try:
raw_result = safe_execute(code, df)
raw_str = format_result(raw_result)
except PermissionError as pe:
exec_error = str(pe)
raw_str = exec_error
except Exception as e:
exec_error = f"{type(e).__name__}: {e}"
raw_str = exec_error
# ── Step 3: Synthesize Insight ────────────────────────────────────────
if exec_error:
yield f"""⚠️ **Execution Error**
```
{exec_error}
```
<details>
<summary>🐍 Generated Code (for debugging)</summary>
```python
{code}
```
</details>"""
return
yield "πŸ’‘ Synthesizing insight..."
try:
insight = synthesize_insight(message, raw_str)
except Exception as e:
insight = f"_(Could not generate insight: {e})_"
# ── Final formatted response ──────────────────────────────────────────
yield f"""{insight}
---
<details>
<summary>🐍 View Generated Code</summary>
```python
{code}
```
</details>
<details>
<summary>πŸ“€ View Raw Output</summary>
```
{raw_str[:2000]}
```
</details>"""
# ─────────────────────────────────────────────
# GRADIO UI
# ─────────────────────────────────────────────
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown("# πŸ“Š Technical Assessment: Data Analysis Agent")
gr.Markdown("### Objective: Build a Text-to-Code workflow using Qwen 2.5")
with gr.Row():
excel_file = gr.File(
label="1. Upload Dataset (.xlsx)",
file_types=[".xlsx"]
)
gr.ChatInterface(
fn=analyze_excel,
additional_inputs=[excel_file],
type="messages",
description="2. Ask questions about your data (e.g., 'What is the average profit by region?')",
)
if __name__ == "__main__":
demo.launch()