File size: 10,052 Bytes
ca8dfad
8e7badb
78f270c
 
 
 
aa28add
ca8dfad
 
78f270c
 
 
 
125ed6a
aa28add
8e7badb
78f270c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e7badb
78f270c
 
 
8e7badb
78f270c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e15f4cd
8e7badb
 
78f270c
8e7badb
 
 
 
 
78f270c
e15f4cd
 
 
 
8e7badb
e15f4cd
ca8dfad
 
8e7badb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
import gradio as gr
import pandas as pd
import traceback
import sys
import io
import re
import os
from huggingface_hub import InferenceClient

# ─────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────
# Model served via the Hugging Face serverless Inference API.
MODEL_ID = "Qwen/Qwen2.5-72B-Instruct"
# NOTE(review): the secret is read under the lowercase key "hf_token" — this
# must match the exact secret name configured in Space Settings → Secrets,
# otherwise the client silently runs unauthenticated (token=None).
HF_TOKEN = os.environ.get("hf_token")  # set this in Space Settings β†’ Secrets
# Single module-level client shared by all requests.
client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)

# ─────────────────────────────────────────────
# STEP 1 β€” LOAD EXCEL
# ─────────────────────────────────────────────
def load_excel(file) -> pd.DataFrame:
    """Load an uploaded .xlsx into a DataFrame.

    Parameters
    ----------
    file : gradio File value; the temp-file path is exposed as ``file.name``.

    Returns
    -------
    pd.DataFrame
        Contents of the first sheet, with header labels stripped of
        surrounding whitespace.
    """
    xl = pd.ExcelFile(file.name)
    # Multi-sheet workbooks: use the first sheet by default.
    df = xl.parse(xl.sheet_names[0])
    # Clean column names. NOTE: `df.columns.str.strip()` would turn any
    # non-string label (numeric or date headers are common in Excel) into
    # NaN, silently destroying the column name — coerce to str first.
    df.columns = [str(col).strip() for col in df.columns]
    return df


def get_df_info(df: pd.DataFrame) -> str:
    """Build a compact dataset description for the LLM prompt.

    Includes column dtypes, shape, the first five rows, and a numeric
    summary. ``df.describe()`` raises ValueError on a DataFrame with no
    columns (e.g. an empty sheet); fall back to a placeholder so a bad
    upload degrades gracefully instead of crashing the pipeline.
    """
    try:
        numeric_summary = df.describe().to_string()
    except ValueError:
        numeric_summary = "(no summarizable columns)"
    return f"""Columns & dtypes:
{df.dtypes.to_string()}

Shape: {df.shape[0]} rows x {df.shape[1]} columns

Sample (first 5 rows):
{df.head(5).to_string(index=False)}

Numeric summary:
{numeric_summary}
"""


# ─────────────────────────────────────────────
# STEP 2 β€” CODE GENERATION via Qwen 2.5
# ─────────────────────────────────────────────
# System prompt for the code-generation call. This string is sent to the
# model verbatim — do not edit it for style; the rules below are what keep
# the model's output executable by safe_execute() (no imports, `result`
# variable, no markdown fences).
CODE_GEN_SYSTEM = """You are an expert Python data analyst. 
Given a dataset description and a user question, generate ONLY executable Python/pandas code.

STRICT RULES:
- The DataFrame is already loaded as variable `df`.
- Only use pandas (pd) and Python built-ins. Do NOT import anything else.
- Store your final answer in a variable called `result`.
- `result` must be a string, number, Series, or DataFrame.
- Do NOT wrap output in markdown code fences.
- Do NOT add explanations or comments β€” code only.
"""

def generate_code(question: str, df_info: str, history: list) -> str:
    """Ask Qwen 2.5 to generate pandas code for the question.

    Parameters
    ----------
    question : the user's current natural-language question.
    df_info : compact dataset description produced by ``get_df_info``.
    history : gradio ``type="messages"`` chat history; the last 6 entries
        (3 Q&A pairs) are replayed for conversational context.

    Returns
    -------
    str
        The model's code with any accidental markdown fences stripped.
    """
    messages = [{"role": "system", "content": CODE_GEN_SYSTEM}]

    # Add prior turns for conversation context (last 3 Q&A pairs).
    # Guard both keys: gradio "messages" history can contain entries with
    # missing keys or non-string content (e.g. file-upload tuples), which
    # would raise KeyError here or be rejected by the chat API.
    for msg in history[-6:]:
        if msg.get("role") in ("user", "assistant") and isinstance(msg.get("content"), str):
            messages.append({"role": msg["role"], "content": msg["content"]})

    messages.append({
        "role": "user",
        "content": f"""Dataset info:
{df_info}

Question: {question}

Write the pandas code now:"""
    })

    response = client.chat_completion(
        messages=messages,
        max_tokens=600,
        temperature=0.1,  # low temperature: we want deterministic, runnable code
    )
    code = response.choices[0].message.content.strip()
    return _strip_code_fences(code)


def _strip_code_fences(code: str) -> str:
    """Remove accidental markdown code fences (```python ... ```) from model output."""
    code = re.sub(r"^```(?:python)?", "", code, flags=re.MULTILINE).strip()
    code = re.sub(r"```$", "", code, flags=re.MULTILINE).strip()
    return code


# ─────────────────────────────────────────────
# STEP 3 β€” SANDBOXED EXECUTION
# ─────────────────────────────────────────────
# Substring patterns that are refused outright before execution. This is a
# coarse, defense-in-depth filter (substring matching can both over-block
# and be bypassed); the restricted namespace below is the main containment.
BLACKLIST = [
    "import os", "import sys", "subprocess", "open(",
    "__import__", "shutil", "socket", "requests",
    "eval(", "exec(", "globals(", "locals(",
]

def safe_execute(code: str, df: pd.DataFrame):
    """Run generated code inside a restricted namespace.

    Returns the value the code bound to ``result``, or the captured stdout
    when ``result`` was never set (falling back to a fixed message when
    there was no output either). Raises PermissionError when the code
    contains a blacklisted pattern; any exception raised by the code
    itself propagates to the caller.
    """
    blocked = next((pat for pat in BLACKLIST if pat in code), None)
    if blocked is not None:
        raise PermissionError(f"Blocked unsafe pattern: `{blocked}`")

    # Whitelist of harmless builtins exposed to the generated code.
    allowed_builtins = {
        "len": len, "range": range, "print": print,
        "str": str, "int": int, "float": float,
        "list": list, "dict": dict, "tuple": tuple,
        "sum": sum, "min": min, "max": max, "round": round,
        "enumerate": enumerate, "zip": zip, "sorted": sorted,
        "isinstance": isinstance, "type": type, "abs": abs,
        "bool": bool, "set": set, "map": map, "filter": filter,
    }
    sandbox = {
        "__builtins__": allowed_builtins,
        "pd": pd,
        "df": df.copy(),  # work on a copy so the caller's frame is untouched
        "result": None,
    }

    # Capture anything the code prints so it can serve as a fallback result.
    captured = io.StringIO()
    saved_stdout = sys.stdout
    sys.stdout = captured
    try:
        exec(code, sandbox)
    finally:
        sys.stdout = saved_stdout  # always restore, even when the code raises

    outcome = sandbox.get("result")
    if outcome is None:
        outcome = captured.getvalue().strip() or "Code ran but produced no output."
    return outcome


def format_result(result) -> str:
    """Render an execution result of any supported type as display text."""
    if isinstance(result, pd.DataFrame):
        if result.empty:
            return "Empty DataFrame returned."
        return result.to_string(index=False)
    if isinstance(result, pd.Series):
        return result.to_string()
    return str(result)


# ─────────────────────────────────────────────
# STEP 4 β€” INSIGHT SYNTHESIS via Qwen 2.5
# ─────────────────────────────────────────────
# System prompt for the second model call: it rewrites raw execution output
# as a short business-facing insight. Sent to the model verbatim.
SYNTHESIS_SYSTEM = """You are a friendly, concise data analyst.
Given a user's question and raw output from Python execution,
write a clear natural-language insight in 2-4 sentences.
- Highlight key numbers or trends.
- Do NOT mention code, pandas, or DataFrames.
- Speak directly to the business insight.
"""

def synthesize_insight(question: str, raw_output: str) -> str:
    """Ask Qwen 2.5 to turn raw execution output into a plain-English insight."""
    # Cap the raw output at 3000 chars so the prompt stays within context limits.
    user_prompt = f"""Question: {question}

Execution result:
{raw_output[:3000]}

Write the insight:"""
    reply = client.chat_completion(
        messages=[
            {"role": "system", "content": SYNTHESIS_SYSTEM},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=350,
        temperature=0.4,
    )
    return reply.choices[0].message.content.strip()


# ─────────────────────────────────────────────
# MAIN CHAT HANDLER
# ─────────────────────────────────────────────
def analyze_excel(message: str, history: list, excel_file):
    """
    Full 3-step pipeline:
      user question β†’ code generation β†’ sandboxed execution β†’ insight synthesis

    Generator handler for gr.ChatInterface: each intermediate `yield`
    replaces the assistant message in place, giving live status updates;
    the final `yield` is the answer shown to the user.

    Parameters:
      message: the user's current question.
      history: prior chat turns (gradio `type="messages"` dicts), forwarded
        to generate_code for conversational context.
      excel_file: the gr.File value wired in via additional_inputs; None
        until the user uploads a file.
    """
    # Guard: file not uploaded
    if excel_file is None:
        yield "⚠️ Please upload an Excel (.xlsx) file first using the upload box above."
        return

    # Load dataset (re-read on every question — keeps the handler stateless)
    try:
        df = load_excel(excel_file)
        df_info = get_df_info(df)
    except Exception as e:
        yield f"❌ Failed to read the Excel file: {e}"
        return

    # ── Step 1: Generate Code ─────────────────────────────────────────────
    yield "πŸ” Generating pandas code for your question..."
    try:
        code = generate_code(message, df_info, history)
    except Exception as e:
        yield f"❌ Code generation failed: {e}"
        return

    # ── Step 2: Execute Code ──────────────────────────────────────────────
    yield "βš™οΈ Executing code on your dataset..."
    # Errors are captured rather than raised so the generated code can be
    # shown to the user for debugging below.
    exec_error = None
    try:
        raw_result = safe_execute(code, df)
        raw_str = format_result(raw_result)
    except PermissionError as pe:
        # Blacklisted pattern in the generated code — report, don't run.
        exec_error = str(pe)
        raw_str = exec_error
    except Exception as e:
        exec_error = f"{type(e).__name__}: {e}"
        raw_str = exec_error

    # ── Step 3: Synthesize Insight ────────────────────────────────────────
    if exec_error:
        # On failure, skip synthesis and show the error plus the code.
        yield f"""⚠️ **Execution Error**

```
{exec_error}
```

<details>
<summary>🐍 Generated Code (for debugging)</summary>

```python
{code}
```
</details>"""
        return

    yield "πŸ’‘ Synthesizing insight..."
    try:
        insight = synthesize_insight(message, raw_str)
    except Exception as e:
        # Insight is nice-to-have; still show code + raw output below.
        insight = f"_(Could not generate insight: {e})_"

    # ── Final formatted response ──────────────────────────────────────────
    yield f"""{insight}

---
<details>
<summary>🐍 View Generated Code</summary>

```python
{code}
```
</details>

<details>
<summary>πŸ“€ View Raw Output</summary>

```
{raw_str[:2000]}
```
</details>"""


# ─────────────────────────────────────────────
# GRADIO UI
# ─────────────────────────────────────────────
# Layout: a file-upload box on top, wired into the chat handler via
# additional_inputs so every question sees the current upload.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# πŸ“Š Technical Assessment: Data Analysis Agent")
    gr.Markdown("### Objective: Build a Text-to-Code workflow using Qwen 2.5")

    with gr.Row():
        excel_file = gr.File(
            label="1. Upload Dataset (.xlsx)",
            file_types=[".xlsx"]
        )

    gr.ChatInterface(
        fn=analyze_excel,
        additional_inputs=[excel_file],
        # "messages" history format: list of {"role", "content"} dicts,
        # which is what generate_code expects.
        type="messages",
        description="2. Ask questions about your data (e.g., 'What is the average profit by region?')",
    )

if __name__ == "__main__":
    demo.launch()