import gradio as gr import pandas as pd import traceback import sys import io import re import os from huggingface_hub import InferenceClient # ───────────────────────────────────────────── # CONFIG # ───────────────────────────────────────────── MODEL_ID = "Qwen/Qwen2.5-72B-Instruct" HF_TOKEN = os.environ.get("hf_token") # set this in Space Settings → Secrets client = InferenceClient(model=MODEL_ID, token=HF_TOKEN) # ───────────────────────────────────────────── # STEP 1 — LOAD EXCEL # ───────────────────────────────────────────── def load_excel(file) -> pd.DataFrame: """Load xlsx into a DataFrame, handling multi-sheet files.""" xl = pd.ExcelFile(file.name) # Use first sheet by default df = xl.parse(xl.sheet_names[0]) df.columns = df.columns.str.strip() # clean column names return df def get_df_info(df: pd.DataFrame) -> str: """Build a compact dataset description for the LLM prompt.""" return f"""Columns & dtypes: {df.dtypes.to_string()} Shape: {df.shape[0]} rows x {df.shape[1]} columns Sample (first 5 rows): {df.head(5).to_string(index=False)} Numeric summary: {df.describe().to_string()} """ # ───────────────────────────────────────────── # STEP 2 — CODE GENERATION via Qwen 2.5 # ───────────────────────────────────────────── CODE_GEN_SYSTEM = """You are an expert Python data analyst. Given a dataset description and a user question, generate ONLY executable Python/pandas code. STRICT RULES: - The DataFrame is already loaded as variable `df`. - Only use pandas (pd) and Python built-ins. Do NOT import anything else. - Store your final answer in a variable called `result`. - `result` must be a string, number, Series, or DataFrame. - Do NOT wrap output in markdown code fences. - Do NOT add explanations or comments — code only. """ def generate_code(question: str, df_info: str, history: list) -> str: """Ask Qwen 2.5 to generate pandas code for the question.""" messages = [{"role": "system", "content": CODE_GEN_SYSTEM}] # Add prior turns for conversation context (last 3 Q&A pairs) for msg in history[-6:]: if msg["role"] in ("user", "assistant"): messages.append({"role": msg["role"], "content": msg["content"]}) messages.append({ "role": "user", "content": f"""Dataset info: {df_info} Question: {question} Write the pandas code now:""" }) response = client.chat_completion( messages=messages, max_tokens=600, temperature=0.1, ) code = response.choices[0].message.content.strip() # Strip accidental markdown fences code = re.sub(r"^```(?:python)?", "", code, flags=re.MULTILINE).strip() code = re.sub(r"```$", "", code, flags=re.MULTILINE).strip() return code # ───────────────────────────────────────────── # STEP 3 — SANDBOXED EXECUTION # ───────────────────────────────────────────── BLACKLIST = [ "import os", "import sys", "subprocess", "open(", "__import__", "shutil", "socket", "requests", "eval(", "exec(", "globals(", "locals(", ] def safe_execute(code: str, df: pd.DataFrame): """Execute code in a restricted namespace. Returns result or raises.""" for pattern in BLACKLIST: if pattern in code: raise PermissionError(f"Blocked unsafe pattern: `{pattern}`") safe_builtins = { "len": len, "range": range, "print": print, "str": str, "int": int, "float": float, "list": list, "dict": dict, "tuple": tuple, "sum": sum, "min": min, "max": max, "round": round, "enumerate": enumerate, "zip": zip, "sorted": sorted, "isinstance": isinstance, "type": type, "abs": abs, "bool": bool, "set": set, "map": map, "filter": filter, } namespace = { "__builtins__": safe_builtins, "pd": pd, "df": df.copy(), "result": None, } old_stdout = sys.stdout sys.stdout = buf = io.StringIO() try: exec(code, namespace) finally: sys.stdout = old_stdout result = namespace.get("result") if result is None: result = buf.getvalue().strip() or "Code ran but produced no output." return result def format_result(result) -> str: """Convert any result type to a readable string.""" if isinstance(result, pd.DataFrame): return result.to_string(index=False) if not result.empty else "Empty DataFrame returned." elif isinstance(result, pd.Series): return result.to_string() else: return str(result) # ───────────────────────────────────────────── # STEP 4 — INSIGHT SYNTHESIS via Qwen 2.5 # ───────────────────────────────────────────── SYNTHESIS_SYSTEM = """You are a friendly, concise data analyst. Given a user's question and raw output from Python execution, write a clear natural-language insight in 2-4 sentences. - Highlight key numbers or trends. - Do NOT mention code, pandas, or DataFrames. - Speak directly to the business insight. """ def synthesize_insight(question: str, raw_output: str) -> str: """Ask Qwen 2.5 to turn raw output into a plain-English insight.""" response = client.chat_completion( messages=[ {"role": "system", "content": SYNTHESIS_SYSTEM}, {"role": "user", "content": f"""Question: {question} Execution result: {raw_output[:3000]} Write the insight:"""}, ], max_tokens=350, temperature=0.4, ) return response.choices[0].message.content.strip() # ───────────────────────────────────────────── # MAIN CHAT HANDLER # ───────────────────────────────────────────── def analyze_excel(message: str, history: list, excel_file): """ Full 3-step pipeline: user question → code generation → sandboxed execution → insight synthesis Supports streaming (yield) for live status updates in ChatInterface. """ # Guard: file not uploaded if excel_file is None: yield "⚠️ Please upload an Excel (.xlsx) file first using the upload box above." return # Load dataset try: df = load_excel(excel_file) df_info = get_df_info(df) except Exception as e: yield f"❌ Failed to read the Excel file: {e}" return # ── Step 1: Generate Code ───────────────────────────────────────────── yield "🔍 Generating pandas code for your question..." try: code = generate_code(message, df_info, history) except Exception as e: yield f"❌ Code generation failed: {e}" return # ── Step 2: Execute Code ────────────────────────────────────────────── yield "⚙️ Executing code on your dataset..." exec_error = None try: raw_result = safe_execute(code, df) raw_str = format_result(raw_result) except PermissionError as pe: exec_error = str(pe) raw_str = exec_error except Exception as e: exec_error = f"{type(e).__name__}: {e}" raw_str = exec_error # ── Step 3: Synthesize Insight ──────────────────────────────────────── if exec_error: yield f"""⚠️ **Execution Error** ``` {exec_error} ```
🐍 Generated Code (for debugging) ```python {code} ```
""" return yield "💡 Synthesizing insight..." try: insight = synthesize_insight(message, raw_str) except Exception as e: insight = f"_(Could not generate insight: {e})_" # ── Final formatted response ────────────────────────────────────────── yield f"""{insight} ---
🐍 View Generated Code ```python {code} ```
📤 View Raw Output ``` {raw_str[:2000]} ```
""" # ───────────────────────────────────────────── # GRADIO UI # ───────────────────────────────────────────── with gr.Blocks(theme=gr.themes.Soft()) as demo: gr.Markdown("# 📊 Technical Assessment: Data Analysis Agent") gr.Markdown("### Objective: Build a Text-to-Code workflow using Qwen 2.5") with gr.Row(): excel_file = gr.File( label="1. Upload Dataset (.xlsx)", file_types=[".xlsx"] ) gr.ChatInterface( fn=analyze_excel, additional_inputs=[excel_file], type="messages", description="2. Ask questions about your data (e.g., 'What is the average profit by region?')", ) if __name__ == "__main__": demo.launch()