Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import traceback | |
| import sys | |
| import io | |
| import re | |
| import os | |
| from huggingface_hub import InferenceClient | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # CONFIG | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| MODEL_ID = "Qwen/Qwen2.5-72B-Instruct" | |
| HF_TOKEN = os.environ.get("hf_token") # set this in Space Settings β Secrets | |
| client = InferenceClient(model=MODEL_ID, token=HF_TOKEN) | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # STEP 1 β LOAD EXCEL | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| def load_excel(file) -> pd.DataFrame: | |
| """Load xlsx into a DataFrame, handling multi-sheet files.""" | |
| xl = pd.ExcelFile(file.name) | |
| # Use first sheet by default | |
| df = xl.parse(xl.sheet_names[0]) | |
| df.columns = df.columns.str.strip() # clean column names | |
| return df | |
| def get_df_info(df: pd.DataFrame) -> str: | |
| """Build a compact dataset description for the LLM prompt.""" | |
| return f"""Columns & dtypes: | |
| {df.dtypes.to_string()} | |
| Shape: {df.shape[0]} rows x {df.shape[1]} columns | |
| Sample (first 5 rows): | |
| {df.head(5).to_string(index=False)} | |
| Numeric summary: | |
| {df.describe().to_string()} | |
| """ | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # STEP 2 β CODE GENERATION via Qwen 2.5 | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| CODE_GEN_SYSTEM = """You are an expert Python data analyst. | |
| Given a dataset description and a user question, generate ONLY executable Python/pandas code. | |
| STRICT RULES: | |
| - The DataFrame is already loaded as variable `df`. | |
| - Only use pandas (pd) and Python built-ins. Do NOT import anything else. | |
| - Store your final answer in a variable called `result`. | |
| - `result` must be a string, number, Series, or DataFrame. | |
| - Do NOT wrap output in markdown code fences. | |
| - Do NOT add explanations or comments β code only. | |
| """ | |
| def generate_code(question: str, df_info: str, history: list) -> str: | |
| """Ask Qwen 2.5 to generate pandas code for the question.""" | |
| messages = [{"role": "system", "content": CODE_GEN_SYSTEM}] | |
| # Add prior turns for conversation context (last 3 Q&A pairs) | |
| for msg in history[-6:]: | |
| if msg["role"] in ("user", "assistant"): | |
| messages.append({"role": msg["role"], "content": msg["content"]}) | |
| messages.append({ | |
| "role": "user", | |
| "content": f"""Dataset info: | |
| {df_info} | |
| Question: {question} | |
| Write the pandas code now:""" | |
| }) | |
| response = client.chat_completion( | |
| messages=messages, | |
| max_tokens=600, | |
| temperature=0.1, | |
| ) | |
| code = response.choices[0].message.content.strip() | |
| # Strip accidental markdown fences | |
| code = re.sub(r"^```(?:python)?", "", code, flags=re.MULTILINE).strip() | |
| code = re.sub(r"```$", "", code, flags=re.MULTILINE).strip() | |
| return code | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # STEP 3 β SANDBOXED EXECUTION | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| BLACKLIST = [ | |
| "import os", "import sys", "subprocess", "open(", | |
| "__import__", "shutil", "socket", "requests", | |
| "eval(", "exec(", "globals(", "locals(", | |
| ] | |
| def safe_execute(code: str, df: pd.DataFrame): | |
| """Execute code in a restricted namespace. Returns result or raises.""" | |
| for pattern in BLACKLIST: | |
| if pattern in code: | |
| raise PermissionError(f"Blocked unsafe pattern: `{pattern}`") | |
| safe_builtins = { | |
| "len": len, "range": range, "print": print, | |
| "str": str, "int": int, "float": float, | |
| "list": list, "dict": dict, "tuple": tuple, | |
| "sum": sum, "min": min, "max": max, "round": round, | |
| "enumerate": enumerate, "zip": zip, "sorted": sorted, | |
| "isinstance": isinstance, "type": type, "abs": abs, | |
| "bool": bool, "set": set, "map": map, "filter": filter, | |
| } | |
| namespace = { | |
| "__builtins__": safe_builtins, | |
| "pd": pd, | |
| "df": df.copy(), | |
| "result": None, | |
| } | |
| old_stdout = sys.stdout | |
| sys.stdout = buf = io.StringIO() | |
| try: | |
| exec(code, namespace) | |
| finally: | |
| sys.stdout = old_stdout | |
| result = namespace.get("result") | |
| if result is None: | |
| result = buf.getvalue().strip() or "Code ran but produced no output." | |
| return result | |
| def format_result(result) -> str: | |
| """Convert any result type to a readable string.""" | |
| if isinstance(result, pd.DataFrame): | |
| return result.to_string(index=False) if not result.empty else "Empty DataFrame returned." | |
| elif isinstance(result, pd.Series): | |
| return result.to_string() | |
| else: | |
| return str(result) | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # STEP 4 β INSIGHT SYNTHESIS via Qwen 2.5 | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| SYNTHESIS_SYSTEM = """You are a friendly, concise data analyst. | |
| Given a user's question and raw output from Python execution, | |
| write a clear natural-language insight in 2-4 sentences. | |
| - Highlight key numbers or trends. | |
| - Do NOT mention code, pandas, or DataFrames. | |
| - Speak directly to the business insight. | |
| """ | |
| def synthesize_insight(question: str, raw_output: str) -> str: | |
| """Ask Qwen 2.5 to turn raw output into a plain-English insight.""" | |
| response = client.chat_completion( | |
| messages=[ | |
| {"role": "system", "content": SYNTHESIS_SYSTEM}, | |
| {"role": "user", "content": f"""Question: {question} | |
| Execution result: | |
| {raw_output[:3000]} | |
| Write the insight:"""}, | |
| ], | |
| max_tokens=350, | |
| temperature=0.4, | |
| ) | |
| return response.choices[0].message.content.strip() | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # MAIN CHAT HANDLER | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| def analyze_excel(message: str, history: list, excel_file): | |
| """ | |
| Full 3-step pipeline: | |
| user question β code generation β sandboxed execution β insight synthesis | |
| Supports streaming (yield) for live status updates in ChatInterface. | |
| """ | |
| # Guard: file not uploaded | |
| if excel_file is None: | |
| yield "β οΈ Please upload an Excel (.xlsx) file first using the upload box above." | |
| return | |
| # Load dataset | |
| try: | |
| df = load_excel(excel_file) | |
| df_info = get_df_info(df) | |
| except Exception as e: | |
| yield f"β Failed to read the Excel file: {e}" | |
| return | |
| # ββ Step 1: Generate Code βββββββββββββββββββββββββββββββββββββββββββββ | |
| yield "π Generating pandas code for your question..." | |
| try: | |
| code = generate_code(message, df_info, history) | |
| except Exception as e: | |
| yield f"β Code generation failed: {e}" | |
| return | |
| # ββ Step 2: Execute Code ββββββββββββββββββββββββββββββββββββββββββββββ | |
| yield "βοΈ Executing code on your dataset..." | |
| exec_error = None | |
| try: | |
| raw_result = safe_execute(code, df) | |
| raw_str = format_result(raw_result) | |
| except PermissionError as pe: | |
| exec_error = str(pe) | |
| raw_str = exec_error | |
| except Exception as e: | |
| exec_error = f"{type(e).__name__}: {e}" | |
| raw_str = exec_error | |
| # ββ Step 3: Synthesize Insight ββββββββββββββββββββββββββββββββββββββββ | |
| if exec_error: | |
| yield f"""β οΈ **Execution Error** | |
| ``` | |
| {exec_error} | |
| ``` | |
| <details> | |
| <summary>π Generated Code (for debugging)</summary> | |
| ```python | |
| {code} | |
| ``` | |
| </details>""" | |
| return | |
| yield "π‘ Synthesizing insight..." | |
| try: | |
| insight = synthesize_insight(message, raw_str) | |
| except Exception as e: | |
| insight = f"_(Could not generate insight: {e})_" | |
| # ββ Final formatted response ββββββββββββββββββββββββββββββββββββββββββ | |
| yield f"""{insight} | |
| --- | |
| <details> | |
| <summary>π View Generated Code</summary> | |
| ```python | |
| {code} | |
| ``` | |
| </details> | |
| <details> | |
| <summary>π€ View Raw Output</summary> | |
| ``` | |
| {raw_str[:2000]} | |
| ``` | |
| </details>""" | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # GRADIO UI | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| with gr.Blocks(theme=gr.themes.Soft()) as demo: | |
| gr.Markdown("# π Technical Assessment: Data Analysis Agent") | |
| gr.Markdown("### Objective: Build a Text-to-Code workflow using Qwen 2.5") | |
| with gr.Row(): | |
| excel_file = gr.File( | |
| label="1. Upload Dataset (.xlsx)", | |
| file_types=[".xlsx"] | |
| ) | |
| gr.ChatInterface( | |
| fn=analyze_excel, | |
| additional_inputs=[excel_file], | |
| type="messages", | |
| description="2. Ask questions about your data (e.g., 'What is the average profit by region?')", | |
| ) | |
| if __name__ == "__main__": | |
| demo.launch() |