Interview_template

Sleeping

App Files Files Community

DOMMETI commited on 28 days ago

Commit

78f270c

verified ·

1 Parent(s): 8e7badb

Update app.py

Browse files

Files changed (1) hide show

app.py +256 -27

app.py CHANGED Viewed

@@ -1,45 +1,274 @@
 import gradio as gr
-import os
 import pandas as pd
 from huggingface_hub import InferenceClient
-# ===============================
-# LLM CLIENT SETUP
-# ===============================
-HF_TOKEN = os.getenv("HF")
-client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
-def analyze_excel(message, history, file):
     """
-    Problem Statement:
-    1. Add your own HF token in the settings to get the LLM working.
-    2. Update requirements.txt, app.py as needed.
-    3. Develop a robust "Text-to-Code" analytical workflow.
-    Requirements:
-    a.Code Generation : Transform natural language user queries into executable, sandboxed Python code (specifically using pandas).
-    b.Execution : Securely execute the generated code on the Hugging Face Space server against the uploaded dataset.
-    c.Synthesis : Capture the raw output of the code execution and feed it back to the LLM to generate a natural language insight.
     """
-    if file is None:
-        return "Please upload an Excel file to begin."
-    # The function needs a return here to avoid a NoneType error in Gradio
-    return "File received! Candidate: Implement the Planner-Action-Synthesis logic here."
-# ===============================
-# UI CONFIGURATION
-# ===============================
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 📊 Technical Assessment: Data Analysis Agent")
     gr.Markdown("### Objective: Build a Text-to-Code workflow using Qwen 2.5")
     with gr.Row():
         excel_file = gr.File(
             label="1. Upload Dataset (.xlsx)",
             file_types=[".xlsx"]
         )
     gr.ChatInterface(
         fn=analyze_excel,
         additional_inputs=[excel_file],

 import gradio as gr
 import pandas as pd
+import traceback
+import sys
+import io
+import re
 from huggingface_hub import InferenceClient
+# ─────────────────────────────────────────────
+# CONFIG
+# ─────────────────────────────────────────────
+MODEL_ID = "Qwen/Qwen2.5-72B-Instruct"
+client = InferenceClient(MODEL_ID)  # uses HF_TOKEN secret from Space settings
+# ─────────────────────────────────────────────
+# STEP 1 — LOAD EXCEL
+# ─────────────────────────────────────────────
+def load_excel(file) -> pd.DataFrame:
+    """Load xlsx into a DataFrame, handling multi-sheet files."""
+    xl = pd.ExcelFile(file.name)
+    # Use first sheet by default
+    df = xl.parse(xl.sheet_names[0])
+    df.columns = df.columns.str.strip()  # clean column names
+    return df
+def get_df_info(df: pd.DataFrame) -> str:
+    """Build a compact dataset description for the LLM prompt."""
+    return f"""Columns & dtypes:
+{df.dtypes.to_string()}
+Shape: {df.shape[0]} rows x {df.shape[1]} columns
+Sample (first 5 rows):
+{df.head(5).to_string(index=False)}
+Numeric summary:
+{df.describe().to_string()}
+"""
+# ─────────────────────────────────────────────
+# STEP 2 — CODE GENERATION via Qwen 2.5
+# ─────────────────────────────────────────────
+CODE_GEN_SYSTEM = """You are an expert Python data analyst.
+Given a dataset description and a user question, generate ONLY executable Python/pandas code.
+STRICT RULES:
+- The DataFrame is already loaded as variable `df`.
+- Only use pandas (pd) and Python built-ins. Do NOT import anything else.
+- Store your final answer in a variable called `result`.
+- `result` must be a string, number, Series, or DataFrame.
+- Do NOT wrap output in markdown code fences.
+- Do NOT add explanations or comments — code only.
+"""
+def generate_code(question: str, df_info: str, history: list) -> str:
+    """Ask Qwen 2.5 to generate pandas code for the question."""
+    messages = [{"role": "system", "content": CODE_GEN_SYSTEM}]
+    # Add prior turns for conversation context (last 3 Q&A pairs)
+    for msg in history[-6:]:
+        if msg["role"] in ("user", "assistant"):
+            messages.append({"role": msg["role"], "content": msg["content"]})
+    messages.append({
+        "role": "user",
+        "content": f"""Dataset info:
+{df_info}
+Question: {question}
+Write the pandas code now:"""
+    })
+    response = client.chat_completion(
+        messages=messages,
+        max_tokens=600,
+        temperature=0.1,
+    )
+    code = response.choices[0].message.content.strip()
+    # Strip accidental markdown fences
+    code = re.sub(r"^```(?:python)?", "", code, flags=re.MULTILINE).strip()
+    code = re.sub(r"```$", "", code, flags=re.MULTILINE).strip()
+    return code
+# ─────────────────────────────────────────────
+# STEP 3 — SANDBOXED EXECUTION
+# ─────────────────────────────────────────────
+BLACKLIST = [
+    "import os", "import sys", "subprocess", "open(",
+    "__import__", "shutil", "socket", "requests",
+    "eval(", "exec(", "globals(", "locals(",
+]
+def safe_execute(code: str, df: pd.DataFrame):
+    """Execute code in a restricted namespace. Returns result or raises."""
+    for pattern in BLACKLIST:
+        if pattern in code:
+            raise PermissionError(f"Blocked unsafe pattern: `{pattern}`")
+    safe_builtins = {
+        "len": len, "range": range, "print": print,
+        "str": str, "int": int, "float": float,
+        "list": list, "dict": dict, "tuple": tuple,
+        "sum": sum, "min": min, "max": max, "round": round,
+        "enumerate": enumerate, "zip": zip, "sorted": sorted,
+        "isinstance": isinstance, "type": type, "abs": abs,
+        "bool": bool, "set": set, "map": map, "filter": filter,
+    }
+    namespace = {
+        "__builtins__": safe_builtins,
+        "pd": pd,
+        "df": df.copy(),
+        "result": None,
+    }
+    old_stdout = sys.stdout
+    sys.stdout = buf = io.StringIO()
+    try:
+        exec(code, namespace)
+    finally:
+        sys.stdout = old_stdout
+    result = namespace.get("result")
+    if result is None:
+        result = buf.getvalue().strip() or "Code ran but produced no output."
+    return result
+def format_result(result) -> str:
+    """Convert any result type to a readable string."""
+    if isinstance(result, pd.DataFrame):
+        return result.to_string(index=False) if not result.empty else "Empty DataFrame returned."
+    elif isinstance(result, pd.Series):
+        return result.to_string()
+    else:
+        return str(result)
+# ─────────────────────────────────────────────
+# STEP 4 — INSIGHT SYNTHESIS via Qwen 2.5
+# ─────────────────────────────────────────────
+SYNTHESIS_SYSTEM = """You are a friendly, concise data analyst.
+Given a user's question and raw output from Python execution,
+write a clear natural-language insight in 2-4 sentences.
+- Highlight key numbers or trends.
+- Do NOT mention code, pandas, or DataFrames.
+- Speak directly to the business insight.
+"""
+def synthesize_insight(question: str, raw_output: str) -> str:
+    """Ask Qwen 2.5 to turn raw output into a plain-English insight."""
+    response = client.chat_completion(
+        messages=[
+            {"role": "system", "content": SYNTHESIS_SYSTEM},
+            {"role": "user", "content": f"""Question: {question}
+Execution result:
+{raw_output[:3000]}
+Write the insight:"""},
+        ],
+        max_tokens=350,
+        temperature=0.4,
+    )
+    return response.choices[0].message.content.strip()
+# ─────────────────────────────────────────────
+# MAIN CHAT HANDLER
+# ─────────────────────────────────────────────
+def analyze_excel(message: str, history: list, excel_file):
     """
+    Full 3-step pipeline:
+      user question → code generation → sandboxed execution → insight synthesis
+    Supports streaming (yield) for live status updates in ChatInterface.
     """
+    # Guard: file not uploaded
+    if excel_file is None:
+        yield "⚠️ Please upload an Excel (.xlsx) file first using the upload box above."
+        return
+    # Load dataset
+    try:
+        df = load_excel(excel_file)
+        df_info = get_df_info(df)
+    except Exception as e:
+        yield f"❌ Failed to read the Excel file: {e}"
+        return
+    # ── Step 1: Generate Code ─────────────────────────────────────────────
+    yield "🔍 Generating pandas code for your question..."
+    try:
+        code = generate_code(message, df_info, history)
+    except Exception as e:
+        yield f"❌ Code generation failed: {e}"
+        return
+    # ── Step 2: Execute Code ──────────────────────────────────────────────
+    yield "⚙️ Executing code on your dataset..."
+    exec_error = None
+    try:
+        raw_result = safe_execute(code, df)
+        raw_str = format_result(raw_result)
+    except PermissionError as pe:
+        exec_error = str(pe)
+        raw_str = exec_error
+    except Exception as e:
+        exec_error = f"{type(e).__name__}: {e}"
+        raw_str = exec_error
+    # ── Step 3: Synthesize Insight ────────────────────────────────────────
+    if exec_error:
+        yield f"""⚠️ **Execution Error**
+```
+{exec_error}
+```
+<details>
+<summary>🐍 Generated Code (for debugging)</summary>
+```python
+{code}
+```
+</details>"""
+        return
+    yield "💡 Synthesizing insight..."
+    try:
+        insight = synthesize_insight(message, raw_str)
+    except Exception as e:
+        insight = f"_(Could not generate insight: {e})_"
+    # ── Final formatted response ──────────────────────────────────────────
+    yield f"""{insight}
+---
+<details>
+<summary>🐍 View Generated Code</summary>
+```python
+{code}
+```
+</details>
+<details>
+<summary>📤 View Raw Output</summary>
+```
+{raw_str[:2000]}
+```
+</details>"""
+# ─────────────────────────────────────────────
+# GRADIO UI
+# ─────────────────────────────────────────────
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 📊 Technical Assessment: Data Analysis Agent")
     gr.Markdown("### Objective: Build a Text-to-Code workflow using Qwen 2.5")
     with gr.Row():
         excel_file = gr.File(
             label="1. Upload Dataset (.xlsx)",
             file_types=[".xlsx"]
         )
     gr.ChatInterface(
         fn=analyze_excel,
         additional_inputs=[excel_file],