File size: 10,052 Bytes
ca8dfad
8e7badb
78f270c
 
 
 
aa28add
ca8dfad
 
78f270c
 
 
 
125ed6a
aa28add
8e7badb
78f270c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e7badb
78f270c
 
 
8e7badb
78f270c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e15f4cd
8e7badb
 
78f270c
8e7badb
 
 
 
 
78f270c
e15f4cd
 
 
 
8e7badb
e15f4cd
ca8dfad
 
8e7badb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
import gradio as gr
import pandas as pd
import traceback
import sys
import io
import re
import os
from huggingface_hub import InferenceClient

# ─────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────
# Model served via the Hugging Face serverless Inference API.
MODEL_ID = "Qwen/Qwen2.5-72B-Instruct"
# NOTE(review): the secret is read under the lowercase key "hf_token" — this
# must match the exact secret name configured in Space Settings → Secrets,
# otherwise the client silently runs unauthenticated (token=None).
HF_TOKEN = os.environ.get("hf_token")  # set this in Space Settings β†’ Secrets
# Single module-level client shared by all requests.
client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)

# ─────────────────────────────────────────────
# STEP 1 β€” LOAD EXCEL
# ─────────────────────────────────────────────
def load_excel(file) -> pd.DataFrame:
    """Load an uploaded .xlsx into a DataFrame.

    Parameters
    ----------
    file : gradio File value; the temp-file path is exposed as ``file.name``.

    Returns
    -------
    pd.DataFrame
        Contents of the first sheet, with header labels stripped of
        surrounding whitespace.
    """
    xl = pd.ExcelFile(file.name)
    # Multi-sheet workbooks: use the first sheet by default.
    df = xl.parse(xl.sheet_names[0])
    # Clean column names. NOTE: `df.columns.str.strip()` would turn any
    # non-string label (numeric or date headers are common in Excel) into
    # NaN, silently destroying the column name — coerce to str first.
    df.columns = [str(col).strip() for col in df.columns]
    return df


def get_df_info(df: pd.DataFrame) -> str:
    """Build a compact dataset description for the LLM prompt.

    Includes column dtypes, shape, the first five rows, and a numeric
    summary. ``df.describe()`` raises ValueError on a DataFrame with no
    columns (e.g. an empty sheet); fall back to a placeholder so a bad
    upload degrades gracefully instead of crashing the pipeline.
    """
    try:
        numeric_summary = df.describe().to_string()
    except ValueError:
        numeric_summary = "(no summarizable columns)"
    return f"""Columns & dtypes:
{df.dtypes.to_string()}

Shape: {df.shape[0]} rows x {df.shape[1]} columns

Sample (first 5 rows):
{df.head(5).to_string(index=False)}

Numeric summary:
{numeric_summary}
"""


# ─────────────────────────────────────────────
# STEP 2 β€” CODE GENERATION via Qwen 2.5
# ─────────────────────────────────────────────
# System prompt for the code-generation call. This string is sent to the
# model verbatim — do not edit it for style; the rules below are what keep
# the model's output executable by safe_execute() (no imports, `result`
# variable, no markdown fences).
CODE_GEN_SYSTEM = """You are an expert Python data analyst. 
Given a dataset description and a user question, generate ONLY executable Python/pandas code.

STRICT RULES:
- The DataFrame is already loaded as variable `df`.
- Only use pandas (pd) and Python built-ins. Do NOT import anything else.
- Store your final answer in a variable called `result`.
- `result` must be a string, number, Series, or DataFrame.
- Do NOT wrap output in markdown code fences.
- Do NOT add explanations or comments β€” code only.
"""

def generate_code(question: str, df_info: str, history: list) -> str:
    """Ask Qwen 2.5 to generate pandas code for the question.

    Parameters
    ----------
    question : the user's current natural-language question.
    df_info : compact dataset description produced by ``get_df_info``.
    history : gradio ``type="messages"`` chat history; the last 6 entries
        (3 Q&A pairs) are replayed for conversational context.

    Returns
    -------
    str
        The model's code with any accidental markdown fences stripped.
    """
    messages = [{"role": "system", "content": CODE_GEN_SYSTEM}]

    # Add prior turns for conversation context (last 3 Q&A pairs).
    # Guard both keys: gradio "messages" history can contain entries with
    # missing keys or non-string content (e.g. file-upload tuples), which
    # would raise KeyError here or be rejected by the chat API.
    for msg in history[-6:]:
        if msg.get("role") in ("user", "assistant") and isinstance(msg.get("content"), str):
            messages.append({"role": msg["role"], "content": msg["content"]})

    messages.append({
        "role": "user",
        "content": f"""Dataset info:
{df_info}

Question: {question}

Write the pandas code now:"""
    })

    response = client.chat_completion(
        messages=messages,
        max_tokens=600,
        temperature=0.1,  # low temperature: we want deterministic, runnable code
    )
    code = response.choices[0].message.content.strip()
    return _strip_code_fences(code)


def _strip_code_fences(code: str) -> str:
    """Remove accidental markdown code fences (```python ... ```) from model output."""
    code = re.sub(r"^```(?:python)?", "", code, flags=re.MULTILINE).strip()
    code = re.sub(r"```$", "", code, flags=re.MULTILINE).strip()
    return code


# ─────────────────────────────────────────────
# STEP 3 β€” SANDBOXED EXECUTION
# ─────────────────────────────────────────────
# Substring patterns that are refused outright before execution. This is a
# coarse, defense-in-depth filter (substring matching can both over-block
# and be bypassed); the restricted namespace below is the main containment.
BLACKLIST = [
    "import os", "import sys", "subprocess", "open(",
    "__import__", "shutil", "socket", "requests",
    "eval(", "exec(", "globals(", "locals(",
]

def safe_execute(code: str, df: pd.DataFrame):
    """Run generated code inside a restricted namespace.

    Returns the value the code bound to ``result``, or the captured stdout
    when ``result`` was never set (falling back to a fixed message when
    there was no output either). Raises PermissionError when the code
    contains a blacklisted pattern; any exception raised by the code
    itself propagates to the caller.
    """
    blocked = next((pat for pat in BLACKLIST if pat in code), None)
    if blocked is not None:
        raise PermissionError(f"Blocked unsafe pattern: `{blocked}`")

    # Whitelist of harmless builtins exposed to the generated code.
    allowed_builtins = {
        "len": len, "range": range, "print": print,
        "str": str, "int": int, "float": float,
        "list": list, "dict": dict, "tuple": tuple,
        "sum": sum, "min": min, "max": max, "round": round,
        "enumerate": enumerate, "zip": zip, "sorted": sorted,
        "isinstance": isinstance, "type": type, "abs": abs,
        "bool": bool, "set": set, "map": map, "filter": filter,
    }
    sandbox = {
        "__builtins__": allowed_builtins,
        "pd": pd,
        "df": df.copy(),  # work on a copy so the caller's frame is untouched
        "result": None,
    }

    # Capture anything the code prints so it can serve as a fallback result.
    captured = io.StringIO()
    saved_stdout = sys.stdout
    sys.stdout = captured
    try:
        exec(code, sandbox)
    finally:
        sys.stdout = saved_stdout  # always restore, even when the code raises

    outcome = sandbox.get("result")
    if outcome is None:
        outcome = captured.getvalue().strip() or "Code ran but produced no output."
    return outcome


def format_result(result) -> str:
    """Render an execution result of any supported type as display text."""
    if isinstance(result, pd.DataFrame):
        if result.empty:
            return "Empty DataFrame returned."
        return result.to_string(index=False)
    if isinstance(result, pd.Series):
        return result.to_string()
    return str(result)


# ─────────────────────────────────────────────
# STEP 4 β€” INSIGHT SYNTHESIS via Qwen 2.5
# ─────────────────────────────────────────────
# System prompt for the second model call: it rewrites raw execution output
# as a short business-facing insight. Sent to the model verbatim.
SYNTHESIS_SYSTEM = """You are a friendly, concise data analyst.
Given a user's question and raw output from Python execution,
write a clear natural-language insight in 2-4 sentences.
- Highlight key numbers or trends.
- Do NOT mention code, pandas, or DataFrames.
- Speak directly to the business insight.
"""

def synthesize_insight(question: str, raw_output: str) -> str:
    """Ask Qwen 2.5 to turn raw execution output into a plain-English insight."""
    # Cap the raw output at 3000 chars so the prompt stays within context limits.
    user_prompt = f"""Question: {question}

Execution result:
{raw_output[:3000]}

Write the insight:"""
    reply = client.chat_completion(
        messages=[
            {"role": "system", "content": SYNTHESIS_SYSTEM},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=350,
        temperature=0.4,
    )
    return reply.choices[0].message.content.strip()


# ─────────────────────────────────────────────
# MAIN CHAT HANDLER
# ─────────────────────────────────────────────
def analyze_excel(message: str, history: list, excel_file):
    """
    Full 3-step pipeline:
      user question β†’ code generation β†’ sandboxed execution β†’ insight synthesis

    Generator handler for gr.ChatInterface: each intermediate `yield`
    replaces the assistant message in place, giving live status updates;
    the final `yield` is the answer shown to the user.

    Parameters:
      message: the user's current question.
      history: prior chat turns (gradio `type="messages"` dicts), forwarded
        to generate_code for conversational context.
      excel_file: the gr.File value wired in via additional_inputs; None
        until the user uploads a file.
    """
    # Guard: file not uploaded
    if excel_file is None:
        yield "⚠️ Please upload an Excel (.xlsx) file first using the upload box above."
        return

    # Load dataset (re-read on every question — keeps the handler stateless)
    try:
        df = load_excel(excel_file)
        df_info = get_df_info(df)
    except Exception as e:
        yield f"❌ Failed to read the Excel file: {e}"
        return

    # ── Step 1: Generate Code ─────────────────────────────────────────────
    yield "πŸ” Generating pandas code for your question..."
    try:
        code = generate_code(message, df_info, history)
    except Exception as e:
        yield f"❌ Code generation failed: {e}"
        return

    # ── Step 2: Execute Code ──────────────────────────────────────────────
    yield "βš™οΈ Executing code on your dataset..."
    # Errors are captured rather than raised so the generated code can be
    # shown to the user for debugging below.
    exec_error = None
    try:
        raw_result = safe_execute(code, df)
        raw_str = format_result(raw_result)
    except PermissionError as pe:
        # Blacklisted pattern in the generated code — report, don't run.
        exec_error = str(pe)
        raw_str = exec_error
    except Exception as e:
        exec_error = f"{type(e).__name__}: {e}"
        raw_str = exec_error

    # ── Step 3: Synthesize Insight ────────────────────────────────────────
    if exec_error:
        # On failure, skip synthesis and show the error plus the code.
        yield f"""⚠️ **Execution Error**

```
{exec_error}
```

<details>
<summary>🐍 Generated Code (for debugging)</summary>

```python
{code}
```
</details>"""
        return

    yield "πŸ’‘ Synthesizing insight..."
    try:
        insight = synthesize_insight(message, raw_str)
    except Exception as e:
        # Insight is nice-to-have; still show code + raw output below.
        insight = f"_(Could not generate insight: {e})_"

    # ── Final formatted response ──────────────────────────────────────────
    yield f"""{insight}

---
<details>
<summary>🐍 View Generated Code</summary>

```python
{code}
```
</details>

<details>
<summary>πŸ“€ View Raw Output</summary>

```
{raw_str[:2000]}
```
</details>"""


# ─────────────────────────────────────────────
# GRADIO UI
# ─────────────────────────────────────────────
# Layout: a file-upload box on top, wired into the chat handler via
# additional_inputs so every question sees the current upload.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# πŸ“Š Technical Assessment: Data Analysis Agent")
    gr.Markdown("### Objective: Build a Text-to-Code workflow using Qwen 2.5")

    with gr.Row():
        excel_file = gr.File(
            label="1. Upload Dataset (.xlsx)",
            file_types=[".xlsx"]
        )

    gr.ChatInterface(
        fn=analyze_excel,
        additional_inputs=[excel_file],
        # "messages" history format: list of {"role", "content"} dicts,
        # which is what generate_code expects.
        type="messages",
        description="2. Ask questions about your data (e.g., 'What is the average profit by region?')",
    )

if __name__ == "__main__":
    demo.launch()