Interview_template

Sleeping

App Files Files Community

Interview_template / app.py

DOMMETI

Update app.py

125ed6a verified 25 days ago

raw

history blame contribute delete

10.1 kB

	import gradio as gr
	import pandas as pd
	import traceback
	import sys
	import io
	import re
	import os
	from huggingface_hub import InferenceClient

	# ─────────────────────────────────────────────
	# CONFIG
	# ─────────────────────────────────────────────
	MODEL_ID = "Qwen/Qwen2.5-72B-Instruct"
	HF_TOKEN = os.environ.get("hf_token") # set this in Space Settings → Secrets
	client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)

	# ─────────────────────────────────────────────
	# STEP 1 — LOAD EXCEL
	# ─────────────────────────────────────────────
	def load_excel(file) -> pd.DataFrame:
	"""Load xlsx into a DataFrame, handling multi-sheet files."""
	xl = pd.ExcelFile(file.name)
	# Use first sheet by default
	df = xl.parse(xl.sheet_names[0])
	df.columns = df.columns.str.strip() # clean column names
	return df


	def get_df_info(df: pd.DataFrame) -> str:
	"""Build a compact dataset description for the LLM prompt."""
	return f"""Columns & dtypes:
	{df.dtypes.to_string()}

	Shape: {df.shape[0]} rows x {df.shape[1]} columns

	Sample (first 5 rows):
	{df.head(5).to_string(index=False)}

	Numeric summary:
	{df.describe().to_string()}
	"""


	# ─────────────────────────────────────────────
	# STEP 2 — CODE GENERATION via Qwen 2.5
	# ─────────────────────────────────────────────
	CODE_GEN_SYSTEM = """You are an expert Python data analyst.
	Given a dataset description and a user question, generate ONLY executable Python/pandas code.

	STRICT RULES:
	- The DataFrame is already loaded as variable `df`.
	- Only use pandas (pd) and Python built-ins. Do NOT import anything else.
	- Store your final answer in a variable called `result`.
	- `result` must be a string, number, Series, or DataFrame.
	- Do NOT wrap output in markdown code fences.
	- Do NOT add explanations or comments — code only.
	"""

	def generate_code(question: str, df_info: str, history: list) -> str:
	"""Ask Qwen 2.5 to generate pandas code for the question."""
	messages = [{"role": "system", "content": CODE_GEN_SYSTEM}]

	# Add prior turns for conversation context (last 3 Q&A pairs)
	for msg in history[-6:]:
	if msg["role"] in ("user", "assistant"):
	messages.append({"role": msg["role"], "content": msg["content"]})

	messages.append({
	"role": "user",
	"content": f"""Dataset info:
	{df_info}

	Question: {question}

	Write the pandas code now:"""
	})

	response = client.chat_completion(
	messages=messages,
	max_tokens=600,
	temperature=0.1,
	)
	code = response.choices[0].message.content.strip()

	# Strip accidental markdown fences
	code = re.sub(r"^```(?:python)?", "", code, flags=re.MULTILINE).strip()
	code = re.sub(r"```$", "", code, flags=re.MULTILINE).strip()
	return code


	# ─────────────────────────────────────────────
	# STEP 3 — SANDBOXED EXECUTION
	# ─────────────────────────────────────────────
	BLACKLIST = [
	"import os", "import sys", "subprocess", "open(",
	"__import__", "shutil", "socket", "requests",
	"eval(", "exec(", "globals(", "locals(",
	]

	def safe_execute(code: str, df: pd.DataFrame):
	"""Execute code in a restricted namespace. Returns result or raises."""
	for pattern in BLACKLIST:
	if pattern in code:
	raise PermissionError(f"Blocked unsafe pattern: `{pattern}`")

	safe_builtins = {
	"len": len, "range": range, "print": print,
	"str": str, "int": int, "float": float,
	"list": list, "dict": dict, "tuple": tuple,
	"sum": sum, "min": min, "max": max, "round": round,
	"enumerate": enumerate, "zip": zip, "sorted": sorted,
	"isinstance": isinstance, "type": type, "abs": abs,
	"bool": bool, "set": set, "map": map, "filter": filter,
	}
	namespace = {
	"__builtins__": safe_builtins,
	"pd": pd,
	"df": df.copy(),
	"result": None,
	}

	old_stdout = sys.stdout
	sys.stdout = buf = io.StringIO()
	try:
	exec(code, namespace)
	finally:
	sys.stdout = old_stdout

	result = namespace.get("result")
	if result is None:
	result = buf.getvalue().strip() or "Code ran but produced no output."
	return result


	def format_result(result) -> str:
	"""Convert any result type to a readable string."""
	if isinstance(result, pd.DataFrame):
	return result.to_string(index=False) if not result.empty else "Empty DataFrame returned."
	elif isinstance(result, pd.Series):
	return result.to_string()
	else:
	return str(result)


	# ─────────────────────────────────────────────
	# STEP 4 — INSIGHT SYNTHESIS via Qwen 2.5
	# ─────────────────────────────────────────────
	SYNTHESIS_SYSTEM = """You are a friendly, concise data analyst.
	Given a user's question and raw output from Python execution,
	write a clear natural-language insight in 2-4 sentences.
	- Highlight key numbers or trends.
	- Do NOT mention code, pandas, or DataFrames.
	- Speak directly to the business insight.
	"""

	def synthesize_insight(question: str, raw_output: str) -> str:
	"""Ask Qwen 2.5 to turn raw output into a plain-English insight."""
	response = client.chat_completion(
	messages=[
	{"role": "system", "content": SYNTHESIS_SYSTEM},
	{"role": "user", "content": f"""Question: {question}

	Execution result:
	{raw_output[:3000]}

	Write the insight:"""},
	],
	max_tokens=350,
	temperature=0.4,
	)
	return response.choices[0].message.content.strip()


	# ─────────────────────────────────────────────
	# MAIN CHAT HANDLER
	# ─────────────────────────────────────────────
	def analyze_excel(message: str, history: list, excel_file):
	"""
	Full 3-step pipeline:
	user question → code generation → sandboxed execution → insight synthesis
	Supports streaming (yield) for live status updates in ChatInterface.
	"""
	# Guard: file not uploaded
	if excel_file is None:
	yield "⚠️ Please upload an Excel (.xlsx) file first using the upload box above."
	return

	# Load dataset
	try:
	df = load_excel(excel_file)
	df_info = get_df_info(df)
	except Exception as e:
	yield f"❌ Failed to read the Excel file: {e}"
	return

	# ── Step 1: Generate Code ─────────────────────────────────────────────
	yield "🔍 Generating pandas code for your question..."
	try:
	code = generate_code(message, df_info, history)
	except Exception as e:
	yield f"❌ Code generation failed: {e}"
	return

	# ── Step 2: Execute Code ──────────────────────────────────────────────
	yield "⚙️ Executing code on your dataset..."
	exec_error = None
	try:
	raw_result = safe_execute(code, df)
	raw_str = format_result(raw_result)
	except PermissionError as pe:
	exec_error = str(pe)
	raw_str = exec_error
	except Exception as e:
	exec_error = f"{type(e).__name__}: {e}"
	raw_str = exec_error

	# ── Step 3: Synthesize Insight ────────────────────────────────────────
	if exec_error:
	yield f"""⚠️ Execution Error

	```
	{exec_error}
	```

	<details>
	<summary>🐍 Generated Code (for debugging)</summary>

	```python
	{code}
	```
	</details>"""
	return

	yield "💡 Synthesizing insight..."
	try:
	insight = synthesize_insight(message, raw_str)
	except Exception as e:
	insight = f"_(Could not generate insight: {e})_"

	# ── Final formatted response ──────────────────────────────────────────
	yield f"""{insight}

	---
	<details>
	<summary>🐍 View Generated Code</summary>

	```python
	{code}
	```
	</details>

	<details>
	<summary>📤 View Raw Output</summary>

	```
	{raw_str[:2000]}
	```
	</details>"""


	# ─────────────────────────────────────────────
	# GRADIO UI
	# ─────────────────────────────────────────────
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	gr.Markdown("# 📊 Technical Assessment: Data Analysis Agent")
	gr.Markdown("### Objective: Build a Text-to-Code workflow using Qwen 2.5")

	with gr.Row():
	excel_file = gr.File(
	label="1. Upload Dataset (.xlsx)",
	file_types=[".xlsx"]
	)

	gr.ChatInterface(
	fn=analyze_excel,
	additional_inputs=[excel_file],
	type="messages",
	description="2. Ask questions about your data (e.g., 'What is the average profit by region?')",
	)

	if __name__ == "__main__":
	demo.launch()