Spaces:

rithwikreal
/

AnalysisApp

Sleeping

App Files Files Community

AnalysisApp / app.py

rithwikreal

Update app.py

59cee21 verified 3 months ago

raw

history blame contribute delete

9.15 kB

	# app.py
	import gradio as gr
	import pandas as pd
	import io
	import os
	import google.generativeai as genai
	import gc
	import traceback
	from typing import Tuple, Optional

	# The Gemini credential comes from the environment (Space Secrets); it is never
	# written into the source. A missing or empty value aborts start-up.
	GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
	if GEMINI_API_KEY is None or GEMINI_API_KEY == "":
	    raise ValueError("Gemini API key not set. Please add GEMINI_API_KEY in Space Secrets.")
	genai.configure(api_key=GEMINI_API_KEY)

	# Currently loaded DataFrame, or None when nothing is loaded.
	# This is a module-level global, so it is shared by every caller of this process.
	session_df = None

	# ---------------- robust file-reading helper ----------------
	def _upload_display_name(obj) -> Optional[str]:
	    """Return the object's ``name`` attribute, else ``filename``, else None."""
	    return getattr(obj, "name", None) or getattr(obj, "filename", None)


	def _read_path_if_exists(candidate) -> Optional[Tuple[bytes, str]]:
	    """Read ``candidate`` when it is a path to an existing, readable file.

	    Returns ``(content, basename)``; returns None when ``candidate`` is not a
	    ``str``/``os.PathLike``, does not exist, or cannot be opened.
	    """
	    if not isinstance(candidate, (str, os.PathLike)):
	        return None
	    try:
	        path = os.fsdecode(candidate)
	        if not os.path.exists(path):
	            return None
	        with open(path, "rb") as handle:
	            return handle.read(), os.path.basename(path)
	    except (OSError, TypeError, ValueError):
	        # Directories, permission problems, malformed paths: not a usable upload.
	        return None


	def read_file_bytes_flexible(file) -> Tuple[Optional[bytes], Optional[str], Optional[str]]:
	    """
	    Extract raw bytes and a filename from an uploaded object of unknown shape.

	    Strategies are tried in order: raw bytes, a filesystem path (``str`` or
	    ``os.PathLike``), a ``bytes`` attribute, a synchronous ``read()``, a nested
	    ``.file.read()``, a dict payload, and finally path-like attributes
	    (``name``, ``filename``, ``path``).

	    An empty ``read()`` result is only used as a last resort, so that an object
	    whose stream is empty or already consumed can still be loaded from the path
	    it advertises.

	    Returns: (content_bytes | None, filename | None, error_message | None).
	    Exactly one of ``content_bytes`` / ``error_message`` is not None.
	    """
	    if file is None:
	        return None, None, "No file uploaded."

	    # 1) Already raw bytes.
	    if isinstance(file, (bytes, bytearray)):
	        return bytes(file), None, None

	    # 2) A path given directly as str / os.PathLike.
	    located = _read_path_if_exists(file)
	    if located is not None:
	        return located[0], located[1], None

	    # 3) Object exposes a 'bytes' attribute (some wrappers do).
	    try:
	        payload = getattr(file, "bytes", None)
	        if isinstance(payload, (bytes, bytearray)):
	            return bytes(payload), _upload_display_name(file), None
	    except Exception:
	        pass

	    # Result of a read() that succeeded but yielded nothing; returned only if
	    # no later strategy finds real content.
	    empty_fallback = None

	    # 4) Object has a callable read().
	    reader = getattr(file, "read", None)
	    if callable(reader):
	        try:
	            content = reader()
	            if hasattr(content, "__await__"):
	                # Async read(): it cannot be awaited from sync code. Close the
	                # coroutine so it does not warn about never being awaited, then
	                # fall through to the .file.read() strategy below.
	                closer = getattr(content, "close", None)
	                if callable(closer):
	                    closer()
	            else:
	                if isinstance(content, str):
	                    # Text-mode stream: normalise to bytes.
	                    content = content.encode("utf-8")
	                if isinstance(content, (bytes, bytearray)):
	                    if content:
	                        return bytes(content), _upload_display_name(file), None
	                    empty_fallback = (b"", _upload_display_name(file), None)
	        except Exception:
	            # read() may need arguments or fail in this context; try other ways.
	            pass

	    # 5) Object wraps a file object in .file (e.g. starlette UploadFile.file).
	    try:
	        inner = getattr(file, "file", None)
	        if inner is not None and hasattr(inner, "read"):
	            content = inner.read()
	            if isinstance(content, (bytes, bytearray)):
	                if content:
	                    return bytes(content), _upload_display_name(file), None
	                if empty_fallback is None:
	                    empty_fallback = (b"", _upload_display_name(file), None)
	    except Exception:
	        pass

	    # 6) Dict-like payload: look for bytes or a path under common keys.
	    if isinstance(file, dict):
	        for key in ("content", "data", "bytes", "file", "body", "path"):
	            value = file.get(key)
	            if isinstance(value, (bytes, bytearray)):
	                return bytes(value), file.get("name") or file.get("filename"), None
	            located = _read_path_if_exists(value)
	            if located is not None:
	                return located[0], located[1], None

	    # 7) Attributes that may hold a path string.
	    for attr in ("name", "filename", "path"):
	        try:
	            candidate = getattr(file, attr, None)
	        except Exception:
	            continue
	        located = _read_path_if_exists(candidate)
	        if located is not None:
	            return located[0], located[1], None

	    # 8) A stream that read cleanly but was empty is still a valid (empty) upload.
	    if empty_fallback is not None:
	        return empty_fallback

	    # 9) Give up with a helpful error (include repr for debugging).
	    try:
	        rep = repr(file)
	    except Exception:
	        rep = "<unrepresentable object>"
	    return None, None, f"Uploaded file format not supported by this server environment. Object repr: {rep}"

	# ---------------- load file to DataFrame ----------------
	def _is_excel_upload(filename, content: bytes) -> bool:
	    """Decide whether an upload should be parsed as Excel rather than CSV.

	    An explicit file extension wins. Without one, a ZIP or OLE2 container
	    signature means Excel; otherwise a comma in the first 200 bytes means CSV,
	    and anything else is assumed to be Excel.
	    """
	    lowered = (filename or "").lower()
	    if lowered.endswith((".xlsx", ".xlsm", ".xls")):
	        return True
	    if lowered.endswith(".csv"):
	        return False
	    # Binary workbook containers can contain a 0x2C byte near the start; check
	    # their signature first so they are not mistaken for comma-separated text.
	    if content.startswith((b"PK\x03\x04", b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1")):
	        return True
	    return b"," not in content[:200]


	def load_file(file) -> Tuple[Optional[pd.DataFrame], str]:
	    """
	    Read an uploaded file into a DataFrame and store it in ``session_df``.

	    Returns (df or None, status_message). On any failure the DataFrame is None,
	    ``session_df`` is left unchanged, and the message describes the problem.
	    """
	    global session_df
	    content, fname, err = read_file_bytes_flexible(file)
	    if err:
	        return None, f"Error reading file: {err}"
	    if not content:
	        # Covers both None and a zero-length upload; neither can be parsed.
	        return None, "No bytes could be read from uploaded object."

	    try:
	        buffer = io.BytesIO(content)
	        if _is_excel_upload(fname, content):
	            df = pd.read_excel(buffer)
	        else:
	            df = pd.read_csv(buffer)
	    except Exception as e:
	        # Include the traceback to help debug unusual formats (shown in the UI only).
	        tb = traceback.format_exc()
	        return None, f"Error parsing file into DataFrame: {e}\n{tb}"
	    finally:
	        # Drop the raw upload as soon as parsing has finished or failed.
	        del content
	        gc.collect()

	    session_df = df
	    return df, f"File loaded: {df.shape[0]} rows x {df.shape[1]} columns."

	# ---------------- Gemini-powered question answering ----------------
	def _strip_code_fence(reply: str) -> str:
	    """Return the Python source from a model reply, without Markdown fencing.

	    Handles replies wrapped in a triple-backtick fence, with or without a
	    language tag such as ``python``, as well as bare or single-backtick replies.
	    """
	    text = reply.strip()
	    if "```" in text:
	        # Keep only what sits between the first opening fence and the next fence.
	        text = text.split("```", 1)[1].split("```", 1)[0]
	        first_line, newline, remainder = text.partition("\n")
	        # The opening fence line may carry a language tag; it is not code.
	        if newline and first_line.strip().lower() in ("", "python", "python3", "py"):
	            text = remainder
	    return text.strip("`\n ")


	def ask_question_gemini(query: str):
	    """
	    Ask Gemini for pandas code answering ``query`` and run it on the loaded data.

	    The prompt contains the column names and the first 10 rows of ``session_df``.
	    The reply is expected to be Python code that assigns a DataFrame to ``result``
	    using ``df`` (a copy of the dataset) and ``pd``.

	    Returns (DataFrame or None, status_message). A scalar ``result`` is wrapped
	    into a one-row DataFrame; DataFrame results are capped at 200 rows.
	    """
	    global session_df
	    if session_df is None:
	        return None, "Please upload and load a file first."
	    if not query or not query.strip():
	        return None, "Please enter a question."

	    # build prompt: include columns & small preview
	    cols = list(session_df.columns)
	    preview_csv = session_df.head(10).to_csv(index=False)
	    # The literal braces in the example below are doubled: inside an f-string a
	    # single {'value':[...]} is a replacement field and raises ValueError.
	    prompt = f"""
	You are a helpful Python data analyst. The user uploaded a dataset with columns: {cols}.
	Here are the first 10 rows (CSV):
	{preview_csv}

	User question: {query}

	Return ONLY Python code (no explanations) that when executed will create a pandas DataFrame named `result`
	that contains the answer (a DataFrame, up to 200 rows). Use `df` as the variable for the dataset.
	Do not import libraries; assume pandas is available as pd. If you need to compute percentages, include them as columns.
	If the query asks for a single number, return it as a one-row DataFrame, e.g. pd.DataFrame({{'value':[...]}}).
	"""
	    try:
	        model = genai.GenerativeModel("gemini-pro")
	        response = model.generate_content(prompt)
	        code = _strip_code_fence(response.text)
	    except Exception as e:
	        return None, f"Error calling Gemini: {e}"

	    # NOTE(security): exec() runs model-generated code, steered by user-supplied
	    # text, with full builtins available. This is NOT a sandbox; isolate the
	    # process if the app is exposed to untrusted users.
	    #
	    # A single dict serves as both globals and locals so that comprehensions,
	    # lambdas and functions inside the generated code can see `df` and `pd`.
	    namespace = {"pd": pd, "df": session_df.copy(), "result": None}
	    try:
	        exec(code, namespace)
	    except Exception as e:
	        tb = traceback.format_exc()
	        return None, f"Error executing code returned by Gemini: {e}\nCode was:\n{code}\n\nTraceback:\n{tb}"

	    result = namespace.get("result")
	    if isinstance(result, pd.DataFrame):
	        # limit to 200 rows to avoid huge outputs
	        return result.head(200), "Success — executed Gemini code."
	    # is_scalar also accepts numpy scalars (e.g. the np.int64 from an integer sum),
	    # which are not instances of the builtin int.
	    if result is not None and pd.api.types.is_scalar(result):
	        return pd.DataFrame({"value": [result]}), "Gemini returned a scalar; wrapped into DataFrame."
	    return None, f"Gemini did not return a DataFrame. Code was:\n{code}"

	# ---------------- Gradio functions ----------------
	def fn_load(file):
	    """Gradio callback for the "Load file" button.

	    Loads the upload through ``load_file`` and returns (preview, message), where
	    the preview is the first 5 rows, or None when loading failed.
	    """
	    frame, message = load_file(file)
	    head = frame.head(5) if frame is not None else None
	    return head, message

	def fn_ask(query):
	    """Gradio callback for the "Ask Gemini" button.

	    Forwards the question to ``ask_question_gemini`` and returns its
	    (result table, status message) pair.
	    """
	    table, message = ask_question_gemini(query)
	    return table, message

	def fn_clear():
	    """Gradio callback for the "Clear / Reset" button.

	    Drops the loaded DataFrame and returns one update per wired output, in the
	    order used by ``clear_btn.click``: file input, preview table, query box,
	    result table.
	    """
	    global session_df
	    session_df = None
	    gc.collect()
	    return (
	        gr.update(value=None),  # file_input
	        gr.update(value=None),  # preview_table
	        gr.update(value=""),    # query_input
	        # result_table is a Dataframe, so it is cleared with None; the empty
	        # string used for the textbox is not a valid table value.
	        gr.update(value=None),
	    )

	# ---------------- UI ----------------
	# Build the Blocks UI: an upload area with preview, a question box with result
	# table, and a reset button, then wire each button to its callback.
	# NOTE(review): indentation is flattened in this copy, so which components sit
	# inside `gr.Row()` (and inside `gr.Blocks()`) cannot be determined here — confirm
	# against the deployed file before re-indenting.
	with gr.Blocks() as demo:
	gr.Markdown("# Chat-with-CSV — Gemini-powered (secure API key via Secrets)")
	with gr.Row():
	file_input = gr.File(label="Upload CSV or XLSX (will not be saved)")
	load_btn = gr.Button("Load file")
	preview_table = gr.Dataframe(headers=None, label="Preview (first 5 rows)")
	file_status = gr.Textbox(label="File status")

	query_input = gr.Textbox(label="Ask a question (English)")
	ask_btn = gr.Button("Ask Gemini")
	result_table = gr.Dataframe(headers=None, label="Result")
	status = gr.Textbox(label="Status / Messages")

	clear_btn = gr.Button("Clear / Reset")

	# fn_load returns (preview, message) -> preview table and file status box.
	load_btn.click(fn=fn_load, inputs=file_input, outputs=[preview_table, file_status])
	# fn_ask returns (result, message) -> result table and status box.
	ask_btn.click(fn=fn_ask, inputs=query_input, outputs=[result_table, status])
	# fn_clear takes no inputs; its four return values map positionally onto these
	# four outputs. file_status and status are not listed, so they keep their text.
	clear_btn.click(fn=fn_clear, outputs=[file_input, preview_table, query_input, result_table])

	# Start the Gradio server only when this file is run as a script.
	if __name__ == "__main__":
	demo.launch()