# CsvPal-AI / app.py
# (Hugging Face Space file-viewer header, kept as comments so the file parses:
#  author "Tulika2000", commit message "Update app.py", revision e8e1300 verified)
# -*- coding: utf-8 -*-
"""app.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1DSQjlXwb4UHeF4RILlBwwHfN5jRyyvov
"""
import gradio as gr
import pandas as pd
import os
from tempfile import NamedTemporaryFile
from uuid import uuid4
# LangChain & Groq imports for embedding and LLM access
from langchain_community.document_loaders import CSVLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_groq import ChatGroq
# ========== API Key Setup ==========
# Read the Groq API key from the environment (configured as a Space secret).
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
# Fail fast at startup: a missing or blank key would otherwise only surface
# on the first question, with a much less actionable error.
if not GROQ_API_KEY or not GROQ_API_KEY.strip():
    raise ValueError("❌ Please provide GROQ_API_KEY as a Space secret (Settings > Secrets).")
# ========== Global State ==========
# Module-level state shared across Gradio callbacks:
#   vectorstore — Chroma index over the currently loaded CSV (None until upload)
#   df          — pandas DataFrame of that CSV (None until upload)
vectorstore = None
df = None
def safe_delete_collection(vs):
    """Best-effort teardown of a Chroma collection so stale rows can't be retrieved.

    Accepts ``None`` or any object; only calls ``delete_collection`` when the
    attribute exists, and swallows every failure — cleanup must never crash
    the app.
    """
    try:
        deleter = getattr(vs, "delete_collection", None) if vs is not None else None
        if deleter is not None:
            deleter()
    except Exception:
        # Deliberately silent: a failed delete just leaves an orphaned
        # collection behind, which is harmless.
        pass
def load_and_index_csv(file):
    """
    Load an uploaded CSV, build a fresh Chroma vectorstore over its rows,
    and return artifacts for the UI.

    Parameters
    ----------
    file : gradio file object, str, or None
        The uploaded CSV. Depending on the Gradio version this is either an
        object exposing a ``.name`` path or a plain string path; both are
        accepted (backward-compatible generalization).

    Returns
    -------
    tuple
        ``(preview_html_or_None, status_message, chat_enabled_flag)`` where
        the flag is truthy only when indexing succeeded.
    """
    global vectorstore, df  # declared first so the rebinds below hit module state

    if file is None:
        return None, "⚠️ Please upload a CSV.", None

    tmp_path = None
    try:
        # Drop the previous index/collection and state so rows from an older
        # upload can never be retrieved.
        safe_delete_collection(vectorstore)
        vectorstore = None
        df = None

        # Gradio may hand us a tempfile-like object (with .name) or a str path.
        csv_path = file if isinstance(file, str) else file.name
        df = pd.read_csv(csv_path)

        # Round-trip through a normalized temp CSV so CSVLoader sees exactly
        # what pandas parsed (consistent quoting/encoding).
        csv_text = df.to_csv(index=False)
        with NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
            tmp.write(csv_text.encode("utf-8"))
            tmp_path = tmp.name

        loader = CSVLoader(file_path=tmp_path)
        docs = loader.load()

        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'}
        )
        # A unique collection name per upload prevents cross-file contamination.
        collection_name = f"csvpal-{uuid4().hex[:8]}"
        vectorstore = Chroma.from_documents(
            documents=docs,
            embedding=embeddings,
            collection_name=collection_name
        )

        preview_html = df.head().to_html(index=False)
        status_msg = f"✅ CSV Loaded! {len(df)} rows, {len(df.columns)} columns."
        return preview_html, status_msg, True
    except Exception as e:
        return None, f"Error loading CSV: {e}", False
    finally:
        # BUGFIX: the temp file was created with delete=False and never
        # removed, leaking one file per upload. Clean it up on every path.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
def answer_question(user_input):
    """Answer *user_input* grounded strictly in the top-10 rows retrieved
    from the indexed CSV; returns a user-facing string in every case
    (answers and error messages alike — nothing is raised to the UI)."""
    global vectorstore, df

    # Guard clauses: require a loaded CSV and a non-blank question.
    if vectorstore is None or df is None:
        return "⚠️ Please upload and load a CSV first."
    if not user_input or not user_input.strip():
        return "⚠️ Please enter a question."

    # Retrieve the 10 most similar rows; surface retrieval failures verbatim.
    top_k = 10
    try:
        hits = vectorstore.similarity_search_with_score(user_input, k=top_k)
    except Exception as e:
        return f"❌ Retrieval error: {e}"
    if not hits:
        return "⚠️ No relevant data found for your question in the CSV."

    # Join retrieved row texts; tolerate documents exposing either
    # `page_content` or `content`, falling back to an empty string.
    try:
        pieces = []
        for doc, _score in hits:
            pieces.append(getattr(doc, "page_content", getattr(doc, "content", "")))
        context = "\n".join(pieces)
    except Exception:
        context = ""

    prompt = f"""You are an expert data analyst AI assistant. Use ONLY the following CSV data to answer the question below.
If the answer cannot be found in the data, say 'No sufficient data to answer.'
Do NOT guess or invent information.
Data:
{context}
Question:
{user_input}
Please answer clearly and provide Python pandas code using DataFrame `df` to reproduce the answer exactly.
"""
    try:
        chat_model = ChatGroq(model_name="llama-3.3-70b-versatile", api_key=GROQ_API_KEY)
        response = chat_model.invoke(prompt)
        # LangChain may return a plain string or a message object with .content.
        if isinstance(response, str):
            return response
        if hasattr(response, "content"):
            return response.content
        return str(response)
    except Exception as e:
        return f"❌ Error generating answer: {e}"
# ========== Gradio UI ==========
# Component creation order defines the page layout; event wiring at the
# bottom connects the callbacks. The question box and Ask button start
# disabled and are enabled only after a successful CSV upload.
with gr.Blocks(title="CsvPal-AI: Chat with Your CSV (RAG-powered Q&A)") as demo:
    gr.Markdown(
        """
        # 📊 CsvPal-AI
        Upload a CSV file and ask natural language questions.
        Receive clear, data-grounded answers and Python pandas code snippets.
        """
    )
    # Upload row: file picker beside a read-only status read-out.
    with gr.Row():
        csv_file = gr.File(label="📂 Upload CSV", file_types=[".csv"])
        status = gr.Textbox(label="Status", interactive=False, max_lines=3)
    df_preview = gr.HTML(label="🧾 CSV Preview")
    gr.Markdown(
        "⚠️ Note: Answers are based on the top 10 most relevant rows retrieved from your CSV for each question."
    )
    gr.Markdown("----")
    # Question input; disabled until a CSV is indexed successfully.
    user_input = gr.Textbox(
        label="📝 Ask your question:",
        placeholder="e.g. What analysis can be done in the CSV data?",
        interactive=False,
        lines=1
    )
    with gr.Row():
        ask_btn = gr.Button("🤖 Get Answer", interactive=False)
        clear_btn = gr.Button("Clear")
    answer_output = gr.Textbox(
        label="Answer",
        lines=15,
        interactive=False
    )

    # UI clear on manual file clear.
    # NOTE(review): this resets only the widgets; the in-memory index is
    # replaced on the next upload, or fully dropped via clear_everything.
    def clear_all_from_file():
        return "", "", gr.update(value="", interactive=False), gr.update(interactive=False), ""

    # On file upload, index fresh and enable the question box + button.
    def on_file_upload(file):
        # Delegates to load_and_index_csv and maps its result onto the widgets.
        preview, msg, enable_chat = load_and_index_csv(file)
        return (
            preview or "",
            msg,
            gr.update(value="", interactive=bool(enable_chat)),
            gr.update(interactive=bool(enable_chat)),
            ""  # answer_output
        )

    # Global clear: wipes index, df, and every widget back to the initial state.
    def clear_everything():
        global vectorstore, df
        safe_delete_collection(vectorstore)
        vectorstore = None
        df = None
        return (
            None,  # csv_file (clears)
            "",  # df_preview
            "",  # status
            gr.update(value="", interactive=False),  # user_input clear+disable
            gr.update(interactive=False),  # ask_btn disable
            ""  # answer_output clear
        )

    # --- Event wiring ---
    csv_file.upload(
        on_file_upload,
        inputs=[csv_file],
        outputs=[df_preview, status, user_input, ask_btn, answer_output]
    )
    csv_file.clear(clear_all_from_file, None, [df_preview, status, user_input, ask_btn, answer_output])
    ask_btn.click(answer_question, inputs=[user_input], outputs=[answer_output])
    # Pressing Enter in the question box behaves like clicking the button.
    user_input.submit(answer_question, inputs=[user_input], outputs=[answer_output])
    clear_btn.click(
        clear_everything,
        inputs=[],
        outputs=[csv_file, df_preview, status, user_input, ask_btn, answer_output]
    )

demo.launch()