# CsvPal-AI / app.py
# (Hugging Face Space file-viewer header, kept as comments so the file parses:
#  author "Tulika2000", commit message "Update app.py", revision e8e1300 verified)
# -*- coding: utf-8 -*-
"""app.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1DSQjlXwb4UHeF4RILlBwwHfN5jRyyvov
"""
import gradio as gr
import pandas as pd
import os
from tempfile import NamedTemporaryFile
from uuid import uuid4
# LangChain & Groq imports for embedding and LLM access
from langchain_community.document_loaders import CSVLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_groq import ChatGroq
# ========== API Key Setup ==========
# Read the Groq API key from the environment (configured as a Space secret).
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
# Fail fast at startup: a missing or blank key would otherwise only surface
# on the first question, with a much less actionable error.
if not GROQ_API_KEY or not GROQ_API_KEY.strip():
    raise ValueError("❌ Please provide GROQ_API_KEY as a Space secret (Settings > Secrets).")
# ========== Global State ==========
# Module-level state shared across Gradio callbacks:
#   vectorstore — Chroma index over the currently loaded CSV (None until upload)
#   df          — pandas DataFrame of that CSV (None until upload)
vectorstore = None
df = None
def safe_delete_collection(vs):
    """Best-effort teardown of a Chroma collection so stale rows can't be retrieved.

    Accepts ``None`` or any object; only calls ``delete_collection`` when the
    attribute exists, and swallows every failure — cleanup must never crash
    the app.
    """
    try:
        deleter = getattr(vs, "delete_collection", None) if vs is not None else None
        if deleter is not None:
            deleter()
    except Exception:
        # Deliberately silent: a failed delete just leaves an orphaned
        # collection behind, which is harmless.
        pass
def load_and_index_csv(file):
    """
    Load an uploaded CSV, build a fresh Chroma vectorstore over its rows,
    and return artifacts for the UI.

    Parameters
    ----------
    file : gradio file object, str, or None
        The uploaded CSV. Depending on the Gradio version this is either an
        object exposing a ``.name`` path or a plain string path; both are
        accepted (backward-compatible generalization).

    Returns
    -------
    tuple
        ``(preview_html_or_None, status_message, chat_enabled_flag)`` where
        the flag is truthy only when indexing succeeded.
    """
    global vectorstore, df  # declared first so the rebinds below hit module state

    if file is None:
        return None, "⚠️ Please upload a CSV.", None

    tmp_path = None
    try:
        # Drop the previous index/collection and state so rows from an older
        # upload can never be retrieved.
        safe_delete_collection(vectorstore)
        vectorstore = None
        df = None

        # Gradio may hand us a tempfile-like object (with .name) or a str path.
        csv_path = file if isinstance(file, str) else file.name
        df = pd.read_csv(csv_path)

        # Round-trip through a normalized temp CSV so CSVLoader sees exactly
        # what pandas parsed (consistent quoting/encoding).
        csv_text = df.to_csv(index=False)
        with NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
            tmp.write(csv_text.encode("utf-8"))
            tmp_path = tmp.name

        loader = CSVLoader(file_path=tmp_path)
        docs = loader.load()

        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'}
        )
        # A unique collection name per upload prevents cross-file contamination.
        collection_name = f"csvpal-{uuid4().hex[:8]}"
        vectorstore = Chroma.from_documents(
            documents=docs,
            embedding=embeddings,
            collection_name=collection_name
        )

        preview_html = df.head().to_html(index=False)
        status_msg = f"✅ CSV Loaded! {len(df)} rows, {len(df.columns)} columns."
        return preview_html, status_msg, True
    except Exception as e:
        return None, f"Error loading CSV: {e}", False
    finally:
        # BUGFIX: the temp file was created with delete=False and never
        # removed, leaking one file per upload. Clean it up on every path.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
def answer_question(user_input):
    """Answer *user_input* grounded strictly in the top-10 rows retrieved
    from the indexed CSV; returns a user-facing string in every case
    (answers and error messages alike — nothing is raised to the UI)."""
    global vectorstore, df

    # Guard clauses: require a loaded CSV and a non-blank question.
    if vectorstore is None or df is None:
        return "⚠️ Please upload and load a CSV first."
    if not user_input or not user_input.strip():
        return "⚠️ Please enter a question."

    # Retrieve the 10 most similar rows; surface retrieval failures verbatim.
    top_k = 10
    try:
        hits = vectorstore.similarity_search_with_score(user_input, k=top_k)
    except Exception as e:
        return f"❌ Retrieval error: {e}"
    if not hits:
        return "⚠️ No relevant data found for your question in the CSV."

    # Join retrieved row texts; tolerate documents exposing either
    # `page_content` or `content`, falling back to an empty string.
    try:
        pieces = []
        for doc, _score in hits:
            pieces.append(getattr(doc, "page_content", getattr(doc, "content", "")))
        context = "\n".join(pieces)
    except Exception:
        context = ""

    prompt = f"""You are an expert data analyst AI assistant. Use ONLY the following CSV data to answer the question below.
If the answer cannot be found in the data, say 'No sufficient data to answer.'
Do NOT guess or invent information.
Data:
{context}
Question:
{user_input}
Please answer clearly and provide Python pandas code using DataFrame `df` to reproduce the answer exactly.
"""
    try:
        chat_model = ChatGroq(model_name="llama-3.3-70b-versatile", api_key=GROQ_API_KEY)
        response = chat_model.invoke(prompt)
        # LangChain may return a plain string or a message object with .content.
        if isinstance(response, str):
            return response
        if hasattr(response, "content"):
            return response.content
        return str(response)
    except Exception as e:
        return f"❌ Error generating answer: {e}"
# ========== Gradio UI ==========
# Component creation order defines the page layout; event wiring at the
# bottom connects the callbacks. The question box and Ask button start
# disabled and are enabled only after a successful CSV upload.
with gr.Blocks(title="CsvPal-AI: Chat with Your CSV (RAG-powered Q&A)") as demo:
    gr.Markdown(
        """
        # 📊 CsvPal-AI
        Upload a CSV file and ask natural language questions.
        Receive clear, data-grounded answers and Python pandas code snippets.
        """
    )
    # Upload row: file picker beside a read-only status read-out.
    with gr.Row():
        csv_file = gr.File(label="📂 Upload CSV", file_types=[".csv"])
        status = gr.Textbox(label="Status", interactive=False, max_lines=3)
    df_preview = gr.HTML(label="🧾 CSV Preview")
    gr.Markdown(
        "⚠️ Note: Answers are based on the top 10 most relevant rows retrieved from your CSV for each question."
    )
    gr.Markdown("----")
    # Question input; disabled until a CSV is indexed successfully.
    user_input = gr.Textbox(
        label="📝 Ask your question:",
        placeholder="e.g. What analysis can be done in the CSV data?",
        interactive=False,
        lines=1
    )
    with gr.Row():
        ask_btn = gr.Button("🤖 Get Answer", interactive=False)
        clear_btn = gr.Button("Clear")
    answer_output = gr.Textbox(
        label="Answer",
        lines=15,
        interactive=False
    )

    # UI clear on manual file clear.
    # NOTE(review): this resets only the widgets; the in-memory index is
    # replaced on the next upload, or fully dropped via clear_everything.
    def clear_all_from_file():
        return "", "", gr.update(value="", interactive=False), gr.update(interactive=False), ""

    # On file upload, index fresh and enable the question box + button.
    def on_file_upload(file):
        # Delegates to load_and_index_csv and maps its result onto the widgets.
        preview, msg, enable_chat = load_and_index_csv(file)
        return (
            preview or "",
            msg,
            gr.update(value="", interactive=bool(enable_chat)),
            gr.update(interactive=bool(enable_chat)),
            ""  # answer_output
        )

    # Global clear: wipes index, df, and every widget back to the initial state.
    def clear_everything():
        global vectorstore, df
        safe_delete_collection(vectorstore)
        vectorstore = None
        df = None
        return (
            None,  # csv_file (clears)
            "",  # df_preview
            "",  # status
            gr.update(value="", interactive=False),  # user_input clear+disable
            gr.update(interactive=False),  # ask_btn disable
            ""  # answer_output clear
        )

    # --- Event wiring ---
    csv_file.upload(
        on_file_upload,
        inputs=[csv_file],
        outputs=[df_preview, status, user_input, ask_btn, answer_output]
    )
    csv_file.clear(clear_all_from_file, None, [df_preview, status, user_input, ask_btn, answer_output])
    ask_btn.click(answer_question, inputs=[user_input], outputs=[answer_output])
    # Pressing Enter in the question box behaves like clicking the button.
    user_input.submit(answer_question, inputs=[user_input], outputs=[answer_output])
    clear_btn.click(
        clear_everything,
        inputs=[],
        outputs=[csv_file, df_preview, status, user_input, ask_btn, answer_output]
    )

demo.launch()