# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1DSQjlXwb4UHeF4RILlBwwHfN5jRyyvov
"""

import gradio as gr
import pandas as pd
import os
from tempfile import NamedTemporaryFile
from uuid import uuid4

# LangChain & Groq imports for embedding and LLM access
from langchain_community.document_loaders import CSVLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_groq import ChatGroq

# ========== API Key Setup ==========
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY or not GROQ_API_KEY.strip():
    raise ValueError("❌ Please provide GROQ_API_KEY as a Space secret (Settings > Secrets).")

# ========== Global State ==========
vectorstore = None  # Chroma index for the currently loaded CSV (None until a CSV is indexed)
df = None           # pandas DataFrame of the currently loaded CSV

_embeddings = None  # lazily created embedding model, reused across uploads


def _get_embeddings():
    """Create the sentence-transformer embedding model once and reuse it.

    Instantiating HuggingFaceEmbeddings loads the model weights, which is
    slow; caching it avoids redoing that work on every CSV upload.
    """
    global _embeddings
    if _embeddings is None:
        _embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
        )
    return _embeddings


def safe_delete_collection(vs):
    """Deletes Chroma collection if exists (prevents stale retrieval)."""
    try:
        if vs is not None and hasattr(vs, "delete_collection"):
            vs.delete_collection()
    except Exception:
        # Deliberate best-effort cleanup: a failure to drop the old
        # collection must never block indexing the new upload.
        pass


def load_and_index_csv(file):
    """Load the CSV, build vectorstore with embeddings, and provide preview/status.

    Args:
        file: Gradio file object (exposes a ``.name`` path) or None.

    Returns:
        Tuple ``(preview_html, status_message, chat_enabled)`` where
        ``chat_enabled`` is True on success, False on failure, and None
        when no file was supplied.
    """
    global vectorstore, df

    if file is None:
        return None, "⚠️ Please upload a CSV.", None

    tmp_path = None
    try:
        # Remove the prior index/collection and state so a previous CSV
        # can never leak into retrieval for the new one.
        safe_delete_collection(vectorstore)
        vectorstore = None
        df = None

        # Round-trip the user's CSV through pandas and a temp file so
        # CSVLoader indexes exactly what pandas parsed.
        df = pd.read_csv(file.name)
        csv_text = df.to_csv(index=False)
        with NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
            tmp.write(csv_text.encode("utf-8"))
            tmp_path = tmp.name

        # Create new index with a unique collection name for this upload.
        loader = CSVLoader(file_path=tmp_path)
        docs = loader.load()
        collection_name = f"csvpal-{uuid4().hex[:8]}"
        vectorstore = Chroma.from_documents(
            documents=docs,
            embedding=_get_embeddings(),
            collection_name=collection_name,
        )

        preview_html = df.head().to_html(index=False)
        status_msg = f"✅ CSV Loaded! {len(df)} rows, {len(df.columns)} columns."
        return preview_html, status_msg, True
    except Exception as e:
        return None, f"Error loading CSV: {e}", False
    finally:
        # Bug fix: the original used delete=False with no cleanup, leaking
        # one temp file per upload. Remove it once indexing finished.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass


def answer_question(user_input):
    """Retrieve top-10 relevant rows and ask the LLM to answer strictly using them.

    Args:
        user_input: The user's natural-language question.

    Returns:
        The model's answer, or a user-facing warning/error string.
    """
    if vectorstore is None or df is None:
        return "⚠️ Please upload and load a CSV first."
    if not user_input or not user_input.strip():
        return "⚠️ Please enter a question."

    # Retrieval and strict grounding: only the top-k rows reach the LLM.
    k = 10
    try:
        docs_and_scores = vectorstore.similarity_search_with_score(user_input, k=k)
    except Exception as e:
        return f"❌ Retrieval error: {e}"

    if not docs_and_scores:
        return "⚠️ No relevant data found for your question in the CSV."

    try:
        # Documents normally expose .page_content; fall back defensively.
        context = "\n".join(
            getattr(doc, "page_content", getattr(doc, "content", ""))
            for doc, _ in docs_and_scores
        )
    except Exception:
        context = ""

    prompt = f"""You are an expert data analyst AI assistant.
Use ONLY the following CSV data to answer the question below.
If the answer cannot be found in the data, say 'No sufficient data to answer.'
Do NOT guess or invent information.

Data:
{context}

Question: {user_input}

Please answer clearly and provide Python pandas code using DataFrame `df` to reproduce the answer exactly.
"""

    try:
        chat_model = ChatGroq(model_name="llama-3.3-70b-versatile", api_key=GROQ_API_KEY)
        response = chat_model.invoke(prompt)
        # Normalize across possible return shapes (plain str or message object).
        if isinstance(response, str):
            answer = response
        elif hasattr(response, "content"):
            answer = response.content
        else:
            answer = str(response)
    except Exception as e:
        answer = f"❌ Error generating answer: {e}"
    return answer


# ========== Gradio UI ==========
with gr.Blocks(title="CsvPal-AI: Chat with Your CSV (RAG-powered Q&A)") as demo:
    gr.Markdown(
        """
        # 📊 CsvPal-AI
        Upload a CSV file and ask natural language questions.
        Receive clear, data-grounded answers and Python pandas code snippets.
        """
    )
    with gr.Row():
        csv_file = gr.File(label="📂 Upload CSV", file_types=[".csv"])
        status = gr.Textbox(label="Status", interactive=False, max_lines=3)
    df_preview = gr.HTML(label="🧾 CSV Preview")
    gr.Markdown(
        "⚠️ Note: Answers are based on the top 10 most relevant rows retrieved from your CSV for each question."
    )
    gr.Markdown("----")
    user_input = gr.Textbox(
        label="📝 Ask your question:",
        placeholder="e.g. What analysis can be done in the CSV data?",
        interactive=False,
        lines=1
    )
    with gr.Row():
        ask_btn = gr.Button("🤖 Get Answer", interactive=False)
        clear_btn = gr.Button("Clear")
    answer_output = gr.Textbox(
        label="Answer",
        lines=15,
        interactive=False
    )

    # UI clear on manual file clear: reset widgets and disable chat.
    def clear_all_from_file():
        return "", "", gr.update(value="", interactive=False), gr.update(interactive=False), ""

    # On file upload, index fresh and enable chat controls on success.
    def on_file_upload(file):
        preview, msg, enable_chat = load_and_index_csv(file)
        return (
            preview or "",
            msg,
            gr.update(value="", interactive=bool(enable_chat)),
            gr.update(interactive=bool(enable_chat)),
            ""  # answer_output
        )

    # Global clear: wipes index, df, and UI.
    def clear_everything():
        global vectorstore, df
        safe_delete_collection(vectorstore)
        vectorstore = None
        df = None
        return (
            None,                                    # csv_file (clears)
            "",                                      # df_preview
            "",                                      # status
            gr.update(value="", interactive=False),  # user_input clear+disable
            gr.update(interactive=False),            # ask_btn disable
            ""                                       # answer_output clear
        )

    csv_file.upload(
        on_file_upload,
        inputs=[csv_file],
        outputs=[df_preview, status, user_input, ask_btn, answer_output]
    )
    csv_file.clear(
        clear_all_from_file,
        None,
        [df_preview, status, user_input, ask_btn, answer_output]
    )
    ask_btn.click(answer_question, inputs=[user_input], outputs=[answer_output])
    user_input.submit(answer_question, inputs=[user_input], outputs=[answer_output])
    clear_btn.click(
        clear_everything,
        inputs=[],
        outputs=[csv_file, df_preview, status, user_input, ask_btn, answer_output]
    )

demo.launch()