Create app.py
app.py
ADDED
@@ -0,0 +1,140 @@
import os
import fitz
import tempfile
import requests
import streamlit as st
import pandas as pd
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings.base import Embeddings
import google.generativeai as genai

# === Embeddings Wrapper ===
# Thin adapter so a local SentenceTransformer model can serve as the
# embedding backend for LangChain's FAISS vectorstore.
class SentenceTransformerEmbeddings(Embeddings):
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        return self.model.encode(texts).tolist()

    def embed_query(self, text):
        return self.model.encode([text])[0].tolist()

# === Utility Functions ===
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    return "\n".join([page.get_text() for page in doc])

def split_text(text, chunk_size=500, overlap=50):
    # Fixed-size character chunks with a small overlap, so text that
    # straddles a boundary still appears intact in at least one chunk.
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

def ask_gemini(question, context, api_key):
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-pro")
    prompt = f"""You are a helpful assistant. Use the context below to answer the question.

Context:
{context}

Question: {question}
Answer:"""
    response = model.generate_content(prompt)
    return response.text

def create_vectorstore(chunks):
    embeddings = SentenceTransformerEmbeddings()
    return FAISS.from_texts(chunks, embedding=embeddings)

def generate_answer(vectorstore, question, api_key):
    docs = vectorstore.similarity_search(question, k=3)
    context = "\n".join([doc.page_content for doc in docs])
    return ask_gemini(question, context, api_key), docs

def extract_website_text(url):
    try:
        res = requests.get(url, timeout=10)
        soup = BeautifulSoup(res.text, "html.parser")
        # Drop scripts and styles so only visible page text is kept
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text(separator="\n")
        return text.strip()
    except Exception as e:
        return f"Error extracting website: {e}"

# === Streamlit App ===
st.set_page_config(page_title="Multi-Source RAG Assistant", layout="wide")
st.title("RAG Assistant: Chat with PDF, CSV, or Website")

# Sidebar
with st.sidebar:
    data_source = st.selectbox("Select Input Type", ["PDF", "CSV", "Website URL"])
    gemini_api_key = st.text_input("Enter Gemini API Key", type="password")

# === Logic by Data Source ===
# Keep the index in session_state so it survives Streamlit reruns; otherwise the
# Website branch loses its index on the rerun triggered by typing a question.
if "vectorstore" not in st.session_state:
    st.session_state.vectorstore = None
full_data_text = ""

if data_source == "PDF":
    pdf_file = st.file_uploader("Upload PDF", type="pdf")
    if pdf_file:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(pdf_file.read())
        # Open after the `with` block so the bytes are flushed to disk first
        text = extract_text_from_pdf(tmp.name)
        chunks = split_text(text)
        st.session_state.vectorstore = create_vectorstore(chunks)
        full_data_text = text
        st.success("PDF processed and indexed!")

elif data_source == "CSV":
    csv_file = st.file_uploader("Upload CSV", type="csv")
    if csv_file:
        df = pd.read_csv(csv_file)
        st.subheader("Exploratory Data Analysis")
        st.dataframe(df)
        st.write("Summary Statistics")
        st.write(df.describe(include="all").transpose())

        csv_text = df.to_string(index=False)
        chunks = split_text(csv_text)
        st.session_state.vectorstore = create_vectorstore(chunks)
        full_data_text = csv_text
        st.success("CSV indexed and ready for Q&A!")

elif data_source == "Website URL":
    url = st.text_input("Enter Website URL")
    if url and st.button("Extract Website"):
        web_text = extract_website_text(url)
        if web_text.startswith("Error"):
            st.error(web_text)
        else:
            chunks = split_text(web_text)
            st.session_state.vectorstore = create_vectorstore(chunks)
            full_data_text = web_text
            st.success("Website text extracted and indexed!")

# === QA Section ===
if st.session_state.vectorstore and gemini_api_key:
    st.subheader("Ask a Question")
    question = st.text_input("Your question")
    if question:
        with st.spinner("Thinking..."):
            answer, top_docs = generate_answer(st.session_state.vectorstore, question, gemini_api_key)
        st.success("Answer")
        st.write(answer)

        with st.expander("Top Relevant Chunks"):
            for i, doc in enumerate(top_docs):
                st.markdown(f"**Chunk {i+1}:**\n```{doc.page_content}```")

        st.download_button("Download Answer", answer, file_name="rag_answer.txt")

elif not gemini_api_key:
    st.info("Please enter your Gemini API key in the sidebar.")
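For a quick sanity check of the index-and-retrieve path without launching Streamlit or calling Gemini, a minimal standalone sketch (the sample chunks and the query are made-up placeholders, not from the app):

# Headless sanity check of the same embed -> FAISS -> similarity_search flow.
from sentence_transformers import SentenceTransformer
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings.base import Embeddings

class MiniEmbeddings(Embeddings):
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        return self.model.encode(texts).tolist()

    def embed_query(self, text):
        return self.model.encode([text])[0].tolist()

# Placeholder chunks standing in for split PDF/CSV/website text
chunks = [
    "FAISS performs fast vector similarity search.",
    "Streamlit builds data apps in pure Python.",
]
store = FAISS.from_texts(chunks, embedding=MiniEmbeddings())
print(store.similarity_search("vector search library", k=1)[0].page_content)

To run the app itself, the imports suggest roughly these PyPI packages: streamlit, pymupdf (provides fitz), requests, pandas, beautifulsoup4, sentence-transformers, langchain, faiss-cpu, and google-generativeai; then launch with `streamlit run app.py`.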