sourize
committed on
Commit
·
e07c00d
1
Parent(s):
13eba5e
Updated main.py
Browse files- .streamlit/secrets.toml +0 -8
- app.py +83 -54
- requirements.txt +6 -3
.streamlit/secrets.toml
DELETED
|
@@ -1,8 +0,0 @@
|
|
| 1 |
-
# .streamlit/secrets.toml
|
| 2 |
-
|
| 3 |
-
# Your FastAPI backend (if you host it separately, e.g. on Railway or Render)
|
| 4 |
-
backend_url = "https://rag-pathway.onrender.com"
|
| 5 |
-
|
| 6 |
-
# Supabase credentials (if you call Supabase directly from Streamlit)
|
| 7 |
-
SUPABASE_URL = "https://iddmmovzjstbinuptpit.supabase.co"
|
| 8 |
-
SUPABASE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImlkZG1tb3Z6anN0YmludXB0cGl0Iiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImlhdCI6MTc0MzQxMDgxMSwiZXhwIjoyMDU4OTg2ODExfQ.MQUoU3JhDSWofJ7Z3zmytbKVF8DOJ9yLBYraDI_YIFw"  # WARNING: this service-role key was committed to the repository; deleting the file does not remove it from git history — the key must be rotated in Supabase.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -1,57 +1,86 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
import
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
)
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
st.sidebar.success(f"Uploaded {len(uploaded)} file(s) and reindexed.")
|
| 27 |
-
|
| 28 |
-
# --- Main chat interface ---
|
| 29 |
-
st.title("🤖 RagBot")
|
| 30 |
-
|
| 31 |
-
# keep chat history in session state
|
| 32 |
-
if "history" not in st.session_state:
|
| 33 |
-
st.session_state.history = []
|
| 34 |
-
|
| 35 |
-
# show previous messages
|
| 36 |
-
for role, text in st.session_state.history:
|
| 37 |
-
align = "→" if role=="user" else "←"
|
| 38 |
-
st.markdown(f"**{align} {role.capitalize()}**: {text}")
|
| 39 |
-
|
| 40 |
-
# input your question
|
| 41 |
-
question = st.text_input("Ask a question about your documents:")
|
| 42 |
-
|
| 43 |
-
if st.button("Send") and question:
|
| 44 |
-
st.session_state.history.append(("user", question))
|
| 45 |
-
with st.spinner("Thinking…"):
|
| 46 |
-
resp = requests.post(
|
| 47 |
-
f"{st.secrets.backend_url}/qa",
|
| 48 |
-
json={"question": question}
|
| 49 |
-
)
|
| 50 |
-
if resp.status_code == 200:
|
| 51 |
-
answer = resp.json().get("answer")
|
| 52 |
-
st.session_state.history.append(("assistant", answer))
|
| 53 |
else:
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
-
|
| 57 |
-
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
from PyPDF2 import PdfReader
|
| 3 |
+
import docx
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
import faiss
|
| 6 |
+
from transformers import pipeline
|
| 7 |
+
|
| 8 |
+
# Caching heavy resources
@st.cache_resource
def load_models():
    """Load and cache the embedding and extractive-QA models.

    Returns:
        tuple: ``(embedder, qa)`` where *embedder* is a
        SentenceTransformer and *qa* is a transformers
        question-answering pipeline.
    """
    # Small, CPU-friendly sentence embedding model.
    embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    # Distilled extractive-QA model fine-tuned on SQuAD.
    qa_pipeline = pipeline('question-answering', model='distilbert-base-uncased-distilled-squad')
    return embedding_model, qa_pipeline
| 16 |
+
|
| 17 |
+
# Extract text from uploaded file
def extract_text_from_file(uploaded_file):
    """Return the plain text of an uploaded PDF, DOCX, or text file.

    Args:
        uploaded_file: file-like object exposing ``.name`` and
            ``.getvalue()`` (e.g. a Streamlit ``UploadedFile``).

    Returns:
        str: the extracted text. Unknown extensions are decoded as
        UTF-8 with undecodable bytes silently dropped.
    """
    filename = uploaded_file.name.lower()
    if filename.endswith('.pdf'):
        # Pages with no extractable text yield None; treat those as empty.
        pages = PdfReader(uploaded_file).pages
        return ''.join(page.extract_text() or '' for page in pages)
    if filename.endswith('.docx'):
        paragraphs = docx.Document(uploaded_file).paragraphs
        return '\n'.join(paragraph.text for paragraph in paragraphs)
    # Fallback: treat anything else (e.g. .txt) as raw UTF-8 text.
    return uploaded_file.getvalue().decode('utf-8', errors='ignore')
|
| 29 |
+
|
| 30 |
+
# Split text into chunks
def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into overlapping, word-based chunks.

    Args:
        text: source text; split on whitespace.
        chunk_size: maximum number of words per chunk.
        overlap: number of words shared between consecutive chunks.

    Returns:
        list[str]: chunks in document order; ``[]`` for empty text.
    """
    words = text.split()
    # Guard against overlap >= chunk_size, which would make the window
    # advance by <= 0 words and loop forever.
    step = max(1, chunk_size - overlap)
    chunks = []
    start = 0
    while start < len(words):
        end = min(start + chunk_size, len(words))
        chunks.append(' '.join(words[start:end]))
        if end == len(words):
            # The tail is fully covered; stepping again would only
            # re-emit a suffix of this final chunk as a duplicate.
            break
        start += step
    return chunks
| 41 |
+
|
| 42 |
+
# Build FAISS index from chunks
@st.cache_resource
def build_faiss_index(chunks, embedder):
    """Embed *chunks* and build an exact L2 FAISS index over them.

    Args:
        chunks: list of text chunks to index.
        embedder: model exposing ``encode``; its output's second
            dimension sets the index dimensionality.

    Returns:
        A ``faiss.IndexFlatL2`` holding one vector per chunk.

    NOTE(review): ``st.cache_resource`` hashes its arguments; the model
    object passed as ``embedder`` may not be hashable by Streamlit —
    confirm in the running app (the usual fix is an underscore-prefixed
    parameter name so Streamlit skips hashing it).
    """
    vectors = embedder.encode(chunks)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index
|
| 50 |
+
|
| 51 |
+
# Main Streamlit app
def main():
    """Streamlit entry point: upload a document, index it, answer questions.

    Renders the UI and returns None; all effects are Streamlit calls.
    """
    st.title('📄 Streamlit RAG: Document QA')
    st.markdown('Upload a PDF or DOCX and ask questions about its content.')

    uploaded = st.file_uploader('Upload Document', type=['pdf', 'docx', 'txt'], accept_multiple_files=False)
    if uploaded:
        with st.spinner('Extracting text...'):
            text = extract_text_from_file(uploaded)
        st.success('Text extracted!')

        # Chunk and index
        chunks = chunk_text(text)
        if not chunks:
            # Scanned PDFs or empty files yield no extractable words;
            # bail out rather than building and searching an empty index.
            st.warning('No text could be extracted from this document.')
            return
        embedder, qa = load_models()
        index = build_faiss_index(chunks, embedder)

        # Ask questions
        question = st.text_input('Ask a question:')
        if question:
            with st.spinner('Searching relevant passages...'):
                q_emb = embedder.encode([question])
                # Never request more neighbours than indexed vectors:
                # FAISS pads missing slots with id -1, which Python
                # indexing would silently resolve to the LAST chunk.
                k = min(3, len(chunks))
                D, I = index.search(q_emb, k=k)
                hits = [i for i in I[0] if i >= 0]
                context = '\n\n'.join(chunks[i] for i in hits)

            with st.spinner('Answering...'):
                result = qa({'question': question, 'context': context})
                answer = result.get('answer', 'Sorry, could not find an answer.')

            st.write('**Answer:**', answer)
            st.write('---')
            st.write('**Context snippets:**')
            for idx in hits:
                st.write('- ', chunks[idx][:200].replace('\n', ' '), '...')

if __name__ == '__main__':
    main()
|
requirements.txt
CHANGED
|
@@ -1,3 +1,6 @@
|
|
| 1 |
-
streamlit
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
faiss-cpu
|
| 3 |
+
sentence-transformers
|
| 4 |
+
transformers
|
| 5 |
+
PyPDF2
|
| 6 |
+
python-docx
|