Spaces:

shamilcoded
/

RagBaseApp

Sleeping

App Files Files Community

SHAMIL SHAHBAZ AWAN commited on Dec 25, 2024

Commit

1f8c160

verified ·

1 Parent(s): 755213a

Create app.py

Browse files

Files changed (1) hide show

app.py +83 -0

app.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import os
+import streamlit as st
+from PyPDF2 import PdfReader
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline
+import faiss
+import numpy as np
+from groq import Client  # Assuming Groq API client is installed
+# Load Hugging Face Secrets
+HUGGINGFACE_KEY = os.getenv("HF_API_TOKEN")  # Set in Hugging Face Spaces secret manager
+if not HUGGINGFACE_KEY:
+    st.error("Hugging Face API token not found. Please set it in the Hugging Face Secrets.")
+# Initialize Groq client
+groq_client = Client(api_key=HUGGINGFACE_KEY)
+# Load models
+embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+# Paths
+DOCUMENTS_FOLDER = "documents/"
+VECTORSTORE_FOLDER = "vectorstore/"
+# Initialize FAISS vector store
+if not os.path.exists(VECTORSTORE_FOLDER):
+    os.makedirs(VECTORSTORE_FOLDER)
+vectorstore_path = os.path.join(VECTORSTORE_FOLDER, "index.faiss")
+if os.path.exists(vectorstore_path):
+    index = faiss.read_index(vectorstore_path)
+else:
+    index = faiss.IndexFlatL2(embedder.get_sentence_embedding_dimension())
+# Load and process documents
+def load_documents(folder):
+    documents = []
+    for filename in os.listdir(folder):
+        if filename.endswith(".pdf"):
+            pdf_reader = PdfReader(os.path.join(folder, filename))
+            text = ""
+            for page in pdf_reader.pages:
+                text += page.extract_text()
+            documents.append(text)
+    return documents
+def chunk_text(text, chunk_size=500, overlap=100):
+    chunks = []
+    for i in range(0, len(text), chunk_size - overlap):
+        chunks.append(text[i:i + chunk_size])
+    return chunks
+if st.button("Process Documents"):
+    st.info("Processing documents...")
+    all_text = load_documents(DOCUMENTS_FOLDER)
+    chunks = []
+    for text in all_text:
+        chunks.extend(chunk_text(text))
+    embeddings = embedder.encode(chunks, show_progress_bar=True)
+    index.add(np.array(embeddings))
+    faiss.write_index(index, vectorstore_path)
+    st.success("Documents processed and vectorstore updated!")
+# User interface
+st.title("RAG Application with Streamlit")
+user_query = st.text_input("Enter your query:")
+if user_query:
+    query_embedding = embedder.encode([user_query])
+    distances, indices = index.search(np.array(query_embedding), k=5)
+    retrieved_chunks = [chunks[idx] for idx in indices[0]]
+    st.subheader("Retrieved Chunks")
+    for chunk in retrieved_chunks:
+        st.write(chunk)
+    combined_input = " ".join(retrieved_chunks) + user_query
+    response = groq_client.generate(model="llama-8b", prompt=combined_input, max_tokens=200)
+    st.subheader("Generated Response")
+    st.write(response["text"])