Spaces:

shamilcoded
/

RagBaseApp

Build error

App Files Files Community

SHAMIL SHAHBAZ AWAN commited on Dec 25, 2024

Commit

458a679

verified ·

1 Parent(s): 71994d6

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -13

app.py CHANGED Viewed

@@ -2,78 +2,106 @@ import os
 import streamlit as st
 import pdfplumber
 from sentence_transformers import SentenceTransformer
-from transformers import pipeline
 import faiss
 import numpy as np
-from groq import Client  # Assuming Groq API client is installed
 # Load Hugging Face Secrets
-HUGGINGFACE_KEY = os.getenv("HUGGINGFACE_KEY")  # Set in Hugging Face Spaces secret manager
 if not HUGGINGFACE_KEY:
     st.error("Hugging Face API token not found. Please set it in the Hugging Face Secrets.")
 # Initialize Groq client
 groq_client = Client(api_key=HUGGINGFACE_KEY)
-# Load models
 embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-# Paths
-file_path = "RagBaseApp/Atomic habits ( PDFDrive ).pdf"
 VECTORSTORE_FOLDER = "vectorstore"
-# Initialize FAISS vector store
 if not os.path.exists(VECTORSTORE_FOLDER):
     os.makedirs(VECTORSTORE_FOLDER)
 vectorstore_path = os.path.join(VECTORSTORE_FOLDER, "index.faiss")
 if os.path.exists(vectorstore_path):
     index = faiss.read_index(vectorstore_path)
 else:
     index = faiss.IndexFlatL2(embedder.get_sentence_embedding_dimension())
-# Load and process the PDF file
 def load_pdf_text(file_path):
-    """Extract text from a PDF file."""
     text = ""
     with pdfplumber.open(file_path) as pdf:
         for page in pdf.pages:
             text += page.extract_text()
     return text
 def chunk_text(text, chunk_size=500, overlap=100):
-    """Split text into overlapping chunks."""
     chunks = []
     for i in range(0, len(text), chunk_size - overlap):
         chunks.append(text[i:i + chunk_size])
     return chunks
-if st.button("Process PDF"):
     st.info("Processing PDF document...")
     text = load_pdf_text(file_path)
     chunks = chunk_text(text)
     embeddings = embedder.encode(chunks, show_progress_bar=True)
     index.add(np.array(embeddings))
     faiss.write_index(index, vectorstore_path)
-    st.success("PDF processed and vectorstore updated!")
-# User interface
 st.title("Atomic Habits RAG Application")
 user_query = st.text_input("Enter your query:")
 if user_query:
     query_embedding = embedder.encode([user_query])
     distances, indices = index.search(np.array(query_embedding), k=5)
     retrieved_chunks = [chunks[idx] for idx in indices[0]]
     st.subheader("Retrieved Chunks")
     for chunk in retrieved_chunks:
         st.write(chunk)
     combined_input = " ".join(retrieved_chunks) + user_query
     response = groq_client.generate(model="llama3-8b-8192", prompt=combined_input, max_tokens=200)
     st.subheader("Generated Response")
     st.write(response["text"])

 import streamlit as st
 import pdfplumber
 from sentence_transformers import SentenceTransformer
 import faiss
 import numpy as np
+from groq import Client
 # Load Hugging Face Secrets
+HUGGINGFACE_KEY = os.getenv("HUGGINGFACE_KEY")
 if not HUGGINGFACE_KEY:
     st.error("Hugging Face API token not found. Please set it in the Hugging Face Secrets.")
 # Initialize Groq client
 groq_client = Client(api_key=HUGGINGFACE_KEY)
+# Load the SentenceTransformer model for embedding generation
 embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+# Define file path and vector store folder
+file_path = "Atomic habits ( PDFDrive ).pdf"  # File directly in the root directory of the app
 VECTORSTORE_FOLDER = "vectorstore"
+# Ensure the vector store folder exists
 if not os.path.exists(VECTORSTORE_FOLDER):
     os.makedirs(VECTORSTORE_FOLDER)
+# Define the vector store path
 vectorstore_path = os.path.join(VECTORSTORE_FOLDER, "index.faiss")
+# Load or create FAISS index
 if os.path.exists(vectorstore_path):
     index = faiss.read_index(vectorstore_path)
 else:
     index = faiss.IndexFlatL2(embedder.get_sentence_embedding_dimension())
+# Function to load text from PDF
 def load_pdf_text(file_path):
+    """Extract text from the given PDF file."""
     text = ""
     with pdfplumber.open(file_path) as pdf:
         for page in pdf.pages:
             text += page.extract_text()
     return text
+# Function to chunk text into smaller pieces
 def chunk_text(text, chunk_size=500, overlap=100):
+    """Chunk the text into overlapping chunks."""
     chunks = []
     for i in range(0, len(text), chunk_size - overlap):
         chunks.append(text[i:i + chunk_size])
     return chunks
+# Process the document and update vector store
+def process_and_store_document(file_path):
+    """Process the PDF document, chunk text, generate embeddings, and store them in FAISS."""
     st.info("Processing PDF document...")
+    # Extract text from the PDF file
     text = load_pdf_text(file_path)
+    # Chunk the text into smaller pieces
     chunks = chunk_text(text)
+    # Generate embeddings for each chunk
     embeddings = embedder.encode(chunks, show_progress_bar=True)
+    # Add the embeddings to the FAISS index
     index.add(np.array(embeddings))
+    # Save the updated FAISS index
     faiss.write_index(index, vectorstore_path)
+    st.success("Document processed and vector store updated!")
+# User interface for Streamlit
 st.title("Atomic Habits RAG Application")
+# Button to trigger document processing
+if st.button("Process PDF"):
+    process_and_store_document(file_path)
+# Query input for the user
 user_query = st.text_input("Enter your query:")
 if user_query:
+    # Generate embedding for the user query
     query_embedding = embedder.encode([user_query])
+    # Perform the search on the FAISS index
     distances, indices = index.search(np.array(query_embedding), k=5)
+    # Retrieve the most relevant chunks based on the indices
     retrieved_chunks = [chunks[idx] for idx in indices[0]]
+    # Display the retrieved chunks
     st.subheader("Retrieved Chunks")
     for chunk in retrieved_chunks:
         st.write(chunk)
+    # Combine the retrieved chunks with the query and generate a response using Groq
     combined_input = " ".join(retrieved_chunks) + user_query
     response = groq_client.generate(model="llama3-8b-8192", prompt=combined_input, max_tokens=200)
+    # Display the generated response
     st.subheader("Generated Response")
     st.write(response["text"])