Spaces:

shamilcoded
/

RagBaseApp

Build error

App Files Files Community

SHAMIL SHAHBAZ AWAN committed on Dec 25, 2024

Commit

69d986f

verified ·

1 Parent(s): 57c6937

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -23

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import os
 import streamlit as st
-from PyPDF2 import PdfReader
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
 import faiss
@@ -20,10 +20,6 @@ embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
 # Paths
 file_path = "RagBaseApp/Atomic habits ( PDFDrive ).pdf"
-with pdfplumber.open(file_path) as pdf:
-    for page in pdf.pages:
-        print(page.extract_text())
 VECTORSTORE_FOLDER = "vectorstore"
 # Initialize FAISS vector store
@@ -36,35 +32,31 @@ if os.path.exists(vectorstore_path):
 else:
     index = faiss.IndexFlatL2(embedder.get_sentence_embedding_dimension())
-# Load and process documents
-def load_documents(folder):
-    documents = []
-    for filename in os.listdir(folder):
-        if filename.endswith(".pdf"):
-            pdf_reader = PdfReader(os.path.join(folder, filename))
-            text = ""
-            for page in pdf_reader.pages:
-                text += page.extract_text()
-            documents.append(text)
-    return documents
 def chunk_text(text, chunk_size=500, overlap=100):
     """Split text into overlapping, fixed-size character chunks.

     Consecutive chunks start ``chunk_size - overlap`` characters apart, so
     each chunk repeats the last ``overlap`` characters of the previous one.
     The final chunk(s) may be shorter than ``chunk_size``.

     Args:
         text: The string to split.
         chunk_size: Maximum number of characters per chunk; must be positive.
         overlap: Number of characters shared by adjacent chunks; must be
             smaller than ``chunk_size``.

     Returns:
         A list of substrings of ``text``; empty when ``text`` is empty.

     Raises:
         ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
             smaller than ``chunk_size``.
     """
     if chunk_size <= 0:
         raise ValueError("chunk_size must be a positive integer")
     step = chunk_size - overlap
     # A zero step makes range() fail with an unrelated message, and a
     # negative step silently produces no chunks at all, so reject both.
     if step <= 0:
         raise ValueError("overlap must be smaller than chunk_size")
     return [text[start:start + chunk_size] for start in range(0, len(text), step)]
-if st.button("Process Documents"):
-    st.info("Processing documents...")
-    all_text = load_documents(DOCUMENTS_FOLDER)
-    chunks = []
-    for text in all_text:
-        chunks.extend(chunk_text(text))
     embeddings = embedder.encode(chunks, show_progress_bar=True)
     index.add(np.array(embeddings))
     faiss.write_index(index, vectorstore_path)
-    st.success("Documents processed and vectorstore updated!")
 # User interface
 st.title("Atomic Habits RAG Application")

 import os
 import streamlit as st
+import pdfplumber
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
 import faiss
 # Paths
 file_path = "RagBaseApp/Atomic habits ( PDFDrive ).pdf"
 VECTORSTORE_FOLDER = "vectorstore"
 # Initialize FAISS vector store
 else:
     index = faiss.IndexFlatL2(embedder.get_sentence_embedding_dimension())
+# Load and process the PDF file
def load_pdf_text(file_path):
    """Extract the text of every page of a PDF file as one string.

    Args:
        file_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The text of all pages concatenated in page order, with no separator
        added between pages.
    """
    with pdfplumber.open(file_path) as pdf:
        # NOTE(review): extract_text() can return None for a page with no
        # extractable text (e.g. scanned images) in some pdfplumber
        # versions; substituting "" avoids a TypeError on concatenation.
        return "".join(page.extract_text() or "" for page in pdf.pages)
 def chunk_text(text, chunk_size=500, overlap=100):
     """Split text into overlapping chunks.

     Consecutive chunks start ``chunk_size - overlap`` characters apart, so
     each chunk repeats the last ``overlap`` characters of the previous one.
     The final chunk(s) may be shorter than ``chunk_size``.

     Args:
         text: The string to split.
         chunk_size: Maximum number of characters per chunk; must be positive.
         overlap: Number of characters shared by adjacent chunks; must be
             smaller than ``chunk_size``.

     Returns:
         A list of substrings of ``text``; empty when ``text`` is empty.

     Raises:
         ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
             smaller than ``chunk_size``.
     """
     if chunk_size <= 0:
         raise ValueError("chunk_size must be a positive integer")
     step = chunk_size - overlap
     # A zero step makes range() fail with an unrelated message, and a
     # negative step silently produces no chunks at all, so reject both.
     if step <= 0:
         raise ValueError("overlap must be smaller than chunk_size")
     return [text[start:start + chunk_size] for start in range(0, len(text), step)]
+if st.button("Process PDF"):
+    st.info("Processing PDF document...")
+    text = load_pdf_text(file_path)
+    chunks = chunk_text(text)
     embeddings = embedder.encode(chunks, show_progress_bar=True)
     index.add(np.array(embeddings))
     faiss.write_index(index, vectorstore_path)
+    st.success("PDF processed and vectorstore updated!")
 # User interface
 st.title("Atomic Habits RAG Application")