SHAMIL SHAHBAZ AWAN committed on
Commit
69d986f
·
verified ·
1 Parent(s): 57c6937

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -23
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  import streamlit as st
3
- from PyPDF2 import PdfReader
4
  from sentence_transformers import SentenceTransformer
5
  from transformers import pipeline
6
  import faiss
@@ -20,10 +20,6 @@ embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
20
 
21
  # Paths
22
  file_path = "RagBaseApp/Atomic habits ( PDFDrive ).pdf"
23
-
24
- with pdfplumber.open(file_path) as pdf:
25
- for page in pdf.pages:
26
- print(page.extract_text())
27
  VECTORSTORE_FOLDER = "vectorstore"
28
 
29
  # Initialize FAISS vector store
@@ -36,35 +32,31 @@ if os.path.exists(vectorstore_path):
36
  else:
37
  index = faiss.IndexFlatL2(embedder.get_sentence_embedding_dimension())
38
 
39
- # Load and process documents
40
- def load_documents(folder):
41
- documents = []
42
- for filename in os.listdir(folder):
43
- if filename.endswith(".pdf"):
44
- pdf_reader = PdfReader(os.path.join(folder, filename))
45
- text = ""
46
- for page in pdf_reader.pages:
47
- text += page.extract_text()
48
- documents.append(text)
49
- return documents
50
 
51
  def chunk_text(text, chunk_size=500, overlap=100):
 
52
  chunks = []
53
  for i in range(0, len(text), chunk_size - overlap):
54
  chunks.append(text[i:i + chunk_size])
55
  return chunks
56
 
57
- if st.button("Process Documents"):
58
- st.info("Processing documents...")
59
- all_text = load_documents(DOCUMENTS_FOLDER)
60
- chunks = []
61
- for text in all_text:
62
- chunks.extend(chunk_text(text))
63
 
64
  embeddings = embedder.encode(chunks, show_progress_bar=True)
65
  index.add(np.array(embeddings))
66
  faiss.write_index(index, vectorstore_path)
67
- st.success("Documents processed and vectorstore updated!")
68
 
69
  # User interface
70
  st.title("Atomic Habits RAG Application")
 
1
  import os
2
  import streamlit as st
3
+ import pdfplumber
4
  from sentence_transformers import SentenceTransformer
5
  from transformers import pipeline
6
  import faiss
 
20
 
21
  # Paths
22
  file_path = "RagBaseApp/Atomic habits ( PDFDrive ).pdf"
 
 
 
 
23
  VECTORSTORE_FOLDER = "vectorstore"
24
 
25
  # Initialize FAISS vector store
 
32
  else:
33
  index = faiss.IndexFlatL2(embedder.get_sentence_embedding_dimension())
34
 
35
+ # Load and process the PDF file
36
def load_pdf_text(file_path):
    """Extract the text of every page of a PDF file as a single string.

    Args:
        file_path: Path of the PDF file; passed straight to
            ``pdfplumber.open``.

    Returns:
        The text of all pages in page order, separated by newlines.
        Pages that yield no text are skipped.
    """
    with pdfplumber.open(file_path) as pdf:
        # extract_text() can yield None (or "") for a page without a text
        # layer, depending on the pdfplumber version; `or ""` keeps such a
        # page from raising TypeError during concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with a newline so the last word of one page is not fused with
    # the first word of the next page.
    return "\n".join(page_text for page_text in page_texts if page_text)
 
 
 
43
 
44
  def chunk_text(text, chunk_size=500, overlap=100):
45
+ """Split text into overlapping chunks."""
46
  chunks = []
47
  for i in range(0, len(text), chunk_size - overlap):
48
  chunks.append(text[i:i + chunk_size])
49
  return chunks
50
 
51
+ if st.button("Process PDF"):
52
+ st.info("Processing PDF document...")
53
+ text = load_pdf_text(file_path)
54
+ chunks = chunk_text(text)
 
 
55
 
56
  embeddings = embedder.encode(chunks, show_progress_bar=True)
57
  index.add(np.array(embeddings))
58
  faiss.write_index(index, vectorstore_path)
59
+ st.success("PDF processed and vectorstore updated!")
60
 
61
  # User interface
62
  st.title("Atomic Habits RAG Application")