1MR committed on
Commit
4f754be
·
verified ·
1 Parent(s): 19b55fe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -6
app.py CHANGED
@@ -18,13 +18,25 @@ from langchain.embeddings import HuggingFaceEmbeddings
18
 
19
  # Retrieve the Hugging Face token from environment variables
20
  # token = os.getenv("HUGGINGFACEHUB_TOKEN")
 
 
21
  def get_pdf_text(pdf_docs):
22
- text = ""
23
- for pdf in pdf_docs:
24
- pdf_reader = PdfReader(pdf)
25
- for page in pdf_reader.pages:
26
- text += page.extract_text()
27
- return text
 
 
 
 
 
 
 
 
 
 
28
 
29
  def get_text_chunks(text):
30
  text_splitter=CharacterTextSplitter(
 
18
 
19
  # Retrieve the Hugging Face token from environment variables
20
  # token = os.getenv("HUGGINGFACEHUB_TOKEN")
21
+ import fitz # PyMuPDF
22
+
23
  def get_pdf_text(pdf_docs):
24
+ text = ""
25
+ for pdf in pdf_docs:
26
+ try:
27
+ doc = fitz.open(stream=pdf.read(), filetype="pdf")
28
+ for page in doc:
29
+ text += page.get_text()
30
+ except Exception as e:
31
+ st.error(f"Could not read the file: {pdf.name}. Error: {e}")
32
+ return text
33
+ # def get_pdf_text(pdf_docs):
34
+ # text = ""
35
+ # for pdf in pdf_docs:
36
+ # pdf_reader = PdfReader(pdf)
37
+ # for page in pdf_reader.pages:
38
+ # text += page.extract_text()
39
+ # return text
40
 
41
  def get_text_chunks(text):
42
  text_splitter=CharacterTextSplitter(