ArthurLin committed on
Commit
d4ce689
·
1 Parent(s): b2ec1d4

CHANGE: replace pymupdf with pdfplumber

Browse files
Files changed (2) hide show
  1. data.py +5 -5
  2. requirement.txt +1 -1
data.py CHANGED
@@ -1,14 +1,14 @@
1
- import fitz
2
  import os
 
3
 
4
  def load_text(path):
5
  file_extension = os.path.splitext(path)[1].lower()
6
-
7
  if file_extension == ".pdf":
8
- doc = fitz.open(path)
9
  texts = []
10
- for page in doc:
11
- texts.append(page.get_text())
 
12
  return "\n".join(texts)
13
 
14
  elif file_extension == ".txt":
 
 
1
  import os
2
+ import pdfplumber
3
 
4
  def load_text(path):
5
  file_extension = os.path.splitext(path)[1].lower()
6
+
7
  if file_extension == ".pdf":
 
8
  texts = []
9
+ with pdfplumber.open(path) as pdf:
10
+ for page in pdf.pages:
11
+ texts.append(page.extract_text() or "")
12
  return "\n".join(texts)
13
 
14
  elif file_extension == ".txt":
requirement.txt CHANGED
@@ -5,7 +5,7 @@ torch==1.13.1
5
  torchaudio==0.13.1
6
  torchvision==0.14.1
7
  faiss-cpu==1.9.0
8
- pymupdf
9
 
10
  # Web UI
11
  gradio==3.50.2
 
5
  torchaudio==0.13.1
6
  torchvision==0.14.1
7
  faiss-cpu==1.9.0
8
+ pdfplumber
9
 
10
  # Web UI
11
  gradio==3.50.2