ArthurLin committed on
Commit
bf38aae
·
1 Parent(s): d4ce689

CHANGE: requirement.txt to requirements.txt

Browse files
Files changed (2) hide show
  1. data.py +5 -6
  2. requirement.txt → requirements.txt +1 -1
data.py CHANGED
@@ -1,16 +1,15 @@
1
  import os
2
- import pdfplumber
3
 
4
  def load_text(path):
5
  file_extension = os.path.splitext(path)[1].lower()
6
 
7
  if file_extension == ".pdf":
 
8
  texts = []
9
- with pdfplumber.open(path) as pdf:
10
- for page in pdf.pages:
11
- texts.append(page.extract_text() or "")
12
- return "\n".join(texts)
13
-
14
  elif file_extension == ".txt":
15
  with open(path, "r", encoding="utf-8") as file:
16
  return file.read()
 
1
  import os
2
+ import fitz # PyMuPDF
3
 
4
  def load_text(path):
5
  file_extension = os.path.splitext(path)[1].lower()
6
 
7
  if file_extension == ".pdf":
8
+ doc = fitz.open(path)
9
  texts = []
10
+ for page in doc:
11
+ texts.append(page.get_text())
12
+ return "\n".join(texts)
 
 
13
  elif file_extension == ".txt":
14
  with open(path, "r", encoding="utf-8") as file:
15
  return file.read()
requirement.txt → requirements.txt RENAMED
@@ -5,7 +5,6 @@ torch==1.13.1
5
  torchaudio==0.13.1
6
  torchvision==0.14.1
7
  faiss-cpu==1.9.0
8
- pdfplumber
9
 
10
  # Web UI
11
  gradio==3.50.2
@@ -20,6 +19,7 @@ scipy==1.15.3
20
  PyYAML==6.0.2
21
  pdfplumber
22
  pydub==0.25.1
 
23
 
24
  # Visual / Debugging
25
  matplotlib==3.10.5
 
5
  torchaudio==0.13.1
6
  torchvision==0.14.1
7
  faiss-cpu==1.9.0
 
8
 
9
  # Web UI
10
  gradio==3.50.2
 
19
  PyYAML==6.0.2
20
  pdfplumber
21
  pydub==0.25.1
22
+ PyMuPDF==1.23.9
23
 
24
  # Visual / Debugging
25
  matplotlib==3.10.5