ReneeHWT committed on
Commit
530ec5f
·
verified ·
1 Parent(s): f5b1291

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -13
app.py CHANGED
@@ -1,34 +1,39 @@
 
 
1
  import gradio as gr
2
- from pdf2image import convert_from_bytes
 
 
3
  import pytesseract
4
 
5
  def extract_text(file):
6
  """
7
  Extracts text from a PDF file using OCR.
8
- - Converts PDF pages to images.
9
  - Runs Tesseract OCR on each image.
10
  """
11
  if file is None:
12
  return ""
13
- # 讀取 PDF 原始位元組
14
  pdf_bytes = file.read()
15
- # 每頁轉成影像 (dpi=300 for better OCR accuracy)
16
- images = convert_from_bytes(pdf_bytes, dpi=300)
17
- # Tesseract OCR 擷取文字並累積
18
- text = ""
19
- for img in images:
20
- text += pytesseract.image_to_string(img, lang='eng+chi_tra') + "\n\n"
21
- return text
 
 
 
 
22
 
23
- # 建立 Gradio 介面
24
  iface = gr.Interface(
25
  fn=extract_text,
26
  inputs=gr.inputs.File(label="Upload your PDF"),
27
  outputs=gr.outputs.Textbox(label="Extracted Text"),
28
  title="PDF Text Extractor with OCR",
29
- description="Upload a PDF file and extract all text (including from images) using Tesseract OCR."
30
  )
31
 
32
  if __name__ == "__main__":
33
- # 若要產生公開連結,可改成 iface.launch(share=True)
34
  iface.launch()
 
1
+ # Setup (run in a shell, not as Python code): pip install gradio pymupdf pillow pytesseract
2
+
3
  import gradio as gr
4
+ import fitz # PyMuPDF
5
+ from PIL import Image
6
+ import io
7
  import pytesseract
8
 
9
  def extract_text(file):
10
  """
11
  Extracts text from a PDF file using OCR.
12
+ - Uses PyMuPDF to render each page as an image.
13
  - Runs Tesseract OCR on each image.
14
  """
15
  if file is None:
16
  return ""
 
17
  pdf_bytes = file.read()
18
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
19
+ full_text = ""
20
+ for page in doc:
21
+ # 300 dpi 渲染
22
+ pix = page.get_pixmap(dpi=300)
23
+ img_data = pix.tobytes("png")
24
+ img = Image.open(io.BytesIO(img_data))
25
+ # OCR,支援英文與繁體中文
26
+ page_text = pytesseract.image_to_string(img, lang='eng+chi_tra')
27
+ full_text += page_text + "\n\n"
28
+ return full_text
29
 
 
30
  iface = gr.Interface(
31
  fn=extract_text,
32
  inputs=gr.inputs.File(label="Upload your PDF"),
33
  outputs=gr.outputs.Textbox(label="Extracted Text"),
34
  title="PDF Text Extractor with OCR",
35
+ description="Upload a PDF and extract ALL text (including from images) using Tesseract OCR."
36
  )
37
 
38
  if __name__ == "__main__":
 
39
  iface.launch()