ReneeHWT committed on
Commit
4ada510
·
verified ·
1 Parent(s): 2628b24

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -9
app.py CHANGED
@@ -5,11 +5,6 @@ import io
5
  import pytesseract
6
 
7
  def extract_text(file):
8
- """
9
- Extracts text from a PDF file using OCR.
10
- - Uses PyMuPDF to render each page as an image.
11
- - Runs Tesseract OCR on each image.
12
- """
13
  if file is None:
14
  return ""
15
  pdf_bytes = file.read()
@@ -18,14 +13,14 @@ def extract_text(file):
18
  for page in doc:
19
  pix = page.get_pixmap(dpi=300)
20
  img = Image.open(io.BytesIO(pix.tobytes("png")))
21
- page_text = pytesseract.image_to_string(img, lang='eng+chi_tra')
22
- full_text += page_text + "\n\n"
23
  return full_text
24
 
25
  iface = gr.Interface(
26
  fn=extract_text,
27
- inputs=gr.inputs.File(label="Upload your PDF"),
28
- outputs=gr.outputs.Textbox(label="Extracted Text"),
 
29
  title="PDF Text Extractor with OCR",
30
  description="Upload a PDF and extract ALL text (including from images) using Tesseract OCR."
31
  )
 
5
  import pytesseract
6
 
7
  def extract_text(file):
 
 
 
 
 
8
  if file is None:
9
  return ""
10
  pdf_bytes = file.read()
 
13
  for page in doc:
14
  pix = page.get_pixmap(dpi=300)
15
  img = Image.open(io.BytesIO(pix.tobytes("png")))
16
+ full_text += pytesseract.image_to_string(img, lang='eng+chi_tra') + "\n\n"
 
17
  return full_text
18
 
19
  iface = gr.Interface(
20
  fn=extract_text,
21
+ # 這裡用新 API,直接用 gr.File gr.Textbox
22
+ inputs=gr.File(label="Upload your PDF"),
23
+ outputs=gr.Textbox(label="Extracted Text"),
24
  title="PDF Text Extractor with OCR",
25
  description="Upload a PDF and extract ALL text (including from images) using Tesseract OCR."
26
  )