ReneeHWT committed on
Commit
f5b1291
·
verified ·
1 Parent(s): 93ff969

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -0
app.py CHANGED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from pdf2image import convert_from_bytes
3
+ import pytesseract
4
+
5
def _read_pdf_bytes(file):
    """Return the raw bytes of an uploaded PDF, whatever form it arrives in.

    Accepts raw bytes, a filesystem path (``str`` or path-like), or a
    file-like object exposing ``read()``.
    """
    if isinstance(file, (bytes, bytearray, memoryview)):
        return bytes(file)
    if isinstance(file, str) or hasattr(file, "__fspath__"):
        with open(file, "rb") as handle:
            return handle.read()
    data = file.read()
    if not data:
        # NOTE(review): some upload wrappers appear to hand over a handle
        # that yields nothing on read() while still exposing the temp-file
        # path via ``.name`` -- confirm against the Gradio version in use.
        path = getattr(file, "name", None)
        if isinstance(path, str):
            try:
                with open(path, "rb") as handle:
                    return handle.read()
            except OSError:
                # Best-effort fallback only; keep the original read() result.
                pass
    return data


def extract_text(file):
    """Extract text from a PDF file using OCR.

    Every page of the PDF is converted to an image, then Tesseract OCR is
    run on each image with the English and Traditional Chinese language
    data (``eng+chi_tra``).

    Args:
        file: The uploaded PDF. May be ``None`` (nothing uploaded), raw
            bytes, a filesystem path, or a file-like object with ``read()``.

    Returns:
        The recognized text of all pages in order, each page followed by a
        blank line. Returns ``""`` when ``file`` is ``None``.
    """
    if file is None:
        return ""
    # Read the raw PDF bytes.
    pdf_bytes = _read_pdf_bytes(file)
    # Render each page to an image (dpi=300 for better OCR accuracy).
    images = convert_from_bytes(pdf_bytes, dpi=300)
    # OCR every page and join the results in one pass.
    return "".join(
        pytesseract.image_to_string(img, lang='eng+chi_tra') + "\n\n"
        for img in images
    )
22
+
23
# Build the Gradio interface.
# Components come from the top-level namespace (gr.File / gr.Textbox)
# rather than the legacy gr.inputs / gr.outputs modules.
iface = gr.Interface(
    fn=extract_text,
    inputs=gr.File(label="Upload your PDF"),
    outputs=gr.Textbox(label="Extracted Text"),
    title="PDF Text Extractor with OCR",
    description="Upload a PDF file and extract all text (including from images) using Tesseract OCR.",
)

if __name__ == "__main__":
    # To generate a public link, change this to iface.launch(share=True).
    iface.launch()