Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -16,11 +16,8 @@ def extract_text(file):
|
|
| 16 |
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 17 |
full_text = ""
|
| 18 |
for page in doc:
|
| 19 |
-
# 以 300 dpi 渲染
|
| 20 |
pix = page.get_pixmap(dpi=300)
|
| 21 |
-
|
| 22 |
-
img = Image.open(io.BytesIO(img_data))
|
| 23 |
-
# OCR,支援英文與繁體中文
|
| 24 |
page_text = pytesseract.image_to_string(img, lang='eng+chi_tra')
|
| 25 |
full_text += page_text + "\n\n"
|
| 26 |
return full_text
|
|
|
|
| 16 |
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 17 |
full_text = ""
|
| 18 |
for page in doc:
|
|
|
|
| 19 |
pix = page.get_pixmap(dpi=300)
|
| 20 |
+
img = Image.open(io.BytesIO(pix.tobytes("png")))
|
|
|
|
|
|
|
| 21 |
page_text = pytesseract.image_to_string(img, lang='eng+chi_tra')
|
| 22 |
full_text += page_text + "\n\n"
|
| 23 |
return full_text
|