Spaces:

ReneeHWT
/

PDF_OCR_Extract

Sleeping

PDF_OCR_Extract / app.py

Update app.py

df0821f verified 7 months ago

831 Bytes

	import os
	import tempfile

	import fitz # PyMuPDF
	import gradio as gr

	def extract_text(pdf_file):
	    """Extract the ASCII-only text of a PDF into a downloadable ``output.txt``.

	    Args:
	        pdf_file: The uploaded PDF. Either a filesystem path (``str`` or
	            ``os.PathLike``) or an object whose ``name`` attribute holds the
	            path. ``None`` means nothing was uploaded.

	    Returns:
	        The path (``str``) of the generated text file, or ``None`` when no
	        file was provided.
	    """
	    # Nothing uploaded: skip extraction instead of failing on ``None.name``.
	    if pdf_file is None:
	        return None

	    # Accept a plain path as well as an upload object exposing ``.name``.
	    if isinstance(pdf_file, (str, os.PathLike)):
	        pdf_path = os.fspath(pdf_file)
	    else:
	        pdf_path = pdf_file.name

	    # Plain-text extraction of every page. The ``with`` block closes the
	    # document once reading is done, even if a page raises.
	    with fitz.open(pdf_path) as doc:
	        full_text = "".join(page.get_text() for page in doc)

	    # Drop every non-ASCII character (keeps English letters, digits and
	    # ASCII punctuation/whitespace only).
	    filtered = full_text.encode("ascii", errors="ignore").decode("ascii")

	    # Write into a fresh temporary directory so overlapping requests cannot
	    # overwrite each other's result; the file name stays "output.txt".
	    # NOTE: the temporary directory is not removed here.
	    out_path = os.path.join(tempfile.mkdtemp(), "output.txt")
	    with open(out_path, "w", encoding="utf-8") as f:
	        f.write(filtered)

	    return out_path

	# Build the Gradio interface: one PDF upload in, one text-file download out.
	pdf_upload = gr.File(label="Upload PDF (.pdf)")
	txt_download = gr.File(label="Download TXT")
	demo = gr.Interface(
	    fn=extract_text,
	    inputs=pdf_upload,
	    outputs=txt_download,
	    title="PDF → TXT (English only)",
	    description="Extract English text from PDF (純文字抽取) and download as .txt",
	)

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()