Spaces:

jithenderchoudary
/

poext1

Sleeping

poext1 / app.py

Update app.py

c674136 verified over 1 year ago

1.21 kB

	import fitz # PyMuPDF
	import pandas as pd
	import gradio as gr
	import tempfile
	import re

	def extract_po_text(pdf_file):
	    """Return the plain text of every page of a PDF, labelled by page number.

	    Args:
	        pdf_file: The uploaded PDF, given either as the file's path (``str``)
	            or as an object exposing that path through a ``name`` attribute.
	            ``None`` is treated as "nothing uploaded".

	    Returns:
	        A single string in which each page contributes a ``"Page N:"`` line
	        followed by that page's text, with pages separated by a blank line.
	        An empty string when ``pdf_file`` is ``None``.
	    """
	    # Nothing uploaded: report "no text" instead of failing on ``None.name``.
	    if pdf_file is None:
	        return ""

	    # NOTE(review): depending on the Gradio version/config the File component
	    # may pass a plain path string rather than a wrapper with ``.name`` -
	    # accept both so the callback does not break on either form.
	    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

	    # The context manager closes the document even if text extraction fails.
	    with fitz.open(pdf_path) as pdf:
	        text_data = [
	            f"Page {page_num}:\n{page.get_text('text')}\n"
	            for page_num, page in enumerate(pdf, start=1)
	        ]

	    # Combine all page texts into one string for inspection.
	    return "\n".join(text_data)

	def main(pdf_file):
	    """Extract an uploaded PDF's raw text and return it for display.

	    Args:
	        pdf_file: The upload, passed unchanged to ``extract_po_text``.

	    Returns:
	        A two-item tuple: ``None`` for the file slot (no file is produced
	        here) and the extracted text prefixed with a fixed heading.
	    """
	    raw_text = extract_po_text(pdf_file)
	    heading = "Raw text extracted from PDF:\n\n"
	    # No downloadable file is generated; only the debugging text is returned.
	    no_file = None
	    return no_file, heading + raw_text

	# Gradio UI: takes one PDF upload and shows two outputs - a file slot
	# (labelled as not working yet) and a textbox with the raw extracted text.
	interface = gr.Interface(
	    fn=main,
	    inputs=gr.File(label="Upload PO PDF"),
	    outputs=[
	        gr.File(label="Download Excel File (will not work for now)"),
	        gr.Textbox(label="Raw Text from PDF"),
	    ],
	    title="PDF Text Extractor",
	    description="Upload a PDF file to view its raw text content for troubleshooting extraction issues.",
	)

	# Launch the web app only when this file is executed as a script.
	if __name__ == "__main__":
	    interface.launch()