# poext1 / app.py
# jithenderchoudary's picture
# Update app.py
# c674136 verified
import fitz # PyMuPDF
import pandas as pd
import gradio as gr
import tempfile
import re
def extract_po_text(pdf_file):
    """Return the plain text of every page of a PDF as a single string.

    Each page's text is preceded by a 1-based ``Page N:`` header so the
    combined output can still be inspected page by page.

    Args:
        pdf_file: Either a filesystem path given as a ``str``, or an object
            whose ``name`` attribute holds the path to the PDF.

    Returns:
        The per-page sections joined with newlines; an empty string if the
        document yields no pages.
    """
    # Accept a bare path string as well as an upload-style object that
    # carries the path in `.name`, so the function also works outside the UI.
    pdf_path = str(pdf_file) if isinstance(pdf_file, str) else pdf_file.name

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as pdf:
        page_sections = [
            f"Page {page_number}:\n{page.get_text('text')}\n"
            for page_number, page in enumerate(pdf, start=1)
        ]

    # Combine all page sections into one string for inspection.
    return "\n".join(page_sections)
def main(pdf_file):
    """Extract the raw text of an uploaded PDF for debugging purposes.

    Args:
        pdf_file: The uploaded PDF as accepted by ``extract_po_text``, or
            ``None`` when nothing was uploaded.

    Returns:
        A ``(file, text)`` pair. The first element is always ``None`` (no
        file is produced); the second is the text to display: either the
        extracted PDF text or a notice that no file was provided.
    """
    # Guard against a submit without an upload; otherwise the extractor
    # would fail with an AttributeError on the missing file object.
    if pdf_file is None:
        return None, "No PDF file was uploaded. Please upload a PDF and try again."

    extracted_text = extract_po_text(pdf_file)
    return None, "Raw text extracted from PDF:\n\n" + extracted_text
# Gradio UI: one PDF upload as input; a file slot and a textbox showing the
# raw extracted text as outputs.
pdf_upload = gr.File(label="Upload PO PDF")
excel_download = gr.File(label="Download Excel File (will not work for now)")
raw_text_box = gr.Textbox(label="Raw Text from PDF")

interface = gr.Interface(
    fn=main,
    inputs=pdf_upload,
    outputs=[excel_download, raw_text_box],
    title="PDF Text Extractor",
    description="Upload a PDF file to view its raw text content for troubleshooting extraction issues.",
)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    interface.launch()