"""Gradio debugging tool: upload a PO PDF and view its raw extracted text."""

import os
import re
import tempfile

import fitz  # PyMuPDF
import gradio as gr
import pandas as pd

# NOTE(review): pandas, tempfile and re are not used by the code below;
# they are kept in case other parts of the project rely on them.


def _resolve_pdf_path(pdf_file):
    """Return the filesystem path for an uploaded-file value.

    Accepts a plain path (``str`` or ``os.PathLike``) as well as any object
    exposing the path through a ``.name`` attribute (e.g. a temp-file
    wrapper).
    """
    if isinstance(pdf_file, os.PathLike):
        # Path-like objects also have ``.name``, but there it is only the
        # final path component, so convert explicitly.
        return os.fspath(pdf_file)
    if isinstance(pdf_file, str):
        # A str subclass may still carry the real path in ``.name``;
        # a plain str is the path itself.
        return getattr(pdf_file, "name", pdf_file)
    return pdf_file.name


def extract_po_text(pdf_file):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: Path to the PDF (``str`` / ``os.PathLike``) or an object
            whose ``.name`` attribute holds that path.

    Returns:
        A single string in which each page is rendered as
        ``"Page <n>:\\n<text>\\n"`` (1-based page numbers) and the pages
        are joined with a newline.

    Raises:
        AttributeError: If ``pdf_file`` is neither a path nor an object
            with a ``.name`` attribute (for example ``None``).
    """
    pdf_path = _resolve_pdf_path(pdf_file)

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as pdf:
        text_data = [
            f"Page {page_num}:\n{page.get_text('text')}\n"
            for page_num, page in enumerate(pdf, start=1)
        ]

    # Combine all page texts into one string for inspection.
    return "\n".join(text_data)


def main(pdf_file):
    """Gradio callback: return the raw text of the uploaded PDF.

    Args:
        pdf_file: The value supplied by the ``gr.File`` input, or ``None``
            when nothing was uploaded.

    Returns:
        A ``(file, text)`` tuple matching the interface outputs. The file
        slot is always ``None`` because Excel export is not implemented
        yet; the text slot holds the extracted text or a short notice when
        no file was provided.
    """
    if pdf_file is None:
        # Submitting without an upload should produce a readable message
        # rather than an AttributeError traceback.
        return None, "No PDF file was uploaded."

    extracted_text = extract_po_text(pdf_file)
    return None, "Raw text extracted from PDF:\n\n" + extracted_text


# Gradio interface to display raw text output.
interface = gr.Interface(
    fn=main,
    inputs=gr.File(label="Upload PO PDF"),
    outputs=[
        gr.File(label="Download Excel File (will not work for now)"),
        gr.Textbox(label="Raw Text from PDF"),
    ],
    title="PDF Text Extractor",
    description="Upload a PDF file to view its raw text content for troubleshooting extraction issues.",
)

if __name__ == "__main__":
    interface.launch()