Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| import pandas as pd | |
| import gradio as gr | |
| import tempfile | |
| import re | |
def extract_po_text(pdf_file):
    """Return the text of every page of a PDF, labelled with page numbers.

    Args:
        pdf_file: Either a filesystem path given as a ``str``, or an object
            whose ``.name`` attribute holds the path to the PDF (such as an
            uploaded temporary-file wrapper).

    Returns:
        One string containing a section per page. Each section starts with a
        ``Page <n>:`` header line (1-based) followed by that page's plain
        text; sections are joined with newlines.

    Raises:
        ValueError: If ``pdf_file`` is ``None``, i.e. no file was supplied.
    """
    # Fail with a clear message instead of an AttributeError on ``None.name``.
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # A plain string is already a path; anything else must expose it via .name.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as pdf:
        text_data = [
            f"Page {page_num}:\n{page.get_text('text')}\n"
            for page_num, page in enumerate(pdf, start=1)
        ]

    # Combine all page texts into one string for inspection.
    return "\n".join(text_data)
def main(pdf_file):
    """Gradio callback that shows the raw text of the uploaded PDF.

    Args:
        pdf_file: The value delivered by the file-upload input. It is
            ``None`` when the form is submitted without a file.

    Returns:
        A ``(file, text)`` tuple for the interface's two outputs. The file
        slot is always ``None`` (no Excel export is produced here); the text
        slot holds the extracted text, or a prompt to upload a file when
        nothing was provided.
    """
    # Guard clause: report a missing upload in the UI rather than crashing.
    if pdf_file is None:
        return None, "No PDF uploaded. Please upload a PDF file and try again."

    # Extract and display raw text for debugging purposes.
    extracted_text = extract_po_text(pdf_file)
    return None, "Raw text extracted from PDF:\n\n" + extracted_text
# Components for the debugging UI: a single PDF upload as input, and a
# (currently unused) file slot plus a text box as outputs.
_pdf_upload = gr.File(label="Upload PO PDF")
_result_components = [
    gr.File(label="Download Excel File (will not work for now)"),
    gr.Textbox(label="Raw Text from PDF"),
]

# Wire the components to ``main`` so the raw extracted text is displayed.
interface = gr.Interface(
    fn=main,
    inputs=_pdf_upload,
    outputs=_result_components,
    title="PDF Text Extractor",
    description="Upload a PDF file to view its raw text content for troubleshooting extraction issues.",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()