import fitz  # PyMuPDF
import pandas as pd
import gradio as gr
import tempfile
import re

def extract_po_text(pdf_file):
    """Return the plain text of every page of a PDF as one labelled string.

    Args:
        pdf_file: Either a filesystem path (``str`` or ``os.PathLike``) or an
            object whose ``name`` attribute holds the path of the PDF (the
            form this function has always accepted).

    Returns:
        The text of all pages in document order. Each page contributes a
        ``Page N:`` header line (N is 1-based) followed by that page's text
        and a trailing newline; page sections are joined with a newline.
        A document without pages yields an empty string.

    Raises:
        ValueError: If ``pdf_file`` is ``None``.
    """
    # Local import so this function does not depend on the file's import block.
    import os

    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # Check for real paths first: pathlib.Path also has a ``name`` attribute,
    # but it holds only the final path component, not the full path.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as pdf:
        text_data = [
            f"Page {page_num}:\n{page.get_text('text')}\n"
            for page_num, page in enumerate(pdf, start=1)
        ]

    # Combine all page texts into one string for inspection.
    return "\n".join(text_data)

def main(pdf_file):
    """Callback for the UI: extract the raw text of the uploaded PDF.

    Args:
        pdf_file: The uploaded PDF as handed over by the file input, or
            ``None`` when no file is available (e.g. the form was submitted
            without an upload).

    Returns:
        A 2-tuple ``(None, message)``. The first element feeds the file
        download output and is always ``None`` because no Excel file is
        produced yet; the second is the text shown in the textbox.
    """
    # Without this guard a missing upload fails with an opaque AttributeError
    # deep inside the extraction helper instead of a readable message.
    if pdf_file is None:
        return None, "No PDF file was uploaded. Please upload a PDF and try again."

    # Extract and display raw text for debugging purposes.
    extracted_text = extract_po_text(pdf_file)
    return None, "Raw text extracted from PDF:\n\n" + extracted_text

# Build the Gradio UI: a single PDF upload as input; a file download slot
# (main() currently always returns None for it) and a textbox with the
# extracted raw text as outputs.
interface = gr.Interface(
    fn=main,
    inputs=gr.File(label="Upload PO PDF"),
    outputs=[
        gr.File(label="Download Excel File (will not work for now)"),
        gr.Textbox(label="Raw Text from PDF"),
    ],
    title="PDF Text Extractor",
    description=(
        "Upload a PDF file to view its raw text content "
        "for troubleshooting extraction issues."
    ),
)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    interface.launch()