import json

import fitz  # PyMuPDF
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load GPT-Neo model and tokenizer from Hugging Face
@st.cache_resource
def load_model():
    """Load the GPT-Neo 1.3B tokenizer and causal language model.

    The function is wrapped in ``st.cache_resource`` so the loaded objects
    can be reused instead of being rebuilt on every call.

    Returns:
        A ``(tokenizer, model)`` tuple, in that order.
    """
    checkpoint = "EleutherAI/gpt-neo-1.3B"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForCausalLM.from_pretrained(checkpoint),
    )

# Function to extract text from a PDF file
def extract_pdf_text(uploaded_file):
    """Extract text content from a PDF file using PyMuPDF.

    Args:
        uploaded_file: Binary file-like object holding the PDF bytes
            (for example the object returned by ``st.file_uploader``).

    Returns:
        The plain text of every page, in page order, with a newline
        appended after each page. An empty string if the document has
        no pages.

    Raises:
        Whatever ``fitz.open`` raises when the bytes cannot be opened as
        a PDF; the caller is expected to handle it.
    """
    # Prefer getvalue(): it returns the whole buffer regardless of the
    # current stream position, so a stream that was already read once
    # does not silently yield an empty document. Fall back to read() for
    # plain file objects that have no getvalue().
    if hasattr(uploaded_file, "getvalue"):
        pdf_bytes = uploaded_file.getvalue()
    else:
        pdf_bytes = uploaded_file.read()

    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
        # Iterate pages directly and join once instead of growing a
        # string with += on every page.
        return "".join(page.get_text("text") + "\n" for page in pdf)

# Helper function to generate response using GPT-Neo
def generate_response(prompt, tokenizer, model, max_new_tokens=500):
    """Generate a completion for ``prompt`` and return the decoded text.

    Args:
        prompt: Full prompt text to feed to the model.
        tokenizer: Callable tokenizer that returns keyword inputs for
            ``model.generate`` and provides ``decode``.
        model: Causal language model exposing ``generate``. If it has
            ``config.max_position_embeddings``, that value is used as the
            context window size.
        max_new_tokens: Maximum number of tokens to generate. Defaults to
            500, the value that was previously hard-coded.

    Returns:
        The decoded first output sequence with special tokens removed.
        Note that this is whatever ``model.generate`` returns, so for a
        causal LM it typically contains the prompt followed by the
        completion.
    """
    encode_kwargs = {"return_tensors": "pt"}

    # Prompt tokens plus generated tokens must fit into the model's
    # context window; an over-long PDF would otherwise make generation
    # fail. Reserve room for the completion and truncate the prompt to
    # the remaining budget. With the tokenizer's default settings the
    # truncation drops tokens from the end of the prompt.
    context_window = getattr(
        getattr(model, "config", None), "max_position_embeddings", None
    )
    if context_window is not None and context_window > max_new_tokens:
        encode_kwargs["truncation"] = True
        encode_kwargs["max_length"] = context_window - max_new_tokens

    inputs = tokenizer(prompt, **encode_kwargs)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Predefined customer instructions.
# Maps each customer name to the free-text extraction instructions for that
# customer's purchase-order layout. main() looks up the entry for the selected
# customer and interpolates it verbatim into the model prompt, so the text
# inside each triple-quoted string (including its indentation and surrounding
# newlines) is runtime data, not documentation.
instruction_sets = {
    "Toshiba": """
    Extract columns: Pos., Item Code, Unit, Delivery Date, Quantity, Basic Price, Discount, Cur., Amount, Sub Total.
    - Identify Item Code blocks starting with a numeric code (e.g., 155569003011).
    - Include all subsequent lines (e.g., descriptions, additional codes) until a new numeric block or section begins.
    - Maintain the exact line order and formatting, preserving sub-lines.
    """,
    "BHEL": """
    Extract columns: SI No, Material Description, Unit, Quantity, Dely Qty, Dely Date, Unit Rate, Value.
    - Include primary description (e.g., BPS 017507).
    - Add Material Number, HSN Code, GST percentage.
    """,
    "Federal Electric": """
    Extract columns: S. No, Material No, Material Description, Qty, Unit, Price, Delivery Date, Total Value, Vat%, Amount Incl.VAT.
    Ensure all relevant data fields are included and validated.
    """,
    "AL NISF": """
    Extract columns: Item, Description, Qty, Unit, Unit Price, Total Price.
    - Add a bold header 'DESCRIPTION'.
    - Include Computer Code Number, Product Name, Designation Number, Dimensions, Serial Number, and Manufacturing Year.
    """,
    # Generic entry for purchase orders that match none of the named customers.
    "Others": """
    Perform dynamic field mapping to extract all relevant data fields.
    - Ensure the fields are captured accurately.
    """
}

# Streamlit app
def main():
    """Render the Streamlit UI and run the PO parsing workflow.

    Steps, in order:
      1. Let the user pick a customer (one of the ``instruction_sets`` keys).
      2. Let the user upload a PDF and extract its text.
      3. Build a prompt from the customer, its instructions and the text.
      4. Load the model, generate a response and display it.
      5. Offer the result as a downloadable JSON document.

    Every failure is reported in the UI and ends the current run early;
    nothing is raised to the caller.
    """
    st.title("PMP Auto-PO Generator (Direct PDF Processing)")

    # Step 1: Welcome and Option Selection
    st.write("Welcome! Please select a PO file type and upload the corresponding PDF.")
    # Derive the choices from instruction_sets so the select box can never
    # offer a customer that has no instructions (dicts keep insertion order).
    options = list(instruction_sets)
    selected_option = st.selectbox("Select an option:", options)

    if not selected_option:
        st.warning("Please select an option to proceed.")
        return

    # Step 2: File Upload
    uploaded_file = st.file_uploader("Upload your PO file (PDF format only):", type=["pdf"])
    if not uploaded_file:
        st.warning("Please upload a PDF file to proceed.")
        return

    # Extract text from the uploaded PDF
    st.write("Extracting text from the uploaded PDF...")
    try:
        extracted_text = extract_pdf_text(uploaded_file)
    except Exception as e:  # UI boundary: show any parsing failure to the user
        st.error(f"Error extracting text from PDF: {e}")
        return

    # A PDF without a text layer (e.g. a scanned image) yields no text;
    # prompting the model with an empty document would only produce noise.
    if not extracted_text.strip():
        st.warning("No extractable text was found in the uploaded PDF.")
        return

    # Retrieve associated instructions
    instructions = instruction_sets[selected_option]

    # Combine all inputs for the model prompt
    prompt = f"""
    Parse the following Purchase Order (PO) based on the selected option and predefined instructions:

    Selected Option: {selected_option}

    Instructions:
    {instructions}

    PDF Content:
    {extracted_text}
    """

    # Load model and tokenizer
    st.write("Loading the model and generating response...")
    try:
        tokenizer, model = load_model()
    except Exception as e:  # UI boundary: e.g. download or memory failure
        st.error(f"Error loading model: {e}")
        return

    # Generate response; only the risky call lives inside the try block
    try:
        response = generate_response(prompt, tokenizer, model)
    except Exception as e:  # UI boundary: show any generation failure
        st.error(f"Error generating response: {e}")
        return

    st.success("Parsing successful! Here is the output:")
    st.text_area("Parsed Output", value=response, height=300)

    # Download parsed output as JSON. The model returns free text, so wrap
    # it in a JSON object; serving the raw text as application/json would
    # hand the user a file that JSON parsers reject.
    payload = json.dumps(
        {"customer": selected_option, "parsed_output": response},
        ensure_ascii=False,
        indent=2,
    )
    st.download_button(
        label="Download JSON",
        data=payload,
        file_name=f"{selected_option}_parsed_output.json",
        mime="application/json",
    )

# Script entry point: build the UI only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()