import streamlit as st
import fitz # PyMuPDF
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load GPT-Neo model and tokenizer from Hugging Face
@st.cache_resource
def load_model():
    """Load and cache the GPT-Neo 1.3B tokenizer and model.

    Cached via st.cache_resource so the weights are fetched and
    instantiated only once per Streamlit server process.
    """
    checkpoint = "EleutherAI/gpt-neo-1.3B"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForCausalLM.from_pretrained(checkpoint),
    )
# Function to extract text from a PDF file
def extract_pdf_text(uploaded_file):
    """Return the plain text of every page of *uploaded_file* (a PDF).

    Pages are concatenated in order, each followed by a newline,
    exactly as PyMuPDF extracts them.
    """
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as pdf:
        # Iterating the document yields pages in index order.
        return "".join(page.get_text("text") + "\n" for page in pdf)
# Helper function to generate response using GPT-Neo
def generate_response(prompt, tokenizer, model):
    """Run *prompt* through the causal LM and return the decoded text.

    Args:
        prompt: Full text prompt (instructions + extracted PDF content).
        tokenizer: The tokenizer paired with *model*.
        model: A causal language model supporting ``generate``.

    Returns:
        The decoded generation (which echoes the prompt followed by the
        model's continuation), with special tokens stripped.

    Fixes vs. the original:
    - The prompt is truncated so that prompt + up to 500 new tokens fit
      the model's context window (2048 for GPT-Neo); previously a long
      PDF made ``generate`` fail with an index/size error.
    - ``pad_token_id`` is passed explicitly because GPT-Neo defines no
      pad token, which otherwise triggers a warning or error during
      generation.
    """
    # Leave headroom for the 500 generated tokens; guard against
    # tokenizers that report a huge sentinel model_max_length.
    limit = max(32, min(tokenizer.model_max_length, 2048) - 500)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=limit,
    )
    outputs = model.generate(
        **inputs,
        max_new_tokens=500,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Predefined customer instructions
instruction_sets = {
"Toshiba": """
Extract columns: Pos., Item Code, Unit, Delivery Date, Quantity, Basic Price, Discount, Cur., Amount, Sub Total.
- Identify Item Code blocks starting with a numeric code (e.g., 155569003011).
- Include all subsequent lines (e.g., descriptions, additional codes) until a new numeric block or section begins.
- Maintain the exact line order and formatting, preserving sub-lines.
""",
"BHEL": """
Extract columns: SI No, Material Description, Unit, Quantity, Dely Qty, Dely Date, Unit Rate, Value.
- Include primary description (e.g., BPS 017507).
- Add Material Number, HSN Code, GST percentage.
""",
"Federal Electric": """
Extract columns: S. No, Material No, Material Description, Qty, Unit, Price, Delivery Date, Total Value, Vat%, Amount Incl.VAT.
Ensure all relevant data fields are included and validated.
""",
"AL NISF": """
Extract columns: Item, Description, Qty, Unit, Unit Price, Total Price.
- Add a bold header 'DESCRIPTION'.
- Include Computer Code Number, Product Name, Designation Number, Dimensions, Serial Number, and Manufacturing Year.
""",
"Others": """
Perform dynamic field mapping to extract all relevant data fields.
- Ensure the fields are captured accurately.
"""
}
# Streamlit app
def main():
    """Streamlit entry point: select a PO type, upload a PDF, parse it."""
    st.title("PMP Auto-PO Generator (Direct PDF Processing)")

    # --- Step 1: choose the customer/PO layout -------------------------
    st.write("Welcome! Please select a PO file type and upload the corresponding PDF.")
    options = ["Toshiba", "BHEL", "Federal Electric", "AL NISF", "Others"]
    selected_option = st.selectbox("Select an option:", options)
    if not selected_option:
        st.warning("Please select an option to proceed.")
        return

    # --- Step 2: upload the PO PDF -------------------------------------
    uploaded_file = st.file_uploader("Upload your PO file (PDF format only):", type=["pdf"])
    if not uploaded_file:
        st.warning("Please upload a PDF file to proceed.")
        return

    # --- Step 3: extract raw text from the PDF -------------------------
    st.write("Extracting text from the uploaded PDF...")
    try:
        pdf_text = extract_pdf_text(uploaded_file)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return

    # --- Step 4: build the model prompt --------------------------------
    instructions = instruction_sets[selected_option]
    prompt = f"""
Parse the following Purchase Order (PO) based on the selected option and predefined instructions:
Selected Option: {selected_option}
Instructions:
{instructions}
PDF Content:
{pdf_text}
"""

    # --- Step 5: run GPT-Neo and surface the result --------------------
    st.write("Loading the model and generating response...")
    tokenizer, model = load_model()
    try:
        parsed = generate_response(prompt, tokenizer, model)
    except Exception as e:
        st.error(f"Error generating response: {e}")
        return

    st.success("Parsing successful! Here is the output:")
    st.text_area("Parsed Output", value=parsed, height=300)
    # Offer the raw model output for download, named after the PO type.
    st.download_button(
        label="Download JSON",
        data=parsed,
        file_name=f"{selected_option}_parsed_output.json",
        mime="application/json"
    )
if __name__ == "__main__":
main()