"""Streamlit app that extracts text from an uploaded PO PDF and parses it with GPT-Neo."""

import streamlit as st
import fitz  # PyMuPDF
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face model id shared by the tokenizer and the model.
MODEL_NAME = "EleutherAI/gpt-neo-1.3B"


# Load GPT-Neo model and tokenizer from Hugging Face
@st.cache_resource
def load_model():
    """Load the GPT-Neo tokenizer and model.

    Wrapped in ``st.cache_resource`` so the weights are loaded once and
    reused across Streamlit reruns.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    return tokenizer, model


# Function to extract text from a PDF file
def extract_pdf_text(uploaded_file):
    """Extract text content from a PDF file using PyMuPDF.

    Args:
        uploaded_file: A binary file-like object; its ``read()`` result is
            handed to PyMuPDF as an in-memory PDF stream.

    Returns:
        The plain text of every page, in page order, each page followed by
        a newline. Empty string for a document with no pages.
    """
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as pdf:
        # Iterate pages directly and join once instead of index-looping
        # with repeated string concatenation.
        return "".join(page.get_text("text") + "\n" for page in pdf)


# Helper function to generate response using GPT-Neo
def generate_response(prompt, tokenizer, model, max_new_tokens=500):
    """Generate a completion for ``prompt`` and return only the new text.

    Args:
        prompt: The full text prompt to send to the model.
        tokenizer: Tokenizer matching ``model``.
        model: A causal language model exposing ``generate`` and ``config``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation, without the echoed prompt.
    """
    # Leave room for the generated tokens inside the model's context window;
    # falls back to 2048 if the config does not expose the attribute.
    context_window = getattr(model.config, "max_position_embeddings", 2048)
    max_prompt_tokens = max(1, context_window - max_new_tokens)

    # Truncate over-long prompts (e.g. multi-page PDFs) rather than letting
    # generation fail. With the tokenizer's default settings this drops the
    # tail of the prompt.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens,
    )
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        # The tokenizer defines no pad token; reuse EOS explicitly.
        pad_token_id=tokenizer.eos_token_id,
    )

    # A causal LM returns prompt + continuation; keep only the continuation
    # so the caller does not get the instructions and PDF text echoed back.
    prompt_length = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)


# Predefined customer instructions
instruction_sets = {
    "Toshiba": """
    Extract columns: Pos., Item Code, Unit, Delivery Date, Quantity, Basic Price, Discount, Cur., Amount, Sub Total.
    - Identify Item Code blocks starting with a numeric code (e.g., 155569003011).
    - Include all subsequent lines (e.g., descriptions, additional codes) until a new numeric block or section begins.
    - Maintain the exact line order and formatting, preserving sub-lines.
    """,
    "BHEL": """
    Extract columns: SI No, Material Description, Unit, Quantity, Dely Qty, Dely Date, Unit Rate, Value.
    - Include primary description (e.g., BPS 017507).
    - Add Material Number, HSN Code, GST percentage.
    """,
    "Federal Electric": """
    Extract columns: S. No, Material No, Material Description, Qty, Unit, Price, Delivery Date, Total Value, Vat%, Amount Incl.VAT.
    Ensure all relevant data fields are included and validated.
    """,
    "AL NISF": """
    Extract columns: Item, Description, Qty, Unit, Unit Price, Total Price.
    - Add a bold header 'DESCRIPTION'.
    - Include Computer Code Number, Product Name, Designation Number, Dimensions, Serial Number, and Manufacturing Year.
    """,
    "Others": """
    Perform dynamic field mapping to extract all relevant data fields.
    - Ensure the fields are captured accurately.
    """,
}


# Streamlit app
def main():
    """Render the app: choose a PO type, upload a PDF, parse it, offer a download."""
    st.title("PMP Auto-PO Generator (Direct PDF Processing)")

    # Step 1: Welcome and Option Selection
    st.write("Welcome! Please select a PO file type and upload the corresponding PDF.")
    # Derive the choices from the instruction table so the two cannot drift apart.
    options = list(instruction_sets)
    selected_option = st.selectbox("Select an option:", options)

    if not selected_option:
        st.warning("Please select an option to proceed.")
        return

    # Step 2: File Upload
    uploaded_file = st.file_uploader("Upload your PO file (PDF format only):", type=["pdf"])

    if not uploaded_file:
        st.warning("Please upload a PDF file to proceed.")
        return

    # Extract text from the uploaded PDF
    st.write("Extracting text from the uploaded PDF...")
    try:
        extracted_text = extract_pdf_text(uploaded_file)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return

    # A PDF with no text layer (e.g. scanned images) gives the model nothing to parse.
    if not extracted_text.strip():
        st.warning("No extractable text was found in the uploaded PDF.")
        return

    # Retrieve associated instructions
    instructions = instruction_sets[selected_option]

    # Combine all inputs for the model prompt
    prompt = f"""
    Parse the following Purchase Order (PO) based on the selected option and predefined instructions:

    Selected Option: {selected_option}

    Instructions:
    {instructions}

    PDF Content:
    {extracted_text}
    """

    # Load model and tokenizer
    st.write("Loading the model and generating response...")
    tokenizer, model = load_model()

    # Generate response; keep the try body to the call that can actually fail.
    try:
        response = generate_response(prompt, tokenizer, model)
    except Exception as e:
        st.error(f"Error generating response: {e}")
        return

    st.success("Parsing successful! Here is the output:")
    st.text_area("Parsed Output", value=response, height=300)

    # Download parsed output as JSON
    # NOTE(review): the model output is free text and is not validated as
    # JSON before being served with a .json name and JSON MIME type.
    st.download_button(
        label="Download JSON",
        data=response,
        file_name=f"{selected_option}_parsed_output.json",
        mime="application/json",
    )


if __name__ == "__main__":
    main()