import fitz
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM


@st.cache_resource
def load_model(model_name="EleutherAI/gpt-neo-1.3B"):
    """Load the tokenizer and causal language model for one checkpoint.

    The function is wrapped in ``st.cache_resource`` so the returned objects
    are reused instead of being rebuilt each time the script runs.

    Args:
        model_name: Hugging Face checkpoint identifier passed to both
            ``from_pretrained`` calls. Keeping it in one place guarantees
            the tokenizer and the model always come from the same checkpoint.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return tokenizer, model


def extract_pdf_text(uploaded_file):
    """Extract text content from a PDF file using PyMuPDF.

    Args:
        uploaded_file: Binary file-like object exposing ``read()`` (for
            example the object returned by ``st.file_uploader``).

    Returns:
        The plain text of every page, in page order, with a newline appended
        after each page. An empty string when the document has no pages.

    Raises:
        Whatever ``fitz.open`` raises when the bytes are not a readable PDF.
    """
    # Rewind first: if the stream was already consumed by an earlier read,
    # ``read()`` would return b"" and the PDF would look empty or corrupt.
    rewind = getattr(uploaded_file, "seek", None)
    if callable(rewind):
        rewind(0)

    pdf_bytes = uploaded_file.read()
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
        # Iterate the pages directly and join once instead of indexing by
        # page number and growing a string with ``+=``.
        return "".join(page.get_text("text") + "\n" for page in pdf)


def generate_response(prompt, tokenizer, model, max_new_tokens=500):
    """Generate the model's continuation for ``prompt``.

    Args:
        prompt: Full text prompt to send to the model.
        tokenizer: Callable tokenizer that returns a mapping containing
            ``"input_ids"`` and provides ``decode``.
        model: Causal (decoder-only) language model providing ``generate``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the newly generated tokens only. The prompt that
        a causal model echoes at the start of its output is stripped.
    """
    encode_kwargs = {"return_tensors": "pt"}

    # Leave room for the generated tokens inside the model's context window.
    # Without truncation an over-long PDF makes generation fail outright.
    # Truncation drops the tail of the prompt, so very long documents are
    # only partially seen by the model.
    config = getattr(model, "config", None)
    context_limit = getattr(config, "max_position_embeddings", None)
    if isinstance(context_limit, int) and context_limit > max_new_tokens:
        encode_kwargs.update(
            truncation=True,
            max_length=context_limit - max_new_tokens,
        )

    inputs = tokenizer(prompt, **encode_kwargs)
    prompt_length = len(inputs["input_ids"][0])

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # A causal model returns prompt + continuation; keep the continuation.
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)


# Extraction instructions per purchase-order layout, keyed by the label shown
# in the UI. The text is inserted verbatim into the model prompt, so it must
# not carry any markup residue. Insertion order is the display order.
instruction_sets = {
    "Toshiba": """
Extract columns: Pos., Item Code, Unit, Delivery Date, Quantity, Basic Price, Discount, Cur., Amount, Sub Total.
- Identify Item Code blocks starting with a numeric code (e.g., 155569003011).
- Include all subsequent lines (e.g., descriptions, additional codes) until a new numeric block or section begins.
- Maintain the exact line order and formatting, preserving sub-lines.
""",
    "BHEL": """
Extract columns: SI No, Material Description, Unit, Quantity, Dely Qty, Dely Date, Unit Rate, Value.
- Include primary description (e.g., BPS 017507).
- Add Material Number, HSN Code, GST percentage.
""",
    "Federal Electric": """
Extract columns: S. No, Material No, Material Description, Qty, Unit, Price, Delivery Date, Total Value, Vat%, Amount Incl.VAT.
Ensure all relevant data fields are included and validated.
""",
    "AL NISF": """
Extract columns: Item, Description, Qty, Unit, Unit Price, Total Price.
- Add a bold header 'DESCRIPTION'.
- Include Computer Code Number, Product Name, Designation Number, Dimensions, Serial Number, and Manufacturing Year.
""",
    "Others": """
Perform dynamic field mapping to extract all relevant data fields.
- Ensure the fields are captured accurately.
""",
}


def main():
    """Render the Streamlit page and run one parse of an uploaded PO.

    Flow: choose a PO layout, upload a PDF, extract its text, build a prompt
    from the layout's instructions plus the extracted text, run the language
    model, then show the result and offer it for download. Each stage that
    cannot proceed reports the problem in the page and returns early.
    """
    st.title("PMP Auto-PO Generator (Direct PDF Processing)")

    st.write("Welcome! Please select a PO file type and upload the corresponding PDF.")
    # Take the choices from instruction_sets so the menu can never offer a
    # label that the lookup further down does not know.
    options = list(instruction_sets)
    selected_option = st.selectbox("Select an option:", options)
    if not selected_option:
        st.warning("Please select an option to proceed.")
        return

    uploaded_file = st.file_uploader("Upload your PO file (PDF format only):", type=["pdf"])
    if not uploaded_file:
        st.warning("Please upload a PDF file to proceed.")
        return

    st.write("Extracting text from the uploaded PDF...")
    try:
        extracted_text = extract_pdf_text(uploaded_file)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return

    # A PDF without a text layer yields only whitespace; running the model
    # on it would be slow and could only produce invented content.
    if not extracted_text.strip():
        st.warning("No text could be extracted from the PDF (it may be a scanned image).")
        return

    instructions = instruction_sets[selected_option]
    prompt = (
        "\n"
        "Parse the following Purchase Order (PO) based on the selected option and predefined instructions:\n"
        "\n"
        f"Selected Option: {selected_option}\n"
        "\n"
        "Instructions:\n"
        f"{instructions}\n"
        "\n"
        "PDF Content:\n"
        f"{extracted_text}\n"
    )

    st.write("Loading the model and generating response...")
    try:
        tokenizer, model = load_model()
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return

    # Keep the try body to the one call that is expected to fail.
    try:
        response = generate_response(prompt, tokenizer, model)
    except Exception as e:
        st.error(f"Error generating response: {e}")
        return

    st.success("Parsing successful! Here is the output:")
    st.text_area("Parsed Output", value=response, height=300)

    # NOTE(review): the model returns free text, so the downloaded file is
    # not guaranteed to be valid JSON despite its name and MIME type.
    st.download_button(
        label="Download JSON",
        data=response,
        file_name=f"{selected_option}_parsed_output.json",
        mime="application/json",
    )


# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()