import streamlit as st
import fitz # PyMuPDF
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load GPT-Neo model and tokenizer from Hugging Face
@st.cache_resource
def load_model():
    """Load and cache the GPT-Neo 1.3B tokenizer and model.

    Cached via st.cache_resource so the weights are fetched and
    instantiated only once per Streamlit server process.
    """
    checkpoint = "EleutherAI/gpt-neo-1.3B"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForCausalLM.from_pretrained(checkpoint),
    )
# Function to extract text from a PDF file
def extract_pdf_text(uploaded_file):
    """Return the plain text of every page of *uploaded_file* (a PDF).

    Pages are concatenated in order, each followed by a newline,
    exactly as PyMuPDF extracts them.
    """
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as pdf:
        # Iterating the document yields pages in index order.
        return "".join(page.get_text("text") + "\n" for page in pdf)
# Helper function to generate response using GPT-Neo
def generate_response(prompt, tokenizer, model):
    """Run *prompt* through the causal LM and return the decoded text.

    Args:
        prompt: Full text prompt (instructions + extracted PDF content).
        tokenizer: The tokenizer paired with *model*.
        model: A causal language model supporting ``generate``.

    Returns:
        The decoded generation (which echoes the prompt followed by the
        model's continuation), with special tokens stripped.

    Fixes vs. the original:
    - The prompt is truncated so that prompt + up to 500 new tokens fit
      the model's context window (2048 for GPT-Neo); previously a long
      PDF made ``generate`` fail with an index/size error.
    - ``pad_token_id`` is passed explicitly because GPT-Neo defines no
      pad token, which otherwise triggers a warning or error during
      generation.
    """
    # Leave headroom for the 500 generated tokens; guard against
    # tokenizers that report a huge sentinel model_max_length.
    limit = max(32, min(tokenizer.model_max_length, 2048) - 500)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=limit,
    )
    outputs = model.generate(
        **inputs,
        max_new_tokens=500,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Predefined customer instructions
instruction_sets = {
"Toshiba": """
Extract columns: Pos., Item Code, Unit, Delivery Date, Quantity, Basic Price, Discount, Cur., Amount, Sub Total.
- Identify Item Code blocks starting with a numeric code (e.g., 155569003011).
- Include all subsequent lines (e.g., descriptions, additional codes) until a new numeric block or section begins.
- Maintain the exact line order and formatting, preserving sub-lines.
""",
"BHEL": """
Extract columns: SI No, Material Description, Unit, Quantity, Dely Qty, Dely Date, Unit Rate, Value.
- Include primary description (e.g., BPS 017507).
- Add Material Number, HSN Code, GST percentage.
""",
"Federal Electric": """
Extract columns: S. No, Material No, Material Description, Qty, Unit, Price, Delivery Date, Total Value, Vat%, Amount Incl.VAT.
Ensure all relevant data fields are included and validated.
""",
"AL NISF": """
Extract columns: Item, Description, Qty, Unit, Unit Price, Total Price.
- Add a bold header 'DESCRIPTION'.
- Include Computer Code Number, Product Name, Designation Number, Dimensions, Serial Number, and Manufacturing Year.
""",
"Others": """
Perform dynamic field mapping to extract all relevant data fields.
- Ensure the fields are captured accurately.
"""
}
# Streamlit app
def main():
    """Streamlit entry point: select a PO type, upload a PDF, parse it."""
    st.title("PMP Auto-PO Generator (Direct PDF Processing)")

    # --- Step 1: choose the customer/PO layout -------------------------
    st.write("Welcome! Please select a PO file type and upload the corresponding PDF.")
    options = ["Toshiba", "BHEL", "Federal Electric", "AL NISF", "Others"]
    selected_option = st.selectbox("Select an option:", options)
    if not selected_option:
        st.warning("Please select an option to proceed.")
        return

    # --- Step 2: upload the PO PDF -------------------------------------
    uploaded_file = st.file_uploader("Upload your PO file (PDF format only):", type=["pdf"])
    if not uploaded_file:
        st.warning("Please upload a PDF file to proceed.")
        return

    # --- Step 3: extract raw text from the PDF -------------------------
    st.write("Extracting text from the uploaded PDF...")
    try:
        pdf_text = extract_pdf_text(uploaded_file)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return

    # --- Step 4: build the model prompt --------------------------------
    instructions = instruction_sets[selected_option]
    prompt = f"""
Parse the following Purchase Order (PO) based on the selected option and predefined instructions:
Selected Option: {selected_option}
Instructions:
{instructions}
PDF Content:
{pdf_text}
"""

    # --- Step 5: run GPT-Neo and surface the result --------------------
    st.write("Loading the model and generating response...")
    tokenizer, model = load_model()
    try:
        parsed = generate_response(prompt, tokenizer, model)
    except Exception as e:
        st.error(f"Error generating response: {e}")
        return

    st.success("Parsing successful! Here is the output:")
    st.text_area("Parsed Output", value=parsed, height=300)
    # Offer the raw model output for download, named after the PO type.
    st.download_button(
        label="Download JSON",
        data=parsed,
        file_name=f"{selected_option}_parsed_output.json",
        mime="application/json"
    )
if __name__ == "__main__":
main()