# UC3-Raja / app.py
# SuriRaja's picture
# Update app.py
# 3a7fd72 verified
import json

import fitz # PyMuPDF
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load the GPT-Neo model and tokenizer from Hugging Face
@st.cache_resource
def load_model(model_name="EleutherAI/gpt-neo-1.3B"):
    """Load a causal language model and its matching tokenizer.

    The function is decorated with ``st.cache_resource``, so Streamlit keeps
    the loaded objects and reuses them for later calls made with the same
    ``model_name`` instead of loading the weights again.

    Args:
        model_name: Hugging Face model identifier passed to both
            ``from_pretrained`` calls. Defaults to the GPT-Neo 1.3B
            checkpoint this app was written for.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # One identifier feeds both loaders so tokenizer and model cannot drift apart.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return tokenizer, model
# Function to extract text from a PDF file
def extract_pdf_text(uploaded_file):
    """Extract the plain-text content of a PDF using PyMuPDF.

    Args:
        uploaded_file: A binary file-like object holding the PDF. Objects
            exposing ``getvalue()`` (``io.BytesIO`` and subclasses) are read
            through it; anything else is read with ``read()``.

    Returns:
        The text of every page in document order, each page followed by a
        newline. An empty string when the document has no pages.

    Raises:
        Whatever ``fitz.open`` raises when the bytes are not a readable PDF.
    """
    # getvalue() returns the whole buffer regardless of the current read
    # position, so a stream that was already consumed still yields its bytes.
    if hasattr(uploaded_file, "getvalue"):
        pdf_bytes = uploaded_file.getvalue()
    else:
        pdf_bytes = uploaded_file.read()

    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
        # Iterate pages directly and join once instead of indexing and
        # growing a string with repeated concatenation.
        return "".join(page.get_text("text") + "\n" for page in pdf)
# Helper function to generate a response using GPT-Neo
def generate_response(prompt, tokenizer, model, max_new_tokens=500):
    """Generate a completion for ``prompt`` and return only the new text.

    Args:
        prompt: Full prompt text to send to the model.
        tokenizer: Tokenizer callable; must also provide ``decode`` and
            ``eos_token_id``.
        model: Causal language model providing ``generate``. If it exposes
            ``config.max_position_embeddings``, the prompt is truncated so
            that prompt plus completion fit inside that window.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The decoded completion, without the echoed prompt.
    """
    encode_kwargs = {"return_tensors": "pt"}

    # A PDF-sized prompt can easily exceed the model's context window; keep
    # the head of the prompt and reserve room for the tokens to be generated.
    context_limit = getattr(
        getattr(model, "config", None), "max_position_embeddings", None
    )
    if context_limit is not None and context_limit > max_new_tokens:
        encode_kwargs["truncation"] = True
        encode_kwargs["max_length"] = context_limit - max_new_tokens

    inputs = tokenizer(prompt, **encode_kwargs)
    prompt_length = len(inputs["input_ids"][0])

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        # GPT-Neo defines no pad token; reuse EOS so generate() has an
        # explicit padding id.
        pad_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt tokens; drop them so the
    # caller receives the completion only.
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)
# Per-customer extraction instructions, keyed by the PO type shown in the UI.
# Each value starts and ends with a newline so it drops cleanly into the prompt.
instruction_sets = {
    "Toshiba": (
        "\n"
        "Extract columns: Pos., Item Code, Unit, Delivery Date, Quantity, Basic Price, Discount, Cur., Amount, Sub Total.\n"
        "- Identify Item Code blocks starting with a numeric code (e.g., 155569003011).\n"
        "- Include all subsequent lines (e.g., descriptions, additional codes) until a new numeric block or section begins.\n"
        "- Maintain the exact line order and formatting, preserving sub-lines.\n"
    ),
    "BHEL": (
        "\n"
        "Extract columns: SI No, Material Description, Unit, Quantity, Dely Qty, Dely Date, Unit Rate, Value.\n"
        "- Include primary description (e.g., BPS 017507).\n"
        "- Add Material Number, HSN Code, GST percentage.\n"
    ),
    "Federal Electric": (
        "\n"
        "Extract columns: S. No, Material No, Material Description, Qty, Unit, Price, Delivery Date, Total Value, Vat%, Amount Incl.VAT.\n"
        "Ensure all relevant data fields are included and validated.\n"
    ),
    "AL NISF": (
        "\n"
        "Extract columns: Item, Description, Qty, Unit, Unit Price, Total Price.\n"
        "- Add a bold header 'DESCRIPTION'.\n"
        "- Include Computer Code Number, Product Name, Designation Number, Dimensions, Serial Number, and Manufacturing Year.\n"
    ),
    "Others": (
        "\n"
        "Perform dynamic field mapping to extract all relevant data fields.\n"
        "- Ensure the fields are captured accurately.\n"
    ),
}
# Streamlit app
def main():
    """Render the PO-parsing page.

    Flow: pick a PO type, upload a PDF, extract its text, build a prompt from
    the type-specific instructions plus the PDF text, run the language model,
    then show the result and offer it as a JSON download. Each failure is
    reported in the page and ends the current run early.
    """
    st.title("PMP Auto-PO Generator (Direct PDF Processing)")

    # Step 1: Welcome and Option Selection
    st.write("Welcome! Please select a PO file type and upload the corresponding PDF.")
    # Derive the choices from instruction_sets so the lookup below cannot miss.
    options = list(instruction_sets)
    selected_option = st.selectbox("Select an option:", options)
    if not selected_option:
        st.warning("Please select an option to proceed.")
        return

    # Step 2: File Upload
    uploaded_file = st.file_uploader("Upload your PO file (PDF format only):", type=["pdf"])
    if not uploaded_file:
        st.warning("Please upload a PDF file to proceed.")
        return

    # Extract text from the uploaded PDF
    st.write("Extracting text from the uploaded PDF...")
    try:
        extracted_text = extract_pdf_text(uploaded_file)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return

    # Without any text there is nothing for the model to parse.
    if not extracted_text.strip():
        st.warning("No extractable text was found in the uploaded PDF.")
        return

    # Retrieve associated instructions
    instructions = instruction_sets[selected_option]

    # Combine all inputs for the model prompt
    prompt = (
        "\n"
        "Parse the following Purchase Order (PO) based on the selected option and predefined instructions:\n"
        f"Selected Option: {selected_option}\n"
        "Instructions:\n"
        f"{instructions}\n"
        "PDF Content:\n"
        f"{extracted_text}\n"
    )

    # Load model and tokenizer
    st.write("Loading the model and generating response...")
    try:
        tokenizer, model = load_model()
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return

    # Generate response; only the model call sits inside the try so that UI
    # errors are not misreported as generation failures.
    try:
        response = generate_response(prompt, tokenizer, model)
    except Exception as e:
        st.error(f"Error generating response: {e}")
        return

    st.success("Parsing successful! Here is the output:")
    st.text_area("Parsed Output", value=response, height=300)

    # Download parsed output as JSON. The model returns free text, so wrap it
    # in a JSON document to make the .json file actually parseable.
    payload = json.dumps(
        {"selected_option": selected_option, "parsed_output": response},
        ensure_ascii=False,
        indent=2,
    )
    st.download_button(
        label="Download JSON",
        data=payload,
        file_name=f"{selected_option}_parsed_output.json",
        mime="application/json",
    )


if __name__ == "__main__":
    main()