Spaces:

SuriRaja
/

UC3-Raja

Sleeping

App Files Files Community

UC3-Raja / app.py

SuriRaja

Update app.py

3a7fd72 verified over 1 year ago

raw

history blame contribute delete

4.43 kB

	import streamlit as st
	import fitz # PyMuPDF
	from transformers import AutoTokenizer, AutoModelForCausalLM

	# Load GPT-Neo model and tokenizer from Hugging Face
	@st.cache_resource
	def load_model(model_name="EleutherAI/gpt-neo-1.3B"):
	    """Load the tokenizer and causal language model for a Hugging Face checkpoint.

	    The function is wrapped in ``st.cache_resource``, so the (expensive)
	    download and initialisation is done once per distinct ``model_name``
	    and the same objects are reused on later calls.

	    Args:
	        model_name: Hub identifier or local path of the checkpoint to load.
	            Defaults to the GPT-Neo 1.3B checkpoint the app was built around.

	    Returns:
	        A ``(tokenizer, model)`` tuple.
	    """
	    # One name feeds both loaders so the tokenizer and the weights can never
	    # come from different checkpoints.
	    tokenizer = AutoTokenizer.from_pretrained(model_name)
	    model = AutoModelForCausalLM.from_pretrained(model_name)
	    return tokenizer, model

	# Function to extract text from a PDF file
	def extract_pdf_text(uploaded_file):
	    """Extract the plain-text content of a PDF using PyMuPDF.

	    Args:
	        uploaded_file: Binary file-like object whose ``read()`` returns the
	            raw bytes of a PDF document.

	    Returns:
	        The text of every page in page order, with a newline appended after
	        each page. A document with no pages yields an empty string.

	    Raises:
	        Whatever ``fitz.open`` raises when the bytes cannot be opened as a
	        PDF; the caller is expected to handle it.
	    """
	    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as pdf:
	        # Iterate the document's pages directly and join once, instead of
	        # indexing by page number and growing a string with ``+=``.
	        return "".join(page.get_text("text") + "\n" for page in pdf)

	# Helper function to generate response using GPT-Neo
	def generate_response(prompt, tokenizer, model, max_new_tokens=500):
	    """Generate a continuation for ``prompt`` and return only the new text.

	    Args:
	        prompt: Full text prompt to feed to the model.
	        tokenizer: Tokenizer matching ``model``; called with the prompt and
	            used to decode the generated token ids.
	        model: Causal language model exposing ``generate``.
	        max_new_tokens: Maximum number of tokens to generate.

	    Returns:
	        The decoded text of the newly generated tokens, without the echoed
	        prompt and without special tokens.

	    Raises:
	        ValueError: If ``max_new_tokens`` leaves no room for any prompt
	            tokens inside the model's context window.
	    """
	    tokenizer_kwargs = {"return_tensors": "pt"}

	    # A long PDF can easily exceed the model's context window, which makes
	    # generation fail. When the model advertises its window size, truncate
	    # the prompt so prompt + generated tokens fit inside it.
	    context_window = getattr(
	        getattr(model, "config", None), "max_position_embeddings", None
	    )
	    if context_window is not None:
	        prompt_budget = context_window - max_new_tokens
	        if prompt_budget <= 0:
	            raise ValueError(
	                f"max_new_tokens={max_new_tokens} leaves no room for the prompt "
	                f"in a context window of {context_window} tokens"
	            )
	        tokenizer_kwargs.update(truncation=True, max_length=prompt_budget)

	    inputs = tokenizer(prompt, **tokenizer_kwargs)
	    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

	    # generate() returns prompt tokens followed by the continuation; drop the
	    # prompt part so the caller does not get its own input echoed back.
	    prompt_length = inputs["input_ids"].shape[-1]
	    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

	# Predefined customer instructions
	# Maps each supported PO type name to the free-text extraction instructions
	# for that customer's purchase-order layout. main() looks entries up by the
	# option the user selected, so every selectable option needs a key here.
	# The values are runtime prompt text: they are interpolated into the model
	# prompt as-is, so their wording and whitespace should be edited with care.
	instruction_sets = {
	"Toshiba": """
	Extract columns: Pos., Item Code, Unit, Delivery Date, Quantity, Basic Price, Discount, Cur., Amount, Sub Total.
	- Identify Item Code blocks starting with a numeric code (e.g., 155569003011).
	- Include all subsequent lines (e.g., descriptions, additional codes) until a new numeric block or section begins.
	- Maintain the exact line order and formatting, preserving sub-lines.
	""",
	"BHEL": """
	Extract columns: SI No, Material Description, Unit, Quantity, Dely Qty, Dely Date, Unit Rate, Value.
	- Include primary description (e.g., BPS 017507).
	- Add Material Number, HSN Code, GST percentage.
	""",
	"Federal Electric": """
	Extract columns: S. No, Material No, Material Description, Qty, Unit, Price, Delivery Date, Total Value, Vat%, Amount Incl.VAT.
	Ensure all relevant data fields are included and validated.
	""",
	"AL NISF": """
	Extract columns: Item, Description, Qty, Unit, Unit Price, Total Price.
	- Add a bold header 'DESCRIPTION'.
	- Include Computer Code Number, Product Name, Designation Number, Dimensions, Serial Number, and Manufacturing Year.
	""",
	# Catch-all for PO layouts without a dedicated instruction set.
	"Others": """
	Perform dynamic field mapping to extract all relevant data fields.
	- Ensure the fields are captured accurately.
	"""
	}

	# Streamlit app
	def main():
	    """Run the Streamlit app: choose a PO type, upload a PDF, show the model output.

	    Flow:
	        1. Let the user pick one of the PO types defined in ``instruction_sets``.
	        2. Accept a PDF upload and extract its text.
	        3. Build a prompt from the selected instructions and the PDF text.
	        4. Load the model, generate a response, display it and offer it as a
	           JSON download.

	    Each step that cannot proceed (nothing selected, nothing uploaded, no
	    extractable text, extraction/model/generation failure) reports the
	    problem in the UI and returns early. Returns ``None``.
	    """
	    # Local import: json is only needed here to build the download payload.
	    import json

	    st.title("PMP Auto-PO Generator (Direct PDF Processing)")

	    # Step 1: Welcome and Option Selection
	    st.write("Welcome! Please select a PO file type and upload the corresponding PDF.")
	    # Take the choices from instruction_sets itself so the selectbox can never
	    # offer a name that the lookup further down would fail on.
	    options = list(instruction_sets)
	    selected_option = st.selectbox("Select an option:", options)

	    if not selected_option:
	        st.warning("Please select an option to proceed.")
	        return

	    # Step 2: File Upload
	    uploaded_file = st.file_uploader("Upload your PO file (PDF format only):", type=["pdf"])
	    if uploaded_file is None:
	        st.warning("Please upload a PDF file to proceed.")
	        return

	    # Extract text from the uploaded PDF
	    st.write("Extracting text from the uploaded PDF...")
	    try:
	        extracted_text = extract_pdf_text(uploaded_file)
	    except Exception as e:
	        st.error(f"Error extracting text from PDF: {e}")
	        return

	    # A PDF without a text layer (e.g. a scanned image) yields only
	    # whitespace; sending that to the model would produce meaningless output.
	    if not extracted_text.strip():
	        st.warning("No text could be extracted from this PDF. It may be a scanned image.")
	        return

	    # Retrieve associated instructions
	    instructions = instruction_sets[selected_option]

	    # Combine all inputs for the model prompt. The literal's lines are kept
	    # exactly as they were so the prompt text sent to the model is unchanged.
	    prompt = f"""
	Parse the following Purchase Order (PO) based on the selected option and predefined instructions:

	Selected Option: {selected_option}

	Instructions:
	{instructions}

	PDF Content:
	{extracted_text}
	"""

	    # Load model and tokenizer
	    st.write("Loading the model and generating response...")
	    try:
	        tokenizer, model = load_model()
	    except Exception as e:
	        st.error(f"Error loading model: {e}")
	        return

	    # Generate response. Only the generation call sits inside the try so that
	    # a UI rendering problem is not misreported as a generation error.
	    try:
	        response = generate_response(prompt, tokenizer, model)
	    except Exception as e:
	        st.error(f"Error generating response: {e}")
	        return

	    st.success("Parsing successful! Here is the output:")
	    st.text_area("Parsed Output", value=response, height=300)

	    # The model output is free text, so wrap it in a JSON document; otherwise
	    # the ".json" download would not be parseable as JSON.
	    payload = json.dumps(
	        {"selected_option": selected_option, "parsed_output": response},
	        ensure_ascii=False,
	        indent=2,
	    )
	    st.download_button(
	        label="Download JSON",
	        data=payload,
	        file_name=f"{selected_option}_parsed_output.json",
	        mime="application/json",
	    )

	# Start the app only when this file is executed as a script, not when it is
	# imported as a module.
	if __name__ == "__main__":
	    main()