Spaces:

MLBench
/

Logistics-OCR-Text-Extractor

Sleeping

App Files Files Community

Logistics-OCR-Text-Extractor / app.py

mlbench123

Update app.py

1e9fc11 verified 5 months ago

raw

history blame

4.47 kB

	#!/usr/bin/env python3
	# app.py — Logistics OCR Extractor (PDF + Images) with strict ship_from rules

	import base64
	import json
	import mimetypes
	import os
	from pathlib import Path

	import gradio as gr
	from openai import OpenAI

	# SECURITY: credentials must never be committed to source control. The key is
	# read from the OPENAI_API_KEY environment variable (on a hosted deployment,
	# configure it as a secret). Any key that was previously hard-coded in this
	# file has been publicly exposed and must be revoked and rotated.
	API_KEY = os.environ.get("OPENAI_API_KEY")
	MODEL = "gpt-5.1"

	# When api_key is None the OpenAI client falls back to OPENAI_API_KEY itself
	# and raises a descriptive error if no key is configured.
	client = OpenAI(api_key=API_KEY)


	# ----------------------- PDF Upload -----------------------
	def upload_pdf(path):
	    """Upload a local file through ``client.files.create`` and return its id.

	    Args:
	        path: Filesystem path (``str`` or ``Path``) of the document to upload.

	    Returns:
	        The ``id`` attribute of the object returned by ``client.files.create``.
	    """
	    # Open inside a context manager so the handle is closed even when the
	    # upload raises; previously the file object was never closed.
	    with open(path, "rb") as handle:
	        uploaded = client.files.create(file=handle, purpose="assistants")
	    return uploaded.id


	# ----------------------- Prompt Builder -----------------------
	def build_prompt():
	    """Return the fixed instruction prompt sent alongside every document.

	    The text consists of, in order: the extraction instruction, the target
	    JSON schema, the ship_from rules, the inventory rules, the total_quantity
	    rules and a closing reminder to answer with JSON only.

	    Returns:
	        str: The complete prompt. It takes no input and is identical on every
	        call.
	    """
	    # Schema lines use a plain "|" between alternatives. The earlier "\|" was
	    # an invalid escape sequence (a SyntaxWarning on recent Python versions)
	    # and leaked literal backslashes into the prompt.
	    return (
	        "Extract structured JSON from this logistics shipping document. "
	        "Use only what appears in the PDF/image, never hallucinate. "
	        "Return strictly valid JSON in this schema:\n\n"
	        "{\n"
	        ' "po_number": string|null,\n'
	        ' "ship_from": string|null,\n'
	        ' "carrier_type": string|null,\n'
	        ' "rail_car_number": string|null,\n'
	        ' "total_quantity": number|null,\n'
	        ' "inventories": [\n'
	        " {\n"
	        ' "productName": string,\n'
	        ' "productCode": string|null,\n'
	        ' "variants": [\n'
	        " {\n"
	        ' "dimensions": string|null,\n'
	        ' "pcs_per_pkg": number|null,\n'
	        ' "length_ft": number|null,\n'
	        ' "width": number|null,\n'
	        ' "packages": number|null,\n'
	        ' "pieces": number|null,\n'
	        ' "fbm": number|null\n'
	        " }\n"
	        " ],\n"
	        ' "total_pcs": number|null,\n'
	        ' "total_fbm": number|null\n'
	        " }\n"
	        " ],\n"
	        ' "custom_fields": {}\n'
	        "}\n\n"

	        "SHIP_FROM EXTRACTION RULES (MANDATORY):\n"
	        "1. If document contains explicit Origin/Ship From labels, extract that value.\n"
	        "2. If document is an email-based inbound notice and no explicit origin exists, "
	        "set ship_from = the email 'From:' field.\n"
	        "3. If both Origin and Mill exist, use Origin.\n"
	        "4. If only Mill exists AND it is clearly the shipping location, use Mill.\n"
	        "5. Priority order: Origin → Email From → Mill → Sender company block.\n"
	        "6. If none apply, ship_from = null.\n\n"

	        "Rules for inventories:\n"
	        "- Do NOT merge different lengths; create a separate variant per length.\n"
	        "- Extract EXACT numbers shown: packages, pcs_per_pkg, pieces, fbm.\n"
	        "- total_pcs = sum of all variant pieces.\n"
	        "- total_fbm = sum of all variant fbm.\n\n"

	        "Rules for total_quantity:\n"
	        "- If the document shows a total PCS value explicitly, use it.\n"
	        "- If only variants exist, do not compute total_quantity unless the document explicitly states it.\n\n"

	        "Parse tables carefully. If a dimension group (like 2x6) appears, use that.\n"
	        "Return only JSON. No explanations."
	    )


	# ----------------------- Extraction Logic -----------------------
	def extract(file):
	    """Send an uploaded PDF or image to the model and return the JSON text.

	    A ``.pdf`` file is uploaded via ``upload_pdf`` and referenced by file id;
	    any other file is inlined as a base64 ``data:`` URL image. The prompt from
	    ``build_prompt`` is sent together with the document in one user message.

	    Args:
	        file: The uploaded file: either an object whose ``name`` attribute is
	            the path on disk, or a plain path value.

	    Returns:
	        str: The part of the model reply from the first ``{`` to the last
	        ``}``, checked to be parseable JSON.

	    Raises:
	        ValueError: If ``file`` is None, or the reply holds no valid JSON
	            object.
	    """
	    if file is None:
	        raise ValueError("No file was provided.")

	    # Accept both upload wrappers exposing ``.name`` and bare path values.
	    path = Path(getattr(file, "name", file))
	    suffix = path.suffix.lower()
	    prompt_part = {"type": "text", "text": build_prompt()}

	    if suffix == ".pdf":
	        file_id = upload_pdf(path)
	        content = [
	            prompt_part,
	            {"type": "file", "file": {"file_id": file_id}},
	        ]
	    else:
	        # "image/jpg" is not a registered media type, so map the extension
	        # through mimetypes (".jpg" -> "image/jpeg") and only fall back to
	        # the raw suffix when no image type is known for it.
	        guessed = mimetypes.guess_type(path.name)[0]
	        if guessed and guessed.startswith("image/"):
	            mime = guessed
	        else:
	            mime = f"image/{suffix[1:]}"
	        encoded = base64.b64encode(path.read_bytes()).decode()
	        content = [
	            prompt_part,
	            {
	                "type": "image_url",
	                "image_url": {"url": f"data:{mime};base64,{encoded}"},
	            },
	        ]

	    response = client.chat.completions.create(
	        model=MODEL,
	        messages=[{"role": "user", "content": content}],
	    )

	    # The message content can be absent; treat that like an empty reply
	    # instead of failing on ``None.find``.
	    reply = response.choices[0].message.content or ""
	    start = reply.find("{")
	    end = reply.rfind("}")
	    if start == -1 or end < start:
	        # Previously this case silently produced an empty or garbage string.
	        raise ValueError("Model response did not contain a JSON object.")

	    payload = reply[start:end + 1]
	    try:
	        json.loads(payload)
	    except json.JSONDecodeError as err:
	        raise ValueError("Model response was not valid JSON.") from err
	    return payload


	# ----------------------- Gradio UI -----------------------
	def ui(file):
	    """Interface callback: run ``extract`` on the uploaded file.

	    Args:
	        file: The value supplied by the file-upload input.

	    Returns:
	        Whatever ``extract`` returns for ``file``, unchanged.
	    """
	    extracted = extract(file)
	    return extracted


	# Optional sample images offered as clickable examples in the UI.
	# NOTE(review): each entry is a 2-tuple while the interface has a single
	# input component — confirm this is the shape the examples parameter expects
	# and that both files exist next to this script.
	sample_files = [
	    ("IMG_0001.jpg", "IMG_0001.jpg"),
	    ("IMG_0002.jpg", "IMG_0002.jpg"),
	]

	# Build the components first, then assemble and start the interface.
	upload_input = gr.File(label="Upload PDF or Image")
	json_output = gr.JSON(label="Extracted JSON")

	demo = gr.Interface(
	    fn=ui,
	    inputs=upload_input,
	    outputs=json_output,
	    title="Logistics OCR Data Extractor (GPT-5.1)",
	    examples=sample_files,
	)
	demo.launch()