mlbench123's picture
Update app.py
cb7c829 verified
raw
history blame
4.26 kB
#!/usr/bin/env python3
import base64
import json
import mimetypes
import os
from pathlib import Path

import gradio as gr
from openai import OpenAI
# SECURITY: an OpenAI secret key used to be hard-coded on this line. Any key
# that has been committed or published must be treated as compromised and
# revoked. The key is now taken from the OPENAI_API_KEY environment variable
# (on a hosted deployment, configure it as a secret rather than in code).
API_KEY = os.environ.get("OPENAI_API_KEY")

# Model used for every extraction request.
MODEL = "gpt-5.1"

# Shared API client used by upload_pdf() and extract().
client = OpenAI(api_key=API_KEY)
def upload_pdf(path):
    """Upload the file at ``path`` through the OpenAI Files API.

    Args:
        path: Filesystem path of the document to upload; it is opened in
            binary mode.

    Returns:
        The ``id`` attribute of the object returned by
        ``client.files.create``.
    """
    # The context manager closes the handle once the upload call returns;
    # the original left the file open for the garbage collector to reclaim.
    with open(path, "rb") as handle:
        return client.files.create(file=handle, purpose="assistants").id
def prompt():
    """Return the instruction text sent to the model with every document.

    The text consists of the target JSON schema followed by the extraction
    rules (ship-from, carrier/equipment, inventory, total quantity and
    custom fields). It takes no input and always returns the same string.
    """
    return (
        "Extract structured JSON from the attached logistics document. Return ONLY valid JSON.\n"
        "{\n"
        " \"po_number\": string|null,\n"
        " \"ship_from_name\": string|null,\n"
        " \"ship_from_email\": string|null,\n"
        " \"carrier_type\": string|null,\n"
        " \"rail_car_number\": string|null,\n"
        " \"total_quantity\": number|null,\n"
        " \"inventories\": [\n"
        " {\n"
        " \"productName\": string|null,\n"
        " \"productCode\": string|null,\n"
        " \"variants\": [\n"
        " {\n"
        " \"dimensions\": string|null,\n"
        " \"pcs_per_pkg\": number|null,\n"
        " \"length_ft\": number|null,\n"
        " \"width\": number|null,\n"
        " \"packages\": number|null,\n"
        " \"pieces\": number|null,\n"
        " \"fbm\": number|string|null\n"
        " }\n"
        " ],\n"
        " \"total_pcs\": number|null,\n"
        " \"total_fbm\": number|string|null\n"
        " }\n"
        " ],\n"
        " \"custom_fields\": {}\n"
        "}\n\n"
        "SHIP FROM RULES:\n"
        "- If explicit fields like 'Origin', 'Ship From' exist, extract that value.\n"
        "- If the document is an email-style inbound notice (header block) and shows:\n"
        " From: Name <email>\n"
        " then ship_from_name = Name, ship_from_email = email.\n"
        "- If only an email exists and no human name, set both fields to that email.\n"
        "- If both Origin and an email sender exist, use Origin for ship_from_name and still capture the email under ship_from_email.\n"
        # The arrows on the next line were previously stored as mis-decoded
        # bytes. They are written as escapes (U+2192, rightwards arrow) so
        # this source stays ASCII and cannot be garbled by re-encoding.
        "- Priority: Origin \u2192 Email Name \u2192 Mill \u2192 Sender block \u2192 null.\n\n"
        "CARRIER / EQUIPMENT RULE:\n"
        "- If the table contains:\n"
        " Equipment id = <value>\n"
        " Mark = <value>\n"
        " then ALWAYS treat 'Equipment id' as the railcar number.\n"
        "- NEVER use 'Mark' as railcar number.\n"
        "- Carrier type must match the carrier text exactly (e.g., CHICAGO RAIL LINK).\n\n"
        "INVENTORY RULES:\n"
        "- Do not merge length groups. Each unique length or dimension is its own variant.\n"
        "- Extract pcs_per_pkg, packages, pieces, fbm exactly as written.\n"
        "- total_pcs = sum of pieces.\n"
        "- total_fbm = sum of fbm.\n\n"
        "TOTAL QUANTITY RULE:\n"
        "- Use explicit totals if they appear.\n"
        "- If no explicit total quantity appears, leave null.\n\n"
        "CUSTOM FIELDS RULE:\n"
        "- Capture all meaningful leftover fields not part of main schema.\n\n"
        "Return ONLY the JSON."
    )
def _image_mime_type(path):
    """Return the MIME type to declare in the data URL for the image at ``path``."""
    guessed, _ = mimetypes.guess_type(path.name)
    if guessed and guessed.startswith("image/"):
        # e.g. ".jpg" maps to "image/jpeg" rather than the non-standard "image/jpg".
        return guessed
    # Unrecognised extension: fall back to using the extension as the subtype.
    return f"image/{path.suffix.lower()[1:]}"


def _json_object_span(text):
    """Return the substring of ``text`` from its first "{" to its last "}"."""
    if not text:
        raise ValueError("The model returned an empty response.")
    start = text.find("{")
    end = text.rfind("}")
    # find()/rfind() return -1 when the brace is absent; slicing with -1
    # would silently produce an empty or unrelated fragment.
    if start == -1 or end < start:
        raise ValueError("The model response does not contain a JSON object.")
    return text[start:end + 1]


def extract(file):
    """Send an uploaded document to the model and return the JSON text it produced.

    A ``.pdf`` file is uploaded with ``upload_pdf`` and referenced by file
    id; any other file is base64-encoded and sent inline as an image data
    URL. In both cases the instruction text from ``prompt()`` is sent
    alongside it in a single user message.

    Args:
        file: Either a filesystem path (``str`` or ``Path``) or an object
            whose ``name`` attribute holds the path of the uploaded file.

    Returns:
        The part of the model's reply spanning the first "{" to the last
        "}", as a string.

    Raises:
        ValueError: If no file is given, or the reply is empty or contains
            no "{...}" span.
    """
    if file is None:
        raise ValueError("No file was provided.")

    # Plain paths are used directly; upload wrappers expose the path as .name.
    # (Path objects must not go through .name, which is only the last component.)
    location = file if isinstance(file, (str, Path)) else file.name
    path = Path(location)

    if path.suffix.lower() == ".pdf":
        attachment = {"type": "file", "file": {"file_id": upload_pdf(path)}}
    else:
        encoded = base64.b64encode(path.read_bytes()).decode()
        data_url = f"data:{_image_mime_type(path)};base64,{encoded}"
        attachment = {"type": "image_url", "image_url": {"url": data_url}}

    content = [{"type": "text", "text": prompt()}, attachment]
    response = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": content}],
    )
    return _json_object_span(response.choices[0].message.content)
def ui(file):
    """Gradio callback: pass the uploaded file to ``extract`` and return its result."""
    extracted = extract(file)
    return extracted
# Sample inputs offered in the interface.
examples = ["IMG_0001.jpg", "IMG_0002.jpg"]

# One file upload in, one JSON viewer out, with ui() as the handler.
demo = gr.Interface(
    fn=ui,
    inputs=gr.File(label="Upload PDF or Image"),
    outputs=gr.JSON(label="Extracted JSON"),
    title="Logistics OCR Data Extractor (GPT-5.1)",
    examples=examples,
)

# The first positional parameter of launch() is `inline`; it is named here
# so the meaning of the bare True is visible at the call site.
demo.launch(inline=True)