|
|
|
|
|
|
|
|
import base64
import json
import mimetypes
import os
from pathlib import Path

import gradio as gr
from openai import OpenAI
|
|
|
|
|
# The OpenAI credential is read from the environment so that no secret lives
# in source control.
# NOTE(review): a key was previously hard-coded on this line; treat it as
# exposed and revoke/rotate it.
API_KEY = os.environ.get("OPENAI_API_KEY")

# Model used for every extraction request.
MODEL = "gpt-5.1"

# Shared API client. When api_key is None the OpenAI SDK itself looks up
# OPENAI_API_KEY and raises if no key can be found.
client = OpenAI(api_key=API_KEY)
|
|
|
|
|
|
|
|
def upload_pdf(path):
    """Upload the file at *path* to OpenAI and return the created file's id.

    The file is opened in binary mode, sent with ``purpose="assistants"`` and
    closed again as soon as the upload call returns (or raises).

    Args:
        path: Filesystem path of the file to upload.

    Returns:
        The ``id`` attribute of the object returned by ``client.files.create``.
    """
    with open(path, "rb") as pdf_file:
        uploaded = client.files.create(file=pdf_file, purpose="assistants")
    return uploaded.id
|
|
|
|
|
|
|
|
|
|
|
def prompt():
    """Return the instruction text sent to the model with every document.

    The text is made of the JSON schema the model must fill in, followed by
    rule sections for the ship-from fields, carrier/equipment, inventories,
    total quantity and custom fields, and a closing reminder to return JSON
    only.

    Returns:
        str: The complete prompt.
    """
    schema = (
        "Extract structured JSON from the attached logistics document. "
        "Return ONLY valid JSON.\n"
        "{\n"
        " \"po_number\": string|null,\n"
        " \"ship_from_name\": string|null,\n"
        " \"ship_from_email\": string|null,\n"
        " \"carrier_type\": string|null,\n"
        " \"rail_car_number\": string|null,\n"
        " \"total_quantity\": number|null,\n"
        " \"inventories\": [\n"
        " {\n"
        " \"productName\": string|null,\n"
        " \"productCode\": string|null,\n"
        " \"variants\": [\n"
        " {\n"
        " \"dimensions\": string|null,\n"
        " \"pcs_per_pkg\": number|null,\n"
        " \"length_ft\": number|null,\n"
        " \"width\": number|null,\n"
        " \"packages\": number|null,\n"
        " \"pieces\": number|null,\n"
        " \"fbm\": number|string|null\n"
        " }\n"
        " ],\n"
        " \"total_pcs\": number|null,\n"
        " \"total_fbm\": number|string|null\n"
        " }\n"
        " ],\n"
        " \"custom_fields\": {}\n"
        "}\n\n"
    )

    ship_from_rules = (
        "SHIP FROM RULES:\n"
        "- If explicit fields like 'Origin', 'Ship From' exist, extract that value.\n"
        "- If the document is an email-style inbound notice (header block) and shows:\n"
        " From: Name <email>\n"
        " then ship_from_name = Name, ship_from_email = email.\n"
        "- If only an email exists and no human name, set both fields to that email.\n"
        "- If both Origin and an email sender exist, use Origin for ship_from_name "
        "and still capture the email under ship_from_email.\n"
        # The separators are written as \u2192 (rightwards arrow) escapes so
        # the source stays ASCII; the text previously contained a mis-decoded
        # character here instead of a separator.
        "- Priority: Origin \u2192 Email Name \u2192 Mill \u2192 Sender block \u2192 null.\n\n"
    )

    carrier_rules = (
        "CARRIER / EQUIPMENT RULE:\n"
        "- If the table contains:\n"
        " Equipment id = <value>\n"
        " Mark = <value>\n"
        " then ALWAYS treat 'Equipment id' as the railcar number.\n"
        "- NEVER use 'Mark' as railcar number.\n"
        "- Carrier type must match the carrier text exactly (e.g., CHICAGO RAIL LINK).\n\n"
    )

    inventory_rules = (
        "INVENTORY RULES:\n"
        "- Do not merge length groups. Each unique length or dimension is its own variant.\n"
        "- Extract pcs_per_pkg, packages, pieces, fbm exactly as written.\n"
        "- total_pcs = sum of pieces.\n"
        "- total_fbm = sum of fbm.\n\n"
    )

    total_quantity_rules = (
        "TOTAL QUANTITY RULE:\n"
        "- Use explicit totals if they appear.\n"
        "- If no explicit total quantity appears, leave null.\n\n"
    )

    custom_field_rules = (
        "CUSTOM FIELDS RULE:\n"
        "- Capture all meaningful leftover fields not part of main schema.\n\n"
    )

    return (
        schema
        + ship_from_rules
        + carrier_rules
        + inventory_rules
        + total_quantity_rules
        + custom_field_rules
        + "Return ONLY the JSON."
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract(path):
    """Send the document at *path* to the model and return its JSON reply.

    A file whose suffix is ``.pdf`` (case-insensitive) is uploaded with
    ``upload_pdf`` and referenced by file id. Any other file is treated as an
    image and embedded in the request as a base64 data URL.

    Args:
        path: Filesystem path of the PDF or image to process.

    Returns:
        str: The part of the model reply from the first "{" to the last "}",
        or "{}" when the reply is empty or contains no such span.
    """
    source = Path(path)
    suffix = source.suffix.lower()

    if suffix == ".pdf":
        attachment = {"type": "file", "file": {"file_id": upload_pdf(path)}}
    else:
        attachment = {
            "type": "image_url",
            "image_url": {"url": _image_data_url(source, suffix)},
        }

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt()}, attachment],
            }
        ],
    )
    return _json_object_span(response.choices[0].message.content)


def _image_data_url(source, suffix):
    """Return a base64 ``data:`` URL for the image file *source*."""
    mime, _ = mimetypes.guess_type(f"file{suffix}")
    if not mime or not mime.startswith("image/"):
        # Unknown extension: fall back to the "image/<ext>" guess.
        mime = f"image/{suffix[1:]}"
    encoded = base64.b64encode(source.read_bytes()).decode()
    return f"data:{mime};base64,{encoded}"


def _json_object_span(text):
    """Return the outermost ``{...}`` span of *text*, or "{}" if there is none."""
    if not text:
        # The API may return no text content at all.
        return "{}"
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        return "{}"
    return text[start:end + 1]
|
|
|
|
|
|
|
|
def ui(image_input, pdf_input):
    """Run extraction on whichever upload was provided.

    The image is used when both inputs are set.

    Args:
        image_input: Path of the uploaded image; falsy when no image is given.
        pdf_input: Uploaded PDF, either a path string or an object whose
            ``name`` attribute holds the path; falsy when no PDF is given.

    Returns:
        str: The result of ``extract`` for the chosen file, or "{}" when
        neither input is provided.
    """
    if image_input:
        return extract(image_input)
    if pdf_input:
        # Accept both a file-like object exposing ``.name`` and a plain path
        # string, so the handler does not depend on how the upload is passed.
        return extract(getattr(pdf_input, "name", pdf_input))
    return "{}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build the Gradio interface: an image input and a PDF input side by side,
# a JSON output panel, and a submit button wired to ``ui``.
with gr.Blocks() as demo:
    gr.Markdown("# **Logistics OCR Data Extractor (GPT-5.1)**")

    with gr.Row():
        # type="filepath" makes the handler receive the image as a path.
        img = gr.Image(label="Upload Image", type="filepath")
        # Gradio expects file extensions with a leading dot (".pdf"); names
        # without a dot are reserved for categories such as "image" or "file".
        pdf = gr.File(label="Upload PDF", file_types=[".pdf"])

    out = gr.JSON(label="Extracted JSON")
    btn = gr.Button("Submit")

    btn.click(fn=ui, inputs=[img, pdf], outputs=out)

    gr.Examples(
        examples=[
            ["IMG_0001.jpg", None],
            ["IMG_0002.jpg", None],
        ],
        inputs=[img, pdf],
        label="Sample Images",
    )

# share=True requests a public share link in addition to the local server.
# NOTE(review): anyone with that link can trigger API calls billed to the
# configured key - confirm this exposure is intended.
demo.launch(share=True)
|
|
|