|
|
|
|
|
|
|
|
import base64
import json
import mimetypes
import os
from pathlib import Path

import gradio as gr
from openai import OpenAI
|
|
|
|
|
# Credentials must never live in source control: the key is read from the
# environment instead. Any key that was previously committed in this file
# should be treated as compromised and rotated.
API_KEY = os.environ.get("OPENAI_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before starting the app."
    )

# Model name sent with every chat completion request.
MODEL = "gpt-5.1"

# Shared OpenAI client used by the upload and extraction helpers below.
client = OpenAI(api_key=API_KEY)
|
|
|
|
|
def upload_pdf(path):
    """Upload the file at *path* through ``client.files.create`` and return its id.

    Args:
        path: Filesystem path (``str`` or ``Path``) of the document to upload.

    Returns:
        The ``id`` attribute of the object returned by ``client.files.create``.
    """
    # The context manager closes the handle even when the upload raises;
    # the previous bare open() left it to the garbage collector.
    with open(path, "rb") as handle:
        uploaded = client.files.create(file=handle, purpose="assistants")
    return uploaded.id
|
|
|
|
|
def prompt() -> str:
    """Return the instruction text sent to the model alongside the document.

    The text describes the JSON schema the model must fill in, followed by the
    rule sections for ship-from, carrier/equipment, inventory, total quantity
    and custom fields. It is a constant string; no arguments are needed.

    Returns:
        The full prompt as a single string.
    """
    return (
        "Extract structured JSON from the attached logistics document. Return ONLY valid JSON.\n"
        "{\n"
        " \"po_number\": string|null,\n"
        " \"ship_from_name\": string|null,\n"
        " \"ship_from_email\": string|null,\n"
        " \"carrier_type\": string|null,\n"
        " \"rail_car_number\": string|null,\n"
        " \"total_quantity\": number|null,\n"
        " \"inventories\": [\n"
        " {\n"
        " \"productName\": string|null,\n"
        " \"productCode\": string|null,\n"
        " \"variants\": [\n"
        " {\n"
        " \"dimensions\": string|null,\n"
        " \"pcs_per_pkg\": number|null,\n"
        " \"length_ft\": number|null,\n"
        " \"width\": number|null,\n"
        " \"packages\": number|null,\n"
        " \"pieces\": number|null,\n"
        " \"fbm\": number|string|null\n"
        " }\n"
        " ],\n"
        " \"total_pcs\": number|null,\n"
        " \"total_fbm\": number|string|null\n"
        " }\n"
        " ],\n"
        " \"custom_fields\": {}\n"
        "}\n\n"
        "SHIP FROM RULES:\n"
        "- If explicit fields like 'Origin', 'Ship From' exist, extract that value.\n"
        "- If the document is an email-style inbound notice (header block) and shows:\n"
        " From: Name <email>\n"
        " then ship_from_name = Name, ship_from_email = email.\n"
        "- If only an email exists and no human name, set both fields to that email.\n"
        "- If both Origin and an email sender exist, use Origin for ship_from_name and still capture the email under ship_from_email.\n"
        # The separators here were mis-encoded characters (a stray Greek beta);
        # plain ASCII arrows state the precedence order unambiguously.
        "- Priority: Origin -> Email Name -> Mill -> Sender block -> null.\n\n"
        "CARRIER / EQUIPMENT RULE:\n"
        "- If the table contains:\n"
        " Equipment id = <value>\n"
        " Mark = <value>\n"
        " then ALWAYS treat 'Equipment id' as the railcar number.\n"
        "- NEVER use 'Mark' as railcar number.\n"
        "- Carrier type must match the carrier text exactly (e.g., CHICAGO RAIL LINK).\n\n"
        "INVENTORY RULES:\n"
        "- Do not merge length groups. Each unique length or dimension is its own variant.\n"
        "- Extract pcs_per_pkg, packages, pieces, fbm exactly as written.\n"
        "- total_pcs = sum of pieces.\n"
        "- total_fbm = sum of fbm.\n\n"
        "TOTAL QUANTITY RULE:\n"
        "- Use explicit totals if they appear.\n"
        "- If no explicit total quantity appears, leave null.\n\n"
        "CUSTOM FIELDS RULE:\n"
        "- Capture all meaningful leftover fields not part of main schema.\n\n"
        "Return ONLY the JSON."
    )
|
|
|
|
|
def _resolve_upload_path(file):
    """Return a ``Path`` for an upload given as a path or as an object with ``.name``."""
    if file is None:
        raise ValueError("No file was provided.")
    # Plain paths are checked first: a ``Path`` also has a ``.name`` attribute,
    # but it holds only the final component, not the full location.
    if isinstance(file, (str, os.PathLike)):
        return Path(file)
    return Path(file.name)


def _image_data_url(path):
    """Return the file at *path* encoded as a base64 ``data:`` URL."""
    mime, _ = mimetypes.guess_type(path.name)
    if mime is None or not mime.startswith("image/"):
        # Unknown or non-image extension: keep the suffix-derived subtype
        # the function has always produced for such files.
        mime = f"image/{path.suffix.lower()[1:]}"
    encoded = base64.b64encode(path.read_bytes()).decode("ascii")
    return f"data:{mime};base64,{encoded}"


def _json_object_text(text):
    """Return the slice of *text* from its first ``{`` through its last ``}``."""
    if not text:
        raise ValueError("The model returned an empty response.")
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        raise ValueError("The model response does not contain a JSON object.")
    return text[start:end + 1]


def extract(file):
    """Send an uploaded document to the model and return the JSON text it produced.

    A ``.pdf`` file is uploaded with ``upload_pdf`` and referenced by file id;
    any other file is embedded in the request as a base64 image data URL.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or an object
            whose ``.name`` attribute holds the path of the uploaded file.

    Returns:
        The portion of the model's reply spanning the first ``{`` to the last
        ``}``, as a string. The text is not parsed or validated as JSON here.

    Raises:
        ValueError: If *file* is ``None``, the reply is empty, or the reply
            contains no ``{ ... }`` span.
    """
    path = _resolve_upload_path(file)

    if path.suffix.lower() == ".pdf":
        attachment = {"type": "file", "file": {"file_id": upload_pdf(path)}}
    else:
        attachment = {"type": "image_url", "image_url": {"url": _image_data_url(path)}}

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt()}, attachment],
            }
        ],
    )

    # Replies may wrap the JSON in prose or code fences; keep only the object.
    return _json_object_text(response.choices[0].message.content)
|
|
|
|
|
def ui(file):
    """Gradio callback: pass the uploaded *file* to ``extract`` and return its result."""
    extracted = extract(file)
    return extracted
|
|
|
|
|
# Sample inputs passed to the interface's ``examples`` argument.
examples = ["IMG_0001.jpg", "IMG_0002.jpg"]

# Single-input, single-output app: one file in, the extracted JSON out.
demo = gr.Interface(
    fn=ui,
    inputs=gr.File(label="Upload PDF or Image"),
    outputs=gr.JSON(label="Extracted JSON"),
    title="Logistics OCR Data Extractor (GPT-5.1)",
    examples=examples,
)

# NOTE(review): the positional True binds to launch()'s first parameter,
# presumably `inline` in current Gradio releases -- confirm that is what is
# intended here rather than e.g. share=True.
demo.launch(True)
|
|
|