mlbench123's picture
Update app.py
1e9fc11 verified
raw
history blame
4.47 kB
#!/usr/bin/env python3
# app.py — Logistics OCR Extractor (PDF + Images) with strict ship_from rules
import base64
import json
import mimetypes
import os
from pathlib import Path

import gradio as gr
from openai import OpenAI
# The API key is read from the environment so that no secret lives in the
# source tree.
# NOTE(review): the key that used to be hard-coded on this line has been
# published with the file and should be revoked.
API_KEY = os.environ.get("OPENAI_API_KEY")
MODEL = "gpt-5.1"
client = OpenAI(api_key=API_KEY)
# ----------------------- PDF Upload -----------------------
def upload_pdf(path):
    """Upload a PDF through ``client.files.create`` and return its file id.

    Args:
        path: Filesystem path of the PDF to upload.

    Returns:
        The ``id`` attribute of the object returned by
        ``client.files.create``.
    """
    # Use a context manager so the handle is closed even if the upload
    # raises; the bare ``open`` call previously leaked it.
    with open(path, "rb") as pdf:
        uploaded = client.files.create(
            file=pdf,
            purpose="assistants",
        )
    return uploaded.id
# ----------------------- Prompt Builder -----------------------
def build_prompt():
    """Return the instruction prompt sent along with every document.

    The prompt is a constant string made of four parts: the task statement
    with the target JSON schema, the ship_from extraction rules, the
    inventory rules, and the total_quantity rules.

    Returns:
        The full prompt text.
    """
    return (
        # Task statement and output schema.
        "Extract structured JSON from this logistics shipping document. "
        "Use only what appears in the PDF/image, never hallucinate. "
        "Return strictly valid JSON in this schema:\n\n"
        "{\n"
        " \"po_number\": string|null,\n"
        " \"ship_from\": string|null,\n"
        " \"carrier_type\": string|null,\n"
        " \"rail_car_number\": string|null,\n"
        " \"total_quantity\": number|null,\n"
        " \"inventories\": [\n"
        " {\n"
        " \"productName\": string,\n"
        " \"productCode\": string|null,\n"
        " \"variants\": [\n"
        " {\n"
        " \"dimensions\": string|null,\n"
        " \"pcs_per_pkg\": number|null,\n"
        " \"length_ft\": number|null,\n"
        " \"width\": number|null,\n"
        " \"packages\": number|null,\n"
        " \"pieces\": number|null,\n"
        " \"fbm\": number|null\n"
        " }\n"
        " ],\n"
        " \"total_pcs\": number|null,\n"
        " \"total_fbm\": number|null\n"
        " }\n"
        " ],\n"
        " \"custom_fields\": {}\n"
        "}\n\n"
        # ship_from rules.
        "SHIP_FROM EXTRACTION RULES (MANDATORY):\n"
        "1. If document contains explicit Origin/Ship From labels, extract that value.\n"
        "2. If document is an email-based inbound notice and no explicit origin exists, "
        "set ship_from = the email 'From:' field.\n"
        "3. If both Origin and Mill exist, use Origin.\n"
        "4. If only Mill exists AND it is clearly the shipping location, use Mill.\n"
        # The arrows below were mis-encoded ("β†’") in the previous text;
        # they are restored so the model receives a readable priority chain.
        "5. Priority order: Origin → Email From → Mill → Sender company block.\n"
        "6. If none apply, ship_from = null.\n\n"
        # Inventory rules.
        "Rules for inventories:\n"
        "- Do NOT merge different lengths; create a separate variant per length.\n"
        "- Extract EXACT numbers shown: packages, pcs_per_pkg, pieces, fbm.\n"
        "- total_pcs = sum of all variant pieces.\n"
        "- total_fbm = sum of all variant fbm.\n\n"
        # total_quantity rules and closing instructions.
        "Rules for total_quantity:\n"
        "- If the document shows a total PCS value explicitly, use it.\n"
        "- If only variants exist, do not compute total_quantity unless the document explicitly states it.\n\n"
        "Parse tables carefully. If a dimension group (like 2x6) appears, use that.\n"
        "Return only JSON. No explanations."
    )
# ----------------------- Extraction Logic -----------------------
def extract(file):
    """Send an uploaded PDF or image to the model and return its JSON text.

    PDFs are uploaded with ``upload_pdf`` and referenced by file id; any
    other file is embedded in the request as a base64 data URL.

    Args:
        file: The upload to process: either an object whose ``name``
            attribute holds the file path, or the path itself.

    Returns:
        The part of the model reply running from its first ``{`` to its
        last ``}``, inclusive, as a string.

    Raises:
        ValueError: If ``file`` is ``None`` or the reply contains no
            ``{...}`` span.
    """
    if file is None:
        raise ValueError("No file provided; upload a PDF or an image first.")

    # Accept both an upload object exposing ``.name`` and a bare path.
    path = Path(getattr(file, "name", file))
    suffix = path.suffix.lower()

    if suffix == ".pdf":
        attachment = {"type": "file", "file": {"file_id": upload_pdf(path)}}
    else:
        # Look the MIME type up instead of pasting the extension in, so that
        # ".jpg" becomes "image/jpeg" rather than the non-standard
        # "image/jpg". Fall back to the extension when the lookup does not
        # yield an image type.
        mime = mimetypes.guess_type(path.name)[0]
        if not mime or not mime.startswith("image/"):
            mime = f"image/{suffix[1:]}"
        encoded = base64.b64encode(path.read_bytes()).decode()
        attachment = {
            "type": "image_url",
            "image_url": {"url": f"data:{mime};base64,{encoded}"},
        }

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": build_prompt()},
                    attachment,
                ],
            }
        ],
    )

    # The reply content may be None; treat that like an empty reply.
    reply = response.choices[0].message.content or ""
    start = reply.find("{")
    end = reply.rfind("}")
    # Without this guard the -1 returned by find/rfind would be used as a
    # slice index and silently produce a meaningless fragment.
    if start == -1 or end < start:
        raise ValueError("Model response did not contain a JSON object.")
    return reply[start:end + 1]
# ----------------------- Gradio UI -----------------------
def ui(file):
    """Interface callback: hand the uploaded file to ``extract``.

    Args:
        file: The value passed in by the interface's file input.

    Returns:
        Whatever ``extract`` returns for ``file``.
    """
    extracted = extract(file)
    return extracted
# Sample images (optional)
sample_files = [
    ("IMG_0001.jpg", "IMG_0001.jpg"),
    ("IMG_0002.jpg", "IMG_0002.jpg"),
]

# The interface has a single input, so each example must be one path rather
# than a pair. Samples that are not present on disk are dropped, which is what
# keeps them optional.
example_paths = [
    sample_path
    for _, sample_path in sample_files
    if Path(sample_path).is_file()
]

demo = gr.Interface(
    fn=ui,
    inputs=gr.File(label="Upload PDF or Image"),
    outputs=gr.JSON(label="Extracted JSON"),
    title="Logistics OCR Data Extractor (GPT-5.1)",
    # Pass None instead of an empty list when no sample file is available.
    examples=example_paths or None,
)
demo.launch()