| |
| |
|
|
import base64
import json
import mimetypes
import os
from pathlib import Path

import gradio as gr
from openai import OpenAI
|
|
# The API key is read from the environment so the secret never lives in the
# source tree. Any key that has ever been committed to a file should be
# treated as leaked and revoked.
API_KEY = os.environ.get("OPENAI_API_KEY")

# Model identifier passed to every chat-completion request.
MODEL = "gpt-5.1"

# Shared client used by the upload and extraction helpers.
client = OpenAI(api_key=API_KEY)
|
|
|
|
| |
def upload_pdf(path):
    """Upload the file at *path* through ``client.files.create``.

    Args:
        path: Location of the PDF on disk (anything ``open`` accepts).

    Returns:
        The ``id`` attribute of the object returned by the upload call.
    """
    # The context manager closes the handle as soon as the upload returns,
    # so repeated requests do not leak file descriptors.
    with open(path, "rb") as pdf:
        uploaded = client.files.create(
            file=pdf,
            purpose="assistants",
        )
    return uploaded.id
|
|
|
|
| |
def build_prompt():
    """Return the instruction text sent to the model with every document.

    The prompt consists of, in order: the target JSON schema, the rules for
    choosing ``ship_from``, the rules for ``inventories``, the rules for
    ``total_quantity``, and a closing reminder to answer with JSON only.

    Returns:
        The full prompt as a single string.
    """
    return (
        "Extract structured JSON from this logistics shipping document. "
        "Use only what appears in the PDF/image, never hallucinate. "
        "Return strictly valid JSON in this schema:\n\n"
        "{\n"
        " \"po_number\": string|null,\n"
        " \"ship_from\": string|null,\n"
        " \"carrier_type\": string|null,\n"
        " \"rail_car_number\": string|null,\n"
        " \"total_quantity\": number|null,\n"
        " \"inventories\": [\n"
        " {\n"
        " \"productName\": string,\n"
        " \"productCode\": string|null,\n"
        " \"variants\": [\n"
        " {\n"
        " \"dimensions\": string|null,\n"
        " \"pcs_per_pkg\": number|null,\n"
        " \"length_ft\": number|null,\n"
        " \"width\": number|null,\n"
        " \"packages\": number|null,\n"
        " \"pieces\": number|null,\n"
        " \"fbm\": number|null\n"
        " }\n"
        " ],\n"
        " \"total_pcs\": number|null,\n"
        " \"total_fbm\": number|null\n"
        " }\n"
        " ],\n"
        " \"custom_fields\": {}\n"
        "}\n\n"

        "SHIP_FROM EXTRACTION RULES (MANDATORY):\n"
        "1. If document contains explicit Origin/Ship From labels, extract that value.\n"
        "2. If document is an email-based inbound notice and no explicit origin exists, "
        "set ship_from = the email 'From:' field.\n"
        "3. If both Origin and Mill exist, use Origin.\n"
        "4. If only Mill exists AND it is clearly the shipping location, use Mill.\n"
        # The arrows are written as \u2192 escapes so the separator survives
        # any re-encoding of this source file.
        "5. Priority order: Origin \u2192 Email From \u2192 Mill \u2192 Sender company block.\n"
        "6. If none apply, ship_from = null.\n\n"

        "Rules for inventories:\n"
        "- Do NOT merge different lengths; create a separate variant per length.\n"
        "- Extract EXACT numbers shown: packages, pcs_per_pkg, pieces, fbm.\n"
        "- total_pcs = sum of all variant pieces.\n"
        "- total_fbm = sum of all variant fbm.\n\n"

        "Rules for total_quantity:\n"
        "- If the document shows a total PCS value explicitly, use it.\n"
        "- If only variants exist, do not compute total_quantity unless the document explicitly states it.\n\n"

        "Parse tables carefully. If a dimension group (like 2x6) appears, use that.\n"
        "Return only JSON. No explanations."
    )
|
|
|
|
| |
def _image_data_url(path):
    """Return the bytes of the image at *path* as a base64 ``data:`` URL."""
    # mimetypes maps ".jpg" to the registered type "image/jpeg"; building the
    # type from the raw suffix would produce the unregistered "image/jpg".
    guessed, _ = mimetypes.guess_type(path.name)
    if guessed and guessed.startswith("image/"):
        mime = guessed
    else:
        # Unknown extension: fall back to the suffix-derived subtype.
        mime = f"image/{path.suffix.lower()[1:]}"
    encoded = base64.b64encode(path.read_bytes()).decode("ascii")
    return f"data:{mime};base64,{encoded}"


def _json_span(text):
    """Return the slice of *text* from its first "{" to its last "}"."""
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        raise ValueError("Model response did not contain a JSON object")
    return text[start:end + 1]


def extract(file):
    """Send an uploaded PDF or image to the model and return its JSON text.

    A ``.pdf`` is uploaded with ``upload_pdf`` and referenced by file id;
    anything else is inlined as a base64 image data URL. Both are sent
    together with the prompt from ``build_prompt``.

    Args:
        file: The uploaded file, either an object exposing the on-disk
            location as ``.name`` or the path itself.

    Returns:
        The substring of the model's reply spanning the first "{" through
        the last "}", as a string.

    Raises:
        ValueError: If no file was given or the reply holds no JSON object.
    """
    if file is None:
        raise ValueError("No file was provided")

    # Accept both an upload wrapper (with ``.name``) and a plain path.
    path = Path(getattr(file, "name", file))
    suffix = path.suffix.lower()

    if suffix == ".pdf":
        attachment = {"type": "file", "file": {"file_id": upload_pdf(path)}}
    else:
        attachment = {
            "type": "image_url",
            "image_url": {"url": _image_data_url(path)},
        }

    content = [{"type": "text", "text": build_prompt()}, attachment]
    response = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": content}],
    )

    # The reply content may be None; treat that as an empty reply so the
    # failure surfaces as the ValueError below rather than an AttributeError.
    reply = response.choices[0].message.content or ""
    return _json_span(reply)
|
|
|
|
| |
def ui(file):
    """Gradio callback: pass the uploaded file to ``extract`` and return its result."""
    extracted = extract(file)
    return extracted
|
|
|
|
| |
# Example entries handed to the interface's ``examples`` argument.
# NOTE(review): each entry is a 2-tuple while the interface declares a single
# input component -- confirm this matches the shape gradio expects.
sample_files = [
    (image_name, image_name)
    for image_name in ("IMG_0001.jpg", "IMG_0002.jpg")
]

# Build the single-input / single-output interface and start serving it.
gr.Interface(
    title="Logistics OCR Data Extractor (GPT-5.1)",
    fn=ui,
    inputs=gr.File(label="Upload PDF or Image"),
    outputs=gr.JSON(label="Extracted JSON"),
    examples=sample_files,
).launch()
|
|