Spaces:

MLBench
/

Logistics-OCR-Text-Extractor

Sleeping

App Files Files Community

mlbench123 committed on Nov 20, 2025

Commit

5af3434

verified ·

1 Parent(s): aa6d0d8

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -65

app.py CHANGED Viewed

@@ -11,106 +11,98 @@ MODEL = "gpt-5.1"
 client = OpenAI(api_key=API_KEY)
 def upload_pdf(path):
-    f = client.files.create(
-        file=open(path, "rb"),
-        purpose="assistants"
-    )
-    return f.id
-def build_prompt():
     return (
-        "Extract structured JSON from this logistics shipping document. "
-        "Return ONLY valid JSON.\n\n"
         "{\n"
         "  \"po_number\": string|null,\n"
-        "  \"ship_from\": string|null,\n"
         "  \"carrier_type\": string|null,\n"
         "  \"rail_car_number\": string|null,\n"
         "  \"total_quantity\": number|null,\n"
         "  \"inventories\": [\n"
-        "     {\n"
-        "       \"productName\": string,\n"
-        "       \"productCode\": string|null,\n"
-        "       \"variants\": [\n"
-        "         {\n"
-        "           \"dimensions\": string|null,\n"
-        "           \"pcs_per_pkg\": number|null,\n"
-        "           \"length_ft\": number|null,\n"
-        "           \"width\": number|null,\n"
-        "           \"packages\": number|null,\n"
-        "           \"pieces\": number|null,\n"
-        "           \"fbm\": number|null\n"
-        "         }\n"
-        "       ],\n"
-        "       \"total_pcs\": number|null,\n"
-        "       \"total_fbm\": number|null\n"
-        "     }\n"
         "  ],\n"
         "  \"custom_fields\": {}\n"
         "}\n\n"
-        "SHIP_FROM EXTRACTION RULES (MANDATORY):\n"
-        "1. If document contains explicit Origin/Ship From labels, extract that value.\n"
-        "2. If it is an email-based inbound notice and no explicit origin exists, use the email 'From:' field.\n"
-        "3. If both Origin and Mill exist, use Origin.\n"
-        "4. If only Mill exists and is clearly the shipping point, use Mill.\n"
-        "5. Priority: Origin → Email From → Mill → Sender block.\n"
-        "6. If nothing matches, ship_from = null.\n\n"
-        "Inventory rules:\n"
-        "- Never merge different lengths.\n"
-        "- Extract packages, pcs_per_pkg, pieces, fbm EXACTLY.\n"
-        "- total_pcs = sum of variant pieces.\n"
-        "- total_fbm = sum of variant fbm.\n\n"
-        "total_quantity rules:\n"
-        "- Use the explicit total if shown.\n"
-        "- If not explicitly shown, do not infer.\n\n"
         "Return ONLY the JSON."
     )
 def extract(file):
     path = Path(file.name)
     suffix = path.suffix.lower()
     if suffix == ".pdf":
         fid = upload_pdf(path)
-        msg = [
-            {"type": "text", "text": build_prompt()},
             {"type": "file", "file": {"file_id": fid}}
         ]
     else:
         b64 = base64.b64encode(path.read_bytes()).decode()
-        msg = [
-            {"type": "text", "text": build_prompt()},
-            {
-                "type": "image_url",
-                "image_url": {"url": f"data:image/{suffix[1:]};base64,{b64}"}
-            }
         ]
     r = client.chat.completions.create(
         model=MODEL,
-        messages=[{"role": "user", "content": msg}]
     )
-    txt = r.choices[0].message.content
-    s = txt.find("{")
-    e = txt.rfind("}")
-    return txt[s:e+1]
 def ui(file):
     return extract(file)
-# EXAMPLES must be ONLY filenames,
-# and these files must exist in the root directory of the Space.
 examples = [
     "IMG_0001.jpg",
     "IMG_0002.jpg"

 client = OpenAI(api_key=API_KEY)
 def upload_pdf(path):
+    """Upload the file at *path* via ``client.files.create`` and return its id.
+
+    The file is opened in binary mode inside a ``with`` block so the handle is
+    closed as soon as the upload call returns, including when it raises.
+    """
+    with open(path, "rb") as fh:
+        return client.files.create(file=fh, purpose="assistants").id
+def prompt():
+    """Return the instruction text that accompanies the document.
+
+    The text is a JSON template for the expected output followed by the
+    ship-from, carrier/equipment, inventory, total-quantity and custom-field
+    extraction rules, and ends by asking for JSON only.
+    """
     return (
+        "Extract structured JSON from the attached logistics document. Return ONLY valid JSON.\n"
         "{\n"
         "  \"po_number\": string|null,\n"
+        "  \"ship_from_name\": string|null,\n"
+        "  \"ship_from_email\": string|null,\n"
         "  \"carrier_type\": string|null,\n"
         "  \"rail_car_number\": string|null,\n"
         "  \"total_quantity\": number|null,\n"
         "  \"inventories\": [\n"
+        "    {\n"
+        "      \"productName\": string|null,\n"
+        "      \"productCode\": string|null,\n"
+        "      \"variants\": [\n"
+        "        {\n"
+        "          \"dimensions\": string|null,\n"
+        "          \"pcs_per_pkg\": number|null,\n"
+        "          \"length_ft\": number|null,\n"
+        "          \"width\": number|null,\n"
+        "          \"packages\": number|null,\n"
+        "          \"pieces\": number|null,\n"
+        "          \"fbm\": number|string|null\n"
+        "        }\n"
+        "      ],\n"
+        "      \"total_pcs\": number|null,\n"
+        "      \"total_fbm\": number|string|null\n"
+        "    }\n"
         "  ],\n"
         "  \"custom_fields\": {}\n"
         "}\n\n"
+        "SHIP FROM RULES:\n"
+        "- If explicit fields like 'Origin', 'Ship From' exist, extract that value.\n"
+        "- If the document is an email-style inbound notice (header block) and shows:\n"
+        "    From: Name <email>\n"
+        "  then ship_from_name = Name, ship_from_email = email.\n"
+        "- If only an email exists and no human name, set both fields to that email.\n"
+        "- If both Origin and an email sender exist, use Origin for ship_from_name and still capture the email under ship_from_email.\n"
+        "- Priority: Origin → Email Name → Mill → Sender block → null.\n\n"
+        "CARRIER / EQUIPMENT RULE:\n"
+        "- If the table contains:\n"
+        "      Equipment id = <value>\n"
+        "      Mark = <value>\n"
+        "  then ALWAYS treat 'Equipment id' as the railcar number.\n"
+        "- NEVER use 'Mark' as railcar number.\n"
+        "- Carrier type must match the carrier text exactly (e.g., CHICAGO RAIL LINK).\n\n"
+        "INVENTORY RULES:\n"
+        "- Do not merge length groups. Each unique length or dimension is its own variant.\n"
+        "- Extract pcs_per_pkg, packages, pieces, fbm exactly as written.\n"
+        "- total_pcs = sum of pieces.\n"
+        "- total_fbm = sum of fbm.\n\n"
+        "TOTAL QUANTITY RULE:\n"
+        "- Use explicit totals if they appear.\n"
+        "- If no explicit total quantity appears, leave null.\n\n"
+        "CUSTOM FIELDS RULE:\n"
+        "- Capture all meaningful leftover fields not part of main schema.\n\n"
         "Return ONLY the JSON."
     )
 def extract(file):
+    """Send a document to the chat model and return the JSON text it produces.
+
+    ``file`` only needs a ``name`` attribute holding a filesystem path. A
+    ``.pdf`` is uploaded through ``upload_pdf`` and referenced by file id;
+    anything else is treated as an image and inlined as a base64 data URL.
+
+    Returns the substring of the reply from the first ``{`` to the last ``}``,
+    or an empty string when the reply is empty or contains no such span.
+    """
+    import mimetypes
+
     path = Path(file.name)
     suffix = path.suffix.lower()
     if suffix == ".pdf":
         fid = upload_pdf(path)
+        content = [
+            {"type": "text", "text": prompt()},
             {"type": "file", "file": {"file_id": fid}}
         ]
     else:
         b64 = base64.b64encode(path.read_bytes()).decode()
+        # Use the registered media type (".jpg" -> "image/jpeg"); fall back to
+        # the bare extension only when mimetypes does not know the suffix.
+        mime = mimetypes.guess_type(path.name)[0] or f"image/{suffix[1:]}"
+        content = [
+            {"type": "text", "text": prompt()},
+            {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}}
         ]
     r = client.chat.completions.create(
         model=MODEL,
+        messages=[{"role": "user", "content": content}]
     )
+    # The reply's content is optional in the API response; normalise to "" so
+    # the searches below cannot fail on None.
+    text = r.choices[0].message.content or ""
+    s = text.find("{")
+    e = text.rfind("}")
+    # No opening brace, or no closing brace after it: nothing to extract.
+    if s == -1 or e < s:
+        return ""
+    return text[s:e+1]
 def ui(file):
+    """Forward *file* to ``extract`` and return its result unchanged."""
+    result = extract(file)
+    return result
 examples = [
     "IMG_0001.jpg",
     "IMG_0002.jpg"