Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
-
# app.py — Logistics OCR Extractor (PDF + Images) with strict ship_from rules
|
| 3 |
|
| 4 |
import base64
|
| 5 |
import json
|
|
@@ -13,7 +12,6 @@ MODEL = "gpt-5.1"
|
|
| 13 |
client = OpenAI(api_key=API_KEY)
|
| 14 |
|
| 15 |
|
| 16 |
-
# ----------------------- PDF Upload -----------------------
|
| 17 |
def upload_pdf(path):
|
| 18 |
f = client.files.create(
|
| 19 |
file=open(path, "rb"),
|
|
@@ -22,12 +20,10 @@ def upload_pdf(path):
|
|
| 22 |
return f.id
|
| 23 |
|
| 24 |
|
| 25 |
-
# ----------------------- Prompt Builder -----------------------
|
| 26 |
def build_prompt():
|
| 27 |
return (
|
| 28 |
"Extract structured JSON from this logistics shipping document. "
|
| 29 |
-
"
|
| 30 |
-
"Return strictly valid JSON in this schema:\n\n"
|
| 31 |
"{\n"
|
| 32 |
" \"po_number\": string|null,\n"
|
| 33 |
" \"ship_from\": string|null,\n"
|
|
@@ -58,29 +54,26 @@ def build_prompt():
|
|
| 58 |
|
| 59 |
"SHIP_FROM EXTRACTION RULES (MANDATORY):\n"
|
| 60 |
"1. If document contains explicit Origin/Ship From labels, extract that value.\n"
|
| 61 |
-
"2. If
|
| 62 |
-
"set ship_from = the email 'From:' field.\n"
|
| 63 |
"3. If both Origin and Mill exist, use Origin.\n"
|
| 64 |
-
"4. If only Mill exists
|
| 65 |
-
"5. Priority
|
| 66 |
-
"6. If
|
| 67 |
-
|
| 68 |
-
"
|
| 69 |
-
"-
|
| 70 |
-
"- Extract
|
| 71 |
-
"- total_pcs = sum of
|
| 72 |
-
"- total_fbm = sum of
|
| 73 |
-
|
| 74 |
-
"
|
| 75 |
-
"-
|
| 76 |
-
"- If
|
| 77 |
-
|
| 78 |
-
"
|
| 79 |
-
"Return only JSON. No explanations."
|
| 80 |
)
|
| 81 |
|
| 82 |
|
| 83 |
-
# ----------------------- Extraction Logic -----------------------
|
| 84 |
def extract(file):
|
| 85 |
path = Path(file.name)
|
| 86 |
suffix = path.suffix.lower()
|
|
@@ -112,15 +105,15 @@ def extract(file):
|
|
| 112 |
return txt[s:e+1]
|
| 113 |
|
| 114 |
|
| 115 |
-
# ----------------------- Gradio UI -----------------------
|
| 116 |
def ui(file):
|
| 117 |
return extract(file)
|
| 118 |
|
| 119 |
|
| 120 |
-
#
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
|
|
|
| 124 |
]
|
| 125 |
|
| 126 |
gr.Interface(
|
|
@@ -128,5 +121,5 @@ gr.Interface(
|
|
| 128 |
inputs=gr.File(label="Upload PDF or Image"),
|
| 129 |
outputs=gr.JSON(label="Extracted JSON"),
|
| 130 |
title="Logistics OCR Data Extractor (GPT-5.1)",
|
| 131 |
-
examples=
|
| 132 |
).launch()
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
|
|
|
| 2 |
|
| 3 |
import base64
|
| 4 |
import json
|
|
|
|
| 12 |
client = OpenAI(api_key=API_KEY)
|
| 13 |
|
| 14 |
|
|
|
|
| 15 |
def upload_pdf(path):
|
| 16 |
f = client.files.create(
|
| 17 |
file=open(path, "rb"),
|
|
|
|
| 20 |
return f.id
|
| 21 |
|
| 22 |
|
|
|
|
| 23 |
def build_prompt():
|
| 24 |
return (
|
| 25 |
"Extract structured JSON from this logistics shipping document. "
|
| 26 |
+
"Return ONLY valid JSON.\n\n"
|
|
|
|
| 27 |
"{\n"
|
| 28 |
" \"po_number\": string|null,\n"
|
| 29 |
" \"ship_from\": string|null,\n"
|
|
|
|
| 54 |
|
| 55 |
"SHIP_FROM EXTRACTION RULES (MANDATORY):\n"
|
| 56 |
"1. If document contains explicit Origin/Ship From labels, extract that value.\n"
|
| 57 |
+
"2. If it is an email-based inbound notice and no explicit origin exists, use the email 'From:' field.\n"
|
|
|
|
| 58 |
"3. If both Origin and Mill exist, use Origin.\n"
|
| 59 |
+
"4. If only Mill exists and is clearly the shipping point, use Mill.\n"
|
| 60 |
+
"5. Priority: Origin → Email From → Mill → Sender block.\n"
|
| 61 |
+
"6. If nothing matches, ship_from = null.\n\n"
|
| 62 |
+
|
| 63 |
+
"Inventory rules:\n"
|
| 64 |
+
"- Never merge different lengths.\n"
|
| 65 |
+
"- Extract packages, pcs_per_pkg, pieces, fbm EXACTLY.\n"
|
| 66 |
+
"- total_pcs = sum of variant pieces.\n"
|
| 67 |
+
"- total_fbm = sum of variant fbm.\n\n"
|
| 68 |
+
|
| 69 |
+
"total_quantity rules:\n"
|
| 70 |
+
"- Use the explicit total if shown.\n"
|
| 71 |
+
"- If not explicitly shown, do not infer.\n\n"
|
| 72 |
+
|
| 73 |
+
"Return ONLY the JSON."
|
|
|
|
| 74 |
)
|
| 75 |
|
| 76 |
|
|
|
|
| 77 |
def extract(file):
|
| 78 |
path = Path(file.name)
|
| 79 |
suffix = path.suffix.lower()
|
|
|
|
| 105 |
return txt[s:e+1]
|
| 106 |
|
| 107 |
|
|
|
|
| 108 |
def ui(file):
|
| 109 |
return extract(file)
|
| 110 |
|
| 111 |
|
| 112 |
+
# EXAMPLES must be ONLY filenames,
|
| 113 |
+
# and these files must exist in the root directory of the Space.
|
| 114 |
+
examples = [
|
| 115 |
+
"IMG_0001.jpg",
|
| 116 |
+
"IMG_0002.jpg"
|
| 117 |
]
|
| 118 |
|
| 119 |
gr.Interface(
|
|
|
|
| 121 |
inputs=gr.File(label="Upload PDF or Image"),
|
| 122 |
outputs=gr.JSON(label="Extracted JSON"),
|
| 123 |
title="Logistics OCR Data Extractor (GPT-5.1)",
|
| 124 |
+
examples=examples
|
| 125 |
).launch()
|