mlbench123 committed on
Commit
aa6d0d8
·
verified ·
1 Parent(s): 5e37228

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -30
app.py CHANGED
@@ -1,5 +1,4 @@
1
  #!/usr/bin/env python3
2
- # app.py — Logistics OCR Extractor (PDF + Images) with strict ship_from rules
3
 
4
  import base64
5
  import json
@@ -13,7 +12,6 @@ MODEL = "gpt-5.1"
13
  client = OpenAI(api_key=API_KEY)
14
 
15
 
16
- # ----------------------- PDF Upload -----------------------
17
  def upload_pdf(path):
18
  f = client.files.create(
19
  file=open(path, "rb"),
@@ -22,12 +20,10 @@ def upload_pdf(path):
22
  return f.id
23
 
24
 
25
- # ----------------------- Prompt Builder -----------------------
26
  def build_prompt():
27
  return (
28
  "Extract structured JSON from this logistics shipping document. "
29
- "Use only what appears in the PDF/image, never hallucinate. "
30
- "Return strictly valid JSON in this schema:\n\n"
31
  "{\n"
32
  " \"po_number\": string|null,\n"
33
  " \"ship_from\": string|null,\n"
@@ -58,29 +54,26 @@ def build_prompt():
58
 
59
  "SHIP_FROM EXTRACTION RULES (MANDATORY):\n"
60
  "1. If document contains explicit Origin/Ship From labels, extract that value.\n"
61
- "2. If document is an email-based inbound notice and no explicit origin exists, "
62
- "set ship_from = the email 'From:' field.\n"
63
  "3. If both Origin and Mill exist, use Origin.\n"
64
- "4. If only Mill exists AND it is clearly the shipping location, use Mill.\n"
65
- "5. Priority order: Origin → Email From → Mill → Sender company block.\n"
66
- "6. If none apply, ship_from = null.\n\n"
67
-
68
- "Rules for inventories:\n"
69
- "- Do NOT merge different lengths; create a separate variant per length.\n"
70
- "- Extract EXACT numbers shown: packages, pcs_per_pkg, pieces, fbm.\n"
71
- "- total_pcs = sum of all variant pieces.\n"
72
- "- total_fbm = sum of all variant fbm.\n\n"
73
-
74
- "Rules for total_quantity:\n"
75
- "- If the document shows a total PCS value explicitly, use it.\n"
76
- "- If only variants exist, do not compute total_quantity unless the document explicitly states it.\n\n"
77
-
78
- "Parse tables carefully. If a dimension group (like 2x6) appears, use that.\n"
79
- "Return only JSON. No explanations."
80
  )
81
 
82
 
83
- # ----------------------- Extraction Logic -----------------------
84
  def extract(file):
85
  path = Path(file.name)
86
  suffix = path.suffix.lower()
@@ -112,15 +105,15 @@ def extract(file):
112
  return txt[s:e+1]
113
 
114
 
115
- # ----------------------- Gradio UI -----------------------
116
  def ui(file):
117
  return extract(file)
118
 
119
 
120
- # Sample images (optional)
121
- sample_files = [
122
- ("IMG_0001.jpg", "/IMG_0001.jpg"),
123
- ("IMG_0002.jpg", "/IMG_0002.jpg")
 
124
  ]
125
 
126
  gr.Interface(
@@ -128,5 +121,5 @@ gr.Interface(
128
  inputs=gr.File(label="Upload PDF or Image"),
129
  outputs=gr.JSON(label="Extracted JSON"),
130
  title="Logistics OCR Data Extractor (GPT-5.1)",
131
- examples=sample_files
132
  ).launch()
 
1
  #!/usr/bin/env python3
 
2
 
3
  import base64
4
  import json
 
12
  client = OpenAI(api_key=API_KEY)
13
 
14
 
 
15
  def upload_pdf(path):
16
  f = client.files.create(
17
  file=open(path, "rb"),
 
20
  return f.id
21
 
22
 
 
23
  def build_prompt():
24
  return (
25
  "Extract structured JSON from this logistics shipping document. "
26
+ "Return ONLY valid JSON.\n\n"
 
27
  "{\n"
28
  " \"po_number\": string|null,\n"
29
  " \"ship_from\": string|null,\n"
 
54
 
55
  "SHIP_FROM EXTRACTION RULES (MANDATORY):\n"
56
  "1. If document contains explicit Origin/Ship From labels, extract that value.\n"
57
+ "2. If it is an email-based inbound notice and no explicit origin exists, use the email 'From:' field.\n"
 
58
  "3. If both Origin and Mill exist, use Origin.\n"
59
+ "4. If only Mill exists and is clearly the shipping point, use Mill.\n"
60
+ "5. Priority: Origin → Email From → Mill → Sender block.\n"
61
+ "6. If nothing matches, ship_from = null.\n\n"
62
+
63
+ "Inventory rules:\n"
64
+ "- Never merge different lengths.\n"
65
+ "- Extract packages, pcs_per_pkg, pieces, fbm EXACTLY.\n"
66
+ "- total_pcs = sum of variant pieces.\n"
67
+ "- total_fbm = sum of variant fbm.\n\n"
68
+
69
+ "total_quantity rules:\n"
70
+ "- Use the explicit total if shown.\n"
71
+ "- If not explicitly shown, do not infer.\n\n"
72
+
73
+ "Return ONLY the JSON."
 
74
  )
75
 
76
 
 
77
  def extract(file):
78
  path = Path(file.name)
79
  suffix = path.suffix.lower()
 
105
  return txt[s:e+1]
106
 
107
 
 
108
  def ui(file):
109
  return extract(file)
110
 
111
 
112
+ # EXAMPLES must be ONLY filenames,
113
+ # and these files must exist in the root directory of the Space.
114
+ examples = [
115
+ "IMG_0001.jpg",
116
+ "IMG_0002.jpg"
117
  ]
118
 
119
  gr.Interface(
 
121
  inputs=gr.File(label="Upload PDF or Image"),
122
  outputs=gr.JSON(label="Extracted JSON"),
123
  title="Logistics OCR Data Extractor (GPT-5.1)",
124
+ examples=examples
125
  ).launch()