mlbench123 committed on
Commit
9f9fb74
·
verified ·
1 Parent(s): e135d87

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -19
app.py CHANGED
@@ -11,9 +11,13 @@ MODEL = "gpt-5.1"
11
 
12
  client = OpenAI(api_key=API_KEY)
13
 
 
 
14
  def upload_pdf(path):
15
  return client.files.create(file=open(path, "rb"), purpose="assistants").id
16
 
 
 
17
  def prompt():
18
  return (
19
  "Extract structured JSON from the attached logistics document. Return ONLY valid JSON.\n"
@@ -45,34 +49,37 @@ def prompt():
45
  " ],\n"
46
  " \"custom_fields\": {}\n"
47
  "}\n\n"
 
48
  "SHIP FROM RULES:\n"
49
  "- If explicit fields like 'Origin', 'Ship From' exist, extract that value.\n"
50
- "- If the document is an email-style inbound notice (header block) and shows:\n"
51
- " From: Name <email>\n"
52
  " then ship_from_name = Name, ship_from_email = email.\n"
53
- "- If only an email exists and no human name, set both fields to that email.\n"
54
- "- If both Origin and an email sender exist, use Origin for ship_from_name and still capture the email under ship_from_email.\n"
55
  "- Priority: Origin → Email Name → Mill → Sender block → null.\n\n"
 
56
  "CARRIER / EQUIPMENT RULE:\n"
57
- "- If the table contains:\n"
58
- " Equipment id = <value>\n"
59
- " Mark = <value>\n"
60
- " then ALWAYS treat 'Equipment id' as the railcar number.\n"
61
- "- NEVER use 'Mark' as railcar number.\n"
62
- "- Carrier type must match the carrier text exactly (e.g., CHICAGO RAIL LINK).\n\n"
63
  "INVENTORY RULES:\n"
64
- "- Do not merge length groups. Each unique length or dimension is its own variant.\n"
65
- "- Extract pcs_per_pkg, packages, pieces, fbm exactly as written.\n"
66
  "- total_pcs = sum of pieces.\n"
67
  "- total_fbm = sum of fbm.\n\n"
 
68
  "TOTAL QUANTITY RULE:\n"
69
- "- Use explicit totals if they appear.\n"
70
- "- If no explicit total quantity appears, leave null.\n\n"
71
  "CUSTOM FIELDS RULE:\n"
72
- "- Capture all meaningful leftover fields not part of main schema.\n\n"
 
73
  "Return ONLY the JSON."
74
  )
75
 
 
 
76
  def extract(file):
77
  path = Path(file.name)
78
  suffix = path.suffix.lower()
@@ -87,7 +94,10 @@ def extract(file):
87
  b64 = base64.b64encode(path.read_bytes()).decode()
88
  content = [
89
  {"type": "text", "text": prompt()},
90
- {"type": "image_url", "image_url": {"url": f"data:image/{suffix[1:]};base64,{b64}"}}
 
 
 
91
  ]
92
 
93
  r = client.chat.completions.create(
@@ -100,18 +110,24 @@ def extract(file):
100
  e = text.rfind("}")
101
  return text[s:e+1]
102
 
 
103
  def ui(file):
104
  return extract(file)
105
 
 
 
106
  examples = [
107
- "IMG_0001.jpg",
108
- "IMG_0002.jpg"
109
  ]
110
 
 
 
111
  gr.Interface(
112
  fn=ui,
113
  inputs=gr.File(label="Upload PDF or Image"),
114
  outputs=gr.JSON(label="Extracted JSON"),
115
  title="Logistics OCR Data Extractor (GPT-5.1)",
116
- examples=examples
 
117
  ).launch(share=True)
 
11
 
12
  client = OpenAI(api_key=API_KEY)
13
 
14
+
15
+ # ---------------- PDF Upload ----------------
16
  def upload_pdf(path):
17
  return client.files.create(file=open(path, "rb"), purpose="assistants").id
18
 
19
+
20
+ # ---------------- Prompt ----------------
21
  def prompt():
22
  return (
23
  "Extract structured JSON from the attached logistics document. Return ONLY valid JSON.\n"
 
49
  " ],\n"
50
  " \"custom_fields\": {}\n"
51
  "}\n\n"
52
+
53
  "SHIP FROM RULES:\n"
54
  "- If explicit fields like 'Origin', 'Ship From' exist, extract that value.\n"
55
+ "- If email header block exists:\n"
56
+ " From: Name <email>\n"
57
  " then ship_from_name = Name, ship_from_email = email.\n"
58
+ "- If only email exists, set both fields to email.\n"
59
+ "- If both Origin and email sender exist, name = Origin and email under custom_fields.\n"
60
  "- Priority: Origin → Email Name → Mill → Sender block → null.\n\n"
61
+
62
  "CARRIER / EQUIPMENT RULE:\n"
63
+ "- If table shows 'Equipment id = X' and 'Mark = Y', then X = rail_car_number.\n"
64
+ "- 'Mark' must never be used as the railcar number.\n\n"
65
+
 
 
 
66
  "INVENTORY RULES:\n"
67
+ "- Each dimension group must remain separate.\n"
68
+ "- pieces_per_pkg, packages, pieces, fbm must be exact.\n"
69
  "- total_pcs = sum of pieces.\n"
70
  "- total_fbm = sum of fbm.\n\n"
71
+
72
  "TOTAL QUANTITY RULE:\n"
73
+ "- Use explicit totals. If no total, leave null.\n\n"
74
+
75
  "CUSTOM FIELDS RULE:\n"
76
+ "- Capture leftover meaningful text.\n\n"
77
+
78
  "Return ONLY the JSON."
79
  )
80
 
81
+
82
+ # ---------------- Extraction ----------------
83
  def extract(file):
84
  path = Path(file.name)
85
  suffix = path.suffix.lower()
 
94
  b64 = base64.b64encode(path.read_bytes()).decode()
95
  content = [
96
  {"type": "text", "text": prompt()},
97
+ {
98
+ "type": "image_url",
99
+ "image_url": {"url": f"data:image/{suffix[1:]};base64,{b64}"}
100
+ }
101
  ]
102
 
103
  r = client.chat.completions.create(
 
110
  e = text.rfind("}")
111
  return text[s:e+1]
112
 
113
+
114
  def ui(file):
115
  return extract(file)
116
 
117
+
118
+ # ---------------- Sample Images (Preview Enabled) ----------------
119
  examples = [
120
+ ["IMG_0001.jpg"],
121
+ ["IMG_0002.jpg"]
122
  ]
123
 
124
+
125
+ # ---------------- Gradio App ----------------
126
  gr.Interface(
127
  fn=ui,
128
  inputs=gr.File(label="Upload PDF or Image"),
129
  outputs=gr.JSON(label="Extracted JSON"),
130
  title="Logistics OCR Data Extractor (GPT-5.1)",
131
+ examples=examples,
132
+ examples_per_page=2
133
  ).launch(share=True)