Update app.py
Browse files
app.py
CHANGED
|
@@ -9,31 +9,11 @@ MODEL = "gpt-5.1"
|
|
| 9 |
|
| 10 |
client = OpenAI(api_key=API_KEY)
|
| 11 |
|
| 12 |
-
SAMPLE_DIR = Path("samples")
|
| 13 |
-
SAMPLES = {
|
| 14 |
-
"None": None,
|
| 15 |
-
"Sample 1 - IMG_0001.jpg": SAMPLE_DIR / "IMG_0001.jpg",
|
| 16 |
-
"Sample 2 - IMG_0002.jpg": SAMPLE_DIR / "IMG_0002.jpg"
|
| 17 |
-
}
|
| 18 |
|
| 19 |
-
|
| 20 |
-
# ------------------ Upload PDF ------------------
|
| 21 |
-
def upload_pdf(path):
|
| 22 |
-
f = client.files.create(
|
| 23 |
-
file=open(path, "rb"),
|
| 24 |
-
purpose="assistants"
|
| 25 |
-
)
|
| 26 |
-
return f.id
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
# ------------------ Prompt ---------------------
|
| 30 |
def build_prompt():
|
| 31 |
return (
|
| 32 |
-
"You are an
|
| 33 |
-
"
|
| 34 |
-
"extract ONLY the information that is explicitly present.\n\n"
|
| 35 |
-
|
| 36 |
-
"You must return STRICT JSON in the EXACT structure below:\n"
|
| 37 |
"{\n"
|
| 38 |
" \"po_number\": string|null,\n"
|
| 39 |
" \"ship_from\": string|null,\n"
|
|
@@ -41,98 +21,76 @@ def build_prompt():
|
|
| 41 |
" \"rail_car_number\": string|null,\n"
|
| 42 |
" \"total_quantity\": number|null,\n"
|
| 43 |
" \"inventories\": [\n"
|
| 44 |
-
"
|
| 45 |
-
"
|
| 46 |
-
"
|
| 47 |
-
"
|
| 48 |
-
"
|
| 49 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
" ],\n"
|
| 51 |
" \"custom_fields\": {}\n"
|
| 52 |
-
"}\n
|
| 53 |
-
|
| 54 |
-
"YOUR RULES (MUST FOLLOW EXACTLY):\n"
|
| 55 |
-
"1. Do NOT guess or hallucinate. Only extract values explicitly shown in the document.\n"
|
| 56 |
-
"2. If a field is not present or cannot be confirmed → output null.\n"
|
| 57 |
-
"3. PO Number may appear under wording like 'PO', 'Purchase Order', 'P.O.', 'Customer PO', etc.\n"
|
| 58 |
-
"4. Ship From may appear as 'Origin', 'From', 'Exporter', 'Ship From', 'Supplier', etc.\n"
|
| 59 |
-
"5. Carrier Type may appear as 'Carrier', 'Carrier Type', 'Routing', 'Mode', 'Transport Type', "
|
| 60 |
-
"'RAIL', 'TRUCK', 'CN', 'BNSF', 'CP', 'Truckload', etc.\n"
|
| 61 |
-
"6. Rail Car Number may appear as 'Railcar', 'Rail Car #', 'Car Number', 'Car #', etc.\n"
|
| 62 |
-
"7. Total Quantity must be ONLY the explicit total PCS/pieces count if it appears. "
|
| 63 |
-
"If the only total shown is FBM/weight/volume → DO NOT treat that as quantity.\n"
|
| 64 |
-
"8. Inventories must capture every unique product line that appears. Extract product name, "
|
| 65 |
-
"item description, dimensions like '2x4', '23/32', and PCS when available.\n"
|
| 66 |
-
"9. Dimensions may appear as '2 X 4', '2x6', '48x96', '23/32', etc. Normalize to a single "
|
| 67 |
-
"string representation.\n"
|
| 68 |
-
"10. custom_fields must contain ANY additional fields not part of the main schema (dates, mills, "
|
| 69 |
-
"FBM, weights, routing codes, package counts, etc.). Key names must be lower_snake_case.\n"
|
| 70 |
-
"11. JSON MUST be valid, must not include comments, and must not include text outside the JSON object.\n\n"
|
| 71 |
-
|
| 72 |
-
"ADDITIONAL RULES FOR COMPLEX TABLES:\n"
|
| 73 |
-
"- If multiple product variants exist, create multiple inventory objects.\n"
|
| 74 |
-
"- If tables list PCS per package × number of packages, you MAY compute total PCS.\n"
|
| 75 |
-
"- Never compute derived values unless the math is explicitly possible.\n"
|
| 76 |
-
"- If a value is ambiguous, set it to null.\n\n"
|
| 77 |
-
|
| 78 |
-
"Final requirement: Return ONLY the JSON object. No explanation, no markdown.\n"
|
| 79 |
)
|
| 80 |
|
| 81 |
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
| 84 |
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
-
if
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
{"type": "text", "text":
|
| 91 |
-
{"type": "file", "file": {"file_id":
|
| 92 |
]
|
| 93 |
-
|
| 94 |
else:
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
{"type": "text", "text": build_prompt()},
|
| 101 |
-
{"type": "image_url", "image_url": {"url": f"data:image/{mime};base64,{b64}"}}
|
| 102 |
]
|
| 103 |
|
| 104 |
-
|
| 105 |
model=MODEL,
|
| 106 |
-
messages=[{"role": "user", "content":
|
| 107 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
-
out = response.choices[0].message.content
|
| 110 |
-
s = out.find("{")
|
| 111 |
-
e = out.rfind("}")
|
| 112 |
-
|
| 113 |
-
return out[s:e+1]
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
# ------------------ UI Logic ------------------
|
| 117 |
-
def run_extraction(uploaded_file, sample_name):
|
| 118 |
-
|
| 119 |
-
if uploaded_file:
|
| 120 |
-
return extract_from_path(Path(uploaded_file.name))
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
|
|
|
|
|
|
| 124 |
|
| 125 |
-
|
|
|
|
| 126 |
|
| 127 |
|
| 128 |
-
# ------------------ Gradio Interface ------------------
|
| 129 |
gr.Interface(
|
| 130 |
-
fn=
|
| 131 |
-
inputs=
|
| 132 |
-
gr.File(label="Upload PDF or Image"),
|
| 133 |
-
gr.Dropdown(list(SAMPLES.keys()), value="None", label="Or choose a sample")
|
| 134 |
-
],
|
| 135 |
outputs=gr.JSON(label="Extracted JSON"),
|
| 136 |
-
title="Logistics OCR
|
| 137 |
-
|
| 138 |
).launch()
|
|
|
|
| 9 |
|
| 10 |
# Module-level OpenAI client, constructed once from API_KEY and reused for
# the file upload and chat-completion calls made further down in this file.
client = OpenAI(api_key=API_KEY)
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
def build_prompt():
|
| 14 |
return (
|
| 15 |
+
"You are an extraction system. Extract ONLY explicit data. No guessing.\n\n"
|
| 16 |
+
"Return JSON strictly:\n"
|
|
|
|
|
|
|
|
|
|
| 17 |
"{\n"
|
| 18 |
" \"po_number\": string|null,\n"
|
| 19 |
" \"ship_from\": string|null,\n"
|
|
|
|
| 21 |
" \"rail_car_number\": string|null,\n"
|
| 22 |
" \"total_quantity\": number|null,\n"
|
| 23 |
" \"inventories\": [\n"
|
| 24 |
+
" {\n"
|
| 25 |
+
" \"productName\": string,\n"
|
| 26 |
+
" \"productCode\": string|null,\n"
|
| 27 |
+
" \"variants\": [\n"
|
| 28 |
+
" {\n"
|
| 29 |
+
" \"dimensions\": string|null,\n"
|
| 30 |
+
" \"pcs_per_pkg\": number|null,\n"
|
| 31 |
+
" \"length_ft\": number|null,\n"
|
| 32 |
+
" \"width\": number|null,\n"
|
| 33 |
+
" \"packages\": number|null,\n"
|
| 34 |
+
" \"pieces\": number|null,\n"
|
| 35 |
+
" \"fbm\": number|null\n"
|
| 36 |
+
" }\n"
|
| 37 |
+
" ],\n"
|
| 38 |
+
" \"total_pcs\": number|null,\n"
|
| 39 |
+
" \"total_fbm\": number|null\n"
|
| 40 |
+
" }\n"
|
| 41 |
" ],\n"
|
| 42 |
" \"custom_fields\": {}\n"
|
| 43 |
+
"}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
)
|
| 45 |
|
| 46 |
|
| 47 |
+
def upload_pdf(p):
    """Upload the file at *p* through the OpenAI Files API and return its id.

    Args:
        p: Path (``str`` or ``os.PathLike``) of the file to upload.

    Returns:
        The ``id`` attribute of the object returned by ``client.files.create``.

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    # Use a context manager so the handle is closed even when the upload
    # raises; the previous bare ``open(...)`` leaked the descriptor.
    with open(p, "rb") as fh:
        uploaded = client.files.create(file=fh, purpose="assistants")
    return uploaded.id
|
| 50 |
+
|
| 51 |
|
| 52 |
+
def _image_data_url(path):
    """Return a base64 ``data:`` URL for the image file at *path*."""
    # Local import: the file's import header is outside this block.
    import mimetypes

    ext = path.suffix.lower()
    # Resolve the registered MIME type (".jpg" -> "image/jpeg") instead of
    # pasting the raw extension, which produced the unregistered "image/jpg".
    mime, _ = mimetypes.guess_type(f"file{ext}")
    if mime is None or not mime.startswith("image/"):
        # Unknown extension: keep the previous "image/<ext>" guess.
        mime = f"image/{ext[1:]}"
    b64 = base64.b64encode(path.read_bytes()).decode("ascii")
    return f"data:{mime};base64,{b64}"


def _parse_json_object(raw):
    """Parse the outermost ``{...}`` span of *raw* and return the result."""
    if not raw:
        raise ValueError("Model returned an empty response; no JSON to parse.")
    start = raw.find("{")
    end = raw.rfind("}")
    # ``find``/``rfind`` return -1 when the brace is missing; slicing with
    # those values would silently yield a garbage substring.
    if start == -1 or end < start:
        raise ValueError("Model response did not contain a JSON object.")
    return json.loads(raw[start:end + 1])


def extract(file):
    """Send an uploaded PDF or image to the model and return the parsed JSON.

    PDFs (``.pdf`` suffix, case-insensitive) are uploaded via ``upload_pdf``
    and referenced by file id; every other file is inlined as a base64 image
    data URL. The prompt text comes from ``build_prompt()``.

    Args:
        file: An object exposing the file path as ``.name``, or a plain path
            (``str`` / ``os.PathLike``).

    Returns:
        The value decoded by ``json.loads`` from the first ``{`` through the
        last ``}`` of the model's reply.

    Raises:
        ValueError: If *file* is ``None``, the reply is empty, contains no
            JSON object, or the JSON is malformed (``json.JSONDecodeError``
            is a ``ValueError`` subclass).
    """
    if file is None:
        raise ValueError("No file provided; upload a PDF or image first.")

    # Accept both objects carrying ``.name`` and bare path values.
    path = Path(getattr(file, "name", file))
    prompt = build_prompt()

    if path.suffix.lower() == ".pdf":
        attachment = {"type": "file", "file": {"file_id": upload_pdf(path)}}
    else:
        attachment = {
            "type": "image_url",
            "image_url": {"url": _image_data_url(path)},
        }

    msg = [{"type": "text", "text": prompt}, attachment]
    r = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": msg}],
    )
    return _parse_json_object(r.choices[0].message.content)
|
| 79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
+
# (display name, relative path) pairs for the bundled sample images.
sample_files = [
    ("IMG_0001.jpg", "samples/IMG_0001.jpg"),
    ("IMG_0002.jpg", "samples/IMG_0002.jpg"),
]
|
| 85 |
|
| 86 |
+
def ui(file):
    """Interface callback: run ``extract`` on *file* and return its result."""
    return extract(file)
|
| 88 |
|
| 89 |
|
|
|
|
| 90 |
# Build the single-input interface (file in, JSON out) and start serving it.
# The example entries are the path component of each ``sample_files`` pair.
gr.Interface(
    fn=ui,
    inputs=gr.File(label="Upload PDF or Image"),
    outputs=gr.JSON(label="Extracted JSON"),
    title="Logistics OCR Extraction",
    examples=[f for _, f in sample_files],
).launch()
|