Liori25 commited on
Commit
4a6ebfb
·
verified ·
1 Parent(s): 06e7044

Upload IO_Pipeline.ipynb

Browse files
Files changed (1) hide show
  1. IO_Pipeline.ipynb +639 -0
IO_Pipeline.ipynb ADDED
@@ -0,0 +1,639 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "markdown",
19
+ "source": [
20
+ "# Part 4: Input-Output Pipeline"
21
+ ],
22
+ "metadata": {
23
+ "id": "JyoRTpDES8Tq"
24
+ }
25
+ },
26
+ {
27
+ "cell_type": "markdown",
28
+ "source": [
29
+ "- Input: Image of a handwritten recipe\n",
30
+ "- Output: Text of the recipe"
31
+ ],
32
+ "metadata": {
33
+ "id": "-Ms7ezZJTepY"
34
+ }
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "source": [
39
+ "from google.colab import files\n",
40
+ "\n",
41
+ "print(\"Please upload 'RecipeData_10K.csv' from your computer:\")\n",
42
+ "uploaded = files.upload()"
43
+ ],
44
+ "metadata": {
45
+ "colab": {
46
+ "base_uri": "https://localhost:8080/",
47
+ "height": 88
48
+ },
49
+ "id": "CfK_Cy_fUFnK",
50
+ "outputId": "b73eaa28-ad59-4326-c089-28e251ef16a5"
51
+ },
52
+ "execution_count": 4,
53
+ "outputs": [
54
+ {
55
+ "output_type": "stream",
56
+ "name": "stdout",
57
+ "text": [
58
+ "Please upload 'RecipeData_10K.csv' from your computer:\n"
59
+ ]
60
+ },
61
+ {
62
+ "output_type": "display_data",
63
+ "data": {
64
+ "text/plain": [
65
+ "<IPython.core.display.HTML object>"
66
+ ],
67
+ "text/html": [
68
+ "\n",
69
+ " <input type=\"file\" id=\"files-1101386e-69b8-4d66-b4de-58e3de6dcab7\" name=\"files[]\" multiple disabled\n",
70
+ " style=\"border:none\" />\n",
71
+ " <output id=\"result-1101386e-69b8-4d66-b4de-58e3de6dcab7\">\n",
72
+ " Upload widget is only available when the cell has been executed in the\n",
73
+ " current browser session. Please rerun this cell to enable.\n",
74
+ " </output>\n",
75
+ " <script>// Copyright 2017 Google LLC\n",
76
+ "//\n",
77
+ "// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
78
+ "// you may not use this file except in compliance with the License.\n",
79
+ "// You may obtain a copy of the License at\n",
80
+ "//\n",
81
+ "// http://www.apache.org/licenses/LICENSE-2.0\n",
82
+ "//\n",
83
+ "// Unless required by applicable law or agreed to in writing, software\n",
84
+ "// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
85
+ "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
86
+ "// See the License for the specific language governing permissions and\n",
87
+ "// limitations under the License.\n",
88
+ "\n",
89
+ "/**\n",
90
+ " * @fileoverview Helpers for google.colab Python module.\n",
91
+ " */\n",
92
+ "(function(scope) {\n",
93
+ "function span(text, styleAttributes = {}) {\n",
94
+ " const element = document.createElement('span');\n",
95
+ " element.textContent = text;\n",
96
+ " for (const key of Object.keys(styleAttributes)) {\n",
97
+ " element.style[key] = styleAttributes[key];\n",
98
+ " }\n",
99
+ " return element;\n",
100
+ "}\n",
101
+ "\n",
102
+ "// Max number of bytes which will be uploaded at a time.\n",
103
+ "const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
104
+ "\n",
105
+ "function _uploadFiles(inputId, outputId) {\n",
106
+ " const steps = uploadFilesStep(inputId, outputId);\n",
107
+ " const outputElement = document.getElementById(outputId);\n",
108
+ " // Cache steps on the outputElement to make it available for the next call\n",
109
+ " // to uploadFilesContinue from Python.\n",
110
+ " outputElement.steps = steps;\n",
111
+ "\n",
112
+ " return _uploadFilesContinue(outputId);\n",
113
+ "}\n",
114
+ "\n",
115
+ "// This is roughly an async generator (not supported in the browser yet),\n",
116
+ "// where there are multiple asynchronous steps and the Python side is going\n",
117
+ "// to poll for completion of each step.\n",
118
+ "// This uses a Promise to block the python side on completion of each step,\n",
119
+ "// then passes the result of the previous step as the input to the next step.\n",
120
+ "function _uploadFilesContinue(outputId) {\n",
121
+ " const outputElement = document.getElementById(outputId);\n",
122
+ " const steps = outputElement.steps;\n",
123
+ "\n",
124
+ " const next = steps.next(outputElement.lastPromiseValue);\n",
125
+ " return Promise.resolve(next.value.promise).then((value) => {\n",
126
+ " // Cache the last promise value to make it available to the next\n",
127
+ " // step of the generator.\n",
128
+ " outputElement.lastPromiseValue = value;\n",
129
+ " return next.value.response;\n",
130
+ " });\n",
131
+ "}\n",
132
+ "\n",
133
+ "/**\n",
134
+ " * Generator function which is called between each async step of the upload\n",
135
+ " * process.\n",
136
+ " * @param {string} inputId Element ID of the input file picker element.\n",
137
+ " * @param {string} outputId Element ID of the output display.\n",
138
+ " * @return {!Iterable<!Object>} Iterable of next steps.\n",
139
+ " */\n",
140
+ "function* uploadFilesStep(inputId, outputId) {\n",
141
+ " const inputElement = document.getElementById(inputId);\n",
142
+ " inputElement.disabled = false;\n",
143
+ "\n",
144
+ " const outputElement = document.getElementById(outputId);\n",
145
+ " outputElement.innerHTML = '';\n",
146
+ "\n",
147
+ " const pickedPromise = new Promise((resolve) => {\n",
148
+ " inputElement.addEventListener('change', (e) => {\n",
149
+ " resolve(e.target.files);\n",
150
+ " });\n",
151
+ " });\n",
152
+ "\n",
153
+ " const cancel = document.createElement('button');\n",
154
+ " inputElement.parentElement.appendChild(cancel);\n",
155
+ " cancel.textContent = 'Cancel upload';\n",
156
+ " const cancelPromise = new Promise((resolve) => {\n",
157
+ " cancel.onclick = () => {\n",
158
+ " resolve(null);\n",
159
+ " };\n",
160
+ " });\n",
161
+ "\n",
162
+ " // Wait for the user to pick the files.\n",
163
+ " const files = yield {\n",
164
+ " promise: Promise.race([pickedPromise, cancelPromise]),\n",
165
+ " response: {\n",
166
+ " action: 'starting',\n",
167
+ " }\n",
168
+ " };\n",
169
+ "\n",
170
+ " cancel.remove();\n",
171
+ "\n",
172
+ " // Disable the input element since further picks are not allowed.\n",
173
+ " inputElement.disabled = true;\n",
174
+ "\n",
175
+ " if (!files) {\n",
176
+ " return {\n",
177
+ " response: {\n",
178
+ " action: 'complete',\n",
179
+ " }\n",
180
+ " };\n",
181
+ " }\n",
182
+ "\n",
183
+ " for (const file of files) {\n",
184
+ " const li = document.createElement('li');\n",
185
+ " li.append(span(file.name, {fontWeight: 'bold'}));\n",
186
+ " li.append(span(\n",
187
+ " `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
188
+ " `last modified: ${\n",
189
+ " file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
190
+ " 'n/a'} - `));\n",
191
+ " const percent = span('0% done');\n",
192
+ " li.appendChild(percent);\n",
193
+ "\n",
194
+ " outputElement.appendChild(li);\n",
195
+ "\n",
196
+ " const fileDataPromise = new Promise((resolve) => {\n",
197
+ " const reader = new FileReader();\n",
198
+ " reader.onload = (e) => {\n",
199
+ " resolve(e.target.result);\n",
200
+ " };\n",
201
+ " reader.readAsArrayBuffer(file);\n",
202
+ " });\n",
203
+ " // Wait for the data to be ready.\n",
204
+ " let fileData = yield {\n",
205
+ " promise: fileDataPromise,\n",
206
+ " response: {\n",
207
+ " action: 'continue',\n",
208
+ " }\n",
209
+ " };\n",
210
+ "\n",
211
+ " // Use a chunked sending to avoid message size limits. See b/62115660.\n",
212
+ " let position = 0;\n",
213
+ " do {\n",
214
+ " const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
215
+ " const chunk = new Uint8Array(fileData, position, length);\n",
216
+ " position += length;\n",
217
+ "\n",
218
+ " const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
219
+ " yield {\n",
220
+ " response: {\n",
221
+ " action: 'append',\n",
222
+ " file: file.name,\n",
223
+ " data: base64,\n",
224
+ " },\n",
225
+ " };\n",
226
+ "\n",
227
+ " let percentDone = fileData.byteLength === 0 ?\n",
228
+ " 100 :\n",
229
+ " Math.round((position / fileData.byteLength) * 100);\n",
230
+ " percent.textContent = `${percentDone}% done`;\n",
231
+ "\n",
232
+ " } while (position < fileData.byteLength);\n",
233
+ " }\n",
234
+ "\n",
235
+ " // All done.\n",
236
+ " yield {\n",
237
+ " response: {\n",
238
+ " action: 'complete',\n",
239
+ " }\n",
240
+ " };\n",
241
+ "}\n",
242
+ "\n",
243
+ "scope.google = scope.google || {};\n",
244
+ "scope.google.colab = scope.google.colab || {};\n",
245
+ "scope.google.colab._files = {\n",
246
+ " _uploadFiles,\n",
247
+ " _uploadFilesContinue,\n",
248
+ "};\n",
249
+ "})(self);\n",
250
+ "</script> "
251
+ ]
252
+ },
253
+ "metadata": {}
254
+ },
255
+ {
256
+ "output_type": "stream",
257
+ "name": "stdout",
258
+ "text": [
259
+ "Saving Recipe.jfif to Recipe.jfif\n"
260
+ ]
261
+ }
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "markdown",
266
+ "source": [
267
+ "\n",
268
+ "\n",
269
+ "---\n",
270
+ "\n"
271
+ ],
272
+ "metadata": {
273
+ "id": "UoYUP6WTUmpc"
274
+ }
275
+ },
276
+ {
277
+ "cell_type": "markdown",
278
+ "source": [
279
+ "## OLD VERSION\n",
280
+ "To emphasize my process throughout the paper, I kept this part, which I eventually won't be using because the model used (\"TrOCRProcessor\") didn't achieve good results.\n",
281
+ "\n",
282
+ "You may skip this part to see the final IO pipeline in the next part."
283
+ ],
284
+ "metadata": {
285
+ "id": "hq0kcSzjS6Tr"
286
+ }
287
+ },
288
+ {
289
+ "cell_type": "code",
290
+ "source": [
291
+ "from transformers import TrOCRProcessor, VisionEncoderDecoderModel\n",
292
+ "from PIL import Image\n",
293
+ "import torch\n",
294
+ "import numpy as np\n",
295
+ "import os # Import os module to use os.path.join"
296
+ ],
297
+ "metadata": {
298
+ "colab": {
299
+ "base_uri": "https://localhost:8080/"
300
+ },
301
+ "id": "AWlqrv7kTBrE",
302
+ "outputId": "fa4af507-d82a-4606-880d-bca5b8ff5bc1"
303
+ },
304
+ "execution_count": 1,
305
+ "outputs": [
306
+ {
307
+ "output_type": "stream",
308
+ "name": "stderr",
309
+ "text": [
310
+ "WARNING:torchao.kernel.intmm:Warning: Detected no triton, on systems without Triton certain kernels will not work\n"
311
+ ]
312
+ }
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": 6,
318
+ "metadata": {
319
+ "colab": {
320
+ "base_uri": "https://localhost:8080/"
321
+ },
322
+ "id": "EDaLqbvsSqvq",
323
+ "outputId": "55a5db5f-00d3-4396-8ea1-3e9bfbbecbbd"
324
+ },
325
+ "outputs": [
326
+ {
327
+ "output_type": "stream",
328
+ "name": "stderr",
329
+ "text": [
330
+ "Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-large-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']\n",
331
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
332
+ ]
333
+ },
334
+ {
335
+ "output_type": "stream",
336
+ "name": "stdout",
337
+ "text": [
338
+ "📄 Scanning Recipe.jfif...\n",
339
+ "\n",
340
+ "🤖 FULL DIGITIZED RECIPE:\n",
341
+ "==============================\n",
342
+ "1903\n",
343
+ "0 0\n",
344
+ "1930 1932\n",
345
+ "0 0\n",
346
+ "==============================\n"
347
+ ]
348
+ }
349
+ ],
350
+ "source": [
351
+ "# 1. SETUP\n",
352
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
353
+ "processor = TrOCRProcessor.from_pretrained(\"microsoft/trocr-large-handwritten\")\n",
354
+ "model = VisionEncoderDecoderModel.from_pretrained(\"microsoft/trocr-large-handwritten\").to(device)\n",
355
+ "\n",
356
+ "def scan_recipe_line_by_line(image_path, line_height=80):\n",
357
+ " \"\"\"\n",
358
+ " Inputs:\n",
359
+ " image_path: path to your 900x1200 image\n",
360
+ " line_height: approximate height of one line of text in pixels\n",
361
+ " \"\"\"\n",
362
+ " full_image = Image.open(image_path).convert(\"RGB\")\n",
363
+ " width, height = full_image.size\n",
364
+ "\n",
365
+ " all_text = []\n",
366
+ "\n",
367
+ " # 2. THE SCANNING LOOP\n",
368
+ " # We move down the image in 'steps' (strips)\n",
369
+ " print(f\"📄 Scanning {os.path.basename(image_path)}...\")\n",
370
+ "\n",
371
+ " for top in range(0, height, line_height):\n",
372
+ " # Define the box for the current line strip\n",
373
+ " bottom = min(top + line_height, height)\n",
374
+ " # (left, top, right, bottom)\n",
375
+ " line_strip = full_image.crop((0, top, width, bottom))\n",
376
+ "\n",
377
+ " # 3. PROCESS THE STRIP\n",
378
+ " # We check if the strip has actual ink (isn't just white paper)\n",
379
+ " if np.array(line_strip).std() < 5: # Skip blank strips\n",
380
+ " continue\n",
381
+ "\n",
382
+ " pixel_values = processor(images=line_strip, return_tensors=\"pt\").pixel_values.to(device)\n",
383
+ "\n",
384
+ " with torch.no_grad():\n",
385
+ " generated_ids = model.generate(pixel_values, max_new_tokens=50)\n",
386
+ "\n",
387
+ " line_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
388
+ "\n",
389
+ " # If the model found text, add it to our list\n",
390
+ " if line_text.strip() and line_text.strip() != \"0\":\n",
391
+ " all_text.append(line_text)\n",
392
+ "\n",
393
+ " # 4. JOIN EVERYTHING\n",
394
+ " return \"\\n\".join(all_text)\n",
395
+ "\n",
396
+ "# --- TEST THE PIPELINE ---\n",
397
+ "test_image = \"/content/Recipe.jfif\"\n",
398
+ "final_recipe = scan_recipe_line_by_line(test_image)\n",
399
+ "\n",
400
+ "print(\"\\n🤖 FULL DIGITIZED RECIPE:\")\n",
401
+ "print(\"=\"*30)\n",
402
+ "print(final_recipe)\n",
403
+ "print(\"=\"*30)"
404
+ ]
405
+ },
406
+ {
407
+ "cell_type": "markdown",
408
+ "source": [
409
+ "\n",
410
+ "\n",
411
+ "---\n",
412
+ "\n"
413
+ ],
414
+ "metadata": {
415
+ "id": "qxTqJLBwUoPS"
416
+ }
417
+ },
418
+ {
419
+ "cell_type": "markdown",
420
+ "source": [
421
+ "### Part 4- 2nd and final version of the IO pipeline"
422
+ ],
423
+ "metadata": {
424
+ "id": "PmEbXIqzTQIz"
425
+ }
426
+ },
427
+ {
428
+ "cell_type": "markdown",
429
+ "source": [
430
+ "We implemented a Serverless Inference Pipeline leveraging the **Qwen2.5-VL Vision-Language Model** hosted on the Hugging Face Inference API. Unlike traditional Document Image Transformer (DiT) approaches that require separate stages for OCR and layout analysis, our solution utilizes an end-to-end generative approach where the model processes raw pixels and directly outputs structured JSON. This architecture offloads heavy computation to cloud-hosted GPUs, allowing the application to digitize complex handwritten recipes efficiently without requiring local hardware acceleration."
431
+ ],
432
+ "metadata": {
433
+ "id": "wdygXOgvTJfK"
434
+ }
435
+ },
436
+ {
437
+ "cell_type": "code",
438
+ "source": [
439
+ "import os\n",
440
+ "import json\n",
441
+ "import base64\n",
442
+ "from PIL import Image\n",
443
+ "import io\n",
444
+ "from huggingface_hub import InferenceClient"
445
+ ],
446
+ "metadata": {
447
+ "id": "ykczbBR4VCNL"
448
+ },
449
+ "execution_count": 7,
450
+ "outputs": []
451
+ },
452
+ {
453
+ "cell_type": "code",
454
+ "source": [
455
+ "class RecipeDigitalizerPipeline:\n",
456
+ " def __init__(self):\n",
457
+ " print(\"Connecting to Hugging Face API (Qwen Mode)...\")\n",
458
+ " self.token = os.getenv(\"HF_TOKEN\")\n",
459
+ "\n",
460
+ " # --- WE ARE STICKING TO QWEN ---\n",
461
+ " # If 2.5 gives you trouble, you can try \"Qwen/Qwen2-VL-7B-Instruct\"\n",
462
+ " self.model_id = \"Qwen/Qwen2.5-VL-7B-Instruct\"\n",
463
+ "\n",
464
+ " self.client = InferenceClient(token=self.token)\n",
465
+ "\n",
466
+ " def compress_image(self, image_path):\n",
467
+ " \"\"\"\n",
468
+ " Resizes the image so it doesn't crash the Free API.\n",
469
+ " \"\"\"\n",
470
+ " with Image.open(image_path) as img:\n",
471
+ " if img.mode != 'RGB':\n",
472
+ " img = img.convert('RGB')\n",
473
+ "\n",
474
+ " # Resize: Free API often rejects images larger than 1024x1024\n",
475
+ " max_size = 1024\n",
476
+ " if max(img.size) > max_size:\n",
477
+ " img.thumbnail((max_size, max_size))\n",
478
+ "\n",
479
+ " # Save to memory as JPEG\n",
480
+ " buffer = io.BytesIO()\n",
481
+ " img.save(buffer, format=\"JPEG\", quality=70) # Quality 70 is enough for text\n",
482
+ "\n",
483
+ " # Convert to Base64\n",
484
+ " encoded_string = base64.b64encode(buffer.getvalue()).decode('utf-8')\n",
485
+ " return f\"data:image/jpeg;base64,{encoded_string}\"\n",
486
+ "\n",
487
+ " def run_pipeline(self, image_path):\n",
488
+ " prompt = \"\"\"Extract the recipe from this image.\n",
489
+ " Output strictly valid JSON with keys: title, ingredients (list), instructions (list), cuisine_type, difficulty.\n",
490
+ " Do not include markdown formatting like ```json, just the raw JSON.\"\"\"\n",
491
+ "\n",
492
+ " try:\n",
493
+ " # 1. Compress Image (Solves 400 Bad Request)\n",
494
+ " image_url = self.compress_image(image_path)\n",
495
+ "\n",
496
+ " # 2. Call Qwen API\n",
497
+ " response = self.client.chat.completions.create(\n",
498
+ " model=self.model_id,\n",
499
+ " messages=[\n",
500
+ " {\n",
501
+ " \"role\": \"user\",\n",
502
+ " \"content\": [\n",
503
+ " {\n",
504
+ " \"type\": \"image_url\",\n",
505
+ " \"image_url\": {\"url\": image_url}\n",
506
+ " },\n",
507
+ " {\"type\": \"text\", \"text\": prompt}\n",
508
+ " ]\n",
509
+ " }\n",
510
+ " ],\n",
511
+ " max_tokens=1024\n",
512
+ " )\n",
513
+ "\n",
514
+ " # 3. Clean Output\n",
515
+ " raw_text = response.choices[0].message.content\n",
516
+ " clean_json = raw_text.replace(\"```json\", \"\").replace(\"```\", \"\").strip()\n",
517
+ "\n",
518
+ " # Extra safety: Find the first { and last }\n",
519
+ " start = clean_json.find('{')\n",
520
+ " end = clean_json.rfind('}') + 1\n",
521
+ " if start != -1 and end != -1:\n",
522
+ " clean_json = clean_json[start:end]\n",
523
+ "\n",
524
+ " return json.loads(clean_json)\n",
525
+ "\n",
526
+ " except Exception as e:\n",
527
+ " return {\"error\": f\"Qwen API Error: {str(e)}\"}"
528
+ ],
529
+ "metadata": {
530
+ "id": "I0XOgMjETSXw"
531
+ },
532
+ "execution_count": 8,
533
+ "outputs": []
534
+ },
535
+ {
536
+ "cell_type": "code",
537
+ "source": [
538
+ "# --- PART 4: EXECUTION EXAMPLE ---\n",
539
+ "\n",
540
+ "if __name__ == \"__main__\":\n",
541
+ " import os\n",
542
+ "\n",
543
+ " # 1. AUTHENTICATION FIX\n",
544
+ " try:\n",
545
+ " from google.colab import userdata\n",
546
+ " # Get the secret named \"HF_TOKEN\"\n",
547
+ " hf1_secret = userdata.get('HF_TOKEN')\n",
548
+ "\n",
549
+ " # Inject it into the environment as 'HF_TOKEN' so the Pipeline class can find it\n",
550
+ " os.environ[\"HF_TOKEN\"] = hf1_secret\n",
551
+ " print(f\"✅ Successfully loaded token from secret HF_TOKEN\")\n",
552
+ "\n",
553
+ " except Exception as e:\n",
554
+ " print(f\"⚠️ Warning: Could not load secret 'HF_TOKEN'. Make sure the name in the Key icon is exactly 'HF_TOKEN'.\")\n",
555
+ " print(f\"Error details: {e}\")\n",
556
+ "\n",
557
+ " # 2. INITIALIZE PIPELINE\n",
558
+ " # Now this will work because we set os.environ[\"HF_TOKEN\"] above\n",
559
+ " try:\n",
560
+ " app = RecipeDigitalizerPipeline()\n",
561
+ "\n",
562
+ " # 3. USER INPUT\n",
563
+ " user_image = \"/content/Recipe.jfif\"\n",
564
+ "\n",
565
+ " # 4. RUN PIPELINE\n",
566
+ " if os.path.exists(user_image):\n",
567
+ " print(f\"Processing {user_image}...\")\n",
568
+ " ai_output = app.run_pipeline(user_image)\n",
569
+ "\n",
570
+ " # 5. AI OUTPUT\n",
571
+ " print(\"\\n--- FINAL DIGITAL OUTPUT ---\")\n",
572
+ " print(json.dumps(ai_output, indent=4))\n",
573
+ " else:\n",
574
+ " print(f\"❌ Error: Image not found at {user_image}\")\n",
575
+ "\n",
576
+ " except Exception as e:\n",
577
+ " print(f\"❌ Application Error: {e}\")"
578
+ ],
579
+ "metadata": {
580
+ "colab": {
581
+ "base_uri": "https://localhost:8080/"
582
+ },
583
+ "id": "EyXpPQGsTXkd",
584
+ "outputId": "10c5fa31-6731-45ec-b5cc-074d6d534bfc"
585
+ },
586
+ "execution_count": 15,
587
+ "outputs": [
588
+ {
589
+ "output_type": "stream",
590
+ "name": "stdout",
591
+ "text": [
592
+ "✅ Successfully loaded token from secret HF_TOKEN\n",
593
+ "Connecting to Hugging Face API (Qwen Mode)...\n",
594
+ "Processing /content/Recipe.jfif...\n",
595
+ "\n",
596
+ "--- FINAL DIGITAL OUTPUT ---\n",
597
+ "{\n",
598
+ " \"title\": \"Chocolate Chip Cookies\",\n",
599
+ " \"ingredients\": [\n",
600
+ " \"3 cups flour\",\n",
601
+ " \"1 1/2 teaspoons baking soda\",\n",
602
+ " \"1/4 teaspoon salt\",\n",
603
+ " \"1/2 cup soften butter\",\n",
604
+ " \"1/4 cup sugar\",\n",
605
+ " \"1/2 cup brown sugar\",\n",
606
+ " \"3 eggs\",\n",
607
+ " \"2 teaspoons vanilla\",\n",
608
+ " \"2 cups chocolate chips\"\n",
609
+ " ],\n",
610
+ " \"instructions\": [\n",
611
+ " \"Preheat oven to 350\\u00b0 for about 15 minutes or roll out a cookie cake and bake for about 9 minutes.\"\n",
612
+ " ],\n",
613
+ " \"cuisine_type\": \"American\",\n",
614
+ " \"difficulty\": \"Easy\"\n",
615
+ "}\n"
616
+ ]
617
+ }
618
+ ]
619
+ },
620
+ {
621
+ "cell_type": "markdown",
622
+ "source": [
623
+ "Our evaluation demonstrates that the Qwen-VL Serverless Pipeline significantly outperforms traditional Document Image Transformer (DiT) baselines. While the DiT model frequently suffered from hallucinations and failed to correct OCR errors due to a lack of semantic awareness, our VLM approach leverages deep linguistic understanding to resolve ambiguities. For instance, the model successfully inferred 'sugar' from the noisy input 's_gar' by analyzing the culinary context—a semantic correction capability that was absent in the standard DiT pipeline."
624
+ ],
625
+ "metadata": {
626
+ "id": "JIZUnKOWTZqc"
627
+ }
628
+ },
629
+ {
630
+ "cell_type": "code",
631
+ "source": [],
632
+ "metadata": {
633
+ "id": "6kaTyYGBTZiL"
634
+ },
635
+ "execution_count": null,
636
+ "outputs": []
637
+ }
638
+ ]
639
+ }