Abdul234 committed on
Commit
6514f45
·
verified ·
1 Parent(s): 9fa621e

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +30 -0
  2. requirements.txt +4 -0
  3. trocr_large_stage1.ipynb +1346 -0
app.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Wed Aug 6 12:47:42 2025
4
+
5
+ @author: RMD
6
+ """
7
+
8
+ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
9
+ from PIL import Image
10
+ import gradio as gr
11
+
12
+ # Load model and processor
13
+ processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-stage1")
14
+ model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-stage1")
15
+
16
def recognize_text(image):
    """Run TrOCR on a single image and return the decoded text.

    Args:
        image: The PIL image supplied by the Gradio image input, or ``None``
            when the form is submitted without an image.

    Returns:
        The first decoded string produced by the model, or an empty string
        when no image was provided.
    """
    # Submitting the form without an upload yields None; return an empty
    # result instead of failing on ``None.convert``.
    if image is None:
        return ""

    # Normalize to RGB before handing the image to the processor.
    image = image.convert("RGB")
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    # Only one image was passed, so take the first decoded entry.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text
22
+
23
+ # Launch Gradio Interface
24
+ interface = gr.Interface(fn=recognize_text,
25
+ inputs=gr.Image(type="pil"),
26
+ outputs="text",
27
+ title="TrOCR Text Recognition",
28
+ description="Upload a printed/handwritten text image to recognize the text.")
29
+
30
+ interface.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ gradio
4
+ Pillow
trocr_large_stage1.ipynb ADDED
@@ -0,0 +1,1346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "toc_visible": true
8
+ },
9
+ "kaggle": {
10
+ "accelerator": "gpu"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ },
15
+ "kernelspec": {
16
+ "name": "python3",
17
+ "display_name": "Python 3"
18
+ },
19
+ "widgets": {
20
+ "application/vnd.jupyter.widget-state+json": {
21
+ "499a30b9f3d948b393a833b8c432ae38": {
22
+ "model_module": "@jupyter-widgets/controls",
23
+ "model_name": "HBoxModel",
24
+ "model_module_version": "1.5.0",
25
+ "state": {
26
+ "_dom_classes": [],
27
+ "_model_module": "@jupyter-widgets/controls",
28
+ "_model_module_version": "1.5.0",
29
+ "_model_name": "HBoxModel",
30
+ "_view_count": null,
31
+ "_view_module": "@jupyter-widgets/controls",
32
+ "_view_module_version": "1.5.0",
33
+ "_view_name": "HBoxView",
34
+ "box_style": "",
35
+ "children": [
36
+ "IPY_MODEL_0087ebb741b84c27a534cc86ce119c20",
37
+ "IPY_MODEL_6ec79949c7564a7d9da4c4713dbb1d6a",
38
+ "IPY_MODEL_2fa21d685d854cd78deba6db23d118e9"
39
+ ],
40
+ "layout": "IPY_MODEL_337ffd9b357842a8b8ac6b262eeb6bca"
41
+ }
42
+ },
43
+ "0087ebb741b84c27a534cc86ce119c20": {
44
+ "model_module": "@jupyter-widgets/controls",
45
+ "model_name": "HTMLModel",
46
+ "model_module_version": "1.5.0",
47
+ "state": {
48
+ "_dom_classes": [],
49
+ "_model_module": "@jupyter-widgets/controls",
50
+ "_model_module_version": "1.5.0",
51
+ "_model_name": "HTMLModel",
52
+ "_view_count": null,
53
+ "_view_module": "@jupyter-widgets/controls",
54
+ "_view_module_version": "1.5.0",
55
+ "_view_name": "HTMLView",
56
+ "description": "",
57
+ "description_tooltip": null,
58
+ "layout": "IPY_MODEL_a1a3eeb969c4459990297aab0b2fc101",
59
+ "placeholder": "​",
60
+ "style": "IPY_MODEL_db314661fee0478386e01caaee424c49",
61
+ "value": "config.json: "
62
+ }
63
+ },
64
+ "6ec79949c7564a7d9da4c4713dbb1d6a": {
65
+ "model_module": "@jupyter-widgets/controls",
66
+ "model_name": "FloatProgressModel",
67
+ "model_module_version": "1.5.0",
68
+ "state": {
69
+ "_dom_classes": [],
70
+ "_model_module": "@jupyter-widgets/controls",
71
+ "_model_module_version": "1.5.0",
72
+ "_model_name": "FloatProgressModel",
73
+ "_view_count": null,
74
+ "_view_module": "@jupyter-widgets/controls",
75
+ "_view_module_version": "1.5.0",
76
+ "_view_name": "ProgressView",
77
+ "bar_style": "success",
78
+ "description": "",
79
+ "description_tooltip": null,
80
+ "layout": "IPY_MODEL_0b2fcda70b0c4a34890eea43a701250b",
81
+ "max": 1,
82
+ "min": 0,
83
+ "orientation": "horizontal",
84
+ "style": "IPY_MODEL_6a643e4a16f74331826f41f96b10ee8c",
85
+ "value": 1
86
+ }
87
+ },
88
+ "2fa21d685d854cd78deba6db23d118e9": {
89
+ "model_module": "@jupyter-widgets/controls",
90
+ "model_name": "HTMLModel",
91
+ "model_module_version": "1.5.0",
92
+ "state": {
93
+ "_dom_classes": [],
94
+ "_model_module": "@jupyter-widgets/controls",
95
+ "_model_module_version": "1.5.0",
96
+ "_model_name": "HTMLModel",
97
+ "_view_count": null,
98
+ "_view_module": "@jupyter-widgets/controls",
99
+ "_view_module_version": "1.5.0",
100
+ "_view_name": "HTMLView",
101
+ "description": "",
102
+ "description_tooltip": null,
103
+ "layout": "IPY_MODEL_4d0ef08daecf4748ad2f27ad0901a0d7",
104
+ "placeholder": "​",
105
+ "style": "IPY_MODEL_25b3ace657494257bedad1c934b07c0b",
106
+ "value": " 4.24k/? [00:00<00:00, 228kB/s]"
107
+ }
108
+ },
109
+ "337ffd9b357842a8b8ac6b262eeb6bca": {
110
+ "model_module": "@jupyter-widgets/base",
111
+ "model_name": "LayoutModel",
112
+ "model_module_version": "1.2.0",
113
+ "state": {
114
+ "_model_module": "@jupyter-widgets/base",
115
+ "_model_module_version": "1.2.0",
116
+ "_model_name": "LayoutModel",
117
+ "_view_count": null,
118
+ "_view_module": "@jupyter-widgets/base",
119
+ "_view_module_version": "1.2.0",
120
+ "_view_name": "LayoutView",
121
+ "align_content": null,
122
+ "align_items": null,
123
+ "align_self": null,
124
+ "border": null,
125
+ "bottom": null,
126
+ "display": null,
127
+ "flex": null,
128
+ "flex_flow": null,
129
+ "grid_area": null,
130
+ "grid_auto_columns": null,
131
+ "grid_auto_flow": null,
132
+ "grid_auto_rows": null,
133
+ "grid_column": null,
134
+ "grid_gap": null,
135
+ "grid_row": null,
136
+ "grid_template_areas": null,
137
+ "grid_template_columns": null,
138
+ "grid_template_rows": null,
139
+ "height": null,
140
+ "justify_content": null,
141
+ "justify_items": null,
142
+ "left": null,
143
+ "margin": null,
144
+ "max_height": null,
145
+ "max_width": null,
146
+ "min_height": null,
147
+ "min_width": null,
148
+ "object_fit": null,
149
+ "object_position": null,
150
+ "order": null,
151
+ "overflow": null,
152
+ "overflow_x": null,
153
+ "overflow_y": null,
154
+ "padding": null,
155
+ "right": null,
156
+ "top": null,
157
+ "visibility": null,
158
+ "width": null
159
+ }
160
+ },
161
+ "a1a3eeb969c4459990297aab0b2fc101": {
162
+ "model_module": "@jupyter-widgets/base",
163
+ "model_name": "LayoutModel",
164
+ "model_module_version": "1.2.0",
165
+ "state": {
166
+ "_model_module": "@jupyter-widgets/base",
167
+ "_model_module_version": "1.2.0",
168
+ "_model_name": "LayoutModel",
169
+ "_view_count": null,
170
+ "_view_module": "@jupyter-widgets/base",
171
+ "_view_module_version": "1.2.0",
172
+ "_view_name": "LayoutView",
173
+ "align_content": null,
174
+ "align_items": null,
175
+ "align_self": null,
176
+ "border": null,
177
+ "bottom": null,
178
+ "display": null,
179
+ "flex": null,
180
+ "flex_flow": null,
181
+ "grid_area": null,
182
+ "grid_auto_columns": null,
183
+ "grid_auto_flow": null,
184
+ "grid_auto_rows": null,
185
+ "grid_column": null,
186
+ "grid_gap": null,
187
+ "grid_row": null,
188
+ "grid_template_areas": null,
189
+ "grid_template_columns": null,
190
+ "grid_template_rows": null,
191
+ "height": null,
192
+ "justify_content": null,
193
+ "justify_items": null,
194
+ "left": null,
195
+ "margin": null,
196
+ "max_height": null,
197
+ "max_width": null,
198
+ "min_height": null,
199
+ "min_width": null,
200
+ "object_fit": null,
201
+ "object_position": null,
202
+ "order": null,
203
+ "overflow": null,
204
+ "overflow_x": null,
205
+ "overflow_y": null,
206
+ "padding": null,
207
+ "right": null,
208
+ "top": null,
209
+ "visibility": null,
210
+ "width": null
211
+ }
212
+ },
213
+ "db314661fee0478386e01caaee424c49": {
214
+ "model_module": "@jupyter-widgets/controls",
215
+ "model_name": "DescriptionStyleModel",
216
+ "model_module_version": "1.5.0",
217
+ "state": {
218
+ "_model_module": "@jupyter-widgets/controls",
219
+ "_model_module_version": "1.5.0",
220
+ "_model_name": "DescriptionStyleModel",
221
+ "_view_count": null,
222
+ "_view_module": "@jupyter-widgets/base",
223
+ "_view_module_version": "1.2.0",
224
+ "_view_name": "StyleView",
225
+ "description_width": ""
226
+ }
227
+ },
228
+ "0b2fcda70b0c4a34890eea43a701250b": {
229
+ "model_module": "@jupyter-widgets/base",
230
+ "model_name": "LayoutModel",
231
+ "model_module_version": "1.2.0",
232
+ "state": {
233
+ "_model_module": "@jupyter-widgets/base",
234
+ "_model_module_version": "1.2.0",
235
+ "_model_name": "LayoutModel",
236
+ "_view_count": null,
237
+ "_view_module": "@jupyter-widgets/base",
238
+ "_view_module_version": "1.2.0",
239
+ "_view_name": "LayoutView",
240
+ "align_content": null,
241
+ "align_items": null,
242
+ "align_self": null,
243
+ "border": null,
244
+ "bottom": null,
245
+ "display": null,
246
+ "flex": null,
247
+ "flex_flow": null,
248
+ "grid_area": null,
249
+ "grid_auto_columns": null,
250
+ "grid_auto_flow": null,
251
+ "grid_auto_rows": null,
252
+ "grid_column": null,
253
+ "grid_gap": null,
254
+ "grid_row": null,
255
+ "grid_template_areas": null,
256
+ "grid_template_columns": null,
257
+ "grid_template_rows": null,
258
+ "height": null,
259
+ "justify_content": null,
260
+ "justify_items": null,
261
+ "left": null,
262
+ "margin": null,
263
+ "max_height": null,
264
+ "max_width": null,
265
+ "min_height": null,
266
+ "min_width": null,
267
+ "object_fit": null,
268
+ "object_position": null,
269
+ "order": null,
270
+ "overflow": null,
271
+ "overflow_x": null,
272
+ "overflow_y": null,
273
+ "padding": null,
274
+ "right": null,
275
+ "top": null,
276
+ "visibility": null,
277
+ "width": "20px"
278
+ }
279
+ },
280
+ "6a643e4a16f74331826f41f96b10ee8c": {
281
+ "model_module": "@jupyter-widgets/controls",
282
+ "model_name": "ProgressStyleModel",
283
+ "model_module_version": "1.5.0",
284
+ "state": {
285
+ "_model_module": "@jupyter-widgets/controls",
286
+ "_model_module_version": "1.5.0",
287
+ "_model_name": "ProgressStyleModel",
288
+ "_view_count": null,
289
+ "_view_module": "@jupyter-widgets/base",
290
+ "_view_module_version": "1.2.0",
291
+ "_view_name": "StyleView",
292
+ "bar_color": null,
293
+ "description_width": ""
294
+ }
295
+ },
296
+ "4d0ef08daecf4748ad2f27ad0901a0d7": {
297
+ "model_module": "@jupyter-widgets/base",
298
+ "model_name": "LayoutModel",
299
+ "model_module_version": "1.2.0",
300
+ "state": {
301
+ "_model_module": "@jupyter-widgets/base",
302
+ "_model_module_version": "1.2.0",
303
+ "_model_name": "LayoutModel",
304
+ "_view_count": null,
305
+ "_view_module": "@jupyter-widgets/base",
306
+ "_view_module_version": "1.2.0",
307
+ "_view_name": "LayoutView",
308
+ "align_content": null,
309
+ "align_items": null,
310
+ "align_self": null,
311
+ "border": null,
312
+ "bottom": null,
313
+ "display": null,
314
+ "flex": null,
315
+ "flex_flow": null,
316
+ "grid_area": null,
317
+ "grid_auto_columns": null,
318
+ "grid_auto_flow": null,
319
+ "grid_auto_rows": null,
320
+ "grid_column": null,
321
+ "grid_gap": null,
322
+ "grid_row": null,
323
+ "grid_template_areas": null,
324
+ "grid_template_columns": null,
325
+ "grid_template_rows": null,
326
+ "height": null,
327
+ "justify_content": null,
328
+ "justify_items": null,
329
+ "left": null,
330
+ "margin": null,
331
+ "max_height": null,
332
+ "max_width": null,
333
+ "min_height": null,
334
+ "min_width": null,
335
+ "object_fit": null,
336
+ "object_position": null,
337
+ "order": null,
338
+ "overflow": null,
339
+ "overflow_x": null,
340
+ "overflow_y": null,
341
+ "padding": null,
342
+ "right": null,
343
+ "top": null,
344
+ "visibility": null,
345
+ "width": null
346
+ }
347
+ },
348
+ "25b3ace657494257bedad1c934b07c0b": {
349
+ "model_module": "@jupyter-widgets/controls",
350
+ "model_name": "DescriptionStyleModel",
351
+ "model_module_version": "1.5.0",
352
+ "state": {
353
+ "_model_module": "@jupyter-widgets/controls",
354
+ "_model_module_version": "1.5.0",
355
+ "_model_name": "DescriptionStyleModel",
356
+ "_view_count": null,
357
+ "_view_module": "@jupyter-widgets/base",
358
+ "_view_module_version": "1.2.0",
359
+ "_view_name": "StyleView",
360
+ "description_width": ""
361
+ }
362
+ },
363
+ "6e54658145734e648ccfd1daa2c4c607": {
364
+ "model_module": "@jupyter-widgets/controls",
365
+ "model_name": "HBoxModel",
366
+ "model_module_version": "1.5.0",
367
+ "state": {
368
+ "_dom_classes": [],
369
+ "_model_module": "@jupyter-widgets/controls",
370
+ "_model_module_version": "1.5.0",
371
+ "_model_name": "HBoxModel",
372
+ "_view_count": null,
373
+ "_view_module": "@jupyter-widgets/controls",
374
+ "_view_module_version": "1.5.0",
375
+ "_view_name": "HBoxView",
376
+ "box_style": "",
377
+ "children": [
378
+ "IPY_MODEL_a186b8a35c68407fbb51a309b4e22e5a",
379
+ "IPY_MODEL_80820127d5c540e598cd00cf94d5d635",
380
+ "IPY_MODEL_bd9e46996772480f8cd5851635ca30c4"
381
+ ],
382
+ "layout": "IPY_MODEL_4e649cdda36b4dc4a36a5442ddf7d4a8"
383
+ }
384
+ },
385
+ "a186b8a35c68407fbb51a309b4e22e5a": {
386
+ "model_module": "@jupyter-widgets/controls",
387
+ "model_name": "HTMLModel",
388
+ "model_module_version": "1.5.0",
389
+ "state": {
390
+ "_dom_classes": [],
391
+ "_model_module": "@jupyter-widgets/controls",
392
+ "_model_module_version": "1.5.0",
393
+ "_model_name": "HTMLModel",
394
+ "_view_count": null,
395
+ "_view_module": "@jupyter-widgets/controls",
396
+ "_view_module_version": "1.5.0",
397
+ "_view_name": "HTMLView",
398
+ "description": "",
399
+ "description_tooltip": null,
400
+ "layout": "IPY_MODEL_709ab0ae3897437e9b891b1669c04897",
401
+ "placeholder": "​",
402
+ "style": "IPY_MODEL_ff0d6857285e4a5290314d6fa73240b7",
403
+ "value": "model.safetensors:   4%"
404
+ }
405
+ },
406
+ "80820127d5c540e598cd00cf94d5d635": {
407
+ "model_module": "@jupyter-widgets/controls",
408
+ "model_name": "FloatProgressModel",
409
+ "model_module_version": "1.5.0",
410
+ "state": {
411
+ "_dom_classes": [],
412
+ "_model_module": "@jupyter-widgets/controls",
413
+ "_model_module_version": "1.5.0",
414
+ "_model_name": "FloatProgressModel",
415
+ "_view_count": null,
416
+ "_view_module": "@jupyter-widgets/controls",
417
+ "_view_module_version": "1.5.0",
418
+ "_view_name": "ProgressView",
419
+ "bar_style": "",
420
+ "description": "",
421
+ "description_tooltip": null,
422
+ "layout": "IPY_MODEL_b6a3e597a9344b75a1173ea6b8cf0762",
423
+ "max": 2432558500,
424
+ "min": 0,
425
+ "orientation": "horizontal",
426
+ "style": "IPY_MODEL_cbf81f6c3a1c4d44931e9c66dedad02b",
427
+ "value": 91687298
428
+ }
429
+ },
430
+ "bd9e46996772480f8cd5851635ca30c4": {
431
+ "model_module": "@jupyter-widgets/controls",
432
+ "model_name": "HTMLModel",
433
+ "model_module_version": "1.5.0",
434
+ "state": {
435
+ "_dom_classes": [],
436
+ "_model_module": "@jupyter-widgets/controls",
437
+ "_model_module_version": "1.5.0",
438
+ "_model_name": "HTMLModel",
439
+ "_view_count": null,
440
+ "_view_module": "@jupyter-widgets/controls",
441
+ "_view_module_version": "1.5.0",
442
+ "_view_name": "HTMLView",
443
+ "description": "",
444
+ "description_tooltip": null,
445
+ "layout": "IPY_MODEL_1bddc9d8eb2547458c1f9608203469d8",
446
+ "placeholder": "​",
447
+ "style": "IPY_MODEL_583ca4b425f74811a64d48bbcf1f7401",
448
+ "value": " 91.7M/2.43G [00:09<02:30, 15.5MB/s]"
449
+ }
450
+ },
451
+ "4e649cdda36b4dc4a36a5442ddf7d4a8": {
452
+ "model_module": "@jupyter-widgets/base",
453
+ "model_name": "LayoutModel",
454
+ "model_module_version": "1.2.0",
455
+ "state": {
456
+ "_model_module": "@jupyter-widgets/base",
457
+ "_model_module_version": "1.2.0",
458
+ "_model_name": "LayoutModel",
459
+ "_view_count": null,
460
+ "_view_module": "@jupyter-widgets/base",
461
+ "_view_module_version": "1.2.0",
462
+ "_view_name": "LayoutView",
463
+ "align_content": null,
464
+ "align_items": null,
465
+ "align_self": null,
466
+ "border": null,
467
+ "bottom": null,
468
+ "display": null,
469
+ "flex": null,
470
+ "flex_flow": null,
471
+ "grid_area": null,
472
+ "grid_auto_columns": null,
473
+ "grid_auto_flow": null,
474
+ "grid_auto_rows": null,
475
+ "grid_column": null,
476
+ "grid_gap": null,
477
+ "grid_row": null,
478
+ "grid_template_areas": null,
479
+ "grid_template_columns": null,
480
+ "grid_template_rows": null,
481
+ "height": null,
482
+ "justify_content": null,
483
+ "justify_items": null,
484
+ "left": null,
485
+ "margin": null,
486
+ "max_height": null,
487
+ "max_width": null,
488
+ "min_height": null,
489
+ "min_width": null,
490
+ "object_fit": null,
491
+ "object_position": null,
492
+ "order": null,
493
+ "overflow": null,
494
+ "overflow_x": null,
495
+ "overflow_y": null,
496
+ "padding": null,
497
+ "right": null,
498
+ "top": null,
499
+ "visibility": null,
500
+ "width": null
501
+ }
502
+ },
503
+ "709ab0ae3897437e9b891b1669c04897": {
504
+ "model_module": "@jupyter-widgets/base",
505
+ "model_name": "LayoutModel",
506
+ "model_module_version": "1.2.0",
507
+ "state": {
508
+ "_model_module": "@jupyter-widgets/base",
509
+ "_model_module_version": "1.2.0",
510
+ "_model_name": "LayoutModel",
511
+ "_view_count": null,
512
+ "_view_module": "@jupyter-widgets/base",
513
+ "_view_module_version": "1.2.0",
514
+ "_view_name": "LayoutView",
515
+ "align_content": null,
516
+ "align_items": null,
517
+ "align_self": null,
518
+ "border": null,
519
+ "bottom": null,
520
+ "display": null,
521
+ "flex": null,
522
+ "flex_flow": null,
523
+ "grid_area": null,
524
+ "grid_auto_columns": null,
525
+ "grid_auto_flow": null,
526
+ "grid_auto_rows": null,
527
+ "grid_column": null,
528
+ "grid_gap": null,
529
+ "grid_row": null,
530
+ "grid_template_areas": null,
531
+ "grid_template_columns": null,
532
+ "grid_template_rows": null,
533
+ "height": null,
534
+ "justify_content": null,
535
+ "justify_items": null,
536
+ "left": null,
537
+ "margin": null,
538
+ "max_height": null,
539
+ "max_width": null,
540
+ "min_height": null,
541
+ "min_width": null,
542
+ "object_fit": null,
543
+ "object_position": null,
544
+ "order": null,
545
+ "overflow": null,
546
+ "overflow_x": null,
547
+ "overflow_y": null,
548
+ "padding": null,
549
+ "right": null,
550
+ "top": null,
551
+ "visibility": null,
552
+ "width": null
553
+ }
554
+ },
555
+ "ff0d6857285e4a5290314d6fa73240b7": {
556
+ "model_module": "@jupyter-widgets/controls",
557
+ "model_name": "DescriptionStyleModel",
558
+ "model_module_version": "1.5.0",
559
+ "state": {
560
+ "_model_module": "@jupyter-widgets/controls",
561
+ "_model_module_version": "1.5.0",
562
+ "_model_name": "DescriptionStyleModel",
563
+ "_view_count": null,
564
+ "_view_module": "@jupyter-widgets/base",
565
+ "_view_module_version": "1.2.0",
566
+ "_view_name": "StyleView",
567
+ "description_width": ""
568
+ }
569
+ },
570
+ "b6a3e597a9344b75a1173ea6b8cf0762": {
571
+ "model_module": "@jupyter-widgets/base",
572
+ "model_name": "LayoutModel",
573
+ "model_module_version": "1.2.0",
574
+ "state": {
575
+ "_model_module": "@jupyter-widgets/base",
576
+ "_model_module_version": "1.2.0",
577
+ "_model_name": "LayoutModel",
578
+ "_view_count": null,
579
+ "_view_module": "@jupyter-widgets/base",
580
+ "_view_module_version": "1.2.0",
581
+ "_view_name": "LayoutView",
582
+ "align_content": null,
583
+ "align_items": null,
584
+ "align_self": null,
585
+ "border": null,
586
+ "bottom": null,
587
+ "display": null,
588
+ "flex": null,
589
+ "flex_flow": null,
590
+ "grid_area": null,
591
+ "grid_auto_columns": null,
592
+ "grid_auto_flow": null,
593
+ "grid_auto_rows": null,
594
+ "grid_column": null,
595
+ "grid_gap": null,
596
+ "grid_row": null,
597
+ "grid_template_areas": null,
598
+ "grid_template_columns": null,
599
+ "grid_template_rows": null,
600
+ "height": null,
601
+ "justify_content": null,
602
+ "justify_items": null,
603
+ "left": null,
604
+ "margin": null,
605
+ "max_height": null,
606
+ "max_width": null,
607
+ "min_height": null,
608
+ "min_width": null,
609
+ "object_fit": null,
610
+ "object_position": null,
611
+ "order": null,
612
+ "overflow": null,
613
+ "overflow_x": null,
614
+ "overflow_y": null,
615
+ "padding": null,
616
+ "right": null,
617
+ "top": null,
618
+ "visibility": null,
619
+ "width": null
620
+ }
621
+ },
622
+ "cbf81f6c3a1c4d44931e9c66dedad02b": {
623
+ "model_module": "@jupyter-widgets/controls",
624
+ "model_name": "ProgressStyleModel",
625
+ "model_module_version": "1.5.0",
626
+ "state": {
627
+ "_model_module": "@jupyter-widgets/controls",
628
+ "_model_module_version": "1.5.0",
629
+ "_model_name": "ProgressStyleModel",
630
+ "_view_count": null,
631
+ "_view_module": "@jupyter-widgets/base",
632
+ "_view_module_version": "1.2.0",
633
+ "_view_name": "StyleView",
634
+ "bar_color": null,
635
+ "description_width": ""
636
+ }
637
+ },
638
+ "1bddc9d8eb2547458c1f9608203469d8": {
639
+ "model_module": "@jupyter-widgets/base",
640
+ "model_name": "LayoutModel",
641
+ "model_module_version": "1.2.0",
642
+ "state": {
643
+ "_model_module": "@jupyter-widgets/base",
644
+ "_model_module_version": "1.2.0",
645
+ "_model_name": "LayoutModel",
646
+ "_view_count": null,
647
+ "_view_module": "@jupyter-widgets/base",
648
+ "_view_module_version": "1.2.0",
649
+ "_view_name": "LayoutView",
650
+ "align_content": null,
651
+ "align_items": null,
652
+ "align_self": null,
653
+ "border": null,
654
+ "bottom": null,
655
+ "display": null,
656
+ "flex": null,
657
+ "flex_flow": null,
658
+ "grid_area": null,
659
+ "grid_auto_columns": null,
660
+ "grid_auto_flow": null,
661
+ "grid_auto_rows": null,
662
+ "grid_column": null,
663
+ "grid_gap": null,
664
+ "grid_row": null,
665
+ "grid_template_areas": null,
666
+ "grid_template_columns": null,
667
+ "grid_template_rows": null,
668
+ "height": null,
669
+ "justify_content": null,
670
+ "justify_items": null,
671
+ "left": null,
672
+ "margin": null,
673
+ "max_height": null,
674
+ "max_width": null,
675
+ "min_height": null,
676
+ "min_width": null,
677
+ "object_fit": null,
678
+ "object_position": null,
679
+ "order": null,
680
+ "overflow": null,
681
+ "overflow_x": null,
682
+ "overflow_y": null,
683
+ "padding": null,
684
+ "right": null,
685
+ "top": null,
686
+ "visibility": null,
687
+ "width": null
688
+ }
689
+ },
690
+ "583ca4b425f74811a64d48bbcf1f7401": {
691
+ "model_module": "@jupyter-widgets/controls",
692
+ "model_name": "DescriptionStyleModel",
693
+ "model_module_version": "1.5.0",
694
+ "state": {
695
+ "_model_module": "@jupyter-widgets/controls",
696
+ "_model_module_version": "1.5.0",
697
+ "_model_name": "DescriptionStyleModel",
698
+ "_view_count": null,
699
+ "_view_module": "@jupyter-widgets/base",
700
+ "_view_module_version": "1.2.0",
701
+ "_view_name": "StyleView",
702
+ "description_width": ""
703
+ }
704
+ }
705
+ }
706
+ }
707
+ },
708
+ "cells": [
709
+ {
710
+ "cell_type": "code",
711
+ "source": [
712
+ "!pip install -U transformers"
713
+ ],
714
+ "metadata": {
715
+ "colab": {
716
+ "base_uri": "https://localhost:8080/",
717
+ "height": 638
718
+ },
719
+ "id": "qPfPN9yAaPQx",
720
+ "outputId": "1dcce4ca-06bf-46f7-ad56-0352a059deed"
721
+ },
722
+ "execution_count": 5,
723
+ "outputs": [
724
+ {
725
+ "output_type": "stream",
726
+ "name": "stdout",
727
+ "text": [
728
+ "Requirement already satisfied: transformers in /usr/local/lib/python3.11/dist-packages (4.54.0)\n",
729
+ "Collecting transformers\n",
730
+ " Downloading transformers-4.54.1-py3-none-any.whl.metadata (41 kB)\n",
731
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.7/41.7 kB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
732
+ "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from transformers) (3.18.0)\n",
733
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.34.0 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.34.1)\n",
734
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (2.0.2)\n",
735
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from transformers) (25.0)\n",
736
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from transformers) (6.0.2)\n",
737
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (2024.11.6)\n",
738
+ "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from transformers) (2.32.3)\n",
739
+ "Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.21.2)\n",
740
+ "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.5.3)\n",
741
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.11/dist-packages (from transformers) (4.67.1)\n",
742
+ "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (2025.3.0)\n",
743
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (4.14.1)\n",
744
+ "Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (1.1.5)\n",
745
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (3.4.2)\n",
746
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (3.10)\n",
747
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (2.5.0)\n",
748
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (2025.7.14)\n",
749
+ "Downloading transformers-4.54.1-py3-none-any.whl (11.2 MB)\n",
750
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.2/11.2 MB\u001b[0m \u001b[31m46.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
751
+ "\u001b[?25hInstalling collected packages: transformers\n",
752
+ " Attempting uninstall: transformers\n",
753
+ " Found existing installation: transformers 4.54.0\n",
754
+ " Uninstalling transformers-4.54.0:\n",
755
+ " Successfully uninstalled transformers-4.54.0\n",
756
+ "Successfully installed transformers-4.54.1\n"
757
+ ]
758
+ },
759
+ {
760
+ "output_type": "display_data",
761
+ "data": {
762
+ "application/vnd.colab-display-data+json": {
763
+ "pip_warning": {
764
+ "packages": [
765
+ "transformers"
766
+ ]
767
+ },
768
+ "id": "c7d109c3716d47d3b38b443b894d1ed7"
769
+ }
770
+ },
771
+ "metadata": {}
772
+ }
773
+ ]
774
+ },
775
+ {
776
+ "cell_type": "code",
777
+ "source": [],
778
+ "metadata": {
779
+ "id": "ZBjo8TuIaTMI"
780
+ },
781
+ "execution_count": 5,
782
+ "outputs": []
783
+ },
784
+ {
785
+ "cell_type": "markdown",
786
+ "source": [
787
+ "## Local Inference on GPU\n",
788
+ "Model page: https://huggingface.co/microsoft/trocr-large-stage1\n",
789
+ "\n",
790
+ "⚠️ If the generated code snippets do not work, please open an issue on either the [model repo](https://huggingface.co/microsoft/trocr-large-stage1)\n",
791
+ "\t\t\tand/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏"
792
+ ],
793
+ "metadata": {
794
+ "id": "2luRHgMsaPQx"
795
+ }
796
+ },
797
+ {
798
+ "cell_type": "code",
799
+ "source": [
800
+ "# Use a pipeline as a high-level helper\n",
801
+ "from transformers import pipeline\n",
802
+ "\n",
803
+ "pipe = pipeline(\"image-to-text\", model=\"microsoft/trocr-large-stage1\")"
804
+ ],
805
+ "metadata": {
806
+ "colab": {
807
+ "base_uri": "https://localhost:8080/",
808
+ "height": 136,
809
+ "referenced_widgets": [
810
+ "499a30b9f3d948b393a833b8c432ae38",
811
+ "0087ebb741b84c27a534cc86ce119c20",
812
+ "6ec79949c7564a7d9da4c4713dbb1d6a",
813
+ "2fa21d685d854cd78deba6db23d118e9",
814
+ "337ffd9b357842a8b8ac6b262eeb6bca",
815
+ "a1a3eeb969c4459990297aab0b2fc101",
816
+ "db314661fee0478386e01caaee424c49",
817
+ "0b2fcda70b0c4a34890eea43a701250b",
818
+ "6a643e4a16f74331826f41f96b10ee8c",
819
+ "4d0ef08daecf4748ad2f27ad0901a0d7",
820
+ "25b3ace657494257bedad1c934b07c0b",
821
+ "6e54658145734e648ccfd1daa2c4c607",
822
+ "a186b8a35c68407fbb51a309b4e22e5a",
823
+ "80820127d5c540e598cd00cf94d5d635",
824
+ "bd9e46996772480f8cd5851635ca30c4",
825
+ "4e649cdda36b4dc4a36a5442ddf7d4a8",
826
+ "709ab0ae3897437e9b891b1669c04897",
827
+ "ff0d6857285e4a5290314d6fa73240b7",
828
+ "b6a3e597a9344b75a1173ea6b8cf0762",
829
+ "cbf81f6c3a1c4d44931e9c66dedad02b",
830
+ "1bddc9d8eb2547458c1f9608203469d8",
831
+ "583ca4b425f74811a64d48bbcf1f7401"
832
+ ]
833
+ },
834
+ "id": "91OJhN-laPQy",
835
+ "outputId": "6d791caf-996d-47c2-cb7f-180909162d44"
836
+ },
837
+ "execution_count": null,
838
+ "outputs": [
839
+ {
840
+ "output_type": "display_data",
841
+ "data": {
842
+ "text/plain": [
843
+ "config.json: 0.00B [00:00, ?B/s]"
844
+ ],
845
+ "application/vnd.jupyter.widget-view+json": {
846
+ "version_major": 2,
847
+ "version_minor": 0,
848
+ "model_id": "499a30b9f3d948b393a833b8c432ae38"
849
+ }
850
+ },
851
+ "metadata": {}
852
+ },
853
+ {
854
+ "output_type": "stream",
855
+ "name": "stderr",
856
+ "text": [
857
+ "/usr/local/lib/python3.11/dist-packages/transformers/models/auto/modeling_auto.py:2160: FutureWarning: The class `AutoModelForVision2Seq` is deprecated and will be removed in v5.0. Please use `AutoModelForImageTextToText` instead.\n",
858
+ " warnings.warn(\n"
859
+ ]
860
+ },
861
+ {
862
+ "output_type": "display_data",
863
+ "data": {
864
+ "text/plain": [
865
+ "model.safetensors: 0%| | 0.00/2.43G [00:00<?, ?B/s]"
866
+ ],
867
+ "application/vnd.jupyter.widget-view+json": {
868
+ "version_major": 2,
869
+ "version_minor": 0,
870
+ "model_id": "6e54658145734e648ccfd1daa2c4c607"
871
+ }
872
+ },
873
+ "metadata": {}
874
+ }
875
+ ]
876
+ },
877
+ {
878
+ "cell_type": "code",
879
+ "source": [
880
+ "# Load model directly\n",
881
+ "from transformers import AutoTokenizer, AutoModelForVision2Seq\n",
882
+ "\n",
883
+ "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/trocr-large-stage1\")\n",
884
+ "model = AutoModelForVision2Seq.from_pretrained(\"microsoft/trocr-large-stage1\")"
885
+ ],
886
+ "metadata": {
887
+ "id": "0Y1JAk-UaPQy"
888
+ },
889
+ "execution_count": null,
890
+ "outputs": []
891
+ },
892
+ {
893
+ "cell_type": "code",
894
+ "source": [
895
+ "!pip install transformers\n",
896
+ "!pip install torch torchvision torchaudio\n",
897
+ "!pip install Pillow\n",
898
+ "!pip install easyocr\n",
899
+ "!pip install git+https://github.com/Salesforce/BLIP.git\n"
900
+ ],
901
+ "metadata": {
902
+ "colab": {
903
+ "base_uri": "https://localhost:8080/"
904
+ },
905
+ "id": "2sc1IERgaUDX",
906
+ "outputId": "8f4be5f8-b19c-48b4-b310-0e449c501719"
907
+ },
908
+ "execution_count": 2,
909
+ "outputs": [
910
+ {
911
+ "output_type": "stream",
912
+ "name": "stdout",
913
+ "text": [
914
+ "Requirement already satisfied: transformers in /usr/local/lib/python3.11/dist-packages (4.54.1)\n",
915
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from transformers) (3.18.0)\n",
916
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.34.0 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.34.1)\n",
917
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (2.0.2)\n",
918
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from transformers) (25.0)\n",
919
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from transformers) (6.0.2)\n",
920
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (2024.11.6)\n",
921
+ "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from transformers) (2.32.3)\n",
922
+ "Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.21.2)\n",
923
+ "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.5.3)\n",
924
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.11/dist-packages (from transformers) (4.67.1)\n",
925
+ "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (2025.3.0)\n",
926
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (4.14.1)\n",
927
+ "Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (1.1.5)\n",
928
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (3.4.2)\n",
929
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (3.10)\n",
930
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (2.5.0)\n",
931
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (2025.7.14)\n",
932
+ "Requirement already satisfied: torch in /usr/local/lib/python3.11/dist-packages (2.6.0+cu124)\n",
933
+ "Requirement already satisfied: torchvision in /usr/local/lib/python3.11/dist-packages (0.21.0+cu124)\n",
934
+ "Requirement already satisfied: torchaudio in /usr/local/lib/python3.11/dist-packages (2.6.0+cu124)\n",
935
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from torch) (3.18.0)\n",
936
+ "Requirement already satisfied: typing-extensions>=4.10.0 in /usr/local/lib/python3.11/dist-packages (from torch) (4.14.1)\n",
937
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch) (3.5)\n",
938
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch) (3.1.6)\n",
939
+ "Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from torch) (2025.3.0)\n",
940
+ "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n",
941
+ "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n",
942
+ "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n",
943
+ "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch) (9.1.0.70)\n",
944
+ "Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.5.8)\n",
945
+ "Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /usr/local/lib/python3.11/dist-packages (from torch) (11.2.1.3)\n",
946
+ "Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /usr/local/lib/python3.11/dist-packages (from torch) (10.3.5.147)\n",
947
+ "Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /usr/local/lib/python3.11/dist-packages (from torch) (11.6.1.9)\n",
948
+ "Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /usr/local/lib/python3.11/dist-packages (from torch) (12.3.1.170)\n",
949
+ "Requirement already satisfied: nvidia-cusparselt-cu12==0.6.2 in /usr/local/lib/python3.11/dist-packages (from torch) (0.6.2)\n",
950
+ "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch) (2.21.5)\n",
951
+ "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n",
952
+ "Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n",
953
+ "Requirement already satisfied: triton==3.2.0 in /usr/local/lib/python3.11/dist-packages (from torch) (3.2.0)\n",
954
+ "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch) (1.13.1)\n",
955
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch) (1.3.0)\n",
956
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from torchvision) (2.0.2)\n",
957
+ "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.11/dist-packages (from torchvision) (11.3.0)\n",
958
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch) (3.0.2)\n",
959
+ "\u001b[31mERROR: Could not find a version that satisfies the requirement PIL (from versions: none)\u001b[0m\u001b[31m\n",
960
+ "\u001b[0m\u001b[31mERROR: No matching distribution found for PIL\u001b[0m\u001b[31m\n",
961
+ "\u001b[0mRequirement already satisfied: easyocr in /usr/local/lib/python3.11/dist-packages (1.7.2)\n",
962
+ "Requirement already satisfied: torch in /usr/local/lib/python3.11/dist-packages (from easyocr) (2.6.0+cu124)\n",
963
+ "Requirement already satisfied: torchvision>=0.5 in /usr/local/lib/python3.11/dist-packages (from easyocr) (0.21.0+cu124)\n",
964
+ "Requirement already satisfied: opencv-python-headless in /usr/local/lib/python3.11/dist-packages (from easyocr) (4.12.0.88)\n",
965
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (from easyocr) (1.16.0)\n",
966
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from easyocr) (2.0.2)\n",
967
+ "Requirement already satisfied: Pillow in /usr/local/lib/python3.11/dist-packages (from easyocr) (11.3.0)\n",
968
+ "Requirement already satisfied: scikit-image in /usr/local/lib/python3.11/dist-packages (from easyocr) (0.25.2)\n",
969
+ "Requirement already satisfied: python-bidi in /usr/local/lib/python3.11/dist-packages (from easyocr) (0.6.6)\n",
970
+ "Requirement already satisfied: PyYAML in /usr/local/lib/python3.11/dist-packages (from easyocr) (6.0.2)\n",
971
+ "Requirement already satisfied: Shapely in /usr/local/lib/python3.11/dist-packages (from easyocr) (2.1.1)\n",
972
+ "Requirement already satisfied: pyclipper in /usr/local/lib/python3.11/dist-packages (from easyocr) (1.3.0.post6)\n",
973
+ "Requirement already satisfied: ninja in /usr/local/lib/python3.11/dist-packages (from easyocr) (1.11.1.4)\n",
974
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (3.18.0)\n",
975
+ "Requirement already satisfied: typing-extensions>=4.10.0 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (4.14.1)\n",
976
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (3.5)\n",
977
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (3.1.6)\n",
978
+ "Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (2025.3.0)\n",
979
+ "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (12.4.127)\n",
980
+ "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (12.4.127)\n",
981
+ "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (12.4.127)\n",
982
+ "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (9.1.0.70)\n",
983
+ "Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (12.4.5.8)\n",
984
+ "Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (11.2.1.3)\n",
985
+ "Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (10.3.5.147)\n",
986
+ "Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (11.6.1.9)\n",
987
+ "Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (12.3.1.170)\n",
988
+ "Requirement already satisfied: nvidia-cusparselt-cu12==0.6.2 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (0.6.2)\n",
989
+ "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (2.21.5)\n",
990
+ "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (12.4.127)\n",
991
+ "Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (12.4.127)\n",
992
+ "Requirement already satisfied: triton==3.2.0 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (3.2.0)\n",
993
+ "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (1.13.1)\n",
994
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch->easyocr) (1.3.0)\n",
995
+ "Requirement already satisfied: imageio!=2.35.0,>=2.33 in /usr/local/lib/python3.11/dist-packages (from scikit-image->easyocr) (2.37.0)\n",
996
+ "Requirement already satisfied: tifffile>=2022.8.12 in /usr/local/lib/python3.11/dist-packages (from scikit-image->easyocr) (2025.6.11)\n",
997
+ "Requirement already satisfied: packaging>=21 in /usr/local/lib/python3.11/dist-packages (from scikit-image->easyocr) (25.0)\n",
998
+ "Requirement already satisfied: lazy-loader>=0.4 in /usr/local/lib/python3.11/dist-packages (from scikit-image->easyocr) (0.4)\n",
999
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch->easyocr) (3.0.2)\n",
1000
+ "Collecting git+https://github.com/Salesforce/BLIP.git\n",
1001
+ " Cloning https://github.com/Salesforce/BLIP.git to /tmp/pip-req-build-bryrz0l8\n",
1002
+ " Running command git clone --filter=blob:none --quiet https://github.com/Salesforce/BLIP.git /tmp/pip-req-build-bryrz0l8\n",
1003
+ " Resolved https://github.com/Salesforce/BLIP.git to commit 3a29b7410476bf5f2ba0955827390eb6ea1f4f9d\n",
1004
+ "\u001b[31mERROR: git+https://github.com/Salesforce/BLIP.git does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.\u001b[0m\u001b[31m\n",
1005
+ "\u001b[0mRequirement already satisfied: transformers in /usr/local/lib/python3.11/dist-packages (4.54.1)\n",
1006
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from transformers) (3.18.0)\n",
1007
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.34.0 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.34.1)\n",
1008
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (2.0.2)\n",
1009
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from transformers) (25.0)\n",
1010
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from transformers) (6.0.2)\n",
1011
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (2024.11.6)\n",
1012
+ "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from transformers) (2.32.3)\n",
1013
+ "Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.21.2)\n",
1014
+ "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.5.3)\n",
1015
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.11/dist-packages (from transformers) (4.67.1)\n",
1016
+ "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (2025.3.0)\n",
1017
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (4.14.1)\n",
1018
+ "Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (1.1.5)\n",
1019
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (3.4.2)\n",
1020
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (3.10)\n",
1021
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (2.5.0)\n",
1022
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (2025.7.14)\n",
1023
+ "Requirement already satisfied: torch in /usr/local/lib/python3.11/dist-packages (2.6.0+cu124)\n",
1024
+ "Requirement already satisfied: torchvision in /usr/local/lib/python3.11/dist-packages (0.21.0+cu124)\n",
1025
+ "Requirement already satisfied: torchaudio in /usr/local/lib/python3.11/dist-packages (2.6.0+cu124)\n",
1026
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from torch) (3.18.0)\n",
1027
+ "Requirement already satisfied: typing-extensions>=4.10.0 in /usr/local/lib/python3.11/dist-packages (from torch) (4.14.1)\n",
1028
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch) (3.5)\n",
1029
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch) (3.1.6)\n",
1030
+ "Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from torch) (2025.3.0)\n",
1031
+ "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n",
1032
+ "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n",
1033
+ "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n",
1034
+ "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch) (9.1.0.70)\n",
1035
+ "Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.5.8)\n",
1036
+ "Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /usr/local/lib/python3.11/dist-packages (from torch) (11.2.1.3)\n",
1037
+ "Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /usr/local/lib/python3.11/dist-packages (from torch) (10.3.5.147)\n",
1038
+ "Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /usr/local/lib/python3.11/dist-packages (from torch) (11.6.1.9)\n",
1039
+ "Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /usr/local/lib/python3.11/dist-packages (from torch) (12.3.1.170)\n",
1040
+ "Requirement already satisfied: nvidia-cusparselt-cu12==0.6.2 in /usr/local/lib/python3.11/dist-packages (from torch) (0.6.2)\n",
1041
+ "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch) (2.21.5)\n",
1042
+ "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n",
1043
+ "Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n",
1044
+ "Requirement already satisfied: triton==3.2.0 in /usr/local/lib/python3.11/dist-packages (from torch) (3.2.0)\n",
1045
+ "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch) (1.13.1)\n",
1046
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch) (1.3.0)\n",
1047
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from torchvision) (2.0.2)\n",
1048
+ "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.11/dist-packages (from torchvision) (11.3.0)\n",
1049
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch) (3.0.2)\n",
1050
+ "\u001b[31mERROR: Could not find a version that satisfies the requirement PIL (from versions: none)\u001b[0m\u001b[31m\n",
1051
+ "\u001b[0m\u001b[31mERROR: No matching distribution found for PIL\u001b[0m\u001b[31m\n",
1052
+ "\u001b[0mRequirement already satisfied: easyocr in /usr/local/lib/python3.11/dist-packages (1.7.2)\n",
1053
+ "Requirement already satisfied: torch in /usr/local/lib/python3.11/dist-packages (from easyocr) (2.6.0+cu124)\n",
1054
+ "Requirement already satisfied: torchvision>=0.5 in /usr/local/lib/python3.11/dist-packages (from easyocr) (0.21.0+cu124)\n",
1055
+ "Requirement already satisfied: opencv-python-headless in /usr/local/lib/python3.11/dist-packages (from easyocr) (4.12.0.88)\n",
1056
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (from easyocr) (1.16.0)\n",
1057
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from easyocr) (2.0.2)\n",
1058
+ "Requirement already satisfied: Pillow in /usr/local/lib/python3.11/dist-packages (from easyocr) (11.3.0)\n",
1059
+ "Requirement already satisfied: scikit-image in /usr/local/lib/python3.11/dist-packages (from easyocr) (0.25.2)\n",
1060
+ "Requirement already satisfied: python-bidi in /usr/local/lib/python3.11/dist-packages (from easyocr) (0.6.6)\n",
1061
+ "Requirement already satisfied: PyYAML in /usr/local/lib/python3.11/dist-packages (from easyocr) (6.0.2)\n",
1062
+ "Requirement already satisfied: Shapely in /usr/local/lib/python3.11/dist-packages (from easyocr) (2.1.1)\n",
1063
+ "Requirement already satisfied: pyclipper in /usr/local/lib/python3.11/dist-packages (from easyocr) (1.3.0.post6)\n",
1064
+ "Requirement already satisfied: ninja in /usr/local/lib/python3.11/dist-packages (from easyocr) (1.11.1.4)\n",
1065
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (3.18.0)\n",
1066
+ "Requirement already satisfied: typing-extensions>=4.10.0 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (4.14.1)\n",
1067
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (3.5)\n",
1068
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (3.1.6)\n",
1069
+ "Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (2025.3.0)\n",
1070
+ "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (12.4.127)\n",
1071
+ "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (12.4.127)\n",
1072
+ "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (12.4.127)\n",
1073
+ "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (9.1.0.70)\n",
1074
+ "Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (12.4.5.8)\n",
1075
+ "Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (11.2.1.3)\n",
1076
+ "Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (10.3.5.147)\n",
1077
+ "Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (11.6.1.9)\n",
1078
+ "Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (12.3.1.170)\n",
1079
+ "Requirement already satisfied: nvidia-cusparselt-cu12==0.6.2 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (0.6.2)\n",
1080
+ "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (2.21.5)\n",
1081
+ "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (12.4.127)\n",
1082
+ "Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (12.4.127)\n",
1083
+ "Requirement already satisfied: triton==3.2.0 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (3.2.0)\n",
1084
+ "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch->easyocr) (1.13.1)\n",
1085
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch->easyocr) (1.3.0)\n",
1086
+ "Requirement already satisfied: imageio!=2.35.0,>=2.33 in /usr/local/lib/python3.11/dist-packages (from scikit-image->easyocr) (2.37.0)\n",
1087
+ "Requirement already satisfied: tifffile>=2022.8.12 in /usr/local/lib/python3.11/dist-packages (from scikit-image->easyocr) (2025.6.11)\n",
1088
+ "Requirement already satisfied: packaging>=21 in /usr/local/lib/python3.11/dist-packages (from scikit-image->easyocr) (25.0)\n",
1089
+ "Requirement already satisfied: lazy-loader>=0.4 in /usr/local/lib/python3.11/dist-packages (from scikit-image->easyocr) (0.4)\n",
1090
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch->easyocr) (3.0.2)\n",
1091
+ "Collecting git+https://github.com/Salesforce/BLIP.git\n",
1092
+ " Cloning https://github.com/Salesforce/BLIP.git to /tmp/pip-req-build-eko7gnza\n",
1093
+ " Running command git clone --filter=blob:none --quiet https://github.com/Salesforce/BLIP.git /tmp/pip-req-build-eko7gnza\n",
1094
+ " Resolved https://github.com/Salesforce/BLIP.git to commit 3a29b7410476bf5f2ba0955827390eb6ea1f4f9d\n",
1095
+ "\u001b[31mERROR: git+https://github.com/Salesforce/BLIP.git does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.\u001b[0m\u001b[31m\n",
1096
+ "\u001b[0m"
1097
+ ]
1098
+ }
1099
+ ]
1100
+ },
1101
+ {
1102
+ "cell_type": "code",
1103
+ "source": [
1104
+ "from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer\n",
1105
+ "from transformers import BlipProcessor, BlipForConditionalGeneration\n",
1106
+ "from transformers import pipeline\n",
1107
+ "import torch\n",
1108
+ "from PIL import Image\n",
1109
+ "import easyocr\n",
1110
+ "import requests\n",
1111
+ "import os\n",
1112
+ "\n",
1113
+ "# ====== Step 1: Image Captioning ======\n",
1114
+ "def generate_caption(image_path):\n",
1115
+ " processor = BlipProcessor.from_pretrained(\"Salesforce/blip-image-captioning-base\")\n",
1116
+ " model = BlipForConditionalGeneration.from_pretrained(\"Salesforce/blip-image-captioning-base\").to(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
1117
+ "\n",
1118
+ " raw_image = Image.open(image_path).convert('RGB')\n",
1119
+ " inputs = processor(raw_image, return_tensors=\"pt\").to(model.device)\n",
1120
+ "\n",
1121
+ " out = model.generate(**inputs)\n",
1122
+ " caption = processor.decode(out[0], skip_special_tokens=True)\n",
1123
+ " return caption\n",
1124
+ "\n",
1125
+ "# ====== Step 2: Text Extraction from Image ======\n",
1126
+ "def extract_text_easyocr(image_path):\n",
1127
+ " reader = easyocr.Reader(['en'])\n",
1128
+ " result = reader.readtext(image_path)\n",
1129
+ " extracted_text = ' '.join([item[1] for item in result])\n",
1130
+ " return extracted_text\n",
1131
+ "\n",
1132
+ "# ====== Step 3: Summarize Caption + OCR Text ======\n",
1133
+ "def summarize_text(text):\n",
1134
+ " summarizer = pipeline(\"summarization\", model=\"facebook/bart-large-cnn\")\n",
1135
+ " summary = summarizer(text, max_length=60, min_length=5, do_sample=False)\n",
1136
+ " return summary[0]['summary_text']\n",
1137
+ "\n",
1138
+ "# ====== Step 4: Braille Conversion ======\n",
1139
+ "braille_dict = {\n",
1140
+ " \"a\": \"⠁\", \"b\": \"⠃\", \"c\": \"⠉\", \"d\": \"⠙\", \"e\": \"⠑\", \"f\": \"⠋\", \"g\": \"⠛\", \"h\": \"⠓\",\n",
1141
+ " \"i\": \"⠊\", \"j\": \"⠚\", \"k\": \"⠅\", \"l\": \"⠇\", \"m\": \"⠍\", \"n\": \"⠝\", \"o\": \"⠕\", \"p\": \"⠏\",\n",
1142
+ " \"q\": \"⠟\", \"r\": \"⠗\", \"s\": \"⠎\", \"t\": \"⠞\", \"u\": \"⠥\", \"v\": \"⠧\", \"w\": \"⠺\", \"x\": \"⠭\",\n",
1143
+ " \"y\": \"⠽\", \"z\": \"⠵\", \" \": \" \", \".\": \".\", \",\": \",\", \"?\": \"⠦\"\n",
1144
+ "}\n",
1145
+ "\n",
1146
+ "def text_to_braille(text):\n",
1147
+ " braille_output = \"\"\n",
1148
+ " for char in text.lower():\n",
1149
+ " braille_output += braille_dict.get(char, \"?\")\n",
1150
+ " return braille_output\n",
1151
+ "\n",
1152
+ "# ====== Main Flow ======\n",
1153
+ "def process_image(image_path):\n",
1154
+ " print(\"📸 Generating caption...\")\n",
1155
+ " caption = generate_caption(image_path)\n",
1156
+ " print(\"🧾 Extracting text...\")\n",
1157
+ " extracted_text = extract_text_easyocr(image_path)\n",
1158
+ "\n",
1159
+ " full_text = caption + \". \" + extracted_text\n",
1160
+ " print(f\"\\n🧠 Full Description: {full_text}\")\n",
1161
+ "\n",
1162
+ " print(\"📝 Summarizing...\")\n",
1163
+ " summary = summarize_text(full_text)\n",
1164
+ " print(f\"\\n🧾 Summary: {summary}\")\n",
1165
+ "\n",
1166
+ " print(\"🔡 Converting to Braille...\")\n",
1167
+ " braille = text_to_braille(summary)\n",
1168
+ " print(f\"\\nBraille Unicode Output:\\n{braille}\")\n",
1169
+ "\n",
1170
+ "# ====== Call Function with Image ======\n",
1171
+ "# Example usage:\n",
1172
+ "process_image(\"/content/OIP.jpg\") # Replace with your image filename"
1173
+ ],
1174
+ "metadata": {
1175
+ "colab": {
1176
+ "base_uri": "https://localhost:8080/"
1177
+ },
1178
+ "id": "hzTnbecmcP1a",
1179
+ "outputId": "449cffe0-bb69-490c-8a9f-a9f8685049eb"
1180
+ },
1181
+ "execution_count": 3,
1182
+ "outputs": [
1183
+ {
1184
+ "output_type": "stream",
1185
+ "name": "stderr",
1186
+ "text": [
1187
+ "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n"
1188
+ ]
1189
+ },
1190
+ {
1191
+ "output_type": "stream",
1192
+ "name": "stdout",
1193
+ "text": [
1194
+ "📸 Generating caption...\n"
1195
+ ]
1196
+ },
1197
+ {
1198
+ "output_type": "stream",
1199
+ "name": "stderr",
1200
+ "text": [
1201
+ "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n",
1202
+ "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
1203
+ "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
1204
+ "You will be able to reuse this secret in all of your notebooks.\n",
1205
+ "Please note that authentication is recommended but still optional to access public models or datasets.\n",
1206
+ " warnings.warn(\n",
1207
+ "`cache.key_cache[idx]` is deprecated and will be removed in v4.56.0. Use `cache.layers[idx].keys` instead.\n",
1208
+ "`cache.value_cache[idx]` is deprecated and will be removed in v4.56.0. Use `cache.layers[idx].values` instead.\n",
1209
+ "WARNING:easyocr.easyocr:Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n"
1210
+ ]
1211
+ },
1212
+ {
1213
+ "output_type": "stream",
1214
+ "name": "stdout",
1215
+ "text": [
1216
+ "🧾 Extracting text...\n",
1217
+ "\n",
1218
+ "🧠 Full Description: a screenshote screen shot of a text editor. ProrE2 Do wctu plote YCUI paint: \"ncnyoJ nahcembrdcon Ins udco rouwan: 4id-You cn Jko YR ) Jhaencteenldocuineni nykc Your drunicrt look prolcssionj % Moduxcd_ Woid uroidc;rcaj Adcren; comiplcincai cacholmcl @amolc Ycvdnscjamjic Lfdnd Cle Mir Incnerooic [hccicincal: #cant Mo ninc € 4cand cakonc eccn vaur dorumentrcardinatcd Vnsntcuc Ihrna thc Bluics chansand smaman mcn Jppk #ue: Youi h eJding, chanre ryttn Inc newIhcme Kronnic > cLange\n",
1219
+ "📝 Summarizing...\n"
1220
+ ]
1221
+ },
1222
+ {
1223
+ "output_type": "stream",
1224
+ "name": "stderr",
1225
+ "text": [
1226
+ "Device set to use cpu\n"
1227
+ ]
1228
+ },
1229
+ {
1230
+ "output_type": "stream",
1231
+ "name": "stdout",
1232
+ "text": [
1233
+ "\n",
1234
+ "🧾 Summary: A screenshote screen shot of a text editor. ProrE2 Do wctu plote YCUI paint: \"ncnyoJ nahcembrdcon Ins udco rouwan: 4id-You cn Jko YR\"\n",
1235
+ "🔡 Converting to Braille...\n",
1236
+ "\n",
1237
+ "Braille Unicode Output:\n",
1238
+ "⠁ ⠎⠉⠗⠑⠑⠝⠎⠓⠕⠞⠑ ⠎⠉⠗⠑⠑⠝ ⠎⠓⠕⠞ ⠕⠋ ⠁ ⠞⠑⠭⠞ ⠑⠙⠊⠞⠕⠗. ⠏⠗⠕⠗⠑? ⠙⠕ ⠺⠉⠞⠥ ⠏⠇⠕⠞⠑ ⠽⠉⠥⠊ ⠏⠁⠊⠝⠞? ?⠝⠉⠝⠽⠕⠚ ⠝⠁⠓⠉⠑⠍⠃⠗⠙⠉⠕⠝ ⠊⠝⠎ ⠥⠙⠉⠕ ⠗⠕⠥⠺⠁⠝? ?⠊⠙?⠽⠕⠥ ⠉⠝ ⠚⠅⠕ ⠽⠗?\n"
1239
+ ]
1240
+ }
1241
+ ]
1242
+ },
1243
+ {
1244
+ "cell_type": "code",
1245
+ "source": [
1246
+ "import os\n",
1247
+ "\n",
1248
+ "# Create a folder named 'dataset' if it doesn't exist\n",
1249
+ "os.makedirs(\"dataset\", exist_ok=True)\n",
1250
+ "print(\"✅ 'dataset/' folder created!\")\n"
1251
+ ],
1252
+ "metadata": {
1253
+ "id": "90GBxqzf0ZeH",
1254
+ "outputId": "6b76a7a9-a73f-491c-8420-5a9108cc6772",
1255
+ "colab": {
1256
+ "base_uri": "https://localhost:8080/"
1257
+ }
1258
+ },
1259
+ "execution_count": 4,
1260
+ "outputs": [
1261
+ {
1262
+ "output_type": "stream",
1263
+ "name": "stdout",
1264
+ "text": [
1265
+ "✅ 'dataset/' folder created!\n"
1266
+ ]
1267
+ }
1268
+ ]
1269
+ },
1270
+ {
1271
+ "cell_type": "code",
1272
+ "source": [
1273
+ "from PIL import Image, ImageDraw, ImageFont\n",
1274
+ "\n",
1275
+ "# Texts for the test images\n",
1276
+ "texts = [\"help me\", \"fire emergency\", \"call 911\"]\n",
1277
+ "\n",
1278
+ "for i, text in enumerate(texts):\n",
1279
+ " img = Image.new(\"RGB\", (300, 100), color=(255, 255, 255))\n",
1280
+ " draw = ImageDraw.Draw(img)\n",
1281
+ "\n",
1282
+ " # No font argument is passed, so Pillow draws with its built-in default font; pass font=ImageFont.truetype(...) to draw.text to use a specific one\n",
1283
+ " draw.text((10, 40), text, fill=(0, 0, 0))\n",
1284
+ "\n",
1285
+ " # Save image\n",
1286
+ " img_path = f\"dataset/img{i+1}.png\"\n",
1287
+ " img.save(img_path)\n",
1288
+ "\n",
1289
+ "print(\"✅ Test images created and saved in 'dataset/'\")\n"
1290
+ ],
1291
+ "metadata": {
1292
+ "id": "d1qtrdk20gCw",
1293
+ "outputId": "33173d9d-301a-4a75-e726-693d52ff545e",
1294
+ "colab": {
1295
+ "base_uri": "https://localhost:8080/"
1296
+ }
1297
+ },
1298
+ "execution_count": 5,
1299
+ "outputs": [
1300
+ {
1301
+ "output_type": "stream",
1302
+ "name": "stdout",
1303
+ "text": [
1304
+ "✅ Test images created and saved in 'dataset/'\n"
1305
+ ]
1306
+ }
1307
+ ]
1308
+ },
1309
+ {
1310
+ "cell_type": "code",
1311
+ "source": [
1312
+ "import json\n",
1313
+ "\n",
1314
+ "# Expected text for each generated test image (img1.png to img3.png), in the same order as the texts drawn onto them\n",
1315
+ "labels = [\n",
1316
+ " {\"image\": \"img1.png\", \"expected\": \"help me\"},\n",
1317
+ " {\"image\": \"img2.png\", \"expected\": \"fire emergency\"},\n",
1318
+ " {\"image\": \"img3.png\", \"expected\": \"call 911\"}\n",
1319
+ "]\n",
1320
+ "\n",
1321
+ "# Save to labels.json\n",
1322
+ "with open(\"dataset/labels.json\", \"w\") as f:\n",
1323
+ " json.dump(labels, f, indent=2)\n",
1324
+ "\n",
1325
+ "print(\"✅ labels.json created in 'dataset/'\")\n"
1326
+ ],
1327
+ "metadata": {
1328
+ "id": "r5T_QvIo0jy-",
1329
+ "outputId": "cd7d66d2-8192-41d8-e38d-aa6f4c0e51a6",
1330
+ "colab": {
1331
+ "base_uri": "https://localhost:8080/"
1332
+ }
1333
+ },
1334
+ "execution_count": 6,
1335
+ "outputs": [
1336
+ {
1337
+ "output_type": "stream",
1338
+ "name": "stdout",
1339
+ "text": [
1340
+ "✅ labels.json created in 'dataset/'\n"
1341
+ ]
1342
+ }
1343
+ ]
1344
+ }
1345
+ ]
1346
+ }