prithivMLmods committed · Commit 8c63fde · verified · 1 Parent(s): e924001

Upload Notebook (#2)

- Upload Notebook (5c914829d3d32d267149f93a2144e19f83b4a20f)

Caption3o-XL-2B-Qwen2VL/Caption3o_XL_2B_Qwen2VL.ipynb ADDED
@@ -0,0 +1,375 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "uFovmijgUV1Z"
+   },
+   "source": [
+    "## **Caption3o-XL-2B-Qwen2VL**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "sSu_X_ddr6TZ"
+   },
+   "source": [
+    "Caption3o-XL-2B-Qwen2VL is a fine-tuned version of Qwen2-VL-2B-Instruct, tailored for image captioning and vision-language attribution. It is designed to generate precise, highly descriptive captions that define visual properties, object attributes, and scene details across a wide spectrum of images and aspect ratios.\n",
+    "\n",
+    "Because it is fine-tuned specifically for vision-language attribution, the model attributes and defines the visual properties of objects, scenes, and environments more precisely than generic captioners, and it delivers high-fidelity descriptions even for artistic, technical, abstract, or low-context images. It performs robustly across aspect ratios (wide, tall, square, or irregular) and supports variable detail control: the prompt structure determines whether it produces a concise summary or a fine-grained attribution, as the minimal example at the end of this notebook shows. Built on the Qwen2-VL-2B-Instruct architecture, it leverages strong multimodal reasoning for visual comprehension and instruction following; it defaults to English but can be adapted for multilingual captioning through prompt engineering.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "RugX4SGZV-8O"
+   },
+   "source": [
+    "### **Install packages**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "l-NtFtjSpuJQ"
+   },
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "!pip install git+https://github.com/huggingface/transformers.git \\\n",
+    "    git+https://github.com/huggingface/accelerate.git \\\n",
+    "    git+https://github.com/huggingface/peft.git \\\n",
+    "    transformers-stream-generator huggingface_hub albumentations \\\n",
+    "    pyvips-binary qwen-vl-utils sentencepiece opencv-python docling-core \\\n",
+    "    python-docx torchvision safetensors matplotlib num2words\n",
+    "\n",
+    "!pip install xformers requests pymupdf hf_xet spaces pyvips pillow gradio \\\n",
+    "    einops torch fpdf timm av decord bitsandbytes reportlab\n",
+    "# Hold tight, this will take around 1-2 minutes."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "mvoSnRZcVBu4"
+   },
+   "source": [
+    "### **Run Demo App**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "tElKr2Fkp1bO"
+   },
+   "outputs": [],
+   "source": [
+    "import gradio as gr\n",
+    "import spaces\n",
+    "from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer\n",
+    "from qwen_vl_utils import process_vision_info\n",
+    "import torch\n",
+    "from PIL import Image\n",
+    "import os\n",
+    "import uuid\n",
+    "import io\n",
+    "from threading import Thread\n",
+    "from reportlab.lib.pagesizes import A4\n",
+    "from reportlab.lib.styles import getSampleStyleSheet\n",
+    "from reportlab.lib import colors\n",
+    "from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer\n",
+    "from reportlab.lib.units import inch\n",
+    "from reportlab.pdfbase import pdfmetrics\n",
+    "from reportlab.pdfbase.ttfonts import TTFont\n",
+    "import docx\n",
+    "from docx.enum.text import WD_ALIGN_PARAGRAPH\n",
+    "\n",
+    "# Define model options\n",
+    "MODEL_OPTIONS = {\n",
+    "    \"Caption3o-XL-2B-Qwen2VL\": \"prithivMLmods/Caption3o-XL-2B-Qwen2VL\",\n",
+    "}\n",
+    "\n",
+    "# Preload models and processors into CUDA\n",
+    "models = {}\n",
+    "processors = {}\n",
+    "for name, model_id in MODEL_OPTIONS.items():\n",
+    "    print(f\"Loading {name}🤗. Hold tight, this will take around 4-6 minutes...\")\n",
+    "    models[name] = Qwen2VLForConditionalGeneration.from_pretrained(\n",
+    "        model_id,\n",
+    "        trust_remote_code=True,\n",
+    "        torch_dtype=torch.float16\n",
+    "    ).to(\"cuda\").eval()\n",
+    "    processors[name] = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)\n",
+    "\n",
+    "image_extensions = Image.registered_extensions()\n",
+    "\n",
+    "def identify_and_save_blob(blob_path):\n",
+    "    \"\"\"Identifies if the blob is an image and saves it.\"\"\"\n",
+    "    try:\n",
+    "        with open(blob_path, 'rb') as file:\n",
+    "            blob_content = file.read()\n",
+    "        try:\n",
+    "            Image.open(io.BytesIO(blob_content)).verify()  # Check if it's a valid image\n",
+    "            extension = \".png\"  # Default to PNG for saving\n",
+    "            media_type = \"image\"\n",
+    "        except (IOError, SyntaxError):\n",
+    "            raise ValueError(\"Unsupported media type. Please upload a valid image.\")\n",
+    "\n",
+    "        filename = f\"temp_{uuid.uuid4()}_media{extension}\"\n",
+    "        with open(filename, \"wb\") as f:\n",
+    "            f.write(blob_content)\n",
+    "\n",
+    "        return filename, media_type\n",
+    "\n",
+    "    except FileNotFoundError:\n",
+    "        raise ValueError(f\"The file {blob_path} was not found.\")\n",
+    "    except Exception as e:\n",
+    "        raise ValueError(f\"An error occurred while processing the file: {e}\")\n",
+    "\n",
+    "@spaces.GPU\n",
+    "def qwen_inference(model_name, media_input, text_input=None):\n",
+    "    \"\"\"Handles inference for the selected model.\"\"\"\n",
+    "    model = models[model_name]\n",
+    "    processor = processors[model_name]\n",
+    "\n",
+    "    if isinstance(media_input, str):\n",
+    "        media_path = media_input\n",
+    "        if media_path.endswith(tuple(image_extensions.keys())):\n",
+    "            media_type = \"image\"\n",
+    "        else:\n",
+    "            try:\n",
+    "                media_path, media_type = identify_and_save_blob(media_input)\n",
+    "            except Exception as e:\n",
+    "                raise ValueError(\"Unsupported media type. Please upload a valid image.\") from e\n",
+    "\n",
+    "    messages = [\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": [\n",
+    "                {\n",
+    "                    \"type\": media_type,\n",
+    "                    media_type: media_path\n",
+    "                },\n",
+    "                {\"type\": \"text\", \"text\": text_input},\n",
+    "            ],\n",
+    "        }\n",
+    "    ]\n",
+    "\n",
+    "    text = processor.apply_chat_template(\n",
+    "        messages, tokenize=False, add_generation_prompt=True\n",
+    "    )\n",
+    "    image_inputs, _ = process_vision_info(messages)\n",
+    "    inputs = processor(\n",
+    "        text=[text],\n",
+    "        images=image_inputs,\n",
+    "        padding=True,\n",
+    "        return_tensors=\"pt\",\n",
+    "    ).to(\"cuda\")\n",
+    "\n",
+    "    streamer = TextIteratorStreamer(\n",
+    "        processor.tokenizer, skip_prompt=True, skip_special_tokens=True\n",
+    "    )\n",
+    "    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)\n",
+    "\n",
+    "    thread = Thread(target=model.generate, kwargs=generation_kwargs)\n",
+    "    thread.start()\n",
+    "\n",
+    "    buffer = \"\"\n",
+    "    for new_text in streamer:\n",
+    "        buffer += new_text\n",
+    "        # Remove <|im_end|> or similar tokens from the output\n",
+    "        buffer = buffer.replace(\"<|im_end|>\", \"\")\n",
+    "        yield buffer\n",
+    "\n",
+    "def format_plain_text(output_text):\n",
+    "    \"\"\"Formats the output text as plain text without LaTeX delimiters.\"\"\"\n",
+    "    # Remove LaTeX delimiters and convert to plain text\n",
+    "    plain_text = output_text.replace(\"\\\\(\", \"\").replace(\"\\\\)\", \"\").replace(\"\\\\[\", \"\").replace(\"\\\\]\", \"\")\n",
+    "    return plain_text\n",
+    "\n",
+    "def generate_document(media_path, output_text, file_format, font_size, line_spacing, alignment, image_size):\n",
+    "    \"\"\"Generates a document with the input image and plain text output.\"\"\"\n",
+    "    plain_text = format_plain_text(output_text)\n",
+    "    if file_format == \"pdf\":\n",
+    "        return generate_pdf(media_path, plain_text, font_size, line_spacing, alignment, image_size)\n",
+    "    elif file_format == \"docx\":\n",
+    "        return generate_docx(media_path, plain_text, font_size, line_spacing, alignment, image_size)\n",
+    "\n",
+    "def generate_pdf(media_path, plain_text, font_size, line_spacing, alignment, image_size):\n",
+    "    \"\"\"Generates a PDF document.\"\"\"\n",
+    "    filename = f\"output_{uuid.uuid4()}.pdf\"\n",
+    "    doc = SimpleDocTemplate(\n",
+    "        filename,\n",
+    "        pagesize=A4,\n",
+    "        rightMargin=inch,\n",
+    "        leftMargin=inch,\n",
+    "        topMargin=inch,\n",
+    "        bottomMargin=inch\n",
+    "    )\n",
+    "    styles = getSampleStyleSheet()\n",
+    "    styles[\"Normal\"].fontSize = int(font_size)\n",
+    "    styles[\"Normal\"].leading = int(font_size) * line_spacing\n",
+    "    styles[\"Normal\"].alignment = {\n",
+    "        \"Left\": 0,\n",
+    "        \"Center\": 1,\n",
+    "        \"Right\": 2,\n",
+    "        \"Justified\": 4\n",
+    "    }[alignment]\n",
+    "\n",
+    "    story = []\n",
+    "\n",
+    "    # Add image with size adjustment\n",
+    "    image_sizes = {\n",
+    "        \"Small\": (200, 200),\n",
+    "        \"Medium\": (400, 400),\n",
+    "        \"Large\": (600, 600)\n",
+    "    }\n",
+    "    img = RLImage(media_path, width=image_sizes[image_size][0], height=image_sizes[image_size][1])\n",
+    "    story.append(img)\n",
+    "    story.append(Spacer(1, 12))\n",
+    "\n",
+    "    # Add plain text output\n",
+    "    text = Paragraph(plain_text, styles[\"Normal\"])\n",
+    "    story.append(text)\n",
+    "\n",
+    "    doc.build(story)\n",
+    "    return filename\n",
+    "\n",
+    "def generate_docx(media_path, plain_text, font_size, line_spacing, alignment, image_size):\n",
+    "    \"\"\"Generates a DOCX document.\"\"\"\n",
+    "    filename = f\"output_{uuid.uuid4()}.docx\"\n",
+    "    doc = docx.Document()\n",
+    "\n",
+    "    # Add image with size adjustment\n",
+    "    image_sizes = {\n",
+    "        \"Small\": docx.shared.Inches(2),\n",
+    "        \"Medium\": docx.shared.Inches(4),\n",
+    "        \"Large\": docx.shared.Inches(6)\n",
+    "    }\n",
+    "    doc.add_picture(media_path, width=image_sizes[image_size])\n",
+    "    doc.add_paragraph()\n",
+    "\n",
+    "    # Add plain text output\n",
+    "    paragraph = doc.add_paragraph()\n",
+    "    paragraph.paragraph_format.line_spacing = line_spacing\n",
+    "    paragraph.paragraph_format.alignment = {\n",
+    "        \"Left\": WD_ALIGN_PARAGRAPH.LEFT,\n",
+    "        \"Center\": WD_ALIGN_PARAGRAPH.CENTER,\n",
+    "        \"Right\": WD_ALIGN_PARAGRAPH.RIGHT,\n",
+    "        \"Justified\": WD_ALIGN_PARAGRAPH.JUSTIFY\n",
+    "    }[alignment]\n",
+    "    run = paragraph.add_run(plain_text)\n",
+    "    run.font.size = docx.shared.Pt(int(font_size))\n",
+    "\n",
+    "    doc.save(filename)\n",
+    "    return filename\n",
+    "\n",
+    "# CSS for output styling\n",
+    "css = \"\"\"\n",
+    "#output {\n",
+    "    height: 500px;\n",
+    "    overflow: auto;\n",
+    "    border: 1px solid #ccc;\n",
+    "}\n",
+    ".submit-btn {\n",
+    "    background-color: #cf3434 !important;\n",
+    "    color: white !important;\n",
+    "}\n",
+    ".submit-btn:hover {\n",
+    "    background-color: #ff2323 !important;\n",
+    "}\n",
+    ".download-btn {\n",
+    "    background-color: #35a6d6 !important;\n",
+    "    color: white !important;\n",
+    "}\n",
+    ".download-btn:hover {\n",
+    "    background-color: #22bcff !important;\n",
+    "}\n",
+    "\"\"\"\n",
+    "\n",
+    "# Gradio app setup\n",
+    "with gr.Blocks(css=css, theme=\"bethecloud/storj_theme\") as demo:\n",
+    "    gr.Markdown(\"# **Caption3o-XL-2B-Qwen2VL**\")\n",
+    "\n",
+    "    with gr.Tab(label=\"Image Input\"):\n",
+    "\n",
+    "        with gr.Row():\n",
+    "            with gr.Column():\n",
+    "                model_choice = gr.Dropdown(\n",
+    "                    label=\"Model Selection\",\n",
+    "                    choices=list(MODEL_OPTIONS.keys()),\n",
+    "                    value=\"Caption3o-XL-2B-Qwen2VL\"\n",
+    "                )\n",
+    "                input_media = gr.File(\n",
+    "                    label=\"Upload Image\", type=\"filepath\"\n",
+    "                )\n",
+    "                text_input = gr.Textbox(label=\"Question\", value=\"Caption the image precisely.\")\n",
+    "                submit_btn = gr.Button(value=\"Submit\", elem_classes=\"submit-btn\")\n",
+    "\n",
+    "            with gr.Column():\n",
+    "                output_text = gr.Textbox(label=\"Output Text\", lines=7)\n",
+    "\n",
+    "                with gr.Accordion(\"Plain Text\", open=False):\n",
+    "                    plain_text_output = gr.Textbox(label=\"Standardized Plain Text\", lines=10)\n",
+    "\n",
+    "        submit_btn.click(\n",
+    "            qwen_inference, [model_choice, input_media, text_input], [output_text]\n",
+    "        ).then(\n",
+    "            format_plain_text, [output_text], [plain_text_output]\n",
+    "        )\n",
+    "\n",
+    "        with gr.Accordion(\"Docx/PDF Settings\", open=False):\n",
+    "            with gr.Row():\n",
+    "                with gr.Column():\n",
+    "                    line_spacing = gr.Dropdown(\n",
+    "                        choices=[0.5, 1.0, 1.15, 1.5, 2.0, 2.5, 3.0],\n",
+    "                        value=1.5,\n",
+    "                        label=\"Line Spacing\"\n",
+    "                    )\n",
+    "                    font_size = gr.Dropdown(\n",
+    "                        choices=[\"8\", \"10\", \"12\", \"14\", \"16\", \"18\", \"20\", \"22\", \"24\"],\n",
+    "                        value=\"16\",\n",
+    "                        label=\"Font Size\"\n",
+    "                    )\n",
+    "                    alignment = gr.Dropdown(\n",
+    "                        choices=[\"Left\", \"Center\", \"Right\", \"Justified\"],\n",
+    "                        value=\"Justified\",\n",
+    "                        label=\"Text Alignment\"\n",
+    "                    )\n",
+    "                    image_size = gr.Dropdown(\n",
+    "                        choices=[\"Small\", \"Medium\", \"Large\"],\n",
+    "                        value=\"Medium\",\n",
+    "                        label=\"Image Size\"\n",
+    "                    )\n",
+    "                    file_format = gr.Radio([\"pdf\", \"docx\"], label=\"File Format\", value=\"pdf\")\n",
+    "\n",
+    "            get_document_btn = gr.Button(value=\"Get Document\", elem_classes=\"download-btn\")\n",
+    "\n",
+    "        get_document_btn.click(\n",
+    "            generate_document, [input_media, output_text, file_format, font_size, line_spacing, alignment, image_size], gr.File(label=\"Download Document\")\n",
+    "        )\n",
+    "\n",
+    "demo.launch(debug=True)"
+   ]
+  },
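+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### **Minimal Inference (No UI)**\n",
+    "\n",
+    "A minimal sketch of calling the model directly, without the Gradio app, to illustrate the prompt-based detail control described above. It assumes the cells above have already run (so `models`, `processors`, and `process_vision_info` exist) and uses `example.jpg` as a placeholder path for an image of your own."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Minimal sketch: direct captioning without the UI.\n",
+    "# `example.jpg` is a placeholder; replace it with a real image path.\n",
+    "model = models[\"Caption3o-XL-2B-Qwen2VL\"]\n",
+    "processor = processors[\"Caption3o-XL-2B-Qwen2VL\"]\n",
+    "\n",
+    "def caption(image_path, prompt, max_new_tokens=512):\n",
+    "    messages = [{\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": [\n",
+    "            {\"type\": \"image\", \"image\": image_path},\n",
+    "            {\"type\": \"text\", \"text\": prompt},\n",
+    "        ],\n",
+    "    }]\n",
+    "    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
+    "    image_inputs, _ = process_vision_info(messages)\n",
+    "    inputs = processor(text=[text], images=image_inputs, padding=True, return_tensors=\"pt\").to(\"cuda\")\n",
+    "    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)\n",
+    "    # Strip the prompt tokens so only the generated caption is decoded.\n",
+    "    trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)]\n",
+    "    return processor.batch_decode(trimmed, skip_special_tokens=True)[0]\n",
+    "\n",
+    "# A short prompt yields a concise summary; a more specific prompt yields a fine-grained attribution.\n",
+    "print(caption(\"example.jpg\", \"Caption the image in one sentence.\"))\n",
+    "print(caption(\"example.jpg\", \"Describe the image in detail, attributing colors, materials, and spatial layout.\"))"
+   ]
+  }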
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "T4",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}