snake7gun commited on
Commit
098dceb
·
verified ·
1 Parent(s): 5899b79

Delete README.md

Browse files
Files changed (1) hide show
  1. README.md +0 -211
README.md DELETED
@@ -1,211 +0,0 @@
---
library_name: transformers
pipeline_tag: image-text-to-text
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
base_model:
- Qwen/Qwen3-VL-8B-Thinking
---

This tiny model is intended for debugging. It is randomly initialized using the configuration adapted from [Qwen/Qwen3-VL-8B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-8B-Thinking).

### Example usage:

```python
import numpy as np
import torch
import transformers
from PIL import Image
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    Qwen3VLForConditionalGeneration,
)

model_id = "tiny-random/qwen3-vl"
model = Qwen3VLForConditionalGeneration.from_pretrained(
    model_id, dtype=torch.bfloat16, device_map="cuda",
    attn_implementation="flash_attention_2",
)
processor = AutoProcessor.from_pretrained(model_id)
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Preparation for inference
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=32)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
```

### Codes to create this repo:

```python
import json
from pathlib import Path

import accelerate
import torch
from huggingface_hub import file_exists, hf_hub_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    GenerationConfig,
    # Qwen3VLMoeForConditionalGeneration,
    Qwen3VLForConditionalGeneration,
    set_seed,
)

source_model_id = "Qwen/Qwen3-VL-8B-Thinking"
save_folder = "/tmp/tiny-random/qwen3-vl"

processor = AutoProcessor.from_pretrained(source_model_id, trust_remote_code=True)
processor.save_pretrained(save_folder)

with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config_json = json.load(f)

config_json['text_config'].update({
    'head_dim': 32,
    'hidden_size': 8,
    'intermediate_size': 64,
    'moe_intermediate_size': 64,
    'num_hidden_layers': 2,
    'num_attention_heads': 8,
    'num_key_value_heads': 4,
})
config_json['text_config']['rope_scaling']['mrope_section'] = [8, 4, 4]
config_json['vision_config'].update(
    {
        'hidden_size': 32 * 4,
        'intermediate_size': 64,
        'num_heads': 4,
        'out_hidden_size': 8,
        'depth': 6,
        'deepstack_visual_indexes': [1, 3, 5],
    }
)
with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)

config = AutoConfig.from_pretrained(
    save_folder,
    trust_remote_code=True,
)
print(config)
torch.set_default_dtype(torch.bfloat16)
model = Qwen3VLForConditionalGeneration(config)
torch.set_default_dtype(torch.float32)
if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
    model.generation_config = GenerationConfig.from_pretrained(
        source_model_id, trust_remote_code=True,
    )
model.generation_config.do_sample = True
print(model.generation_config)
model = model.cpu()
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.1)
        print(name, p.shape)
model.save_pretrained(save_folder)
```

### Printing the model:

```text
Qwen3VLForConditionalGeneration(
  (model): Qwen3VLModel(
    (visual): Qwen3VLVisionModel(
      (patch_embed): Qwen3VLVisionPatchEmbed(
        (proj): Conv3d(3, 128, kernel_size=(2, 16, 16), stride=(2, 16, 16))
      )
      (pos_embed): Embedding(2304, 128)
      (rotary_pos_emb): Qwen3VLVisionRotaryEmbedding()
      (blocks): ModuleList(
        (0-5): 6 x Qwen3VLVisionBlock(
          (norm1): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
          (norm2): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
          (attn): Qwen3VLVisionAttention(
            (qkv): Linear(in_features=128, out_features=384, bias=True)
            (proj): Linear(in_features=128, out_features=128, bias=True)
          )
          (mlp): Qwen3VLVisionMLP(
            (linear_fc1): Linear(in_features=128, out_features=64, bias=True)
            (linear_fc2): Linear(in_features=64, out_features=128, bias=True)
            (act_fn): PytorchGELUTanh()
          )
        )
      )
      (merger): Qwen3VLVisionPatchMerger(
        (norm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
        (linear_fc1): Linear(in_features=512, out_features=512, bias=True)
        (act_fn): GELU(approximate='none')
        (linear_fc2): Linear(in_features=512, out_features=8, bias=True)
      )
      (deepstack_merger_list): ModuleList(
        (0-2): 3 x Qwen3VLVisionPatchMerger(
          (norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
          (linear_fc1): Linear(in_features=512, out_features=512, bias=True)
          (act_fn): GELU(approximate='none')
          (linear_fc2): Linear(in_features=512, out_features=8, bias=True)
        )
      )
    )
    (language_model): Qwen3VLTextModel(
      (embed_tokens): Embedding(151936, 8)
      (layers): ModuleList(
        (0-1): 2 x Qwen3VLTextDecoderLayer(
          (self_attn): Qwen3VLTextAttention(
            (q_proj): Linear(in_features=8, out_features=256, bias=False)
            (k_proj): Linear(in_features=8, out_features=128, bias=False)
            (v_proj): Linear(in_features=8, out_features=128, bias=False)
            (o_proj): Linear(in_features=256, out_features=8, bias=False)
            (q_norm): Qwen3VLTextRMSNorm((32,), eps=1e-06)
            (k_norm): Qwen3VLTextRMSNorm((32,), eps=1e-06)
          )
          (mlp): Qwen3VLTextMLP(
            (gate_proj): Linear(in_features=8, out_features=64, bias=False)
            (up_proj): Linear(in_features=8, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=8, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Qwen3VLTextRMSNorm((8,), eps=1e-06)
          (post_attention_layernorm): Qwen3VLTextRMSNorm((8,), eps=1e-06)
        )
      )
      (norm): Qwen3VLTextRMSNorm((8,), eps=1e-06)
      (rotary_emb): Qwen3VLTextRotaryEmbedding()
    )
  )
  (lm_head): Linear(in_features=8, out_features=151936, bias=False)
)
```