Tiny dummy models
Collection
Randomly initialized tiny models for debugging/testing purpose • 176 items • Updated • 6
How to use yujiepan/qwen2-audio-tiny-random with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("text-generation", model="yujiepan/qwen2-audio-tiny-random")
messages = [
{"role": "user", "content": "Who are you?"},
]
pipe(messages) # Load model directly
from transformers import AutoProcessor, AutoModelForSeq2SeqLM
processor = AutoProcessor.from_pretrained("yujiepan/qwen2-audio-tiny-random")
model = AutoModelForSeq2SeqLM.from_pretrained("yujiepan/qwen2-audio-tiny-random")
messages = [
{"role": "user", "content": "Who are you?"},
]
inputs = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))How to use yujiepan/qwen2-audio-tiny-random with vLLM:
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "yujiepan/qwen2-audio-tiny-random"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "yujiepan/qwen2-audio-tiny-random",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
]
}'docker model run hf.co/yujiepan/qwen2-audio-tiny-random
How to use yujiepan/qwen2-audio-tiny-random with SGLang:
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
--model-path "yujiepan/qwen2-audio-tiny-random" \
--host 0.0.0.0 \
--port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "yujiepan/qwen2-audio-tiny-random",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
]
}'docker run --gpus all \
--shm-size 32g \
-p 30000:30000 \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=<secret>" \
--ipc=host \
lmsysorg/sglang:latest \
python3 -m sglang.launch_server \
--model-path "yujiepan/qwen2-audio-tiny-random" \
--host 0.0.0.0 \
--port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "yujiepan/qwen2-audio-tiny-random",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
]
}'How to use yujiepan/qwen2-audio-tiny-random with Docker Model Runner:
docker model run hf.co/yujiepan/qwen2-audio-tiny-random
This model is for debugging. It is randomly initialized using the config from Qwen/Qwen2-Audio-7B-Instruct but with smaller size.
Codes:
import os
from typing import Dict
import requests
import torch
import transformers
from PIL import Image
from torchvision import io
from transformers import (AutoConfig, AutoModelForCausalLM, AutoProcessor,
AutoTokenizer, GenerationConfig,
Qwen2AudioForConditionalGeneration, pipeline,
set_seed)
model_id = "Qwen/Qwen2-Audio-7B-Instruct"
repo_id = "yujiepan/qwen2-audio-tiny-random"
save_path = f"/tmp/{repo_id}"
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
config.audio_config.encoder_layers = 2
config.audio_config.encoder_attention_heads = 2
config.audio_config.encoder_ffn_dim = 32
config.audio_config.d_model = 16
config.text_config.num_hidden_layers = 2
config.text_config.intermediate_size = 32
config.text_config.hidden_size = 16
config.text_config.num_attention_heads = 2
config.text_config.num_key_value_heads = 1
model = Qwen2AudioForConditionalGeneration(config=config)
model = model.to(torch.bfloat16).cuda().eval()
model.generation_config = GenerationConfig.from_pretrained(
model_id, trust_remote_code=True,
)
set_seed(42)
with torch.no_grad():
for _, p in sorted(model.named_parameters()):
torch.nn.init.uniform_(p, -0.3, 0.3)
processor = AutoProcessor.from_pretrained(model_id)
model.save_pretrained(save_path)
processor.save_pretrained(save_path)
os.system(f"ls -alh {save_path}")
def try_inference():
from io import BytesIO
from urllib.request import urlopen
import librosa
processor = AutoProcessor.from_pretrained(save_path)
model = Qwen2AudioForConditionalGeneration.from_pretrained(
save_path, device_map="auto")
conversation = [
{"role": "user", "content": [
{"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
]},
{"role": "assistant", "content": "Yes, the speaker is female and in her twenties."},
{"role": "user", "content": [
{"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"},
]},
]
text = processor.apply_chat_template(
conversation, add_generation_prompt=True, tokenize=False)
audios = []
for message in conversation:
if isinstance(message["content"], list):
for ele in message["content"]:
if ele["type"] == "audio":
audios.append(librosa.load(
BytesIO(urlopen(ele['audio_url']).read()),
sr=processor.feature_extractor.sampling_rate)[0]
)
inputs = processor(text=text, audios=audios,
return_tensors="pt", padding=True)
inputs.input_ids = inputs.input_ids.to("cuda")
generate_ids = model.generate(**inputs, max_length=256)
generate_ids = generate_ids[:, inputs.input_ids.size(1):]
response = processor.batch_decode(
generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(response)
try_inference()