HoliAntiSpoof: Audio LLM for Holistic Speech Anti-Spoofing
Paper • 2602.04535 • Published
How to use wsntxxn/HoliAntiSpoof with PEFT:
from peft import PeftModel
from transformers import AutoModelForCausalLM
base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Omni-7B")
model = PeftModel.from_pretrained(base_model, "wsntxxn/HoliAntiSpoof")
How to use wsntxxn/HoliAntiSpoof with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("text-generation", model="wsntxxn/HoliAntiSpoof")
messages = [
{"role": "user", "content": "Who are you?"},
]
pipe(messages)
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("wsntxxn/HoliAntiSpoof", dtype="auto")
How to use wsntxxn/HoliAntiSpoof with vLLM:
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "wsntxxn/HoliAntiSpoof"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "wsntxxn/HoliAntiSpoof",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
]
}'
# Alternatively, run the model with Docker:
docker model run hf.co/wsntxxn/HoliAntiSpoof
How to use wsntxxn/HoliAntiSpoof with SGLang:
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
--model-path "wsntxxn/HoliAntiSpoof" \
--host 0.0.0.0 \
--port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "wsntxxn/HoliAntiSpoof",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
]
}'
# Alternatively, start the SGLang server with Docker:
docker run --gpus all \
--shm-size 32g \
-p 30000:30000 \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=<secret>" \
--ipc=host \
lmsysorg/sglang:latest \
python3 -m sglang.launch_server \
--model-path "wsntxxn/HoliAntiSpoof" \
--host 0.0.0.0 \
--port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "wsntxxn/HoliAntiSpoof",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
]
}'
How to use wsntxxn/HoliAntiSpoof with Docker Model Runner:
docker model run hf.co/wsntxxn/HoliAntiSpoof
Use the code below to get started with the model.
import re
import torch
from transformers import Qwen2_5OmniProcessor, Qwen2_5OmniThinkerForConditionalGeneration
from peft import PeftModel
from qwen_omni_utils import process_mm_info
# Load the Qwen2.5-Omni thinker backbone, attach the HoliAntiSpoof PEFT
# adapter, and merge the adapter into the base weights for inference.
model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B",
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="flash_attention_2",
)
model = PeftModel.from_pretrained(model, "wsntxxn/HoliAntiSpoof")
model = model.merge_and_unload()

# Chat-style request: a system prompt followed by one user turn that carries
# the audio clip and the anti-spoofing instruction.
conversation = [
    {
        'role': 'system',
        'content': [
            {
                'type': 'text',
                'text': 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'
            }
        ]
    },
    {
        'role': 'user',
        'content': [
            {
                'type': 'audio',
                'audio': '/path/to/audio'  # replace with the clip to analyse
            },
            {
                'type': 'text',
                'text': 'Determine whether this audio clip is a spoof or not. If it is a spoof, determine the spoofing method and the spoofing region.' # without semantic analysis
                # 'text': 'Is this speech utterance a spoof or not? If it is, please provide the spoofing method, the spoofing time region, and the potential influence of the spoofing text content on the semantic meaning of the utterance.' # with semantic analysis
            }
        ]
    }
]

processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
tokenizer = processor.tokenizer

# Generation step whose scores are read for the real/fake logits below.
# NOTE(review): this presumes the verdict token is always produced at step 6
# and that generation runs at least that long -- confirm against the model's
# output format.
real_fake_seq_idx = 6
real_token_vocab_idx = tokenizer.convert_tokens_to_ids("real")
fake_token_vocab_idx = tokenizer.convert_tokens_to_ids("fake")

# The stop markers contain "|", which is the regex alternation operator, so
# they must be escaped; otherwise the pattern matches a lone "<" or ">" and
# truncates the decoded answer at the wrong place.
stop_strings = ["<|im_end|>", "<|endoftext|>"]
pattern = re.compile("|".join(re.escape(marker) for marker in stop_strings))

# Render the prompt text and collect the audio referenced by the conversation.
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, _, _ = process_mm_info(conversation, use_audio_in_video=False)
inputs = processor(
    text=text, audio=audios, images=None, videos=None, return_tensors="pt", padding=True, use_audio_in_video=False
)
inputs = inputs.to(model.device).to(model.dtype)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        tokenizer=tokenizer,
        stop_strings=stop_strings,
        output_scores=True,
        return_dict_in_generate=True,
    )

# Drop the prompt tokens so only the newly generated continuation is decoded.
output_trimmed = outputs.sequences[0, len(inputs["input_ids"][0]):]
output_text = tokenizer.decode(
    output_trimmed, skip_special_tokens=False, clean_up_tokenization_spaces=False
)

# Cut the decoded text at the first stop marker, if one is present.
matched = pattern.search(output_text)
if matched:
    output_text = output_text[:matched.start()]

# Scores for the "real" and "fake" vocabulary entries at the verdict step.
real_logit = outputs.scores[real_fake_seq_idx][0, real_token_vocab_idx].item()
fake_logit = outputs.scores[real_fake_seq_idx][0, fake_token_vocab_idx].item()
print(f"output: {output_text}, real logit: {real_logit:.3f}, fake logit: {fake_logit:.3f}")
BibTeX:
@article{xu2026holiantispoof,
title={HoliAntiSpoof: Audio LLM for Holistic Speech Anti-Spoofing},
author={Xu, Xuenan and Ren, Yiming and Liu, Liwei and Wu, Wen and Li, Baoxiang and Lu, Chaochao and Wang, Shuai and Zhang, Chao},
journal={arXiv preprint arXiv:2602.04535},
year={2026}
}
Base model
Qwen/Qwen2.5-Omni-7B