HoliAntiSpoof: Audio LLM for Holistic Speech Anti-Spoofing
Paper: arXiv 2602.04535
Use the code below to get started with the model.
import re
import torch
from transformers import Qwen2_5OmniProcessor, Qwen2_5OmniThinkerForConditionalGeneration
from peft import PeftModel
from qwen_omni_utils import process_mm_info
# Load the Qwen2.5-Omni thinker backbone, attach the HoliAntiSpoof LoRA
# adapter, and merge the adapter weights into the base model for inference.
base_model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B",
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="flash_attention_2",
)
model = PeftModel.from_pretrained(base_model, "wsntxxn/HoliAntiSpoof").merge_and_unload()
# Chat-template input: one system turn, then one user turn that carries the
# audio clip plus the anti-spoofing instruction.
SYSTEM_PROMPT = (
    "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
    "capable of perceiving auditory and visual inputs, as well as generating "
    "text and speech."
)
# Prompt without semantic analysis. To also request an analysis of how the
# spoofed content changes the utterance's meaning, use this variant instead:
# "Is this speech utterance a spoof or not? If it is, please provide the
#  spoofing method, the spoofing time region, and the potential influence of
#  the spoofing text content on the semantic meaning of the utterance."
USER_PROMPT = (
    "Determine whether this audio clip is a spoof or not. If it is a spoof, "
    "determine the spoofing method and the spoofing region."
)
conversation = [
    {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": "/path/to/audio"},
            {"type": "text", "text": USER_PROMPT},
        ],
    },
]
# Processor/tokenizer handles; the token ids are used later to read the raw
# "real" vs "fake" logits out of the generation scores.
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
tokenizer = processor.tokenizer
# Position of the verdict token within the generated sequence.
# NOTE(review): the hard-coded offset 6 assumes a fixed response prefix
# before the "real"/"fake" token — confirm against the model's output format.
real_fake_seq_idx = 6
real_token_vocab_idx, fake_token_vocab_idx = (
    tokenizer.convert_tokens_to_ids(tok) for tok in ("real", "fake")
)
# Regex used to strip a trailing end-of-turn marker from the decoded text.
# The metacharacters must be escaped: the unescaped "<|im_end|>|<|endoftext|>"
# parses as the alternation of "<", "im_end", ">", "<", "endoftext", ">",
# so it would truncate the output at the first bare "<" or ">" anywhere.
pattern = re.compile(r"<\|im_end\|>|<\|endoftext\|>")
# Render the conversation through the chat template, gather the audio
# inputs, and build the batched tensors on the model's device and dtype.
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, _, _ = process_mm_info(conversation, use_audio_in_video=False)
inputs = processor(
    text=text,
    audio=audios,
    images=None,
    videos=None,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=False,
).to(model.device).to(model.dtype)
# Generate the answer, keeping per-step scores so the "real"/"fake" logits
# at the verdict position can be inspected afterwards.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        tokenizer=tokenizer,
        stop_strings=["<|im_end|>", "<|endoftext|>"],
        output_scores=True,
        return_dict_in_generate=True,
    )

# Decode only the newly generated tokens (drop the prompt prefix).
prompt_len = len(inputs["input_ids"][0])
generated_ids = outputs.sequences[0, prompt_len:]
output_text = tokenizer.decode(
    generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
)

# Cut the text at the first end-of-turn marker, if one survived decoding.
end_marker = pattern.search(output_text)
if end_marker is not None:
    output_text = output_text[: end_marker.start()]

# Raw logits of the "real" vs "fake" tokens at the verdict position.
real_logit = outputs.scores[real_fake_seq_idx][0, real_token_vocab_idx].item()
fake_logit = outputs.scores[real_fake_seq_idx][0, fake_token_vocab_idx].item()
print(f"output: {output_text}, real logit: {real_logit:.3f}, fake logit: {fake_logit:.3f}")
BibTeX:
@article{xu2026holiantispoof,
title={HoliAntiSpoof: Audio LLM for Holistic Speech Anti-Spoofing},
author={Xu, Xuenan and Ren, Yiming and Liu, Liwei and Wu, Wen and Li, Baoxiang and Lu, Chaochao and Wang, Shuai and Zhang, Chao},
journal={arXiv preprint arXiv:2602.04535},
year={2026}
}
Base model
Qwen/Qwen2.5-Omni-7B