ZhenYe234 committed on
Commit
9cdba85
·
verified ·
1 Parent(s): 9e7d42c

Add Talker-T2AV + WhisperX-VAE weights (model params only, no optimizer state)

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ talker-t2av/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,76 @@
1
  ---
2
  license: apache-2.0
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
+ language:
4
+ - zh
5
+ - en
6
+ tags:
7
+ - talking-head
8
+ - text-to-video
9
+ - audio-video-generation
10
+ - autoregressive
11
+ - diffusion
12
+ library_name: transformers
13
  ---
14
+
15
+ # Talker-T2AV
16
+
17
+ **Joint Talking Audio-Video Generation with Autoregressive Diffusion Modeling**
18
+
19
+ [Paper (arXiv 2604.23586)](https://arxiv.org/abs/2604.23586) ·
20
+ [Code (GitHub)](https://github.com/zhenye234/Talker-T2AV) ·
21
+ [Samples](https://talker-t2av.github.io/)
22
+
23
+ This repository hosts the pretrained weights for the paper
24
+ "Talker-T2AV: Joint Talking Audio-Video Generation with Autoregressive
25
+ Diffusion Modeling".
26
+
27
+ ## Contents
28
+
29
+ ```
30
+ talker-t2av/
31
+ model.safetensors ← AR backbone (Qwen3-0.6B) + dual diffusion heads
32
+ + Patch Transformer Encoder + Stop Predictor
33
+ (3.12 B params, 2.77 GB)
34
+ config.json
35
+ chat_template.jinja
36
+ tokenizer.json
37
+ tokenizer_config.json
38
+
39
+ whisperx-vae/
40
+ model.ckpt ← WhisperX-VAE audio autoencoder
41
+ (32-d, 25 Hz; Whisper-Large-v3 encoder + DAC backbone)
42
+ ```
43
+
44
+ For the LIA-X video motion autoencoder (40-d motion, 25 Hz), see
45
+ [wyhsirius/LIA-X](https://github.com/wyhsirius/LIA-X) — its weights are NOT
46
+ hosted here.
47
+
48
+ ## Quickstart
49
+
50
+ ```bash
51
+ git clone https://github.com/zhenye234/Talker-T2AV.git
52
+ cd Talker-T2AV
53
+
54
+ # put the weights in place
55
+ huggingface-cli download HKUSTAudio/Talker-T2AV --local-dir ./hf_weights
56
+ mkdir -p ./ckpts && ln -s "$(pwd)/hf_weights/talker-t2av" ./ckpts/checkpoint-229954
57
+ export WHISPERVAE_CKPT="$(pwd)/hf_weights/whisperx-vae/model.ckpt"
58
+
59
+ python infer.py
60
+ ```
61
+
62
+ See the [GitHub README](https://github.com/zhenye234/Talker-T2AV) for full
63
+ installation and reproduction instructions.
64
+
65
+ ## Citation
66
+
67
+ ```bibtex
68
+ @article{ye2026talkert2av,
69
+ title = {Talker-T2AV: Joint Talking Audio-Video Generation with Autoregressive Diffusion Modeling},
70
+ author = {Ye, Zhen and Tan, Xu and Yin, Aoxiong and Lin, Hongzhan and
71
+ Zhang, Guangyan and Sun, Peiwen and Li, Yiming and
72
+ Chan, Chi-Min and Ye, Wei and Zhang, Shikun and Xue, Wei},
73
+ journal = {arXiv preprint arXiv:2604.23586},
74
+ year = {2026}
75
+ }
76
+ ```
talker-t2av/chat_template.jinja ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- if enable_thinking is defined and enable_thinking is false %}
87
+ {{- '<think>\n\n</think>\n\n' }}
88
+ {%- endif %}
89
+ {%- endif %}
talker-t2av/config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SpeechLLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": 151645,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention"
44
+ ],
45
+ "max_position_embeddings": 40960,
46
+ "max_window_layers": 28,
47
+ "model_type": "qwen3",
48
+ "num_attention_heads": 16,
49
+ "num_hidden_layers": 28,
50
+ "num_key_value_heads": 8,
51
+ "pad_token_id": 151643,
52
+ "rms_norm_eps": 1e-06,
53
+ "rope_parameters": {
54
+ "rope_theta": 1000000,
55
+ "rope_type": "default"
56
+ },
57
+ "sliding_window": null,
58
+ "tie_word_embeddings": true,
59
+ "transformers_version": "5.3.0",
60
+ "use_cache": false,
61
+ "use_sliding_window": false,
62
+ "vocab_size": 151936
63
+ }
talker-t2av/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:893e130e88c03b22179dd4efba6628ed5eb3c03d1637931aedff4b78fc590dc3
3
+ size 2770431192
talker-t2av/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:507f9cdc7200bbbf57240103221ab4ae4f888dfe5154ef245a24e6c3ec1b26b3
3
+ size 11423312
talker-t2av/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<SPEECH_FRAME>",
10
+ "<MOTION_FRAME>"
11
+ ],
12
+ "is_local": false,
13
+ "model_max_length": 1000,
14
+ "pad_token": "<|endoftext|>",
15
+ "padding_side": "right",
16
+ "split_special_tokens": false,
17
+ "tokenizer_class": "Qwen2Tokenizer",
18
+ "unk_token": null
19
+ }
whisperx-vae/model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6def379348fb6b5fb716fa2187a9bc8b5a15c8beae0d2644c0ca4fe2c398d904
3
+ size 6907516457