Upload folder using huggingface_hub
Browse files- config.json +142 -0
- model.safetensors +3 -0
- optimizer.pt +3 -0
- rng_state_0.pth +3 -0
- rng_state_1.pth +3 -0
- rng_state_2.pth +3 -0
- rng_state_3.pth +3 -0
- rng_state_4.pth +3 -0
- rng_state_5.pth +3 -0
- scheduler.pt +3 -0
- trainer_state.json +0 -0
- training_args.bin +3 -0
config.json
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_gradient_checkpointing": true,
|
| 3 |
+
"architectures": [
|
| 4 |
+
"MetaQuery"
|
| 5 |
+
],
|
| 6 |
+
"attn_implementation": "sdpa",
|
| 7 |
+
"audio_lora_dict": {
|
| 8 |
+
"adapter_name": "ovi_audio_lora",
|
| 9 |
+
"enable": true,
|
| 10 |
+
"lora_dropout": 0.05,
|
| 11 |
+
"network_alpha": 48,
|
| 12 |
+
"rank": 48,
|
| 13 |
+
"target_modules": [
|
| 14 |
+
"q",
|
| 15 |
+
"k",
|
| 16 |
+
"v",
|
| 17 |
+
"o",
|
| 18 |
+
"ffn.0",
|
| 19 |
+
"ffn.2"
|
| 20 |
+
]
|
| 21 |
+
},
|
| 22 |
+
"audio_loss_weight": 1.0,
|
| 23 |
+
"audio_uncond_ratio": 0.05,
|
| 24 |
+
"audio_vae_path": "./pretrained_models/Ovi/MMAudio/ext_weights/",
|
| 25 |
+
"audio_window_size": 2,
|
| 26 |
+
"connector_num_hidden_layers": 6,
|
| 27 |
+
"decode_noise_scale": 0.025,
|
| 28 |
+
"decode_timestep": 0.05,
|
| 29 |
+
"diffusion_forcing": {
|
| 30 |
+
"block_causal": false,
|
| 31 |
+
"enable": true,
|
| 32 |
+
"few_steps": false,
|
| 33 |
+
"is_neg_posi_share_kv": true,
|
| 34 |
+
"kv_save_step": -1,
|
| 35 |
+
"num_lookback_chunks": 2,
|
| 36 |
+
"prefix_timestep": -1,
|
| 37 |
+
"temp_chunk": 3
|
| 38 |
+
},
|
| 39 |
+
"diffusion_model_id": "Lightricks/LTX-Video",
|
| 40 |
+
"diffusion_model_path": "./pretrained_models/Ovi/Ovi/model_960x960_10s.safetensors",
|
| 41 |
+
"distill_loss_weight": 0.0,
|
| 42 |
+
"gen_fps": 25.0,
|
| 43 |
+
"has_context_prompt": {
|
| 44 |
+
"context_uncond_ratio": 0.025,
|
| 45 |
+
"enable": true,
|
| 46 |
+
"gt_t5_transcript_ratio": 1.0
|
| 47 |
+
},
|
| 48 |
+
"has_user_audio_input": {
|
| 49 |
+
"audio_uncond_ratio": 0.25,
|
| 50 |
+
"enable": false
|
| 51 |
+
},
|
| 52 |
+
"in_channels": 16,
|
| 53 |
+
"infer": false,
|
| 54 |
+
"joint_uncond_ratio": 0.0,
|
| 55 |
+
"latent_spatial_size": 28,
|
| 56 |
+
"latent_temporal_size": 36,
|
| 57 |
+
"local_files_only": true,
|
| 58 |
+
"lora_dict": {},
|
| 59 |
+
"loss_type": "ovi_flow",
|
| 60 |
+
"max_input_text_tokens": 256,
|
| 61 |
+
"mllm_id": "./pretrained_models/Qwen2.5-Omni-7B",
|
| 62 |
+
"mllm_local_path": null,
|
| 63 |
+
"mllm_lora_dict": {
|
| 64 |
+
"alpha": 64,
|
| 65 |
+
"dropout": 0.05,
|
| 66 |
+
"enable": false,
|
| 67 |
+
"rank": 32,
|
| 68 |
+
"target_modules": [
|
| 69 |
+
"q_proj",
|
| 70 |
+
"k_proj",
|
| 71 |
+
"v_proj",
|
| 72 |
+
"o_proj",
|
| 73 |
+
"gate_proj",
|
| 74 |
+
"up_proj",
|
| 75 |
+
"down_proj"
|
| 76 |
+
]
|
| 77 |
+
},
|
| 78 |
+
"model_type": "metaquery",
|
| 79 |
+
"modules_to_freeze": [
|
| 80 |
+
"vae_model_video",
|
| 81 |
+
"vae_model_audio",
|
| 82 |
+
"text_model",
|
| 83 |
+
"fusion_model",
|
| 84 |
+
"mllm.mllm_backbone"
|
| 85 |
+
],
|
| 86 |
+
"modules_to_unfreeze": [
|
| 87 |
+
"mllm.connector",
|
| 88 |
+
"mllm.mllm_backbone.model.embed_tokens"
|
| 89 |
+
],
|
| 90 |
+
"negative_text_prompt": "jitter, bad hands, blur, distortion, robotic, muffled, echo, distorted",
|
| 91 |
+
"noise_scheduler_id": "Lightricks/LTX-Video",
|
| 92 |
+
"noise_to_first_frame": 0.25,
|
| 93 |
+
"num_metaqueries": 128,
|
| 94 |
+
"omniavatar_dict": {
|
| 95 |
+
"init_lora_weights": "kaiming",
|
| 96 |
+
"lora_alpha": 64,
|
| 97 |
+
"lora_rank": 128,
|
| 98 |
+
"lora_target_modules": "to_q,to_k,to_v,to_out.0",
|
| 99 |
+
"weights_path": null
|
| 100 |
+
},
|
| 101 |
+
"prev_temp_chunk": 4,
|
| 102 |
+
"ref_uncond_ratio": 0.2,
|
| 103 |
+
"retify_to_last_frame": false,
|
| 104 |
+
"scheduler_id": "Lightricks/LTX-Video",
|
| 105 |
+
"scheduler_type": "euler",
|
| 106 |
+
"system_prompt": "You are a speech-video clip narrator for a text-to-audio-video (T2AV) model.\nThe input includes audio-visual dialogue between the Interlocutor and the Agent. The dialogue context is provided ONLY to infer what occurs next and must NOT be described, referenced, or summarized explicitly.\nYour task is to write a narration describing the NEXT recorded speech and video clip that shows ONLY the AGENT. Follow these rules precisely:\nVISUAL RULES: - Provide a HIGH-DETAIL appearance sentence, covering: (a) Gender + precise clothing (material/color/style, e.g., \"velvet head covering with ruffled accent\"), (b) Distinctive features (eyeliner/beard/accessories), (c) Scene composition (framing/lighting/color tone/backdrop, e.g., \"stark black and white lighting against plain curtains\"). - Describe ONLY visible physical actions in strict chronological order: (a) Facial muscles (e.g., \"lips curve upward forming a subtle smile\"), (b) Eye movements (e.g., \"gaze drifts downward for two seconds\"), (c) Posture/gestures (e.g., \"thumb brushes beard thoughtfully\"). - Emotions MUST derive from observable cues: CORRECT: \"eyes crinkling at the corners in a warm smile\" (visible) WRONG: \"she feels happy\" (internal) or \"looks thoughtful\" (abstract).\nSPEECH RULES: - Determine speech occurrence based on PROVIDED TARGETS or NATURAL CONVERSATION FLOW: - IF TARGETS ARE PROVIDED: (a) If \"[Target State]: Listening\" is provided in the context, describe silent actions ONLY without any speech content and do not use <S><E> tags. (b) If \"[Target Speech]: [Text]\" is provided, use the provided [Text] exactly as full speech content. - IF TARGETS ARE NOT PROVIDED: (a) If dialogue context implies continued listening or non-verbal reaction, describe silent actions ONLY without any speech content and do not use <S><E> tags. (b) If dialogue context implies the agent WILL SPEAK NEXT (even after brief listening actions), generate speech. 
(c) When speech needs to be generated: (1) Infer the speech content EXCLUSIVELY from dialogue context. (2) The speech content must be a RESPONSE that moves the conversation forward, but word overlap with the end of the agent\u2019s speech context is STRICTLY FORBIDDEN. - GENERAL SPEECH STYLE (Only for speech existing in the narration): (b) Wrap each segment in <S> and <E> (speech contents are broken down into multiple segments when there are too many words for one segment) in the MIDDLE of the narration. (c) For each speech segment, embed it NATURALLY within action descriptions BEFORE and AFTER speech: CORRECT: \"meeting the camera, <S>I have considered your proposal<E> he brushes his thumb against his beard.\" WRONG: separate speech/action blocks.\nAUDIO RULES: - Add \"Audio: [description]\" at the VERY END of the paragraph, separated by a single space. - If no speech occurs (means no <S><E> tags exist), use \"SILENT\" for the audio description. - Otherwise, describe ONLY the agent\u2019s audio characteristics: (a) Voice traits: Describe the voice of the AGENT only. If the agent has spoken in the prior context, describe that voice in detail (pitch/tone/pace/texture, e.g., \"low-pitched with slight rasp\"). If the agent has NOT yet spoken in the context, you MUST ignore the interlocutor\u2019s voice entirely and infer a new, plausible voice for the AGENT EXCLUSIVELY based on his visual appearance (age, gender, physique). (b) Emotional progression (e.g., \"reflective shifting to joyful tone\"). (c) Recording environment ONLY if it affects vocal quality (e.g., \"dry and direct, quiet room\"). - NEVER mention background sounds, other voices, or specific locations (e.g., \"gift shop\"). 
- Never mix or attribute the interlocutor\u2019s vocal traits to the agent.\nTEMPORAL & STYLE RULES: - Chronological flow: [Full appearance sentence] \u2192 [Pre-speech actions] \u2192 [<S>Speech Content<E>] \u2192 [Post-speech actions] \u2192 (repeat for multiple speech segments if any) \u2192 Audio line - Appearance sentence must start with appearance-based identifier (e.g., \"A woman in a blue dress...\"), then use pronouns (she/he/they). - Use PRESENT TENSE throughout (e.g., \"she wears\", \"he gestures\"). - Write in THIRD-PERSON, single continuous paragraph. NO bullet points, headings, or line breaks. - NEVER use: agent, assistant, interlocutor, respond, reply, listen, understand.\nOUTPUT EXAMPLES (CRITICAL FOR FORMATTING): EXAMPLE 1 (Natural pause before speech, no target provided): \"A man in a grey sweatshirt stands before dark red curtains. His eyes track the interlocutor for two seconds, meeting the camera with a steady gaze, <S>I have been considering your proposal all week.<E> straightening his shoulders as his thumb brushes his beard thoughtfully. Audio: Calm low-pitched male voice with steady rhythm and slight rasp, recorded in a quiet room.\"\nEXAMPLE 2 (Silent reaction only, no target provided): \"A woman in a dark velvet head covering sits in stark black and white lighting. Her gaze drifts downward for three seconds, eyelids lowering slowly. Fingers tap once on the armrest as her head tilts upward with a subtle smile, eyes crinkling at the corners while maintaining silence. Audio: SILENT.\"\nEXAMPLE 3 (Multiple speech segments with emotional progression, no target provided):\nEXAMPLE 4 (Target provided and speech break into multiple segments, [Target Speech]: \"I completely understand your frustration with the current situation, but we need to stay focused on the long-term goals of the project.\"): \"A woman in a silk blouse sits against a bright window. She nods slowly as the interlocutor finishes, then takes a deep breath. 
Meeting the camera with a steady gaze, <S>I completely understand your frustration with the current situation,<E> her expression softens. Pausing briefly to adjust her posture, she continues, <S>but we need to stay focused on the long-term goals of the project.<E> She remains attentive, her eyes reflecting deep thought while awaiting a reply. Audio: Articulate female voice with a calm, reassuring tone and steady pace, recorded in a quiet environment.\"\nEXAMPLE 5 (Target provided and contains [Target State]: Listening): \"A man in a flannel shirt leans back against a brick wall. He tilts his head slightly to the right, eyes tracking the interlocutor\u2019s movements. His brow furrows momentarily in concentration, lips pressed together in a neutral line. He maintains a steady, receptive posture, nodding once in acknowledgement while remaining silent. Audio: SILENT.\"\nNEVER OUTPUT LIKE THIS (common failures): - \"...last words of agent\u2019s context speech. <S>Last words of context, then new content.<E>\" // Repeating context - \"The agent looks thoughtful after hearing the question.\" // Banned words + internal state - \"<S>I agree<E> <S>But I don\u2019t think it\u2019s a good idea<E>\" // Short speech broken down into multiple segments and not embedded in actions - \"Audio: Quiet room with distant traffic.\" // Illegal background sound - \"A man speaks about his recovery.\" // Invented topic (must derive from context) - \"She has a kind face.\" // Abstract trait without visible cues",
|
| 107 |
+
"text_encoder_id": "./pretrained_models/Ovi/Wan2.2-TI2V-5B",
|
| 108 |
+
"text_encoder_local_path": null,
|
| 109 |
+
"text_encoder_path": "./pretrained_models/Ovi/Wan2.2-TI2V-5B/models_t5_umt5-xxl-enc-bf16.pth",
|
| 110 |
+
"text_prompt": "A realistic video of a person communicating with another person by front-facing camera, with dynamic facial expression and rhythmic head motion that complement his talking or listening responses.",
|
| 111 |
+
"tokenizer_local_path": null,
|
| 112 |
+
"torch_dtype": "bfloat16",
|
| 113 |
+
"transformers_version": "4.52.3",
|
| 114 |
+
"use_context": true,
|
| 115 |
+
"use_ref_as_sink": true,
|
| 116 |
+
"use_region_loss": {
|
| 117 |
+
"enable": false,
|
| 118 |
+
"eye": 0.3,
|
| 119 |
+
"head": 0.2,
|
| 120 |
+
"mouth": 0.5
|
| 121 |
+
},
|
| 122 |
+
"use_target_audio": true,
|
| 123 |
+
"vae_downsample_f": 16,
|
| 124 |
+
"vae_downsample_t": 4,
|
| 125 |
+
"vae_id": "Lightricks/LTX-Video",
|
| 126 |
+
"vae_path": "./pretrained_models/Ovi/Wan2.2-TI2V-5B/Wan2.2_VAE.pth",
|
| 127 |
+
"video_lora_dict": {
|
| 128 |
+
"adapter_name": "ovi_vid_lora",
|
| 129 |
+
"enable": true,
|
| 130 |
+
"lora_dropout": 0.05,
|
| 131 |
+
"network_alpha": 48,
|
| 132 |
+
"rank": 48,
|
| 133 |
+
"target_modules": [
|
| 134 |
+
"q",
|
| 135 |
+
"k",
|
| 136 |
+
"v",
|
| 137 |
+
"o",
|
| 138 |
+
"ffn.0",
|
| 139 |
+
"ffn.2"
|
| 140 |
+
]
|
| 141 |
+
}
|
| 142 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:014c55f7260de9d6755471a599ff1754717a40bddd931133a057077b16aeb72e
|
| 3 |
+
size 4104261360
|
optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7d0d006937312190474aa3773412bdef4cf627f6eb8b64ebd854476ae12405d
|
| 3 |
+
size 8209260421
|
rng_state_0.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa261cbb5e88b9bfadd27671e436fe97a795a4739ddea2b876b574329db9361f
|
| 3 |
+
size 15877
|
rng_state_1.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:99d29687b707549492d1604e6c8ddd72d298b1405d2cf6655b532cd2954e55f0
|
| 3 |
+
size 15877
|
rng_state_2.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88333570899354c3735dbce725b661681be58e38257e22325a9bd2bf9a8a8ff1
|
| 3 |
+
size 15877
|
rng_state_3.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5b32cea7bc1fa86e248ddddf4411abed21788a45ef70650f90a4773090207dc7
|
| 3 |
+
size 15877
|
rng_state_4.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f5f45b75074fd4fca78715bed645fd0d8d2321b8834a0adf1866bc259975d7b2
|
| 3 |
+
size 15877
|
rng_state_5.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ec5bcb3e474f12546e50cae2bcadafed7d48dbfdaa220e233981e1d784da7515
|
| 3 |
+
size 15877
|
scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa0d0b12b796f9cc33f5f516cce55034d726ff49f3b3864f07a2454c3fdda119
|
| 3 |
+
size 1465
|
trainer_state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:de0f7801d7d19f12abf1fb80df97d159bb65838f0735db65b41332cea4be004d
|
| 3 |
+
size 6545
|