Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

config.json +102 -0
model.safetensors +3 -0
optimizer.pt +3 -0
rng_state_0.pth +3 -0
rng_state_1.pth +3 -0
rng_state_2.pth +3 -0
rng_state_3.pth +3 -0
rng_state_4.pth +3 -0
rng_state_5.pth +3 -0
rng_state_6.pth +3 -0
rng_state_7.pth +3 -0
scheduler.pt +3 -0
trainer_state.json +0 -0
training_args.bin +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,102 @@

+{
+  "_gradient_checkpointing": true,
+  "architectures": [
+    "MetaQuery"
+  ],
+  "attn_implementation": "sdpa",
+  "audio_uncond_ratio": 0.05,
+  "audio_window_size": 2,
+  "connector_num_hidden_layers": 8,
+  "context_uncond_ratio": 0.2,
+  "decode_noise_scale": 0.025,
+  "decode_timestep": 0.05,
+  "diffusion_forcing": {
+    "block_causal": false,
+    "enable": true,
+    "few_steps": false,
+    "is_neg_posi_share_kv": true,
+    "kv_save_step": -1,
+    "num_lookback_chunks": 2,
+    "prefix_timestep": -1,
+    "ref_uncond_ratio": 0.1,
+    "temp_chunk": 3,
+    "use_ref_as_sink": true
+  },
+  "diffusion_model_id": "Lightricks/LTX-Video",
+  "diffusion_model_path": "./pretrained_models/Wan2.1-T2V-14B/",
+  "gen_fps": 25.0,
+  "in_channels": 16,
+  "infer": false,
+  "joint_uncond_ratio": 0.1,
+  "latent_spatial_size": 56,
+  "latent_temporal_size": 16,
+  "local_files_only": true,
+  "lora_dict": {
+    "adapter_name": "context",
+    "enable": true,
+    "network_alpha": 128,
+    "rank": 128,
+    "target_modules": [
+      "q",
+      "k",
+      "v",
+      "o",
+      "ffn.0",
+      "ffn.2"
+    ]
+  },
+  "loss_type": "flow",
+  "max_input_text_tokens": 256,
+  "mllm_id": "./pretrained_models/Qwen2.5-Omni-7B",
+  "mllm_local_path": null,
+  "model_type": "metaquery",
+  "modules_to_freeze": [
+    "audio_encoder",
+    "pipe.vae",
+    "pipe.dit",
+    "pipe.text_encoder",
+    "mllm.mllm_backbone"
+  ],
+  "modules_to_unfreeze": [
+    "mllm.connector",
+    "mllm.mllm_backbone.model.embed_tokens"
+  ],
+  "negative_text_prompt": "Vivid color tones, background/camera moving quickly, screen switching, subtitles and special effects, mutation, overexposed, static, blurred details, abrupt and extreme head shift, subtitles, style, work, painting, image, still, overall grayish, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn face, deformed, disfigured, malformed limbs, fingers merging, motionless image, chaotic background, three legs, crowded background with many people, walking backward",
+  "noise_scheduler_id": "Lightricks/LTX-Video",
+  "noise_to_first_frame": 0.25,
+  "num_metaqueries": 128,
+  "omniavatar_dict": {
+    "init_lora_weights": "kaiming",
+    "lora_alpha": 64,
+    "lora_rank": 128,
+    "lora_reinit": false,
+    "lora_target_modules": "q,k,v,o,ffn.0,ffn.2",
+    "merged": true,
+    "weights_path": "./pretrained_models/OmniAvatar-14B/pytorch_model.pt"
+  },
+  "prev_temp_chunk": 4,
+  "retify_to_last_frame": true,
+  "scheduler_id": "Lightricks/LTX-Video",
+  "scheduler_type": "euler",
+  "system_prompt": "You are a specialized system for generating descriptions of an agent's non-verbal behavior during face-to-face conversations. Your output will be used as learnable textual control tokens for a video generation model.\nInput Format: You will receive two historical context segments: [First] Agent's historical context (audio always available; video often unavailable): - Recent audio of the agent's speech or vocalization - Recent agent video frames if available (in most cases not present) [Second] User's historical context (audio always available; video may be available): - Recent audio of the user's speech - Recent user video frames when provided\n[Output] - Produce non-verbal behavior description encoded as learnable textual cues - This description should predict the agent's upcoming head pose tendencies, micro-movements, gaze patterns, and facial dynamics\nKey Principles: - React naturally to the conversational dynamics captured in [Second] (user's timing, tone, emotional state, visual cues if available) - Focus solely on the agent's visible head region: head orientation, gaze direction, facial muscle movements - Do not assume any specific upcoming speech timing, duration, or transitions, as future audio is unknown\nOutput Structure (Text after each colon explains how to answer that item): 1. State: Indicate whether the agent is more likely to be in a \"Speaking\", \"Listening\" or \"Turn-switching\"(rapid shifts expected) state based on the historical conversational flow. If unclear, choose the state suggested by the most recent history. 2. Overall demeanor: Briefly characterize the intended non-verbal attitude (e.g., engaged, thoughtful, friendly, contemplative, etc.) 3. Arousal level: From low to high, which determines the intensity, speed, and amplitude of the agent's movements described below 4. Head motion: Describe the agent's upcoming natural head movements (posture maintenance, nods, tilts, rhythmic adjustments, etc.) 5. Gaze: Outline expected gaze behavior, specifying focus targets (e.g., direct eye contact, averting gaze for thought, scanning), referencing to the user\u2019s location if their video is present in [Second]. 6. Facial expression: Describe the agent\u2019s facial expression tendencies, covering the likely baseline (neutral, expressive, or relaxed, etc.) and potential changes in the eyes, mouth, or eyebrows that correspond to the tone and intensity of the conversation. 7. Listening / Speaking Behavior: - If listening: give a more detailed description of active listening behavior with above aspects - If speaking: give a concise description of speech-related coordination with above aspects\nConstraints: - Do NOT transcribe or summarize speech content - Do NOT describe the user's behavior, appearance and background - Do NOT describe the agent's appearance, background - Focus on the upcoming response behavior in detail",
+  "text_encoder_id": "./pretrained_models/Wan2.1-T2V-14B",
+  "text_encoder_local_path": null,
+  "text_encoder_path": "./pretrained_models/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.pth",
+  "text_prompt": "A realistic video of a person communicating with another person by front-facing camera, with dynamic facial expression and rhythmichead motion that complement his talking or listening responses.",
+  "tokenizer_local_path": null,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.3",
+  "use_context": true,
+  "use_region_loss": {
+    "enable": false,
+    "eye": 0.3,
+    "head": 0.2,
+    "mouth": 0.5
+  },
+  "use_target_audio": true,
+  "vae_downsample_f": 8,
+  "vae_downsample_t": 4,
+  "vae_id": "Lightricks/LTX-Video",
+  "vae_path": "./pretrained_models/Wan2.1-T2V-14B/Wan2.1_VAE.pth",
+  "wav2vec_path": "./pretrained_models/wav2vec2-base-960h"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a86df145ad681d84eed00375f1baf76a9b711952e7fc1fac5cb9a17bfe88e76
+size 3465287048

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:57f56778d58f255346e1428bd71b9374f94fa83e54aa28d70d21a51ce1a7a24a
+size 6931161524

rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b371c76d3a12ca423aa5dbbd3ec2bafc19de4c20c4b9dd06285d6127e8d8638
+size 16389

rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:43f854c9fb767921e25aa1b3894ff165baadee2166d4752d758febc790909d3d
+size 16389

rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b7736fb612d59ef87a0e9a60fc7ededddf27ce4fbe0c8a7b8af022bf418f604a
+size 16389

rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:030c48620b1d8864fa197defe20550b200efb42aab9374eefed3135fdd25f919
+size 16389

rng_state_4.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd9013cebcf5b9095fe8a54eb281f126d567cfaf4037f3bfb2211da5e59b1faf
+size 16389

rng_state_5.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b156a90ea8d3274fde8c6028e25d18e1098699e4e5115e347368a408c2c09b3a
+size 16389

rng_state_6.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b60ca1e0da875a37e38cc28af99674602c0c9db8a6a4202817561ef59ce65af7
+size 16389

rng_state_7.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4240bfb48d6759b1e4c393dc4f26cccb1e6e8619b6e8f8b6e9fb4078208b8491
+size 16389

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d10983ea22496cb85733e8a649c58f9a63decf9d05d1b318b06136c3e64e90a5
+size 1465

trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:955f7d7bfb861f692ca588ef2ab045c1e67871fb4f63c4dea96395365a567bbc
+size 6225