Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

mhla_videogen/config.json +58 -0
mhla_videogen/config.yaml +317 -0
mhla_videogen/model.safetensors +3 -0

mhla_videogen/config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "model_type": "wan_t2v",
+  "model_name": "Wan_T2V_1300M",
+  "image_size": 256,
+  "video_width": 800,
+  "video_height": 480,
+  "num_frames": 81,
+  "patch_size": [
+    1,
+    2,
+    2
+  ],
+  "dim": 1536,
+  "ffn_dim": 8960,
+  "freq_dim": 256,
+  "num_heads": 12,
+  "num_layers": 30,
+  "window_size": [
+    -1,
+    -1
+  ],
+  "qk_norm": true,
+  "cross_attn_norm": true,
+  "eps": 1e-06,
+  "self_attn_type": "mhla",
+  "rope_after": true,
+  "rms_output": true,
+  "norm_output": false,
+  "mhla_adjust": true,
+  "without_rope": false,
+  "is_gated": false,
+  "is_lepe": false,
+  "ffn_type": "mlp",
+  "linear_attn_idx": [
+    1,
+    2,
+    4,
+    5,
+    7,
+    8,
+    10,
+    11,
+    13,
+    14,
+    16,
+    17,
+    19,
+    20,
+    22,
+    23,
+    25,
+    26,
+    28,
+    29
+  ],
+  "training_epoch": 2,
+  "training_step": 47000
+}

mhla_videogen/config.yaml ADDED Viewed

	@@ -0,0 +1,317 @@

+data:
+    data_dir:
+        toy_data: toy_data
+    caption_proportion:
+        prompt: 1
+    external_caption_suffixes: []
+    external_clipscore_suffixes: []
+    caption_selection_type: clipscore
+    clip_thr_temperature: 0.1
+    clip_thr: 25.0
+    del_img_clip_thr: 22.0
+    sort_dataset: false
+    load_text_feat: false
+    load_vae_feat: true
+    aspect_ratio_type: ASPECT_RATIO_VIDEO_256_MS
+    transform: default_train_video
+    type: SanaZipDatasetWithCache
+    image_size: 256
+    hq_only: false
+    valid_num: 0
+    data: null
+    num_frames: 81
+    extra: null
+    external_data_filter:
+        toy_data:
+            _image_quality:
+                min: 0.5
+                max: 1.0
+    motion_score_file_thres: {}
+    motion_score_cal_type: average
+    target_fps: 16
+    resample_fps: true
+    shuffle_dataset: false
+    vae_cache_dir:
+    json_cache_dir: null
+    load_first_frame: false
+model:
+    model: Wan_T2V_1300M
+    from_pretrained: null
+    load_model_ckpt:
+    init_patch_embedding: false
+    image_size: 256
+    video_width: 800
+    video_height: 480
+    num_frames: 81
+    patch_size:
+    - 1
+    - 2
+    - 2
+    dim: 1536
+    ffn_dim: 8960
+    freq_dim: 256
+    num_heads: 12
+    num_layers: 30
+    window_size:
+    - -1
+    - -1
+    qk_norm: true
+    cross_attn_norm: true
+    eps: 1.0e-06
+    mixed_precision: bf16
+    fp32_attention: true
+    load_from: null
+    resume_from:
+        checkpoint: latest
+        load_ema: false
+        resume_optimizer: true
+        resume_lr_scheduler: true
+    aspect_ratio_type: ASPECT_RATIO_VIDEO_480
+    multi_scale: false
+    class_dropout_prob: 0.1
+    guidance_type: classifier-free
+    mask: null
+    image_latent_mode: video_zero
+    linear_attn_idx:
+    - 1
+    - 2
+    - 4
+    - 5
+    - 7
+    - 8
+    - 10
+    - 11
+    - 13
+    - 14
+    - 16
+    - 17
+    - 19
+    - 20
+    - 22
+    - 23
+    - 25
+    - 26
+    - 28
+    - 29
+    self_attn_type: mhla
+    rope_after: true
+    rms_output: true
+    norm_output: false
+    mhla_adjust: true
+    without_rope: false
+    is_gated: false
+    is_lepe: false
+    block_layout: null
+    power: 1.0
+    ffn_type: mlp
+    attn_mask: null
+    diagonal_block_size: 1
+vae:
+    vae_type: WanVAE
+    vae_latent_dim: 16
+    vae_pretrained:
+    vae_stride:
+    - 4
+    - 8
+    - 8
+    vae_downsample_rate: 8
+    weight_dtype: bf16
+    extra: null
+    cache_dir: null
+    if_cache: false
+text_encoder:
+    t5_model: umt5_xxl
+    t5_dtype: bfloat16
+    text_len: 512
+    t5_checkpoint: null
+    t5_tokenizer: google/umt5-xxl
+    extra: null
+    caption_channels: 4096
+scheduler:
+    train_sampling_steps: 1000
+    predict_flow_v: true
+    noise_schedule: linear_flow
+    pred_sigma: false
+    learn_sigma: true
+    vis_sampler: flow_dpm-solver
+    flow_shift: 3.0
+    inference_flow_shift: null
+    weighting_scheme: logit_normal
+    weighting_scheme_discriminator: logit_normal_trigflow
+    add_noise_timesteps:
+    - 1.5708
+    logit_mean: 0.0
+    logit_std: 1.0
+    logit_mean_discriminator: 0.0
+    logit_std_discriminator: 1.0
+    mode_scale: 1.29
+    sigma_data: 1.0
+    p_low: null
+    p_high: null
+    timestep_norm_scale_factor: 1.0
+    pretrain_timestep_norm_scale_factor: 1.0
+    discrete_norm_timestep: false
+    extra: null
+train:
+    num_workers: 10
+    seed: 1
+    train_batch_size: 4
+    train_batch_size_image: 32
+    early_stop_hours: 100
+    num_epochs: 100
+    gradient_accumulation_steps: 1
+    grad_checkpointing: true
+    gradient_clip: 0.1
+    gc_step: 1
+    optimizer:
+        betas:
+        - 0.9
+        - 0.999
+        eps: 1.0e-10
+        lr: 2.0e-05
+        type: AdamW
+        weight_decay: 0.0
+    optimizer_D:
+        eps: 1.0e-10
+        lr: 0.0001
+        type: AdamW
+        weight_decay: 0.03
+    load_from_optimizer: false
+    load_from_lr_scheduler: false
+    resume_lr_scheduler: true
+    lr_schedule: constant
+    lr_schedule_args:
+        num_warmup_steps: 1000
+    auto_lr: null
+    ema_rate: 0.9999
+    eval_batch_size: 16
+    use_fsdp: false
+    use_flash_attn: false
+    eval_sampling_steps: 500
+    lora_rank: 4
+    log_interval: 20
+    mask_type: 'null'
+    mask_loss_coef: 0.0
+    load_mask_index: false
+    snr_loss: false
+    real_prompt_ratio: 1.0
+    save_image_epochs: 1
+    save_model_epochs: 5
+    save_model_steps: 500
+    visualize: true
+    null_embed_root: output/pretrained_models/
+    valid_prompt_embed_root: output/tmp_embed/
+    validation_prompts:
+    - soft lighting and warm colors infuse the image, creating a magical and serene
+        effect. the view captures the serene guru man levitating above the golden
+        sands, his long, flowing beard and simple robes gently swaying. his eyes are
+        closed, and a peaceful smile graces his calm face. a gentle glow surrounds
+        him, enhancing his aura of tranquility. behind him, the majestic pyramids
+        of egypt loom, bathed in the warm light of the setting sun. the softly glowing
+        sands and the pyramid silhouettes create a composition rich with spirituality
+        and enlightenment, exuding an atmosphere of profound calmness.
+    - a soft-focus view captures a serene garden, filled with cherry blossom trees
+        in bloom. at the center stands a beautiful japanese woman, portrayed in exquisite
+        detail, wearing a traditional kimono with intricate floral patterns in soft
+        pastel colors. her long, dark hair cascades elegantly down her back, enhancing
+        her gentle, serene expression. pink petals drift down in a light breeze, adding
+        to the garden's ethereal ambiance. sunlight filters through the leafy canopy,
+        casting dappled shadows that dance around her, subtly highlighting her serene
+        posture and the details of her kimono. the atmosphere is tranquil and picturesque,
+        enveloped in a sense of timeless beauty.
+    - the angle is mid-range, focusing on the well-dressed asian male friend sitting
+        comfortably in a modern and stylish cafe. the setting exudes warmth and elegance,
+        with soft music playing in the background to create an inviting atmosphere.
+        the man holds a small gift box in his hand, his face illuminated by a confident
+        smile as he looks around, his expression full of anticipation. his attire,
+        a blend of classic and contemporary style, complements the chic surroundings,
+        enhancing his poised demeanor. the ambient lighting accentuates his features,
+        making the scene lively and intimate.
+    - the setting is a training facility, brightly lit with mirrored walls and sprung
+        wooden floors. the scene starts with the camera panning slowly over the room,
+        capturing the boundless determination in action as a group of korean girls
+        rigorously practice their moves. each one is focused, their expressions marked
+        by the intensity of a long and arduous journey. they are seen rehearsing complex
+        choreography, with synchronized steps and practiced precision. alongside the
+        strenuous physical training, snippets of their vocal lessons are interwoven,
+        illustrating the multifaceted preparation involved. the atmosphere is one
+        of dedication and discipline, reflected in their commitment to rigorous exercise
+        and strict dietary habits. these scenes provide a glimpse into the grueling
+        yet passionate pursuit of their dreams.
+    - astronaut is riding a horse on the moon, wearing a space suit and helmet. the
+        horse is galloping across the lunar surface, leaving behind a trail of moon
+        dust. in the background, earth is visible in the black sky, a beautiful blue
+        and green marble. the astronaut is holding a flag with a logo on it, waving
+        it proudly as they ride. the scene is surreal and whimsical, capturing the
+        imagination of space exploration and adventure.
+    local_save_vis: true
+    deterministic_validation: true
+    online_metric: false
+    eval_metric_step: 2000
+    online_metric_dir: metric_helper
+    work_dir: output/debug
+    skip_step: 0
+    loss_type: huber
+    huber_c: 0.001
+    num_ddim_timesteps: 50
+    w_max: 15.0
+    w_min: 3.0
+    ema_decay: 0.95
+    debug_nan: false
+    ema_update: false
+    weight_loss: true
+    tangent_warmup_steps: 10000
+    scm_cfg_scale:
+    - 1.0
+    cfg_interval: null
+    scm_logvar_loss: true
+    norm_invariant_to_spatial_dim: true
+    norm_same_as_512_scale: false
+    g_norm_constant: 0.1
+    g_norm_r: 1.0
+    show_gradient: false
+    lr_scale: null
+    adv_lambda: 1.0
+    scm_loss: true
+    scm_lambda: 1.0
+    loss_scale: 1.0
+    r1_penalty: false
+    r1_penalty_weight: 1.0e-05
+    diff_timesteps_D: true
+    suffix_checkpoints: disc
+    misaligned_pairs_D: false
+    discriminator_loss: cross entropy
+    largest_timestep: 1.5708
+    train_largest_timestep: false
+    largest_timestep_prob: 0.5
+    reconstruct_loss: false
+    reconstruct_loss_type: huber
+    vis_grad: false
+    extra: null
+    offload_vae: false
+    offload_text_encoder: false
+    deepspeed_stage: null
+    sp_degree: 1
+    fsdp_config: null
+    fsdp_inference: false
+    train_la_only: false
+work_dir: output/debug
+resume_from: latest
+load_from: null
+debug: false
+caching: false
+report_to: wandb
+tracker_project_name: wan-video
+name: debug
+loss_report_name: loss
+task: t2v
+image_encoder:
+    image_encoder_type: null
+    image_encoder_pretrained: null
+    image_encoder_tokenizer: null
+    weight_dtype: float32
+    extra: null
+distill: null
+lora: null
+cfg_scale: 3.0

mhla_videogen/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0c5628e7d98e79ec5af71031446fc605be791b5bf5b2ceb53dc4b03cde8f6775
+size 5677997904