jaeikkim committed
Commit eb98644 · 1 Parent(s): 91a0722

Final Samples

MMaDA/inference/demo/space_demo.yaml ADDED
@@ -0,0 +1,354 @@
+ wandb:
+   entity: null
+   # run_id: askkz9i2
+   resume: 'auto'
+
+ experiment:
+   project: "omada-instruction-tuning_0204"
+   name: "omada-instruction-tuning_0204"
+   output_dir: "/dataset/omada/ckpt/it-0204"
+   max_train_examples_t2i: 40000000
+   max_train_examples_mmu: 40000000
+   save_every: 500
+   eval_every: 99999999999999
+   generate_every: 1000000000
+   log_every: 1
+   log_grad_norm_every: 100
+   resume_from_checkpoint: "latest"
+
+ model:
+   vq_model_image:
+     type: "magvitv2"
+     vq_model_name: "/dataset/omada/ckpt/showlab/magvitv2"
+   ### Omada ###############################################################
+   vq_model_audio:
+     type: "emova"
+     vq_model_name: "/dataset/omada/ckpt/Emova-ollm/emova_speech_tokenizer_hf"
+   omada:
+     tokenizer_path: "/dataset/omada/ckpt/it-0204/checkpoint-0207/unwrapped_model"
+     local_files_only: true
+     # pretrained_model_path: "Gen-Verse/MMaDA-8B-MixCoT"
+     pretrained_model_path: "/dataset/omada/ckpt/it-0204/checkpoint-0207/unwrapped_model"
+     w_clip_vit: False
+     new_vocab_size: 138752
+     llm_vocab_size: 126464
+     codebook_size: 8192
+     num_vq_tokens: 1024
+     num_new_special_tokens: 0 # v2s, s2s, i2i
+     tie_word_embeddings: False
+   #########################################################################
+
+   gradient_checkpointing: True
+
+ dataset:
+   gen_type: "pass"
+   und_type: "pass"
+   combined_loader_mode: "max_size_cycle"
+   params:
+     train_t2i_shards_path_or_url: "/data_storage/shared/datasets/imagenet-1k/data/train"
+     train_mmu_shards_path_or_url: [ "/data_storage/shared/datasets/SA-1B/sa_{000000..000999}.tar",
+       "/data_storage/shared/datasets/cc12m/raw/raw/{0000..0999}.tar",
+       "/data_storage/shared/datasets/laion-aesthetics-12m/{00000..00999}.tar"
+     ]
+     train_lm_shards_path_or_url: "/data_storage/shared/datasets/falcon-refinedweb/data/data/*.parquet"
+     add_caption_prompt: True
+     external_caption_path: "/data_storage/shared/datasets/SAM-LLaVA-Captions10M"
+     external_journeydb_caption_path: "/data_storage/shared/datasets/journeydb_anno/train_journeydb_anno.json"
+     external_laion12m_caption_path: "/data_storage/shared/datasets/laion-aesthetic-12m-captions"
+     external_cc12m_caption_path: "/data_storage/shared/datasets/cc12m/captions"
+     validation_prompts_file: "validation_prompts/imagenet_prompts.txt"
+     mmu_image_root: "/data_storage/ty/MMaDA/mmu_validation"
+     ### Omada ###############################################################
+     video_root: "/home/work/AIDAS/data/video/openvid1m/video/video"
+     video_speech_dataset:
+       sample_mode: "exclusive"
+       sample_method: "uniform_sequential"
+       v2t_sample_method: "uniform_sequential"
+       use_precomputed_tokens: true
+       precomputed_tokens_root: "/home/work/AIDAS/cache/openvid_speech_tokens"
+       index_cache_path: "/home/work/AIDAS/cache/video_speech_index.pt"
+       max_video_seconds: 10
+     llavavid_max_video_seconds: 10
+     llavavid_path: "/dataset/omada/datasets/video/LLaVA-Video-178K"
+     llavavid_local_files_only: true
+     llavavid_skip_configs:
+       - "llava_hound"
+       - "0_30_s_activitynetqa"
+       - "30_60_s_activitynetqa"
+       - "1_2_m_activitynetqa"
+       - "2_3_m_activitynetqa"
+       - "0_30_s_activitynet"
+       - "30_60_s_activitynet"
+       - "1_2_m_activitynet"
+       - "2_3_m_activitynet"
+     llavavid_skip_video_patterns:
+       - "activitynet"
+     use_llavavid: false
+     llavavid_max_samples: 500000
+     llavavid_sample_seed: 42
+     sharegptvideo_sft_path:
+       - "/dataset/omada/datasets/ShareGPTVideo/video_instruction/train/qa/chatgpt_qa_240k_sft_frames.jsonl"
+       # - "/dataset/omada/datasets/video/vlmeval_sft_train_20f.jsonl"
+       # - "/dataset/omada/datasets/video/vlmeval_sft_train_20f_no_videomme.jsonl"
+       # - "/dataset/omada/datasets/video/vlmeval_sft_train_20f_temp_act.jsonl"
+       - "/dataset/omada/datasets/video/vlmeval_sft_train_20f_mv_mme_corr.jsonl"
+     sharegptvideo_num_frames: 5
+     sharegptvideo_sample_method: "uniform_sequential"
+     sharegptvideo_strip_video_token: true
+     sharegptvideo_require_video: true
+     # video_dataset_name: "openvid1m"
+     hqedit_split: "train"
+     t2i_dataset: "prompt_image_jsonl+basic_edit_jsonl+dpg_jsonl"
+     # t2i_dataset: "basic_edit_jsonl+dpg_jsonl"
+     t2i_split: "train"
+     t2i_dataset_name: "jackyhate/text-to-image-2M"
+     flux_reason_dataset_name: "LucasFang/FLUX-Reason-6M"
+     flux_reason_score_threshold: 8.0
+     flux_reason_local_files_only: true
+     pickapic_dataset_name: "Min-Jaewon/pickapic-v2"
+     ultraedit_dataset_name: "BleachNick/UltraEdit_500k"
+     ultraedit_local_files_only: true
+     journeydb_jsonl_path: "/home/work/AIDAS/data/JourneyDB/data/train/train_anno_realease_repath.jsonl"
+     journeydb_image_root: "/home/work/AIDAS/data/JourneyDB/data/train"
+     journeydb_local_files_only: true
+     prompt_image_jsonl:
+       jsonl_path: "/dataset/omada/datasets/t2i/prompt_image_geneval_pass.jsonl"
+       prompt_keys: ["prompt", "query"]
+       image_keys: ["image_path", "image"]
+       skip_missing: true
+       cache_path: "dataset/omada/datasets/t2i/prompt_image_geneval_pass_0114.cache.jsonl"
+       max_samples: null
+       seed: 42
+     dpg_jsonl:
+       jsonl_path: "/dataset/omada/datasets/t2i/combined_dpg.jsonl"
+       prompt_keys: ["prompt", "query"]
+       image_keys: ["image_path", "image"]
+       skip_missing: true
+       cache_path: "/dataset/omada/datasets/t2i/combined_dpg.cache.jsonl"
+       max_samples: null
+       seed: 42
+     i2i_prompt_image_jsonl:
+       jsonl_path:
+         - "/dataset/omada/datasets/i2i/basic_edit_all_pair_pass.jsonl"
+         - "/dataset/omada/datasets/ImgEdit/Singleturn/ImgEdit_pairs_from_parquet_300k.jsonl"
+       prompt_keys: ["prompt"]
+       image_keys: ["image_path"]
+       skip_missing: true
+       cache_path: "/dataset/omada/datasets/i2i/basic_edit_all_pair_pass.cache_0114.jsonl"
+       max_samples: null
+       seed: 42
+     t2i_local_files_only: true
+     openimage_i2i:
+       sft_jsonl: "/home/work/AIDAS/data/openimage_source_images/sft_with_local_source_image_path.jsonl"
+       pref_jsonl: "/home/work/AIDAS/data/openimage_source_images/pref_with_local_source_image_path.jsonl"
+       multi_turn_jsonl: "/home/work/AIDAS/data/openimage_source_images/multi-turn_with_local_source_image_path.jsonl"
+       image_root: "/home/work/AIDAS/data/nano_edited_images"
+       prefer_summarized_text: true
+       pref_positive_only: true
+       skip_missing: true
+       max_samples_per_source: null
+       max_total_samples: null
+       seed: 42
+     hf_instruction_lm:
+       split: "all"
+       max_samples_per_source: 1000000
+       max_total_samples: 20000000
+       seed: 42
+     gsm8k_aug:
+       split: "all"
+       seed: 42
+       train_files:
+         - "/dataset/omada/datasets/lm/GSM8K/train_aug/google_gemma-3-27b-it/train.csv"
+         - "/dataset/omada/datasets/lm/GSM8K/train_aug/Qwen_Qwen3-30B-A3B-Instruct-2507/train.csv"
+         - "/dataset/omada/datasets/lm/GSM8K/train_aug/Qwen_Qwen3-32B/train.csv"
+         - "/dataset/omada/datasets/lm/MATH/train_aug/google_gemma-3-27b-it/train.csv"
+         - "/dataset/omada/datasets/lm/MATH/train_aug/Qwen_Qwen3-30B-A3B-Instruct-2507/train.csv"
+         - "/dataset/omada/datasets/lm/MATH/train_aug/Qwen_Qwen3-32B/train.csv"
+       test_files:
+         - "/dataset/omada/datasets/lm/GSM8K/test_aug/google_gemma-3-27b-it/test.csv"
+         - "/dataset/omada/datasets/lm/GSM8K/test_aug/Qwen_Qwen3-30B-A3B-Instruct-2507/test.csv"
+         - "/dataset/omada/datasets/lm/GSM8K/test_aug/Qwen_Qwen3-32B/test.csv"
+         - "/dataset/omada/datasets/lm/MATH/test_aug/google_gemma-3-27b-it/test.csv"
+         - "/dataset/omada/datasets/lm/MATH/test_aug/Qwen_Qwen3-30B-A3B-Instruct-2507/test.csv"
+         - "/dataset/omada/datasets/lm/MATH/test_aug/Qwen_Qwen3-32B/test.csv"
+       include_reasoning: true
+       include_answer: false
+       max_total_samples: null
+     # mmlu_aux:
+     #   dataset_dir: "/dataset/omada/datasets/lm/MMLU"
+     #   seed: 42
+     #   max_total_samples: null
+     #   add_ntm: true
+     #   split: "val+test"
+     # gpqa_train:
+     #   dataset_dir: "/dataset/omada/datasets/lm/GPQA/train"
+     #   seed: 42
+     #   max_total_samples: null
+     #   answer_mode: "label_text"
+     # arc_c_train:
+     #   dataset_dir: "/dataset/omada/datasets/lm/ARC/ARC-Challenge"
+     #   seed: 42
+     #   max_total_samples: null
+     #   split: "all"
+     reasoning_sft_csv:
+       csv_path: "/dataset/omada/datasets/lm/filtered/1024_trimmed_aug_datasets.csv"
+       seed: 42
+       max_total_samples: null
+     speech2speech:
+       - name: "instructs2s_200k_en"
+         wav_pairs_file: "/dataset/omada/datasets/speech/InstructS2S-200K/en/wav/pairs.txt"
+         use_precomputed_tokens: true
+         precomputed_tokens_root: "/dataset/omada/datasets/speech_tokens/instructs2s_200k_en"
+       - name: "instructs2s_eval"
+         wav_pairs_file: "/dataset/omada/datasets/speech/instructs2s_eval_whisper_with_assistant.pairs.txt"
+         use_precomputed_tokens: true
+         precomputed_tokens_root: "/dataset/omada/datasets/speech_tokens/instructs2s_eval"
+     mmu_interleaved:
+       # - jsonl_path: "/dataset/omada/datasets/mmbench_test_pseudo_cambrian_shared.jsonl"
+       - jsonl_path: "/dataset/omada/datasets/mmbench_test_pseudo_cambrian_shared_wrongdup.jsonl"
+       # - jsonl_path: ""
+         image_root: "/"
+         resolution: 480
+       # - dataset_name: "lmms-lab/POPE"
+       #   split: "test"
+       #   resolution: 480
+       #   cache_dir: "/dataset/omada/datasets"
+       #   local_files_only: true
+       - dataset_name: "lmms-lab/MME"
+         split: "test"
+         resolution: 480
+         cache_dir: "/dataset/omada/datasets"
+         local_files_only: true
+         answer_noise_prob: 0.50
+         answer_noise_seed: 42
+         answer_noise_strategy: "swap"
+       # - dataset_name: "lmms-lab/MMBench_EN"
+       #   split: "dev"
+       #   resolution: 480
+       #   cache_dir: "/dataset/omada/datasets"
+       #   local_files_only: true
+       # - dataset_name: "lmms-lab/MMMU"
+       #   split: "all_except_test"
+       #   resolution: 480
+       #   cache_dir: "/dataset/omada/datasets"
+       #   local_files_only: true
+       # - dataset_name: "GQA_TestDev_Balanced"
+       #   gqa_jsonl_path: "/dataset/omada/datasets/gqa/GQA_TestDev_Balanced.jsonl"
+       #   resolution: 480
+       - jsonl_path: "/dataset/omada/datasets/Cambrian-10M/jsons/Cambrian7M_withsystemprompt_300k_balanced.jsonl"
+         image_root: "/dataset/omada/datasets/Cambrian-10M"
+         resolution: 480
+     # subset for gigaspeech: xs, xl
+     # subset for librispeech: train-clean-360, train-clean-100
+     # subset for commonvoice: validated, invalidated
+     audio_data:
+       - name: "jsonl"
+         jsonl_path: "/dataset/omada/datasets/speech/seedtts_test_combined_en.jsonl"
+         text_key: "text"
+         audio_key: "speech"
+         use_precomputed_tokens: true
+         require_precomputed_tokens: true
+         precomputed_tokens_root: "/dataset/omada/datasets/speech_tokens/seedtts_test_combined_en"
+       - name: "jsonl"
+         jsonl_path: "/dataset/omada/datasets/speech/instructs2s_s2t_t2s_combined.jsonl"
+         text_key: "text"
+         audio_key: "speech"
+         use_precomputed_tokens: true
+         require_precomputed_tokens: false
+         precomputed_tokens_root: "/dataset/omada/datasets/speech_tokens/instructs2s_200k_en"
+       - name: "librispeech"
+         subset: "clean"
+         split: "all"
+         use_precomputed_tokens: true
+         require_precomputed_tokens: true
+         precomputed_tokens_root: "/dataset/omada/cache/librispeech_tokens"
+       # - name: "commonvoice"
+       #   subset: "validated"
+     #########################################################################
+     require_cached_audio_tokens: true
+     shuffle_buffer_size: 1000
+     num_workers: 2
+     resolution: 336
+     t2i_resolution: 512
+     # resolution: 16
+     pin_memory: True
+     persistent_workers: True
+     dataloader_timeout: 0
+
+
+   speech_token_cache:
+     enable: true
+     root: "cache/speech_tokens"
+     max_items_in_memory: 4096
+
+   preprocessing:
+     max_seq_length: 128 # backward compatibility
+     max_seq_length_text: 1024 # for pure text/lm outputs (input trunc 256, output pad 768)
+     max_seq_length_lm_input: 1024 # tokenizer truncation for LM inputs
+     max_seq_length_mmu: 128 # for mmu/video text (output pad)
+     max_seq_length_mmu_input: 128 # for mmu/video text input truncation
+     max_seq_length_s2t: 128 # for speech-to-text prompts/targets
+     max_seq_length_t2i: 128 # for text-to-image prompts
+     max_seq_length_t2s: 128 # for text-to-speech prompts
+     max_aud_length: 512 # for audio tokens
+     max_aud_length_short: 256 # for short audio tokens
+     resolution: 224 # for video tokens
+     # max_seq_length: 16 # for text tokens
+     # max_aud_length: 16 # for audio tokens
+     # resolution: 16 # for video tokens
+     center_crop: False
+     random_flip: False
+
+ optimizer:
+   name: adamw
+   params: # default adamw params
+     learning_rate: 0.00002
+     # learning_rate: 0.00004859840219369731
+     scale_lr: False # scale learning rate by total batch size
+     beta1: 0.9
+     beta2: 0.999
+     weight_decay: 0.01
+     epsilon: 1e-8
+
+ lr_scheduler:
+   scheduler: "cosine"
+   params:
+     learning_rate: ${optimizer.params.learning_rate}
+     warmup_steps: 1000
+     # warmup_steps: 0
+     min_lr_scale: 0.1
+
+ training:
+   gradient_accumulation_steps: 2
+   noise_type: "mask"
+   batch_size_t2i: 1
+   batch_size_lm: 1
+   batch_size_mmu: 1
+   batch_size_v2t: 1
+   batch_size_v2s: 0
+   batch_size_s2t: 1
+   batch_size_t2s: 1
+   batch_size_s2s: 0
+
+   mixed_precision: "bf16"
+   enable_tf32: True
+   seed: 10086
+   max_train_steps: 1000000
+   max_train_epochs: NONE
+   overfit_one_batch: False
+   cond_dropout_prob: 0.1
+   min_masking_rate: 0.0
+   label_smoothing: 0.0
+   max_grad_norm: 1
+   guidance_scale: 3.5
+   generation_timesteps: 20
+
+   t2i_coeff: 0.2
+   i2i_coeff: 0.2
+   lm_coeff: 0.2
+   mmu_coeff: 0.3
+   v2t_coeff: 0.5
+   v2s_coeff: 0.0
+   t2s_coeff: 0.4
+   s2t_coeff: 0.4
+   s2s_coeff: 0.0
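
Note on the config above: `learning_rate: ${optimizer.params.learning_rate}` under `lr_scheduler.params` is OmegaConf-style interpolation, so the scheduler always tracks the optimizer's learning rate. A minimal sketch of how a file like this is typically consumed, assuming OmegaConf (which the `${...}` syntax suggests; the actual training entrypoint may wrap this differently):

    from omegaconf import OmegaConf

    cfg = OmegaConf.load("MMaDA/inference/demo/space_demo.yaml")

    # The ${...} interpolation resolves lazily on access, so the scheduler LR
    # always mirrors the optimizer LR:
    assert cfg.lr_scheduler.params.learning_rate == cfg.optimizer.params.learning_rate

    # Dotlist overrides merge on top, and the interpolation follows them:
    cfg = OmegaConf.merge(cfg, OmegaConf.from_dotlist(["optimizer.params.learning_rate=1e-5"]))
    print(cfg.lr_scheduler.params.learning_rate)  # 1e-05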
MMaDA/inference/gradio_multimodal_demo_inst.py CHANGED
@@ -1777,6 +1777,7 @@ class OmadaDemo:
             noise_schedule=self.mask_schedule,
             noise_type=self.noise_type,
             seq_len=seq_len,
+            resolution=seq_len,
             mask_token_id=self.mask_token_id,
             codebook_size=self.codebook_size,
             uni_prompting=self.uni_prompting,
@@ -1854,6 +1855,7 @@ class OmadaDemo:
             noise_schedule=self.mask_schedule,
             noise_type=self.noise_type,
             seq_len=seq_len,
+            resolution=seq_len,
             mask_token_id=self.mask_token_id,
             codebook_size=self.codebook_size,
             uni_prompting=self.uni_prompting,
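
Both sampling call sites now pass `resolution=seq_len` alongside the existing keywords. A hedged sketch of why this matters, assuming the downstream sampler gained `resolution` as a required keyword-only parameter (hypothetical signature; the real one lives in MMaDA's sampling code):

    # Hypothetical sampler signature illustrating the call-site fix above;
    # a new required keyword-only parameter breaks every caller until updated.
    def sample_tokens(*, noise_schedule, noise_type, seq_len, resolution,
                      mask_token_id, codebook_size, uni_prompting):
        ...  # denoising loop elided
        return [mask_token_id] * seq_len

    # Old call: sample_tokens(..., seq_len=1024)  -> TypeError: missing 'resolution'
    # New call, mirroring the diff (resolution simply reuses seq_len):
    out = sample_tokens(noise_schedule=None, noise_type="mask", seq_len=1024,
                        resolution=1024, mask_token_id=0, codebook_size=8192,
                        uni_prompting=None)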
app.py CHANGED
@@ -879,13 +879,14 @@ def get_app() -> OmadaDemo:
         # Concurrent init race (warmup vs request): safe to ignore.
         pass
 
+    # Prefer a repo-local Space config first, then fall back to demo configs.
+    space_demo_cfg = PROJECT_ROOT / "MMaDA" / "inference" / "demo" / "space_demo.yaml"
     default_cfg = PROJECT_ROOT / "MMaDA" / "inference" / "demo" / "demo.yaml"
     legacy_cfg = PROJECT_ROOT / "MMaDA" / "configs" / "mmada_demo.yaml"
-    eval_cfg = Path("/dataset/omada/OMaDA/MMaDA/configs/omada_instruction_tuning2.yaml")
     train_config = os.getenv("TRAIN_CONFIG_PATH")
     if not train_config:
-        if eval_cfg.exists():
-            train_config = str(eval_cfg)
+        if space_demo_cfg.exists():
+            train_config = str(space_demo_cfg)
         else:
             train_config = str(default_cfg if default_cfg.exists() else legacy_cfg)
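The resulting lookup order is: the `TRAIN_CONFIG_PATH` environment variable, then the new repo-local `space_demo.yaml`, then `demo.yaml`, then the legacy `mmada_demo.yaml`; the removed hard-coded `eval_cfg` path no longer shadows the repo files. A standalone sketch of the same first-existing-path pattern (`PROJECT_ROOT` is assumed to be defined near the top of `app.py`):

    import os
    from pathlib import Path

    PROJECT_ROOT = Path(__file__).resolve().parent  # assumption: matches app.py

    def resolve_train_config() -> str:
        # The env override wins outright, even if the file does not exist yet.
        env = os.getenv("TRAIN_CONFIG_PATH")
        if env:
            return env
        candidates = [
            PROJECT_ROOT / "MMaDA" / "inference" / "demo" / "space_demo.yaml",
            PROJECT_ROOT / "MMaDA" / "inference" / "demo" / "demo.yaml",
        ]
        for cand in candidates:
            if cand.exists():
                return str(cand)
        # Legacy fallback is used unconditionally, as in the diff's else branch.
        return str(PROJECT_ROOT / "MMaDA" / "configs" / "mmada_demo.yaml")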
@@ -2882,8 +2883,8 @@ with gr.Blocks(**_blocks_kwargs) as demo:
         {"mode": "MMU (Image → Text)", "text": _get_example_value(MMU_EXAMPLES, 1, 1, _get_example_value(MMU_EXAMPLES, 0, 1, DEFAULT_MMU_PROMPT)), "image": _get_example_value(MMU_EXAMPLES, 1, 0, _get_example_value(MMU_EXAMPLES, 0, 0, None)), "audio": None, "video": None},
     ],
     "MMU (Video → Text)": [
-        {"mode": "MMU (Video → Text)", "text": "", "image": None, "audio": None, "video": _get_example_value(V2T_EXAMPLES, 0, 0, None)},
-        {"mode": "MMU (Video → Text)", "text": "", "image": None, "audio": None, "video": _get_example_value(V2T_EXAMPLES, 1, 0, _get_example_value(V2T_EXAMPLES, 0, 0, None))},
+        {"mode": "MMU (Video → Text)", "text": "", "image": None, "audio": None, "video": _get_example_value(V2T_EXAMPLES, -2, 0, _get_example_value(V2T_EXAMPLES, 0, 0, None))},
+        {"mode": "MMU (Video → Text)", "text": "", "image": None, "audio": None, "video": _get_example_value(V2T_EXAMPLES, -1, 0, _get_example_value(V2T_EXAMPLES, 1, 0, _get_example_value(V2T_EXAMPLES, 0, 0, None)))},
     ],
     "Image Generation": [
         {"mode": "Image Generation", "text": _get_example_value(T2I_EXAMPLES, 0, 0, "A cinematic mountain landscape at sunrise."), "image": None, "audio": None, "video": None},