nguyenminh4099 committed on
Commit
fb42113
·
verified ·
1 Parent(s): 045a636

Upload folder using huggingface_hub

Browse files
.gitignore CHANGED
@@ -103,6 +103,7 @@ test.py
103
  /models/
104
  /scripts/
105
 
 
106
  /src/tools/
107
  /src/utils/
108
  /src/visualization/
 
103
  /models/
104
  /scripts/
105
 
106
+
107
  /src/tools/
108
  /src/utils/
109
  /src/visualization/
app.py CHANGED
@@ -11,15 +11,12 @@ from src.models.taskers.inferencer import infer
11
 
12
  logger = get_logger("Application", is_stream=True)
13
 
14
- def setup_environment():
15
- cmd = ['bash', 'scripts/prepare.sh', '--platform', 'gradio']
16
- subprocess.run(cmd, shell=False, capture_output=False, stdout=None)
17
-
18
- setup_environment()
19
  logger.info("Environment is set up")
20
 
21
  model, cfg, saved_cfg, llm_tokenizer = load_ensemble_model(
22
- os.path.join(repo_dir, 'vavsp_llm.yaml')
23
  )
24
  logger.info("Loaded model")
25
 
@@ -64,7 +61,7 @@ if __name__ == "__main__":
64
  format='mp4'
65
  ),
66
  gr.Slider(
67
- minimum=1, maximum=999,
68
  value=3, step=1,
69
  label='Second',
70
  ),
@@ -74,4 +71,4 @@ if __name__ == "__main__":
74
  description="Vietnamese Automatic Speech Recognition Utilizing Audio and Visual Data"
75
  )
76
 
77
- app.queue().launch(share=True)
 
11
 
12
  logger = get_logger("Application", is_stream=True)
13
 
14
+ cmd = ['bash', 'src/prepare.sh', '--platform', 'gradio']
15
+ subprocess.run(cmd, shell=False, capture_output=False, stdout=None)
 
 
 
16
  logger.info("Environment is set up")
17
 
18
  model, cfg, saved_cfg, llm_tokenizer = load_ensemble_model(
19
+ os.path.join(repo_dir, 'src/models/vavsp_llm.yaml')
20
  )
21
  logger.info("Loaded model")
22
 
 
61
  format='mp4'
62
  ),
63
  gr.Slider(
64
+ minimum=1, maximum=200,
65
  value=3, step=1,
66
  label='Second',
67
  ),
 
71
  description="Vietnamese Automatic Speech Recognition Utilizing Audio and Visual Data"
72
  )
73
 
74
+ app.launch(share=True)
src/models/taskers/embedder.py CHANGED
@@ -61,7 +61,7 @@ class Embedder(Tasker):
61
  'ffmpeg', '-y',
62
  '-loglevel', 'panic',
63
  '-i', video_path,
64
- '-vf', f'subtitles={subtitle_path}:force_style="PrimaryColour=&HFFFFFF,BorderStyle=4",BackColour=0',
65
  output_path,
66
  ]
67
 
 
61
  'ffmpeg', '-y',
62
  '-loglevel', 'panic',
63
  '-i', video_path,
64
+ '-vf', f'\"subtitles={subtitle_path}:force_style=\'PrimaryColour=&HFFFFFF,BorderStyle=4,BackColour=0\'\"',
65
  output_path,
66
  ]
67
 
src/models/vavsp_llm.yaml ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name: null
2
+ task: null
3
+ generation:
4
+ _name: null
5
+ beam: 20
6
+ nbest: 1
7
+ max_len_a: 1.0
8
+ max_len_b: 0
9
+ min_len: 1
10
+ match_source_len: false
11
+ unnormalized: false
12
+ no_early_stop: false
13
+ no_beamable_mm: false
14
+ lenpen: 0.0
15
+ unkpen: 0.0
16
+ replace_unk: null
17
+ sacrebleu: false
18
+ score_reference: false
19
+ prefix_size: 0
20
+ no_repeat_ngram_size: 0
21
+ sampling: false
22
+ sampling_topk: -1
23
+ sampling_topp: -1.0
24
+ constraints: null
25
+ temperature: 1.0
26
+ diverse_beam_groups: -1
27
+ diverse_beam_strength: 0.5
28
+ diversity_rate: -1.0
29
+ print_alignment: null
30
+ print_step: false
31
+ lm_path: null
32
+ lm_weight: 0.0
33
+ iter_decode_eos_penalty: 0.0
34
+ iter_decode_max_iter: 10
35
+ iter_decode_force_max_iter: false
36
+ iter_decode_with_beam: 1
37
+ iter_decode_with_external_reranker: false
38
+ retain_iter_history: false
39
+ retain_dropout: false
40
+ retain_dropout_modules: null
41
+ decoding_format: null
42
+ no_seed_provided: false
43
+ common:
44
+ _name: null
45
+ no_progress_bar: false
46
+ log_interval: 100
47
+ log_format: null
48
+ log_file: null
49
+ tensorboard_logdir: null
50
+ wandb_project: null
51
+ azureml_logging: false
52
+ seed: 1
53
+ cpu: false
54
+ tpu: false
55
+ bf16: false
56
+ memory_efficient_bf16: false
57
+ fp16: false
58
+ memory_efficient_fp16: false
59
+ fp16_no_flatten_grads: false
60
+ fp16_init_scale: 128
61
+ fp16_scale_window: null
62
+ fp16_scale_tolerance: 0.0
63
+ on_cpu_convert_precision: false
64
+ min_loss_scale: 0.0001
65
+ threshold_loss_scale: null
66
+ amp: false
67
+ amp_batch_retries: 2
68
+ amp_init_scale: 128
69
+ amp_scale_window: null
70
+ user_dir: src
71
+ empty_cache_freq: 0
72
+ all_gather_list_size: 16384
73
+ model_parallel_size: 1
74
+ quantization_config_path: null
75
+ profile: false
76
+ reset_logging: false
77
+ suppress_crashes: false
78
+ use_plasma_view: false
79
+ plasma_path: /tmp/plasma
80
+ common_eval:
81
+ _name: null
82
+ path: src/models/checkpoints/checkpoint_best.pt
83
+ post_process: null
84
+ quiet: false
85
+ model_overrides: '{}'
86
+ results_path: decode/vsr/vi
87
+ checkpoint:
88
+ _name: null
89
+ save_dir: checkpoints
90
+ restore_file: checkpoint_last.pt
91
+ finetune_from_model: null
92
+ reset_dataloader: false
93
+ reset_lr_scheduler: false
94
+ reset_meters: false
95
+ reset_optimizer: false
96
+ optimizer_overrides: '{}'
97
+ save_interval: 1
98
+ save_interval_updates: 0
99
+ keep_interval_updates: -1
100
+ keep_interval_updates_pattern: -1
101
+ keep_last_epochs: -1
102
+ keep_best_checkpoints: -1
103
+ no_save: false
104
+ no_epoch_checkpoints: false
105
+ no_last_checkpoints: false
106
+ no_save_optimizer_state: false
107
+ best_checkpoint_metric: loss
108
+ maximize_best_checkpoint_metric: false
109
+ patience: -1
110
+ checkpoint_suffix: ''
111
+ checkpoint_shard_count: 1
112
+ load_checkpoint_on_all_dp_ranks: false
113
+ write_checkpoints_asynchronously: false
114
+ model_parallel_size: 1
115
+ distributed_training:
116
+ _name: null
117
+ distributed_world_size: 1
118
+ distributed_num_procs: 1
119
+ distributed_rank: 0
120
+ distributed_backend: nccl
121
+ distributed_init_method: null
122
+ distributed_port: -1
123
+ device_id: 0
124
+ distributed_no_spawn: false
125
+ ddp_backend: pytorch_ddp
126
+ ddp_comm_hook: none
127
+ bucket_cap_mb: 25
128
+ fix_batches_to_gpus: false
129
+ find_unused_parameters: false
130
+ fast_stat_sync: false
131
+ heartbeat_timeout: -1
132
+ broadcast_buffers: false
133
+ slowmo_momentum: null
134
+ slowmo_algorithm: LocalSGD
135
+ localsgd_frequency: 3
136
+ nprocs_per_node: 1
137
+ pipeline_model_parallel: false
138
+ pipeline_balance: null
139
+ pipeline_devices: null
140
+ pipeline_chunks: 0
141
+ pipeline_encoder_balance: null
142
+ pipeline_encoder_devices: null
143
+ pipeline_decoder_balance: null
144
+ pipeline_decoder_devices: null
145
+ pipeline_checkpoint: never
146
+ zero_sharding: none
147
+ fp16: false
148
+ memory_efficient_fp16: false
149
+ tpu: false
150
+ no_reshard_after_forward: false
151
+ fp32_reduce_scatter: false
152
+ cpu_offload: false
153
+ use_sharded_state: false
154
+ dataset:
155
+ _name: null
156
+ num_workers: 0
157
+ skip_invalid_size_inputs_valid_test: false
158
+ max_tokens: 3000
159
+ batch_size: null
160
+ required_batch_size_multiple: 8
161
+ required_seq_len_multiple: 1
162
+ dataset_impl: null
163
+ data_buffer_size: 10
164
+ train_subset: train
165
+ valid_subset: valid
166
+ combine_valid_subsets: null
167
+ ignore_unused_valid_subsets: false
168
+ validate_interval: 1
169
+ validate_interval_updates: 0
170
+ validate_after_updates: 0
171
+ fixed_validation_seed: null
172
+ disable_validation: false
173
+ max_tokens_valid: 3000
174
+ batch_size_valid: null
175
+ max_valid_steps: null
176
+ curriculum: 0
177
+ gen_subset: test
178
+ num_shards: 1
179
+ shard_id: 0
180
+ override:
181
+ _name: null
182
+ noise_wav: null
183
+ noise_prob: 0.0
184
+ noise_snr: 0.0
185
+ modalities:
186
+ - visual
187
+ - audio
188
+ data: src/models/dataset/vsr/vi
189
+ label_dir: src/models/dataset/vsr/vi
190
+ labels:
191
+ - km
192
+ label_rate: -1
193
+ eval_bleu: false
194
+ llm_ckpt_path: vilm/vinallama-2.7b
195
+ w2v_path: src/models/checkpoints/large_vox_iter5.pt
196
+ demo: false
197
+ is_ax: false
src/prepare.sh ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ usage() {
4
+ echo "Description: Prepare environment when running on cloud platform kaggle or colab
5
+ Usage:
6
+ $0
7
+ --platform <platform> Cloud platform where to run project.
8
+ 2 available platforms 'kaggle' and 'any'. Default: kaggle
9
+ "
10
+ exit 1
11
+ }
12
+
13
+ platform="any"
14
+
15
+ while [[ "$#" -gt 0 ]]; do
16
+ case "$1" in
17
+ --platform)
18
+ if [[ -z "$2" ]];
19
+ then
20
+ echo "MISSING platform value"
21
+ usage
22
+ fi
23
+ platform="$2"
24
+ shift 2
25
+ ;;
26
+ esac
27
+ done
28
+
29
+
30
+ # Download and set up miniconda
31
+ MINICONDA_INSTALLER_SCRIPT=Miniconda3-py310_23.11.0-2-Linux-x86_64.sh
32
+ MINICONDA_PREFIX=/usr/local
33
+ wget https://repo.continuum.io/miniconda/$MINICONDA_INSTALLER_SCRIPT
34
+ chmod +x $MINICONDA_INSTALLER_SCRIPT
35
+ ./$MINICONDA_INSTALLER_SCRIPT -b -f -p $MINICONDA_PREFIX
36
+
37
+ # Update version python
38
+ conda install --channel defaults conda python=3.10 --yes
39
+ conda update --channel defaults --all --yes
40
+
41
+ # Install ffmpeg to process media file
42
+ conda install -c conda-forge ffmpeg==7.0.1 --yes
43
+
44
+ # Require git to install packages from github
45
+ if [[ "$platform" == 'kaggle' ]]
46
+ then
47
+ conda install -c conda-forge git --yes
48
+ fi
49
+
50
+ # Install hf-transfer
51
+ pip install hf-transfer
52
+ env HF_HUB_ENABLE_HF_TRANSFER=1