#!pip install ctranslate2==4.4.0
#!pip install faster_whisper==1.1.1
#!pip install -q praat-textgrids
#!pip install -q huggingface_hub &> /dev/null
"""Helpers for transcribing audio with faster-whisper.

Each audio file produces a Praat ``.TextGrid`` (one ``words`` tier built from
the word timestamps) and a ``.lab`` file holding the plain transcription.
A small wrapper for uploading a folder to the Hugging Face Hub is included.
"""

import csv
import glob
import os
import tempfile
from concurrent.futures import ThreadPoolExecutor

import textgrids
from faster_whisper import WhisperModel
from tqdm import tqdm


# ---------------------------------------------------------------------------
# Private helpers shared by the public entry points
# ---------------------------------------------------------------------------
def _transcribe_segments(model, sound_file):
    """Run ``model.transcribe`` with VAD and word timestamps; return a list."""
    segments, _info = model.transcribe(
        sound_file, vad_filter=True, word_timestamps=True
    )
    # ``list`` forces the returned iterable so the segments can be indexed
    # and traversed more than once.
    return list(segments)


def _collect_words(data):
    """Flatten the word entries of every segment, in order."""
    return [word for segment in data for word in (segment.words or [])]


def _output_stem(sound_file, out_dir):
    """Return ``out_dir/<audio base name without extension>``."""
    base_name = os.path.basename(sound_file)  # e.g. 'demo.wav'
    return os.path.join(out_dir, os.path.splitext(base_name)[0])  # '.../demo'


def _write_lab(data, stem):
    """Write the text of all segments to ``<stem>.lab``."""
    transcription = "".join(segment.text for segment in data)
    # UTF-8 so non-ASCII transcripts do not depend on the platform locale.
    with open(f"{stem}.lab", "w", encoding="utf-8") as lab_file:
        lab_file.write(transcription)


def _write_textgrid(words, stem, xmax):
    """Write ``<stem>.TextGrid`` with a single ``words`` tier.

    The tier is loaded through ``TextGrid.tier_from_csv`` from ';'-separated
    ``word;start;end`` rows. The rows go to a private temporary file rather
    than a fixed ``words.csv`` in the working directory, so concurrent calls
    (see ``whisper_transcribe_parallel``) cannot overwrite each other's data.
    """
    # Default text encoding on purpose: the file is re-opened by
    # ``tier_from_csv`` itself, exactly as the former ``words.csv`` was.
    scratch = tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", newline="", delete=False
    )
    try:
        with scratch:
            # NOTE(review): csv.writer quotes words that contain ';' or '"';
            # this assumes tier_from_csv parses with the csv module - confirm.
            writer = csv.writer(scratch, delimiter=";", lineterminator="\n")
            for word in words:
                writer.writerow(
                    [word.word.strip(), f"{word.start}", f"{word.end}"]
                )
        grid = textgrids.TextGrid()
        grid.tier_from_csv("words", scratch.name)
        grid.xmax = xmax
        grid.write(f"{stem}.TextGrid")
    finally:
        os.remove(scratch.name)


# ---------------------------------------------------------------------------
def whisper_transcribe(model, sound_file: str, out_dir: str) -> None:
    """Transcribe one audio file and save ``.TextGrid`` and ``.lab`` outputs.

    Args:
        model: object exposing ``transcribe(path, vad_filter=...,
            word_timestamps=...)`` (a faster-whisper ``WhisperModel``).
        sound_file: path of the audio file to transcribe.
        out_dir: existing directory that receives ``<name>.TextGrid`` and
            ``<name>.lab``; ``<name>`` is the audio base name without
            extension.

    Words and text of all returned segments are written. ``xmax`` of the
    grid is the end time of the last word. The function is best-effort:
    failures are reported on stdout and never raised to the caller.
    """
    try:
        data = _transcribe_segments(model, sound_file)
        words = _collect_words(data)
        if not words:
            # Nothing was recognised (e.g. VAD removed all audio).
            print(f"no words recognised in {sound_file}; nothing written")
            return
        stem = _output_stem(sound_file, out_dir)
        _write_textgrid(words, stem, xmax=words[-1].end)
        _write_lab(data, stem)
    except Exception as e:
        print(f"error transcribing {sound_file}: {e}")


# ---------------------------------------------------------------------------
def _whisper_transcribe(model, sound_file: str, out_dir: str) -> None:
    """Transcribe one audio file and save only the ``.lab`` transcription.

    Same arguments as ``whisper_transcribe``; no TextGrid is produced.
    Failures are reported on stdout and never raised to the caller.
    """
    try:
        data = _transcribe_segments(model, sound_file)
        if not data:
            print(f"no speech segments in {sound_file}; nothing written")
            return
        _write_lab(data, _output_stem(sound_file, out_dir))
    except Exception as e:
        print(f"error transcribing {sound_file}: {e}")


# ---------------------------------------------------------------------------
def whisper_folder(model, in_dir: str, out_dir: str) -> None:
    """Transcribe every ``.mp3`` and ``.wav`` file directly inside *in_dir*.

    *out_dir* is created if missing. Example model setup::

        from faster_whisper import WhisperModel
        model = WhisperModel('/content/whisper_convert', device="cuda",
                             compute_type="int8_float16")
    """
    os.makedirs(out_dir, exist_ok=True)
    files = glob.glob(os.path.join(in_dir, "*.mp3")) + glob.glob(
        os.path.join(in_dir, "*.wav")
    )
    for sound_file in tqdm(files):
        try:
            whisper_transcribe(model, sound_file, out_dir)
        except Exception:
            # Keep going with the remaining files.
            print(f'error {sound_file}')


# ---------------------------------------------------------------------------
def load_models_on_gpus(model_path, num_gpus, compute_type="float16"):
    """Load one faster-whisper model per GPU.

    Args:
        model_path: model identifier or path passed to ``WhisperModel``.
        num_gpus: number of GPUs to use; device indices ``0..num_gpus-1``.
        compute_type: compute type passed to ``WhisperModel``.

    Returns:
        A list of ``(gpu_id, model)`` tuples.

    Each model is pinned with ``device_index`` instead of rewriting the
    process-wide ``CUDA_VISIBLE_DEVICES`` variable between loads.
    """
    models = []
    for gpu_id in range(num_gpus):
        model = WhisperModel(
            model_path,
            device="cuda",
            device_index=gpu_id,
            compute_type=compute_type,
        )
        models.append((gpu_id, model))
    return models


def process_file(model, sound_file, out_dir, gpu_id):
    """Transcribe one file with the model that belongs to GPU *gpu_id*.

    Writes ``<name>.TextGrid`` and ``<name>.lab`` into *out_dir* (created if
    missing). ``xmax`` of the grid is the end time of the last segment.
    *gpu_id* is only used in the error message; the device is determined by
    *model* itself. Errors are printed and not raised, so one bad file does
    not abort a batch.
    """
    os.makedirs(out_dir, exist_ok=True)
    try:
        data = _transcribe_segments(model, sound_file)
        if not data:
            raise ValueError("no speech segments detected")
        stem = _output_stem(sound_file, out_dir)
        _write_textgrid(_collect_words(data), stem, xmax=data[-1].end)
        _write_lab(data, stem)
    except Exception as e:
        print(f"Lỗi khi xử lý {sound_file} trên GPU {gpu_id}: {e}")


def whisper_transcribe_parallel(models, sound_files, out_dir):
    """Transcribe *sound_files* in parallel over several models.

    Args:
        models: list of ``(gpu_id, model)`` tuples, as returned by
            ``load_models_on_gpus``.
        sound_files: iterable of audio file paths.
        out_dir: directory receiving the outputs.

    Files are assigned to models round-robin and run on a
    ``ThreadPoolExecutor`` with one worker per model.

    Raises:
        ValueError: if *models* is empty.
    """
    if not models:
        raise ValueError("models must contain at least one (gpu_id, model) pair")

    with ThreadPoolExecutor(max_workers=len(models)) as executor:
        futures = []
        for i, sound_file in enumerate(sound_files):
            gpu_id, model = models[i % len(models)]
            futures.append(
                executor.submit(process_file, model, sound_file, out_dir, gpu_id)
            )

        # Show a progress bar while waiting for each task to finish.
        for future in tqdm(futures, desc="Đang xử lý"):
            future.result()


# ---------------------------------------------------------------------------
def folder_upload(repo_id, local_folder, hub_folder, repo_type, hub_token):
    """Upload *local_folder* to *hub_folder* inside a Hugging Face Hub repo.

    Args:
        repo_id: target repository id; the repo is created if it is missing.
        local_folder: local directory to upload.
        hub_folder: destination path inside the repository.
        repo_type: "model", "dataset" or "space".
        hub_token: access token used for both calls.
    """
    # Imported here so the rest of the module works without huggingface_hub.
    from huggingface_hub import create_repo, upload_folder

    # Create the repo if it does not exist yet.
    create_repo(repo_id, repo_type=repo_type, exist_ok=True, token=hub_token)

    upload_folder(
        repo_id=repo_id,
        folder_path=local_folder,
        path_in_repo=hub_folder,
        repo_type=repo_type,
        token=hub_token,
    )
    print(f'Uploaded {local_folder} to {repo_id}')