import os
import re

from faster_whisper import WhisperModel, BatchedInferencePipeline


def clean_text(text: str) -> str:
    """Collapse whitespace runs to single spaces and strip the ends.

    Whisper segments typically carry a leading space; stripping keeps
    the saved transcript lines clean.
    """
    return re.sub(r'\s+', ' ', text).strip()


def preprocess_transcript(segments: list) -> list:
    """Preprocess transcript segments into plain dicts.

    Each returned dict holds:
    - start: segment start time
    - end: segment end time
    - text: whitespace-normalized transcript text
    """
    return [
        {
            'start': segment.start,
            'end': segment.end,
            'text': clean_text(segment.text),
        }
        for segment in segments
    ]


def transcript_audio(
        input_audio: str = "audio.mp3",
        model_size: str = "base",
        device: str = "cpu",
        compute_type: str = "int8",  # alternatives: float16, float32
        beam_size: int = 5,
        vad_filter: bool = False,
        batch_size: int = 16) -> list:
    """Transcribe an audio file with faster-whisper's batched pipeline.

    :param input_audio: path of the audio file to transcribe
    :param model_size: Whisper model size (e.g. "base", "small")
    :param device: inference device ("cpu" or "cuda")
    :param compute_type: CTranslate2 compute/quantization type
    :param beam_size: beam-search width for decoding
    :param vad_filter: enable voice-activity-detection filtering
    :param batch_size: batch size for the batched inference pipeline
    :return: list of processed segment dicts (see preprocess_transcript)
    :raises FileNotFoundError: if input_audio does not exist
    """
    if not os.path.exists(input_audio):
        # Include the path so the caller knows which file is missing.
        raise FileNotFoundError(f"file not found: {input_audio}")

    # Initialize the model.
    model = WhisperModel(model_size, device=device, compute_type=compute_type)

    # Build the transcription options.
    transcript_kwargs = {"beam_size": beam_size, "batch_size": batch_size}
    if vad_filter:
        transcript_kwargs["vad_filter"] = vad_filter

    # Run the batched transcription. `segments` is a lazy generator,
    # so materialize it before post-processing.
    batched_model = BatchedInferencePipeline(model=model)
    segments, info = batched_model.transcribe(input_audio, **transcript_kwargs)
    return preprocess_transcript(list(segments))


def save_transcript(segments: list, output: str) -> None:
    """Write one transcript line per segment to *output* (UTF-8).

    :param segments: segment dicts, each with a 'text' key
    :param output: destination file path
    """
    with open(output, 'w', encoding='utf-8') as f:
        # Batch the writes instead of one f.write call per segment.
        f.writelines(segment['text'] + '\n' for segment in segments)