import gradio as gr
import torchaudio
import torch
import librosa
import numpy as np
from moviepy.video.io.VideoFileClip import VideoFileClip
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2CTCTokenizer,
)
from huggingface_hub import snapshot_download

# Optional: download and unzip a local checkpoint instead of using snapshot_download
# import wget, subprocess
# url = "https://huggingface.co/MahtaFetrat/tempmodel/resolve/main/checkpoint-15-1200.zip"
# output_file = wget.download(url)
# subprocess.run(["unzip", "checkpoint-15-1200.zip", "-d", "extracted_model"], check=True)

# Download the fine-tuned Persian (Farsi) Wav2Vec2 model from the Hugging Face Hub
model_name = "MahtaFetrat/wav2vec2_finetuned_on_youtube_farsi_30"
local_dir = snapshot_download(repo_id=model_name)


# Split audio into fixed-length chunks (default: 30 seconds)
def split_audio(audio, sampling_rate, chunk_size=30):
    chunk_length = chunk_size * sampling_rate
    chunks = [audio[i:i + chunk_length] for i in range(0, len(audio), chunk_length)]
    return chunks


# Run inference on an audio file path, chunk by chunk
def infer_from_audio_file(audio_file_path, model, processor, device="cpu"):
    # Load audio file at 16 kHz
    audio, sampling_rate = librosa.load(audio_file_path, sr=16000)

    # Split audio into chunks of at most 30 seconds
    chunks = split_audio(audio, sampling_rate)

    transcriptions = []
    for chunk in chunks:
        # Extract input features with the processor's feature extractor
        inputs = processor(chunk, sampling_rate=sampling_rate).input_values[0]
        input_features = [{"input_values": inputs}]
        batch = processor.pad(
            input_features,
            padding=True,
            max_length=None,
            pad_to_multiple_of=None,
            return_tensors="pt",
        )

        # Move inputs to the correct device
        input_values = batch.input_values.to(device)

        # Ensure the model is in evaluation mode
        model.eval()
        with torch.no_grad():
            outputs = model(input_values)
            logits = outputs.logits

        # Greedy CTC decoding
        pred_ids = torch.argmax(logits, dim=-1)
        pred_str = processor.batch_decode(pred_ids.cpu().numpy())
        transcriptions.append(pred_str[0])

    # Concatenate the chunk transcriptions
    full_transcription = ' '.join(transcriptions)
    return full_transcription


# Build the processor from the local vocab and a standard Wav2Vec2 feature extractor
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"
)
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

latest_checkpoint = local_dir
model = Wav2Vec2ForCTC.from_pretrained(latest_checkpoint)

# Aliases used by the alternative single-pass helpers below
tuned_wav2vec_processor = processor
tuned_wav2vec_model = model


def tuned_wav2vec_speech_file_to_array_fn(path):
    speech_array, sampling_rate = torchaudio.load(path)
    speech_array = speech_array.squeeze().numpy()
    speech_array = librosa.resample(
        np.asarray(speech_array),
        orig_sr=sampling_rate,
        target_sr=tuned_wav2vec_processor.feature_extractor.sampling_rate,
    )
    return speech_array


# Alternative single-pass transcription (no chunking); not wired into the UI below
def transcribe_audio(audio_file_path):
    speech = tuned_wav2vec_speech_file_to_array_fn(audio_file_path)
    features = tuned_wav2vec_processor(
        speech,
        sampling_rate=tuned_wav2vec_processor.feature_extractor.sampling_rate,
        return_tensors="pt",
        padding=True,
    )
    input_values = features.input_values
    attention_mask = features.attention_mask
    with torch.no_grad():
        logits = tuned_wav2vec_model(input_values, attention_mask=attention_mask).logits
    pred_ids = torch.argmax(logits, dim=-1)
    predicted = tuned_wav2vec_processor.batch_decode(pred_ids)
    return predicted[0]


# Optional preprocessing helper (normalization + pre-emphasis); not used by the UI
def preprocess_audio(audio_path):
    y, sr = librosa.load(audio_path, sr=16000, mono=True)
    y = librosa.util.normalize(y)
    y = librosa.effects.preemphasis(y)
    return torch.tensor(y, dtype=torch.float32).unsqueeze(0)


def speech_to_text(audio_path):
    predicted = infer_from_audio_file(audio_path, model, processor)
    return predicted


def video_to_text(video_path):
    # Extract the audio track to a WAV file, then transcribe it
    video = VideoFileClip(video_path)
    audio_path = "extracted_audio.wav"
    video.audio.write_audiofile(audio_path, codec="pcm_s16le")
    return speech_to_text(audio_path)


examples_audio = [
    ["examples/audio1.m4a"],
    ["examples/audio2.m4a"],
    ["examples/audio3.m4a"],
]

examples_video = [
    ["examples/video1.mp4"],
    ["examples/video2.mp4"],
]

with gr.Blocks() as demo:
    # Title: "Persian speech-to-text using Wav2Vec2"
    gr.Markdown("### تبدیل گفتار فارسی به متن با استفاده از Wav2Vec2")

    # "Upload video" tab
    with gr.Tab("آپلود ویدئو"):
        video_input = gr.File(label="انتخاب ویدئو")  # "Select video"
        video_output = gr.Textbox(label="متن استخراج شده")  # "Extracted text"
        video_button = gr.Button("تبدیل به متن")  # "Convert to text"
        video_button.click(video_to_text, inputs=video_input, outputs=video_output)
        gr.Examples(examples=examples_video, inputs=video_input)

    # "Upload audio file" tab
    with gr.Tab("آپلود فایل صوتی"):
        audio_input = gr.File(label="انتخاب فایل صوتی")  # "Select audio file"
        audio_output = gr.Textbox(label="متن استخراج شده")
        audio_button = gr.Button("تبدیل به متن")
        audio_button.click(speech_to_text, inputs=audio_input, outputs=audio_output)
        gr.Examples(examples=examples_audio, inputs=audio_input)

    # "Record audio" tab
    with gr.Tab("ضبط صدا"):
        mic_input = gr.Audio(sources="microphone", type="filepath")
        mic_output = gr.Textbox(label="متن استخراج شده")
        mic_button = gr.Button("تبدیل به متن")
        mic_button.click(speech_to_text, inputs=mic_input, outputs=mic_output)

demo.launch()
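
# Optional usage/evaluation sketch (left commented out; demo.launch() blocks above).
# The file path and reference transcript below are placeholders, and jiwer is assumed
# to be installed:
#
#   from jiwer import cer
#   hypothesis = speech_to_text("examples/audio1.m4a")  # direct call, no UI
#   reference = "..."                                    # ground-truth transcript
#   print("CER:", cer(reference, hypothesis))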