Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import torchaudio | |
| import torch | |
| import librosa | |
| import numpy as np | |
| # import moviepy.editor as mp | |
| import moviepy | |
| from moviepy.video.io.VideoFileClip import VideoFileClip | |
| from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC | |
| import wget | |
| import subprocess | |
| import os | |
| import csv | |
| import pandas as pd | |
| from vosk import Model as VoskModel | |
| from vosk import KaldiRecognizer, SetLogLevel | |
| from jiwer import cer | |
| from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer | |
| from huggingface_hub import snapshot_download | |
| # url = "https://huggingface.co/MahtaFetrat/tempmodel/resolve/main/checkpoint-15-1200.zip" | |
| # output_file = wget.download(url) | |
| # !unzip checkpoint-15-1200.zip -d extracted_model | |
| # zip_file = "checkpoint-15-1200.zip" | |
| # output_dir = "extracted_model" | |
| # subprocess.run(["unzip", zip_file, "-d", output_dir], check=True) | |
# Hugging Face Hub repository id of the checkpoint used by this app.
model_name = "MahtaFetrat/wav2vec2_finetuned_on_youtube_farsi_30"

# Download a snapshot of that repository; the returned local directory path
# is what the model is loaded from further down in this file.
local_dir = snapshot_download(
    repo_id=model_name,
)
def split_audio(audio, sampling_rate, chunk_size=30):
    """Split a waveform into consecutive chunks of at most ``chunk_size`` seconds.

    Args:
        audio: Sliceable sequence of samples (e.g. a 1-D NumPy array or list).
        sampling_rate: Number of samples per second of ``audio``.
        chunk_size: Maximum chunk duration in seconds; may be fractional.

    Returns:
        A list of slices of ``audio`` in their original order. Every chunk
        except possibly the last holds ``int(chunk_size * sampling_rate)``
        samples. An empty ``audio`` yields an empty list.

    Raises:
        ValueError: If the chunk length works out to less than one sample.
    """
    # ``range`` needs an integer step, so a fractional chunk_size (or a float
    # sampling rate) must be truncated to a whole number of samples first.
    chunk_length = int(chunk_size * sampling_rate)
    if chunk_length <= 0:
        raise ValueError(
            "chunk_size * sampling_rate must be at least one sample, "
            f"got {chunk_size!r} * {sampling_rate!r}"
        )
    return [
        audio[start:start + chunk_length]
        for start in range(0, len(audio), chunk_length)
    ]
def _min_input_samples(model, default=400):
    """Return the fewest samples the model's conv front end can consume.

    The value is the receptive field of the convolutional feature encoder,
    derived from ``model.config.conv_kernel`` / ``conv_stride`` when both are
    present. ``default`` is used otherwise; 400 is what the arithmetic below
    gives for kernels (10, 3, 3, 3, 3, 2, 2) with strides (5, 2, 2, 2, 2, 2, 2).
    """
    config = getattr(model, "config", None)
    kernels = getattr(config, "conv_kernel", None)
    strides = getattr(config, "conv_stride", None)
    if not kernels or not strides:
        return default
    # Walk the stack backwards: one output frame of a layer needs
    # (frames - 1) * stride + kernel input frames from the layer before it.
    length = 1
    for kernel, stride in zip(reversed(kernels), reversed(strides)):
        length = (length - 1) * stride + kernel
    return length


def infer_from_audio_file(audio_file_path, model, processor, device="cpu"):
    """Transcribe an audio file with a CTC model, one 30-second chunk at a time.

    Args:
        audio_file_path: Path of the audio file; it is loaded with librosa
            and resampled to 16 kHz.
        model: Model whose forward pass returns an object with ``.logits``.
            NOTE(review): the model is not moved here, so it is assumed to
            already live on ``device`` -- confirm with callers.
        processor: Processor used to featurize each chunk and to decode the
            predicted token ids back to text.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        The non-empty chunk transcriptions joined with single spaces (an
        empty string when nothing was transcribed).
    """
    audio, sampling_rate = librosa.load(audio_file_path, sr=16000)
    chunks = split_audio(audio, sampling_rate)

    # A trailing remainder shorter than the conv receptive field cannot
    # produce a single output frame and would make the forward pass raise.
    min_samples = _min_input_samples(model)

    # Evaluation mode is loop-invariant, so set it once up front.
    model.eval()

    transcriptions = []
    for chunk in chunks:
        if len(chunk) < min_samples:
            continue

        # Featurize the chunk, then pad/collate it into a batch of one tensor.
        inputs = processor(chunk, sampling_rate=sampling_rate).input_values[0]
        batch = processor.pad(
            [{"input_values": inputs}],
            padding=True,
            max_length=None,
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        input_values = batch.input_values.to(device)

        with torch.no_grad():
            logits = model(input_values).logits

        # Greedy decoding: take the most likely token at every frame.
        pred_ids = torch.argmax(logits, dim=-1)
        pred_str = processor.batch_decode(pred_ids.cpu().numpy())
        transcriptions.append(pred_str[0])

    # Skip chunks that decoded to nothing so the result has no double spaces.
    return " ".join(text for text in transcriptions if text)
# Character-level CTC tokenizer built from the vocabulary file that sits next
# to this script.
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json",
    unk_token="<unk>",
    pad_token="<pad>",
    word_delimiter_token="|",
)

# Feature extractor for single-channel 16 kHz input, with normalization and
# attention masks enabled.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# Bundle both so a single object handles featurization and decoding.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

# The model weights come from the downloaded snapshot directory.
latest_checkpoint = local_dir
model = Wav2Vec2ForCTC.from_pretrained(latest_checkpoint)
def tuned_wav2vec_speech_file_to_array_fn(path):
    """Load an audio file as a mono waveform at the processor's sampling rate.

    Args:
        path: Path of the audio file, read with ``torchaudio.load``.

    Returns:
        A 1-D NumPy array resampled to
        ``processor.feature_extractor.sampling_rate``.
    """
    speech_array, sampling_rate = torchaudio.load(path)
    # torchaudio returns a (channels, frames) tensor by default. Averaging
    # over the channel axis equals the old ``squeeze()`` for mono input and
    # downmixes multi-channel input instead of leaving it 2-D.
    speech_array = speech_array.mean(dim=0).numpy()
    # The module-level ``processor`` is used here: the previously referenced
    # ``tuned_wav2vec_processor`` is not defined anywhere in this file.
    target_sr = processor.feature_extractor.sampling_rate
    return librosa.resample(
        np.asarray(speech_array),
        orig_sr=sampling_rate,
        target_sr=target_sr,
    )


def transcribe_audio(audio_file_path):
    """Transcribe a whole audio file in a single forward pass (no chunking).

    Uses the module-level ``processor`` and ``model``; the previously
    referenced ``tuned_wav2vec_processor`` / ``tuned_wav2vec_model`` names are
    not defined anywhere in this file.

    Args:
        audio_file_path: Path of the audio file to transcribe.

    Returns:
        The decoded transcription string.
    """
    speech = tuned_wav2vec_speech_file_to_array_fn(audio_file_path)
    features = processor(
        speech,
        sampling_rate=processor.feature_extractor.sampling_rate,
        return_tensors="pt",
        padding=True,
    )
    input_values = features.input_values
    attention_mask = features.attention_mask

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    # Greedy decoding: most likely token per frame, then decode to text.
    pred_ids = torch.argmax(logits, dim=-1)
    predicted = processor.batch_decode(pred_ids)
    return predicted[0]
def preprocess_audio(audio_path):
    """Load, normalize and pre-emphasize an audio file as a float32 tensor.

    The file is loaded as mono at 16 kHz, passed through
    ``librosa.util.normalize`` and ``librosa.effects.preemphasis``, and
    returned as a ``torch.float32`` tensor with one extra leading dimension.
    """
    waveform, _ = librosa.load(audio_path, sr=16000, mono=True)
    normalized = librosa.util.normalize(waveform)
    emphasized = librosa.effects.preemphasis(normalized)
    as_tensor = torch.tensor(emphasized, dtype=torch.float32)
    return as_tensor.unsqueeze(0)
def speech_to_text(audio_path):
    """Transcribe the audio file at ``audio_path`` with the module-level model.

    Args:
        audio_path: Path of the audio file. The UI passes ``None`` when the
            button is clicked before a file is chosen or a recording is made.

    Returns:
        The transcription, or an empty string when no audio was provided.
    """
    # Without this guard a missing input would fail while loading the audio.
    if not audio_path:
        return ""
    return infer_from_audio_file(audio_path, model, processor)
def video_to_text(video_path):
    """Extract a video's audio track and transcribe it.

    Args:
        video_path: Path of the video file. The UI passes ``None`` when the
            button is clicked before a file is chosen.

    Returns:
        The transcription, or an empty string when no video was provided.

    Raises:
        ValueError: If the video has no audio track.
    """
    # Function-scope import: tempfile is not among this file's imports.
    import tempfile

    if not video_path:
        return ""

    # A unique temporary file per call, so concurrent requests do not
    # overwrite each other's extracted audio (previously a fixed file name
    # in the working directory that was never removed).
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # The context manager closes the clip and releases its reader
        # resources even if extraction fails.
        with VideoFileClip(video_path) as video:
            if video.audio is None:
                raise ValueError("The video has no audio track to transcribe.")
            video.audio.write_audiofile(audio_path, codec="pcm_s16le")
        return speech_to_text(audio_path)
    finally:
        if os.path.exists(audio_path):
            os.remove(audio_path)
# Example inputs offered in the UI. Each example is a one-element row because
# the example tables below are bound to a single input component.
_AUDIO_EXAMPLE_FILES = (
    "examples/audio1.m4a",
    "examples/audio2.m4a",
    "examples/audio3.m4a",
)
_VIDEO_EXAMPLE_FILES = (
    "examples/video1.mp4",
    "examples/video2.mp4",
)

examples_audio = [[path] for path in _AUDIO_EXAMPLE_FILES]
examples_video = [[path] for path in _VIDEO_EXAMPLE_FILES]
# Three-tab UI: video upload, audio upload and microphone recording. Every tab
# has one input, one output textbox and a button that runs the transcription.
with gr.Blocks() as demo:
    gr.Markdown("### تبدیل گفتار فارسی به متن با استفاده از Wav2Vec2")

    # Tab 1: upload a video; its audio track is extracted and transcribed.
    with gr.Tab("آپلود ویدئو"):
        video_input = gr.File(label="انتخاب ویدئو")
        video_output = gr.Textbox(label="متن استخراج شده")
        video_button = gr.Button("تبدیل به متن")
        video_button.click(
            fn=video_to_text,
            inputs=video_input,
            outputs=video_output,
        )
        gr.Examples(examples=examples_video, inputs=video_input)

    # Tab 2: upload an audio file.
    with gr.Tab("آپلود فایل صوتی"):
        audio_input = gr.File(label="انتخاب فایل صوتی")
        audio_output = gr.Textbox(label="متن استخراج شده")
        audio_button = gr.Button("تبدیل به متن")
        audio_button.click(
            fn=speech_to_text,
            inputs=audio_input,
            outputs=audio_output,
        )
        gr.Examples(examples=examples_audio, inputs=audio_input)

    # Tab 3: record from the microphone; the recording is passed as a file path.
    with gr.Tab("ضبط صدا"):
        mic_input = gr.Audio(sources="microphone", type="filepath")
        mic_output = gr.Textbox(label="متن استخراج شده")
        mic_button = gr.Button("تبدیل به متن")
        mic_button.click(
            fn=speech_to_text,
            inputs=mic_input,
            outputs=mic_output,
        )

demo.launch()