baharbhz committed on
Commit
0a74def
·
verified ·
1 Parent(s): f8a8a21

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -10
app.py CHANGED
@@ -7,12 +7,78 @@ import numpy as np
7
  # import moviepy.editor as mp
8
  import moviepy
9
  from moviepy.video.io.VideoFileClip import VideoFileClip
10
-
11
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- model_name = "m3hrdadfi/wav2vec2-large-xlsr-persian"
14
- processor = Wav2Vec2Processor.from_pretrained(model_name)
15
- model = Wav2Vec2ForCTC.from_pretrained(model_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  def preprocess_audio(audio_path):
18
  y, sr = librosa.load(audio_path, sr=16000, mono=True)
@@ -22,14 +88,34 @@ def preprocess_audio(audio_path):
22
 
23
 
24
  def speech_to_text(audio_path):
25
- waveform = preprocess_audio(audio_path)
26
 
27
- input_values = processor(waveform.squeeze(), return_tensors="pt", sampling_rate=16000).input_values
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  with torch.no_grad():
29
- logits = model(input_values).logits
30
- predicted_ids = torch.argmax(logits, dim=-1)
31
- transcription = processor.batch_decode(predicted_ids)[0]
32
- return transcription
 
 
33
 
34
 
35
  def video_to_text(video_path):
 
7
  # import moviepy.editor as mp
8
  import moviepy
9
  from moviepy.video.io.VideoFileClip import VideoFileClip
 
10
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
11
+ import wget
12
+ import subprocess
13
+ import os
14
+ import csv
15
+ import pandas as pd
16
+ from vosk import Model as VoskModel
17
+ from vosk import KaldiRecognizer, SetLogLevel
18
+ from jiwer import cer
19
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
20
+ import torch
21
+ import librosa
22
+ import torchaudio
23
+ import numpy as np
24
+
25
+
26
# --- One-time model setup (runs at import time) ---
# Download the fine-tuned wav2vec2 checkpoint archive and unpack it locally.
url = "https://huggingface.co/MahtaFetrat/tempmodel/resolve/main/checkpoint-15-1200.zip"
zip_file = "checkpoint-15-1200.zip"
output_dir = "extracted_model"

# Skip the download when the archive already exists so app restarts are cheap
# and do not re-fetch a multi-hundred-MB file.
if not os.path.exists(zip_file):
    wget.download(url, out=zip_file)

# -o: overwrite without prompting; without it a re-run blocks on unzip's
# interactive "replace?" question while check=True waits forever.
subprocess.run(["unzip", "-o", zip_file, "-d", output_dir], check=True)


from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor

# NOTE(review): "/vocab.json" is an absolute path at the filesystem root —
# confirm this is intended; the vocab file likely lives next to the extracted
# checkpoint instead.
tokenizer = Wav2Vec2CTCTokenizer("/vocab.json", unk_token="<unk>", pad_token="<pad>", word_delimiter_token="|")

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

tuned_wav2vec_processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# Load from the directory we just extracted into. The original used the
# absolute "/extracted_model/checkpoint-15-1200", which does not match the
# relative output_dir passed to unzip above.
tuned_wav2vec_model = Wav2Vec2ForCTC.from_pretrained(os.path.join(output_dir, "checkpoint-15-1200"))
51
+
52
+
53
def tuned_wav2vec_speech_file_to_array_fn(path):
    """Load an audio file and resample it for the fine-tuned processor.

    Returns a numpy array of samples resampled from the file's native rate
    to ``tuned_wav2vec_processor.feature_extractor.sampling_rate``.
    """
    waveform, source_rate = torchaudio.load(path)
    samples = np.asarray(waveform.squeeze().numpy())
    target_rate = tuned_wav2vec_processor.feature_extractor.sampling_rate
    return librosa.resample(samples, orig_sr=source_rate, target_sr=target_rate)
59
+
60
+
61
def transcribe_audio(audio_file_path):
    """Transcribe one audio file with the fine-tuned wav2vec2 model.

    Loads and resamples the file, runs a forward pass without gradients,
    and greedily decodes the CTC output. Returns the transcription string.
    """
    samples = tuned_wav2vec_speech_file_to_array_fn(audio_file_path)
    target_rate = tuned_wav2vec_processor.feature_extractor.sampling_rate
    batch = tuned_wav2vec_processor(
        samples,
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )

    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = tuned_wav2vec_model(
            batch.input_values, attention_mask=batch.attention_mask
        ).logits

    # Greedy CTC decoding: pick the most likely token at each frame.
    best_ids = torch.argmax(logits, dim=-1)
    return tuned_wav2vec_processor.batch_decode(best_ids)[0]
81
+
82
 
83
  def preprocess_audio(audio_path):
84
  y, sr = librosa.load(audio_path, sr=16000, mono=True)
 
88
 
89
 
90
def speech_to_text(audio_path):
    """Transcribe the audio file at ``audio_path`` to text.

    Thin wrapper kept for backward compatibility with existing callers.
    Its previous body duplicated :func:`transcribe_audio` line-for-line
    (plus six lines of commented-out dead code); it now delegates instead,
    so the transcription logic lives in exactly one place.
    """
    return transcribe_audio(audio_path)
119
 
120
 
121
  def video_to_text(video_path):