NLPV committed on
Commit
24fa0cd
·
verified ·
1 Parent(s): ee4215a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -8
app.py CHANGED
@@ -5,34 +5,92 @@ import difflib
5
  import pandas as pd
6
  from Levenshtein import distance as lev_distance
7
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
 
8
  import torchaudio
9
 
10
- # Load AI4Bharat Whisper model
11
- processor = WhisperProcessor.from_pretrained("ai4bharat/indic-whisper-large-v2")
12
- model = WhisperForConditionalGeneration.from_pretrained("ai4bharat/indic-whisper-large-v2").to("cpu") # use "cuda" if you have a GPU
13
 
14
- # ... [play_text and helper functions as before] ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  def transcribe_audio(audio_path, original_text):
17
  try:
18
- # Load audio and preprocess
19
  speech, rate = torchaudio.load(audio_path)
 
 
 
 
20
  if rate != 16000:
21
  resampler = torchaudio.transforms.Resample(orig_freq=rate, new_freq=16000)
22
  speech = resampler(speech)
23
  input_features = processor(speech.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features
24
 
25
- # AI4Bharat Whisper transcription
26
  predicted_ids = model.generate(input_features)
27
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
28
 
29
- # Error analysis and metrics (as before)
30
  errors = compare_hindi_sentences(original_text, transcription)
31
  df_errors = pd.DataFrame(errors, columns=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"])
32
 
33
- duration = speech.shape[-1] / 16000
 
34
  transcribed_words = transcription.strip().split()
35
  speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0
 
 
36
  accuracy = calculate_accuracy(original_text, transcription)
37
  result_dict = {
38
  "📝 Transcribed Text": transcription,
@@ -42,3 +100,24 @@ def transcribe_audio(audio_path, original_text):
42
  return result_dict, df_errors
43
  except Exception as e:
44
  return {"error": str(e)}, pd.DataFrame(columns=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import pandas as pd
6
  from Levenshtein import distance as lev_distance
7
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
8
+ import torch
9
  import torchaudio
10
 
11
+ # Load the AI4Bharat Whisper processor and model (Hindi-only checkpoint) once, when the module is imported.
12
+ processor = WhisperProcessor.from_pretrained("ai4bharat/whisper-medium-hi")
13
+ model = WhisperForConditionalGeneration.from_pretrained("ai4bharat/whisper-medium-hi").to("cpu") # the model is placed on the CPU here; pass "cuda" instead if a GPU is available
14
 
15
def play_text(text):
    """Synthesize Hindi speech for *text* and return the path of the MP3 file.

    Args:
        text: The text to read aloud; it is passed to gTTS with ``lang='hi'``.

    Returns:
        The filesystem path of a newly created ``.mp3`` temporary file.
        The file is created with ``delete=False``, so it is not removed
        automatically.
    """
    tts = gTTS(text=text, lang='hi', slow=False)
    # Reserve a unique path and close the handle *before* gTTS writes to it.
    # Leaving the NamedTemporaryFile open leaks one file descriptor per call,
    # and on platforms that forbid reopening an open temp file by name the
    # subsequent save would fail.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
        audio_path = temp_file.name
    tts.save(audio_path)
    return audio_path
20
+
21
def get_error_type(asr_word, correct_word):
    """Label the kind of mismatch between a transcribed word and the expected word.

    Args:
        asr_word: The word taken from the transcription; falsy when the
            transcription has no word at this position.
        correct_word: The word taken from the expected text; falsy when the
            expected text has no word at this position.

    Returns:
        One of ``"Missing word"``, ``"Extra word"``, ``"Spelling mistake"``,
        ``"Phonetic/Matra error"`` or ``"Substitution/Distorted"``.
    """
    # A missing transcription word is checked first, so it wins when both are empty.
    if not asr_word:
        return "Missing word"
    if not correct_word:
        return "Extra word"

    edit_distance = lev_distance(asr_word, correct_word)
    if edit_distance <= 2:
        return "Spelling mistake"

    # Words that are far apart but still share at least one character.
    shares_characters = not set(asr_word).isdisjoint(set(correct_word))
    return "Phonetic/Matra error" if shares_characters else "Substitution/Distorted"
32
+
33
def compare_hindi_sentences(expected, transcribed):
    """List the word-level differences between the transcription and the expected text.

    Both strings are split on whitespace and aligned with
    ``difflib.SequenceMatcher`` (transcription first, expected text second).

    Args:
        expected: The reference sentence.
        transcribed: The sentence produced by speech recognition.

    Returns:
        A list of ``(asr_word, correct_word, error_type)`` tuples, one per
        differing word position; an empty string stands in for a word that is
        absent on one side.
    """
    reference = expected.split()
    hypothesis = transcribed.split()
    opcodes = difflib.SequenceMatcher(None, hypothesis, reference).get_opcodes()

    errors = []
    for tag, i_lo, i_hi, j_lo, j_hi in opcodes:
        if tag == "replace":
            # Pair the two runs position by position, padding the shorter with "".
            heard = hypothesis[i_lo:i_hi]
            wanted = reference[j_lo:j_hi]
            span = max(len(heard), len(wanted))
            heard += [""] * (span - len(heard))
            wanted += [""] * (span - len(wanted))
            for asr_word, correct_word in zip(heard, wanted):
                label = get_error_type(asr_word, correct_word)
                errors.append((asr_word, correct_word, label))
        elif tag == "insert":
            # Words present only in the expected text.
            errors.extend(("", word, "Missing word") for word in reference[j_lo:j_hi])
        elif tag == "delete":
            # Words present only in the transcription.
            errors.extend((word, "", "Extra word") for word in hypothesis[i_lo:i_hi])
        # "equal" runs contribute no errors.
    return errors
54
+
55
def calculate_accuracy(expected, transcribed):
    """Return the percentage of expected words that the transcription matched.

    Both strings are split on whitespace and aligned with
    ``difflib.SequenceMatcher``; only words inside ``'equal'`` runs count as
    correct.

    Args:
        expected: The reference sentence.
        transcribed: The sentence produced by speech recognition.

    Returns:
        The matched share of the expected words as a percentage rounded to two
        decimals, or ``0`` when the expected text contains no words.
    """
    reference = expected.split()
    hypothesis = transcribed.split()
    opcodes = difflib.SequenceMatcher(None, hypothesis, reference).get_opcodes()
    matched = sum(j_hi - j_lo for tag, _, _, j_lo, j_hi in opcodes if tag == 'equal')

    # Guard against dividing by zero when there is nothing to compare against.
    if not reference:
        return 0
    return round(matched / len(reference) * 100, 2)
66
 
67
  def transcribe_audio(audio_path, original_text):
68
  try:
69
+ # Load and preprocess the audio file
70
  speech, rate = torchaudio.load(audio_path)
71
+ # Convert to mono if needed
72
+ if speech.shape[0] > 1:
73
+ speech = torch.mean(speech, dim=0, keepdim=True)
74
+ # Resample if needed
75
  if rate != 16000:
76
  resampler = torchaudio.transforms.Resample(orig_freq=rate, new_freq=16000)
77
  speech = resampler(speech)
78
  input_features = processor(speech.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features
79
 
80
+ # Generate transcription
81
  predicted_ids = model.generate(input_features)
82
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
83
 
84
+ # Error analysis
85
  errors = compare_hindi_sentences(original_text, transcription)
86
  df_errors = pd.DataFrame(errors, columns=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"])
87
 
88
+ # Speaking speed
89
+ duration = speech.shape[-1] / 16000 # seconds
90
  transcribed_words = transcription.strip().split()
91
  speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0
92
+
93
+ # Accuracy
94
  accuracy = calculate_accuracy(original_text, transcription)
95
  result_dict = {
96
  "📝 Transcribed Text": transcription,
 
100
  return result_dict, df_errors
101
  except Exception as e:
102
  return {"error": str(e)}, pd.DataFrame(columns=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"])
103
+
104
# Gradio UI definition and launch.
# NOTE(review): indentation was lost in this view, so which widgets are nested
# inside gr.Row() (and inside gr.Blocks()) cannot be confirmed from here.
+ with gr.Blocks() as app:
105
+ gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat Whisper)")
106
+ with gr.Row():
107
+ input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
108
+ play_button = gr.Button("🔊 Listen to Text")
109
+ audio_output = gr.Audio(label="Text-to-Speech Output", type="filepath")
# Clicking the listen button calls play_text with the textbox contents and
# sends the returned value to the audio output component.
110
+ play_button.click(play_text, inputs=input_text, outputs=audio_output)
111
+
112
+ gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
113
+ audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
114
+ submit_button = gr.Button("✅ Submit Recording for Checking")
115
+ output = gr.JSON(label="Results")
116
+ error_table = gr.Dataframe(label="गलती तालिका (Error Table)")
# Clicking the submit button calls transcribe_audio with the recording and the
# reference text; its two return values go to the JSON panel and the error table.
117
+ submit_button.click(
118
+ transcribe_audio,
119
+ inputs=[audio_input, input_text],
120
+ outputs=[output, error_table]
121
+ )
122
+
123
# Launch the Gradio app.
+ app.launch()