NLPV committed on
Commit
0b42505
·
verified ·
1 Parent(s): 60aad94

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -16
app.py CHANGED
@@ -1,13 +1,18 @@
1
  import gradio as gr
2
  from gtts import gTTS
3
  import tempfile
 
 
 
 
4
  import difflib
5
  import pandas as pd
6
  from Levenshtein import distance as lev_distance
7
- from indic_asr import load_model, ModelType
8
 
9
- # Load IndicWav2Vec model for Hindi
10
- model = load_model(lang="hi", model_type=ModelType.WAV2VEC2)
 
 
11
 
12
  def play_text(text):
13
  tts = gTTS(text=text, lang='hi', slow=False)
@@ -63,28 +68,33 @@ def calculate_accuracy(expected, transcribed):
63
 
64
  def transcribe_audio(audio_path, original_text):
65
  try:
66
- # Use IndicWav2Vec for transcription
67
- transcription = model.transcribe(audio_path).strip()
 
 
 
 
 
 
 
 
 
 
 
68
  errors = compare_hindi_sentences(original_text, transcription)
69
  df_errors = pd.DataFrame(errors, columns=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"])
70
-
71
- # Speaking speed: estimate from audio file length
72
- import wave
73
- with wave.open(audio_path, 'r') as wav_file:
74
- frames = wav_file.getnframes()
75
- rate = wav_file.getframerate()
76
- duration = frames / float(rate)
77
  transcribed_words = transcription.strip().split()
 
78
  speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0
79
-
80
  # Accuracy
81
  accuracy = calculate_accuracy(original_text, transcription)
82
- result_dict = {
83
  "📝 Transcribed Text": transcription,
84
  "⏱️ Speaking Speed (words/sec)": speed,
85
- "✅ Reading Accuracy (%)": accuracy,
86
  }
87
- return result_dict, df_errors
88
  except Exception as e:
89
  return {"error": str(e)}, pd.DataFrame(columns=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"])
90
 
 
1
  import gradio as gr
2
  from gtts import gTTS
3
  import tempfile
4
+ import os
5
+ import torch
6
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
7
+ import torchaudio
8
  import difflib
9
  import pandas as pd
10
  from Levenshtein import distance as lev_distance
 
11
 
12
+ # Load AI4Bharat Hindi model & processor (public model on Hugging Face)
13
+ MODEL_NAME = "ai4bharat/indicwav2vec-hindi"
14
+ processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
15
+ model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
16
 
17
  def play_text(text):
18
  tts = gTTS(text=text, lang='hi', slow=False)
 
68
 
69
def transcribe_audio(audio_path, original_text):
    """Transcribe a recording and score it against the expected text.

    The audio is loaded with torchaudio, down-mixed to mono, resampled to
    16 kHz, peak-normalised, and decoded greedily (argmax over the CTC
    logits) with the module-level ``processor`` and ``model``.

    Args:
        audio_path: Path of the audio file to transcribe.
        original_text: The text the speaker was expected to read.

    Returns:
        A ``(result, df_errors)`` tuple. ``result`` is a dict holding the
        transcription, the speaking speed in words per second, and the
        reading accuracy. ``df_errors`` is a DataFrame built from the output
        of ``compare_hindi_sentences``. If anything fails, ``result`` is
        ``{"error": <message>}`` and ``df_errors`` is an empty DataFrame with
        the same columns.
    """
    error_columns = ["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"]
    target_rate = 16000  # sampling rate passed to the processor below

    try:
        # Fail with a readable message instead of an opaque loader error
        # when no recording was supplied.
        if not audio_path:
            raise ValueError("No audio provided")

        waveform, sample_rate = torchaudio.load(audio_path)
        if waveform.numel() == 0:
            raise ValueError("Audio file contains no samples")

        # Down-mix multi-channel audio to a single channel.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        if sample_rate != target_rate:
            resample = torchaudio.transforms.Resample(
                orig_freq=sample_rate, new_freq=target_rate
            )
            waveform = resample(waveform)

        # Peak-normalise, but skip pure silence: dividing by a zero peak
        # would fill the input with NaNs.
        peak = waveform.abs().max()
        if peak > 0:
            waveform = waveform / peak

        # squeeze(0) drops only the channel axis, so even a one-sample clip
        # stays 1-D.
        input_values = processor(
            waveform.squeeze(0).numpy(),
            sampling_rate=target_rate,
            return_tensors="pt",
        ).input_values

        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0]).strip()

        # Error analysis
        errors = compare_hindi_sentences(original_text, transcription)
        df_errors = pd.DataFrame(errors, columns=error_columns)

        # Speaking speed: words per second over the (resampled) clip length.
        transcribed_words = transcription.split()
        duration = waveform.shape[1] / target_rate
        speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0

        # Accuracy
        accuracy = calculate_accuracy(original_text, transcription)

        result = {
            "📝 Transcribed Text": transcription,
            "⏱️ Speaking Speed (words/sec)": speed,
            "✅ Reading Accuracy (%)": accuracy,
        }
        return result, df_errors
    except Exception as e:
        # UI boundary: report the failure in the result panel rather than
        # crashing the app.
        return {"error": str(e)}, pd.DataFrame(columns=error_columns)
100