# Hindi Reading & Pronunciation Practice App
# Gradio UI + gTTS text-to-speech + AI4Bharat IndicWav2Vec Hindi ASR.
import difflib
import os
import re
import sys
import tempfile

import gradio as gr
import torch
import torchaudio
from gtts import gTTS
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Load the AI4Bharat Hindi wav2vec2 model & its processor once at startup.
# Both are module-level globals shared by transcribe_audio(); the first run
# downloads the weights from the HuggingFace hub.
MODEL_NAME = "ai4bharat/indicwav2vec-hindi"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
def play_text(text):
    """Synthesize *text* as Hindi speech with gTTS and play it on the host.

    Args:
        text: Hindi text to read aloud.

    Returns:
        A status string shown to the user in the Gradio UI.
    """
    if not text or not text.strip():
        return "⚠️ Please paste some Hindi text first."
    tts = gTTS(text=text, lang='hi', slow=False)
    # Close the handle before gTTS writes: on Windows a still-open
    # NamedTemporaryFile cannot be reopened by another writer.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
    temp_file.close()
    tts.save(temp_file.name)
    # Pick the platform's player instead of hard-coding the Windows "start"
    # command (the original comment acknowledged this but never branched).
    if sys.platform.startswith('win'):
        os.system(f"start {temp_file.name}")
    elif sys.platform == 'darwin':
        os.system(f"afplay {temp_file.name}")
    else:
        os.system(f"mpg123 {temp_file.name}")
    return "✅ Text is being read out. Please listen and read it yourself."
def transcribe_audio(audio_path, original_text):
    """Transcribe a spoken recording and score it against the reference text.

    Args:
        audio_path: Path to the user's recording (Gradio passes None when
            nothing was recorded/uploaded).
        original_text: The Hindi text the user was asked to read.

    Returns:
        A dict with the transcription, word-level accuracy (%), and speaking
        speed (words/sec); or a ``{"error": ...}`` dict on failure.
    """
    try:
        # Gradio hands us None when the audio widget is empty; fail clearly
        # instead of letting torchaudio raise an opaque error.
        if audio_path is None:
            return {"error": "No audio received. Please record or upload a file."}
        # 1. Load audio; downmix to mono and resample to the 16 kHz rate
        #    the wav2vec2 model was trained on.
        waveform, sample_rate = torchaudio.load(audio_path)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)
        # Boost quiet recordings; clamp so the gain cannot clip/distort.
        GAIN = 1.5  # 1.0 = unchanged, 2.0 = double
        waveform = torch.clamp(waveform * GAIN, -1.0, 1.0)
        input_values = processor(
            waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt"
        ).input_values
        # 2. Transcribe with the AI4Bharat model (greedy CTC decode).
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])
        # 3. Word-level similarity between reference and transcription.
        original_words = re.findall(r'\w+', original_text.strip())
        transcribed_words = re.findall(r'\w+', transcription.strip())
        matcher = difflib.SequenceMatcher(None, original_words, transcribed_words)
        accuracy = round(matcher.ratio() * 100, 2)
        # Speaking speed: words per second over the (resampled) duration.
        duration = waveform.shape[1] / 16000
        speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0
        return {
            "📝 Transcribed Text": transcription,
            "🎯 Accuracy (%)": accuracy,
            "⏱️ Speaking Speed (words/sec)": speed
        }
    except Exception as e:
        # Surface the failure in the UI rather than crashing the handler.
        # (The original had a second, unreachable duplicate of this return.)
        return {"error": str(e)}
# Build the Gradio UI. NOTE: inside gr.Blocks, component creation order
# defines the rendered layout, so the statement order below is significant.
with gr.Blocks() as app:
    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat Model)")
    with gr.Row():
        # Reference text plus a button that reads it aloud via play_text.
        input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
        play_button = gr.Button("🔊 Listen to Text")
    # play_text returns only a status string; no UI output component is wired.
    play_button.click(play_text, inputs=[input_text], outputs=[])
    gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
    # type="filepath" makes Gradio pass a path string to transcribe_audio.
    audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
    submit_button = gr.Button("✅ Submit Recording for Checking")
    output = gr.JSON(label="Results")
    submit_button.click(transcribe_audio, inputs=[audio_input, input_text], outputs=[output])
app.launch()