# app.py — Pashto speech-to-text demo (Hugging Face Space by afaqalinagra,
# commit 6da03f5 verified).
import gradio as gr
import torch
import numpy as np
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration
# =========================
# CONFIG
# =========================
MODEL_ID = "afaqalinagra/PASHTO-ASR-MODEL"  # fine-tuned Whisper checkpoint on the HF Hub
DEVICE = "cpu"  # CPU-only inference (free Hugging Face Space hardware)
TARGET_SR = 16000  # Whisper models expect 16 kHz input audio
# =========================
# LOAD MODEL
# =========================
# The processor bundles the feature extractor and tokenizer; pinning
# language/task fixes the decoder prompt to Pashto transcription.
processor = WhisperProcessor.from_pretrained(
    MODEL_ID,
    language="pashto",
    task="transcribe"
)
model = WhisperForConditionalGeneration.from_pretrained(
    MODEL_ID
).to(DEVICE)
model.eval()  # inference mode: disables dropout etc.
def transcribe_audio(audio):
    """Transcribe a Pashto audio clip to text with the Whisper model.

    Args:
        audio: Either a ``(sample_rate, waveform)`` tuple (Gradio's
            ``type="numpy"`` format) or a dict carrying ``"data"`` and
            ``"sampling_rate"`` keys (older Gradio payload format).

    Returns:
        str: The transcription, or a short English error message for
        missing, empty, or malformed input.
    """
    if audio is None:
        return "No audio provided."

    # ---- HANDLE BOTH GRADIO FORMATS ----
    if isinstance(audio, dict):
        waveform = audio.get("data", None)
        sample_rate = audio.get("sampling_rate", None)
    else:
        sample_rate, waveform = audio

    if waveform is None or sample_rate is None:
        return "Invalid audio input."

    waveform = np.asarray(waveform)
    if waveform.size == 0:
        return "Empty audio."

    # Gradio delivers raw PCM (typically int16); the Whisper feature
    # extractor expects float samples in [-1.0, 1.0], so normalize
    # integer dtypes BEFORE any further processing. (Feeding raw int16
    # values produced garbage features.)
    if np.issubdtype(waveform.dtype, np.integer):
        waveform = waveform.astype(np.float32) / np.iinfo(waveform.dtype).max
    else:
        waveform = waveform.astype(np.float32)

    # Convert stereo -> mono (Gradio arrays are (samples, channels)).
    if waveform.ndim > 1:
        waveform = np.mean(waveform, axis=1)

    # Resample to the 16 kHz rate Whisper was trained on.
    if sample_rate != TARGET_SR:
        waveform = librosa.resample(
            waveform,
            orig_sr=sample_rate,
            target_sr=TARGET_SR,
        )

    inputs = processor(
        waveform,
        sampling_rate=TARGET_SR,
        return_tensors="pt",
    )

    with torch.no_grad():
        predicted_ids = model.generate(
            inputs.input_features.to(DEVICE),
            max_length=448,  # cap decode length to Whisper's context size
        )

    transcription = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=True,
    )[0]

    return transcription.strip() if transcription else "No speech detected."
# =========================
# GLASSMORPHISM CSS
# =========================
# Injected into the Gradio page: frosted-glass ("glassmorphism") card
# styling over a dark gradient background. The string is passed verbatim
# to gr.Blocks(css=...), so its content must stay exact CSS.
CUSTOM_CSS = """
body {
background: linear-gradient(135deg, #0f2027, #203a43, #2c5364);
font-family: Inter, sans-serif;
}
.gradio-container {
max-width: 1100px !important;
margin: auto;
}
.glass {
background: rgba(255, 255, 255, 0.12);
backdrop-filter: blur(18px);
-webkit-backdrop-filter: blur(18px);
border-radius: 18px;
border: 1px solid rgba(255, 255, 255, 0.25);
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.35);
padding: 24px;
}
h1, h3, p {
color: white !important;
text-align: center;
}
button {
background: linear-gradient(135deg, #ff8008, #ffc837) !important;
color: black !important;
font-weight: 600 !important;
border-radius: 10px !important;
}
textarea {
font-size: 16px !important;
}
"""
# =========================
# UI
# =========================
# Build the UI: one frosted-glass column holding the header, the audio
# input, the transcribe button, and the output textbox.
with gr.Blocks(css=CUSTOM_CSS) as demo:
    with gr.Column(elem_classes="glass"):
        gr.Markdown(
            """
# 🎙️ Pashto Speech-to-Text
### Powered by Whisper ASR
Upload or record Pashto audio and get accurate transcription.
"""
        )
        audio_input = gr.Audio(
            sources=["upload", "microphone"],
            type="numpy",  # callback receives (sample_rate, np.ndarray)
            label="Upload or Record Pashto Audio"
        )
        transcribe_btn = gr.Button("Transcribe")
        output_text = gr.Textbox(
            label="Transcription Output",
            lines=6,
            placeholder="Pashto transcription will appear here..."
        )
        # Wire the button to the ASR callback defined above.
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=audio_input,
            outputs=output_text
        )
        gr.Markdown(
            """
<hr>
<p>
Developed for low-resource Pashto ASR using Whisper fine-tuning.<br>
Runs entirely on Hugging Face free infrastructure.
</p>
"""
        )
# =========================
# LAUNCH
# =========================
if __name__ == "__main__":
    # Start the Gradio server (blocks until the process is stopped).
    demo.launch()