Spaces:

minte-atnafu
/

GihonTech_Local_Language_Transcription

Sleeping

GihonTech_Local_Language_Transcription / app.py

Minte

Fix Afan Oromo language configuration and model loading

133a63b 3 months ago

10.1 kB

	import traceback
	import soundfile as sf
	import torch
	import numpy as np
	from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, Wav2Vec2ForCTC, Wav2Vec2Processor
	import gradio as gr
	import resampy

	# Language configuration.
	# Maps each display name (also used as the UI dropdown choice) to:
	#   "code"      - language code; handed to SeamlessM4T as `tgt_lang`
	#   "model"     - Hugging Face model id used for that language
	#   "available" - whether transcription is currently offered
	#   "message"   - optional explanation shown when the language is unavailable
	LANGUAGE_CONFIG = {
	    "Amharic": {"code": "amh", "model": "facebook/seamless-m4t-v2-large", "available": True},
	    "Swahili": {"code": "swh", "model": "facebook/seamless-m4t-v2-large", "available": True},
	    "Somali": {"code": "som", "model": "facebook/seamless-m4t-v2-large", "available": True},
	    # Afan Oromo uses the code "gaz" (not "orm"); SeamlessM4T is used because it supports "gaz".
	    "Afan Oromo": {"code": "gaz", "model": "facebook/seamless-m4t-v2-large", "available": True},
	    "Tigrinya": {
	        "code": "tir",
	        "model": "facebook/seamless-m4t-v2-large",
	        "available": False,
	        "message": "Tigrinya transcription is not currently available",
	    },
	    # Chichewa is served by a dedicated Wav2Vec2 checkpoint instead of SeamlessM4T.
	    "Chichewa": {
	        "code": "nya",
	        "model": "dmatekenya/wav2vec2-large-xls-r-300m-chichewa",
	        "available": True,
	    },
	}

	# Initialize models
	# Both registries are keyed by the display language name (a key of LANGUAGE_CONFIG).
	# A language only gets an entry once its model and processor loaded successfully.
	models = {}
	processors = {}

	print("[INFO] Loading transcription models...")

	# Load SeamlessM4T once and share the same instance across every available
	# language that is configured to use it.
	seamless_model_id = "facebook/seamless-m4t-v2-large"
	seamless_languages = [
	    lang
	    for lang, config in LANGUAGE_CONFIG.items()
	    if config["available"] and config["model"] == seamless_model_id
	]
	try:
	    seamless_processor = AutoProcessor.from_pretrained(seamless_model_id)
	    seamless_model = AutoModelForSpeechSeq2Seq.from_pretrained(seamless_model_id).to("cpu")
	except Exception as e:
	    # Best-effort startup: keep the app running, but stop advertising languages
	    # whose model could not be loaded (mirrors the Chichewa handling below).
	    print("[ERROR] Failed to load SeamlessM4T model:", e)
	    traceback.print_exc()
	    for lang in seamless_languages:
	        LANGUAGE_CONFIG[lang]["available"] = False
	else:
	    for lang in seamless_languages:
	        models[lang] = seamless_model
	        processors[lang] = seamless_processor
	    print("[SUCCESS] SeamlessM4T model loaded for " + ", ".join(seamless_languages))

	# Load Chichewa model (dedicated Wav2Vec2 CTC checkpoint named in LANGUAGE_CONFIG)
	chichewa_model_id = LANGUAGE_CONFIG["Chichewa"]["model"]
	try:
	    chichewa_processor = Wav2Vec2Processor.from_pretrained(chichewa_model_id)
	    chichewa_model = Wav2Vec2ForCTC.from_pretrained(chichewa_model_id).to("cpu")
	except Exception as e:
	    print("[ERROR] Failed to load Chichewa model:", e)
	    traceback.print_exc()
	    LANGUAGE_CONFIG["Chichewa"]["available"] = False
	else:
	    models["Chichewa"] = chichewa_model
	    processors["Chichewa"] = chichewa_processor
	    print("[SUCCESS] Chichewa model loaded successfully")

	# --- Helper: ASR ---
	def transcribe_audio(audio_file, language):
	    """Transcribe an audio file in the selected language.

	    Args:
	        audio_file: Path of the audio file to read. May be None or empty when
	            nothing was recorded or uploaded.
	        language: Display name of the language (a key of LANGUAGE_CONFIG).

	    Returns:
	        The stripped transcription on success. Otherwise a human-readable
	        message: the language is unavailable, no model is loaded for it, no
	        audio was supplied, or transcription failed. Failures are reported
	        through the return value; exceptions are logged, not raised.
	    """
	    target_rate = 16000  # sampling rate passed to both processors
	    config = LANGUAGE_CONFIG.get(language)

	    # Check the availability flag before the model registry: unavailable
	    # languages never get a `models` entry, so testing the registry first
	    # would hide their configured message behind the generic one.
	    if config is not None and not config["available"]:
	        return config.get("message", f"{language} transcription is currently unavailable")

	    if config is None or language not in models or language not in processors:
	        return f"Model for {language} is not available"

	    # Nothing recorded/uploaded: answer directly instead of surfacing a
	    # low-level read error from the audio library.
	    if not audio_file:
	        return "Please record or upload an audio file first"

	    try:
	        # Read and preprocess audio
	        audio, sr = sf.read(audio_file)
	        if audio.ndim > 1:
	            # Down-mix multi-channel audio by averaging across axis 1
	            audio = audio.mean(axis=1)
	        if sr != target_rate:
	            audio = resampy.resample(audio, sr, target_rate)

	        model = models[language]
	        processor = processors[language]

	        # Dispatch on the loaded model type rather than on a language name
	        if isinstance(model, Wav2Vec2ForCTC):
	            # Wav2Vec2 processing: greedy CTC decoding of the frame logits
	            inputs = processor(audio, sampling_rate=target_rate, return_tensors="pt", padding=True)
	            with torch.no_grad():
	                logits = model(**inputs).logits
	            predicted_ids = torch.argmax(logits, dim=-1)
	            transcription = processor.batch_decode(predicted_ids)[0]
	        else:
	            # Standard SeamlessM4T processing; the processor keyword is `audio`, not `audios`
	            inputs = processor(audio=audio, sampling_rate=target_rate, return_tensors="pt")
	            with torch.no_grad():
	                generated_ids = model.generate(**inputs, tgt_lang=config["code"])
	            transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

	        return transcription.strip()

	    except Exception as e:
	        # UI boundary: log the full traceback, show the user a truncated reason
	        print(f"[ERROR] ASR transcription failed for {language}:", e)
	        traceback.print_exc()
	        return f"Transcription failed: {str(e)[:100]}..."

	# --- Beautiful Gradio UI ---
	# Builds the page: header, audio/language inputs, transcription output, a status
	# banner, one info card per configured language, and a footer. Event listeners
	# are registered at the end of the `with` block.
	with gr.Blocks(
	    theme=gr.themes.Soft(
	        primary_hue="blue",
	        secondary_hue="green",
	    ),
	    title="🌍 GihonTech - Multilingual Speech Recognition",
	    css="""
	.gradio-container {
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	}
	.header {
	text-align: center;
	padding: 20px;
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	border-radius: 15px;
	margin-bottom: 20px;
	color: white;
	}
	.language-card {
	background: white;
	padding: 15px;
	border-radius: 10px;
	margin: 10px 0;
	border-left: 4px solid #667eea;
	box-shadow: 0 2px 4px rgba(0,0,0,0.1);
	}
	.unavailable {
	background: #ffebee;
	border-left: 4px solid #f44336;
	}
	.available {
	background: #e8f5e8;
	border-left: 4px solid #4caf50;
	}
	"""
	) as demo:

	    # Header Section
	    with gr.Row():
	        with gr.Column():
	            gr.HTML("""
	<div class="header">
	<h1>🌍 GihonTech Multilingual Speech Recognition</h1>
	<p>Transcribe audio in multiple African languages with state-of-the-art AI models</p>
	</div>
	""")

	    # Main Content
	    with gr.Row():
	        # Input Section
	        with gr.Column(scale=1):
	            gr.Markdown("### 🎤 Upload Audio")

	            # type="filepath": the handler receives a path to the audio file
	            audio_input = gr.Audio(
	                sources=["microphone", "upload"],
	                type="filepath",
	                label="Record or Upload Audio",
	                elem_classes="audio-input"
	            )

	            language_select = gr.Dropdown(
	                choices=list(LANGUAGE_CONFIG.keys()),
	                value="Swahili",
	                label="Select Language",
	                info="Choose the language of your audio"
	            )

	            submit_btn = gr.Button(
	                "🎯 Transcribe Audio",
	                variant="primary",
	                size="lg"
	            )

	        # Output Section
	        with gr.Column(scale=1):
	            gr.Markdown("### 📝 Transcription Result")
	            transcription_output = gr.Textbox(
	                label="Transcribed Text",
	                placeholder="Your transcription will appear here...",
	                lines=5,
	                show_copy_button=True
	            )

	            # Status indicator
	            status_indicator = gr.HTML("""
	<div style="text-align: center; padding: 10px;">
	<span style="color: #4caf50;">✅ Ready to transcribe</span>
	</div>
	""")

	    # Language Information Section
	    with gr.Row():
	        with gr.Column():
	            gr.Markdown("### 🌐 Supported Languages")

	            # One card per configured language; availability is read once, at build time
	            for lang, config in LANGUAGE_CONFIG.items():
	                is_available = config["available"]
	                status_class = "available" if is_available else "unavailable"
	                status_text = "🟢 Available" if is_available else "🔴 Not Available"
	                model_info = config["model"] if is_available else config.get("message", "Not available")

	                gr.HTML(f"""
	<div class="language-card {status_class}">
	<h4>{lang} {status_text}</h4>
	<p><strong>Model:</strong> {model_info}</p>
	<p><strong>Language Code:</strong> {config['code']}</p>
	</div>
	""")

	    # Footer
	    with gr.Row():
	        with gr.Column():
	            gr.Markdown("""
	---
	### ℹ️ About This Service

	Powered by:
	- Facebook SeamlessM4T
	- Hugging Face Transformers
	- Specialized African Language Models

	Supported Languages & Codes:
	- Amharic (amh)
	- Swahili (swh)
	- Somali (som)
	- Afan Oromo (gaz)
	- Chichewa (nya)

	Supported Formats: WAV, MP3, M4A, FLAC
	Maximum Duration: 30 seconds per audio

	For best results, use clear audio with minimal background noise
	""")

	    # Event handlers
	    def update_status(language):
	        """Return the status-banner HTML for the given language.

	        Available languages get the green "Ready to transcribe" banner.
	        Unavailable ones get a red banner carrying the config's optional
	        "message", or a generic notice when no message is configured.
	        """
	        config = LANGUAGE_CONFIG[language]
	        if config["available"]:
	            return '<div style="text-align: center; padding: 10px; background: #e8f5e8; border-radius: 5px;"><span style="color: #4caf50;">✅ Ready to transcribe</span></div>'
	        reason = config.get("message", f"{language} transcription is currently unavailable")
	        return f'<div style="text-align: center; padding: 10px; background: #ffebee; border-radius: 5px;"><span style="color: #f44336;">⛔ {reason}</span></div>'

	    # Connect events
	    language_select.change(
	        fn=update_status,
	        inputs=[language_select],
	        outputs=status_indicator
	    )

	    # After a transcription attempt, refresh the banner from the selected
	    # language instead of always showing "Ready", so an unavailable language
	    # keeps its warning.
	    submit_btn.click(
	        fn=transcribe_audio,
	        inputs=[audio_input, language_select],
	        outputs=transcription_output
	    ).then(
	        fn=update_status,
	        inputs=[language_select],
	        outputs=status_indicator
	    )

	if __name__ == "__main__":
	    # Launch settings gathered in one place: listen on all interfaces at port
	    # 7860, with share=False and show_error=True handed to Gradio.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	        "show_error": True,
	    }
	    demo.launch(**launch_options)