|
|
import traceback |
|
|
import soundfile as sf |
|
|
import torch |
|
|
import numpy as np |
|
|
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, Wav2Vec2ForCTC, Wav2Vec2Processor |
|
|
import gradio as gr |
|
|
import resampy |
|
|
|
|
|
|
|
|
# Checkpoint shared by every SeamlessM4T-backed language below.
_SEAMLESS_MODEL_ID = "facebook/seamless-m4t-v2-large"

# Per-language settings, keyed by the display name shown to the user.
#   code      -- language code for the language
#   model     -- Hugging Face model id used for transcription
#   available -- whether transcription is offered for the language
#   message   -- optional note displayed when the language is unavailable
LANGUAGE_CONFIG = {
    "Amharic": {"code": "amh", "model": _SEAMLESS_MODEL_ID, "available": True},
    "Swahili": {"code": "swh", "model": _SEAMLESS_MODEL_ID, "available": True},
    "Somali": {"code": "som", "model": _SEAMLESS_MODEL_ID, "available": True},
    "Afan Oromo": {"code": "gaz", "model": _SEAMLESS_MODEL_ID, "available": True},
    "Tigrinya": {
        "code": "tir",
        "model": _SEAMLESS_MODEL_ID,
        "available": False,
        "message": "Tigrinya transcription is not currently available",
    },
    "Chichewa": {
        "code": "nya",
        "model": "dmatekenya/wav2vec2-large-xls-r-300m-chichewa",
        "available": True,
    },
}
|
|
|
|
|
|
|
|
# Registries of loaded models and processors, keyed by language display name.
# A language is only present here when its model loaded successfully.
models = {}
processors = {}

print("[INFO] Loading transcription models...")

# --- SeamlessM4T: one checkpoint shared by all languages configured for it ---
seamless_model_id = "facebook/seamless-m4t-v2-large"
seamless_languages = [
    lang
    for lang, config in LANGUAGE_CONFIG.items()
    if config["available"] and config["model"] == seamless_model_id
]

try:
    seamless_processor = AutoProcessor.from_pretrained(seamless_model_id)
    seamless_model = AutoModelForSpeechSeq2Seq.from_pretrained(seamless_model_id).to("cpu")
except Exception as e:
    print("[ERROR] Failed to load SeamlessM4T model:", e)
    traceback.print_exc()
    # Mirror the Chichewa failure handling: without the model these languages
    # cannot be transcribed, so stop advertising them as available.
    for lang in seamless_languages:
        LANGUAGE_CONFIG[lang]["available"] = False
else:
    # The same model/processor objects are shared across all these languages.
    for lang in seamless_languages:
        models[lang] = seamless_model
        processors[lang] = seamless_processor
    print("[SUCCESS] SeamlessM4T model loaded for " + ", ".join(seamless_languages))

# --- Chichewa: dedicated Wav2Vec2 CTC checkpoint ---
chichewa_model_id = "dmatekenya/wav2vec2-large-xls-r-300m-chichewa"

try:
    chichewa_processor = Wav2Vec2Processor.from_pretrained(chichewa_model_id)
    chichewa_model = Wav2Vec2ForCTC.from_pretrained(chichewa_model_id).to("cpu")
except Exception as e:
    print("[ERROR] Failed to load Chichewa model:", e)
    traceback.print_exc()
    LANGUAGE_CONFIG["Chichewa"]["available"] = False
else:
    models["Chichewa"] = chichewa_model
    processors["Chichewa"] = chichewa_processor
    print("[SUCCESS] Chichewa model loaded successfully")
|
|
|
|
|
|
|
|
def transcribe_audio(audio_file, language):
    """Transcribe an audio file in the given language and return the text.

    Args:
        audio_file: Path of the audio file to read with ``soundfile``. May be
            ``None`` or empty when nothing was recorded or uploaded.
        language: Display name of the language (a key of ``LANGUAGE_CONFIG``).

    Returns:
        The stripped transcription on success. Otherwise a human-readable
        message: no audio supplied, language unknown/unavailable, model not
        loaded, or ``"Transcription failed: ..."`` when reading the audio or
        running the model raised an exception (the error is also printed with
        its traceback).
    """
    if not audio_file:
        return "Please record or upload an audio file first"

    config = LANGUAGE_CONFIG.get(language)
    if config is None:
        return f"Model for {language} is not available"

    # Check configured availability before the model registry so that a
    # language-specific message (e.g. Tigrinya's) is actually reachable:
    # unavailable languages are never registered in `models`.
    if not config["available"]:
        return config.get("message", f"{language} transcription is currently unavailable")

    if language not in models or language not in processors:
        return f"Model for {language} is not available"

    target_sr = 16000  # sampling rate passed to both processors below

    try:
        audio, sr = sf.read(audio_file)
        if audio.ndim > 1:
            # Down-mix multi-channel audio to mono by averaging the channels.
            audio = audio.mean(axis=1)
        if sr != target_sr:
            audio = resampy.resample(audio, sr, target_sr)

        model = models[language]
        processor = processors[language]

        if language == "Chichewa":
            # Wav2Vec2 CTC path: greedy decoding via argmax over the logits.
            inputs = processor(audio, sampling_rate=target_sr, return_tensors="pt", padding=True)
            with torch.no_grad():
                logits = model(**inputs).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = processor.batch_decode(predicted_ids)[0]
        else:
            # SeamlessM4T path: generate text with the language code as target.
            inputs = processor(audio=audio, sampling_rate=target_sr, return_tensors="pt")
            with torch.no_grad():
                generated_ids = model.generate(**inputs, tgt_lang=config["code"])
            transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        return transcription.strip()

    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        print(f"[ERROR] ASR transcription failed for {language}:", e)
        traceback.print_exc()
        return f"Transcription failed: {str(e)[:100]}..."
|
|
|
|
|
|
|
|
# Language pre-selected in the dropdown when the page loads.
_DEFAULT_LANGUAGE = "Swahili"

# Custom styling for the page background, header and language cards.
_CUSTOM_CSS = """
.gradio-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
}
.header {
    text-align: center;
    padding: 20px;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border-radius: 15px;
    margin-bottom: 20px;
    color: white;
}
.language-card {
    background: white;
    padding: 15px;
    border-radius: 10px;
    margin: 10px 0;
    border-left: 4px solid #667eea;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.unavailable {
    background: #ffebee;
    border-left: 4px solid #f44336;
}
.available {
    background: #e8f5e8;
    border-left: 4px solid #4caf50;
}
"""

# Static "about" text rendered at the bottom of the page.
_ABOUT_MARKDOWN = """
---
### ℹ️ About This Service

**Powered by:**
- Facebook SeamlessM4T
- Hugging Face Transformers
- Specialized African Language Models

**Supported Languages & Codes:**
- Amharic (amh)
- Swahili (swh)
- Somali (som)
- Afan Oromo (gaz)
- Chichewa (nya)

**Supported Formats:** WAV, MP3, M4A, FLAC
**Maximum Duration:** 30 seconds per audio

*For best results, use clear audio with minimal background noise*
"""


def _status_html(message, ok):
    """Return the status banner markup: green when ``ok``, red otherwise."""
    background, color = ("#e8f5e8", "#4caf50") if ok else ("#ffebee", "#f44336")
    return (
        f'<div style="text-align: center; padding: 10px; background: {background}; '
        f'border-radius: 5px;"><span style="color: {color};">{message}</span></div>'
    )


def update_status(language):
    """Return the status banner HTML for the selected language.

    Shows "Ready to transcribe" when the language is marked available in
    ``LANGUAGE_CONFIG``; otherwise shows the language's configured
    ``message`` or a generic unavailability notice.
    """
    config = LANGUAGE_CONFIG[language]
    if config["available"]:
        return _status_html("✅ Ready to transcribe", ok=True)
    message = config.get("message", f"{language} transcription is currently unavailable")
    return _status_html(f"❌ {message}", ok=False)


with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="green",
    ),
    title="🌍 GihonTech - Multilingual Speech Recognition",
    css=_CUSTOM_CSS,
) as demo:

    # Header
    with gr.Row():
        with gr.Column():
            gr.HTML("""
            <div class="header">
                <h1>🌍 GihonTech Multilingual Speech Recognition</h1>
                <p>Transcribe audio in multiple African languages with state-of-the-art AI models</p>
            </div>
            """)

    # Main area: inputs on the left, result on the right
    with gr.Row():

        with gr.Column(scale=1):
            gr.Markdown("### 🎤 Upload Audio")

            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Record or Upload Audio",
                elem_classes="audio-input",
            )

            language_select = gr.Dropdown(
                choices=list(LANGUAGE_CONFIG.keys()),
                value=_DEFAULT_LANGUAGE,
                label="Select Language",
                info="Choose the language of your audio",
            )

            submit_btn = gr.Button(
                "🎯 Transcribe Audio",
                variant="primary",
                size="lg",
            )

        with gr.Column(scale=1):
            gr.Markdown("### 📝 Transcription Result")
            transcription_output = gr.Textbox(
                label="Transcribed Text",
                placeholder="Your transcription will appear here...",
                lines=5,
                show_copy_button=True,
            )

            # Start with the banner for the default language so the initial
            # state already reflects whether that language is available.
            status_indicator = gr.HTML(update_status(_DEFAULT_LANGUAGE))

    # One card per configured language
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🌍 Supported Languages")

            for lang, config in LANGUAGE_CONFIG.items():
                if config["available"]:
                    status_class = "available"
                    status_text = "🟢 Available"
                    model_info = config["model"]
                else:
                    status_class = "unavailable"
                    status_text = "🔴 Not Available"
                    model_info = config.get("message", "Not available")

                gr.HTML(f"""
                <div class="language-card {status_class}">
                    <h4>{lang} {status_text}</h4>
                    <p><strong>Model:</strong> {model_info}</p>
                    <p><strong>Language Code:</strong> {config['code']}</p>
                </div>
                """)

    # Footer
    with gr.Row():
        with gr.Column():
            gr.Markdown(_ABOUT_MARKDOWN)

    # Refresh the banner whenever a different language is picked.
    language_select.change(
        fn=update_status,
        inputs=[language_select],
        outputs=status_indicator,
    )

    # After transcribing, recompute the banner from the selected language
    # rather than forcing "Ready": an unavailable language must keep showing
    # its unavailability notice.
    submit_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, language_select],
        outputs=transcription_output,
    ).then(
        fn=update_status,
        inputs=[language_select],
        outputs=status_indicator,
    )
|
|
|
|
|
if __name__ == "__main__":
    # Launch settings when run as a script: listen on 0.0.0.0:7860, with
    # sharing disabled and show_error enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo.launch(**launch_options)