# Hugging Face Space: minte-atnafu/GihonTech_Local_Language_Transcription
# (Header captured from the Space's file viewer -- status: Sleeping; file size: 10,103 bytes.)

import traceback
import soundfile as sf
import torch
import numpy as np
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, Wav2Vec2ForCTC, Wav2Vec2Processor
import gradio as gr
import resampy

# Per-language settings, keyed by the display name shown in the UI.
#   code      -- language code; passed as `tgt_lang` for the SeamlessM4T languages
#   model     -- Hugging Face checkpoint id used for the language
#   available -- whether transcription is offered for the language
#   message   -- optional text shown when the language is unavailable
_SEAMLESS_CHECKPOINT = "facebook/seamless-m4t-v2-large"
_CHICHEWA_CHECKPOINT = "dmatekenya/wav2vec2-large-xls-r-300m-chichewa"

LANGUAGE_CONFIG = {
    "Amharic": dict(code="amh", model=_SEAMLESS_CHECKPOINT, available=True),
    "Swahili": dict(code="swh", model=_SEAMLESS_CHECKPOINT, available=True),
    "Somali": dict(code="som", model=_SEAMLESS_CHECKPOINT, available=True),
    # Afan Oromo is addressed as "gaz" (it was previously "orm"); it is served
    # by SeamlessM4T because that model supports "gaz".
    "Afan Oromo": dict(code="gaz", model=_SEAMLESS_CHECKPOINT, available=True),
    "Tigrinya": dict(
        code="tir",
        model=_SEAMLESS_CHECKPOINT,
        available=False,
        message="Tigrinya transcription is not currently available",
    ),
    "Chichewa": dict(code="nya", model=_CHICHEWA_CHECKPOINT, available=True),
}

# Initialize models
# Both registries are keyed by the language display name used in
# LANGUAGE_CONFIG; a language is only usable when it appears in both.
models = {}
processors = {}

print("[INFO] Loading transcription models...")

# Load the SeamlessM4T checkpoint once and share the same model/processor
# objects between every available language that is configured to use it.
seamless_model_id = "facebook/seamless-m4t-v2-large"
seamless_languages = [
    lang
    for lang, config in LANGUAGE_CONFIG.items()
    if config["available"] and config["model"] == seamless_model_id
]
try:
    seamless_processor = AutoProcessor.from_pretrained(seamless_model_id)
    seamless_model = AutoModelForSpeechSeq2Seq.from_pretrained(seamless_model_id).to("cpu")

    for lang in seamless_languages:
        models[lang] = seamless_model
        processors[lang] = seamless_processor

    print(f"[SUCCESS] SeamlessM4T model loaded for {', '.join(seamless_languages)}")
except Exception as e:
    # Best effort: keep the app running without these languages, but record
    # the failure in the config (as the Chichewa loader below does) so the UI
    # does not advertise languages that have no model behind them.
    print("[ERROR] Failed to load SeamlessM4T model:", e)
    traceback.print_exc()
    for lang in seamless_languages:
        LANGUAGE_CONFIG[lang]["available"] = False

# Load Chichewa model (a Wav2Vec2 CTC checkpoint with its own processor)
try:
    chichewa_model_id = LANGUAGE_CONFIG["Chichewa"]["model"]
    chichewa_processor = Wav2Vec2Processor.from_pretrained(chichewa_model_id)
    chichewa_model = Wav2Vec2ForCTC.from_pretrained(chichewa_model_id).to("cpu")
    models["Chichewa"] = chichewa_model
    processors["Chichewa"] = chichewa_processor
    print("[SUCCESS] Chichewa model loaded successfully")
except Exception as e:
    print("[ERROR] Failed to load Chichewa model:", e)
    traceback.print_exc()
    LANGUAGE_CONFIG["Chichewa"]["available"] = False

# --- Helper: ASR ---
def transcribe_audio(audio_file, language):
    """Transcribe an audio file in the selected language.

    Args:
        audio_file: Path of the audio file to transcribe (the UI passes the
            Gradio audio component's filepath). May be None/empty when
            nothing was recorded or uploaded.
        language: Display name of the language, i.e. a key of LANGUAGE_CONFIG.

    Returns:
        The stripped transcription on success, otherwise a human-readable
        message explaining why no transcription was produced. Errors raised
        while reading or decoding are caught, logged and returned as text
        rather than propagated.
    """
    config = LANGUAGE_CONFIG.get(language)
    if config is None:
        return f"Model for {language} is not available"

    # Availability is checked before the model registry: unavailable languages
    # never get a model loaded, so testing the registry first would hide the
    # configured explanation (e.g. the Tigrinya message).
    if not config["available"]:
        return config.get("message", f"{language} transcription is currently unavailable")

    if language not in models or language not in processors:
        return f"Model for {language} is not available"

    # Nothing recorded or uploaded yet.
    if not audio_file:
        return "No audio provided. Please record or upload an audio file."

    # Sampling rate handed to both processors below.
    target_sr = 16000

    try:
        # Read and preprocess audio: down-mix to mono, resample if needed
        audio, sr = sf.read(audio_file)
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        if sr != target_sr:
            audio = resampy.resample(audio, sr, target_sr)

        model = models[language]
        processor = processors[language]

        # Handle different model types
        if language == "Chichewa":
            # Wav2Vec2 (CTC): pick the most likely token per frame, then decode
            inputs = processor(audio, sampling_rate=target_sr, return_tensors="pt", padding=True)
            with torch.no_grad():
                logits = model(**inputs).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = processor.batch_decode(predicted_ids)[0]
        else:
            # SeamlessM4T: generate text in the language's own code
            inputs = processor(audio=audio, sampling_rate=target_sr, return_tensors="pt")
            with torch.no_grad():
                generated_ids = model.generate(**inputs, tgt_lang=config["code"])
            transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        return transcription.strip()

    except Exception as e:
        # UI boundary: log the full traceback, show the user a short message.
        print(f"[ERROR] ASR transcription failed for {language}:", e)
        traceback.print_exc()
        return f"Transcription failed: {str(e)[:100]}..."

# --- Beautiful Gradio UI ---
# Layout: header, input column (audio + language + button), output column
# (transcription + status line), per-language info cards, and a footer.
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="green",
    ),
    title="🌍 GihonTech - Multilingual Speech Recognition",
    css="""
    .gradio-container {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    }
    .header {
        text-align: center;
        padding: 20px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border-radius: 15px;
        margin-bottom: 20px;
        color: white;
    }
    .language-card {
        background: white;
        padding: 15px;
        border-radius: 10px;
        margin: 10px 0;
        border-left: 4px solid #667eea;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .unavailable {
        background: #ffebee;
        border-left: 4px solid #f44336;
    }
    .available {
        background: #e8f5e8;
        border-left: 4px solid #4caf50;
    }
    """
) as demo:

    # Header Section
    with gr.Row():
        with gr.Column():
            gr.HTML("""
            <div class="header">
                <h1>🌍 GihonTech Multilingual Speech Recognition</h1>
                <p>Transcribe audio in multiple African languages with state-of-the-art AI models</p>
            </div>
            """)

    # Main Content
    with gr.Row():
        # Input Section
        with gr.Column(scale=1):
            gr.Markdown("### 🎤 Upload Audio")

            # type="filepath": the handler receives a path on disk, not samples
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Record or Upload Audio",
                elem_classes="audio-input"
            )

            language_select = gr.Dropdown(
                choices=list(LANGUAGE_CONFIG.keys()),
                value="Swahili",
                label="Select Language",
                info="Choose the language of your audio"
            )

            submit_btn = gr.Button(
                "🎯 Transcribe Audio",
                variant="primary",
                size="lg"
            )

        # Output Section
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Transcription Result")
            transcription_output = gr.Textbox(
                label="Transcribed Text",
                placeholder="Your transcription will appear here...",
                lines=5,
                show_copy_button=True
            )

            # Status indicator
            status_indicator = gr.HTML("""
            <div style="text-align: center; padding: 10px;">
                <span style="color: #4caf50;">✅ Ready to transcribe</span>
            </div>
            """)

    # Language Information Section
    # The cards are rendered once, while the UI is being built, from the
    # state LANGUAGE_CONFIG has at that moment.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🌐 Supported Languages")

            for lang, config in LANGUAGE_CONFIG.items():
                status_class = "unavailable" if not config["available"] else "available"
                status_text = "🔴 Not Available" if not config["available"] else "🟢 Available"
                model_info = config["model"] if config["available"] else config.get("message", "Not available")

                gr.HTML(f"""
                <div class="language-card {status_class}">
                    <h4>{lang} {status_text}</h4>
                    <p><strong>Model:</strong> {model_info}</p>
                    <p><strong>Language Code:</strong> {config['code']}</p>
                </div>
                """)

    # Footer
    with gr.Row():
        with gr.Column():
            gr.Markdown("""
            ---
            ### ℹ️ About This Service
            
            **Powered by:** 
            - Facebook SeamlessM4T
            - Hugging Face Transformers
            - Specialized African Language Models
            
            **Supported Languages & Codes:**
            - Amharic (amh)
            - Swahili (swh) 
            - Somali (som)
            - Afan Oromo (gaz)
            - Chichewa (nya)
            
            **Supported Formats:** WAV, MP3, M4A, FLAC
            **Maximum Duration:** 30 seconds per audio
            
            *For best results, use clear audio with minimal background noise*
            """)

    # Event handlers
    def update_status(language):
        """Return the status-line HTML for the given language.

        Available languages get the green "Ready to transcribe" banner;
        unavailable ones get a red banner carrying the language's optional
        "message" from LANGUAGE_CONFIG, or a generic fallback text.
        """
        config = LANGUAGE_CONFIG[language]
        if config["available"]:
            return '<div style="text-align: center; padding: 10px; background: #e8f5e8; border-radius: 5px;"><span style="color: #4caf50;">✅ Ready to transcribe</span></div>'
        reason = config.get("message", f"{language} transcription is currently unavailable")
        return f'<div style="text-align: center; padding: 10px; background: #ffebee; border-radius: 5px;"><span style="color: #f44336;">⛔ {reason}</span></div>'

    # Connect events
    language_select.change(
        fn=update_status,
        inputs=[language_select],
        outputs=status_indicator
    )

    # After a transcription attempt, refresh the status from the selected
    # language instead of unconditionally showing "Ready", so an unavailable
    # language keeps its warning banner.
    submit_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, language_select],
        outputs=transcription_output
    ).then(
        fn=update_status,
        inputs=[language_select],
        outputs=status_indicator
    )

if __name__ == "__main__":
    # Start the web server only when this file is run as a script.
    # "0.0.0.0" listens on all network interfaces; no public share link is
    # requested, and handler errors are surfaced in the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo.launch(**launch_options)