NeuralFalcon committed on
Commit
92e075b
·
verified ·
1 Parent(s): 6a0cd27

Upload 21 files

Browse files
app.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import os
4
+ import warnings
5
+ import sys
6
+ import os
7
+ fix_import=f"{os.getcwd()}/server"
8
+ sys.path.append(fix_import)
9
+ from inference.audio_chunker import AudioChunker
10
+ from inference.audio_sentence_alignment import AudioAlignment
11
+ from inference.mms_model_pipeline import MMSModel
12
+ from media_transcription_processor import MediaTranscriptionProcessor
13
+ from subtitle import make_subtitle
14
+ from lang_dict import lang_code
15
+ import download_models
16
+
17
+ # warnings.filterwarnings("ignore", category=UserWarning, module="torchaudio")
18
+ warnings.filterwarnings(
19
+ "ignore",
20
+ message=".*torchaudio.functional._alignment.forced_align.*",
21
+ category=UserWarning
22
+ )
23
+
24
+
25
+ # ---- Setup Model Globals ----
26
+ _model_loaded = False
27
+ _model_loading = False
28
+
29
+ # ---- Initialize model ----
30
def load_model(model_name="omniASR_LLM_1B"):
    """Load the MMS model and its helper singletons once per process.

    Args:
        model_name (str): omniASR model card to load (e.g. "omniASR_LLM_1B").

    Notes:
        Module-level flags make repeated calls no-ops while a load is in
        progress or after it has completed.
    """
    global _model_loaded, _model_loading
    if _model_loaded or _model_loading:
        return

    _model_loading = True
    print(f"🔄 Loading {model_name} model...")
    try:
        # Instantiate the singleton helpers used later by the pipeline.
        AudioChunker()
        AudioAlignment()

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        MMSModel(model_card=model_name, device=device)

        _model_loaded = True
        print("✅ Model loaded successfully.")
    finally:
        # Bug fix: always clear the in-progress flag. Previously an exception
        # during loading left _model_loading=True forever, so every later
        # call returned immediately and the model could never be retried.
        _model_loading = False
48
+
49
+
50
+ # ---- Transcription function ----
51
def media_transcription(file_path, lang_code="eng_Latn"):
    """Transcribe a media file and build subtitle files for it.

    Args:
        file_path: Path to the uploaded/recorded media file.
        lang_code: Language-with-script code (e.g. "eng_Latn").

    Returns:
        Tuple of (transcription text, sentence-level SRT path,
        word-level SRT path, shorts SRT path).
    """
    with open(file_path, "rb") as media_file:
        payload = media_file.read()

    processor = MediaTranscriptionProcessor(
        media_bytes=payload,
        filename=file_path,
        language_with_script=lang_code,
    )
    processor.convert_media()
    processor.transcribe_full_pipeline()
    results = processor.get_results()

    # Reshape aligned segments into the {word, start, end} records that
    # make_subtitle() expects.
    word_level_timestamps = [
        {"word": seg['text'], "start": seg['start'], "end": seg['end']}
        for seg in results.get('aligned_segments', [])
    ]

    sentence_srt, word_level_srt, shorts_srt = make_subtitle(
        word_level_timestamps, file_path
    )
    return results['transcription'], sentence_srt, word_level_srt, shorts_srt
74
+
75
+
76
+
77
def transcribe_interface(audio, selected_lang):
    """Gradio callback: transcribe the given audio and return subtitles.

    Args:
        audio: Filepath supplied by the gr.Audio component (None if empty).
        selected_lang: Display name chosen in the language dropdown.

    Returns:
        Tuple of (transcription text or error message, sentence SRT,
        word-level SRT, shorts SRT) — file slots are None on error.
    """
    if audio is None:
        return "Please upload or record audio.", None, None, None

    file_path = audio

    # Bug fix: the dict lookup used to sit outside the try block, so an
    # unknown dropdown value raised an uncaught KeyError instead of showing
    # a friendly message in the UI.
    find_lang_code = lang_code.get(selected_lang)
    if find_lang_code is None:
        return f"❌ Error: unknown language '{selected_lang}'", None, None, None

    try:
        transcription, sentence_srt, word_level_srt, shorts_srt = media_transcription(file_path, find_lang_code)
        return transcription, sentence_srt, word_level_srt, shorts_srt
    except Exception as e:
        return f"❌ Error: {e}", None, None, None
93
+
94
+
95
+
96
def ui():
    """Build and return the Gradio Blocks interface for the ASR demo.

    Layout: left column takes audio input (mic or upload) plus a language
    dropdown; right column shows the transcription and an accordion with
    the three downloadable subtitle files.
    """
    # Dropdown choices come from the language display names in lang_dict.
    lang_list = list(lang_code.keys())
    custom_css = """.gradio-container { font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, sans-serif; }"""
    with gr.Blocks(theme=gr.themes.Soft(),css=custom_css) as demo:
        gr.HTML("""
        <div style="text-align: center; margin: 20px auto; max-width: 800px;">
        <h1 style="font-size: 2.5em; margin-bottom: 10px;">Meta Omnilingual ASR</h1>
        <a href="https://github.com/NeuralFalconYT/omnilingual-asr-colab" target="_blank" style="display: inline-block; padding: 10px 20px; background-color: #4285F4; color: white; border-radius: 6px; text-decoration: none; font-size: 1em;">😇 Run on Google Colab</a>
        </div>
        """)

        with gr.Row():
            with gr.Column():
                # type="filepath" so the callback receives a path, matching
                # media_transcription's file-based API.
                audio_input = gr.Audio(sources=[ "microphone","upload"], type="filepath", label="🎙 Upload or Record Audio")
                language_dropdown = gr.Dropdown(
                    choices=lang_list,
                    value=lang_list[0],
                    label="🌐 Select Language"
                )
                transcribe_btn = gr.Button("🚀 Transcribe")
            with gr.Column():
                transcription_output = gr.Textbox(label="Transcription", lines=8,show_copy_button=True)
                # Collapsed by default; alignment-derived subtitles are
                # approximate, hence the "(Not Accurate)" label.
                with gr.Accordion("🎬 Subtitle (Not Accurate)", open=False):
                    sentence_srt_out = gr.File(label="Sentence-level Subtitle File")
                    word_srt_out = gr.File(label="Word-level Subtitle File")
                    shorts_srt_out = gr.File(label="Shorts Subtitle File")

        transcribe_btn.click(
            fn=transcribe_interface,
            inputs=[audio_input, language_dropdown],
            outputs=[transcription_output, sentence_srt_out, word_srt_out, shorts_srt_out]
        )

    return demo
130
+
131
+
132
+
133
+
134
import click

@click.command()
@click.option(
    "--debug",
    is_flag=True,
    default=False,
    help="Enable debug mode (shows detailed logs)."
)
@click.option(
    "--share",
    is_flag=True,
    default=False,
    help="Create a public Gradio share link (for Colab or remote usage)."
)
@click.option(
    "--model",
    default="omniASR_LLM_1B",
    type=click.Choice([
        "omniASR_CTC_300M",
        "omniASR_CTC_1B",
        "omniASR_CTC_3B",
        "omniASR_CTC_7B",
        "omniASR_LLM_300M",
        "omniASR_LLM_1B",
        "omniASR_LLM_3B",
        "omniASR_LLM_7B",
        "omniASR_LLM_7B_ZS",
    ]),
    help="Choose the OmniASR model to load."
)
def main(debug, share, model):
    """Universal CLI entry point for omniASR transcription UI."""
    print(f"\n🚀 Starting omniASR UI with model: {model}")
    # Load model weights once before building the interface so the first
    # transcription request does not pay the load cost.
    load_model(model)
    # Build the Blocks UI and launch with a request queue enabled.
    demo = ui()
    demo.queue().launch(share=share, debug=debug)

if __name__ == "__main__":
    main()
178
+
179
+
server/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ models/
server/audio_transcription.py ADDED
@@ -0,0 +1,867 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ # Standard library imports
4
+ import logging
5
+ import os
6
+ import tempfile
7
+ from typing import Dict, List, Optional, Tuple
8
+
9
+ # Third-party imports
10
+ import librosa
11
+ import numpy as np
12
+ import soundfile as sf
13
+ import torch
14
+ import uroman
15
+
16
+ # fairseq2 imports
17
+ from inference.align_utils import get_uroman_tokens
18
+ from inference.audio_chunker import AudioChunker
19
+
20
+ from inference.audio_reading_tools import wav_to_bytes
21
+
22
+ # Import AudioAlignment and its config classes
23
+ from inference.audio_sentence_alignment import AudioAlignment
24
+ from inference.mms_model_pipeline import MMSModel
25
+ from inference.text_normalization import text_normalize
26
+ from transcription_status import transcription_status
27
+ from env_vars import USE_CHUNKING
28
+
29
+ # Constants
30
+ SAMPLE_RATE = 16000
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
def transcribe_single_chunk(audio_tensor: torch.Tensor, sample_rate: int = 16000, language_with_script: str = None):
    """
    Basic transcription pipeline for a single audio chunk using MMS model pipeline.
    This is the lowest-level transcription function that handles individual audio segments.

    Args:
        audio_tensor (torch.Tensor): Audio tensor (1D waveform)
        sample_rate (int): Sample rate of the audio tensor (currently unused
            here; kept for API symmetry with the callers)
        language_with_script (str): language code for transcription
            (3-letter ISO codes like "eng", "spa") with script suffix

    Returns:
        str: Transcribed text ("" when the model produces no output)

    Raises:
        Exception: re-raises any pipeline error after logging it.
    """
    logger.info("Starting complete audio transcription pipeline...")

    try:
        logger.info("Using pipeline transcription...")
        # Use the singleton model instance
        model = MMSModel.get_instance()

        # Transcribe using pipeline - wrap the language code in a list as
        # expected by the batch API.
        lang_list = [language_with_script] if language_with_script else None
        results = model.transcribe_audio(audio_tensor, batch_size=1, language_with_scripts=lang_list)

        # Bug fix: an empty result list previously fell through to
        # str({}) and this function returned the literal string "{}".
        if not results:
            logger.warning("Pipeline returned no results")
            return ""
        result = results[0]

        # Convert pipeline result to plain text regardless of result shape.
        if isinstance(result, dict) and 'text' in result:
            transcription_text = result['text']
        elif isinstance(result, str):
            transcription_text = result
        else:
            transcription_text = str(result)

        if not transcription_text.strip():
            logger.warning("Pipeline returned empty transcription")
            return ""

        logger.info(f"✓ Pipeline transcription successful: '{transcription_text}'")
        return transcription_text

    except Exception as e:
        logger.error(f"Error in transcription pipeline: {str(e)}", exc_info=True)
        raise
81
+
82
+
83
def perform_forced_alignment(
    audio_tensor: torch.Tensor,
    transcription_tokens: List[str],
    device,
    sample_rate: int = 16000,
) -> List[Dict]:
    """
    Perform forced alignment using the AudioAlignment class.

    Romanizes the transcription into character-level tokens, aligns them
    against the audio, then groups the character timings back into
    word-level segments. Falls back to uniform timestamps if alignment
    fails entirely.

    Args:
        audio_tensor (torch.Tensor): Audio tensor (1D waveform)
        transcription_tokens (List[str]): Word tokens from transcription
        device: Device for computation (unused here; kept for API
            compatibility with callers)
        sample_rate (int): Audio sample rate

    Returns:
        List[Dict]: Segments with "text", "start", "end", "duration" keys.
    """
    try:
        logger.info("Starting forced alignment with audio tensor")
        logger.info(f"Audio shape: {audio_tensor.shape}, sample_rate: {sample_rate}")
        logger.info(f"Tokens to align: {transcription_tokens}")

        # Normalize input to a flat float tensor on the CPU.
        if hasattr(audio_tensor, "cpu"):
            alignment_tensor = audio_tensor.float()
        else:
            alignment_tensor = torch.from_numpy(audio_tensor).float()
        if len(alignment_tensor.shape) > 1:
            alignment_tensor = alignment_tensor.flatten()
        # Move to CPU first to avoid CUDA-tensor-to-numpy conversion errors.
        audio_tensor_cpu = alignment_tensor.cpu() if alignment_tensor.is_cuda else alignment_tensor

        # AudioAlignment consumes encoded audio bytes, not raw samples.
        audio_arr = wav_to_bytes(audio_tensor_cpu, sample_rate=sample_rate, format="wav")
        logger.info(f"Converted audio to bytes: {len(audio_arr)} bytes")

        # Bug fix: create the romanizer before the try block below. It is
        # also needed by the per-word boundary loop further down; previously
        # it was only bound inside the try, so if preprocessing failed the
        # later loop raised NameError and alignment degraded to the uniform
        # fallback.
        uroman_instance = uroman.Uroman()

        # Preprocess tokens for the MMS alignment model (same approach as
        # TextRomanizer): normalize, romanize, split into character tokens.
        try:
            transcription_text = " ".join(transcription_tokens)
            normalized_text = text_normalize(transcription_text.strip(), "en")
            uroman_tokens_str = get_uroman_tokens(
                [normalized_text], uroman_instance, "en"
            )[0]
            alignment_tokens = uroman_tokens_str.split()

            logger.info(f"Original text: '{transcription_text}'")
            logger.info(f"Normalized text: '{normalized_text}'")
            logger.info(
                f"Alignment tokens (count={len(alignment_tokens)}): {alignment_tokens[:20]}..."
            )
        except Exception as e:
            logger.warning(
                f"Failed to preprocess tokens with TextRomanizer approach: {e}"
            )
            logger.exception("Full error traceback:")
            # Fallback: simple character-level tokenization (spaces kept).
            transcription_text = " ".join(transcription_tokens).lower()
            alignment_tokens = list(transcription_text)
            logger.info(f"Using fallback character tokens: {alignment_tokens[:20]}...")

        logger.info(
            f"Using {len(alignment_tokens)} alignment tokens for forced alignment"
        )

        alignment = AudioAlignment()

        # Warn about RTL characters (Arabic, Hebrew, ...) — these have
        # triggered LTR assertions in the aligner before.
        rtl_chars = []
        for i, token in enumerate(alignment_tokens):
            for char in str(token):
                if (
                    "\u0590" <= char <= "\u08ff"
                    or "\ufb1d" <= char <= "\ufdff"
                    or "\ufe70" <= char <= "\ufeff"
                ):
                    rtl_chars.append((i, token, char, ord(char)))
        if rtl_chars:
            logger.warning(f"Found RTL characters in tokens: {rtl_chars[:10]}...")

        try:
            audio_segments = alignment.get_one_row_alignments(
                audio_arr, sample_rate, alignment_tokens
            )
        except Exception as alignment_error:
            logger.error(f"Alignment failed with error: {alignment_error}")
            logger.error(f"Error type: {type(alignment_error)}")

            if "ltr" in str(alignment_error).lower():
                logger.error("LTR assertion error detected; retrying with ASCII-only tokens")
                ascii_tokens = []
                for token in alignment_tokens:
                    # Keep only ASCII characters.
                    ascii_token = "".join(c for c in str(token) if ord(c) < 128)
                    if ascii_token:
                        ascii_tokens.append(ascii_token)
                logger.info(
                    f"ASCII tokens (count={len(ascii_tokens)}): {ascii_tokens[:20]}..."
                )
                try:
                    # Bug fix: the retry previously omitted sample_rate
                    # (get_one_row_alignments(audio_arr, ascii_tokens)),
                    # which made the ASCII fallback fail unconditionally.
                    audio_segments = alignment.get_one_row_alignments(
                        audio_arr, sample_rate, ascii_tokens
                    )
                    alignment_tokens = ascii_tokens  # Update for later use
                    logger.info("ASCII fallback successful!")
                except Exception as ascii_error:
                    logger.error(f"ASCII fallback also failed: {ascii_error}")
                    raise alignment_error
            else:
                raise

        logger.info(
            f"Alignment completed, got {len(audio_segments)} character segments"
        )

        if not audio_segments or not transcription_tokens:
            logger.warning("No audio segments or transcription tokens available")
            return []

        # Segment dict keys produced by audio_sentence_alignment.
        start_key, duration_key = "segment_start_sec", "segment_duration"
        last_segment = audio_segments[-1]
        total_audio_duration = last_segment.get(start_key, 0) + last_segment.get(
            duration_key, 0
        )
        logger.info(
            f"Total audio duration from segments: {total_audio_duration:.3f}s"
        )

        # Space-free transcription, used for the proportional fallbacks below.
        transcription_text = "".join(transcription_tokens)

        # Map each word to its span in the romanized token sequence so the
        # character-level timings can be grouped back into words.
        word_boundaries = []
        alignment_pos = 0
        for word in transcription_tokens:
            try:
                normalized_word = text_normalize(word.strip(), "en")
                uroman_word_str = get_uroman_tokens([normalized_word], uroman_instance, "en")[0]
                romanized_word_tokens = uroman_word_str.split()
                word_start = alignment_pos
                word_end = alignment_pos + len(romanized_word_tokens)
            except Exception as e:
                logger.warning(f"Failed to romanize word '{word}': {e}")
                # Fallback: estimate the span from the character-length ratio.
                estimated_length = max(1, int(len(word) * len(alignment_tokens) / len(transcription_text)))
                word_start = alignment_pos
                word_end = min(alignment_pos + estimated_length, len(alignment_tokens))
            word_boundaries.append((word_start, word_end))
            alignment_pos = word_end

        logger.info(f"Word boundaries (romanized): {word_boundaries[:5]}...")
        logger.info(f"Total alignment tokens used: {alignment_pos}/{len(alignment_tokens)}")

        # Convert character-level segments to word-level segments using the
        # actual alignment timings (preserves natural silences).
        aligned_segments = []
        for word_idx, (word, (word_start, word_end)) in enumerate(
            zip(transcription_tokens, word_boundaries)
        ):
            start_idx = max(0, min(word_start, len(audio_segments) - 1))
            end_idx = min(word_end, len(audio_segments))
            word_segments = audio_segments[start_idx:end_idx]

            if word_segments:
                start_times = [seg.get(start_key, 0) for seg in word_segments]
                end_times = [
                    seg.get(start_key, 0) + seg.get(duration_key, 0)
                    for seg in word_segments
                ]
                start_time = min(start_times)
                end_time = max(end_times)
                duration = end_time - start_time

                # Enforce a 50 ms minimum word duration.
                if duration < 0.05:
                    duration = 0.05
                    end_time = start_time + duration
            else:
                logger.warning(
                    f"No segments found for word '{word}' at position {word_start}-{word_end}"
                )
                # Fallback: proportional timing from the word's character span.
                if total_audio_duration > 0 and len(transcription_text) > 0:
                    start_time = (word_start / len(transcription_text)) * total_audio_duration
                    end_time = (word_end / len(transcription_text)) * total_audio_duration
                    duration = end_time - start_time
                else:
                    # Ultimate fallback: fixed half-second slots.
                    duration = 0.5
                    start_time = word_idx * duration
                    end_time = start_time + duration

            aligned_segments.append(
                {
                    "text": word,
                    "start": start_time,
                    "end": end_time,
                    "duration": duration,
                }
            )

        # Fix overlaps only; natural gaps (silence) between words are kept.
        for i in range(1, len(aligned_segments)):
            prev_end = aligned_segments[i - 1]["end"]
            if aligned_segments[i]["start"] < prev_end:
                aligned_segments[i]["start"] = prev_end
                aligned_segments[i]["duration"] = (
                    aligned_segments[i]["end"] - aligned_segments[i]["start"]
                )

        logger.info(f"Forced alignment completed: {len(aligned_segments)} segments")
        return aligned_segments

    except Exception as e:
        logger.error(f"Error in forced alignment: {str(e)}", exc_info=True)

        # Fallback: uniform timestamps spread over the audio duration.
        logger.info("Using fallback uniform timestamps")
        try:
            total_duration = (
                len(audio_tensor) / sample_rate
                if len(audio_tensor) > 0
                else len(transcription_tokens) * 0.5
            )
        except Exception:
            total_duration = len(transcription_tokens) * 0.5

        segment_duration = (
            total_duration / len(transcription_tokens) if transcription_tokens else 1.0
        )
        fallback_segments = [
            {
                "text": token,
                "start": i * segment_duration,
                "end": (i + 1) * segment_duration,
                "duration": segment_duration,
            }
            for i, token in enumerate(transcription_tokens)
        ]
        logger.info(
            f"Using fallback uniform timestamps: {len(fallback_segments)} segments"
        )
        return fallback_segments
505
+
506
+
507
def transcribe_with_word_alignment(audio_tensor: torch.Tensor, sample_rate: int = 16000, language_with_script: str = None) -> Dict:
    """
    Transcription pipeline that adds word-level timing via forced alignment.

    Args:
        audio_tensor (torch.Tensor): Audio tensor (1D waveform)
        sample_rate (int): Sample rate of the audio tensor
        language_with_script (str): language code with script suffix
            (3-letter ISO codes like "eng", "spa")

    Returns:
        Dict with "transcription", "tokens", "aligned_segments" and
        "total_duration" keys; "num_segments" on success, or an
        "alignment_error"/"error" key on failure.
    """
    transcription_text = None
    try:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Get the plain transcription first.
        transcription_text = transcribe_single_chunk(
            audio_tensor, sample_rate=sample_rate, language_with_script=language_with_script
        )

        if not transcription_text:
            return {
                "transcription": "",
                "tokens": [],
                "aligned_segments": [],
                "total_duration": 0.0,
            }

        # Tokenize the transcription for alignment.
        tokens = transcription_text.split()

        logger.info("Performing forced alignment with original audio tensor...")
        aligned_segments = perform_forced_alignment(audio_tensor, tokens, device, sample_rate)

        total_duration = aligned_segments[-1]["end"] if aligned_segments else 0.0

        result = {
            "transcription": transcription_text,
            "tokens": tokens,
            "aligned_segments": aligned_segments,
            "total_duration": total_duration,
            "num_segments": len(aligned_segments),
        }
        logger.info(
            f"Transcription with alignment completed: {len(aligned_segments)} segments, {total_duration:.2f}s total"
        )
        return result

    except Exception as e:
        logger.error(f"Error in transcription with alignment: {str(e)}", exc_info=True)
        # Fall back to plain transcription without timing information.
        try:
            # Improvement: only re-run the expensive model call if the
            # failure happened before transcription completed (previously
            # the model was always invoked a second time here).
            if transcription_text is None:
                transcription_text = transcribe_single_chunk(
                    audio_tensor, sample_rate=sample_rate, language_with_script=language_with_script
                )
            tokens = transcription_text.split() if transcription_text else []

            return {
                "transcription": transcription_text,
                "tokens": tokens,
                "aligned_segments": [],
                "total_duration": 0.0,
                "alignment_error": str(e),
            }
        except Exception as e2:
            logger.error(f"Error in fallback transcription: {str(e2)}", exc_info=True)
            return {
                "transcription": "",
                "tokens": [],
                "aligned_segments": [],
                "total_duration": 0.0,
                "error": str(e2),
            }
582
+
583
+
584
def _validate_and_adjust_segments(
    aligned_segments: List[Dict],
    chunk_start_time: float,
    chunk_audio_tensor: torch.Tensor,
    chunk_sample_rate: int,
    chunk_duration: float,
    chunk_index: int
) -> List[Dict]:
    """
    Private helper function to validate and adjust segment timestamps to global timeline.

    Args:
        aligned_segments: Raw segments from forced alignment (local chunk timeline);
            each is a dict with at least "text", "start" and "end" keys
        chunk_start_time: Start time of this chunk in global timeline
        chunk_audio_tensor: Audio tensor for this chunk (to get actual duration)
        chunk_sample_rate: Sample rate of the chunk
        chunk_duration: Reported duration of the chunk (only used as a fallback
            when the chunk tensor is empty)
        chunk_index: Index of this chunk for debugging

    Returns:
        List of validated segments with global timeline timestamps
    """
    adjusted_segments = []

    # Get the actual audio duration from the chunk tensor instead of the potentially incorrect chunk duration
    actual_chunk_duration = len(chunk_audio_tensor) / chunk_sample_rate if len(chunk_audio_tensor) > 0 else chunk_duration

    for segment in aligned_segments:
        original_start = segment["start"]
        original_end = segment["end"]

        # Validate that segment timestamps are within chunk boundaries
        if original_start < 0:
            logger.warning(
                f"Segment '{segment['text']}' has negative start time {original_start:.3f}s, clipping to 0"
            )
            original_start = 0

        if original_end > actual_chunk_duration + 1.0:  # Allow 1s buffer for alignment errors
            logger.warning(
                f"Segment '{segment['text']}' end time {original_end:.3f}s exceeds actual chunk duration {actual_chunk_duration:.3f}s, clipping"
            )
            original_end = actual_chunk_duration

        if original_start >= original_end:
            logger.warning(
                f"Segment '{segment['text']}' has invalid timing {original_start:.3f}s-{original_end:.3f}s, using fallback"
            )
            # Use proportional timing based on segment position using actual chunk duration.
            # The position within the chunk is how many segments we have already emitted.
            segment_index = len(adjusted_segments)
            total_segments = len(aligned_segments)
            if total_segments > 0:  # always true inside this loop; kept as a defensive guard
                segment_proportion = segment_index / total_segments
                next_proportion = (segment_index + 1) / total_segments
                original_start = segment_proportion * actual_chunk_duration
                original_end = next_proportion * actual_chunk_duration
            else:
                original_start = 0
                original_end = 0.5

        # Create segment with absolute timeline
        adjusted_segment = {
            "text": segment["text"],
            "start": original_start + chunk_start_time,  # Global timeline
            "end": original_end + chunk_start_time,  # Global timeline
            "duration": original_end - original_start,
            "chunk_index": chunk_index,
            "original_start": original_start,  # Local chunk time
            "original_end": original_end,  # Local chunk time
        }

        adjusted_segments.append(adjusted_segment)

        logger.debug(
            f"Segment '{segment['text']}': {original_start:.3f}-{original_end:.3f} -> {adjusted_segment['start']:.3f}-{adjusted_segment['end']:.3f}"
        )

    logger.info(
        f"Adjusted {len(adjusted_segments)} segments to absolute timeline (chunk starts at {chunk_start_time:.2f}s)"
    )

    return adjusted_segments
666
+
667
+
668
def transcribe_full_audio_with_chunking(
    audio_tensor: torch.Tensor, sample_rate: int = 16000, chunk_duration: float = 30.0, language_with_script: str = None, progress_callback=None
) -> Dict:
    """
    Complete audio transcription pipeline that handles any length audio with intelligent chunking.
    This is the full-featured transcription function that can process both short and long audio files.

    Chunking mode is controlled by USE_CHUNKING environment variable:
    - USE_CHUNKING=false: No chunking (single chunk mode)
    - USE_CHUNKING=true (default): VAD-based intelligent chunking

    Args:
        audio_tensor: Audio tensor (1D waveform)
        sample_rate: Sample rate of the audio tensor
        chunk_duration: Target chunk duration in seconds (for static chunking)
        language_with_script: {Language code}_{script} for transcription
        progress_callback: Optional callback for progress updates.
            NOTE(review): currently unused — progress is reported via
            transcription_status.update_progress() instead; confirm intended.

    Returns:
        Dict with full transcription and segment information including word-level timestamps.
        On failure, returns a dict with empty fields and an "error" key instead of raising.
    """

    try:
        logger.info(f"Starting long-form transcription: tensor shape {audio_tensor.shape} at {sample_rate}Hz")
        logger.info(f"USE_CHUNKING = {USE_CHUNKING}")

        # Initialize chunker (singleton — the VAD model is only loaded once)
        chunker = AudioChunker()

        # Determine chunking mode based on USE_CHUNKING setting
        chunking_mode = "vad" if USE_CHUNKING else "none"

        # Chunk the audio using the new unified interface
        # Ensure tensor is 1D before chunking (squeeze any extra dimensions)
        if len(audio_tensor.shape) > 1:
            logger.info(f"Squeezing audio tensor from {audio_tensor.shape} to 1D")
            audio_tensor_1d = audio_tensor.squeeze()
        else:
            audio_tensor_1d = audio_tensor

        chunks = chunker.chunk_audio(audio_tensor_1d, sample_rate=sample_rate, mode=chunking_mode, chunk_duration=chunk_duration)

        if not chunks:
            logger.warning("No audio chunks created")
            return {
                "transcription": "",
                "chunks": [],
                "total_duration": 0.0,
                "error": "No audio content detected",
            }

        logger.info(f"Processing {len(chunks)} audio chunks (mode: {chunking_mode})")

        # Validate chunk continuity (log-only sanity check; nothing is corrected here)
        for i, chunk in enumerate(chunks):
            logger.info(
                f"Chunk {i+1}: {chunk['start_time']:.2f}s - {chunk['end_time']:.2f}s ({chunk['duration']:.2f}s)"
            )
            if i > 0:
                prev_end = chunks[i - 1]["end_time"]
                current_start = chunk["start_time"]
                gap = current_start - prev_end
                if abs(gap) > 0.1:  # More than 100ms gap/overlap
                    logger.warning(
                        f"Gap/overlap between chunks {i} and {i+1}: {gap:.3f}s"
                    )

        # Process each chunk - now all chunks have uniform format!
        all_segments = []
        full_transcription_parts = []
        total_duration = 0.0
        chunk_details = []

        for i, chunk in enumerate(chunks):
            logger.info(
                f"Processing chunk {i+1}/{len(chunks)} ({chunk['duration']:.1f}s, {chunk['start_time']:.1f}s-{chunk['end_time']:.1f}s)"
            )

            try:
                # Process this chunk using tensor-based transcription pipeline
                # Use the chunk's audio_data tensor directly - no more file operations!
                chunk_audio_tensor = chunk["audio_data"]
                chunk_sample_rate = chunk["sample_rate"]

                chunk_result = transcribe_with_word_alignment(
                    audio_tensor=chunk_audio_tensor,
                    sample_rate=chunk_sample_rate,
                    language_with_script=language_with_script
                )

                # Process alignment results - uniform handling for all chunk types
                chunk_segments = []
                chunk_start_time = chunk["start_time"]
                # NOTE(review): this rebinds the `chunk_duration` parameter with the
                # per-chunk duration; the original target value is no longer needed
                # past the chunker call above, but the shadowing is easy to misread.
                chunk_duration = chunk["duration"]

                if chunk_result.get("aligned_segments"):
                    logger.info(
                        f"Chunk {i+1} has {len(chunk_result['aligned_segments'])} segments"
                    )

                    chunk_segments = _validate_and_adjust_segments(
                        aligned_segments=chunk_result["aligned_segments"],
                        chunk_start_time=chunk_start_time,
                        chunk_audio_tensor=chunk_audio_tensor,
                        chunk_sample_rate=chunk_sample_rate,
                        chunk_duration=chunk_duration,
                        chunk_index=i
                    )

                    all_segments.extend(chunk_segments)
                    logger.info(f"Chunk {i+1} processed {len(chunk_segments)} valid segments")

                # Add to full transcription
                chunk_transcription = ""
                if chunk_result.get("transcription"):
                    chunk_transcription = chunk_result["transcription"]
                    full_transcription_parts.append(chunk_transcription)

                # Store detailed chunk information
                chunk_detail = {
                    "chunk_index": i,
                    "start_time": chunk["start_time"],
                    "end_time": chunk["end_time"],
                    "duration": chunk["duration"],
                    "transcription": chunk_transcription,
                    "num_segments": len(chunk_segments),
                    "segments": chunk_segments,
                }
                chunk_details.append(chunk_detail)

                total_duration = max(total_duration, chunk["end_time"])

                # Update progress linearly from 0.1 to 0.9 based on chunk processing
                progress = 0.1 + (0.8 * (i + 1) / len(chunks))
                transcription_status.update_progress(progress)

                logger.info(
                    f"Chunk {i+1} processed: '{chunk_transcription}' ({len(chunk_segments)} segments)"
                )

            except Exception as chunk_error:
                # Per-chunk failures are tolerated: the chunk is simply absent
                # from the combined output.
                logger.error(f"Error processing chunk {i+1}: {chunk_error}")
                # Continue with next chunk

        # Combine results
        full_transcription = " ".join(full_transcription_parts)

        # Validate segment continuity (log-only)
        logger.info("Validating segment continuity...")
        for i in range(1, len(all_segments)):
            prev_end = all_segments[i - 1]["end"]
            current_start = all_segments[i]["start"]
            gap = current_start - prev_end
            if abs(gap) > 1.0:  # More than 1 second gap
                logger.warning(f"Large gap between segments {i-1} and {i}: {gap:.3f}s")

        result = {
            "transcription": full_transcription,
            "aligned_segments": all_segments,
            "chunks": [
                {
                    "chunk_index": chunk_detail["chunk_index"],
                    "start_time": chunk_detail["start_time"],
                    "end_time": chunk_detail["end_time"],
                    "duration": chunk_detail["duration"],
                    "transcription": chunk_detail["transcription"],
                    "num_segments": chunk_detail["num_segments"],
                }
                for chunk_detail in chunk_details
            ],
            "chunk_details": chunk_details,  # Full details including segments per chunk
            "total_duration": total_duration,
            "num_chunks": len(chunks),
            "num_segments": len(all_segments),
            "status": "success",
        }

        logger.info(
            f"Long-form transcription completed: {len(chunks)} chunks, {total_duration:.1f}s total"
        )
        logger.info(f"Total segments: {len(all_segments)}")

        # Log chunk timing summary
        for chunk_detail in chunk_details:
            logger.info(
                f"Chunk {chunk_detail['chunk_index']}: {chunk_detail['start_time']:.2f}-{chunk_detail['end_time']:.2f}s, {chunk_detail['num_segments']} segments"
            )

        return result

    except Exception as e:
        logger.error(f"Error in long-form transcription: {str(e)}", exc_info=True)
        return {
            "transcription": "",
            "chunks": [],
            "total_duration": 0.0,
            "error": str(e),
        }
866
+
867
+
server/convert_media_to_wav.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Media to WAV Converter Module
3
+
4
+ Converts various media formats (m4a, mp3, mp4, etc.) to standardized WAV files
5
+ and PyTorch tensors for audio transcription pipelines.
6
+
7
+ Standardization:
8
+ - 16kHz sample rate
9
+ - Mono channel (merged if multi-channel)
10
+ - Layer normalized
11
+ - bfloat16 dtype tensor
12
+ - Fail-fast error handling
13
+ """
14
+
15
+ import os
16
+ import tempfile
17
+ from pathlib import Path
18
+ from typing import Tuple, Union, Optional
19
+
20
+ import librosa
21
+ import numpy as np
22
+ import soundfile as sf
23
+ import torch
24
+ import torch.nn.functional as F
25
+ from pydub import AudioSegment
26
+ from pydub.utils import which
27
+
28
+
29
+ # Constants
30
+ TARGET_SAMPLE_RATE = 16000
31
+ TARGET_DTYPE = torch.bfloat16
32
+
33
+
34
def verify_ffmpeg_installation():
    """Raise RuntimeError if the ffmpeg binary is not available on PATH."""
    ffmpeg_binary = which("ffmpeg")
    if ffmpeg_binary:
        return
    raise RuntimeError(
        "FFmpeg not found. Please install FFmpeg for media format support. "
        "On Ubuntu: sudo apt install ffmpeg"
    )
41
+
42
+
43
def layer_norm(tensor: torch.Tensor, shape: torch.Size) -> torch.Tensor:
    """Standardize *tensor* to zero mean and (if possible) unit variance.

    Despite the name, this is a global standardization over the whole tensor,
    not a per-feature layer norm; *shape* is accepted for API compatibility
    but unused. When the standard deviation is zero the tensor is only
    mean-centered to avoid division by zero.
    """
    centered = tensor - tensor.mean()
    spread = tensor.std()
    if spread != 0:
        centered = centered / spread
    return centered
51
+
52
+
53
def detect_media_format(file_path: str) -> str:
    """Return a lowercase media format name derived from the file extension."""
    extension = Path(file_path).suffix.lower()

    known_formats = {
        '.wav': 'wav',
        '.mp3': 'mp3',
        '.m4a': 'm4a',
        '.aac': 'aac',
        '.flac': 'flac',
        '.ogg': 'ogg',
        '.wma': 'wma',
        '.mp4': 'mp4',
        '.avi': 'avi',
        '.mov': 'mov',
        '.mkv': 'mkv',
    }

    if extension in known_formats:
        return known_formats[extension]

    # Unknown extension: pass it through without the leading dot and let
    # downstream processing produce a detailed error message if unsupported.
    return extension[1:] if extension.startswith('.') else extension
75
+
76
+
77
def convert_to_wav_with_pydub(input_path: str, output_path: str, format_hint: str = None):
    """Decode *input_path* with pydub (FFmpeg backend) and write it as WAV."""
    verify_ffmpeg_installation()

    # Honour an explicit format hint; otherwise let pydub sniff the container.
    loader_kwargs = {"format": format_hint} if format_hint else {}
    segment = AudioSegment.from_file(input_path, **loader_kwargs)

    # Only the container conversion happens here; resampling/normalization is
    # done later by librosa in process_wav_to_standard_format.
    segment.export(output_path, format="wav")
91
+
92
+
93
def process_wav_to_standard_format(wav_path: str) -> Tuple[np.ndarray, int]:
    """Load a WAV file and return (mono float32 samples at 16 kHz, 16000)."""
    # Load at the file's native rate first; librosa resamples more accurately
    # than pydub, so resampling happens here rather than during export.
    samples, native_sr = librosa.load(wav_path, sr=None)

    if native_sr != TARGET_SAMPLE_RATE:
        samples = librosa.resample(samples, orig_sr=native_sr, target_sr=TARGET_SAMPLE_RATE)

    # Collapse any remaining channel dimension down to mono by averaging.
    if len(samples.shape) > 1:
        samples = np.mean(samples, axis=0)

    # Guarantee a flat float32 array for the tensor-conversion step.
    samples = np.asarray(samples, dtype=np.float32)

    return samples, TARGET_SAMPLE_RATE
111
+
112
+
113
def create_normalized_tensor(audio_data: np.ndarray) -> torch.Tensor:
    """Turn float32 samples into a (1, n) bfloat16 tensor, standardized and
    moved to the GPU when one is available."""
    target_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    tensor = torch.Tensor(audio_data).to(torch.bfloat16)
    tensor = layer_norm(tensor, tensor.shape)

    # Add a leading batch dimension before handing off to the model.
    return tensor.unsqueeze(0).to(target_device)
122
+
123
+
124
def convert_media_to_wav(
    input_path: str,
    output_dir: Optional[str] = None,
    keep_temp_wav: bool = True
) -> Tuple[str, torch.Tensor]:
    """
    Convert media file to standardized WAV file and normalized tensor.

    Args:
        input_path: Path to input media file
        output_dir: Directory for output WAV file (default: temp directory)
        keep_temp_wav: Whether to keep the temporary WAV file.
            NOTE(review): currently unused — the WAV file is always kept;
            confirm whether deletion was intended.

    Returns:
        Tuple of (wav_file_path, normalized_tensor)

    Raises:
        ValueError: If file format is unsupported
        RuntimeError: If FFmpeg is not available
        FileNotFoundError: If input file doesn't exist
    """

    # Validate input file
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path}")

    input_path = os.path.abspath(input_path)

    # Detect format from the file extension
    media_format = detect_media_format(input_path)

    # Setup output path
    if output_dir is None:
        output_dir = tempfile.gettempdir()

    # Create output filename (same stem, "_converted.wav" suffix)
    input_name = Path(input_path).stem
    output_wav_path = os.path.join(output_dir, f"{input_name}_converted.wav")

    # Step 1: Convert to WAV using pydub (handles format conversion)
    if media_format == 'wav':
        # Already WAV, but still process through pydub to normalize format
        convert_to_wav_with_pydub(input_path, output_wav_path, 'wav')
    else:
        # Convert from other format to WAV
        convert_to_wav_with_pydub(input_path, output_wav_path, media_format)

    # Step 2: Process WAV to standard format (16 kHz mono float32) using librosa
    audio_data, sample_rate = process_wav_to_standard_format(output_wav_path)

    # Step 3: Create normalized (1, n) bfloat16 tensor
    normalized_tensor = create_normalized_tensor(audio_data)

    # Step 4: Save the processed audio back to WAV file
    # Overwrite the temp WAV with the processed version
    sf.write(output_wav_path, audio_data, sample_rate)

    return output_wav_path, normalized_tensor
182
+
183
+
184
def convert_media_to_wav_from_bytes(
    media_bytes: bytes,
    original_filename: str,
    output_dir: Optional[str] = None
) -> Tuple[str, torch.Tensor]:
    """
    Convert media from bytes to WAV file and tensor.

    Args:
        media_bytes: Raw media file bytes
        original_filename: Original filename for format detection
        output_dir: Directory for output files

    Returns:
        Tuple of (wav_file_path, normalized_tensor)
    """

    # Persist the bytes so pydub/FFmpeg can read them; keep the original
    # extension because downstream format detection relies on it.
    input_extension = Path(original_filename).suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=input_extension) as temp_input:
        temp_input.write(media_bytes)
        temp_input_path = temp_input.name

    try:
        # Convert using the main function
        wav_path, tensor = convert_media_to_wav(temp_input_path, output_dir)
    finally:
        # Clean up the temporary input file even when conversion raises
        # (the original implementation leaked the file on error).
        os.unlink(temp_input_path)

    return wav_path, tensor
214
+
215
+
216
# Utility function for getting audio info
def get_media_info(file_path: str) -> dict:
    """Return basic container/stream properties for *file_path*."""
    verify_ffmpeg_installation()

    segment = AudioSegment.from_file(file_path)

    info = {
        "duration_seconds": len(segment) / 1000.0,  # pydub lengths are in ms
        "frame_rate": segment.frame_rate,
        "channels": segment.channels,
        "sample_width": segment.sample_width,
        "format": detect_media_format(file_path),
    }
    return info
230
+
231
+
232
if __name__ == "__main__":
    # Example usage
    import sys

    if len(sys.argv) != 2:
        print("Usage: python convert_media_to_wav.py <input_file>")
        sys.exit(1)

    input_file = sys.argv[1]
    print(f"Converting {input_file}...")

    wav_path, tensor = convert_media_to_wav(input_file)
    for label, value in (
        ("WAV file", wav_path),
        ("Tensor shape", tensor.shape),
        ("Tensor dtype", tensor.dtype),
        ("Tensor device", tensor.device),
    ):
        print(f"✓ {label}: {value}")

    # Show media info
    info = get_media_info(input_file)
    print(f"✓ Media info: {info}")
server/download_models.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #@title download model /content/omniasr-transcriptions/server/download_models.sh
2
+
3
+ # %%writefile /content/omniasr-transcriptions/server/download_models.py
4
+ #!/usr/bin/env python3
5
+ """
6
+ download_models.py
7
+ Ensures the MMS model files are downloaded into MODELS_DIR.
8
+ """
9
+
10
+ import os
11
+ import urllib.request
12
+ import urllib.error
13
+ from tqdm.auto import tqdm
14
+ import sys
15
+
16
+
17
def download_file(url: str, download_file_path: str, redownload: bool = False) -> bool:
    """Download a single file with urllib + tqdm progress bar.

    Args:
        url: Source URL.
        download_file_path: Destination path; parent directories are created.
        redownload: Force a fresh download even if the file already exists.

    Returns:
        True on success (or when the file already exists), False on failure.
    """
    base_path = os.path.dirname(download_file_path)
    os.makedirs(base_path, exist_ok=True)

    # Skip if a non-empty file already exists, unless a redownload was forced
    if os.path.exists(download_file_path):
        if redownload:
            os.remove(download_file_path)
            tqdm.write(f"♻️ Redownloading: {os.path.basename(download_file_path)}")
        elif os.path.getsize(download_file_path) > 0:
            tqdm.write(f"✔️ Skipped (already exists): {os.path.basename(download_file_path)}")
            return True

    # Fetch Content-Length for the progress bar; use a context manager so the
    # connection is closed (the original left the response object open).
    try:
        with urllib.request.urlopen(url) as request:
            total = int(request.headers.get("Content-Length", 0))
    except urllib.error.URLError as e:
        print(f"❌ Error: Unable to open URL: {url}")
        print(f"Reason: {e.reason}")
        return False

    # Download with progress bar
    with tqdm(
        total=total,
        desc=os.path.basename(download_file_path),
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
    ) as progress:
        def _report(count, block_size, total_size):
            # Clamp updates so the bar never overshoots `total` on the final
            # (partial) block.
            if total:
                progress.update(min(block_size, max(total - progress.n, 0)))
            else:
                progress.update(block_size)

        try:
            urllib.request.urlretrieve(url, download_file_path, reporthook=_report)
        except urllib.error.URLError as e:
            print(f"❌ Error: Failed to download {url}")
            print(f"Reason: {e.reason}")
            return False

    tqdm.write(f"⬇️ Downloaded: {os.path.basename(download_file_path)}")
    return True
61
+
62
+
63
def main():
    """Ensure both MMS alignment model files exist under MODELS_DIR."""
    # Use MODELS_DIR from environment variable or default
    MODELS_DIR = os.environ.get("MODELS_DIR", "./models")
    print(f"📁 Checking and downloading MMS models to: {MODELS_DIR}")

    # Abort early if we cannot write to the target location
    if not os.access(os.path.dirname(MODELS_DIR) or ".", os.W_OK):
        print(f"✗ No write permission to {MODELS_DIR}")
        sys.exit(1)

    # Remote URLs paired with their local destination paths
    targets = [
        (
            "https://dl.fbaipublicfiles.com/mms/torchaudio/ctc_alignment_mling_uroman/dictionary.txt",
            os.path.join(MODELS_DIR, "ctc_alignment_mling_uroman_model_dict.txt"),
        ),
        (
            "https://dl.fbaipublicfiles.com/mms/torchaudio/ctc_alignment_mling_uroman/model.pt",
            os.path.join(MODELS_DIR, "ctc_alignment_mling_uroman_model.pt"),
        ),
    ]

    for url, full_path in targets:
        if not download_file(url, full_path):
            print(f"✗ Failed to fetch: {os.path.basename(full_path)}")
            sys.exit(1)

    print("✅ All model files are ready!")
88
+
89
# NOTE: main() runs at import time on purpose — app.py does
# `import download_models` to trigger the model download as a side effect.
main()
# if __name__ == "__main__":
#     main()
server/env_vars.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#@title change model name at /content/omniasr-transcriptions/server/env_vars.py
# %%writefile /content/omniasr-transcriptions/server/env_vars.py
"""Central configuration, read from environment variables at import time."""
import logging
import os

# Logging level name, e.g. "DEBUG" or "INFO" (see logging._nameToLevel).
log_level = os.environ.get("API_LOG_LEVEL", "INFO")
# Accept any casing and fall back to INFO for unknown level names; the
# original returned None for invalid values, which breaks any later
# logging call that expects an int level.
API_LOG_LEVEL = logging._nameToLevel.get(log_level.upper(), logging.INFO)

# MMS Model Configuration
MODEL_NAME = os.environ.get("MODEL_NAME", "omniASR_LLM_1B")  # Model name for pipeline

# Audio Processing Configuration
USE_CHUNKING = os.environ.get("USE_CHUNKING", "true").lower() == "true"  # Whether to use audio chunking
server/inference/align_utils.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #@title fix file path /content/omniasr-transcriptions/server/inference/align_utils.py
2
+ # %%writefile /content/omniasr-transcriptions/server/inference/align_utils.py
3
+ import math
4
+ import os
5
+ import re
6
+ import tempfile
7
+ import logging
8
+ from dataclasses import dataclass
9
+
10
+ import torch
11
+ from torchaudio.models import wav2vec2_model
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
# iso codes with specialized rules in uroman
# (order and duplicates preserved from the upstream list; used only for
# membership tests in get_uroman_tokens)
special_isos_uroman = [
    "ara", "bel", "bul", "deu", "ell", "eng", "fas", "grc", "ell", "eng",
    "heb", "kaz", "kir", "lav", "lit", "mkd", "mkd2", "oss", "pnt", "pus",
    "rus", "srp", "srp2", "tur", "uig", "ukr", "yid",
]
20
+
21
+
22
def normalize_uroman(text):
    """Lowercase *text*, strip everything but [a-z' ], collapse space runs."""
    lowered = text.lower()
    letters_only = re.sub("([^a-z' ])", " ", lowered)
    collapsed = re.sub(" +", " ", letters_only)
    return collapsed.strip()
27
+
28
+
29
def get_uroman_tokens(norm_transcripts, uroman, iso=None):
    """Romanize transcripts with uroman and return normalized token strings.

    Args:
        norm_transcripts: List of normalized transcript strings (one per line).
        uroman: Romanizer object exposing ``romanize_file``.
        iso: Optional ISO code; forwarded to uroman only for languages listed
            in ``special_isos_uroman``.

    Returns:
        List of romanized strings (individual characters separated by single
        spaces), same length and order as ``norm_transcripts``.
    """
    # uroman works file-to-file, so round-trip through two temp files.
    # NOTE(review): the files are re-opened by name while still open, which
    # works on POSIX but not on Windows.
    tf = tempfile.NamedTemporaryFile()
    tf2 = tempfile.NamedTemporaryFile()
    with open(tf.name, "w") as f:
        for t in norm_transcripts:
            f.write(t + "\n")
    uroman.romanize_file(
        input_filename=tf.name,
        output_filename=tf2.name,
        lcode=iso if iso in special_isos_uroman else None,
    )
    outtexts = []
    with open(tf2.name) as f:
        for line in f:
            # Space-separate every character, then collapse any runs of
            # whitespace this introduces.
            line = " ".join(line.strip())
            line = re.sub(r"\s+", " ", line).strip()
            outtexts.append(line)
    assert len(outtexts) == len(norm_transcripts)
    uromans = []
    for ot in outtexts:
        uromans.append(normalize_uroman(ot))
    return uromans
51
+
52
+
53
@dataclass
class Segment:
    """A labelled frame interval produced by CTC alignment."""

    label: str
    start: int
    end: int

    def __repr__(self):
        return f"{self.label}: [{self.start:5d}, {self.end:5d})"

    @property
    def length(self):
        # Frame span covered by this segment.
        return self.end - self.start


def merge_repeats(path, idx_to_token_map):
    """Collapse runs of identical frame labels in *path* into Segments."""
    segments = []
    left = 0
    while left < len(path):
        # Advance `right` past the run of frames that share path[left]'s label.
        right = left
        while right < len(path) and path[left] == path[right]:
            right += 1
        segments.append(Segment(idx_to_token_map[path[left]], left, right - 1))
        left = right
    return segments
76
+
77
+
78
def time_to_frame(time):
    """Convert a time in seconds to a frame index at a 20 ms frame stride."""
    frames_per_sec = 1000 / 20  # 20 ms stride => 50 frames per second
    return int(time * frames_per_sec)
82
+
83
+
84
def load_model_dict():
    """Load the MMS forced-alignment wav2vec2 model and its token dictionary.

    Both files are expected under MODELS_DIR (env var, default "./models"):
    ``ctc_alignment_mling_uroman_model.pt`` and
    ``ctc_alignment_mling_uroman_model_dict.txt``.

    Returns:
        Tuple of (wav2vec2 model in eval mode, {token: row_index} dict).

    Raises:
        FileNotFoundError: If either the checkpoint or the dictionary is missing.
    """
    # Use models directory from environment variable
    models_dir = os.environ.get("MODELS_DIR", "./models")
    model_path_name = os.path.join(models_dir, "ctc_alignment_mling_uroman_model.pt")

    logger.info("Loading model from models directory...")
    if not os.path.exists(model_path_name):
        raise FileNotFoundError(f"Model file not found at {model_path_name}")
    logger.info(f"Model found at: {model_path_name}")
    # NOTE(review): torch.load without weights_only=True unpickles arbitrary
    # objects — only point MODELS_DIR at trusted checkpoints.
    state_dict = torch.load(model_path_name, map_location="cpu")

    # Architecture constants mirror the published MMS alignment checkpoint;
    # they must match the state_dict exactly for load_state_dict to succeed.
    model = wav2vec2_model(
        extractor_mode="layer_norm",
        extractor_conv_layer_config=[
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        extractor_conv_bias=True,
        encoder_embed_dim=1024,
        encoder_projection_dropout=0.0,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=24,
        encoder_num_heads=16,
        encoder_attention_dropout=0.0,
        encoder_ff_interm_features=4096,
        encoder_ff_interm_dropout=0.1,
        encoder_dropout=0.0,
        encoder_layer_norm_first=True,
        encoder_layer_drop=0.1,
        aux_num_out=31,
    )
    model.load_state_dict(state_dict)
    model.eval()

    # Use models directory from environment variable
    models_dir = os.environ.get("MODELS_DIR", "./models")
    dict_path_name = os.path.join(
        models_dir, "ctc_alignment_mling_uroman_model_dict.txt"
    )
    if not os.path.exists(dict_path_name):
        raise FileNotFoundError(f"Dictionary file not found at {dict_path_name}")
    logger.info(f"Dictionary found at: {dict_path_name}")
    dictionary = {}
    # One token per line; the line number becomes the token's index.
    with open(dict_path_name) as f:
        dictionary = {l.strip(): i for i, l in enumerate(f.readlines())}

    return model, dictionary
137
+
138
+
139
def get_spans(tokens, segments):
    """Group frame-level CTC segments into one span per uroman token.

    Args:
        tokens: List of uroman tokens, each a space-separated string of
            letters (the output of ``get_uroman_tokens``).
        segments: List of ``Segment`` objects from ``merge_repeats``,
            including "<blank>" separator segments.

    Returns:
        One list of ``Segment`` objects per token, with half of any adjacent
        silence padded onto the span edges (full silence at utterance edges).
    """
    ltr_idx = 0
    tokens_idx = 0
    intervals = []
    start, end = (0, 0)
    sil = "<blank>"
    # Pass 1: walk each token's letters against the segment labels to find
    # the (start, end) segment-index interval covered by every token.
    for seg_idx, seg in enumerate(segments):
        if tokens_idx == len(tokens):
            # All tokens consumed: only a trailing blank may remain.
            assert seg_idx == len(segments) - 1
            assert seg.label == "<blank>"
            continue
        cur_token = tokens[tokens_idx].split(" ")
        ltr = cur_token[ltr_idx]
        if seg.label == "<blank>":
            continue
        assert seg.label == ltr
        if (ltr_idx) == 0:
            start = seg_idx
        if ltr_idx == len(cur_token) - 1:
            # Finished the current token; record its interval.
            ltr_idx = 0
            tokens_idx += 1
            intervals.append((start, seg_idx))
            # Empty tokens are given zero-length intervals at this position.
            while tokens_idx < len(tokens) and len(tokens[tokens_idx]) == 0:
                intervals.append((seg_idx, seg_idx))
                tokens_idx += 1
        else:
            ltr_idx += 1
    # Pass 2: expand intervals into Segment lists, padding with neighbouring
    # silence so the spans cover the audio between tokens as well.
    spans = []
    for idx, (start, end) in enumerate(intervals):
        span = segments[start : end + 1]
        if start > 0:
            prev_seg = segments[start - 1]
            if prev_seg.label == sil:
                # First span takes the whole leading silence; interior spans
                # take only the later half of the shared silence.
                pad_start = (
                    prev_seg.start
                    if (idx == 0)
                    else int((prev_seg.start + prev_seg.end) / 2)
                )
                span = [Segment(sil, pad_start, span[0].start)] + span
        if end + 1 < len(segments):
            next_seg = segments[end + 1]
            if next_seg.label == sil:
                # Last span takes the whole trailing silence; interior spans
                # take only the earlier half.
                pad_end = (
                    next_seg.end
                    if (idx == len(intervals) - 1)
                    else math.floor((next_seg.start + next_seg.end) / 2)
                )
                span = span + [Segment(sil, span[-1].end, pad_end)]
        spans.append(span)
    return spans
server/inference/audio_chunker.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio
3
+ import numpy as np
4
+ import logging
5
+ import tempfile
6
+ import os
7
+ import threading
8
+ from typing import List, Tuple, Dict, Optional, Any
9
+ import silero_vad
10
+ import soundfile as sf
11
+ import librosa
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ TARGET_CHUNK_DURATION = 30.0
16
+ MIN_CHUNK_DURATION = 5.0
17
+ SAMPLE_RATE = 16000
18
+
19
+
20
class AudioChunker:
    """
    Handles audio chunking with different strategies:
    - 'none': Single chunk (no chunking)
    - 'vad': VAD-based intelligent chunking
    - 'static': Fixed-duration time-based chunking

    Implemented as a thread-safe singleton so the Silero VAD model is loaded
    only once per process.
    """

    # Process-wide singleton instance, populated lazily by __new__.
    _instance = None
    # Serializes first-time construction across threads.
    _instance_lock = threading.Lock()
    # Silero VAD model loaded during construction; None if loading failed.
    vad_model: Optional[Any]
31
+
32
+ def __new__(cls):
33
+ if cls._instance is None:
34
+ with cls._instance_lock:
35
+ # Check again after acquiring lock as the value could have been set
36
+ if cls._instance is None:
37
+ cls._instance = super().__new__(cls)
38
+ # Only load VAD model here since this only runs once
39
+ cls._instance.vad_model = cls.load_vad_model()
40
+ return cls._instance
41
+
42
+ @staticmethod
43
+ def load_vad_model():
44
+ """Load silero VAD model with error handling."""
45
+ try:
46
+ logger.info("Loading Silero VAD model...")
47
+ vad_model = silero_vad.load_silero_vad()
48
+ logger.info("✓ VAD model loaded successfully")
49
+ return vad_model
50
+ except Exception as e:
51
+ logger.error(f"Failed to load VAD model: {e}")
52
+ logger.warning("VAD chunking will fall back to time-based chunking")
53
+ return None
54
+
55
    @torch.inference_mode()
    def chunk_audio(self, audio_tensor: torch.Tensor, sample_rate: int = SAMPLE_RATE, mode: str = "vad", chunk_duration: float = 30.0) -> List[Dict]:
        """
        Chunk audio tensor using specified strategy.

        Args:
            audio_tensor: Audio tensor (1D waveform)
            sample_rate: Sample rate of the audio tensor
            mode: Chunking mode - 'none', 'vad', or 'static'
            chunk_duration: Target duration for static chunking (seconds)

        Returns:
            List of chunk info dicts with uniform format:
            - start_time: Start time in seconds
            - end_time: End time in seconds
            - duration: Duration in seconds
            - audio_data: Audio tensor for this chunk
            - sample_rate: Sample rate
            - chunk_index: Index of this chunk

        Note:
            Never raises: any failure (including the input-contract asserts
            and an unknown ``mode``) is caught below and degrades to a single
            chunk covering the whole input.
        """
        logger.info(f"Chunking audio tensor: {audio_tensor.shape} at {sample_rate}Hz (mode: {mode})")

        try:
            # Assert tensor is already 1D (should be preprocessed by MediaTranscriptionProcessor)
            # NOTE(review): asserts are stripped under `python -O`; consider
            # explicit raises if these input contracts must always hold.
            assert len(audio_tensor.shape) == 1, f"Expected 1D audio tensor, got shape {audio_tensor.shape}"

            # Assert sample rate is already 16kHz (should be preprocessed by MediaTranscriptionProcessor)
            assert sample_rate == SAMPLE_RATE, f"Expected {SAMPLE_RATE}Hz sample rate, got {sample_rate}Hz"

            # Route to appropriate chunking strategy
            if mode == "none":
                return self._create_single_chunk(audio_tensor, sample_rate)
            elif mode == "vad":
                if self.vad_model is not None:
                    return self._chunk_with_vad(audio_tensor)
                else:
                    # VAD model failed to load at construction time
                    logger.warning("VAD model not available, falling back to static chunking")
                    return self._chunk_static(audio_tensor, chunk_duration)
            elif mode == "static":
                return self._chunk_static(audio_tensor, chunk_duration)
            else:
                raise ValueError(f"Unknown chunking mode: {mode}")

        except Exception as e:
            logger.error(f"Error chunking audio tensor: {e}")
            # Ultimate fallback to single chunk
            return self._create_single_chunk(audio_tensor, sample_rate)
102
+
103
+ def _create_single_chunk(self, waveform: torch.Tensor, sample_rate: int = SAMPLE_RATE) -> List[Dict]:
104
+ """Create a single chunk containing the entire audio."""
105
+ duration = len(waveform) / sample_rate
106
+
107
+ return [{
108
+ "start_time": 0.0,
109
+ "end_time": duration,
110
+ "duration": duration,
111
+ "audio_data": waveform,
112
+ "sample_rate": sample_rate,
113
+ "chunk_index": 0,
114
+ }]
115
+
116
+ def _chunk_static(self, waveform: torch.Tensor, chunk_duration: float) -> List[Dict]:
117
+ """Create fixed-duration chunks."""
118
+ chunks = []
119
+ total_samples = len(waveform)
120
+ target_samples = int(chunk_duration * SAMPLE_RATE)
121
+
122
+ start_sample = 0
123
+ chunk_idx = 0
124
+
125
+ while start_sample < total_samples:
126
+ end_sample = min(start_sample + target_samples, total_samples)
127
+ chunk_audio = waveform[start_sample:end_sample]
128
+ duration = len(chunk_audio) / SAMPLE_RATE
129
+
130
+ # Only add chunk if it meets minimum duration
131
+ if duration >= MIN_CHUNK_DURATION:
132
+ chunks.append({
133
+ "start_time": start_sample / SAMPLE_RATE,
134
+ "end_time": end_sample / SAMPLE_RATE,
135
+ "duration": duration,
136
+ "audio_data": chunk_audio,
137
+ "sample_rate": SAMPLE_RATE,
138
+ "chunk_index": chunk_idx,
139
+ })
140
+ chunk_idx += 1
141
+
142
+ start_sample = end_sample
143
+
144
+ logger.info(f"Created {len(chunks)} static chunks of ~{chunk_duration}s each")
145
+ return chunks
146
+
147
+ def _chunk_fallback(self, audio_path: str) -> List[Dict]:
148
+ """Ultimate fallback - create single chunk using librosa (for file-based legacy method)."""
149
+ try:
150
+ logger.warning("Using librosa fallback for chunking")
151
+ data, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
152
+ waveform = torch.from_numpy(data)
153
+ return self._create_single_chunk(waveform, SAMPLE_RATE)
154
+ except Exception as e:
155
+ logger.error(f"All chunking methods failed: {e}")
156
+ return []
157
+ def _chunk_with_vad(self, waveform: torch.Tensor) -> List[Dict]:
158
+ """Chunk audio using VAD for speech detection with uniform return format."""
159
+ try:
160
+ # VAD model expects tensor on CPU
161
+ vad_waveform = waveform.cpu() if waveform.is_cuda else waveform
162
+
163
+ # Get speech timestamps using VAD
164
+ speech_timestamps = silero_vad.get_speech_timestamps(
165
+ vad_waveform,
166
+ self.vad_model,
167
+ sampling_rate=SAMPLE_RATE,
168
+ min_speech_duration_ms=500, # Minimum speech segment
169
+ min_silence_duration_ms=300, # Minimum silence to split
170
+ window_size_samples=1536,
171
+ speech_pad_ms=100, # Padding around speech
172
+ )
173
+
174
+ logger.info(f"Found {len(speech_timestamps)} speech segments")
175
+
176
+ # Create chunks based on speech segments and target duration
177
+ # Pass original waveform (with device preserved) to chunk creation
178
+ chunks = self._create_chunks_from_speech_segments(
179
+ waveform, speech_timestamps
180
+ )
181
+
182
+ logger.info(f"Created {len(chunks)} audio chunks using VAD")
183
+ return chunks
184
+
185
+ except Exception as e:
186
+ logger.error(f"VAD chunking failed: {e}")
187
+ return self._chunk_static(waveform, TARGET_CHUNK_DURATION)
188
+ def _create_chunks_from_speech_segments(
189
+ self, waveform: torch.Tensor, speech_segments: List[Dict]
190
+ ) -> List[Dict]:
191
+ """Create chunks that respect speech boundaries and target duration with uniform format."""
192
+ if not speech_segments:
193
+ logger.warning(
194
+ "No speech segments found, falling back to static chunking"
195
+ )
196
+ return self._chunk_static(waveform, TARGET_CHUNK_DURATION)
197
+
198
+ chunks = []
199
+ current_chunk_start = 0
200
+ target_samples = int(TARGET_CHUNK_DURATION * SAMPLE_RATE)
201
+ total_samples = len(waveform)
202
+ chunk_idx = 0
203
+
204
+ while current_chunk_start < total_samples:
205
+ # Calculate target end for this chunk
206
+ target_chunk_end = current_chunk_start + target_samples
207
+
208
+ # If this would be the last chunk or close to it, just take the rest
209
+ if target_chunk_end >= total_samples or (
210
+ total_samples - target_chunk_end
211
+ ) < (target_samples * 0.3):
212
+ chunk_end = total_samples
213
+ else:
214
+ # Find the best place to end this chunk using VAD, but ensure continuous coverage
215
+ chunk_end = self._find_best_chunk_end_continuous(
216
+ speech_segments,
217
+ current_chunk_start,
218
+ target_chunk_end,
219
+ total_samples,
220
+ )
221
+
222
+ # Create chunk with uniform format
223
+ chunk_audio = waveform[current_chunk_start:chunk_end]
224
+ duration = len(chunk_audio) / SAMPLE_RATE
225
+
226
+ chunks.append({
227
+ "start_time": current_chunk_start / SAMPLE_RATE,
228
+ "end_time": chunk_end / SAMPLE_RATE,
229
+ "duration": duration,
230
+ "audio_data": chunk_audio,
231
+ "sample_rate": SAMPLE_RATE,
232
+ "chunk_index": chunk_idx,
233
+ })
234
+
235
+ logger.info(
236
+ f"Created chunk {chunk_idx + 1}: {current_chunk_start/SAMPLE_RATE:.2f}s - {chunk_end/SAMPLE_RATE:.2f}s ({duration:.2f}s)"
237
+ )
238
+ chunk_idx += 1
239
+
240
+ # Move to next chunk - IMPORTANT: start exactly where this chunk ended
241
+ current_chunk_start = chunk_end
242
+
243
+ # Verify total coverage
244
+ total_audio_duration = len(waveform) / SAMPLE_RATE
245
+ total_chunks_duration = sum(chunk["duration"] for chunk in chunks)
246
+ logger.info(
247
+ f"Audio chunking complete: {len(chunks)} chunks covering {total_chunks_duration:.2f}s of {total_audio_duration:.2f}s total audio"
248
+ )
249
+
250
+ if (
251
+ abs(total_chunks_duration - total_audio_duration) > 0.01
252
+ ): # Allow 10ms tolerance
253
+ logger.error(
254
+ f"Duration mismatch: chunks={total_chunks_duration:.2f}s, original={total_audio_duration:.2f}s"
255
+ )
256
+ else:
257
+ logger.info("✓ Perfect audio coverage achieved")
258
+
259
+ return chunks
260
+
261
+ def _find_best_chunk_end_continuous(
262
+ self,
263
+ speech_segments: List[Dict],
264
+ chunk_start: int,
265
+ target_end: int,
266
+ total_samples: int,
267
+ ) -> int:
268
+ """Find the best place to end a chunk while ensuring continuous coverage."""
269
+
270
+ # Don't go beyond the audio
271
+ target_end = min(target_end, total_samples)
272
+
273
+ # Look for a good break point within a reasonable window around target
274
+ search_window = int(SAMPLE_RATE * 3) # 3 second window
275
+ search_start = max(chunk_start, target_end - search_window)
276
+ search_end = min(total_samples, target_end + search_window)
277
+
278
+ best_end = target_end
279
+ best_score = 0
280
+
281
+ # Look for speech segment boundaries within the search window
282
+ for segment in speech_segments:
283
+ segment_start = segment["start"]
284
+ segment_end = segment["end"]
285
+
286
+ # Check if segment end is in our search window
287
+ if search_start <= segment_end <= search_end:
288
+ # Score based on how close to target and if it's a good break point
289
+ distance_score = 1.0 - abs(segment_end - target_end) / search_window
290
+
291
+ # Prefer segment ends (natural pauses)
292
+ boundary_score = 1.0
293
+
294
+ total_score = distance_score * boundary_score
295
+
296
+ if total_score > best_score:
297
+ best_score = total_score
298
+ best_end = segment_end
299
+
300
+ # Ensure we don't go beyond audio bounds
301
+ best_end = min(int(best_end), total_samples)
302
+
303
+ # Ensure we make progress (don't end before we started)
304
+ if best_end <= chunk_start:
305
+ best_end = min(target_end, total_samples)
306
+
307
+ return best_end
308
+
309
+ def _find_best_chunk_end(
310
+ self,
311
+ speech_segments: List[Dict],
312
+ start_idx: int,
313
+ chunk_start: int,
314
+ target_end: int,
315
+ ) -> int:
316
+ """Find the best place to end a chunk (at silence, near target duration)."""
317
+
318
+ best_end = target_end
319
+
320
+ # Look for speech segments that could provide good break points
321
+ for i in range(start_idx, len(speech_segments)):
322
+ segment = speech_segments[i]
323
+ segment_start = segment["start"]
324
+ segment_end = segment["end"]
325
+
326
+ # If segment starts after our target end, use the gap before it
327
+ if segment_start > target_end:
328
+ best_end = min(target_end, segment_start)
329
+ break
330
+
331
+ # If segment ends near our target, use the end of the segment
332
+ if abs(segment_end - target_end) < SAMPLE_RATE * 5: # Within 5 seconds
333
+ best_end = segment_end
334
+ break
335
+
336
+ # If segment extends way past target, look for a good break point
337
+ if segment_end > target_end + SAMPLE_RATE * 10: # 10+ seconds past
338
+ # Try to find a silence gap within the segment or use target
339
+ best_end = target_end
340
+ break
341
+
342
+ return int(best_end)
343
+
344
+ def save_chunk_to_file(self, chunk: Dict, output_path: str) -> str:
345
+ """Save a chunk to a temporary audio file."""
346
+ try:
347
+ # Convert tensor to numpy if needed
348
+ audio_data = chunk["audio_data"]
349
+ if isinstance(audio_data, torch.Tensor):
350
+ # Move to CPU first if on GPU, then convert to numpy
351
+ audio_data = audio_data.cpu().numpy()
352
+
353
+ # Save to file
354
+ sf.write(output_path, audio_data, chunk["sample_rate"])
355
+ return output_path
356
+
357
+ except Exception as e:
358
+ logger.error(f"Failed to save chunk to file: {e}")
359
+ raise
server/inference/audio_reading_tools.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+
3
+ import numpy as np
4
+ import soundfile as sf
5
+ import torch
6
+ from numpy.typing import NDArray
7
+
8
+
9
+ # def wav_to_bytes(
10
+ # wav: torch.Tensor | NDArray, sample_rate: int = 16_000, format: str = "wav"
11
+ # ) -> NDArray[np.int8]:
12
+ # """Convert audio tensor to bytes using soundfile directly."""
13
+ # # Convert to numpy if torch tensor
14
+ # if isinstance(wav, torch.Tensor):
15
+ # if wav.is_cuda:
16
+ # wav = wav.cpu()
17
+ # # Convert to float32 first (numpy doesn't support bfloat16)
18
+ # if wav.dtype != torch.float32:
19
+ # wav = wav.float()
20
+ # wav = wav.numpy()
21
+
22
+ # # Ensure float32 dtype for numpy arrays
23
+ # if wav.dtype != np.float32:
24
+ # wav = wav.astype(np.float32)
25
+
26
+ # # Handle shape: soundfile expects (samples,) for mono or (samples, channels) for multi-channel
27
+ # if wav.ndim == 1:
28
+ # # Already correct shape for mono
29
+ # pass
30
+ # elif wav.ndim == 2:
31
+ # # If shape is (channels, samples), transpose to (samples, channels)
32
+ # if wav.shape[0] < wav.shape[1]:
33
+ # wav = wav.T
34
+
35
+ # # Create buffer and write using soundfile directly
36
+ # buffer = io.BytesIO()
37
+
38
+ # # Map format string to soundfile format
39
+ # sf_format = format.upper() if format.lower() in ['wav', 'flac', 'ogg'] else 'WAV'
40
+ # subtype = 'PCM_16' if sf_format == 'WAV' else None
41
+
42
+ # # Write to buffer
43
+ # sf.write(buffer, wav, sample_rate, format=sf_format, subtype=subtype)
44
+
45
+ # buffer.seek(0)
46
+ # return np.frombuffer(buffer.getvalue(), dtype=np.int8)
47
+ # # return buffer.read()
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
def wav_to_bytes(wav: torch.Tensor | np.ndarray, sample_rate: int = 16000, format: str = "wav"):
    """Encode audio as 16-bit PCM WAV and return it as an int8 numpy buffer.

    Args:
        wav: Audio samples as a torch tensor or numpy array, values in [-1, 1].
        sample_rate: Sample rate written into the WAV header.
        format: Accepted for API compatibility but ignored — output is always
            WAV/PCM_16 (the earlier commented-out implementation honored it).

    Returns:
        np.ndarray of dtype int8 holding the encoded WAV bytes.

    Raises:
        ValueError: If the (squeezed) audio is empty.
    """
    # Torch -> numpy; bfloat16 has no numpy equivalent, so normalize to float32
    # while still a torch tensor.
    if isinstance(wav, torch.Tensor):
        wav = wav.detach().cpu()
        if wav.dtype != torch.float32:
            wav = wav.to(torch.float32)
        wav = wav.numpy()

    # Drop size-1 dims: (1, N) becomes (N,).
    if wav.ndim > 1:
        wav = wav.squeeze()
    # True multi-channel audio survives squeeze as 2-D. soundfile expects
    # (samples, channels), so transpose a channel-major (channels, samples)
    # layout — previously such input reached sf.write in the wrong orientation.
    if wav.ndim == 2 and wav.shape[0] < wav.shape[1]:
        wav = wav.T

    if wav.size == 0:
        raise ValueError("Empty audio segment passed to wav_to_bytes")

    # Clamp to the valid PCM range and scrub NaN/Inf before quantization.
    wav = np.nan_to_num(np.clip(wav.astype(np.float32), -1.0, 1.0))

    buffer = io.BytesIO()
    try:
        sf.write(buffer, wav, sample_rate, format="WAV", subtype="PCM_16")
    except Exception as e:
        print(f"[ERROR] soundfile write failed: {e}")
        raise

    buffer.seek(0)
    return np.frombuffer(buffer.getvalue(), dtype=np.int8)
87
+
88
+
89
+
server/inference/audio_sentence_alignment.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
8
+ import gc
9
+ import io
10
+ import logging
11
+ import threading
12
+ from dataclasses import dataclass
13
+ from typing import Dict, List
14
+
15
+ import torch
16
+ import torchaudio
17
+ import torchaudio.functional as audio_F
18
+
19
+ from .align_utils import get_spans, load_model_dict, merge_repeats, time_to_frame
20
+ from .audio_reading_tools import wav_to_bytes
21
+
22
+ # Global logger for this module
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ @dataclass(kw_only=True)
27
+ class AudioAlignmentConfig:
28
+ model_path_name: str = ""
29
+ emission_interval: int = 30
30
+ audio_format: str = "flac"
31
+ use_star: bool = False
32
+ device: str = "cuda"
33
+
34
+
35
class AudioAlignment:
    """Thread-safe singleton for audio-text alignment.

    Wraps an alignment model (loaded via ``load_model_dict``) and
    torchaudio's ``forced_align`` to cut one utterance into per-token audio
    segments with start/end times.
    """

    _instance = None            # singleton instance
    _lock = threading.Lock()    # guards first-time construction

    # Time scaling factor: strides/offsets are expressed in 1/scale-second
    # units (milliseconds with scale=1000).
    scale: int = 1000

    def __new__(cls):
        if cls._instance is None:
            with cls._lock:
                # Double-check locking pattern
                if cls._instance is None:
                    cls._instance = super(AudioAlignment, cls).__new__(cls)
                    cls._instance._initialize()
        return cls._instance

    def _initialize(self) -> None:
        """Initialize the singleton instance (called only once).

        Loads the alignment model + character dictionary and moves the model
        to CUDA when available, otherwise CPU.
        """
        logger.info("Initializing AudioAlignment model...")

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        config = AudioAlignmentConfig(
            device=str(device),
            use_star=False,  # Set to False for standard alignment
        )

        self.config = config

        # FIXME: pass model name correctly (config.model_path_name is unused)
        logger.info("Loading forced alignment model and dictionary...")
        self.model, self.dictionary = load_model_dict()
        self.device = torch.device(config.device)
        self.model.to(self.device)

        if self.config.use_star:
            # Append a wildcard token at the end of the dictionary.
            self.dictionary["<star>"] = len(self.dictionary)

        # CTC blank index, plus an index -> character reverse lookup.
        self.blank = self.dictionary["<blank>"]
        self.inverse_dictionary = {v: k for k, v in self.dictionary.items()}

        logger.info(
            f"AudioAlignment model loaded successfully on device: {self.device}"
        )

    @torch.inference_mode()
    def generate_emissions(self, waveform: torch.Tensor, reading_sr):
        """Run the model over the waveform in windows; return frame emissions.

        Processes ``emission_interval``-second windows with 10% context on
        each side (the context frames are trimmed back out so consecutive
        windows concatenate seamlessly), then log-softmaxes the result.

        Args:
            waveform: (channels, samples) audio tensor on ``self.device``.
            reading_sr: Sample rate of ``waveform``.

        Returns:
            (emissions, stride): per-frame log-probabilities, and the
            duration of one emission frame in 1/scale-second units.
        """
        emission_interval = self.config.emission_interval
        total_duration = waveform.size(1) / reading_sr

        emissions_arr = []

        i = 0
        while i < total_duration:
            segment_start_time, segment_end_time = (i, i + emission_interval)

            # 10% context on either side stabilizes window-boundary frames.
            context = emission_interval * 0.1
            input_start_time = max(segment_start_time - context, 0)
            input_end_time = min(segment_end_time + context, total_duration)
            waveform_split = waveform[
                :,
                int(reading_sr * input_start_time) : int(reading_sr * (input_end_time)),
            ]

            model_outs, _ = self.model(waveform_split)
            emissions_ = model_outs[0]
            # Trim the context frames from the model output.
            emission_start_frame = time_to_frame(segment_start_time)
            emission_end_frame = time_to_frame(segment_end_time)
            offset = time_to_frame(input_start_time)

            emissions_ = emissions_[
                emission_start_frame - offset : emission_end_frame - offset, :
            ]
            emissions_arr.append(emissions_)
            i += emission_interval

        emissions = torch.cat(emissions_arr, dim=0).squeeze()
        emissions = torch.log_softmax(emissions, dim=-1)

        # Duration of one emission frame, in 1/scale-second units.
        stride = float(waveform.size(1) * self.scale / emissions.size(0) / reading_sr)

        return emissions, stride

    @torch.inference_mode()
    def get_one_row_alignments(
        self, audio_arr, reading_sr, tokens: List[str]
    ) -> List[Dict]:
        """Force-align ``tokens`` against one audio row; slice per token span.

        Args:
            audio_arr: Encoded audio as bytes/bytearray, or any object
                exposing ``tobytes()`` (e.g. a numpy byte array).
            reading_sr: Expected sample rate; asserted against the decoded audio.
            tokens: Token strings; characters absent from the model
                dictionary are silently dropped from the targets.

        Returns:
            One dict per aligned span: start/end/duration in seconds plus the
            segment re-encoded via ``wav_to_bytes``. Empty or unencodable
            segments are skipped rather than failing the whole row.
        """
        # Accept raw bytes directly, or anything convertible via tobytes().
        buffer = audio_arr if isinstance(audio_arr, (bytes, bytearray)) else audio_arr.tobytes()
        waveform, audio_sf = torchaudio.load(io.BytesIO(buffer))
        waveform = waveform.to(self.device)
        assert audio_sf == reading_sr

        emissions, stride = self.generate_emissions(waveform, reading_sr)
        # Segments are sliced on CPU below; free the GPU copy.
        waveform = waveform.cpu()

        if self.config.use_star:
            # Extra zero emission column for the <star> wildcard token.
            T, _ = emissions.size()
            emissions = torch.cat(
                [emissions, torch.zeros(T, 1, device=self.device)], dim=1
            )

        if self.config.use_star:
            tokens = ["<star>"] + tokens

        # Map characters to dictionary indices; unknown characters dropped.
        token_indices = [
            self.dictionary[c]
            for c in " ".join(tokens).split(" ")
            if c in self.dictionary
        ]

        targets = torch.tensor(token_indices, dtype=torch.int32, device=self.device)

        input_lengths = torch.tensor(emissions.shape[0]).unsqueeze(-1)
        target_lengths = torch.tensor(targets.shape[0]).unsqueeze(-1)

        # CTC forced alignment over the full emission sequence.
        path, _ = audio_F.forced_align(
            emissions.unsqueeze(0),
            targets.unsqueeze(0),
            input_lengths,
            target_lengths,
            blank=self.blank,
        )
        path = path.squeeze().to("cpu").tolist()

        # Collapse repeated frame labels, then group segments per token.
        segments = merge_repeats(path, self.inverse_dictionary)

        spans = get_spans(tokens, segments)

        audio_segments = []
        for i, span in enumerate(spans):
            seg_start_idx, seg_end_idx = span[0].start, span[-1].end
            # Frame indices -> seconds (stride is in 1/scale-second units).
            segment_start_sec = seg_start_idx * stride / self.scale
            segment_end_sec = seg_end_idx * stride / self.scale
            start_frame = int(segment_start_sec * reading_sr)
            end_frame = int(segment_end_sec * reading_sr)
            trimmed_waveform = waveform[:, start_frame:end_frame]

            # Skip empty or invalid audio segments rather than failing the row.
            if trimmed_waveform is None or trimmed_waveform.numel() == 0:
                continue

            try:
                audio_bytes = wav_to_bytes(trimmed_waveform, reading_sr, self.config.audio_format)
            except Exception as e:
                # Best-effort: drop segments that cannot be re-encoded.
                continue

            audio_segments.append(
                {
                    "segment_start_sec": segment_start_sec,
                    "segment_end_sec": segment_end_sec,
                    "segment_duration": segment_end_sec - segment_start_sec,
                    "segment_audio_bytes": audio_bytes,
                }
            )

        return audio_segments
218
+
219
+
server/inference/mms_model_pipeline.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #@title fix import and path /content/omniasr-transcriptions/server/inference/mms_model_pipeline.py
2
+ # %%writefile /content/omniasr-transcriptions/server/inference/mms_model_pipeline.py
3
+ """
4
+ Pipeline-based MMS Model using the official MMS library.
5
+ This implementation uses Wav2Vec2LlamaInferencePipeline to avoid Seq2SeqBatch complexity.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import torch
11
+ from typing import List, Dict, Any, Optional
12
+ # from omnilingual_asr.models.inference.pipeline import Wav2Vec2InferencePipeline
13
+ from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline
14
+
15
+ from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs
16
+
17
+ from inference.audio_reading_tools import wav_to_bytes
18
+ from env_vars import MODEL_NAME
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class MMSModel:
    """Singleton wrapper around the official omnilingual-ASR inference pipeline.

    The pipeline is loaded eagerly on first construction; any later
    construction returns the same fully-initialized instance.
    """

    _instance = None      # the singleton instance
    _initialized = False  # guards __init__ from re-running on reuse

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            logger.info("Creating new MMSModel singleton instance")
            cls._instance = super().__new__(cls)
        else:
            logger.info("Using existing MMSModel singleton instance")
        return cls._instance

    def __init__(self, model_card: Optional[str] = None, device=None):
        """
        Initialize the MMS model with the official pipeline.

        Args:
            model_card: Model card to use (omniASR_LLM_1B, omniASR_LLM_300M,
                etc.). If None, uses MODEL_NAME from environment variables.
            device: Device to use (torch.device object, "cuda", "cpu", etc.)
        """
        # __init__ runs on every construction of a singleton — bail out
        # after the first successful initialization.
        if self._initialized:
            return

        self.model_card = model_card or MODEL_NAME
        self.device = device

        # Load eagerly so configuration failures surface at startup.
        self._load_pipeline()

        self._initialized = True

    def _load_pipeline(self):
        """Instantiate the ASR inference pipeline on the configured device."""
        logger.info(f"Loading MMS pipeline: {self.model_card}")
        logger.info(f"Target device: {self.device}")

        # Default the fairseq2 cache to ./models when the env var is unset.
        fairseq2_cache_dir = os.environ.get('FAIRSEQ2_CACHE_DIR', "./models")
        logger.info(f"DEBUG: FAIRSEQ2_CACHE_DIR = {fairseq2_cache_dir}")

        try:
            # str() handles both torch.device objects and plain strings.
            # (The previous hasattr(self.device, 'type') conditional produced
            # the same value on both branches — dead logic, removed.)
            device_str = str(self.device)
            self.pipeline = ASRInferencePipeline(
                model_card=self.model_card,
                device=device_str
            )
            logger.info("✓ MMS pipeline loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load MMS pipeline: {e}")
            raise

    def transcribe_audio(self, audio_tensor: torch.Tensor, batch_size: int = 1, language_with_scripts: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """
        Transcribe audio tensor using the MMS pipeline.

        Args:
            audio_tensor: Audio tensor (1D waveform) to transcribe.
            batch_size: Batch size for processing.
            language_with_scripts: Language codes (3-letter ISO with script)
                for transcription; None enables auto-detection.

        Returns:
            List of transcription results from the pipeline.

        Raises:
            Whatever the pipeline raises on failure (logged and re-raised).
        """
        logger.info(f"Converting tensor (shape: {audio_tensor.shape}) to bytes")
        # GPU tensors must hop to CPU before serialization.
        tensor_cpu = audio_tensor.cpu() if audio_tensor.is_cuda else audio_tensor
        # Encode as 16 kHz WAV bytes, the format the pipeline consumes.
        audio_bytes = wav_to_bytes(tensor_cpu, sample_rate=16000, format="wav")

        logger.info(f"Transcribing audio tensor with batch_size={batch_size}, language_with_scripts={language_with_scripts}")

        try:
            # Pass `lang` only when explicitly provided (None triggers
            # the pipeline's auto-detection path).
            if language_with_scripts is not None:
                transcriptions = self.pipeline.transcribe([audio_bytes], batch_size=batch_size, lang=language_with_scripts)
            else:
                transcriptions = self.pipeline.transcribe([audio_bytes], batch_size=batch_size)

            logger.info("✓ Successfully transcribed audio tensor")
            return transcriptions

        except Exception as e:
            logger.error(f"Transcription failed: {e}")
            raise

    @classmethod
    def get_instance(cls, model_card: Optional[str] = None, device=None):
        """
        Get (or lazily create) the singleton instance of MMSModel.

        Args:
            model_card: Model card to use; None falls back to MODEL_NAME.
            device: Device to use (torch.device object or string).

        Returns:
            MMSModel: The singleton instance.
        """
        if cls._instance is None:
            cls._instance = cls(model_card=model_card, device=device)
        return cls._instance
server/inference/norm_config_module.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # type: ignore
2
+ import os
3
+ import re
4
+
5
# --- Basic ASCII punctuation (regex-escaped where the char is a regex metachar) ---
colon = ":"
comma = ","
exclamation_mark = "!"
period = re.escape(".")
question_mark = re.escape("?")
semicolon = ";"

left_curly_bracket = "{"
right_curly_bracket = "}"
quotation_mark = '"'

basic_punc = (
    period
    + question_mark
    + comma
    + colon
    + exclamation_mark
    + left_curly_bracket
    + right_curly_bracket
)

# General punc unicode block (0x2000-0x206F)
zero_width_space = r"\u200B"
zero_width_nonjoiner = r"\u200C"
left_to_right_mark = r"\u200E"
right_to_left_mark = r"\u200F"
left_to_right_embedding = r"\u202A"
pop_directional_formatting = r"\u202C"

# Commonly ill-typed stand-ins for the apostrophe.
right_single_quotation_mark = r"\u2019"
left_single_quotation_mark = r"\u2018"

# --- Language-specific definitions ---
# Spanish
inverted_exclamation_mark = r"\u00A1"
inverted_question_mark = r"\u00BF"


# Hindi
hindi_danda = "\u0964"

# Egyptian Arabic
arabic_comma = r"\u060C"
arabic_question_mark = r"\u061F"
arabic_semicolon = r"\u061B"
arabic_diacritics = r"\u064B-\u0652"


arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"


# Chinese (fullwidth forms and CJK punctuation)
full_stop = r"\u3002"
full_comma = r"\uFF0C"
full_exclamation_mark = r"\uFF01"
full_question_mark = r"\uFF1F"
full_semicolon = r"\uFF1B"
full_colon = r"\uFF1A"
full_parentheses = r"\uFF08\uFF09"
quotation_mark_horizontal = r"\u300C-\u300F"
quotation_mark_vertical = r"\uFF41-\uFF44"
title_marks = r"\u3008-\u300B"
wavy_low_line = r"\uFE4F"
ellipsis = r"\u22EF"
enumeration_comma = r"\u3001"
hyphenation_point = r"\u2027"
forward_slash = r"\uFF0F"
wavy_dash = r"\uFF5E"
box_drawings_light_horizontal = r"\u2500"
fullwidth_low_line = r"\uFF3F"
chinese_punc = (
    full_stop
    + full_comma
    + full_exclamation_mark
    + full_question_mark
    + full_semicolon
    + full_colon
    + full_parentheses
    + quotation_mark_horizontal
    + quotation_mark_vertical
    + title_marks
    + wavy_low_line
    + ellipsis
    + enumeration_comma
    + hyphenation_point
    + forward_slash
    + wavy_dash
    + box_drawings_light_horizontal
    + fullwidth_low_line
)

# Armenian
armenian_apostrophe = r"\u055A"
emphasis_mark = r"\u055B"
# NOTE: this rebinds the ASCII `exclamation_mark` defined above; basic_punc
# was already built, so only code after this point sees the Armenian mark.
exclamation_mark = r"\u055C"
armenian_comma = r"\u055D"
armenian_question_mark = r"\u055E"
abbreviation_mark = r"\u055F"
armenian_full_stop = r"\u0589"
armenian_punc = (
    armenian_apostrophe
    + emphasis_mark
    + exclamation_mark
    + armenian_comma
    + armenian_question_mark
    + abbreviation_mark
    + armenian_full_stop
)

# HTML entity leftovers to strip from scraped text.
lesser_than_symbol = r"&lt;"
greater_than_symbol = r"&gt;"

lesser_than_sign = r"\u003c"
greater_than_sign = r"\u003e"

nbsp_written_form = r"&nbsp"

# Quotation marks
left_double_quotes = r"\u201c"
right_double_quotes = r"\u201d"
left_double_angle = r"\u00ab"
right_double_angle = r"\u00bb"
left_single_angle = r"\u2039"
right_single_angle = r"\u203a"
low_double_quotes = r"\u201e"
low_single_quotes = r"\u201a"
high_double_quotes = r"\u201f"
high_single_quotes = r"\u201b"

all_punct_quotes = (
    left_double_quotes
    + right_double_quotes
    + left_double_angle
    + right_double_angle
    + left_single_angle
    + right_single_angle
    + low_double_quotes
    + low_single_quotes
    + high_double_quotes
    + high_single_quotes
    + right_single_quotation_mark
    + left_single_quotation_mark
)
# Character class used to normalize in-word curly quotes to an apostrophe.
mapping_quotes = (
    "["
    + high_single_quotes
    + right_single_quotation_mark
    + left_single_quotation_mark
    + "]"
)


# Digit ranges per script (used to build the shared digit character class).
english_digits = r"\u0030-\u0039"
bengali_digits = r"\u09e6-\u09ef"
khmer_digits = r"\u17e0-\u17e9"
devanagari_digits = r"\u0966-\u096f"
oriya_digits = r"\u0b66-\u0b6f"
extended_arabic_indic_digits = r"\u06f0-\u06f9"
kayah_li_digits = r"\ua900-\ua909"
fullwidth_digits = r"\uff10-\uff19"
malayam_digits = r"\u0d66-\u0d6f"
myanmar_digits = r"\u1040-\u1049"
roman_numeral = r"\u2170-\u2179"
nominal_digit_shapes = r"\u206f"

# Load punctuations from MMS-lab data shipped next to this module.
current_dir = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(current_dir, "punctuations.lst"), "r") as punc_f:
    punc_list = punc_f.readlines()

punct_pattern = r""
for punc in punc_list:
    # The first character in each tab-separated line is the punc to remove.
    punct_pattern += re.escape(punc.split("\t")[0])

shared_digits = (
    english_digits
    + bengali_digits
    + khmer_digits
    + devanagari_digits
    + oriya_digits
    + extended_arabic_indic_digits
    + kayah_li_digits
    + fullwidth_digits
    + malayam_digits
    + myanmar_digits
    + roman_numeral
    + nominal_digit_shapes
)

shared_punc_list = (
    basic_punc
    + all_punct_quotes
    + greater_than_sign
    + lesser_than_sign
    + inverted_question_mark
    + full_stop
    + semicolon
    + armenian_punc
    + inverted_exclamation_mark
    + arabic_comma
    + enumeration_comma
    + hindi_danda
    + quotation_mark
    + arabic_semicolon
    + arabic_question_mark
    + chinese_punc
    + punct_pattern
)

# NOTE: the exported name `shared_mappping` is a historical typo — keep it,
# since other modules may import it by this spelling.
shared_mappping = {
    lesser_than_symbol: "",
    greater_than_symbol: "",
    nbsp_written_form: "",
    r"(\S+)" + mapping_quotes + r"(\S+)": r"\1'\2",
}

shared_deletion_list = (
    left_to_right_mark
    + zero_width_nonjoiner
    + arabic_subscript_alef_and_inverted_damma
    + zero_width_space
    + arabic_diacritics
    + pop_directional_formatting
    + right_to_left_mark
    + left_to_right_embedding
)

# Default ("*") normalization config; per-language entries below override it.
norm_config = {
    "*": {
        "lower_case": True,
        "punc_set": shared_punc_list,
        "del_set": shared_deletion_list,
        "mapping": shared_mappping,
        "digit_set": shared_digits,
        "unicode_norm": "NFKC",
        "rm_diacritics": False,
    }
}

# =============== Mongolian ===============#

norm_config["mon"] = norm_config["*"].copy()
# add soft hyphen to punc list to match with fleurs
# (+= on a str value rebinds only this entry; the shared default is untouched)
norm_config["mon"]["del_set"] += r"\u00AD"

norm_config["khk"] = norm_config["mon"].copy()

# =============== Hebrew ===============#

norm_config["heb"] = norm_config["*"].copy()
# add "HEBREW POINT" symbols to match with fleurs
norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"

# =============== Thai ===============#

norm_config["tha"] = norm_config["*"].copy()
# add "Zero width joiner" symbols to match with fleurs
norm_config["tha"]["punc_set"] += r"\u200D"

# =============== Arabic ===============#
norm_config["ara"] = norm_config["*"].copy()
# NOTE(review): .copy() is shallow, so this "mapping" dict is the same object
# as shared_mappping used by every config — this ٱ→ا rule therefore applies to
# all languages, not just Arabic. Confirm whether that is intended.
norm_config["ara"]["mapping"]["ٱ"] = "ا"
norm_config["arb"] = norm_config["ara"].copy()

# =============== Javanese ===============#
norm_config["jav"] = norm_config["*"].copy()
norm_config["jav"]["rm_diacritics"] = True
server/inference/punctuations.lst ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+  7355 INVALID UNICODE 0x81
2
+  5265 INVALID UNICODE 0x90
3
+  75 INVALID UNICODE 0x8
4
+  31 INVALID UNICODE 0x8d
5
+ ” 3 INVALID UNICODE 0x94
6
+  2 INVALID UNICODE 0x8f
7
+  2 INVALID UNICODE 0x1a
8
+  1 INVALID UNICODE 0x9d
9
+ “ 1 INVALID UNICODE 0x93
10
+ ’ 1 INVALID UNICODE 0x92
11
+  8647 INVALID UNICODE 0xe295
12
+  6650 INVALID UNICODE 0xf21d
13
+  6234 INVALID UNICODE 0xf62d
14
+  4815 INVALID UNICODE 0xf173
15
+  4789 INVALID UNICODE 0xe514
16
+  4409 INVALID UNICODE 0xe293
17
+  3881 INVALID UNICODE 0xf523
18
+  3788 INVALID UNICODE 0xe233
19
+  2448 INVALID UNICODE 0xf50f
20
+  2177 INVALID UNICODE 0xe232
21
+  1955 INVALID UNICODE 0xea7b
22
+  1926 INVALID UNICODE 0xf172
23
+  973 INVALID UNICODE 0xe290
24
+  972 INVALID UNICODE 0xf519
25
+  661 INVALID UNICODE 0xe292
26
+  591 INVALID UNICODE 0xe328
27
+  509 INVALID UNICODE 0xe2fa
28
+  458 INVALID UNICODE 0xe234
29
+  446 INVALID UNICODE 0xe043
30
+  419 INVALID UNICODE 0xe040
31
+  399 INVALID UNICODE 0xe2fb
32
+  387 INVALID UNICODE 0xe32b
33
+  381 INVALID UNICODE 0xe236
34
+  374 INVALID UNICODE 0xf511
35
+  314 INVALID UNICODE 0xe517
36
+  296 INVALID UNICODE 0xe2fe
37
+  293 INVALID UNICODE 0xe492
38
+  291 INVALID UNICODE 0xf52d
39
+  289 INVALID UNICODE 0xe2fc
40
+  195 INVALID UNICODE 0xf521
41
+  190 INVALID UNICODE 0xe516
42
+  182 INVALID UNICODE 0xe041
43
+  178 INVALID UNICODE 0xf529
44
+  113 INVALID UNICODE 0xe2f9
45
+  87 INVALID UNICODE 0xe2d9
46
+  78 INVALID UNICODE 0xe32a
47
+  76 INVALID UNICODE 0xe291
48
+  74 INVALID UNICODE 0xe296
49
+  66 INVALID UNICODE 0xe518
50
+  52 INVALID UNICODE 0xe32c
51
+  46 INVALID UNICODE 0xe2db
52
+  41 INVALID UNICODE 0xe231
53
+  34 INVALID UNICODE 0xf522
54
+  33 INVALID UNICODE 0xf518
55
+  32 INVALID UNICODE 0xf513
56
+  27 INVALID UNICODE 0xe32d
57
+  25 INVALID UNICODE 0xe32e
58
+  23 INVALID UNICODE 0xe06b
59
+  15 INVALID UNICODE 0xea01
60
+  12 INVALID UNICODE 0xe294
61
+  11 INVALID UNICODE 0xe203
62
+  8 INVALID UNICODE 0xf218
63
+  7 INVALID UNICODE 0xe070
64
+  7 INVALID UNICODE 0xe013
65
+  5 INVALID UNICODE 0xe2de
66
+  4 INVALID UNICODE 0xe493
67
+  3 INVALID UNICODE 0xf7e8
68
+  3 INVALID UNICODE 0xf7d0
69
+  3 INVALID UNICODE 0xe313
70
+  2 INVALID UNICODE 0xe329
71
+  2 INVALID UNICODE 0xe06d
72
+  2 INVALID UNICODE 0xe003
73
+  1 INVALID UNICODE 0xf50e
74
+  1 INVALID UNICODE 0xf171
75
+  1 INVALID UNICODE 0xe01d
76
+  71 NOMINAL DIGIT SHAPES 0x206f
77
+ ⁠ 3 WORD JOINER 0x2060
78
+ ― 126545 HORIZONTAL BAR 0x2015
79
+ ־ 1028 HEBREW PUNCTUATION MAQAF 0x5be
80
+ ) 98429 RIGHT PARENTHESIS 0x29
81
+ ] 27108 RIGHT SQUARE BRACKET 0x5d
82
+ ⌋ 1567 RIGHT FLOOR 0x230b
83
+ 〕 97 RIGHT TORTOISE SHELL BRACKET 0x3015
84
+ 】 36 RIGHT BLACK LENTICULAR BRACKET 0x3011
85
+ ﴾ 14 ORNATE LEFT PARENTHESIS 0xfd3e
86
+ & 170517 AMPERSAND 0x26
87
+ ། 106330 TIBETAN MARK SHAD 0xf0d
88
+ ። 90203 ETHIOPIC FULL STOP 0x1362
89
+ ፥ 60484 ETHIOPIC COLON 0x1365
90
+ ༌ 60464 TIBETAN MARK DELIMITER TSHEG BSTAR 0xf0c
91
+ ။ 51567 MYANMAR SIGN SECTION 0x104b
92
+ / 46929 SOLIDUS 0x2f
93
+ ၊ 38042 MYANMAR SIGN LITTLE SECTION 0x104a
94
+ · 37985 MIDDLE DOT 0xb7
95
+ ‸ 36310 CARET 0x2038
96
+ * 34793 ASTERISK 0x2a
97
+ ۔ 32432 ARABIC FULL STOP 0x6d4
98
+ ፤ 31906 ETHIOPIC SEMICOLON 0x1364
99
+ ၏ 21519 MYANMAR SYMBOL GENITIVE 0x104f
100
+ ។ 20834 KHMER SIGN KHAN 0x17d4
101
+ ꓾ 15773 LISU PUNCTUATION COMMA 0xa4fe
102
+ ᙮ 13473 CANADIAN SYLLABICS FULL STOP 0x166e
103
+ ꤯ 12892 KAYAH LI SIGN SHYA 0xa92f
104
+ ⵰ 11478 TIFINAGH SEPARATOR MARK 0x2d70
105
+ ꓿ 11118 LISU PUNCTUATION FULL STOP 0xa4ff
106
+ ॥ 10763 DEVANAGARI DOUBLE DANDA 0x965
107
+ ؞ 10403 ARABIC TRIPLE DOT PUNCTUATION MARK 0x61e
108
+ ၍ 8936 MYANMAR SYMBOL COMPLETED 0x104d
109
+ · 8431 GREEK ANO TELEIA 0x387
110
+ † 7477 DAGGER 0x2020
111
+ ၌ 6632 MYANMAR SYMBOL LOCATIVE 0x104c
112
+ ፣ 5719 ETHIOPIC COMMA 0x1363
113
+ ៖ 5528 KHMER SIGN CAMNUC PII KUUH 0x17d6
114
+ ꤮ 4791 KAYAH LI SIGN CWI 0xa92e
115
+ ※ 3439 REFERENCE MARK 0x203b
116
+ ፦ 2727 ETHIOPIC PREFACE COLON 0x1366
117
+ • 1749 BULLET 0x2022
118
+ ¶ 1507 PILCROW SIGN 0xb6
119
+ ၎ 1386 MYANMAR SYMBOL AFOREMENTIONED 0x104e
120
+ ﹖ 1224 SMALL QUESTION MARK 0xfe56
121
+ ; 975 GREEK QUESTION MARK 0x37e
122
+ … 827 HORIZONTAL ELLIPSIS 0x2026
123
+ % 617 PERCENT SIGN 0x25
124
+ ・ 468 KATAKANA MIDDLE DOT 0x30fb
125
+ ༎ 306 TIBETAN MARK NYIS SHAD 0xf0e
126
+ ‡ 140 DOUBLE DAGGER 0x2021
127
+ # 137 NUMBER SIGN 0x23
128
+ @ 125 COMMERCIAL AT 0x40
129
+ ፡ 121 ETHIOPIC WORDSPACE 0x1361
130
+ ៚ 55 KHMER SIGN KOOMUUT 0x17da
131
+ ៕ 49 KHMER SIGN BARIYOOSAN 0x17d5
132
+ ﹐ 10 SMALL COMMA 0xfe50
133
+ ༅ 6 TIBETAN MARK CLOSING YIG MGO SGAB MA 0xf05
134
+ ༄ 6 TIBETAN MARK INITIAL YIG MGO MDUN MA 0xf04
135
+ . 2 FULLWIDTH FULL STOP 0xff0e
136
+ ﹗ 2 SMALL EXCLAMATION MARK 0xfe57
137
+ ﹕ 2 SMALL COLON 0xfe55
138
+ ‰ 2 PER MILLE SIGN 0x2030
139
+ ・ 1 HALFWIDTH KATAKANA MIDDLE DOT 0xff65
140
+ ( 98504 LEFT PARENTHESIS 0x28
141
+ [ 27245 LEFT SQUARE BRACKET 0x5b
142
+ ⌊ 1567 LEFT FLOOR 0x230a
143
+ 〔 95 LEFT TORTOISE SHELL BRACKET 0x3014
144
+ 【 36 LEFT BLACK LENTICULAR BRACKET 0x3010
145
+ ﴿ 14 ORNATE RIGHT PARENTHESIS 0xfd3f
146
+ _ 4851 LOW LINE 0x5f
147
+ $ 72 DOLLAR SIGN 0x24
148
+ € 14 EURO SIGN 0x20ac
149
+ £ 2 POUND SIGN 0xa3
150
+ ~ 27462 TILDE 0x7e
151
+ = 11450 EQUALS SIGN 0x3d
152
+ | 8430 VERTICAL LINE 0x7c
153
+ − 3971 MINUS SIGN 0x2212
154
+ ≫ 1904 MUCH GREATER-THAN 0x226b
155
+ ≪ 1903 MUCH LESS-THAN 0x226a
156
+ + 1450 PLUS SIGN 0x2b
157
+ < 345 FULLWIDTH LESS-THAN SIGN 0xff1c
158
+ > 344 FULLWIDTH GREATER-THAN SIGN 0xff1e
159
+ ¬ 5 NOT SIGN 0xac
160
+ × 4 MULTIPLICATION SIGN 0xd7
161
+ → 2 RIGHTWARDS ARROW 0x2192
162
+ ᙭ 537 CANADIAN SYLLABICS CHI SIGN 0x166d
163
+ ° 499 DEGREE SIGN 0xb0
164
+ ႟ 421 MYANMAR SYMBOL SHAN EXCLAMATION 0x109f
165
+ � 192 REPLACEMENT CHARACTER 0xfffd
166
+ ⌟ 54 BOTTOM RIGHT CORNER 0x231f
167
+ ⌞ 54 BOTTOM LEFT CORNER 0x231e
168
+ © 2 COPYRIGHT SIGN 0xa9
169
+   40 NARROW NO-BREAK SPACE 0x202f
170
+   1 SIX-PER-EM SPACE 0x2006
171
+ ˜ 40261 SMALL TILDE 0x2dc
172
+ ^ 6469 CIRCUMFLEX ACCENT 0x5e
173
+ ¯ 20 MACRON 0xaf
174
+ ˇ 191442 CARON 0x2c7
175
+ ⁿ 38144 SUPERSCRIPT LATIN SMALL LETTER N 0x207f
176
+ ـ 9440 ARABIC TATWEEL 0x640
177
+ ๆ 6766 THAI CHARACTER MAIYAMOK 0xe46
178
+ ៗ 3310 KHMER SIGN LEK TOO 0x17d7
179
+ 々 678 IDEOGRAPHIC ITERATION MARK 0x3005
180
+ ໆ 430 LAO KO LA 0xec6
181
+ ー 319 KATAKANA-HIRAGANA PROLONGED SOUND MARK 0x30fc
182
+ ⁱ 137 SUPERSCRIPT LATIN SMALL LETTER I 0x2071
183
+ ৷ 11056 BENGALI CURRENCY NUMERATOR FOUR 0x9f7
184
+ ⅓ 26 VULGAR FRACTION ONE THIRD 0x2153
185
+ ½ 26 VULGAR FRACTION ONE HALF 0xbd
186
+ ¼ 4 VULGAR FRACTION ONE QUARTER 0xbc
187
+ ⅟ 1 FRACTION NUMERATOR ONE 0x215f
188
+ ⁄ 57 FRACTION SLASH 0x2044
server/inference/text_normalization.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import unicodedata
3
+
4
+ from . import norm_config_module
5
+
6
+ norm_config = norm_config_module.norm_config # type: ignore
7
+
8
+
9
def text_normalize(
    text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False
):
    """Normalize *text* for the language identified by *iso_code*.

    The pipeline, in order: Unicode normalization, optional lower-casing,
    bracket stripping, regex mappings, replacing punctuation with spaces,
    deleting characters in the language's delete set, removing digit-only
    words, optional diacritic removal, and whitespace collapsing.

    Args:
        text: The string to be normalized.
        iso_code: ISO language code used to look up the per-language
            config in ``norm_config``; unknown codes fall back to the
            ``"*"`` (default) config.
        lower_case: If False, skip lower-casing even when the language
            config requests it.
        remove_numbers: If True, remove words that consist only of digits.
        remove_brackets: If True, remove every parenthesized span.
            Spans containing a digit are always removed regardless
            (usually references such as "(Sam 23:17)").

    Returns:
        The normalized string.
    """
    # BUG FIX: the original fetched the per-language dict and then wrote
    # missing defaults back into it (`config[field] = ...`), silently
    # mutating the shared norm_config entries across calls. Merge over the
    # defaults into a fresh dict instead; this also supplies every default
    # field (including "rm_diacritics", which the original never backfilled
    # and would KeyError on if a language config lacked it).
    config = {**norm_config["*"], **norm_config.get(iso_code, {})}

    text = unicodedata.normalize(config["unicode_norm"], text)

    # Convert to lower case only when both the config and the caller allow it.
    if config["lower_case"] and lower_case:
        text = text.lower()

    # Always remove text inside brackets that contains a digit; this usually
    # corresponds to verse references such as "(Sam 23:17)".
    text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
    if remove_brackets:
        text = re.sub(r"\([^\)]*\)", " ", text)

    # Apply regex mappings (keys are patterns, values are replacements).
    for pattern, replacement in config["mapping"].items():
        text = re.sub(pattern, replacement, text)

    # Replace punctuation with spaces.
    punct_pattern = r"[" + config["punc_set"] + "]"
    normalized_text = re.sub(punct_pattern, " ", text)

    # Delete characters in the delete set outright (no space left behind).
    delete_pattern = r"[" + config["del_set"] + "]"
    normalized_text = re.sub(delete_pattern, "", normalized_text)

    # Remove words containing only digits.
    # Three cases: (a) the text starts with a number, (b) a number appears in
    # the middle, (c) the text ends with a number. Lookaround patterns check
    # the surrounding whitespace without consuming it, so overlapping matches
    # can all be replaced.
    if remove_numbers:
        digits_pattern = "[" + config["digit_set"] + "]+"
        complete_digit_pattern = (
            r"^"
            + digits_pattern
            + r"(?=\s)|(?<=\s)"
            + digits_pattern
            + r"(?=\s)|(?<=\s)"
            + digits_pattern
            + "$"
        )
        normalized_text = re.sub(complete_digit_pattern, " ", normalized_text)

    # config.get keeps languages whose config predates the flag working.
    if config.get("rm_diacritics", False):
        # Lazy import: unidecode is an optional dependency, only needed for
        # languages that strip diacritics (e.g. Javanese).
        from unidecode import unidecode

        normalized_text = unidecode(normalized_text)

    # Collapse runs of whitespace and trim the ends.
    normalized_text = re.sub(r"\s+", " ", normalized_text).strip()

    return normalized_text
server/lang_dict.py ADDED
@@ -0,0 +1,1675 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ lang_code={
2
+ "English (Latin)": "eng_Latn",
3
+ "Hindi (Devanagari)": "hin_Deva",
4
+ "Bengali (Bengali)": "ben_Beng",
5
+ "Arbëreshë Albanian (Latin)": "aae_Latn",
6
+ "Afade (Latin)": "aal_Latn",
7
+ "Abung (Latin)": "abb_Latn",
8
+ "Abidji (Latin)": "abi_Latn",
9
+ "Abkhazian (Cyrillic)": "abk_Cyrl",
10
+ "Abua (Latin)": "abn_Latn",
11
+ "Abellen Ayta (Latin)": "abp_Latn",
12
+ "Abron (Latin)": "abr_Latn",
13
+ "Ambulas (Latin)": "abs_Latn",
14
+ "Achagua (Latin)": "aca_Latn",
15
+ "Gikyode (Latin)": "acd_Latn",
16
+ "Achinese (Latin)": "ace_Latn",
17
+ "Saint Lucian Creole French (Latin)": "acf_Latn",
18
+ "Acholi (Latin)": "ach_Latn",
19
+ "Iraqi Arabic (Arabic)": "acm_Arab",
20
+ "Achang (Latin)": "acn_Latn",
21
+ "Achi (Latin)": "acr_Latn",
22
+ "Achuar-Shiwiar (Latin)": "acu_Latn",
23
+ "Hijazi Arabic (Arabic)": "acw_Arab",
24
+ "Adele (Latin)": "ade_Latn",
25
+ "Adhola (Latin)": "adh_Latn",
26
+ "Adioukrou (Latin)": "adj_Latn",
27
+ "Amdo Tibetan (Tibetan)": "adx_Tibt",
28
+ "Adyghe (Cyrillic)": "ady_Cyrl",
29
+ "Tunisian Arabic (Arabic)": "aeb_Arab",
30
+ "Saidi Arabic (Arabic)": "aec_Arab",
31
+ "Arem (Latin)": "aeu_Latn",
32
+ "Gulf Arabic (Arabic)": "afb_Arab",
33
+ "Eloyi (Latin)": "afo_Latn",
34
+ "Afrikaans (Latin)": "afr_Latn",
35
+ "Agarabi (Latin)": "agd_Latn",
36
+ "Angor (Latin)": "agg_Latn",
37
+ "Agariya (Latin)": "agn_Latn",
38
+ "Aguaruna (Latin)": "agr_Latn",
39
+ "Aguacateco (Latin)": "agu_Latn",
40
+ "Agul (Cyrillic)": "agx_Cyrl",
41
+ "Ahanta (Latin)": "aha_Latn",
42
+ "Akha (Latin)": "ahk_Latn",
43
+ "Igo (Latin)": "ahl_Latn",
44
+ "Arosi (Latin)": "ahs_Latn",
45
+ "Arosi (Latin)": "aia_Latn",
46
+ "Aja (Benin) (Latin)": "ajg_Latn",
47
+ "Akan (Latin)": "aka_Latn",
48
+ "Batak Angkola (Latin)": "akb_Latn",
49
+ "Akawaio (Latin)": "ake_Latn",
50
+ "Akpes (Latin)": "akp_Latn",
51
+ "Alago (Latin)": "ala_Latn",
52
+ "Alangan (Latin)": "alj_Latn",
53
+ "Gheg Albanian (Latin)": "aln_Latn",
54
+ "Larike-Wakasihu (Latin)": "alo_Latn",
55
+ "Alune (Latin)": "alp_Latn",
56
+ "Tosk Albanian (Latin)": "als_Latn",
57
+ "Southern Altai (Cyrillic)": "alt_Cyrl",
58
+ "Alur (Latin)": "alz_Latn",
59
+ "Amarasi (Latin)": "ame_Latn",
60
+ "Hamer-Banna (Latin)": "amf_Latn",
61
+ "Amharic (Ethiopic)": "amh_Ethi",
62
+ "Amis (Latin)": "ami_Latn",
63
+ "Amo (Latin)": "amk_Latn",
64
+ "Amanab (Latin)": "amu_Latn",
65
+ "Ngas (Latin)": "anc_Latn",
66
+ "Goemai (Latin)": "ank_Latn",
67
+ "Obolo (Latin)": "ann_Latn",
68
+ "Angika (Devanagari)": "anp_Deva",
69
+ "Anaang (Latin)": "anw_Latn",
70
+ "Anyin (Latin)": "any_Latn",
71
+ "A'ou (Latin)": "aom_Latn",
72
+ "Uab Meto (Latin)": "aoz_Latn",
73
+ "Sa'a (Latin)": "apb_Latn",
74
+ "North Levantine Arabic (Arabic)": "apc_Arab",
75
+ "Sudanese Arabic (Arabic)": "apd_Arab",
76
+ "A-Pucikwar (Latin)": "apr_Latn",
77
+ "Standard Arabic (Arabic)": "arb_Arab",
78
+ "Aragonese (Latin)": "arg_Latn",
79
+ "Arhâ (Latin)": "arl_Latn",
80
+ "Algerian Arabic (Arabic)": "arq_Arab",
81
+ "Najdi Arabic (Arabic)": "ars_Arab",
82
+ "Moroccan Arabic (Arabic)": "ary_Arab",
83
+ "Egyptian Arabic (Arabic)": "arz_Arab",
84
+ "Asu (Tanzania) (Latin)": "asa_Latn",
85
+ "Cishingini (Latin)": "asg_Latn",
86
+ "Assamese (Bengali)": "asm_Beng",
87
+ "Asturian (Latin)": "ast_Latn",
88
+ "Ata (Latin)": "ata_Latn",
89
+ "Atsi (Latin)": "atb_Latn",
90
+ "Atong (India) (Latin)": "atg_Latn",
91
+ "Ivbie North-Okpela-Arhe (Latin)": "ati_Latn",
92
+ "Atikamekw (Latin)": "atq_Latn",
93
+ "Avaric (Cyrillic)": "ava_Cyrl",
94
+ "Avikam (Latin)": "avn_Latn",
95
+ "Avokaya (Latin)": "avu_Latn",
96
+ "Awadhi (Devanagari)": "awa_Deva",
97
+ "Awa-Cuaiquer (Latin)": "awb_Latn",
98
+ "Arawum (Latin)": "awo_Latn",
99
+ "South Levantine Arabic (Arabic)": "ayl_Arab",
100
+ "Ayizo Gbe (Latin)": "ayo_Latn",
101
+ "North Mesopotamian Arabic (Arabic)": "ayp_Arab",
102
+ "Aymara (Latin)": "ayr_Latn",
103
+ "Mai Brat (Latin)": "ayz_Latn",
104
+ "Azerbaijani (Arabic)": "aze_Arab",
105
+ "Azerbaijani (Cyrillic)": "aze_Cyrl",
106
+ "Azerbaijani (Latin)": "aze_Latn",
107
+ "Ambele (Latin)": "azg_Latn",
108
+ "Highland Oaxaca Chontal (Latin)": "azz_Latn",
109
+ "Bagheli (Latin)": "bag_Latn",
110
+ "Bashkir (Cyrillic)": "bak_Cyrl",
111
+ "Bambara (Latin)": "bam_Latn",
112
+ "Balinese (Latin)": "ban_Latn",
113
+ "Waimaha (Latin)": "bao_Latn",
114
+ "Basa (Cameroon) (Latin)": "bas_Latn",
115
+ "Vengo (Latin)": "bav_Latn",
116
+ "Bambili-Bambui (Latin)": "bax_Latn",
117
+ "Barai (Latin)": "bba_Latn",
118
+ "Baeggu (Latin)": "bbb_Latn",
119
+ "Batak Toba (Latin)": "bbc_Latn",
120
+ "Ghomálá' (Latin)": "bbj_Latn",
121
+ "Babanki (Georgian)": "bbl_Geor",
122
+ "Northern Bobo Madaré (Latin)": "bbo_Latn",
123
+ "Kulung (Nigeria) (Latin)": "bbu_Latn",
124
+ "Southern Balochi (Arabic)": "bcc_Arab",
125
+ "Southern Balochi (Latin)": "bcc_Latn",
126
+ "Bainouk-Samik (Latin)": "bce_Latn",
127
+ "Baoulé (Latin)": "bci_Latn",
128
+ "Central Bikol (Latin)": "bcl_Latn",
129
+ "Bainouk-Gunyaamolo (Latin)": "bcs_Latn",
130
+ "Bana (Latin)": "bcw_Latn",
131
+ "Bannoni (Latin)": "bcy_Latn",
132
+ "Bainouk-Gunyaamolo (Latin)": "bcz_Latn",
133
+ "Bai (Latin)": "bda_Latn",
134
+ "Bade (Latin)": "bde_Latn",
135
+ "Balesin-Bisaya (Latin)": "bdg_Latn",
136
+ "Baka (South Sudan) (Latin)": "bdh_Latn",
137
+ "Burun (Latin)": "bdm_Latn",
138
+ "Bau (Latin)": "bdq_Latn",
139
+ "Oroko (Latin)": "bdu_Latn",
140
+ "Bebele (Latin)": "beb_Latn",
141
+ "Biali (Latin)": "beh_Latn",
142
+ "Belarusian (Cyrillic)": "bel_Cyrl",
143
+ "Bemba (Zambia) (Latin)": "bem_Latn",
144
+ "Bengali (Bengali)": "ben_Beng",
145
+ "Bila (Latin)": "bep_Latn",
146
+ "Betawi (Latin)": "bew_Latn",
147
+ "Yarawa (Latin)": "bex_Latn",
148
+ "Beba (Latin)": "bfa_Latn",
149
+ "Bafut (Latin)": "bfd_Latn",
150
+ "Beba (Latin)": "bfo_Latn",
151
+ "Balti (Arabic)": "bft_Arab",
152
+ "Bagheli (Devanagari)": "bfy_Deva",
153
+ "Pahari-Potwari (Devanagari)": "bfz_Deva",
154
+ "Haryanvi (Devanagari)": "bgc_Deva",
155
+ "Gwamhi-Wuri (Arabic)": "bgp_Arab",
156
+ "Bagri (Devanagari)": "bgq_Deva",
157
+ "Bauria (Latin)": "bgr_Latn",
158
+ "Gamo-Gofa-Dawro (Latin)": "bgt_Latn",
159
+ "Bhatri (Devanagari)": "bgw_Deva",
160
+ "Bharia (Devanagari)": "bha_Deva",
161
+ "Bhili (Devanagari)": "bhb_Deva",
162
+ "Bukhari (Cyrillic)": "bhh_Cyrl",
163
+ "Bhojpuri (Devanagari)": "bho_Deva",
164
+ "Bima (Latin)": "bhp_Latn",
165
+ "Bhattiyali (Devanagari)": "bht_Deva",
166
+ "Biangai (Latin)": "bhz_Latn",
167
+ "Bissa (Latin)": "bib_Latn",
168
+ "Bimoba (Latin)": "bim_Latn",
169
+ "Bislama (Latin)": "bis_Latn",
170
+ "B Eliot (Latin)": "biv_Latn",
171
+ "Badyara (Devanagari)": "bjj_Deva",
172
+ "Barok (Latin)": "bjk_Latn",
173
+ "Banjar (Latin)": "bjn_Latn",
174
+ "Binumarien (Latin)": "bjr_Latn",
175
+ "Bulu (Papua New Guinea) (Latin)": "bjt_Latn",
176
+ "Bedjond (Latin)": "bjv_Latn",
177
+ "Bakwé (Latin)": "bjw_Latn",
178
+ "Bariji (Latin)": "bjz_Latn",
179
+ "Binukid (Latin)": "bkd_Latn",
180
+ "Bakoko (Latin)": "bkh_Latn",
181
+ "Boki (Latin)": "bkm_Latn",
182
+ "Bekwarra (Latin)": "bkv_Latn",
183
+ "Bungku (Latin)": "bky_Latn",
184
+ "Bolia (Latin)": "ble_Latn",
185
+ "Baluan-Pam (Latin)": "blh_Latn",
186
+ "Tai Dam (Latin)": "blt_Latn",
187
+ "Mag-Indi Ayta (Latin)": "blx_Latn",
188
+ "Balantak (Latin)": "blz_Latn",
189
+ "Bembe (Latin)": "bmm_Latn",
190
+ "Biao Mon (Latin)": "bmq_Latn",
191
+ "Muinane (Latin)": "bmr_Latn",
192
+ "Bomwali (Latin)": "bmu_Latn",
193
+ "Bum (Latin)": "bmv_Latn",
194
+ "Bangi (Bengali)": "bng_Beng",
195
+ "Bonerif (Latin)": "bnm_Latn",
196
+ "Bontok (Latin)": "bnn_Latn",
197
+ "Bantoanon (Latin)": "bno_Latn",
198
+ "Bola (Papua New Guinea) (Latin)": "bnp_Latn",
199
+ "Bunun (Devanagari)": "bns_Deva",
200
+ "Bora (Latin)": "boa_Latn",
201
+ "Tibetan (Tibetan)": "bod_Tibt",
202
+ "Anjam (Latin)": "boj_Latn",
203
+ "Berom (Latin)": "bom_Latn",
204
+ "Borôro (Latin)": "bor_Latn",
205
+ "Bosnian (Latin)": "bos_Latn",
206
+ "Bonkiman (Latin)": "bou_Latn",
207
+ "Bongo (Latin)": "bov_Latn",
208
+ "Tuwuli (Latin)": "box_Latn",
209
+ "Barapasi (Latin)": "bpr_Latn",
210
+ "Banda-Banda (Latin)": "bps_Latn",
211
+ "Birgid (Latin)": "bqc_Latn",
212
+ "Baga Pokur (Latin)": "bqg_Latn",
213
+ "Bakhtiari (Arabic)": "bqi_Arab",
214
+ "Banda-Mbrès (Latin)": "bqj_Latn",
215
+ "Banda-Ndélé (Latin)": "bqp_Latn",
216
+ "Braj (Devanagari)": "bra_Deva",
217
+ "Breton (Latin)": "bre_Latn",
218
+ "Brahui (Arabic)": "brh_Arab",
219
+ "Bira (Congo) (Latin)": "bri_Latn",
220
+ "Burui (Latin)": "bru_Latn",
221
+ "Bodo (India) (Devanagari)": "brx_Deva",
222
+ "Basa (Nigeria) (Latin)": "bsc_Latn",
223
+ "Kati (Arabic)": "bsh_Arab",
224
+ "Bangolan (Latin)": "bsj_Latn",
225
+ "Burushaski (Latin)": "bsk_Latn",
226
+ "Bassa-Kontagora (Latin)": "bsq_Latn",
227
+ "Akoose (Latin)": "bss_Latn",
228
+ "Busami (Latin)": "bsy_Latn",
229
+ "Batak Dairi (Latin)": "btd_Latn",
230
+ "Batak Mandailing (Latin)": "btm_Latn",
231
+ "Ratte Buri (Latin)": "bts_Latn",
232
+ "Bete-Bendi (Latin)": "btt_Latn",
233
+ "Bateri (Arabic)": "btv_Arab",
234
+ "Batak Karo (Latin)": "btx_Latn",
235
+ "Budu (Latin)": "bud_Latn",
236
+ "Buginese (Latin)": "bug_Latn",
237
+ "Bulgarian (Cyrillic)": "bul_Cyrl",
238
+ "Bulu (Cameroon) (Latin)": "bum_Latn",
239
+ "Bulu (Cameroon) (Latin)": "buo_Latn",
240
+ "Bussa (Latin)": "bus_Latn",
241
+ "Bokobaru (Latin)": "bux_Latn",
242
+ "Bube (Latin)": "bvb_Latn",
243
+ "Baelelea (Latin)": "bvc_Latn",
244
+ "Buriat (Latin)": "bvz_Latn",
245
+ "Bwatoo (Latin)": "bwq_Latn",
246
+ "Bura-Pabir (Latin)": "bwr_Latn",
247
+ "Buli (Ghana) (Latin)": "bwu_Latn",
248
+ "Bilur (Latin)": "bxf_Latn",
249
+ "Buhutu (Latin)": "bxk_Latn",
250
+ "Tiéyaxo Bozo (Latin)": "byc_Latn",
251
+ "Bina (Nigeria) (Latin)": "byr_Latn",
252
+ "Bisa (Latin)": "bys_Latn",
253
+ "Batak (Latin)": "byv_Latn",
254
+ "Qaqet (Latin)": "byx_Latn",
255
+ "Blaan (Latin)": "bzh_Latn",
256
+ "Bisu (Thai)": "bzi_Thai",
257
+ "Jamaican Creole English (Latin)": "bzj_Latn",
258
+ "Boano (Sulawesi) (Latin)": "bzw_Latn",
259
+ "Chortí (Latin)": "caa_Latn",
260
+ "Garifuna (Latin)": "cab_Latn",
261
+ "Chuj (Latin)": "cac_Latn",
262
+ "Kaqchikel (Latin)": "cak_Latn",
263
+ "Carolinian (Latin)": "cap_Latn",
264
+ "Galibi Carib (Latin)": "car_Latn",
265
+ "Tsimané (Latin)": "cas_Latn",
266
+ "Catalan (Latin)": "cat_Latn",
267
+ "Cua (Latin)": "cax_Latn",
268
+ "Cabiyarí (Latin)": "cbc_Latn",
269
+ "Chachi (Latin)": "cbi_Latn",
270
+ "Carijona (Latin)": "cbr_Latn",
271
+ "Cashibo-Cacataibo (Latin)": "cbs_Latn",
272
+ "Chayahuita (Latin)": "cbt_Latn",
273
+ "Chachi (Latin)": "cbu_Latn",
274
+ "Kakua (Latin)": "cbv_Latn",
275
+ "Chopi (Latin)": "cce_Latn",
276
+ "Samba Daka (Latin)": "ccg_Latn",
277
+ "Chakma (Latin)": "cco_Latn",
278
+ "Churahi (Devanagari)": "cdj_Deva",
279
+ "Min Dong Chinese (Han)": "cdo_Hans",
280
+ "Cebuano (Latin)": "ceb_Latn",
281
+ "Cen Gbe (Latin)": "ceg_Latn",
282
+ "Cek pet (Latin)": "cek_Latn",
283
+ "Centúúm (Latin)": "cen_Latn",
284
+ "Czech (Latin)": "ces_Latn",
285
+ "Chafarruscas (Latin)": "cfa_Latn",
286
+ "Falam Chin (Latin)": "cfm_Latn",
287
+ "Chiga (Latin)": "cgg_Latn",
288
+ "Chiga (Latin)": "cgg_Latn",
289
+ "Chechen (Cyrillic)": "che_Cyrl",
290
+ "Chontal de Tabasco (Latin)": "chf_Latn",
291
+ "Chatino (Latin)": "chq_Latn",
292
+ "Chuvash (Cyrillic)": "chv_Cyrl",
293
+ "Ozumacín Chinantec (Latin)": "chz_Latn",
294
+ "Chokwe (Latin)": "cjk_Latn",
295
+ "Chamorro (Latin)": "cjo_Latn",
296
+ "Upper Chehalis (Latin)": "cjp_Latn",
297
+ "Shor (Cyrillic)": "cjs_Cyrl",
298
+ "Central Kurdish (Arabic)": "ckb_Arab",
299
+ "Cibak (Latin)": "ckl_Latn",
300
+ "Anufo (Latin)": "cko_Latn",
301
+ "Chak (Latin)": "ckr_Latn",
302
+ "Chukot (Cyrillic)": "ckt_Cyrl",
303
+ "Chukot (Latin)": "cky_Latn",
304
+ "Chala (Latin)": "cla_Latn",
305
+ "Lealao Chinantec (Latin)": "cle_Latn",
306
+ "Eastern Highland Chatino (Latin)": "cly_Latn",
307
+ "Mro-Khimi Chin (Latin)": "cme_Latn",
308
+ "Mandarin Chinese (Han)": "cmn_Hans",
309
+ "Mandarin Chinese (Han)": "cmn_Hant",
310
+ "Central Mnong (Khmer)": "cmo_Khmr",
311
+ "Central Mnong (Latin)": "cmo_Latn",
312
+ "Mro-Khimi Chin (Latin)": "cmr_Latn",
313
+ "Hakha Chin (Latin)": "cnh_Latn",
314
+ "Ashéninka Pajonal (Latin)": "cni_Latn",
315
+ "Lalana Chinantec (Latin)": "cnl_Latn",
316
+ "Northern Tlaxiaco Chatino (Latin)": "cnt_Latn",
317
+ "Cochimi (Latin)": "coe_Latn",
318
+ "Cofán (Latin)": "cof_Latn",
319
+ "Chong (Latin)": "cok_Latn",
320
+ "Cotoname (Latin)": "con_Latn",
321
+ "Cornish (Latin)": "cor_Latn",
322
+ "Caquinte (Latin)": "cot_Latn",
323
+ "Wamey (Latin)": "cou_Latn",
324
+ "Ponares (Latin)": "cpa_Latn",
325
+ "Ucayali-Yurúa Ashéninka (Latin)": "cpb_Latn",
326
+ "Pichis Ashéninka (Latin)": "cpu_Latn",
327
+ "Pu-Xian Chinese (Han)": "cpx_Hans",
328
+ "Ucayali-Yurúa Ashéninka (Latin)": "cpy_Latn",
329
+ "Crimean Tatar (Cyrillic)": "crh_Cyrl",
330
+ "Cree (Canadian Aboriginal Syllabics)": "crk_Cans",
331
+ "Cree (Latin)": "crk_Latn",
332
+ "El Nayar Cora (Latin)": "crn_Latn",
333
+ "Caramanta (Latin)": "crq_Latn",
334
+ "Seselwa Creole French (Latin)": "crs_Latn",
335
+ "Iyo'wujwa Chorote (Latin)": "crt_Latn",
336
+ "Carrier (Latin)": "csk_Latn",
337
+ "Southern Ping Chinese (Latin)": "cso_Latn",
338
+ "Northern Tlaxiaco Chatino (Latin)": "ctd_Latn",
339
+ "Tepinapa Chinantec (Latin)": "cte_Latn",
340
+ "Chittagonian (Bengali)": "ctg_Beng",
341
+ "Tataltepec Chatino (Latin)": "ctl_Latn",
342
+ "Tataltepec Chatino (Latin)": "cto_Latn",
343
+ "Wayanad Chetti (Latin)": "ctu_Latn",
344
+ "Cun (Latin)": "cuc_Latn",
345
+ "Culina (Latin)": "cui_Latn",
346
+ "Culina (Latin)": "cuk_Latn",
347
+ "Culina (Latin)": "cul_Latn",
348
+ "Teutila Cuicatec (Latin)": "cut_Latn",
349
+ "Chuka (Latin)": "cux_Latn",
350
+ "Chuwabu (Latin)": "cwa_Latn",
351
+ "Kwere (Latin)": "cwe_Latn",
352
+ "Nute (Latin)": "cwt_Latn",
353
+ "Cemuhî (Latin)": "cya_Latn",
354
+ "Welsh (Latin)": "cym_Latn",
355
+ "Dambi (Latin)": "daa_Latn",
356
+ "Dagbani (Latin)": "dag_Latn",
357
+ "Gwahatike (Latin)": "dah_Latn",
358
+ "Danish (Latin)": "dan_Latn",
359
+ "Dargwa (Cyrillic)": "dar_Cyrl",
360
+ "Taita (Latin)": "dav_Latn",
361
+ "Dabarre (Latin)": "dbd_Latn",
362
+ "Doga (Latin)": "dbj_Latn",
363
+ "Daba (Latin)": "dbq_Latn",
364
+ "Deccan (Arabic)": "dcc_Arab",
365
+ "Dendi (Nigeria) (Latin)": "ddn_Latn",
366
+ "Dedua (Latin)": "ded_Latn",
367
+ "Dezfuli (Latin)": "deg_Latn",
368
+ "Desano (Latin)": "des_Latn",
369
+ "German (Latin)": "deu_Latn",
370
+ "Dagaari Dioula (Latin)": "dga_Latn",
371
+ "Dghwede (Latin)": "dgh_Latn",
372
+ "Dugwor (Latin)": "dgi_Latn",
373
+ "Dakka (Latin)": "dgk_Latn",
374
+ "Dogri (macrolanguage) (Devanagari)": "dgo_Deva",
375
+ "Dogrib (Latin)": "dgr_Latn",
376
+ "Didinga (Devanagari)": "dhi_Deva",
377
+ "Digo (Latin)": "did_Latn",
378
+ "Digo (Latin)": "dig_Latn",
379
+ "Dilling (Latin)": "dik_Latn",
380
+ "Dilling (Latin)": "dip_Latn",
381
+ "Dhivehi (Thaana)": "div_Thaa",
382
+ "Zarma (Latin)": "dje_Latn",
383
+ "Jukun of Takum (Latin)": "djk_Latn",
384
+ "Domaaki (Arabic)": "dmk_Arab",
385
+ "Domaaki (Arabic)": "dml_Arab",
386
+ "Dan (Latin)": "dnj_Latn",
387
+ "Dan (Latin)": "dnt_Latn",
388
+ "Dan (Latin)": "dnw_Latn",
389
+ "Dom (Latin)": "dop_Latn",
390
+ "Dogosé (Latin)": "dos_Latn",
391
+ "Duruwa (Latin)": "dru_Latn",
392
+ "Lower Sorbian (Latin)": "dsb_Latn",
393
+ "Daasanach (Latin)": "dsh_Latn",
394
+ "Dusner (Latin)": "dtp_Latn",
395
+ "Toro So Dogon (Latin)": "dts_Latn",
396
+ "Dotyali (Devanagari)": "dty_Deva",
397
+ "Duala (Latin)": "dua_Latn",
398
+ "Duna (Latin)": "dug_Latn",
399
+ "Dutton World Speedwords (Latin)": "dwr_Latn",
400
+ "Dyiri (Latin)": "dyi_Latn",
401
+ "Dyola-Fonyi (Latin)": "dyo_Latn",
402
+ "Dyula (Latin)": "dyu_Latn",
403
+ "Dazaga (Latin)": "dzg_Latn",
404
+ "Dzongkha (Tibetan)": "dzo_Tibt",
405
+ "Embu (Latin)": "ebu_Latn",
406
+ "Epie (Latin)": "ego_Latn",
407
+ "Eipomek (Latin)": "eip_Latn",
408
+ "Askopan (Latin)": "eiv_Latn",
409
+ "Eka (Latin)": "eka_Latn",
410
+ "Standard Estonian (Latin)": "ekk_Latn",
411
+ "Eki (Latin)": "eko_Latn",
412
+ "Yace (Latin)": "ekr_Latn",
413
+ "Modern Greek (1453-) (Greek)": "ell_Grek",
414
+ "Modern Greek (1453-) (Greek, cypr1249)": "ell_Grek_cypr1249",
415
+ "Eleme (Latin)": "elm_Latn",
416
+ "Eman (Latin)": "emp_Latn",
417
+ "Enlhet (Latin)": "enb_Latn",
418
+ "English (Latin)": "eng_Latn",
419
+ "Enxet (Latin)": "enx_Latn",
420
+ "Esperanto (Latin)": "epo_Latn",
421
+ "Ese Ejja (Latin)": "ese_Latn",
422
+ "Esselen (Latin)": "ess_Latn",
423
+ "Central Yupik (Latin)": "esu_Latn",
424
+ "Eton (Vanuatu) (Latin)": "eto_Latn",
425
+ "Eton (Cameroon) (Latin)": "ets_Latn",
426
+ "Eton (Cameroon) (Latin)": "etu_Latn",
427
+ "Basque (Latin)": "eus_Latn",
428
+ "Even (Cyrillic)": "evn_Cyrl",
429
+ "Ewe (Latin)": "ewe_Latn",
430
+ "Ewondo (Latin)": "ewo_Latn",
431
+ "Eyak (Latin)": "eyo_Latn",
432
+ "Ezaa (Latin)": "eza_Latn",
433
+ "Fali (Latin)": "fal_Latn",
434
+ "Fang (Equatorial Guinea) (Latin)": "fan_Latn",
435
+ "Faroese (Latin)": "fao_Latn",
436
+ "Fasu (Latin)": "far_Latn",
437
+ "Persian (Arabic)": "fas_Arab",
438
+ "Fanti (Latin)": "fat_Latn",
439
+ "Faita (Latin)": "fia_Latn",
440
+ "Fijian (Latin)": "fij_Latn",
441
+ "Filipino (Latin)": "fil_Latn",
442
+ "Finnish (Latin)": "fin_Latn",
443
+ "Fipa (Latin)": "fip_Latn",
444
+ "Knaanic (Latin)": "fkk_Latn",
445
+ "Foau (Latin)": "flr_Latn",
446
+ "Fe'fe' (Latin)": "fmp_Latn",
447
+ "Far Western Muria (Devanagari)": "fmu_Deva",
448
+ "Fon (Latin)": "fon_Latn",
449
+ "French (Latin)": "fra_Latn",
450
+ "Fordata (Latin)": "frd_Latn",
451
+ "Western Frisian (Latin)": "fry_Latn",
452
+ "Fulah (Latin)": "fub_Latn",
453
+ "Pulaar (Latin)": "fuc_Latn",
454
+ "East Futuna (Latin)": "fue_Latn",
455
+ "Fulah (Latin)": "ful_Latn",
456
+ "Pulaar (Latin)": "fuq_Latn",
457
+ "Nigerian Fulfulde (Latin)": "fuv_Latn",
458
+ "Gagauz (Cyrillic)": "gag_Cyrl",
459
+ "Gagauz (Latin)": "gag_Latn",
460
+ "Gaina (Latin)": "gai_Latn",
461
+ "Gamkonora (Latin)": "gam_Latn",
462
+ "Kandawo (Telugu)": "gau_Telu",
463
+ "Gabri (Latin)": "gbi_Latn",
464
+ "Kaytetye (Devanagari)": "gbk_Deva",
465
+ "Garhwali (Devanagari)": "gbm_Deva",
466
+ "Gbari (Latin)": "gbo_Latn",
467
+ "Gbagyi (Latin)": "gbr_Latn",
468
+ "Gbagyi (Latin)": "gby_Latn",
469
+ "Alekano (Latin)": "gcc_Latn",
470
+ "Gade (Latin)": "gde_Latn",
471
+ "Guduf-Gava (Latin)": "gdf_Latn",
472
+ "Gengle (Latin)": "geb_Latn",
473
+ "Gebe (Latin)": "gej_Latn",
474
+ "Geser-Gorom (Latin)": "ges_Latn",
475
+ "Guria (Arabic)": "ggg_Arab",
476
+ "Gidar (Latin)": "gid_Latn",
477
+ "Gbazari (Arabic)": "gig_Arab",
478
+ "Gilbertese (Latin)": "gil_Latn",
479
+ "Gimi (Papua New Guinea) (Latin)": "giz_Latn",
480
+ "Kachi Koli (Arabic)": "gjk_Arab",
481
+ "Gunditjmara (Latin)": "gjn_Latn",
482
+ "Gujari (Arabic)": "gju_Arab",
483
+ "Gokana (Latin)": "gkn_Latn",
484
+ "Nanai (Cyrillic)": "gld_Cyrl",
485
+ "Irish (Latin)": "gle_Latn",
486
+ "Galician (Latin)": "glg_Latn",
487
+ "Gilaki (Arabic)": "glk_Arab",
488
+ "Manx (Latin)": "glv_Latn",
489
+ "Gula (Chad) (Latin)": "glw_Latn",
490
+ "Gamo (Latin)": "gmv_Latn",
491
+ "Gana (Latin)": "gna_Latn",
492
+ "Gondi (Latin)": "gnd_Latn",
493
+ "Ngangam (Latin)": "gng_Latn",
494
+ "Gofa (Latin)": "gof_Latn",
495
+ "Gogo (Latin)": "gog_Latn",
496
+ "Gola (Latin)": "gol_Latn",
497
+ "Goan Konkani (Devanagari)": "gom_Deva",
498
+ "Gorontalo (Latin)": "gor_Latn",
499
+ "Gor (Latin)": "gqr_Latn",
500
+ "Ancient Greek (to 1453) (Greek)": "grc_Grek",
501
+ "Gbiri-Niragu (Latin)": "gri_Latn",
502
+ "Guarani (Latin)": "grn_Latn",
503
+ "Garo (Bengali)": "grt_Beng",
504
+ "Guriaso (Latin)": "gsl_Latn",
505
+ "German Sign Language (Latin)": "gso_Latn",
506
+ "Guajajára (Latin)": "gub_Latn",
507
+ "Wayuu (Latin)": "guc_Latn",
508
+ "Yocoboué Dida (Latin)": "gud_Latn",
509
+ "Paraguayan Guaraní (Latin)": "gug_Latn",
510
+ "Guahibo (Latin)": "guh_Latn",
511
+ "Eastern Bolivian Guaraní (Latin)": "gui_Latn",
512
+ "Gujarati (Gujarati)": "guj_Gujr",
513
+ "Gumuz (Ethiopic)": "guk_Ethi",
514
+ "Gumuz (Latin)": "gum_Latn",
515
+ "Guro (Latin)": "guo_Latn",
516
+ "Guinau dan (Latin)": "guq_Latn",
517
+ "Farefare (Latin)": "gur_Latn",
518
+ "Farefare (Latin)": "guu_Latn",
519
+ "Gusilay (Latin)": "gux_Latn",
520
+ "Gusii (Latin)": "guz_Latn",
521
+ "Guanano (Latin)": "gvc_Latn",
522
+ "Gwanja (Latin)": "gvl_Latn",
523
+ "Kalami (Arabic)": "gwc_Arab",
524
+ "Gweno (Latin)": "gwe_Latn",
525
+ "Gwichʼin (Latin)": "gwi_Latn",
526
+ "Gwere (Latin)": "gwr_Latn",
527
+ "Gwere (Arabic)": "gwt_Arab",
528
+ "Guaymí (Latin)": "gym_Latn",
529
+ "Gyem (Latin)": "gyr_Latn",
530
+ "Geji (Latin)": "gyz_Latn",
531
+ "Hadiyya (Latin)": "had_Latn",
532
+ "Hanga (Latin)": "hag_Latn",
533
+ "Hahon (Latin)": "hah_Latn",
534
+ "Hakka Chinese (Latin)": "hak_Latn",
535
+ "Ha(Latin)": "hao_Latn",
536
+ "Hdi (Latin)": "hap_Latn",
537
+ "Haitian (Latin)": "hat_Latn",
538
+ "Hausa (Latin)": "hau_Latn",
539
+ "Hawaiian (Latin)": "haw_Latn",
540
+ "Haya (Latin)": "hay_Latn",
541
+ "Huba (Latin)": "hbb_Latn",
542
+ "Huichol (Latin)": "hch_Latn",
543
+ "Hebrew (Hebrew)": "heb_Hebr",
544
+ "Hehe (Latin)": "heh_Latn",
545
+ "Herero (Latin)": "her_Latn",
546
+ "Hiaitsiihi (Latin)": "hia_Latn",
547
+ "Fiji Hindi (Latin)": "hif_Latn",
548
+ "Higgi (Latin)": "hig_Latn",
549
+ "Hiligaynon (Latin)": "hil_Latn",
550
+ "Hindi (Devanagari)": "hin_Deva",
551
+ "Hkongso Chin (Latin)": "hkk_Latn",
552
+ "Halang (Latin)": "hla_Latn",
553
+ "Halia (Devanagari)": "hlb_Deva",
554
+ "Matu Chin (Latin)": "hlt_Latn",
555
+ "Chhattisgarhi (Devanagari)": "hne_Deva",
556
+ "Hän (Latin)": "hnn_Latn",
557
+ "Northern Hindko (Arabic)": "hno_Arab",
558
+ "Hunsrik (Latin)": "hns_Latn",
559
+ "Ho (Oriya)": "hoc_Orya",
560
+ "Croatian (Latin)": "hrv_Latn",
561
+ "Upper Sorbian (Latin)": "hsb_Latn",
562
+ "Hoti (Latin)": "hto_Latn",
563
+ "Huba (Latin)": "hub_Latn",
564
+ "Huave (Latin)": "hue_Latn",
565
+ "San Francisco Del Mar Huave (Latin)": "hui_Latn",
566
+ "Hula (Latin)": "hul_Latn",
567
+ "Hungarian (Latin)": "hun_Latn",
568
+ "Huastec (Latin)": "hus_Latn",
569
+ "Humla (Latin)": "huu_Latn",
570
+ "San Mateo Del Mar Huave (Latin)": "huv_Latn",
571
+ "Hulaulá (Latin)": "hux_Latn",
572
+ "Havanese (Latin)": "hvn_Latn",
573
+ "Hwana (Latin)": "hwo_Latn",
574
+ "Armenian (Armenian)": "hye_Armn",
575
+ "Western Armenian (Armenian)": "hyw_Armn",
576
+ "Iban (Latin)": "iba_Latn",
577
+ "Ibibio (Latin)": "ibb_Latn",
578
+ "Igbo (Latin)": "ibo_Latn",
579
+ "Etkywan (Latin)": "icr_Latn",
580
+ "Ido (Latin)": "ida_Latn",
581
+ "Idon (Latin)": "idd_Latn",
582
+ "Idoma (Latin)": "idu_Latn",
583
+ "Ifugao (Latin)": "ifa_Latn",
584
+ "Amganad Ifugao (Latin)": "ifb_Latn",
585
+ "Ifo (Latin)": "ife_Latn",
586
+ "Tuwali Ifugao (Latin)": "ifk_Latn",
587
+ "Mayoyao Ifugao (Latin)": "ifu_Latn",
588
+ "Keley-I Kallahan (Latin)": "ify_Latn",
589
+ "Igede (Latin)": "igl_Latn",
590
+ "Igala (Latin)": "ign_Latn",
591
+ "Ijaw (Latin)": "ijc_Latn",
592
+ "Biseni (Latin)": "ijn_Latn",
593
+ "Ika (Latin)": "ikk_Latn",
594
+ "Ikwere (Latin)": "ikw_Latn",
595
+ "Ila (Latin)": "ilb_Latn",
596
+ "Ilocano (Latin)": "ilo_Latn",
597
+ "Imbongu (Latin)": "imo_Latn",
598
+ "Interlingua (International Auxiliary Language Association) (Latin)": "ina_Latn",
599
+ "Inga (Latin)": "inb_Latn",
600
+ "Indonesian (Latin)": "ind_Latn",
601
+ "Iu Mien (Latin)": "iou_Latn",
602
+ "Ipili (Latin)": "ipi_Latn",
603
+ "Inupiaq (Latin)": "ipk_Latn",
604
+ "Iquito (Latin)": "iqw_Latn",
605
+ "Iresim (Latin)": "iri_Latn",
606
+ "Irarutu (Latin)": "irk_Latn",
607
+ "Isekiri (Latin)": "ish_Latn",
608
+ "Icelandic (Latin)": "isl_Latn",
609
+ "Isoko (Latin)": "iso_Latn",
610
+ "Italian (Latin)": "ita_Latn",
611
+ "Itelmen (Cyrillic)": "itl_Cyrl",
612
+ "Isekiri (Latin)": "its_Latn",
613
+ "Isekiri (Latin)": "itv_Latn",
614
+ "Ito (Latin)": "itw_Latn",
615
+ "Itzá (Latin)": "itz_Latn",
616
+ "Ixil (Latin)": "ixl_Latn",
617
+ "Izere (Latin)": "izr_Latn",
618
+ "Izii (Latin)": "izz_Latn",
619
+ "Jakaltek (Latin)": "jac_Latn",
620
+ "Yalahatan (Latin)": "jal_Latn",
621
+ "Jamaican Creole English (Latin)": "jam_Latn",
622
+ "Javanese (Latin)": "jav_Latn",
623
+ "Jambi Malay (Latin)": "jax_Latn",
624
+ "Jibu (Latin)": "jbu_Latn",
625
+ "Jerung (Latin)": "jen_Latn",
626
+ "Jicaque (Latin)": "jic_Latn",
627
+ "Jivaro (Latin)": "jiv_Latn",
628
+ "Machame (Latin)": "jmc_Latn",
629
+ "Zumbun (Latin)": "jmd_Latn",
630
+ "Jimi (Nigeria) (Latin)": "jmx_Latn",
631
+ "Japanese (Japanese)": "jpn_Jpan",
632
+ "Jaqaru (Latin)": "jqr_Latn",
633
+ "Jowulu (Latin)": "juk_Latn",
634
+ "Ju'hoan (Oriya)": "jun_Orya",
635
+ "Juang (Latin)": "juo_Latn",
636
+ "Wapan (Latin)": "jvn_Latn",
637
+ "Kara-Kalpak (Cyrillic)": "kaa_Cyrl",
638
+ "Kabyle (Latin)": "kab_Latn",
639
+ "Kachin (Latin)": "kac_Latn",
640
+ "Gayo (Latin)": "kai_Latn",
641
+ "Jju (Latin)": "kaj_Latn",
642
+ "Jju (Latin)": "kak_Latn",
643
+ "Kamba (Kenya) (Latin)": "kam_Latn",
644
+ "Kannada (Kannada)": "kan_Knda",
645
+ "Kanu (Latin)": "kao_Latn",
646
+ "Bezhta (Latin)": "kaq_Latn",
647
+ "Kashmiri (Arabic)": "kas_Arab",
648
+ "Georgian (Georgian)": "kat_Geor",
649
+ "Kadazan Dusun (Latin)": "kay_Latn",
650
+ "Kazakh (Cyrillic)": "kaz_Cyrl",
651
+ "Kabardian (Cyrillic)": "kbd_Cyrl",
652
+ "Kayan (Latin)": "kbl_Latn",
653
+ "Kande (Latin)": "kbo_Latn",
654
+ "Kabiye (Latin)": "kbp_Latn",
655
+ "Kabiye (Latin)": "kbq_Latn",
656
+ "Kafa (Latin)": "kbr_Latn",
657
+ "Kamo (Latin)": "kbt_Latn",
658
+ "Kikuyu (Latin)": "kby_Latn",
659
+ "Ket (Cyrillic)": "kca_Cyrl",
660
+ "Tyap (Latin)": "kcg_Latn",
661
+ "Kono (Nigeria) (Latin)": "kcn_Latn",
662
+ "Kutu (Latin)": "kcq_Latn",
663
+ "Kutu (Latin)": "kdc_Latn",
664
+ "Makonde (Latin)": "kde_Latn",
665
+ "Tem (Latin)": "kdh_Latn",
666
+ "Kumam (Latin)": "kdi_Latn",
667
+ "Kumam (Latin)": "kdj_Latn",
668
+ "Tsikimba (Latin)": "kdl_Latn",
669
+ "Kagulu (Latin)": "kdn_Latn",
670
+ "Kuy (Khmer)": "kdt_Khmr",
671
+ "Kepo' (Latin)": "kea_Latn",
672
+ "Kekchi (Latin)": "kek_Latn",
673
+ "Kenyang (Latin)": "ken_Latn",
674
+ "Kenyah (Latin)": "keo_Latn",
675
+ "Kera (Latin)": "ker_Latn",
676
+ "Kugbo (Latin)": "keu_Latn",
677
+ "Komi-Permyak (Telugu)": "key_Telu",
678
+ "Kukele (Latin)": "kez_Latn",
679
+ "Kobiana (Devanagari)": "kfb_Deva",
680
+ "Northwestern Kolami (Telugu)": "kff_Telu",
681
+ "Kuk (Devanagari)": "kfk_Deva",
682
+ "Kotaba (Devanagari)": "kfq_Deva",
683
+ "Koya (Gujarati)": "kfr_Gujr",
684
+ "Koro (India) (Latin)": "kfw_Latn",
685
+ "Kaili (Devanagari)": "kfx_Deva",
686
+ "Khasi (Latin)": "kha_Latn",
687
+ "Kham (Tibetan)": "khg_Tibt",
688
+ "Khalkha Mongolian (Cyrillic)": "khk_Cyrl",
689
+ "Khmer (Khmer)": "khm_Khmr",
690
+ "Koyra Chiini Songhay (Latin)": "khq_Latn",
691
+ "Khowar (Arabic)": "khw_Arab",
692
+ "Kim (Latin)": "kia_Latn",
693
+ "Koalib (Latin)": "kij_Latn",
694
+ "Kikuyu (Latin)": "kik_Latn",
695
+ "Kinyarwanda (Latin)": "kin_Latn",
696
+ "Kirghiz (Cyrillic)": "kir_Cyrl",
697
+ "Kitharaka (Latin)": "kix_Latn",
698
+ "Mlap (Latin)": "kjb_Latn",
699
+ "Coastal Konjo (Latin)": "kjc_Latn",
700
+ "Kisar (Latin)": "kje_Latn",
701
+ "Khmu (Latin)": "kjg_Latn",
702
+ "Khakas (Cyrillic)": "kjh_Cyrl",
703
+ "Khakas (Latin)": "kjk_Latn",
704
+ "Kagulu (Latin)": "kki_Latn",
705
+ "Kikuyu (Latin)": "kkj_Latn",
706
+ "Kalanguya (Devanagari)": "kle_Deva",
707
+ "Kalenjin (Latin)": "kln_Latn",
708
+ "Kulisusu (Latin)": "kls_Latn",
709
+ "Klao (Latin)": "klu_Latn",
710
+ "Maskelynes (Latin)": "klv_Latn",
711
+ "Tado (Latin)": "klw_Latn",
712
+ "Kama (Latin)": "kma_Latn",
713
+ "Kimbundu (Latin)": "kmd_Latn",
714
+ "Tanudan Kalinga (Latin)": "kml_Latn",
715
+ "Northern Kurdish (Arabic)": "kmr_Arab",
716
+ "Northern Kurdish (Cyrillic)": "kmr_Cyrl",
717
+ "Northern Kurdish (Latin)": "kmr_Latn",
718
+ "Kanite (Latin)": "kmu_Latn",
719
+ "Koma (Latin)": "kmy_Latn",
720
+ "Kanda (Latin)": "kna_Latn",
721
+ "Lubuagan Kalinga (Latin)": "knb_Latn",
722
+ "Central Kanuri (Latin)": "knc_Latn",
723
+ "Kankanaey (Latin)": "kne_Latn",
724
+ "Kutu (Latin)": "knf_Latn",
725
+ "Konda (Latin)": "knj_Latn",
726
+ "Kuranko (Latin)": "knk_Latn",
727
+ "Konkani (macrolanguage) (Devanagari)": "knn_Deva",
728
+ "Kono (Sierra Leone) (Latin)": "kno_Latn",
729
+ "Kongo (Latin)": "kog_Latn",
730
+ "Kol (Papua New Guinea) (Latin)": "kol_Latn",
731
+ "Konzo (Latin)": "koo_Latn",
732
+ "Korean (Hangul)": "kor_Hang",
733
+ "Kodia (Latin)": "kpo_Latn",
734
+ "Korupun-Sela (Latin)": "kpq_Latn",
735
+ "Kofei (Latin)": "kps_Latn",
736
+ "Komi-Zyrian (Cyrillic)": "kpv_Cyrl",
737
+ "Komi-Permyak (Cyrillic)": "kpy_Cyrl",
738
+ "Kofyar (Latin)": "kpz_Latn",
739
+ "Korafe-Yegha (Latin)": "kqe_Latn",
740
+ "Korafe-Yegha (Latin)": "kqo_Latn",
741
+ "Kimré (Latin)": "kqp_Latn",
742
+ "Kimaragang (Latin)": "kqr_Latn",
743
+ "Koyra Chiini Songhay (Ethiopic)": "kqy_Ethi",
744
+ "Karachay-Balkar (Cyrillic)": "krc_Cyrl",
745
+ "Krio (Latin)": "kri_Latn",
746
+ "Kinaray-A (Latin)": "krj_Latn",
747
+ "Karelian (Latin)": "krl_Latn",
748
+ "Sapo (Khmer)": "krr_Khmr",
749
+ "Gbaya (Sudan) (Latin)": "krs_Latn",
750
+ "Kurukh (Devanagari)": "kru_Deva",
751
+ "Tewa (Indonesia) (Latin)": "krx_Latn",
752
+ "Shambala (Latin)": "ksb_Latn",
753
+ "Kuanua (Latin)": "ksd_Latn",
754
+ "Bafia (Latin)": "ksf_Latn",
755
+ "Krisa (Latin)": "ksr_Latn",
756
+ "Kusasi (Latin)": "kss_Latn",
757
+ "Kham (Devanagari)": "ksz_Deva",
758
+ "Kambaata (Ethiopic)": "ktb_Ethi",
759
+ "Krumen (Latin)": "ktj_Latn",
760
+ "Kto (Latin)": "kto_Latn",
761
+ "Kuanyama (Latin)": "kua_Latn",
762
+ "Kutep (Latin)": "kub_Latn",
763
+ "Kuman (Papua New Guinea) (Latin)": "kue_Latn",
764
+ "Kushi (Latin)": "kuh_Latn",
765
+ "Kumyk (Cyrillic)": "kum_Cyrl",
766
+ "Kurdish (Arabic)": "kur_Arab",
767
+ "Kusaal (Latin)": "kus_Latn",
768
+ "Kutino (Latin)": "kvn_Latn",
769
+ "Kove (Latin)": "kvw_Latn",
770
+ "Komi (Arabic)": "kvx_Arab",
771
+ "Kutu (Latin)": "kwd_Latn",
772
+ "Kwara'ae (Latin)": "kwf_Latn",
773
+ "Awa-Cuaiquer (Latin)": "kwi_Latn",
774
+ "Kwak'wala (Latin)": "kwm_Latn",
775
+ "Kodia (Ethiopic)": "kxc_Ethi",
776
+ "Maninkakan, Kita (Latin)": "kxf_Latn",
777
+ "Kuanhua (Thai)": "kxm_Thai",
778
+ "Wadiyara Koli (Arabic)": "kxp_Arab",
779
+ "Kwaya (Latin)": "kyb_Latn",
780
+ "Kyaka (Latin)": "kyc_Latn",
781
+ "Karey (Latin)": "kyf_Latn",
782
+ "Keyagana (Latin)": "kyg_Latn",
783
+ "Kouya (Latin)": "kyo_Latn",
784
+ "Kwaya (Latin)": "kyq_Latn",
785
+ "Kayagar (Kayah Li)": "kyu_Kali",
786
+ "Kambaira (Latin)": "kyx_Latn",
787
+ "Kerewo (Latin)": "kyz_Latn",
788
+ "Kairiru (Latin)": "kzf_Latn",
789
+ "Kelabit (Latin)": "kzi_Latn",
790
+ "Lacandon (Latin)": "lac_Latn",
791
+ "Langi (Latin)": "lag_Latn",
792
+ "Lango (Uganda) (Latin)": "laj_Latn",
793
+ "Lamba (Latin)": "lam_Latn",
794
+ "Lao (Lao)": "lao_Laoo",
795
+ "Lama (Togo) (Latin)": "las_Latn",
796
+ "Latin (Latin)": "lat_Latn",
797
+ "Latvian (Latin)": "lav_Latn",
798
+ "Lavu (Latin)": "law_Latn",
799
+ "Lama (Myanmar) (Tibetan)": "lbj_Tibt",
800
+ "Lachi (Latin)": "lbw_Latn",
801
+ "Luchazi (Latin)": "lcm_Latn",
802
+ "Lola (Thai)": "lcp_Thai",
803
+ "Lidzonka (Latin)": "ldb_Latn",
804
+ "Leko (Latin)": "led_Latn",
805
+ "Lyélé (Latin)": "lee_Latn",
806
+ "Lefa (Latin)": "lef_Latn",
807
+ "Lembena (Latin)": "lem_Latn",
808
+ "Lense (Latin)": "lew_Latn",
809
+ "Lemio (Latin)": "lex_Latn",
810
+ "Lega-Shabunda (Latin)": "lgg_Latn",
811
+ "Laghu (Latin)": "lgl_Latn",
812
+ "Lahu (Latin)": "lhu_Latn",
813
+ "Lianshan Zhuang (Latin)": "lia_Latn",
814
+ "Likum (Latin)": "lid_Latn",
815
+ "Limbu (Devanagari)": "lif_Deva",
816
+ "Ligurian (Latin)": "lij_Latn",
817
+ "Lingala (Latin)": "lin_Latn",
818
+ "Liki (Latin)": "lip_Latn",
819
+ "Libinza (Latin)": "lir_Latn",
820
+ "Lisu (Lisu)": "lis_Lisu",
821
+ "Lithuanian (Latin)": "lit_Latn",
822
+ "Rampi (Latin)": "lje_Latn",
823
+ "Lampung Api (Latin)": "ljp_Latn",
824
+ "Lukabaras (Latin)": "lkb_Latn",
825
+ "Lakata (Latin)": "lke_Latn",
826
+ "Lilau (Latin)": "lla_Latn",
827
+ "Ladin (Latin, gherd)": "lld_Latn_gherd",
828
+ "Ladin (Latin, valbadia)": "lld_Latn_valbadia",
829
+ "Láá Láá Bwamu (Latin)": "llg_Latn",
830
+ "Lele (Guinea) (Latin)": "lln_Latn",
831
+ "Loma (Liberia) (Latin)": "lme_Latn",
832
+ "Lundayeh (Latin)": "lnd_Latn",
833
+ "Lango (South Sudan) (Latin)": "lns_Latn",
834
+ "Lundayeh (Latin)": "lnu_Latn",
835
+ "Loloda (Latin)": "loa_Latn",
836
+ "Lobi (Latin)": "lob_Latn",
837
+ "Loko (Latin)": "lok_Latn",
838
+ "Loma (Liberia) (Latin)": "lom_Latn",
839
+ "Loma (Liberia) (Latin)": "lon_Latn",
840
+ "Lobala (Latin)": "loq_Latn",
841
+ "Luri (Arabic)": "lrk_Arab",
842
+ "Lish (Latin)": "lsi_Latn",
843
+ "Sa'ban (Latin)": "lsm_Latn",
844
+ "Sa'ban (Arabic)": "lss_Arab",
845
+ "Latgalian (Latin)": "ltg_Latn",
846
+ "Lethu (Latin)": "lth_Latn",
847
+ "Lutachoni (Latin)": "lto_Latn",
848
+ "Luxembourgish (Latin)": "ltz_Latn",
849
+ "Luba-Lulua (Latin)": "lua_Latn",
850
+ "Aringa (Latin)": "luc_Latn",
851
+ "Ganda (Latin)": "lug_Latn",
852
+ "Luo (Kenya and Tanzania) (Latin)": "luo_Latn",
853
+ "Lushai (Latin)": "lus_Latn",
854
+ "Luwanga (Latin)": "lwg_Latn",
855
+ "Lwo (Latin)": "lwo_Latn",
856
+ "Lewo Eleng (Latin)": "lww_Latn",
857
+ "Laz (Latin)": "lzz_Latn",
858
+ "Maasai (Latin)": "maa_Latn",
859
+ "Yutanduchi Mixtec (Latin)": "mab_Latn",
860
+ "Madurese (Latin)": "mad_Latn",
861
+ "Mafa (Latin)": "maf_Latn",
862
+ "Magahi (Devanagari)": "mag_Deva",
863
+ "Marshallese (Latin)": "mah_Latn",
864
+ "Maithili (Devanagari)": "mai_Deva",
865
+ "Majhwar (Latin)": "maj_Latn",
866
+ "Makasar (Latin)": "mak_Latn",
867
+ "Malayalam (Malayalam)": "mal_Mlym",
868
+ "Mam (Latin)": "mam_Latn",
869
+ "Mamaindé (Latin)": "maq_Latn",
870
+ "Marathi (Devanagari)": "mar_Deva",
871
+ "Mazatec (Latin)": "mau_Latn",
872
+ "Sateré-Mawé (Latin)": "maw_Latn",
873
+ "North Moluccan Malay (Latin)": "max_Latn",
874
+ "Central Mazahua (Latin)": "maz_Latn",
875
+ "Western Bukidnon Manobo (Latin)": "mbb_Latn",
876
+ "Macushi (Latin)": "mbc_Latn",
877
+ "Duna (Latin)": "mbh_Latn",
878
+ "Ilianen Manobo (Latin)": "mbj_Latn",
879
+ "Matigsalug Manobo (Latin)": "mbt_Latn",
880
+ "Mbo (Cameroon) (Latin)": "mbu_Latn",
881
+ "Macuna (Latin)": "mca_Latn",
882
+ "Machiguenga (Latin)": "mcb_Latn",
883
+ "Bitur (Latin)": "mcd_Latn",
884
+ "Matsés (Latin)": "mcf_Latn",
885
+ "Mixe (Latin)": "mco_Latn",
886
+ "Ese (Latin)": "mcp_Latn",
887
+ "M seri (Latin)": "mcq_Latn",
888
+ "Mambai (Latin)": "mcu_Latn",
889
+ "Mpiemo (Latin)": "mcx_Latn",
890
+ "Mada (Nigeria) (Latin)": "mda_Latn",
891
+ "Morigi (Latin)": "mdd_Latn",
892
+ "Mbosi (Latin)": "mdv_Latn",
893
+ "Male (Ethiopia) (Ethiopic)": "mdy_Ethi",
894
+ "Medumba (Latin)": "med_Latn",
895
+ "Melpa (Latin)": "mee_Latn",
896
+ "Southwestern Tlaxiaco Mixtec (Latin)": "meh_Latn",
897
+ "Midob (Latin)": "mej_Latn",
898
+ "Mekeo (Latin)": "mek_Latn",
899
+ "Central Melanau (Latin)": "mel_Latn",
900
+ "Mende (Liberia) (Latin)": "men_Latn",
901
+ "Merey (Latin)": "meq_Latn",
902
+ "Meru (Latin)": "mer_Latn",
903
+ "Mato (Latin)": "met_Latn",
904
+ "Motu (Latin)": "meu_Latn",
905
+ "Mano (Latin)": "mev_Latn",
906
+ "Morisyen (Latin)": "mfe_Latn",
907
+ "Mefele (Latin)": "mfh_Latn",
908
+ "Mefele (Latin)": "mfi_Latn",
909
+ "Mogofin (Latin)": "mfk_Latn",
910
+ "Cross River Mbembe (Latin)": "mfm_Latn",
911
+ "Mefele (Latin)": "mfn_Latn",
912
+ "Mbe (Latin)": "mfo_Latn",
913
+ "Marghi South (Latin)": "mfq_Latn",
914
+ "Marghi (Latin)": "mfv_Latn",
915
+ "Pahi (Latin)": "mfy_Latn",
916
+ "Melo (Latin)": "mfz_Latn",
917
+ "Maguindanaon (Latin)": "mgd_Latn",
918
+ "Mpade (Latin)": "mge_Latn",
919
+ "Monguor (Latin)": "mgg_Latn",
920
+ "Makhuwa-Meetto (Latin)": "mgh_Latn",
921
+ "Laua (Latin)": "mgi_Latn",
922
+ "Meta' (Latin)": "mgo_Latn",
923
+ "Ma'di (Latin)": "mhi_Latn",
924
+ "Mouk-Aria (Latin)": "mhk_Latn",
925
+ "Mari (Russia) (Cyrillic)": "mhr_Cyrl",
926
+ "Mundat (Latin)": "mhu_Latn",
927
+ "Maru (Latin)": "mhx_Latn",
928
+ "Ma'di (Latin)": "mhy_Latn",
929
+ "Atatláhuca Mixtec (Latin)": "mib_Latn",
930
+ "Mi'kmaq (Latin)": "mie_Latn",
931
+ "Mofu-Gudur (Latin)": "mif_Latn",
932
+ "San Miguel El Grande Mixtec (Latin)": "mig_Latn",
933
+ "Chayuco Mixtec (Latin)": "mih_Latn",
934
+ "Peñoles Mixtec (Latin)": "mil_Latn",
935
+ "Alacatlatzala Mixtec (Latin)": "mim_Latn",
936
+ "Minangkabau (Latin)": "min_Latn",
937
+ "Pinotepa Nacional Mixtec (Latin)": "mio_Latn",
938
+ "Apasco-Apoala Mixtec (Latin)": "mip_Latn",
939
+ "Mískito (Latin)": "miq_Latn",
940
+ "Mískito (Latin)": "mit_Latn",
941
+ "Southern Puebla Mixtec (Latin)": "miu_Latn",
942
+ "Akoye (Latin)": "miy_Latn",
943
+ "Coatzospan Mixtec (Latin)": "miz_Latn",
944
+ "Mali (Devanagari)": "mjl_Deva",
945
+ "Malavedan (Malayalam)": "mjv_Mlym",
946
+ "Macedonian (Cyrillic)": "mkd_Cyrl",
947
+ "Mokole (Benin) (Latin)": "mkf_Latn",
948
+ "Dhatki (Arabic)": "mki_Arab",
949
+ "Mokole (Benin) (Latin)": "mkl_Latn",
950
+ "Mokole (Benin) (Latin)": "mkn_Latn",
951
+ "Malagasy (Latin)": "mlg_Latn",
952
+ "Maltese (Latin)": "mlq_Latn",
953
+ "Maltese (Latin)": "mlt_Latn",
954
+ "Mamanwa (Latin)": "mmc_Latn",
955
+ "Michoacán Mazahua (Latin)": "mmg_Latn",
956
+ "Maonan (Latin)": "mnb_Latn",
957
+ "Montenegrin (Latin)": "mne_Latn",
958
+ "Mundani (Latin)": "mnf_Latn",
959
+ "Manipuri (Bengali)": "mni_Beng",
960
+ "Maninka (Latin)": "mnk_Latn",
961
+ "Mon (Myanmar)": "mnw_Mymr",
962
+ "Manikion (Latin)": "mnx_Latn",
963
+ "Mwan (Latin)": "moa_Latn",
964
+ "Mogholi (Latin)": "mog_Latn",
965
+ "Mongolian (Cyrillic)": "mon_Cyrl",
966
+ "Mopán Maya (Latin)": "mop_Latn",
967
+ "Mor (New Guinea) (Latin)": "mor_Latn",
968
+ "Mossi (Latin)": "mos_Latn",
969
+ "Tucunaca (Latin)": "mox_Latn",
970
+ "Mukulu (Latin)": "moz_Latn",
971
+ "Mpompon (Latin)": "mpg_Latn",
972
+ "Yosondúa Mixtec (Latin)": "mpm_Latn",
973
+ "Mapidian (Latin)": "mpp_Latn",
974
+ "Mixtec (Latin)": "mpx_Latn",
975
+ "Malas (Latin)": "mqb_Latn",
976
+ "Mangole (Latin)": "mqf_Latn",
977
+ "Minokok (Latin)": "mqj_Latn",
978
+ "Mumuye (Latin)": "mqn_Latn",
979
+ "Manggarai (Latin)": "mqy_Latn",
980
+ "Maori (Latin)": "mri_Latn",
981
+ "Western Mari (Cyrillic)": "mrj_Cyrl",
982
+ "Western Magar (Devanagari)": "mrr_Deva",
983
+ "Maranao (Latin)": "mrt_Latn",
984
+ "Maru (Latin)": "mrw_Latn",
985
+ "Masaba (Latin)": "msh_Latn",
986
+ "Sabah Malay (Latin)": "msi_Latn",
987
+ "Mswahili (Latin)": "msw_Latn",
988
+ "Malay (macrolanguage) (Latin)": "msy_Latn",
989
+ "Mator-Taygi-Karagas (Latin)": "mtd_Latn",
990
+ "Binukidnon (Latin)": "mtj_Latn",
991
+ "Yosondúa Mixtec (Latin)": "mto_Latn",
992
+ "Totontepec Mixe (Devanagari)": "mtr_Deva",
993
+ "Tututepec Mixtec (Latin)": "mtu_Latn",
994
+ "Tututepec Mixtec (Latin)": "mtx_Latn",
995
+ "Mundang (Latin)": "mua_Latn",
996
+ "Mubi (Latin)": "mug_Latn",
997
+ "Mündü (Latin)": "muh_Latn",
998
+ "Musi (Latin)": "mui_Latn",
999
+ "Majhwar (Devanagari)": "mup_Deva",
1000
+ "Murle (Latin)": "mur_Latn",
1001
+ "Muthuvan (Malayalam)": "muv_Mlym",
1002
+ "Muyang (Latin)": "muy_Latn",
1003
+ "Marwari (Arabic)": "mve_Arab",
1004
+ "Marwari (Arabic)": "mvp_Latn",
1005
+ "Marwari (Arabic)": "mvy_Arab",
1006
+ "Mwanga (Tanzania) (Latin)": "mwq_Latn",
1007
+ "Mwera (Tanzania) (Latin)": "mwv_Latn",
1008
+ "Metlatónoc Mixtec (Latin)": "mxb_Latn",
1009
+ "Juxtlahuaca Mixtec (Latin)": "mxq_Latn",
1010
+ "Silacayoapan Mixtec (Latin)": "mxs_Latn",
1011
+ "Tezoatlán Mixtec (Latin)": "mxt_Latn",
1012
+ "Metlatónoc Mixtec (Latin)": "mxu_Latn",
1013
+ "Northwestern Ojibwa (Latin)": "mxv_Latn",
1014
+ "Metlatónoc Mixtec (Latin)": "mxy_Latn",
1015
+ "Burmese (Myanmar)": "mya_Mymr",
1016
+ "Mbay (Latin)": "myb_Latn",
1017
+ "Myene (Latin)": "myk_Latn",
1018
+ "Erzya (Cyrillic)": "myv_Cyrl",
1019
+ "Masa (Chad) (Latin)": "myx_Latn",
1020
+ "Macuna (Latin)": "myy_Latn",
1021
+ "Santa María Zacatepec Mixtec (Latin)": "mza_Latn",
1022
+ "Berber languages (Latin)": "mzi_Latn",
1023
+ "Mazatlán Mixe (Latin)": "mzj_Latn",
1024
+ "Mazatlán Mixe (Latin)": "mzk_Latn",
1025
+ "Mazatlán Mixe (Latin)": "mzl_Latn",
1026
+ "Mumuye (Latin)": "mzm_Latn",
1027
+ "Manado Malay (Latin)": "mzw_Latn",
1028
+ "Nimanbur (Latin)": "nab_Latn",
1029
+ "Naga languages (Latin)": "nag_Latn",
1030
+ "Nalik (Latin)": "nal_Latn",
1031
+ "Min Nan Chinese (Latin)": "nan_Latn",
1032
+ "Neapolitan (Latin)": "nap_Latn",
1033
+ "Coatepec Nahuatl (Latin)": "nas_Latn",
1034
+ "Nawuri (Latin)": "naw_Latn",
1035
+ "Nyemba (Latin)": "nbh_Latn",
1036
+ "Chang Naga (Latin)": "nca_Latn",
1037
+ "Notsi (Latin)": "ncf_Latn",
1038
+ "Central Huasteca Nahuatl (Latin)": "nch_Latn",
1039
+ "Central Puebla Nahuatl (Latin)": "ncj_Latn",
1040
+ "Michoacán Nahuatl (Latin)": "ncl_Latn",
1041
+ "N eko (Latin)": "nco_Latn",
1042
+ "Nahuatl languages (Latin)": "ncu_Latn",
1043
+ "Morelos Nahuatl (Latin)": "ncx_Latn",
1044
+ "Ndogo (Latin)": "ndi_Latn",
1045
+ "Ndjuká (Latin)": "ndj_Latn",
1046
+ "Ndonga (Latin)": "ndo_Latn",
1047
+ "Ndo (Latin)": "ndp_Latn",
1048
+ "Ndut (Latin)": "ndv_Latn",
1049
+ "Lutos (Latin)": "ndy_Latn",
1050
+ "Ndogo (Latin)": "ndz_Latn",
1051
+ "Toura (Côte d'Ivoire) (Latin)": "neb_Latn",
1052
+ "Nepali (Devanagari)": "nep_Deva",
1053
+ "Newari (Devanagari)": "new_Deva",
1054
+ "Ngbaka'ma'bo (Latin)": "nfa_Latn",
1055
+ "Nefamese (Latin)": "nfr_Latn",
1056
+ "Ngad'a (Latin)": "nga_Latn",
1057
+ "Ngemba (Latin)": "ngi_Latn",
1058
+ "Lomwe (Latin)": "ngl_Latn",
1059
+ "Ngulu (Latin)": "ngp_Latn",
1060
+ "Guerrero Nahuatl (Latin)": "ngu_Latn",
1061
+ "Eastern Huasteca Nahuatl (Latin)": "nhe_Latn",
1062
+ "Ngiyambaa (Latin)": "nhg_Latn",
1063
+ "Zacatlán-Ahuacatlán-Tepetzintla Nahuatl (Latin)": "nhi_Latn",
1064
+ "Nahari (Latin)": "nhn_Latn",
1065
+ "Tetelcingo Nahuatl (Latin)": "nhq_Latn",
1066
+ "Orizaba Nahuatl (Latin)": "nhu_Latn",
1067
+ "Western Huasteca Nahuatl (Latin)": "nhw_Latn",
1068
+ "Tabasco Nahuatl (Latin)": "nhx_Latn",
1069
+ "Ometepec Nahuatl (Latin)": "nhy_Latn",
1070
+ "Nias (Latin)": "nia_Latn",
1071
+ "Ngaju (Latin)": "nij_Latn",
1072
+ "Nimi (Latin)": "nim_Latn",
1073
+ "Ninzo (Latin)": "nin_Latn",
1074
+ "Nganasan (Latin)": "nja_Latn",
1075
+ "Nkonya (Latin)": "nko_Latn",
1076
+ "Ngombale (Latin)": "nla_Latn",
1077
+ "Ná-Meo (Latin)": "nlc_Latn",
1078
+ "Dutch (Latin)": "nld_Latn",
1079
+ "Gela (Latin)": "nlg_Latn",
1080
+ "Ninia Yali (Latin)": "nlk_Latn",
1081
+ "Orizaba Nahuatl (Latin)": "nlv_Latn",
1082
+ "Nyamwezi (Latin)": "nmg_Latn",
1083
+ "Nyamwezi (Latin)": "nmz_Latn",
1084
+ "Norwegian Nynorsk (Latin)": "nnb_Latn",
1085
+ "Ngiemboon (Latin)": "nnh_Latn",
1086
+ "Ngen (Latin)": "nnq_Latn",
1087
+ "Nuni (Latin)": "nnw_Latn",
1088
+ "Nocamán (Latin)": "noa_Latn",
1089
+ "Norwegian Bokmål (Latin)": "nob_Latn",
1090
+ "Northern Thai (Thai)": "nod_Thai",
1091
+ "Nimadi (Devanagari)": "noe_Deva",
1092
+ "Nogai (Cyrillic)": "nog_Cyrl",
1093
+ "Nomatsiguenga (Latin)": "not_Latn",
1094
+ "Nupoid languages (Latin)": "npl_Latn",
1095
+ "Napu (Latin)": "npy_Latn",
1096
+ "Northern Sotho (Latin)": "nso_Latn",
1097
+ "Nisenan (Latin)": "nst_Latn",
1098
+ "Nisu (Latin)": "nsu_Latn",
1099
+ "Naga languages (Latin)": "ntm_Latn",
1100
+ "Ntrubo (Latin)": "ntr_Latn",
1101
+ "Nobsalan (Latin)": "nuj_Latn",
1102
+ "Nung (Viet Nam) (Latin)": "nup_Latn",
1103
+ "Nuer (Latin)": "nus_Latn",
1104
+ "Nuu-chah-nulth (Latin)": "nuz_Latn",
1105
+ "Nyabwa (Latin)": "nwb_Latn",
1106
+ "Naxi (Latin)": "nxq_Latn",
1107
+ "Nyanja (Latin)": "nya_Latn",
1108
+ "Nyanga-li (Latin)": "nyf_Latn",
1109
+ "Nyankole (Latin)": "nyn_Latn",
1110
+ "Nyoro (Latin)": "nyo_Latn",
1111
+ "Nyulnyul (Latin)": "nyu_Latn",
1112
+ "Nyulnyul (Latin)": "nyy_Latn",
1113
+ "Nzima (Latin)": "nzi_Latn",
1114
+ "Obo Manobo (Latin)": "obo_Latn",
1115
+ "Occitan (post 1500) (Latin)": "oci_Latn",
1116
+ "Ormuri (Arabic)": "odk_Arab",
1117
+ "Odual (Latin)": "odu_Latn",
1118
+ "Ogoniland (Latin)": "ogo_Latn",
1119
+ "Ojibwa (Canadian Aboriginal Syllabics)": "ojb_Cans",
1120
+ "Ojibwa (Latin)": "ojb_Latn",
1121
+ "Oku (Latin)": "oku_Latn",
1122
+ "Mochi (Latin)": "old_Latn",
1123
+ "Omejes (Latin)": "omw_Latn",
1124
+ "Obo Manobo (Latin)": "onb_Latn",
1125
+ "Tohono O'odham (Latin)": "ood_Latn",
1126
+ "Oroqen (Latin)": "orc_Latn",
1127
+ "Oromo (Latin)": "orm_Latn",
1128
+ "Ormuri (Arabic)": "oru_Arab",
1129
+ "Oriya (Oriya)": "ory_Orya",
1130
+ "Ossetian (Cyrillic)": "oss_Cyrl",
1131
+ "Otomi (Latin)": "ote_Latn",
1132
+ "Otomi (Latin)": "otq_Latn",
1133
+ "Old Turkish (Latin)": "ozm_Latn",
1134
+ "Páez (Latin)": "pab_Latn",
1135
+ "Pareci (Latin)": "pad_Latn",
1136
+ "Pangasinan (Latin)": "pag_Latn",
1137
+ "Pampanga (Latin)": "pam_Latn",
1138
+ "Panjabi (Gurmukhi)": "pan_Guru",
1139
+ "Northern Paiute (Latin)": "pao_Latn",
1140
+ "Papiamento (Latin)": "pap_Latn",
1141
+ "Palauan (Latin)": "pau_Latn",
1142
+ "Pangwa (Latin)": "pbb_Latn",
1143
+ "Patamona (Latin)": "pbc_Latn",
1144
+ "Mezontla Popoloca (Latin)": "pbi_Latn",
1145
+ "Parkwa (Latin)": "pbs_Latn",
1146
+ "Southern Pashto (Arabic)": "pbt_Arab",
1147
+ "Northern Pashto (Arabic)": "pbu_Arab",
1148
+ "Ruching Palaung (Thai)": "pce_Thai",
1149
+ "Nigerian Pidgin (Latin)": "pcm_Latn",
1150
+ "Pardhan (Latin)": "pex_Latn",
1151
+ "Eastern Pomo (Latin)": "pez_Latn",
1152
+ "Pahi (Arabic)": "phl_Arab",
1153
+ "Phuan (Arabic)": "phr_Arab",
1154
+ "Pima Bajo (Latin)": "pib_Latn",
1155
+ "Yinjtjiparnti (Latin)": "pil_Latn",
1156
+ "Piapoco (Latin)": "pip_Latn",
1157
+ "Piratapuyo (Latin)": "pir_Latn",
1158
+ "Pijin (Latin)": "pis_Latn",
1159
+ "Pitta Pitta (Latin)": "piy_Latn",
1160
+ "Pijao (Latin)": "pjt_Latn",
1161
+ "Pokomo (Latin)": "pkb_Latn",
1162
+ "Pökoot (Latin)": "pko_Latn",
1163
+ "Shwe Palaung (Arabic)": "plk_Arab",
1164
+ "Central Pame (Latin)": "pls_Latn",
1165
+ "Malagasy, Plateau (Latin)": "plt_Latn",
1166
+ "Polonombauk (Latin)": "plw_Latn",
1167
+ "Piemontese (Latin)": "pmf_Latn",
1168
+ "Piemontese (Latin)": "pmq_Latn",
1169
+ "Piemontese (Latin)": "pms_Latn",
1170
+ "Pamona (Latin)": "pmy_Latn",
1171
+ "Western Panjabi (Arabic)": "pnb_Arab",
1172
+ "Penesak (Latin)": "pne_Latn",
1173
+ "Pinyin (Latin)": "pny_Latn",
1174
+ "Ponares (Latin)": "poc_Latn",
1175
+ "Poqomam (Latin)": "poe_Latn",
1176
+ "Poqomchi' (Latin)": "poh_Latn",
1177
+ "Pokangá (Latin)": "poi_Latn",
1178
+ "Polish (Latin)": "pol_Latn",
1179
+ "Portuguese (Latin)": "por_Latn",
1180
+ "Pémono (Latin)": "pov_Latn",
1181
+ "Puelche (Latin)": "pow_Latn",
1182
+ "Puelche (Latin)": "poy_Latn",
1183
+ "Paipai (Latin)": "ppk_Latn",
1184
+ "San Luís Temalacayuca Popoloca (Latin)": "pps_Latn",
1185
+ "Pa'o (Latin)": "prf_Latn",
1186
+ "Parauk (Latin)": "prk_Latn",
1187
+ "Parsi-Dari (Latin)": "prq_Latn",
1188
+ "Phai (Thai)": "prt_Thai",
1189
+ "Pai Tavytera (Latin)": "pse_Latn",
1190
+ "Kaulong (Latin)": "pss_Latn",
1191
+ "Central Pashto (Arabic)": "pst_Arab",
1192
+ "Patuá (Latin)": "ptu_Latn",
1193
+ "Punan Merap (Latin)": "pua_Latn",
1194
+ "Punan Merap (Latin)": "pui_Latn",
1195
+ "Pushto (Arabic)": "pus_Arab",
1196
+ "Pangwali (Latin)": "pwg_Latn",
1197
+ "Paiwan (Latin)": "pwn_Latn",
1198
+ "Pwo Western Karen (Thai)": "pww_Thai",
1199
+ "Quetzaltepec Mixe (Latin)": "pxm_Latn",
1200
+ "Bikol (Latin)": "qub_Latn",
1201
+ "K'iche' (Latin)": "quc_Latn",
1202
+ "Lambayeque Quechua (Latin)": "quf_Latn",
1203
+ "Chimborazo Highland Quichua (Latin)": "qug_Latn",
1204
+ "South Bolivian Quechua (Latin)": "quh_Latn",
1205
+ "North Bolivian Quechua (Latin)": "qul_Latn",
1206
+ "Sipacapense (Latin)": "qum_Latn",
1207
+ "Panao Huánuco Quechua (Latin)": "qup_Latn",
1208
+ "Yanahuanca Pasco Quechua (Latin)": "qur_Latn",
1209
+ "Southern Pastaza Quechua (Latin)": "qus_Latn",
1210
+ "Quechua (Latin)": "quv_Latn",
1211
+ "Quechua (Latin)": "quw_Latn",
1212
+ "Quechua (Latin)": "qux_Latn",
1213
+ "Ayacucho Quechua (Latin)": "quy_Latn",
1214
+ "Cusco Quechua (Latin)": "quz_Latn",
1215
+ "Ambo-Pasco Quechua (Latin)": "qva_Latn",
1216
+ "Cajamarca Quechua (Latin)": "qvc_Latn",
1217
+ "Eastern Apurímac Quechua (Latin)": "qve_Latn",
1218
+ "Huallaga Huánuco Quechua (Latin)": "qvh_Latn",
1219
+ "Imbabura Highland Quichua (Latin)": "qvi_Latn",
1220
+ "Loja Highland Quichua (Latin)": "qvj_Latn",
1221
+ "Cajatambo North Lima Quechua (Latin)": "qvl_Latn",
1222
+ "Margos-Yarowilca-Lauricocha Quechua (Latin)": "qvm_Latn",
1223
+ "North Junín Quechua (Latin)": "qvn_Latn",
1224
+ "Napo Lowland Quechua (Latin)": "qvo_Latn",
1225
+ "San Martín Quechua (Latin)": "qvs_Latn",
1226
+ "Huaylla Wanca Quechua (Latin)": "qvw_Latn",
1227
+ "Yauyos Quechua (Latin)": "qvz_Latn",
1228
+ "Corongo Ancash Quechua (Latin)": "qwa_Latn",
1229
+ "Huaylas Ancash Quechua (Latin)": "qwh_Latn",
1230
+ "Sihuas Ancash Quechua (Latin)": "qws_Latn",
1231
+ "Chiquián Ancash Quechua (Latin)": "qxa_Latn",
1232
+ "Southern Conchucos Ancash Quechua (Latin)": "qxh_Latn",
1233
+ "Northern Conchucos Ancash Quechua (Latin)": "qxl_Latn",
1234
+ "Puno Quechua (Latin)": "qxn_Latn",
1235
+ "Southern Pastaza Quechua (Latin)": "qxo_Latn",
1236
+ "Puno Quechua (Latin)": "qxp_Latn",
1237
+ "Pacaraos Quechua (Latin)": "qxr_Latn",
1238
+ "Santa Ana de Tusi Pasco Quechua (Latin)": "qxt_Latn",
1239
+ "Arequipa-La Unión Quechua (Latin)": "qxu_Latn",
1240
+ "Jauja Wanca Quechua (Latin)": "qxw_Latn",
1241
+ "Rāga (Latin)": "rag_Latn",
1242
+ "Rahambuu (Bengali)": "rah_Beng",
1243
+ "Ramoaaina (Latin)": "rai_Latn",
1244
+ "Rapa Nui (Latin)": "rap_Latn",
1245
+ "Rawang (Devanagari)": "rav_Deva",
1246
+ "Rawang (Latin)": "raw_Latn",
1247
+ "Rejang (Latin)": "rej_Latn",
1248
+ "Rendille (Latin)": "rel_Latn",
1249
+ "Raguile (Latin)": "rgu_Latn",
1250
+ "Rohingya (Latin)": "rhg_Latn",
1251
+ "Tarifit (Arabic)": "rif_Arab",
1252
+ "Tarifit (Latin)": "rif_Latn",
1253
+ "Riang (India) (Latin)": "rim_Latn",
1254
+ "Riang (India) (Devanagari)": "rjs_Deva",
1255
+ "Rangpuri (Bengali)": "rkt_Beng",
1256
+ "Carpathian Romani (Cyrillic)": "rmc_Cyrl",
1257
+ "Carpathian Romani (Latin)": "rmc_Latn",
1258
+ "Traveller Norwegian (Latin)": "rmo_Latn",
1259
+ "Romany (Cyrillic)": "rmy_Cyrl",
1260
+ "Romany (Latin)": "rmy_Latn",
1261
+ "Roon (Latin)": "rng_Latn",
1262
+ "Roon (Latin)": "rnl_Latn",
1263
+ "Tae' (Latin)": "rob_Latn",
1264
+ "Rombo (Latin)": "rof_Latn",
1265
+ "Romansh (Latin, surs1244)": "roh_Latn_surs1244",
1266
+ "Romblomanon (Latin)": "rol_Latn",
1267
+ "Romanian (Latin)": "ron_Latn",
1268
+ "Rongga (Latin)": "roo_Latn",
1269
+ "Kriol (Latin)": "rop_Latn",
1270
+ "Rotokas (Latin)": "rro_Latn",
1271
+ "Rathawi (Latin)": "rth_Latn",
1272
+ "Rusyn (Latin)": "rub_Latn",
1273
+ "Ruuli (Latin)": "ruc_Latn",
1274
+ "Rufiji (Latin)": "ruf_Latn",
1275
+ "Ruga (Latin)": "rug_Latn",
1276
+ "Rundi (Latin)": "run_Latn",
1277
+ "Russian (Cyrillic)": "rus_Cyrl",
1278
+ "Mbwela (Latin)": "rwm_Latn",
1279
+ "Marwari (India) (Devanagari)": "rwr_Deva",
1280
+ "Saba (Latin)": "sab_Latn",
1281
+ "Sango (Latin)": "sag_Latn",
1282
+ "Yakut (Cyrillic)": "sah_Cyrl",
1283
+ "Sahu (Latin)": "saj_Latn",
1284
+ "Samburu (Latin)": "saq_Latn",
1285
+ "Sasak (Latin)": "sas_Latn",
1286
+ "Sause (Latin)": "sau_Latn",
1287
+ "Sayula Popoluca (Latin)": "say_Latn",
1288
+ "Ngambay (Latin)": "sba_Latn",
1289
+ "Simbo (Latin)": "sbd_Latn",
1290
+ "Sagala (Latin)": "sbl_Latn",
1291
+ "Sindhi Bhil (Arabic)": "sbn_Arab",
1292
+ "Sangu (Tanzania) (Latin)": "sbp_Latn",
1293
+ "Sangu (Gabon) (Latin)": "sch_Latn",
1294
+ "Sadri (Devanagari)": "sck_Deva",
1295
+ "Shina (Arabic)": "scl_Arab",
1296
+ "Sicilian (Latin)": "scn_Latn",
1297
+ "Scots (Latin)": "sco_Latn",
1298
+ "Sandawe (Latin)": "sda_Latn",
1299
+ "Sardo-logudorese (Latin)": "sdo_Latn",
1300
+ "Semai (Latin)": "sea_Latn",
1301
+ "Sena (Latin)": "seh_Latn",
1302
+ "Sena (Latin)": "sei_Latn",
1303
+ "Serrano (Latin)": "ses_Latn",
1304
+ "Serrano (Latin)": "sey_Latn",
1305
+ "Sangu (Gabon) (Latin)": "sgb_Latn",
1306
+ "Surgujia (Devanagari)": "sgj_Deva",
1307
+ "Suri (Ethiopic)": "sgw_Ethi",
1308
+ "Tachelhit (Latin)": "shi_Latn",
1309
+ "Sheko (Latin)": "shk_Latn",
1310
+ "Shan (Myanmar)": "shn_Mymr",
1311
+ "Shanga (Latin)": "sho_Latn",
1312
+ "Sala (Latin)": "shp_Latn",
1313
+ "Sidamo (Latin)": "sid_Latn",
1314
+ "Serrano (Latin)": "sig_Latn",
1315
+ "Tumulung Sisaala (Latin)": "sil_Latn",
1316
+ "Sinhala (Sinhala)": "sin_Sinh",
1317
+ "Sikkimese (Tibetan)": "sip_Tibt",
1318
+ "Siwa (Latin)": "siw_Latn",
1319
+ "Soli (Latin)": "sja_Latn",
1320
+ "Simaa (Latin)": "sjm_Latn",
1321
+ "Surjapuri (Devanagari)": "sjp_Deva",
1322
+ "Siar-Lak (Latin)": "sjr_Latn",
1323
+ "Seke (Vanuatu) (Latin)": "skg_Latn",
1324
+ "Saraiki (Arabic)": "skr_Arab",
1325
+ "Sáliba (Latin)": "sld_Latn",
1326
+ "Slovak (Latin)": "slk_Latn",
1327
+ "Selaru (Latin)": "slu_Latn",
1328
+ "Slovenian (Latin)": "slv_Latn",
1329
+ "Sama (Latin)": "sml_Latn",
1330
+ "Samoan (Latin)": "smo_Latn",
1331
+ "Shona (Latin)": "sna_Latn",
1332
+ "Sanga (Nigeria) (Latin)": "snc_Latn",
1333
+ "Sindhi (Arabic)": "snd_Arab",
1334
+ "Bau Bidayuh (Latin)": "sne_Latn",
1335
+ "Soninke (Latin)": "snk_Latn",
1336
+ "Siona (Latin)": "snn_Latn",
1337
+ "Siane (Latin)": "snp_Latn",
1338
+ "Sauk (Latin)": "snv_Latn",
1339
+ "Sauk (Latin)": "snw_Latn",
1340
+ "Solos (Latin)": "sol_Latn",
1341
+ "Somali (Latin)": "som_Latn",
1342
+ "Songe (Latin)": "soy_Latn",
1343
+ "Spanish (Latin)": "spa_Latn",
1344
+ "Sian (Latin)": "spp_Latn",
1345
+ "Saponi (Latin)": "sps_Latn",
1346
+ "Sabaot (Latin)": "spy_Latn",
1347
+ "Sardinian (Latin)": "src_Latn",
1348
+ "Sardinian (Latin)": "srd_Latn",
1349
+ "Sera (Latin)": "sri_Latn",
1350
+ "Saramaccan (Latin)": "srm_Latn",
1351
+ "Sranan Tongo (Latin)": "srn_Latn",
1352
+ "Sarsuti (Latin)": "sro_Latn",
1353
+ "Serbian (Cyrillic)": "srp_Cyrl",
1354
+ "Serer (Latin)": "srr_Latn",
1355
+ "Seraiki (Devanagari)": "srx_Deva",
1356
+ "Siri (Arabic)": "ssi_Arab",
1357
+ "Seta (Latin)": "ste_Latn",
1358
+ "Sateré-Mawé (Latin)": "stn_Latn",
1359
+ "Stieng (Latin)": "stp_Latn",
1360
+ "Sua (Latin)": "sua_Latn",
1361
+ "Suku (Latin)": "suc_Latn",
1362
+ "Sukuma (Latin)": "suk_Latn",
1363
+ "Sundanese (Latin)": "sun_Latn",
1364
+ "Suri (Latin)": "sur_Latn",
1365
+ "Susu (Latin)": "sus_Latn",
1366
+ "Susu (Latin)": "suv_Latn",
1367
+ "Sunwar (Devanagari)": "suz_Deva",
1368
+ "Svan (Georgian)": "sva_Geor",
1369
+ "Swedish (Latin)": "swe_Latn",
1370
+ "Swahili (macrolanguage) (Latin)": "swh_Latn",
1371
+ "Seraiki (Devanagari)": "swv_Deva",
1372
+ "Sumbwa (Latin)": "sxb_Latn",
1373
+ "Sicanian (Latin)": "sxn_Latn",
1374
+ "Sighu (Latin)": "sya_Latn",
1375
+ "Sylheti (Latin)": "syl_Latn",
1376
+ "Saurashtra (Latin)": "sza_Latn",
1377
+ "Saurashtra (Latin)": "szy_Latn",
1378
+ "Tuma-Irumu (Latin)": "tac_Latn",
1379
+ "Tajio (Devanagari)": "taj_Deva",
1380
+ "Tamil (Tamil)": "tam_Taml",
1381
+ "Tana (Latin)": "tan_Latn",
1382
+ "Tangale (Latin)": "tao_Latn",
1383
+ "Taabwa (Latin)": "tap_Latn",
1384
+ "Tarahumara (Latin)": "taq_Latn",
1385
+ "Central Tarahumara (Latin)": "tar_Latn",
1386
+ "Tatar (Cyrillic)": "tat_Cyrl",
1387
+ "Tatuyo (Latin)": "tav_Latn",
1388
+ "Tay (Latin)": "tay_Latn",
1389
+ "Taliabu (Latin)": "tbc_Latn",
1390
+ "Kbo (Latin)": "tbf_Latn",
1391
+ "Tairora (Latin)": "tbg_Latn",
1392
+ "Tboli (Latin)": "tbk_Latn",
1393
+ "Tboli (Latin)": "tbl_Latn",
1394
+ "Tagbu (Latin)": "tby_Latn",
1395
+ "Ditammari (Latin)": "tbz_Latn",
1396
+ "Ticuna (Latin)": "tca_Latn",
1397
+ "Datooga (Latin)": "tcc_Latn",
1398
+ "Malagasy, Tsimihety (Latin)": "tcf_Latn",
1399
+ "Tulu (Malayalam)": "tcy_Mlym",
1400
+ "Are'are (Latin)": "tcz_Latn",
1401
+ "Tidong (Latin)": "tdj_Latn",
1402
+ "Tandaganon (Latin)": "tdn_Latn",
1403
+ "Tandroy-Mahafaly Malagasy (Latin)": "tdx_Latn",
1404
+ "Tepo Krumen (Latin)": "ted_Latn",
1405
+ "Teressa (Latin)": "tee_Latn",
1406
+ "Telugu (Telugu)": "tel_Telu",
1407
+ "Timne (Latin)": "tem_Latn",
1408
+ "Teso (Latin)": "teo_Latn",
1409
+ "Teso (Latin)": "ter_Latn",
1410
+ "Tewa (USA) (Latin)": "tew_Latn",
1411
+ "Tennet (Latin)": "tex_Latn",
1412
+ "Terik (Latin)": "tfr_Latn",
1413
+ "Ternate (Latin)": "tgc_Latn",
1414
+ "Togoyo (Latin)": "tgj_Latn",
1415
+ "Tajik (Cyrillic)": "tgk_Cyrl",
1416
+ "Tagalog (Latin)": "tgl_Latn",
1417
+ "Togoyo (Latin)": "tgo_Latn",
1418
+ "Togoyo (Latin)": "tgp_Latn",
1419
+ "Thai (Thai)": "tha_Thai",
1420
+ "Tharu (Devanagari)": "the_Deva",
1421
+ "Tho (Latin)": "thk_Latn",
1422
+ "Tharu (Devanagari)": "thl_Deva",
1423
+ "Tharu (Devanagari)": "thq_Deva",
1424
+ "Tharu (Devanagari)": "thr_Deva",
1425
+ "Thangmi (Tifinagh)": "thv_Tfng",
1426
+ "Tigre (Ethiopic)": "tig_Ethi",
1427
+ "Timugon Murut (Latin)": "tih_Latn",
1428
+ "Tii (Latin)": "tik_Latn",
1429
+ "Tillamook (Latin)": "tio_Latn",
1430
+ "Tigrinya (Ethiopic)": "tir_Ethi",
1431
+ "Masaka (Latin)": "tkg_Latn",
1432
+ "Tukumanféd (Latin)": "tkr_Latn",
1433
+ "Takpa (Devanagari)": "tkt_Deva",
1434
+ "Tobo-Kube (Latin)": "tlb_Latn",
1435
+ "Tlingit (Latin)": "tli_Latn",
1436
+ "Talysh (Latin)": "tlj_Latn",
1437
+ "Taloki (Latin)": "tlp_Latn",
1438
+ "Talysh (Latin)": "tly_Latn",
1439
+ "Tumak (Latin)": "tmc_Latn",
1440
+ "Toba-Maskoy (Latin)": "tmf_Latn",
1441
+ "Tasmate (Latin)": "tna_Latn",
1442
+ "Tonga (Nyasa) (Latin)": "tng_Latn",
1443
+ "Tenis (Latin)": "tnk_Latn",
1444
+ "Tonsawang (Latin)": "tnn_Latn",
1445
+ "Tontemboan (Latin)": "tnp_Latn",
1446
+ "Ménik (Latin)": "tnr_Latn",
1447
+ "Tenino (Latin)": "tnt_Latn",
1448
+ "Toba (Latin)": "tob_Latn",
1449
+ "Coyutla Totonac (Latin)": "toc_Latn",
1450
+ "Toma (Latin)": "toh_Latn",
1451
+ "Toki Pona (Latin)": "tok_Latn",
1452
+ "Tomini (Latin)": "tom_Latn",
1453
+ "Xicotepec De Juárez Totonac (Latin)": "top_Latn",
1454
+ "Tukumanféd (Latin)": "tos_Latn",
1455
+ "Tok Pisin (Latin)": "tpi_Latn",
1456
+ "Tukumanféd (Latin)": "tpl_Latn",
1457
+ "Tampulma (Latin)": "tpm_Latn",
1458
+ "Tukumanféd (Latin)": "tpp_Latn",
1459
+ "Tukumanféd (Latin)": "tpt_Latn",
1460
+ "Tukumanféd (Latin)": "tpz_Latn",
1461
+ "Tukumanféd (Latin)": "tqp_Latn",
1462
+ "Trio (Latin)": "trc_Latn",
1463
+ "Turi (Latin)": "tri_Latn",
1464
+ "Torona (Latin)": "trn_Latn",
1465
+ "Trumai (Latin)": "trp_Latn",
1466
+ "Tregami (Latin)": "trq_Latn",
1467
+ "Tirahi (Latin)": "trs_Latn",
1468
+ "Trukhmen (Latin)": "trv_Latn",
1469
+ "Torwali (Arabic)": "trw_Arab",
1470
+ "Tswana (Latin)": "tsn_Latn",
1471
+ "Tsonga (Latin)": "tso_Latn",
1472
+ "Tsuvan (Latin)": "tsz_Latn",
1473
+ "Tswa (Latin)": "ttc_Latn",
1474
+ "Tutelo (Latin)": "tte_Latn",
1475
+ "Tooro (Latin)": "ttj_Latn",
1476
+ "Tawallammat Tamajaq (Tifinagh)": "ttq_Tfng",
1477
+ "Tutoro (Latin)": "ttr_Latn",
1478
+ "Wotu (Latin)": "ttu_Latn",
1479
+ "Tübatulabal (Latin)": "tue_Latn",
1480
+ "Tübatulabal (Latin)": "tuf_Latn",
1481
+ "Tugutil (Latin)": "tui_Latn",
1482
+ "Turkmen (Arabic)": "tuk_Arab",
1483
+ "Turkmen (Latin)": "tuk_Latn",
1484
+ "Tula (Latin)": "tul_Latn",
1485
+ "Tumbuka (Latin)": "tuo_Latn",
1486
+ "Tedaga (Latin)": "tuq_Latn",
1487
+ "Turkish (Latin)": "tur_Latn",
1488
+ "Tuxináwa (Latin)": "tuv_Latn",
1489
+ "Tuxináwa (Latin)": "tuy_Latn",
1490
+ "Tungus languages (Latin)": "tvo_Latn",
1491
+ "Tungus languages (Latin)": "tvu_Latn",
1492
+ "Tungus languages (Latin)": "tvw_Latn",
1493
+ "Tawbuid (Latin)": "twb_Latn",
1494
+ "Twents (Latin)": "twe_Latn",
1495
+ "Tungus languages (Latin)": "twu_Latn",
1496
+ "Tewe (Latin)": "txa_Latn",
1497
+ "Tombonuo (Latin)": "txq_Latn",
1498
+ "Tartessian (Latin)": "txs_Latn",
1499
+ "Kayapó (Latin)": "txu_Latn",
1500
+ "Tanosy Malagasy (Latin)": "txy_Latn",
1501
+ "Tauya (Latin)": "tye_Latn",
1502
+ "Tzeltal (Latin)": "tzh_Latn",
1503
+ "Tz'utujil (Latin)": "tzj_Latn",
1504
+ "Tzotzil (Latin)": "tzo_Latn",
1505
+ "Ubi (Latin)": "ubl_Latn",
1506
+ "Ubang (Latin)": "ubu_Latn",
1507
+ "Ujir (Latin)": "udl_Latn",
1508
+ "Udmurt (Cyrillic)": "udm_Cyrl",
1509
+ "Uduk (Latin)": "udu_Latn",
1510
+ "Uighur (Arabic)": "uig_Arab",
1511
+ "Uighur (Cyrillic)": "uig_Cyrl",
1512
+ "Ukuriguma (Oriya)": "uki_Orya",
1513
+ "Ukrainian (Cyrillic)": "ukr_Cyrl",
1514
+ "Ukuriguma (Latin)": "ukv_Latn",
1515
+ "Umbundu (Latin)": "umb_Latn",
1516
+ "Uripiv-Wala-Rano-Atchin (Latin)": "upv_Latn",
1517
+ "Ura (Vanuatu) (Latin)": "ura_Latn",
1518
+ "Urubú-Kaapor (Latin)": "urb_Latn",
1519
+ "Urdu (Arabic)": "urd_Arab",
1520
+ "Urdu (Devanagari)": "urd_Deva",
1521
+ "Urdu (Latin)": "urd_Latn",
1522
+ "Urhobo (Latin)": "urh_Latn",
1523
+ "Urak Lawoi' (Thai)": "urk_Thai",
1524
+ "Urat (Latin)": "urt_Latn",
1525
+ "Uru (Latin)": "ury_Latn",
1526
+ "Ushojo (Arabic)": "ush_Arab",
1527
+ "Uspanteco (Latin)": "usp_Latn",
1528
+ "Uzbek (Cyrillic)": "uzb_Cyrl",
1529
+ "Uzbek (Latin)": "uzb_Latn",
1530
+ "Northern Uzbek (Latin)": "uzn_Latn",
1531
+ "Vagla (Latin)": "vag_Latn",
1532
+ "Varhadi-Nagpuri (Devanagari)": "vah_Deva",
1533
+ "Vehes (Latin)": "vai_Latn",
1534
+ "Varli (Latin)": "var_Latn",
1535
+ "Veluws (Latin)": "ver_Latn",
1536
+ "Vinde (Latin)": "vid_Latn",
1537
+ "Vietnamese (Latin)": "vie_Latn",
1538
+ "Vili (Latin)": "vif_Latn",
1539
+ "Viemo (Latin)": "vmc_Latn",
1540
+ "Juxtlahuaca Mixtec (Latin)": "vmj_Latn",
1541
+ "Mitlatongo Mixtec (Latin)": "vmm_Latn",
1542
+ "Soyaltepec Mazatec (Latin)": "vmp_Latn",
1543
+ "Makhuwa (Latin)": "vmw_Latn",
1544
+ "Soyaltepec Mazatec (Latin)": "vmy_Latn",
1545
+ "Soyaltepec Mazatec (Latin)": "vmz_Latn",
1546
+ "Võro (Latin)": "vro_Latn",
1547
+ "Vunjo (Latin)": "vun_Latn",
1548
+ "Vute (Latin)": "vut_Latn",
1549
+ "Wolaytta (Ethiopic)": "wal_Ethi",
1550
+ "Wolaytta (Latin)": "wal_Latn",
1551
+ "Wapishana (Latin)": "wap_Latn",
1552
+ "Waray (Philippines) (Latin)": "war_Latn",
1553
+ "Walla Walla (Latin)": "waw_Latn",
1554
+ "Wayana (Latin)": "way_Latn",
1555
+ "Warao (Latin)": "wba_Latn",
1556
+ "Wakhi (Latin)": "wbl_Latn",
1557
+ "Wagdi (Devanagari)": "wbr_Deva",
1558
+ "Waci Gbe (Latin)": "wci_Latn",
1559
+ "Wè Western (Latin)": "weo_Latn",
1560
+ "Wewaw (Latin)": "wes_Latn",
1561
+ "Wajan (Latin)": "wja_Latn",
1562
+ "Warji (Latin)": "wji_Latn",
1563
+ "Walloon (Latin)": "wlo_Latn",
1564
+ "Wolio (Latin)": "wlx_Latn",
1565
+ "Womo (Latin)": "wmw_Latn",
1566
+ "Wobé (Latin)": "wob_Latn",
1567
+ "Wolof (Latin)": "wof_Latn",
1568
+ "Wolof (Latin)": "wol_Latn",
1569
+ "Wagdi (Telugu)": "wsg_Telu",
1570
+ "Wassa (Latin)": "wwa_Latn",
1571
+ "Kalmyk (Cyrillic)": "xal_Cyrl",
1572
+ "Kayan Mahakam (Latin)": "xdy_Latn",
1573
+ "Xerénte (Latin)": "xed_Latn",
1574
+ "Xerénte (Latin)": "xer_Latn",
1575
+ "Khetrani (Arabic)": "xhe_Arab",
1576
+ "Xhosa (Latin)": "xho_Latn",
1577
+ "Kalkoti (Arabic)": "xka_Arab",
1578
+ "Kalkoti (Latin)": "xkl_Latn",
1579
+ "Mingrelian (Georgian)": "xmf_Geor",
1580
+ "Malay (macrolanguage), Malaccan (Latin)": "xmm_Latn",
1581
+ "Mean (Latin)": "xmv_Latn",
1582
+ "Kenyan Sign Language (Latin)": "xnj_Latn",
1583
+ "Kanjar (Devanagari)": "xnr_Deva",
1584
+ "Xhosa (Latin)": "xog_Latn",
1585
+ "Komo (Sudan) (Latin)": "xon_Latn",
1586
+ "Kpelle (Latin)": "xpe_Latn",
1587
+ "Karahawyana (Latin)": "xrb_Latn",
1588
+ "Samberigi (Latin)": "xsb_Latn",
1589
+ "Samberigi (Latin)": "xsm_Latn",
1590
+ "Sherpa (Devanagari)": "xsr_Deva",
1591
+ "Sukur (Latin)": "xsu_Latn",
1592
+ "Alcozauca Mixtec (Latin)": "xta_Latn",
1593
+ "Diuxi-Tilantongo Mixtec (Latin)": "xtd_Latn",
1594
+ "Ketengban (Latin)": "xte_Latn",
1595
+ "Sino-Tibetan languages (Latin)": "xti_Latn",
1596
+ "Tidaá Mixtec (Latin)": "xtm_Latn",
1597
+ "Diuxi-Tilantongo Mixtec (Latin)": "xtn_Latn",
1598
+ "Cuyamecalco Mixtec (Latin)": "xtu_Latn",
1599
+ "Alcozauca Mixtec (Tamil)": "xua_Taml",
1600
+ "Kuo (Latin)": "xuo_Latn",
1601
+ "Yaminahua (Latin)": "yaa_Latn",
1602
+ "Yagua (Latin)": "yad_Latn",
1603
+ "Yalunka (Latin)": "yal_Latn",
1604
+ "Yamba (Latin)": "yam_Latn",
1605
+ "Yao (Latin)": "yao_Latn",
1606
+ "Yagua (Latin)": "yaq_Latn",
1607
+ "Yagua (Latin)": "yas_Latn",
1608
+ "Yagua (Latin)": "yat_Latn",
1609
+ "Yavanawa (Latin)": "yav_Latn",
1610
+ "Yei (Latin)": "yay_Latn",
1611
+ "Yazgulyam (Latin)": "yaz_Latn",
1612
+ "Yala (Latin)": "yba_Latn",
1613
+ "Yemba (Latin)": "ybb_Latn",
1614
+ "Yucatec Maya Sign Language (Latin)": "ycl_Latn",
1615
+ "Yucuna (Latin)": "ycn_Latn",
1616
+ "Yiddish (Hebrew)": "ydd_Hebr",
1617
+ "Yidgha (Arabic)": "ydg_Arab",
1618
+ "Yennu (Malayalam)": "yea_Mlym",
1619
+ "Yenisei Say (Latin)": "yer_Latn",
1620
+ "Yeskwa (Latin)": "yes_Latn",
1621
+ "Yaka (Congo) (Latin)": "yka_Latn",
1622
+ "Yalo (Latin)": "yli_Latn",
1623
+ "Yoruba (Latin)": "yor_Latn",
1624
+ "Yarí (Latin)": "yre_Latn",
1625
+ "Yucateco (Latin)": "yua_Latn",
1626
+ "Yue Chinese (Han)": "yue_Hans",
1627
+ "Yue Chinese (Han)": "yue_Hant",
1628
+ "Yuracare (Latin)": "yuz_Latn",
1629
+ "Yawa (Latin)": "yva_Latn",
1630
+ "Zapotec (Latin)": "zaa_Latn",
1631
+ "Zapotec (Latin)": "zab_Latn",
1632
+ "Ocotlán Zapotec (Latin)": "zac_Latn",
1633
+ "Cajonos Zapotec (Latin)": "zad_Latn",
1634
+ "Yareni Zapotec (Latin)": "zae_Latn",
1635
+ "Isthmus Zapotec (Latin)": "zai_Latn",
1636
+ "Miahuatlán Zapotec (Latin)": "zam_Latn",
1637
+ "Ozolotepec Zapotec (Latin)": "zao_Latn",
1638
+ "Aloápam Zapotec (Latin)": "zaq_Latn",
1639
+ "Rincón Zapotec (Latin)": "zar_Latn",
1640
+ "Santo Domingo Albarradas Zapotec (Latin)": "zas_Latn",
1641
+ "Yatzachi Zapotec (Latin)": "zav_Latn",
1642
+ "Zay (Latin)": "zaw_Latn",
1643
+ "Choapan Zapotec (Latin)": "zca_Latn",
1644
+ "Zhigulevsk (Latin)": "zga_Latn",
1645
+ "Zimza (Latin)": "zim_Latn",
1646
+ "Zinza (Latin)": "ziw_Latn",
1647
+ "Zialo (Latin)": "zmz_Latn",
1648
+ "Zande (macrolanguage) (Latin)": "zne_Latn",
1649
+ "Zoque (Latin)": "zoc_Latn",
1650
+ "Zoque (Latin)": "zoh_Latn",
1651
+ "Zoque (Latin)": "zor_Latn",
1652
+ "Zoque (Latin)": "zos_Latn",
1653
+ "Coatecas Altas Zapotec (Latin)": "zpc_Latn",
1654
+ "Guevea De Humboldt Zapotec (Latin)": "zpg_Latn",
1655
+ "Santa María Quiegolani Zapotec (Latin)": "zpi_Latn",
1656
+ "Lachixío Zapotec (Latin)": "zpl_Latn",
1657
+ "Mixtepec Zapotec (Latin)": "zpm_Latn",
1658
+ "Choapan Zapotec (Latin)": "zpo_Latn",
1659
+ "El Alto Zapotec (Latin)": "zpt_Latn",
1660
+ "San Vicente Coatlán Zapotec (Latin)": "zpv_Latn",
1661
+ "Chichicapan Zapotec (Latin)": "zpy_Latn",
1662
+ "Mazaltepec Zapotec (Latin)": "zpz_Latn",
1663
+ "Standard Malay (Latin)": "zsm_Latn",
1664
+ "Tlacolulita Zapotec (Latin)": "ztg_Latn",
1665
+ "Tataltepec Zapotec (Latin)": "ztn_Latn",
1666
+ "Tilquiapan Zapotec (Latin)": "ztp_Latn",
1667
+ "Quiavicuzas Zapotec (Latin)": "ztq_Latn",
1668
+ "Samo (Latin)": "zts_Latn",
1669
+ "Samo (Latin)": "ztu_Latn",
1670
+ "Yalálag Zapotec (Latin)": "zty_Latn",
1671
+ "Zulu (Latin)": "zul_Latn",
1672
+ "Yongbei Zhuang (Latin)": "zyb_Latn",
1673
+ "Yongbei Zhuang (Latin)": "zyp_Latn",
1674
+ "Zhuang (Latin)": "zza_Latn"
1675
+ }
server/media_transcription_processor.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Media Transcription Processor
3
+
4
+ Pipeline-focused transcription processor that maintains state through processing stages
5
+ while exposing intermediate results for flexibility and ensuring proper resource cleanup.
6
+ """
7
+
8
+ import base64
9
+ import logging
10
+ import os
11
+ from typing import Dict, List, Optional
12
+
13
+ import numpy as np
14
+ import torch
15
+ from audio_transcription import transcribe_full_audio_with_chunking
16
+ from convert_media_to_wav import convert_media_to_wav_from_bytes
17
+ from inference.audio_reading_tools import wav_to_bytes
18
+ from transcription_status import transcription_status
19
+
20
+
21
class MediaTranscriptionProcessor:
    """
    Pipeline-focused transcription processor.

    Maintains state through the processing stages (media conversion ->
    chunked transcription -> result assembly) while exposing intermediate
    results for flexibility and ensuring temporary resources are cleaned up.
    Usable as a context manager so cleanup always runs.
    """

    # Maximum duration (in seconds) before a running transcription is
    # considered stuck and force-finished by get_server_status().
    # float("inf") disables the timeout entirely (useful for long meetings);
    # set e.g. 120 or 3600 to re-enable it.
    MAX_TRANSCRIPTION_DURATION = float("inf")

    def __init__(self, media_bytes: bytes, filename: str, language_with_script: Optional[str] = None):
        """Initialize processor with raw media data and metadata.

        Args:
            media_bytes: Raw bytes of the uploaded media file.
            filename: Original filename (used for logging and status only).
            language_with_script: Optional language code such as "eng_Latn",
                passed through to the transcription pipeline.
        """
        # Core input data
        self.media_bytes = media_bytes
        self.original_filename = filename
        self.language_with_script = language_with_script

        # Processing state - lazily populated by the pipeline stages.
        self._temp_wav_path: Optional[str] = None
        self._audio_tensor: Optional[torch.Tensor] = None
        self._audio_numpy: Optional[np.ndarray] = None
        self._sample_rate: int = 16000  # pipeline standardizes audio to 16 kHz
        self._duration: Optional[float] = None
        self._chunks: Optional[List] = None
        self._transcription_results: Optional[Dict] = None
        self._error: Optional[str] = None

        # Resource tracking for cleanup
        self._temp_files: List[str] = []
        self._cleanup_performed = False

        # Transcription status management
        self._status_initialized = False

    def start_transcription(self):
        """Initialize transcription status tracking (idempotent)."""
        if not self._status_initialized:
            transcription_status.start_transcription("transcribe", self.original_filename)
            self._status_initialized = True

    def update_progress(self, progress: float):
        """Update transcription progress (0.0 - 1.0) in the shared status tracker."""
        transcription_status.update_progress(progress)

    @staticmethod
    def is_server_busy() -> bool:
        """
        Check if the server is currently busy with another transcription.

        Delegates to get_server_status(), which includes timeout handling:
        a transcription running longer than MAX_TRANSCRIPTION_DURATION is
        force-finished before the busy flag is reported.
        """
        status = MediaTranscriptionProcessor.get_server_status()
        return status.get("is_busy", False)

    @staticmethod
    def get_server_status() -> dict:
        """
        Get current server transcription status with timeout handling.

        If a transcription has been running longer than
        MAX_TRANSCRIPTION_DURATION, it is force-finished to prevent the
        server from being stuck in a busy state indefinitely; the returned
        status then carries "force_finished" and "reason" keys.
        """
        status = transcription_status.get_status()

        # Check if the current transcription has exceeded the allowed duration.
        if (status.get("is_busy", False) and
                status.get("duration_seconds", 0) > MediaTranscriptionProcessor.MAX_TRANSCRIPTION_DURATION):

            logger = logging.getLogger(__name__)
            logger.warning(
                f"Force-finishing stuck transcription after {status.get('duration_seconds', 0):.1f}s "
                f"(max: {MediaTranscriptionProcessor.MAX_TRANSCRIPTION_DURATION}s). "
                f"Operation: {status.get('current_operation')}, "
                f"File: {status.get('current_filename')}"
            )

            # Force finish the transcription so the server becomes available again.
            transcription_status.finish_transcription()

            # Get updated status and annotate why it was reset.
            status = transcription_status.get_status()
            status["force_finished"] = True
            status["reason"] = f"Transcription exceeded maximum duration of {MediaTranscriptionProcessor.MAX_TRANSCRIPTION_DURATION}s"

        return status

    def convert_media(self) -> 'MediaTranscriptionProcessor':
        """
        Stage 1: Convert media bytes to a standardized WAV file + audio tensor.

        Idempotent: returns immediately if conversion already happened.

        Returns:
            Self for method chaining.

        Raises:
            RuntimeError: With a user-friendly message when conversion fails.
        """
        if self._temp_wav_path is not None:
            # Already converted
            return self

        logger = logging.getLogger(__name__)
        logger.info(f"Converting media file: {self.original_filename}")

        # Update progress if status is initialized
        if self._status_initialized:
            self.update_progress(0.1)

        try:
            # Convert media bytes to WAV and tensor
            temp_wav_path, audio_tensor = convert_media_to_wav_from_bytes(
                self.media_bytes, self.original_filename
            )

            # Store results and track the temp file for later cleanup
            self._temp_wav_path = temp_wav_path
            self._audio_tensor = audio_tensor
            self._temp_files.append(temp_wav_path)

            # Calculate duration from tensor (assumes 1-D tensor of samples
            # at self._sample_rate — TODO confirm against converter output)
            if audio_tensor is not None:
                self._duration = len(audio_tensor) / self._sample_rate

            # BUGFIX: guard the f-string — _duration is None when the
            # converter returned no tensor, and None:.2f raises TypeError.
            if self._duration is not None:
                logger.info(f"Media conversion completed: {self.original_filename} -> {self._duration:.2f}s")
            else:
                logger.info(f"Media conversion completed: {self.original_filename} (duration unknown)")

            # Update progress if status is initialized
            if self._status_initialized:
                self.update_progress(0.2)

        except Exception as e:
            logger.error(f"Media conversion failed for {self.original_filename}: {str(e)}")

            # Provide user-friendly error message based on the error type
            if "ffmpeg returned error code" in str(e).lower():
                error_msg = (
                    f"Audio/video conversion failed for '{self.original_filename}'. "
                    f"The file may have an unsupported audio codec or be corrupted. "
                    f"Please try converting the file to a standard format (MP3, WAV, MP4) before uploading. "
                    f"For best results, use files with common codecs: "
                    f"Audio - AAC, MP3, PCM, FLAC; Video - H.264/AAC (MP4), standard codecs. "
                    f"Avoid proprietary, DRM-protected, or very old codec variants."
                )
            else:
                error_msg = f"Failed to process media file '{self.original_filename}'"

            error_msg += f"\nTechnical Details: {str(e)}"

            # Store the error for later retrieval
            self._error = error_msg
            raise RuntimeError(error_msg)

        return self

    def get_wav_path(self) -> str:
        """Get the temporary WAV file path (converts media if needed)."""
        if self._temp_wav_path is None:
            self.convert_media()
        return self._temp_wav_path

    def get_audio_tensor(self) -> torch.Tensor:
        """Get the standardized audio tensor (converts media if needed)."""
        if self._audio_tensor is None:
            self.convert_media()
        return self._audio_tensor

    def get_audio_numpy(self) -> np.ndarray:
        """Get audio as a numpy array (converted from the tensor on demand)."""
        if self._audio_numpy is None:
            tensor = self.get_audio_tensor()
            if tensor is not None:
                # Convert to numpy; .cpu() first when the tensor lives on a device.
                if hasattr(tensor, 'cpu'):
                    self._audio_numpy = tensor.cpu().numpy()
                else:
                    self._audio_numpy = tensor.numpy()
            else:
                self._audio_numpy = np.array([])
        return self._audio_numpy

    @property
    def duration(self) -> float:
        """Audio duration in seconds (0.0 when it could not be determined)."""
        if self._duration is None:
            self.convert_media()
        return self._duration or 0.0

    @property
    def sample_rate(self) -> int:
        """Audio sample rate in Hz (fixed at 16000 by the pipeline)."""
        return self._sample_rate

    def transcribe_full_pipeline(self) -> 'MediaTranscriptionProcessor':
        """
        Stage 2: Run the complete transcription pipeline with chunking.

        Idempotent: returns immediately if results already exist.

        Returns:
            Self for method chaining.
        """
        if self._transcription_results is not None:
            # Already transcribed
            return self

        logger = logging.getLogger(__name__)

        # Ensure media is converted (side effect populates the tensor too)
        wav_path = self.get_wav_path()

        logger.info(f"Starting transcription pipeline for: {self.original_filename}")

        # Use the preprocessed audio tensor instead of re-reading the WAV file
        audio_tensor = self.get_audio_tensor()

        # Run the full transcription with chunking using the tensor
        self._transcription_results = transcribe_full_audio_with_chunking(
            audio_tensor=audio_tensor,
            sample_rate=self._sample_rate,
            language_with_script=self.language_with_script,
        )

        logger.info(f"Transcription completed: {self._transcription_results.get('num_chunks', 0)} chunks")

        # Update progress if status is initialized
        if self._status_initialized:
            self.update_progress(0.9)

        return self

    def get_results(self, include_preprocessed_audio: bool = False) -> Dict:
        """
        Get final transcription results (runs the pipeline if needed).

        Args:
            include_preprocessed_audio: Whether to include base64-encoded
                preprocessed WAV data under the "preprocessed_audio" key.

        Returns:
            Complete transcription results dictionary, optionally with
            preprocessed audio attached.
        """
        if self._transcription_results is None:
            self.transcribe_full_pipeline()

        results = self._transcription_results or {}

        # Add preprocessed audio data if requested
        if include_preprocessed_audio and self._audio_tensor is not None:
            try:
                # Convert the preprocessed tensor to WAV bytes
                audio_tensor_cpu = self._audio_tensor.cpu() if self._audio_tensor.is_cuda else self._audio_tensor
                wav_bytes = wav_to_bytes(audio_tensor_cpu, sample_rate=self._sample_rate, format="wav")

                # Encode as base64.  NOTE(review): assumes wav_to_bytes
                # returns a buffer exposing .tobytes() (e.g. a numpy array),
                # not plain bytes — confirm against its implementation.
                audio_data_b64 = base64.b64encode(wav_bytes.tobytes()).decode('utf-8')

                results["preprocessed_audio"] = {
                    "data": audio_data_b64,
                    "format": "wav",
                    "sample_rate": self._sample_rate,
                    "duration": self.duration,
                    "size_bytes": len(wav_bytes)
                }

                logging.getLogger(__name__).info(f"Added preprocessed audio data: {len(wav_bytes)} bytes")

            except Exception as e:
                # Best-effort: failure to attach audio must not lose the transcript.
                logging.getLogger(__name__).warning(f"Failed to include preprocessed audio data: {e}")

        return results

    def cleanup(self):
        """Clean up all temporary files and resources (idempotent)."""
        if self._cleanup_performed:
            return

        logger = logging.getLogger(__name__)

        # Clean up temporary files
        for temp_file in self._temp_files:
            try:
                if os.path.exists(temp_file):
                    os.unlink(temp_file)
                    logger.debug(f"Cleaned up temp file: {temp_file}")
            except Exception as e:
                logger.warning(f"Failed to clean up temp file {temp_file}: {e}")

        # Finish transcription status - always call to ensure we don't get stuck.
        # It's better to be safe than risk leaving the server in a busy state.
        transcription_status.finish_transcription()
        self._status_initialized = False

        # Clear references to help garbage collection
        self._audio_tensor = None
        self._audio_numpy = None
        self._transcription_results = None
        self._chunks = None
        self._temp_files.clear()

        self._cleanup_performed = True
        logger.debug(f"Cleanup completed for: {self.original_filename}")

    def __enter__(self) -> 'MediaTranscriptionProcessor':
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit - ensures cleanup."""
        self.cleanup()

    def __del__(self):
        """Destructor - final cleanup attempt.

        BUGFIX: uses getattr with a safe default so a partially constructed
        instance (e.g. __init__ interrupted before _cleanup_performed was
        set) does not raise AttributeError during garbage collection.
        """
        if not getattr(self, "_cleanup_performed", True):
            self.cleanup()
server/requirements.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ flask==3.0.0
2
+ flask-cors==4.0.0
3
+ gunicorn==21.2.0
4
+
5
+ # Audio processing
6
+ torchaudio<=2.8.0
7
+ torchcodec
8
+ librosa==0.10.1
9
+ soundfile==0.12.1
10
+ audioread>=3.0.0
11
+ pydub>=0.25.1
12
+
13
+ # VAD and audio chunking
14
+ silero-vad>=4.0.0
15
+ onnxruntime>=1.12.0
16
+
17
+ # Text processing
18
+ uroman
19
+
20
+ # Data structures and utilities
21
+ # dataclasses  # NOTE: part of the stdlib since Python 3.7; the PyPI backport is only for 3.6 and can shadow/break the built-in module on newer versions
22
+ pandas
23
+ xxhash
24
+ requests==2.31.0
server/subtitle.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
def convert_time_to_srt_format(seconds):
    """Render a duration in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    whole = int(seconds)
    ms = round((seconds - whole) * 1000)
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)

    # round() can yield a full second of milliseconds; ripple the carry up
    # through seconds, minutes and hours.
    if ms == 1000:
        ms, s = 0, s + 1
    if s == 60:
        s, m = 0, m + 1
    if m == 60:
        m, h = 0, h + 1

    return "{:02}:{:02}:{:02},{:03}".format(h, m, s, ms)
19
+
20
def word_level_srt(words_timestamp, srt_path="word_level_subtitle.srt", shorts=False):
    """Write an SRT file to *srt_path* with one word per subtitle entry."""
    strip_punct = re.compile(r'[.,!?;:"\–—_~^+*|]')
    with open(srt_path, 'w', encoding='utf-8') as out:
        index = 1
        for entry in words_timestamp:
            begin = convert_time_to_srt_format(entry['start'])
            finish = convert_time_to_srt_format(entry['end'])
            text = strip_punct.sub('', entry['word'])
            # A standalone "i" is always capitalised.
            if text.strip().lower() == 'i':
                text = "I"
            # Hyphens are preserved only in "shorts" mode.
            if not shorts:
                text = text.replace("-", "")
            out.write("{}\n{} --> {}\n{}\n\n".format(index, begin, finish, text))
            index += 1
31
+
32
+
33
+
34
def split_line_by_char_limit(text, max_chars_per_line=38):
    """Greedily wrap *text* into lines no longer than *max_chars_per_line*.

    A single word longer than the limit still occupies its own line.
    """
    lines = []
    buffer = ""
    for token in text.split():
        if not buffer:
            buffer = token
        elif len(buffer) + 1 + len(token) <= max_chars_per_line:
            buffer = buffer + " " + token
        else:
            lines.append(buffer)
            buffer = token
    if buffer:
        lines.append(buffer)
    return lines
50
+
51
def merge_punctuation_glitches(subtitles):
    """Fix punctuation artifacts that straddle subtitle boundaries.

    Leading punctuation on an entry is moved onto the previous entry,
    stray quote/colon characters are stripped, and entries reduced to
    pure punctuation are merged (time-wise) into their predecessor.
    Entries are modified in place; the filtered list is returned.
    """
    if not subtitles:
        return []

    leading_punct = re.compile(r'^([,.:;!?]+)(\s*)(.+)')
    pure_punct = re.compile(r'[.,!?]+')
    result = [subtitles[0]]

    for entry in subtitles[1:]:
        previous = result[-1]
        prev_text = previous["text"].rstrip()
        text = entry["text"].lstrip()

        # Glue leading punctuation onto the previous entry, unless it
        # already ends with one of those characters.
        hit = leading_punct.match(text)
        if hit:
            punct, _, remainder = hit.groups()
            if not prev_text.endswith(tuple(punct)):
                previous["text"] = prev_text + punct
                text = remainder.strip()

        # Strip quotes and stray colons/semicolons from the current entry.
        for ch in ('"', '“', '”', ';', ':'):
            text = text.replace(ch, '')
        text = text.strip()

        # An entry that is now empty or pure punctuation just extends the
        # previous entry's end time instead of appearing on its own.
        if not text or pure_punct.fullmatch(text):
            previous["end"] = entry["end"]
            continue

        entry["text"] = text
        previous["text"] = previous["text"].replace('"', '').replace('“', '').replace('”', '')
        result.append(entry)

    return result
85
+
86
import json
def write_sentence_srt(
    word_level_timestamps, output_file="subtitles_professional.srt", max_lines=2,
    max_duration_s=7.0, max_chars_per_line=38, hard_pause_threshold=0.5,
    merge_pause_threshold=0.4
):
    """Creates professional-grade SRT files and a corresponding timestamp.json file.

    Groups word-level timestamps into subtitle entries in three phases:
    drafting (timing/length rules), orphan merging, and punctuation cleanup.
    Writes the SRT to ``output_file`` and a sibling ``.json`` file mapping each
    subtitle index to its text, times, and per-word timings.

    Args:
        word_level_timestamps: list of dicts with 'word', 'start', 'end' keys
            (start/end in seconds).
        output_file: path of the SRT file to write.
        max_lines: maximum rendered lines per subtitle entry.
        max_duration_s: maximum on-screen duration of a single entry.
        max_chars_per_line: wrap width used by split_line_by_char_limit.
        hard_pause_threshold: gap (seconds) that forces a new entry.
        merge_pause_threshold: max gap under which a one-word entry is merged
            back into the previous entry.

    Returns:
        Path of the generated JSON file, or None if the input is empty.
    """
    if not word_level_timestamps:
        return

    # Phase 1: Generate draft subtitles based on timing and length rules
    draft_subtitles = []
    i = 0
    while i < len(word_level_timestamps):
        start_time = word_level_timestamps[i]["start"]

        # We'll now store the full word objects, not just the text,
        # so per-word timings survive into the JSON output.
        current_word_objects = []

        j = i
        while j < len(word_level_timestamps):
            entry = word_level_timestamps[j]

            # Create potential text from the word objects
            potential_words = [w["word"] for w in current_word_objects] + [entry["word"]]
            potential_text = " ".join(potential_words)

            # Stop growing the entry if it would exceed the line or duration budget.
            if len(split_line_by_char_limit(potential_text, max_chars_per_line)) > max_lines: break
            if (entry["end"] - start_time) > max_duration_s and current_word_objects: break

            if j > i:
                prev_entry = word_level_timestamps[j-1]
                pause = entry["start"] - prev_entry["end"]
                # A long silence or sentence-final punctuation ends the entry.
                if pause >= hard_pause_threshold: break
                if prev_entry["word"].endswith(('.','!','?')): break

            # Append the full word object
            current_word_objects.append(entry)
            j += 1

        # Guarantee forward progress: an over-budget single word still becomes
        # its own entry rather than looping forever.
        if not current_word_objects:
            current_word_objects.append(word_level_timestamps[i])
            j = i + 1

        text = " ".join([w["word"] for w in current_word_objects])
        end_time = word_level_timestamps[j - 1]["end"]

        # Include the list of word objects in our draft subtitle
        draft_subtitles.append({
            "start": start_time,
            "end": end_time,
            "text": text,
            "words": current_word_objects
        })
        i = j

    # Phase 2: Post-process to merge single-word "orphan" subtitles
    if not draft_subtitles: return
    final_subtitles = [draft_subtitles[0]]
    for k in range(1, len(draft_subtitles)):
        prev_sub = final_subtitles[-1]
        current_sub = draft_subtitles[k]
        is_orphan = len(current_sub["text"].split()) == 1
        pause_from_prev = current_sub["start"] - prev_sub["end"]

        # Merge an orphan into the previous entry only when the gap is short
        # and the merged text still fits within the line budget.
        if is_orphan and pause_from_prev < merge_pause_threshold:
            merged_text = prev_sub["text"] + " " + current_sub["text"]
            if len(split_line_by_char_limit(merged_text, max_chars_per_line)) <= max_lines:
                prev_sub["text"] = merged_text
                prev_sub["end"] = current_sub["end"]

                # Merge the word-level data as well
                prev_sub["words"].extend(current_sub["words"])
                continue

        final_subtitles.append(current_sub)

    final_subtitles = merge_punctuation_glitches(final_subtitles)

    # This dictionary will hold the data for our JSON file
    timestamps_data = {}

    # Phase 3: Write the final SRT file (and prepare JSON data)
    with open(output_file, "w", encoding="utf-8") as f:
        for idx, sub in enumerate(final_subtitles, start=1):
            # --- SRT Writing (Unchanged) ---
            text = sub["text"].replace(" ,", ",").replace(" .", ".")
            formatted_lines = split_line_by_char_limit(text, max_chars_per_line)
            start_time_str = convert_time_to_srt_format(sub['start'])
            end_time_str = convert_time_to_srt_format(sub['end'])

            f.write(f"{idx}\n")
            f.write(f"{start_time_str} --> {end_time_str}\n")
            f.write("\n".join(formatted_lines) + "\n\n")

            # Create the list of word dictionaries for the current subtitle
            word_data = []
            for word_obj in sub["words"]:
                word_data.append({
                    "word": word_obj["word"],
                    "start": convert_time_to_srt_format(word_obj["start"]),
                    "end": convert_time_to_srt_format(word_obj["end"])
                })

            # Add the complete entry to our main dictionary
            timestamps_data[str(idx)] = {
                "text": "\n".join(formatted_lines),
                "start": start_time_str,
                "end": end_time_str,
                "words": word_data
            }

    # Write the collected data to the JSON file
    json_output_file = output_file.replace(".srt",".json")
    with open(json_output_file, "w", encoding="utf-8") as f_json:
        json.dump(timestamps_data, f_json, indent=4, ensure_ascii=False)

    # print(f"Successfully generated SRT file: {output_file}")
    # print(f"Successfully generated JSON file: {json_output_file}")
    return json_output_file
206
def make_subtitle(word_level_timestamps, file_path):
    """Generate word-level, sentence-level and shorts-style SRT files.

    All files are written under ./subtitles/ using the media file's basename.

    Args:
        word_level_timestamps: list of dicts with 'word', 'start', 'end' keys.
        file_path: source media path; only its basename is used for naming.

    Returns:
        Tuple of (sentence_srt_path, word_srt_path, shorts_srt_path).
    """
    os.makedirs("./subtitles/", exist_ok=True)
    base = os.path.splitext(os.path.basename(file_path))[0]

    word_path = f"./subtitles/{base}_subtitle_words.srt"
    sentence_path = f"./subtitles/{base}_subtitle_sentences.srt"
    shorts_path = f"./subtitles/{base}_subtitle_shorts.srt"

    # One entry per word (karaoke-style captions).
    word_level_srt(word_level_timestamps, srt_path=word_path, shorts=False)

    # Standard two-line sentence captions.
    write_sentence_srt(
        word_level_timestamps,
        output_file=sentence_path,
        max_lines=2,
        max_duration_s=7.0,
        max_chars_per_line=38,
        hard_pause_threshold=0.5,
        merge_pause_threshold=0.4,
    )

    # Compact single-line captions for vertical/short-form video.
    write_sentence_srt(
        word_level_timestamps,
        output_file=shorts_path,
        max_lines=1,
        max_duration_s=2.0,
        max_chars_per_line=17,
    )

    return sentence_path, word_path, shorts_path
server/transcription_status.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import threading
3
+ from datetime import datetime
4
+ from typing import Dict
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
class TranscriptionStatus:
    """Thread-safe tracker for the server's single in-flight transcription."""

    def __init__(self):
        self.is_busy = False            # True while a transcription is running
        self.current_operation = None   # operation type label of the current run
        self.current_filename = None    # file being processed, when known
        self.started_at = None          # datetime when the current run began
        self.progress = 0.0             # completion fraction in [0.0, 1.0]
        self.lock = threading.Lock()    # guards all mutable state above
        self.total_completed = 0        # count of finished transcriptions

    def start_transcription(self, operation_type: str, filename: str = None):
        """Mark transcription as started"""
        with self.lock:
            self.is_busy = True
            self.current_operation = operation_type
            self.current_filename = filename
            self.started_at = datetime.now()
            self.progress = 0.0
            logger.info(f"Started {operation_type} transcription for {filename or 'unknown file'}")

    def update_progress(self, progress: float):
        """Update transcription progress (0.0 to 1.0)"""
        with self.lock:
            # Clamp out-of-range values instead of rejecting them.
            self.progress = min(1.0, max(0.0, progress))

    def finish_transcription(self):
        """Mark transcription as finished"""
        with self.lock:
            self.is_busy = False
            self.current_operation = None
            self.current_filename = None
            self.started_at = None
            self.progress = 0.0
            self.total_completed += 1
            logger.info("Transcription finished")

    def get_status(self) -> Dict:
        """Get current status for API response"""
        with self.lock:
            status = {"is_busy": self.is_busy, "total_completed": self.total_completed}
            if self.is_busy:
                elapsed = 0
                if self.started_at:
                    elapsed = (datetime.now() - self.started_at).total_seconds()
                status["current_operation"] = self.current_operation
                status["current_filename"] = self.current_filename
                status["progress"] = self.progress
                status["duration_seconds"] = round(elapsed, 1)
            return status


# Global status instance
transcription_status = TranscriptionStatus()
server/transcriptions_blueprint.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import tempfile
5
+
6
+ import torch
7
+ from audio_transcription import perform_forced_alignment
8
+ from media_transcription_processor import MediaTranscriptionProcessor
9
+ from transcription_status import transcription_status
10
+ from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs
11
+
12
+ from env_vars import API_LOG_LEVEL, MODEL_NAME
13
+ from flask import Blueprint, jsonify, request, send_file
14
+ from video_utils import check_ffmpeg_available, combine_video_with_subtitles
15
+
16
# Flask blueprint grouping all transcription-related HTTP endpoints.
transcriptions_blueprint = Blueprint(
    "transcriptions_blueprint",
    __name__,
)

# Module logger; level is driven by the API_LOG_LEVEL environment setting.
logger = logging.getLogger(__name__)
logger.level = API_LOG_LEVEL
# Keep the AWS SDK loggers at the same level so they don't flood the output.
logging.getLogger("boto3").setLevel(API_LOG_LEVEL)
logging.getLogger("botocore").setLevel(API_LOG_LEVEL)

# NOTE(review): not referenced in this module's visible code — presumably a
# short-form duration cutoff used elsewhere; verify before removing.
MAX_SHORTFORM_DURATION = 10  # seconds
27
+
28
+
29
@transcriptions_blueprint.route("/health")
def health():
    """Comprehensive health check endpoint.

    Reports service liveness plus runtime capabilities: compute device,
    CUDA and FFmpeg availability, the current transcription status, and —
    when CUDA is present — GPU identity and memory usage. Flask serializes
    the returned dict to JSON.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    cuda_available = torch.cuda.is_available()
    ffmpeg_available = check_ffmpeg_available()

    # Get transcription status
    transcription_info = MediaTranscriptionProcessor.get_server_status()

    # Get GPU details if CUDA is available
    gpu_info = {}
    if cuda_available:
        gpu_info = {
            "gpu_count": torch.cuda.device_count(),
            "current_device": torch.cuda.current_device(),
            "gpu_name": (
                torch.cuda.get_device_name(0)
                if torch.cuda.device_count() > 0
                else "Unknown"
            ),
        }

        # Add GPU memory information; failures here are non-fatal — the
        # health check still answers, just without memory figures.
        try:
            current_device = torch.cuda.current_device()
            memory_allocated = torch.cuda.memory_allocated(current_device)
            memory_reserved = torch.cuda.memory_reserved(current_device)
            memory_total = torch.cuda.get_device_properties(current_device).total_memory

            # All values are reported in MiB, rounded to one decimal place.
            gpu_info.update(
                {
                    "gpu_memory_allocated_mb": round(memory_allocated / 1024 / 1024, 1),
                    "gpu_memory_reserved_mb": round(memory_reserved / 1024 / 1024, 1),
                    "gpu_memory_total_mb": round(memory_total / 1024 / 1024, 1),
                    "gpu_memory_free_mb": round(
                        (memory_total - memory_reserved) / 1024 / 1024, 1
                    ),
                }
            )
        except Exception as e:
            logger.warning(f"Could not get GPU memory info: {e}")

    return {
        "status": "healthy",
        "message": "MMS Transcription API is running",
        "version": "1.0.0",
        "service": "mms-transcription",
        "device": str(device),
        "cuda_available": cuda_available,
        "ffmpeg_available": ffmpeg_available,
        "transcription_status": transcription_info,
        **gpu_info,
    }
83
+
84
+
85
@transcriptions_blueprint.route("/supported-languages")
def get_supported_languages():
    """Return the list of language codes the ASR model can transcribe.

    Responds with {"supported_languages": [...]} or a 500 JSON error.
    """
    try:
        payload = {"supported_languages": supported_langs}
        return jsonify(payload)
    except Exception as e:
        logger.error(f"Error getting supported languages: {str(e)}")
        error_payload = {
            "error": "Could not retrieve supported languages",
            "message": str(e),
        }
        return jsonify(error_payload), 500
98
+
99
+
100
@transcriptions_blueprint.route("/status")
def get_transcription_status():
    """Return the server's current transcription status as JSON."""
    current = MediaTranscriptionProcessor.get_server_status()
    return jsonify(current)
104
+
105
+
106
@transcriptions_blueprint.route("/transcribe", methods=["POST"])
def transcribe_audio():
    """Transcribe media using the MMS model with intelligent chunking for all audio/video files.

    Expects multipart form data with a required "media" file. Optional fields:
    "language" (language-with-script code; auto-detected when absent) and
    "include_preprocessed" ("true" to echo back the preprocessed audio).
    Returns transcription JSON, 400 on missing input, 503 when busy, 500 on error.
    """
    try:
        # Check if server is busy — only one transcription runs at a time.
        if MediaTranscriptionProcessor.is_server_busy():
            status = MediaTranscriptionProcessor.get_server_status()
            return (
                jsonify(
                    {
                        "error": "Server is currently processing another transcription",
                        "status": "busy",
                        "current_operation": status.get("current_operation"),
                    }
                ),
                503,
            )

        # Check if media file is provided
        if "media" not in request.files:
            return jsonify({"error": "No media file provided"}), 400

        media_file = request.files["media"]
        if media_file.filename == "":
            return jsonify({"error": "No file selected"}), 400

        # Get optional language parameter
        language_with_script = request.form.get("language", None)

        if language_with_script:
            logger.info(f"Language specified: {language_with_script}")
        else:
            logger.info("No language specified, using auto-detection")

        # Get optional include_preprocessed parameter (from form data or query string)
        include_preprocessed = (
            request.form.get("include_preprocessed", "false").lower() == "true" or
            request.args.get("include_preprocessed", "false").lower() == "true"
        )
        if include_preprocessed:
            logger.info("Preprocessed audio will be included in response")

        # Mark as busy and start transcription
        # This will be handled by the processor

        # Read file bytes once; the processor works from the in-memory copy.
        media_bytes = media_file.read()

        try:
            # Use the MediaTranscriptionProcessor with context manager for automatic cleanup
            with MediaTranscriptionProcessor(media_bytes, media_file.filename, language_with_script) as processor:
                # Start transcription status tracking
                processor.start_transcription()

                # Stage 1: Convert media (this also calculates duration and updates progress)
                processor.convert_media()
                logger.info(f"Media conversion completed for: {media_file.filename}")

                # Stage 2: Run full transcription pipeline (this also updates progress)
                processor.transcribe_full_pipeline()

                # Get final results with optional preprocessed audio
                results = processor.get_results(include_preprocessed_audio=include_preprocessed)

                logger.info(f"Transcription completed: {results.get('num_chunks', 0)} chunks")

                # Format response — defaults keep the schema stable even when
                # the processor omits a key.
                response = {
                    "transcription": results.get("transcription", ""),
                    "aligned_segments": results.get("aligned_segments", []),
                    "chunks": results.get("chunks", []),
                    "total_duration": results.get("total_duration", 0.0),
                    "num_chunks": results.get("num_chunks", 0),
                    "num_segments": results.get("num_segments", 0),
                    "model": MODEL_NAME,
                    "device": str(torch.device("cuda:0" if torch.cuda.is_available() else "cpu")),
                    "status": results.get("status", "success"),
                }

                # Add preprocessed audio if it was included in results
                if "preprocessed_audio" in results:
                    response["preprocessed_audio"] = results["preprocessed_audio"]

                # A partial failure inside the pipeline surfaces as an "error"
                # key in results; report it as a 500 with the full payload.
                if "error" in results:
                    response["error"] = results["error"]
                    logger.error(f"Transcription response with error: {response}")
                    return jsonify(response), 500

                # Print out the complete response for debugging
                logger.info("=== TRANSCRIBE RESPONSE ===")
                # logger.info(f"Full response: {json.dumps(response, indent=2)}")
                logger.info("=== END TRANSCRIBE RESPONSE ===")

                return jsonify(response)
            # Context manager automatically handles cleanup and status finalization here

        except Exception as e:
            logger.error(f"Media conversion/transcription error: {str(e)}")
            return jsonify({"error": f"Media processing failed: {str(e)}"}), 500

    except Exception as e:
        logger.error(f"Transcription error: {str(e)}")
        return jsonify({"error": f"Transcription failed: {str(e)}"}), 500
209
+
210
+
211
@transcriptions_blueprint.route("/combine-video-subtitles", methods=["POST"])
def combine_video_subtitles():
    """Combine video with subtitles using FFmpeg.

    Expects multipart form data:
        video         -- the video file (required)
        subtitles     -- subtitle text content (required)
        format        -- subtitle format, "srt" (default) or "webvtt"
        output_format -- container, "mp4" (default) or "mkv"
        language      -- subtitle track language code (default "eng")

    Returns the muxed video as a file download, 400 on missing inputs,
    503 when the server is busy, 500 on processing failure.
    """
    try:
        # Check if server is busy
        if MediaTranscriptionProcessor.is_server_busy():
            status = MediaTranscriptionProcessor.get_server_status()
            return (
                jsonify(
                    {
                        "error": "Server is currently processing another request",
                        "status": "busy",
                        "current_operation": status.get("current_operation"),
                    }
                ),
                503,
            )

        # Check required fields
        if "video" not in request.files:
            return jsonify({"error": "No video file provided"}), 400

        if "subtitles" not in request.form:
            return jsonify({"error": "No subtitles provided"}), 400

        video_file = request.files["video"]
        subtitles = request.form["subtitles"]

        if video_file.filename == "":
            return jsonify({"error": "No video file selected"}), 400

        # Get optional parameters
        subtitle_format = request.form.get("format", "srt")  # srt or webvtt
        output_format = request.form.get("output_format", "mp4")  # mp4 or mkv
        language = request.form.get("language", "eng")

        # Mark as busy and start processing
        transcription_status.start_transcription("combine_video", video_file.filename)

        try:
            transcription_status.update_progress(0.1)

            # Persist the upload so FFmpeg can read it from disk.
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(video_file.filename)[1]) as temp_video:
                video_file.save(temp_video.name)
                temp_video_path = temp_video.name

            transcription_status.update_progress(0.3)

            try:
                # Combine video with subtitles using video_utils function
                output_path = combine_video_with_subtitles(
                    temp_video_path, subtitles, subtitle_format, output_format, language
                )

                transcription_status.update_progress(0.9)

                logger.info(f"Video combination completed: {output_path}")

                # Return the combined video file
                return send_file(
                    output_path,
                    as_attachment=True,
                    download_name=f"{video_file.filename.rsplit('.', 1)[0]}_with_subtitles.{output_format}",
                    mimetype=f"video/{output_format}",
                )

            finally:
                # Clean up temporary video file
                try:
                    os.unlink(temp_video_path)
                except OSError:
                    pass

        finally:
            # BUGFIX: finish exactly once. Previously the outer except handler
            # called finish_transcription() again after this finally had already
            # run, double-incrementing total_completed on failure — and a
            # validation error raised before start_transcription() would call
            # finish without a matching start.
            transcription_status.finish_transcription()

    except Exception as e:
        logger.error(f"Video combination error: {str(e)}")
        return jsonify({"error": f"Video combination failed: {str(e)}"}), 500
server/video_utils.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import subprocess
5
+ import tempfile
6
+ from pathlib import Path
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
def combine_video_with_subtitles(
    video_file_path: str,
    subtitle_content: str,
    subtitle_format: str = "srt",
    output_format: str = "mp4",
    language: str = "eng",
) -> str:
    """
    Combine video file with subtitle content using FFmpeg.

    The subtitle text is written to a temporary file, muxed into the video as
    a soft subtitle track (video/audio streams are copied, not re-encoded),
    and the temporary file is removed afterwards.

    Args:
        video_file_path: Path to the input video file
        subtitle_content: String content of the subtitles (SRT or WebVTT)
        subtitle_format: Format of subtitles ("srt" or "webvtt")
        output_format: Output container format ("mp4" or "mkv")
        language: Language code for subtitle track

    Returns:
        Path to the output video file with embedded subtitles
        (written next to the input as "<stem>_with_subtitles.<ext>").

    Raises:
        RuntimeError: if FFmpeg fails or is not installed.
    """

    # Create temporary files
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=f".{subtitle_format}", delete=False
    ) as sub_file:
        sub_file.write(subtitle_content)
        subtitle_file_path = sub_file.name

    # Generate output filename
    input_path = Path(video_file_path)
    output_path = (
        input_path.parent / f"{input_path.stem}_with_subtitles.{output_format}"
    )

    try:
        if output_format.lower() == "mkv":
            # MKV has better subtitle support: it can carry the subtitle
            # stream in its native codec without conversion.
            if subtitle_format.lower() == "webvtt":
                codec = "webvtt"
            else:
                codec = "srt"

            cmd = [
                "ffmpeg",
                "-y",  # -y to overwrite output file
                "-i",
                video_file_path,
                "-i",
                subtitle_file_path,
                "-c:v",
                "copy",  # Copy video stream
                "-c:a",
                "copy",  # Copy audio stream
                "-c:s",
                codec,  # Subtitle codec
                "-metadata:s:s:0",
                f"language={language}",
                str(output_path),
            ]
        else:
            # MP4 format: subtitles must be converted to mov_text, and the
            # streams are mapped explicitly from each input.
            cmd = [
                "ffmpeg",
                "-y",
                "-i",
                video_file_path,
                "-i",
                subtitle_file_path,
                "-c:v",
                "copy",  # Copy video stream
                "-c:a",
                "copy",  # Copy audio stream
                "-c:s:0",
                "mov_text",  # MP4 subtitle format
                "-map",
                "0:v",  # Map video from first input
                "-map",
                "0:a",  # Map audio from first input
                "-map",
                "1:s",  # Map subtitles from second input
                "-metadata:s:s:0",
                f"language={language}",
                "-disposition:s:0",
                "default",  # Make subtitles default
                str(output_path),
            ]

        # Execute FFmpeg command; check=True raises on non-zero exit.
        logger.info(f"Executing FFmpeg command: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)

        # Log FFmpeg output for debugging
        if result.stdout:
            logger.debug(f"FFmpeg stdout: {result.stdout}")
        if result.stderr:
            logger.debug(f"FFmpeg stderr: {result.stderr}")

        logger.info(f"FFmpeg completed successfully, output file: {output_path}")

        return str(output_path)

    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"FFmpeg failed: {e.stderr}")
    except FileNotFoundError:
        raise RuntimeError("FFmpeg not found. Please install FFmpeg.")
    finally:
        # Clean up temporary subtitle file
        try:
            os.unlink(subtitle_file_path)
        except OSError:
            pass
123
+
124
def check_ffmpeg_available() -> bool:
    """Return True when an `ffmpeg` binary can be executed on this system."""
    try:
        subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False
    return True
131
+
132
+
133
def extract_audio_from_video(video_file_path: str, output_audio_path: str = None) -> str:
    """
    Extract audio from video file using FFmpeg.

    The audio is written as 16 kHz mono 16-bit PCM WAV — the format expected
    by the speech-recognition pipeline.

    Args:
        video_file_path: Path to the input video file
        output_audio_path: Path for output audio file (optional; defaults to
            the video path with a .wav extension)

    Returns:
        Path to the extracted audio file

    Raises:
        RuntimeError: if FFmpeg is missing or the extraction fails.
    """
    if not check_ffmpeg_available():
        raise RuntimeError("FFmpeg not found. Please install FFmpeg.")

    # Generate output filename if not provided
    if output_audio_path is None:
        input_path = Path(video_file_path)
        output_audio_path = str(input_path.with_suffix('.wav'))

    try:
        # FFmpeg command to extract audio
        # -vn: disable video stream
        # -acodec pcm_s16le: use 16-bit PCM encoding
        # -ar 16000: set sample rate to 16kHz (optimal for speech recognition)
        # -ac 1: mono audio (single channel)
        cmd = [
            "ffmpeg",
            "-i", video_file_path,
            "-vn",  # No video
            "-acodec", "pcm_s16le",  # 16-bit PCM
            "-ar", "16000",  # 16kHz sample rate
            "-ac", "1",  # Mono
            "-y",  # Overwrite output file if it exists
            output_audio_path
        ]

        # check=True raises CalledProcessError on non-zero exit; the result
        # object itself was unused, so it is no longer captured.
        subprocess.run(cmd, capture_output=True, text=True, check=True)
        logger.info(f"Audio extracted successfully to: {output_audio_path}")
        return output_audio_path

    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"FFmpeg audio extraction failed: {e.stderr}")
    except FileNotFoundError:
        raise RuntimeError("FFmpeg not found. Please install FFmpeg.")
177
+
178
+
179
def get_video_info(video_file_path: str) -> dict:
    """Return ffprobe's format/stream metadata for a video, or {} on any failure."""
    probe_cmd = [
        "ffprobe",
        "-v",
        "quiet",
        "-print_format",
        "json",
        "-show_format",
        "-show_streams",
        video_file_path,
    ]
    try:
        completed = subprocess.run(probe_cmd, capture_output=True, text=True, check=True)
        return json.loads(completed.stdout)
    except (subprocess.CalledProcessError, FileNotFoundError, json.JSONDecodeError):
        # Missing ffprobe, a bad file, or unparseable output all degrade to {}.
        return {}