Upload 21 files
Browse files- app.py +179 -0
- server/.gitignore +1 -0
- server/audio_transcription.py +867 -0
- server/convert_media_to_wav.py +252 -0
- server/download_models.py +91 -0
- server/env_vars.py +13 -0
- server/inference/align_utils.py +188 -0
- server/inference/audio_chunker.py +359 -0
- server/inference/audio_reading_tools.py +89 -0
- server/inference/audio_sentence_alignment.py +219 -0
- server/inference/mms_model_pipeline.py +138 -0
- server/inference/norm_config_module.py +276 -0
- server/inference/punctuations.lst +188 -0
- server/inference/text_normalization.py +101 -0
- server/lang_dict.py +1675 -0
- server/media_transcription_processor.py +334 -0
- server/requirements.txt +24 -0
- server/subtitle.py +236 -0
- server/transcription_status.py +71 -0
- server/transcriptions_blueprint.py +292 -0
- server/video_utils.py +199 -0
app.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import torch
|
| 3 |
+
import os
|
| 4 |
+
import warnings
|
| 5 |
+
import sys
|
| 6 |
+
import os
|
| 7 |
+
fix_import=f"{os.getcwd()}/server"
|
| 8 |
+
sys.path.append(fix_import)
|
| 9 |
+
from inference.audio_chunker import AudioChunker
|
| 10 |
+
from inference.audio_sentence_alignment import AudioAlignment
|
| 11 |
+
from inference.mms_model_pipeline import MMSModel
|
| 12 |
+
from media_transcription_processor import MediaTranscriptionProcessor
|
| 13 |
+
from subtitle import make_subtitle
|
| 14 |
+
from lang_dict import lang_code
|
| 15 |
+
import download_models
|
| 16 |
+
|
| 17 |
+
# warnings.filterwarnings("ignore", category=UserWarning, module="torchaudio")
|
| 18 |
+
warnings.filterwarnings(
|
| 19 |
+
"ignore",
|
| 20 |
+
message=".*torchaudio.functional._alignment.forced_align.*",
|
| 21 |
+
category=UserWarning
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ---- Setup Model Globals ----
|
| 26 |
+
_model_loaded = False
|
| 27 |
+
_model_loading = False
|
| 28 |
+
|
| 29 |
+
# ---- Initialize model ----
def load_model(model_name="omniASR_LLM_1B"):
    """Load the omniASR model stack exactly once per process.

    Warms up the AudioChunker and AudioAlignment singletons, then builds the
    MMS model on GPU when available (CPU otherwise). Module-level flags make
    repeated calls — or a call while a load is already in flight — no-ops.

    Args:
        model_name (str): Model card name understood by ``MMSModel``.
    """
    global _model_loaded, _model_loading
    if _model_loaded or _model_loading:
        return

    _model_loading = True
    print(f"🔄 Loading {model_name} model...")
    try:
        # Instantiate the lightweight singleton helpers before the model.
        AudioChunker()
        AudioAlignment()

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        MMSModel(model_card=model_name, device=device)

        _model_loaded = True
        print("✅ Model loaded successfully.")
    finally:
        # Bug fix: previously a failed load left _model_loading stuck at
        # True, so every subsequent load_model() call returned immediately
        # and the model could never be loaded again in this process.
        _model_loading = False
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# ---- Transcription function ----
def media_transcription(file_path, lang_code="eng_Latn"):
    """Run the full transcription pipeline and build subtitle files.

    Args:
        file_path: Path to the uploaded media file.
        lang_code: Language-with-script code (e.g. ``"eng_Latn"``).

    Returns:
        Tuple of (transcription text, sentence-level SRT path,
        word-level SRT path, shorts SRT path).
    """
    with open(file_path, "rb") as media_file:
        raw_bytes = media_file.read()

    processor = MediaTranscriptionProcessor(
        media_bytes=raw_bytes,
        filename=file_path,
        language_with_script=lang_code
    )

    # Convert to WAV, transcribe, then collect the aggregated results.
    processor.convert_media()
    processor.transcribe_full_pipeline()
    results = processor.get_results()

    transcription = results['transcription']

    # Reshape aligned segments into the {word, start, end} records
    # expected by make_subtitle.
    word_level_timestamps = []
    for segment in results.get('aligned_segments', []):
        word_level_timestamps.append(
            {"word": segment['text'], "start": segment['start'], "end": segment['end']}
        )

    sentence_srt, word_level_srt, shorts_srt = make_subtitle(word_level_timestamps, file_path)
    return transcription, sentence_srt, word_level_srt, shorts_srt
| 74 |
+
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def transcribe_interface(audio, selected_lang):
    """Main Gradio wrapper: validate input, resolve language, transcribe.

    Args:
        audio: Filepath of the uploaded/recorded audio, or None.
        selected_lang: Display name of the language chosen in the dropdown.

    Returns:
        Tuple of (transcription text or error message, sentence SRT,
        word-level SRT, shorts SRT); the SRT slots are None on failure.
    """
    if audio is None:
        return "Please upload or record audio.", None, None, None

    # Save uploaded/recorded audio
    file_path = audio

    try:
        # Bug fix: the language lookup used to happen before the try block,
        # so an unrecognized dropdown value raised an unhandled KeyError
        # instead of returning the error tuple below.
        find_lang_code = lang_code[selected_lang]

        transcription, sentence_srt, word_level_srt, shorts_srt = media_transcription(file_path, find_lang_code)
        return transcription, sentence_srt, word_level_srt, shorts_srt
    except Exception as e:
        return f"❌ Error: {e}", None, None, None
| 93 |
+
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def ui():
    """Build and return the Gradio Blocks interface for omniASR."""
    language_names = list(lang_code.keys())
    css_rules = """.gradio-container { font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, sans-serif; }"""

    with gr.Blocks(theme=gr.themes.Soft(), css=css_rules) as app:
        # Header banner with a link to the Colab notebook.
        gr.HTML("""
        <div style="text-align: center; margin: 20px auto; max-width: 800px;">
            <h1 style="font-size: 2.5em; margin-bottom: 10px;">Meta Omnilingual ASR</h1>
            <a href="https://github.com/NeuralFalconYT/omnilingual-asr-colab" target="_blank" style="display: inline-block; padding: 10px 20px; background-color: #4285F4; color: white; border-radius: 6px; text-decoration: none; font-size: 1em;">😇 Run on Google Colab</a>
        </div>
        """)

        with gr.Row():
            # Left column: inputs.
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="🎙 Upload or Record Audio",
                )
                language_dropdown = gr.Dropdown(
                    choices=language_names,
                    value=language_names[0],
                    label="🌐 Select Language",
                )
                transcribe_btn = gr.Button("🚀 Transcribe")

            # Right column: transcription text plus downloadable subtitles.
            with gr.Column():
                transcription_output = gr.Textbox(
                    label="Transcription", lines=8, show_copy_button=True
                )
                with gr.Accordion("🎬 Subtitle (Not Accurate)", open=False):
                    sentence_srt_out = gr.File(label="Sentence-level Subtitle File")
                    word_srt_out = gr.File(label="Word-level Subtitle File")
                    shorts_srt_out = gr.File(label="Shorts Subtitle File")

        transcribe_btn.click(
            fn=transcribe_interface,
            inputs=[audio_input, language_dropdown],
            outputs=[transcription_output, sentence_srt_out, word_srt_out, shorts_srt_out],
        )

    return app
| 130 |
+
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
import click

# Model cards accepted by --model, smallest CTC to largest LLM variant.
_MODEL_CHOICES = [
    "omniASR_CTC_300M",
    "omniASR_CTC_1B",
    "omniASR_CTC_3B",
    "omniASR_CTC_7B",
    "omniASR_LLM_300M",
    "omniASR_LLM_1B",
    "omniASR_LLM_3B",
    "omniASR_LLM_7B",
    "omniASR_LLM_7B_ZS",
]


@click.command()
@click.option(
    "--debug",
    is_flag=True,
    default=False,
    help="Enable debug mode (shows detailed logs)."
)
@click.option(
    "--share",
    is_flag=True,
    default=False,
    help="Create a public Gradio share link (for Colab or remote usage)."
)
@click.option(
    "--model",
    default="omniASR_LLM_1B",
    type=click.Choice(_MODEL_CHOICES),
    help="Choose the OmniASR model to load."
)
def main(debug, share, model):
    """Universal CLI entry point for omniASR transcription UI."""
    print(f"\n🚀 Starting omniASR UI with model: {model}")
    # Load the model first so the UI is immediately usable.
    load_model(model)
    # Then build and launch the Gradio interface.
    demo = ui()
    demo.queue().launch(share=share, debug=debug)


if __name__ == "__main__":
    main()
| 178 |
+
|
| 179 |
+
|
server/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
models/
|
server/audio_transcription.py
ADDED
|
@@ -0,0 +1,867 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
# Standard library imports
|
| 4 |
+
import logging
|
| 5 |
+
import os
|
| 6 |
+
import tempfile
|
| 7 |
+
from typing import Dict, List, Optional, Tuple
|
| 8 |
+
|
| 9 |
+
# Third-party imports
|
| 10 |
+
import librosa
|
| 11 |
+
import numpy as np
|
| 12 |
+
import soundfile as sf
|
| 13 |
+
import torch
|
| 14 |
+
import uroman
|
| 15 |
+
|
| 16 |
+
# fairseq2 imports
|
| 17 |
+
from inference.align_utils import get_uroman_tokens
|
| 18 |
+
from inference.audio_chunker import AudioChunker
|
| 19 |
+
|
| 20 |
+
from inference.audio_reading_tools import wav_to_bytes
|
| 21 |
+
|
| 22 |
+
# Import AudioAlignment and its config classes
|
| 23 |
+
from inference.audio_sentence_alignment import AudioAlignment
|
| 24 |
+
from inference.mms_model_pipeline import MMSModel
|
| 25 |
+
from inference.text_normalization import text_normalize
|
| 26 |
+
from transcription_status import transcription_status
|
| 27 |
+
from env_vars import USE_CHUNKING
|
| 28 |
+
|
| 29 |
+
# Constants
|
| 30 |
+
SAMPLE_RATE = 16000
|
| 31 |
+
|
| 32 |
+
logger = logging.getLogger(__name__)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def transcribe_single_chunk(
    audio_tensor: torch.Tensor,
    sample_rate: int = 16000,
    language_with_script: Optional[str] = None,
) -> str:
    """
    Basic transcription pipeline for a single audio chunk using MMS model pipeline.

    This is the lowest-level transcription function that handles individual
    audio segments.

    Args:
        audio_tensor (torch.Tensor): Audio tensor (1D waveform)
        sample_rate (int): Sample rate of the audio tensor.
            NOTE(review): currently unused by the body — transcribe_audio is
            not given the rate; kept for interface compatibility.
        language_with_script (Optional[str]): Language for transcription
            (3-letter ISO codes like "eng", "spa") with script, or None.

    Returns:
        str: Transcribed text ("" when the pipeline yields nothing)

    Raises:
        Exception: re-raises any failure from the underlying pipeline.
    """

    logger.info("Starting complete audio transcription pipeline...")

    try:
        logger.info("Using pipeline transcription...")
        # Use the singleton model instance
        model = MMSModel.get_instance()

        # Transcribe using pipeline - convert tensor to list format
        lang_list = [language_with_script] if language_with_script else None
        results = model.transcribe_audio(audio_tensor, batch_size=1, language_with_scripts=lang_list)

        # Bug fix: an empty result list previously produced result = {},
        # which fell through to str({}) == "{}" and was returned as a
        # non-empty "transcription". Treat it as an empty result instead.
        if not results:
            logger.warning("Pipeline returned empty transcription")
            return ""

        result = results[0]

        # Convert pipeline result to expected format
        if isinstance(result, dict) and 'text' in result:
            transcription_text = result['text']
        elif isinstance(result, str):
            transcription_text = result
        else:
            transcription_text = str(result)

        if not transcription_text.strip():
            logger.warning("Pipeline returned empty transcription")
            return ""

        # Lazy %-args: avoid building the log string when INFO is disabled.
        logger.info("✓ Pipeline transcription successful: '%s'", transcription_text)

        # Return the transcription text
        return transcription_text

    except Exception as e:
        logger.error("Error in transcription pipeline: %s", str(e), exc_info=True)
        raise
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def perform_forced_alignment(
|
| 84 |
+
audio_tensor: torch.Tensor,
|
| 85 |
+
transcription_tokens: List[str],
|
| 86 |
+
device,
|
| 87 |
+
sample_rate: int = 16000,
|
| 88 |
+
) -> List[Dict]:
|
| 89 |
+
"""
|
| 90 |
+
Perform forced alignment using the AudioAlignment class from audio_sentence_alignment.py.
|
| 91 |
+
Uses the provided audio tensor directly.
|
| 92 |
+
|
| 93 |
+
Args:
|
| 94 |
+
audio_tensor (torch.Tensor): Audio tensor (1D waveform)
|
| 95 |
+
transcription_tokens (List[str]): List of tokens from transcription
|
| 96 |
+
device: Device for computation
|
| 97 |
+
sample_rate (int): Audio sample rate
|
| 98 |
+
|
| 99 |
+
Returns:
|
| 100 |
+
List[Dict]: List of segments with timestamps and text
|
| 101 |
+
"""
|
| 102 |
+
|
| 103 |
+
try:
|
| 104 |
+
logger.info(f"Starting forced alignment with audio tensor")
|
| 105 |
+
logger.info(f"Audio shape: {audio_tensor.shape}, sample_rate: {sample_rate}")
|
| 106 |
+
logger.info(f"Tokens to align: {transcription_tokens}")
|
| 107 |
+
|
| 108 |
+
# Use the provided audio tensor directly
|
| 109 |
+
# Convert to the format expected by AudioAlignment.get_one_row_alignments
|
| 110 |
+
if hasattr(audio_tensor, "cpu"):
|
| 111 |
+
# If it's a torch tensor, use it directly
|
| 112 |
+
alignment_tensor = audio_tensor.float()
|
| 113 |
+
else:
|
| 114 |
+
# If it's numpy, convert to tensor
|
| 115 |
+
alignment_tensor = torch.from_numpy(audio_tensor).float()
|
| 116 |
+
|
| 117 |
+
# Ensure it's 1D (flatten if needed)
|
| 118 |
+
if len(alignment_tensor.shape) > 1:
|
| 119 |
+
alignment_tensor = alignment_tensor.flatten()
|
| 120 |
+
|
| 121 |
+
# Convert audio tensor to bytes format expected by AudioAlignment
|
| 122 |
+
# Use wav_to_bytes to create proper audio bytes
|
| 123 |
+
# Move tensor to CPU first to avoid CUDA tensor to numpy conversion error
|
| 124 |
+
audio_tensor_cpu = alignment_tensor.cpu() if alignment_tensor.is_cuda else alignment_tensor
|
| 125 |
+
|
| 126 |
+
audio_arr = wav_to_bytes(
|
| 127 |
+
audio_tensor_cpu, sample_rate=sample_rate, format="wav"
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
# logger.info(
|
| 131 |
+
# f"Converted audio to bytes: shape={audio_arr.shape}, dtype={audio_arr.dtype}"
|
| 132 |
+
# )
|
| 133 |
+
logger.info(f"Converted audio to bytes: {len(audio_arr)} bytes")
|
| 134 |
+
|
| 135 |
+
# Preprocess tokens for MMS alignment model using the same approach as TextRomanizer
|
| 136 |
+
# The MMS alignment model expects romanized tokens in the same format as text_sentences_tokens
|
| 137 |
+
try:
|
| 138 |
+
# Join tokens back to text for uroman processing
|
| 139 |
+
transcription_text = " ".join(transcription_tokens)
|
| 140 |
+
|
| 141 |
+
# Create uroman instance and process the text the same way as TextRomanizer
|
| 142 |
+
uroman_instance = uroman.Uroman()
|
| 143 |
+
|
| 144 |
+
# Step 1: Normalize the text first using text_normalize function (same as TextRomanizer)
|
| 145 |
+
normalized_text = text_normalize(transcription_text.strip(), "en")
|
| 146 |
+
|
| 147 |
+
# Step 2: Get uroman tokens using the same function as TextRomanizer
|
| 148 |
+
# This creates character-level tokens with spaces between characters
|
| 149 |
+
uroman_tokens_str = get_uroman_tokens(
|
| 150 |
+
[normalized_text], uroman_instance, "en"
|
| 151 |
+
)[0]
|
| 152 |
+
|
| 153 |
+
# Step 3: Split by spaces to get individual character tokens (same as real MMS pipeline)
|
| 154 |
+
alignment_tokens = uroman_tokens_str.split()
|
| 155 |
+
|
| 156 |
+
logger.info(f"Original tokens: {transcription_tokens}")
|
| 157 |
+
logger.info(f"Original text: '{transcription_text}'")
|
| 158 |
+
logger.info(f"Normalized text: '{normalized_text}'")
|
| 159 |
+
logger.info(f"Uroman tokens string: '{uroman_tokens_str}'")
|
| 160 |
+
logger.info(
|
| 161 |
+
f"Alignment tokens (count={len(alignment_tokens)}): {alignment_tokens[:20]}..."
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
+
# Additional debugging - check for any unusual characters
|
| 165 |
+
for i, token in enumerate(alignment_tokens[:10]): # Check first 10 tokens
|
| 166 |
+
logger.info(
|
| 167 |
+
f"Token {i}: '{token}' (length={len(token)}, chars={[c for c in token]})"
|
| 168 |
+
)
|
| 169 |
+
|
| 170 |
+
except Exception as e:
|
| 171 |
+
logger.warning(
|
| 172 |
+
f"Failed to preprocess tokens with TextRomanizer approach: {e}"
|
| 173 |
+
)
|
| 174 |
+
logger.exception("Full error traceback:")
|
| 175 |
+
# Fallback: use simple character-level tokenization
|
| 176 |
+
transcription_text = " ".join(transcription_tokens).lower()
|
| 177 |
+
# Simple character-level tokenization as fallback
|
| 178 |
+
alignment_tokens = []
|
| 179 |
+
for char in transcription_text:
|
| 180 |
+
if char == " ":
|
| 181 |
+
alignment_tokens.append(" ")
|
| 182 |
+
else:
|
| 183 |
+
alignment_tokens.append(char)
|
| 184 |
+
logger.info(f"Using fallback character tokens: {alignment_tokens[:20]}...")
|
| 185 |
+
|
| 186 |
+
logger.info(
|
| 187 |
+
f"Using {len(alignment_tokens)} alignment tokens for forced alignment"
|
| 188 |
+
)
|
| 189 |
+
|
| 190 |
+
# Create AudioAlignment instance
|
| 191 |
+
logger.info("Creating AudioAlignment instance...")
|
| 192 |
+
alignment = AudioAlignment()
|
| 193 |
+
|
| 194 |
+
# Perform alignment using get_one_row_alignments
|
| 195 |
+
logger.info("Performing alignment...")
|
| 196 |
+
logger.info(f"About to call get_one_row_alignments with:")
|
| 197 |
+
# logger.info(f" audio_arr type: {type(audio_arr)}, shape: {audio_arr.shape}")
|
| 198 |
+
logger.info(f"audio_arr type: {type(audio_arr)}")
|
| 199 |
+
logger.info(
|
| 200 |
+
f" alignment_tokens type: {type(alignment_tokens)}, length: {len(alignment_tokens)}"
|
| 201 |
+
)
|
| 202 |
+
logger.info(
|
| 203 |
+
f" First 10 tokens: {alignment_tokens[:10] if len(alignment_tokens) >= 10 else alignment_tokens}"
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
# Check for any problematic characters in tokens
|
| 207 |
+
for i, token in enumerate(alignment_tokens[:5]):
|
| 208 |
+
token_chars = [ord(c) for c in str(token)]
|
| 209 |
+
logger.info(f" Token {i} '{token}' char codes: {token_chars}")
|
| 210 |
+
|
| 211 |
+
# Check if tokens contain any RTL characters that might cause the LTR assertion
|
| 212 |
+
rtl_chars = []
|
| 213 |
+
for i, token in enumerate(alignment_tokens):
|
| 214 |
+
for char in str(token):
|
| 215 |
+
# Check for Arabic, Hebrew, and other RTL characters
|
| 216 |
+
if (
|
| 217 |
+
"\u0590" <= char <= "\u08ff"
|
| 218 |
+
or "\ufb1d" <= char <= "\ufdff"
|
| 219 |
+
or "\ufe70" <= char <= "\ufeff"
|
| 220 |
+
):
|
| 221 |
+
rtl_chars.append((i, token, char, ord(char)))
|
| 222 |
+
|
| 223 |
+
if rtl_chars:
|
| 224 |
+
logger.warning(f"Found RTL characters in tokens: {rtl_chars[:10]}...")
|
| 225 |
+
|
| 226 |
+
try:
|
| 227 |
+
audio_segments = alignment.get_one_row_alignments(
|
| 228 |
+
audio_arr, sample_rate, alignment_tokens
|
| 229 |
+
)
|
| 230 |
+
|
| 231 |
+
except Exception as alignment_error:
|
| 232 |
+
logger.error(f"Alignment failed with error: {alignment_error}")
|
| 233 |
+
logger.error(f"Error type: {type(alignment_error)}")
|
| 234 |
+
|
| 235 |
+
# Try to provide more context about the error
|
| 236 |
+
if "ltr" in str(alignment_error).lower():
|
| 237 |
+
logger.error("LTR assertion error detected. This might be due to:")
|
| 238 |
+
logger.error("1. RTL characters in the input tokens")
|
| 239 |
+
logger.error(
|
| 240 |
+
"2. Incorrect token format - tokens should be individual characters"
|
| 241 |
+
)
|
| 242 |
+
logger.error("3. Unicode normalization issues")
|
| 243 |
+
|
| 244 |
+
# Try a simple ASCII-only fallback
|
| 245 |
+
logger.info("Attempting ASCII-only fallback...")
|
| 246 |
+
ascii_tokens = []
|
| 247 |
+
for token in alignment_tokens:
|
| 248 |
+
# Keep only ASCII characters
|
| 249 |
+
ascii_token = "".join(c for c in str(token) if ord(c) < 128)
|
| 250 |
+
if ascii_token:
|
| 251 |
+
ascii_tokens.append(ascii_token)
|
| 252 |
+
|
| 253 |
+
logger.info(
|
| 254 |
+
f"ASCII tokens (count={len(ascii_tokens)}): {ascii_tokens[:20]}..."
|
| 255 |
+
)
|
| 256 |
+
|
| 257 |
+
try:
|
| 258 |
+
audio_segments = alignment.get_one_row_alignments(
|
| 259 |
+
audio_arr, ascii_tokens
|
| 260 |
+
)
|
| 261 |
+
alignment_tokens = ascii_tokens # Update for later use
|
| 262 |
+
logger.info("ASCII fallback successful!")
|
| 263 |
+
except Exception as ascii_error:
|
| 264 |
+
logger.error(f"ASCII fallback also failed: {ascii_error}")
|
| 265 |
+
raise alignment_error
|
| 266 |
+
else:
|
| 267 |
+
raise
|
| 268 |
+
|
| 269 |
+
logger.info(
|
| 270 |
+
f"Alignment completed, got {len(audio_segments)} character segments"
|
| 271 |
+
)
|
| 272 |
+
|
| 273 |
+
# Debug: Log the actual structure of audio_segments
|
| 274 |
+
if audio_segments:
|
| 275 |
+
logger.info("=== Audio Segments Debug Info ===")
|
| 276 |
+
logger.info(f"Total segments: {len(audio_segments)}")
|
| 277 |
+
|
| 278 |
+
# Print ALL audio segments for complete debugging
|
| 279 |
+
logger.info("=== ALL AUDIO SEGMENTS ===")
|
| 280 |
+
for i, segment in enumerate(audio_segments):
|
| 281 |
+
logger.info(f"Segment {i}: {segment}")
|
| 282 |
+
if i > 0 and i % 20 == 0: # Print progress every 20 segments
|
| 283 |
+
logger.info(
|
| 284 |
+
f"... printed {i+1}/{len(audio_segments)} segments so far..."
|
| 285 |
+
)
|
| 286 |
+
logger.info("=== End All Audio Segments ===")
|
| 287 |
+
logger.info("=== End Audio Segments Debug ===")
|
| 288 |
+
|
| 289 |
+
# Convert character-level segments back to word-level segments
|
| 290 |
+
# Use the actual alignment timings to preserve silence and natural timing
|
| 291 |
+
aligned_segments = []
|
| 292 |
+
|
| 293 |
+
logger.info(
|
| 294 |
+
f"Converting {len(audio_segments)} character segments to word segments"
|
| 295 |
+
)
|
| 296 |
+
logger.info(f"Original tokens: {transcription_tokens}")
|
| 297 |
+
logger.info(f"Alignment tokens: {alignment_tokens[:20]}...")
|
| 298 |
+
|
| 299 |
+
# Validate that we have segments and tokens
|
| 300 |
+
if not audio_segments or not transcription_tokens:
|
| 301 |
+
logger.warning("No audio segments or transcription tokens available")
|
| 302 |
+
return []
|
| 303 |
+
|
| 304 |
+
# Get actual timing from character segments
|
| 305 |
+
if audio_segments:
|
| 306 |
+
# Use the known segment keys from audio_sentence_alignment
|
| 307 |
+
start_key, duration_key = "segment_start_sec", "segment_duration"
|
| 308 |
+
|
| 309 |
+
first_segment = audio_segments[0]
|
| 310 |
+
last_segment = audio_segments[-1]
|
| 311 |
+
|
| 312 |
+
total_audio_duration = last_segment.get(start_key, 0) + last_segment.get(
|
| 313 |
+
duration_key, 0
|
| 314 |
+
)
|
| 315 |
+
logger.info(
|
| 316 |
+
f"Total audio duration from segments: {total_audio_duration:.3f}s"
|
| 317 |
+
)
|
| 318 |
+
else:
|
| 319 |
+
total_audio_duration = 0.0
|
| 320 |
+
start_key, duration_key = "segment_start_sec", "segment_duration"
|
| 321 |
+
|
| 322 |
+
# Strategy: Group character segments by words using the actual alignment timing
|
| 323 |
+
# This preserves the natural timing including silences from the forced alignment
|
| 324 |
+
|
| 325 |
+
# First, reconstruct the alignment character sequence
|
| 326 |
+
alignment_char_sequence = "".join(alignment_tokens)
|
| 327 |
+
transcription_text = "".join(
|
| 328 |
+
transcription_tokens
|
| 329 |
+
) # Remove spaces for character matching
|
| 330 |
+
|
| 331 |
+
logger.info(f"Alignment sequence length: {len(alignment_char_sequence)}")
|
| 332 |
+
logger.info(f"Transcription length: {len(transcription_text)}")
|
| 333 |
+
|
| 334 |
+
# Create word boundaries based on romanized alignment tokens
|
| 335 |
+
# We need to map each original word to its position in the romanized sequence
|
| 336 |
+
word_boundaries = []
|
| 337 |
+
alignment_pos = 0
|
| 338 |
+
|
| 339 |
+
# Process each word individually to get its romanized representation
|
| 340 |
+
for word in transcription_tokens:
|
| 341 |
+
try:
|
| 342 |
+
# Get romanized version of this individual word
|
| 343 |
+
normalized_word = text_normalize(word.strip(), "en")
|
| 344 |
+
uroman_word_str = get_uroman_tokens([normalized_word], uroman_instance, "en")[0]
|
| 345 |
+
romanized_word_tokens = uroman_word_str.split()
|
| 346 |
+
|
| 347 |
+
word_start = alignment_pos
|
| 348 |
+
word_end = alignment_pos + len(romanized_word_tokens)
|
| 349 |
+
word_boundaries.append((word_start, word_end))
|
| 350 |
+
alignment_pos = word_end
|
| 351 |
+
|
| 352 |
+
logger.info(f"Word '{word}' -> romanized tokens {romanized_word_tokens} -> positions {word_start}-{word_end}")
|
| 353 |
+
|
| 354 |
+
except Exception as e:
|
| 355 |
+
logger.warning(f"Failed to romanize word '{word}': {e}")
|
| 356 |
+
# Fallback: estimate based on character length ratio
|
| 357 |
+
estimated_length = max(1, int(len(word) * len(alignment_tokens) / len(transcription_text)))
|
| 358 |
+
word_start = alignment_pos
|
| 359 |
+
word_end = min(alignment_pos + estimated_length, len(alignment_tokens))
|
| 360 |
+
word_boundaries.append((word_start, word_end))
|
| 361 |
+
alignment_pos = word_end
|
| 362 |
+
|
| 363 |
+
logger.info(f"Word '{word}' (fallback) -> estimated positions {word_start}-{word_end}")
|
| 364 |
+
|
| 365 |
+
logger.info(f"Word boundaries (romanized): {word_boundaries[:5]}...")
|
| 366 |
+
logger.info(f"Total alignment tokens used: {alignment_pos}/{len(alignment_tokens)}")
|
| 367 |
+
|
| 368 |
+
# Map each word to its character segments using the boundaries
|
| 369 |
+
for word_idx, (word, (word_start, word_end)) in enumerate(
|
| 370 |
+
zip(transcription_tokens, word_boundaries)
|
| 371 |
+
):
|
| 372 |
+
# Find character segments that belong to this word
|
| 373 |
+
word_segments = []
|
| 374 |
+
|
| 375 |
+
# Map word character range to alignment token indices
|
| 376 |
+
# Since alignment_tokens might be slightly different due to normalization,
|
| 377 |
+
# we'll be flexible and use a range around the expected positions
|
| 378 |
+
start_idx = max(0, min(word_start, len(audio_segments) - 1))
|
| 379 |
+
end_idx = min(word_end, len(audio_segments))
|
| 380 |
+
|
| 381 |
+
# Ensure we don't go beyond available segments
|
| 382 |
+
for seg_idx in range(start_idx, end_idx):
|
| 383 |
+
if seg_idx < len(audio_segments):
|
| 384 |
+
word_segments.append(audio_segments[seg_idx])
|
| 385 |
+
|
| 386 |
+
if word_segments:
|
| 387 |
+
# Use actual timing from the character segments for this word
|
| 388 |
+
start_times = [seg.get(start_key, 0) for seg in word_segments]
|
| 389 |
+
end_times = [
|
| 390 |
+
seg.get(start_key, 0) + seg.get(duration_key, 0)
|
| 391 |
+
for seg in word_segments
|
| 392 |
+
]
|
| 393 |
+
|
| 394 |
+
start_time = min(start_times) if start_times else 0
|
| 395 |
+
end_time = max(end_times) if end_times else start_time + 0.1
|
| 396 |
+
duration = end_time - start_time
|
| 397 |
+
|
| 398 |
+
# Ensure minimum duration
|
| 399 |
+
if duration < 0.05: # Minimum 50ms
|
| 400 |
+
duration = 0.05
|
| 401 |
+
end_time = start_time + duration
|
| 402 |
+
|
| 403 |
+
logger.debug(
|
| 404 |
+
f"Word '{word}' (segments {start_idx}-{end_idx}, {len(word_segments)} segs): {start_time:.3f}s - {end_time:.3f}s ({duration:.3f}s)"
|
| 405 |
+
)
|
| 406 |
+
else:
|
| 407 |
+
logger.warning(
|
| 408 |
+
f"No segments found for word '{word}' at position {word_start}-{word_end}"
|
| 409 |
+
)
|
| 410 |
+
# Fallback: use proportional timing if no segments found
|
| 411 |
+
if total_audio_duration > 0 and len(transcription_text) > 0:
|
| 412 |
+
start_proportion = word_start / len(transcription_text)
|
| 413 |
+
end_proportion = word_end / len(transcription_text)
|
| 414 |
+
start_time = start_proportion * total_audio_duration
|
| 415 |
+
end_time = end_proportion * total_audio_duration
|
| 416 |
+
duration = end_time - start_time
|
| 417 |
+
else:
|
| 418 |
+
# Ultimate fallback
|
| 419 |
+
word_duration = 0.5
|
| 420 |
+
start_time = word_idx * word_duration
|
| 421 |
+
end_time = start_time + word_duration
|
| 422 |
+
duration = word_duration
|
| 423 |
+
|
| 424 |
+
logger.debug(
|
| 425 |
+
f"Word '{word}' (fallback): {start_time:.3f}s - {end_time:.3f}s"
|
| 426 |
+
)
|
| 427 |
+
|
| 428 |
+
aligned_segments.append(
|
| 429 |
+
{
|
| 430 |
+
"text": word,
|
| 431 |
+
"start": start_time,
|
| 432 |
+
"end": end_time,
|
| 433 |
+
"duration": duration,
|
| 434 |
+
}
|
| 435 |
+
)
|
| 436 |
+
|
| 437 |
+
# Validate segments don't overlap but preserve natural gaps/silences
|
| 438 |
+
for i in range(1, len(aligned_segments)):
|
| 439 |
+
prev_end = aligned_segments[i - 1]["end"]
|
| 440 |
+
current_start = aligned_segments[i]["start"]
|
| 441 |
+
|
| 442 |
+
if current_start < prev_end:
|
| 443 |
+
# Only fix actual overlaps, don't force adjacency
|
| 444 |
+
gap = prev_end - current_start
|
| 445 |
+
logger.debug(
|
| 446 |
+
f"Overlap detected: segment {i-1} ends at {prev_end:.3f}s, segment {i} starts at {current_start:.3f}s (overlap: {gap:.3f}s)"
|
| 447 |
+
)
|
| 448 |
+
|
| 449 |
+
# Fix overlap by adjusting current segment start to previous end
|
| 450 |
+
aligned_segments[i]["start"] = prev_end
|
| 451 |
+
aligned_segments[i]["duration"] = (
|
| 452 |
+
aligned_segments[i]["end"] - aligned_segments[i]["start"]
|
| 453 |
+
)
|
| 454 |
+
logger.debug(
|
| 455 |
+
f"Fixed overlap for segment {i}: adjusted start to {prev_end:.3f}s"
|
| 456 |
+
)
|
| 457 |
+
else:
|
| 458 |
+
# Log natural gaps (this is normal and expected)
|
| 459 |
+
gap = current_start - prev_end
|
| 460 |
+
if gap > 0.1: # Log gaps > 100ms
|
| 461 |
+
logger.debug(
|
| 462 |
+
f"Natural gap preserved: {gap:.3f}s between segments {i-1} and {i}"
|
| 463 |
+
)
|
| 464 |
+
|
| 465 |
+
logger.info(f"Forced alignment completed: {len(aligned_segments)} segments")
|
| 466 |
+
return aligned_segments
|
| 467 |
+
|
| 468 |
+
except Exception as e:
|
| 469 |
+
logger.error(f"Error in forced alignment: {str(e)}", exc_info=True)
|
| 470 |
+
|
| 471 |
+
# Fallback: create uniform timestamps based on audio tensor length
|
| 472 |
+
logger.info("Using fallback uniform timestamps")
|
| 473 |
+
try:
|
| 474 |
+
# Calculate duration from the audio tensor
|
| 475 |
+
total_duration = (
|
| 476 |
+
len(audio_tensor) / sample_rate
|
| 477 |
+
if len(audio_tensor) > 0
|
| 478 |
+
else len(transcription_tokens) * 0.5
|
| 479 |
+
)
|
| 480 |
+
except:
|
| 481 |
+
total_duration = len(transcription_tokens) * 0.5 # Fallback
|
| 482 |
+
|
| 483 |
+
segment_duration = (
|
| 484 |
+
total_duration / len(transcription_tokens) if transcription_tokens else 1.0
|
| 485 |
+
)
|
| 486 |
+
|
| 487 |
+
fallback_segments = []
|
| 488 |
+
for i, token in enumerate(transcription_tokens):
|
| 489 |
+
start_time = i * segment_duration
|
| 490 |
+
end_time = (i + 1) * segment_duration
|
| 491 |
+
|
| 492 |
+
fallback_segments.append(
|
| 493 |
+
{
|
| 494 |
+
"text": token,
|
| 495 |
+
"start": start_time,
|
| 496 |
+
"end": end_time,
|
| 497 |
+
"duration": segment_duration,
|
| 498 |
+
}
|
| 499 |
+
)
|
| 500 |
+
|
| 501 |
+
logger.info(
|
| 502 |
+
f"Using fallback uniform timestamps: {len(fallback_segments)} segments"
|
| 503 |
+
)
|
| 504 |
+
return fallback_segments
|
| 505 |
+
|
| 506 |
+
|
| 507 |
+
def transcribe_with_word_alignment(audio_tensor: torch.Tensor, sample_rate: int = 16000, language_with_script: str = None) -> Dict:
    """
    Transcribe an audio tensor and attach word-level timestamps via forced alignment.

    Args:
        audio_tensor (torch.Tensor): 1D waveform tensor.
        sample_rate (int): Sample rate of the waveform.
        language_with_script (str): 3-letter ISO language code plus script
            (e.g. "eng_Latn"), forwarded to the transcription model.

    Returns:
        Dict: transcription text, whitespace tokens, per-word aligned segments,
        and the total aligned duration. On alignment failure the dict carries an
        "alignment_error" key; on total failure an "error" key.
    """
    try:
        # Resolve the compute device before alignment work begins.
        target_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Plain transcription first; alignment is layered on top of its output.
        text = transcribe_single_chunk(audio_tensor, sample_rate=sample_rate, language_with_script=language_with_script)
        if not text:
            # Nothing recognized: return an empty-but-well-formed result.
            return {
                "transcription": "",
                "tokens": [],
                "aligned_segments": [],
                "total_duration": 0.0,
            }

        # Whitespace tokenization defines the word units to align.
        words = text.split()

        logger.info("Performing forced alignment with original audio tensor...")
        segments = perform_forced_alignment(audio_tensor, words, target_device, sample_rate)

        # The end of the last aligned segment is the total covered duration.
        overall_duration = segments[-1]["end"] if segments else 0.0

        logger.info(
            f"Transcription with alignment completed: {len(segments)} segments, {overall_duration:.2f}s total"
        )
        return {
            "transcription": text,
            "tokens": words,
            "aligned_segments": segments,
            "total_duration": overall_duration,
            "num_segments": len(segments),
        }

    except Exception as e:
        logger.error(f"Error in transcription with alignment: {str(e)}", exc_info=True)
        # Degrade gracefully: retry the plain transcription without alignment.
        try:
            text = transcribe_single_chunk(audio_tensor, sample_rate=sample_rate, language_with_script=language_with_script)
            return {
                "transcription": text,
                "tokens": text.split() if text else [],
                "aligned_segments": [],
                "total_duration": 0.0,
                "alignment_error": str(e),
            }
        except Exception as e2:
            logger.error(f"Error in fallback transcription: {str(e2)}", exc_info=True)
            return {
                "transcription": "",
                "tokens": [],
                "aligned_segments": [],
                "total_duration": 0.0,
                "error": str(e2),
            }
def _validate_and_adjust_segments(
    aligned_segments: List[Dict],
    chunk_start_time: float,
    chunk_audio_tensor: torch.Tensor,
    chunk_sample_rate: int,
    chunk_duration: float,
    chunk_index: int
) -> List[Dict]:
    """
    Private helper function to validate and adjust segment timestamps to global timeline.

    Each segment arrives with timestamps relative to the start of its chunk;
    they are clipped/repaired against the chunk's *actual* audio length and
    then shifted by ``chunk_start_time`` into the global timeline.

    Args:
        aligned_segments: Raw segments from forced alignment (local chunk timeline)
        chunk_start_time: Start time of this chunk in global timeline
        chunk_audio_tensor: Audio tensor for this chunk (to get actual duration)
        chunk_sample_rate: Sample rate of the chunk
        chunk_duration: Reported duration of the chunk
        chunk_index: Index of this chunk for debugging

    Returns:
        List of validated segments with global timeline timestamps
    """
    adjusted_segments = []

    # Get the actual audio duration from the chunk tensor instead of the potentially incorrect chunk duration
    # NOTE(review): assumes chunk_audio_tensor is 1D (len == sample count) — confirm with caller.
    actual_chunk_duration = len(chunk_audio_tensor) / chunk_sample_rate if len(chunk_audio_tensor) > 0 else chunk_duration

    for segment in aligned_segments:
        original_start = segment["start"]
        original_end = segment["end"]

        # Validate that segment timestamps are within chunk boundaries
        if original_start < 0:
            logger.warning(
                f"Segment '{segment['text']}' has negative start time {original_start:.3f}s, clipping to 0"
            )
            original_start = 0

        if original_end > actual_chunk_duration + 1.0:  # Allow 1s buffer for alignment errors
            logger.warning(
                f"Segment '{segment['text']}' end time {original_end:.3f}s exceeds actual chunk duration {actual_chunk_duration:.3f}s, clipping"
            )
            original_end = actual_chunk_duration

        if original_start >= original_end:
            logger.warning(
                f"Segment '{segment['text']}' has invalid timing {original_start:.3f}s-{original_end:.3f}s, using fallback"
            )
            # Use proportional timing based on segment position using actual chunk duration
            # NOTE: segment_index is derived from the number of segments already
            # appended, so this fallback is sensitive to loop order — do not reorder.
            segment_index = len(adjusted_segments)
            total_segments = len(aligned_segments)
            if total_segments > 0:
                segment_proportion = segment_index / total_segments
                next_proportion = (segment_index + 1) / total_segments
                original_start = segment_proportion * actual_chunk_duration
                original_end = next_proportion * actual_chunk_duration
            else:
                original_start = 0
                original_end = 0.5

        # Create segment with absolute timeline
        adjusted_segment = {
            "text": segment["text"],
            "start": original_start + chunk_start_time,  # Global timeline
            "end": original_end + chunk_start_time,  # Global timeline
            "duration": original_end - original_start,
            "chunk_index": chunk_index,
            "original_start": original_start,  # Local chunk time
            "original_end": original_end,  # Local chunk time
        }

        adjusted_segments.append(adjusted_segment)

        logger.debug(
            f"Segment '{segment['text']}': {original_start:.3f}-{original_end:.3f} -> {adjusted_segment['start']:.3f}-{adjusted_segment['end']:.3f}"
        )

    logger.info(
        f"Adjusted {len(adjusted_segments)} segments to absolute timeline (chunk starts at {chunk_start_time:.2f}s)"
    )

    return adjusted_segments
def transcribe_full_audio_with_chunking(
    audio_tensor: torch.Tensor, sample_rate: int = 16000, chunk_duration: float = 30.0, language_with_script: str = None, progress_callback=None
) -> Dict:
    """
    Complete audio transcription pipeline that handles any length audio with intelligent chunking.
    This is the full-featured transcription function that can process both short and long audio files.

    Chunking mode is controlled by USE_CHUNKING environment variable:
    - USE_CHUNKING=false: No chunking (single chunk mode)
    - USE_CHUNKING=true (default): VAD-based intelligent chunking

    Args:
        audio_tensor: Audio tensor (1D waveform)
        sample_rate: Sample rate of the audio tensor
        chunk_duration: Target chunk duration in seconds (for static chunking)
        language_with_script: {Language code}_{script} for transcription
        progress_callback: Optional callback for progress updates
            NOTE(review): currently never invoked — progress is reported only
            through the module-level ``transcription_status`` object.

    Returns:
        Dict with full transcription and segment information including word-level timestamps
    """

    try:
        logger.info(f"Starting long-form transcription: tensor shape {audio_tensor.shape} at {sample_rate}Hz")
        logger.info(f"USE_CHUNKING = {USE_CHUNKING}")

        # Initialize chunker
        chunker = AudioChunker()

        # Determine chunking mode based on USE_CHUNKING setting
        chunking_mode = "vad" if USE_CHUNKING else "none"

        # Chunk the audio using the new unified interface
        # Ensure tensor is 1D before chunking (squeeze any extra dimensions)
        if len(audio_tensor.shape) > 1:
            logger.info(f"Squeezing audio tensor from {audio_tensor.shape} to 1D")
            audio_tensor_1d = audio_tensor.squeeze()
        else:
            audio_tensor_1d = audio_tensor

        chunks = chunker.chunk_audio(audio_tensor_1d, sample_rate=sample_rate, mode=chunking_mode, chunk_duration=chunk_duration)

        if not chunks:
            logger.warning("No audio chunks created")
            return {
                "transcription": "",
                "chunks": [],
                "total_duration": 0.0,
                "error": "No audio content detected",
            }

        logger.info(f"Processing {len(chunks)} audio chunks (mode: {chunking_mode})")

        # Validate chunk continuity
        for i, chunk in enumerate(chunks):
            logger.info(
                f"Chunk {i+1}: {chunk['start_time']:.2f}s - {chunk['end_time']:.2f}s ({chunk['duration']:.2f}s)"
            )
            if i > 0:
                prev_end = chunks[i - 1]["end_time"]
                current_start = chunk["start_time"]
                gap = current_start - prev_end
                if abs(gap) > 0.1:  # More than 100ms gap/overlap
                    logger.warning(
                        f"Gap/overlap between chunks {i} and {i+1}: {gap:.3f}s"
                    )

        # Process each chunk - now all chunks have uniform format!
        all_segments = []
        full_transcription_parts = []
        total_duration = 0.0
        chunk_details = []

        for i, chunk in enumerate(chunks):
            logger.info(
                f"Processing chunk {i+1}/{len(chunks)} ({chunk['duration']:.1f}s, {chunk['start_time']:.1f}s-{chunk['end_time']:.1f}s)"
            )

            try:
                # Process this chunk using tensor-based transcription pipeline
                # Use the chunk's audio_data tensor directly - no more file operations!
                chunk_audio_tensor = chunk["audio_data"]
                chunk_sample_rate = chunk["sample_rate"]

                chunk_result = transcribe_with_word_alignment(
                    audio_tensor=chunk_audio_tensor,
                    sample_rate=chunk_sample_rate,
                    language_with_script=language_with_script
                )

                # Process alignment results - uniform handling for all chunk types
                chunk_segments = []
                chunk_start_time = chunk["start_time"]
                # NOTE(review): this shadows the ``chunk_duration`` parameter for
                # the remainder of the loop body (intentional?).
                chunk_duration = chunk["duration"]

                if chunk_result.get("aligned_segments"):
                    logger.info(
                        f"Chunk {i+1} has {len(chunk_result['aligned_segments'])} segments"
                    )

                    chunk_segments = _validate_and_adjust_segments(
                        aligned_segments=chunk_result["aligned_segments"],
                        chunk_start_time=chunk_start_time,
                        chunk_audio_tensor=chunk_audio_tensor,
                        chunk_sample_rate=chunk_sample_rate,
                        chunk_duration=chunk_duration,
                        chunk_index=i
                    )

                    all_segments.extend(chunk_segments)
                    logger.info(f"Chunk {i+1} processed {len(chunk_segments)} valid segments")

                # Add to full transcription
                chunk_transcription = ""
                if chunk_result.get("transcription"):
                    chunk_transcription = chunk_result["transcription"]
                    full_transcription_parts.append(chunk_transcription)

                # Store detailed chunk information
                chunk_detail = {
                    "chunk_index": i,
                    "start_time": chunk["start_time"],
                    "end_time": chunk["end_time"],
                    "duration": chunk["duration"],
                    "transcription": chunk_transcription,
                    "num_segments": len(chunk_segments),
                    "segments": chunk_segments,
                }
                chunk_details.append(chunk_detail)

                total_duration = max(total_duration, chunk["end_time"])

                # Update progress linearly from 0.1 to 0.9 based on chunk processing
                progress = 0.1 + (0.8 * (i + 1) / len(chunks))
                transcription_status.update_progress(progress)

                logger.info(
                    f"Chunk {i+1} processed: '{chunk_transcription}' ({len(chunk_segments)} segments)"
                )

            except Exception as chunk_error:
                # Best-effort: one bad chunk does not abort the whole job.
                logger.error(f"Error processing chunk {i+1}: {chunk_error}")
                # Continue with next chunk

        # Combine results
        full_transcription = " ".join(full_transcription_parts)

        # Validate segment continuity
        logger.info("Validating segment continuity...")
        for i in range(1, len(all_segments)):
            prev_end = all_segments[i - 1]["end"]
            current_start = all_segments[i]["start"]
            gap = current_start - prev_end
            if abs(gap) > 1.0:  # More than 1 second gap
                logger.warning(f"Large gap between segments {i-1} and {i}: {gap:.3f}s")

        result = {
            "transcription": full_transcription,
            "aligned_segments": all_segments,
            "chunks": [
                {
                    "chunk_index": chunk_detail["chunk_index"],
                    "start_time": chunk_detail["start_time"],
                    "end_time": chunk_detail["end_time"],
                    "duration": chunk_detail["duration"],
                    "transcription": chunk_detail["transcription"],
                    "num_segments": chunk_detail["num_segments"],
                }
                for chunk_detail in chunk_details
            ],
            "chunk_details": chunk_details,  # Full details including segments per chunk
            "total_duration": total_duration,
            "num_chunks": len(chunks),
            "num_segments": len(all_segments),
            "status": "success",
        }

        logger.info(
            f"Long-form transcription completed: {len(chunks)} chunks, {total_duration:.1f}s total"
        )
        logger.info(f"Total segments: {len(all_segments)}")

        # Log chunk timing summary
        for chunk_detail in chunk_details:
            logger.info(
                f"Chunk {chunk_detail['chunk_index']}: {chunk_detail['start_time']:.2f}-{chunk_detail['end_time']:.2f}s, {chunk_detail['num_segments']} segments"
            )

        return result

    except Exception as e:
        logger.error(f"Error in long-form transcription: {str(e)}", exc_info=True)
        return {
            "transcription": "",
            "chunks": [],
            "total_duration": 0.0,
            "error": str(e),
        }
server/convert_media_to_wav.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Media to WAV Converter Module
|
| 3 |
+
|
| 4 |
+
Converts various media formats (m4a, mp3, mp4, etc.) to standardized WAV files
|
| 5 |
+
and PyTorch tensors for audio transcription pipelines.
|
| 6 |
+
|
| 7 |
+
Standardization:
|
| 8 |
+
- 16kHz sample rate
|
| 9 |
+
- Mono channel (merged if multi-channel)
|
| 10 |
+
- Layer normalized
|
| 11 |
+
- bfloat16 dtype tensor
|
| 12 |
+
- Fail-fast error handling
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import os
|
| 16 |
+
import tempfile
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
from typing import Tuple, Union, Optional
|
| 19 |
+
|
| 20 |
+
import librosa
|
| 21 |
+
import numpy as np
|
| 22 |
+
import soundfile as sf
|
| 23 |
+
import torch
|
| 24 |
+
import torch.nn.functional as F
|
| 25 |
+
from pydub import AudioSegment
|
| 26 |
+
from pydub.utils import which
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# Constants
|
| 30 |
+
TARGET_SAMPLE_RATE = 16000
|
| 31 |
+
TARGET_DTYPE = torch.bfloat16
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def verify_ffmpeg_installation():
    """Raise RuntimeError when the `ffmpeg` binary cannot be found on PATH."""
    ffmpeg_location = which("ffmpeg")
    if ffmpeg_location:
        return
    raise RuntimeError(
        "FFmpeg not found. Please install FFmpeg for media format support. "
        "On Ubuntu: sudo apt install ffmpeg"
    )
def layer_norm(tensor: torch.Tensor, shape: torch.Size) -> torch.Tensor:
    """Normalize *tensor* to zero mean and unit standard deviation.

    Constant (zero-variance) inputs are only mean-centred, avoiding a
    division by zero. The *shape* argument is accepted for API compatibility
    but is not consulted.
    """
    centered = tensor - tensor.mean()
    scale = tensor.std()
    if scale != 0:
        centered = centered / scale
    return centered
def detect_media_format(file_path: str) -> str:
    """Infer the media container format from a file's extension (lowercased)."""
    known_formats = {
        '.wav': 'wav',
        '.mp3': 'mp3',
        '.m4a': 'm4a',
        '.aac': 'aac',
        '.flac': 'flac',
        '.ogg': 'ogg',
        '.wma': 'wma',
        '.mp4': 'mp4',
        '.avi': 'avi',
        '.mov': 'mov',
        '.mkv': 'mkv',
    }

    suffix = Path(file_path).suffix.lower()
    if suffix in known_formats:
        return known_formats[suffix]

    # Unknown extension: strip the leading dot (if present) and pass it through,
    # letting downstream conversion fail with a detailed error message.
    return suffix[1:] if suffix.startswith('.') else suffix
def convert_to_wav_with_pydub(input_path: str, output_path: str, format_hint: str = None):
    """Decode *input_path* via pydub (FFmpeg backend) and write it out as WAV."""
    verify_ffmpeg_installation()

    # An explicit hint bypasses pydub's auto-detection when the caller knows the format.
    loader_kwargs = {"format": format_hint} if format_hint else {}
    segment = AudioSegment.from_file(input_path, **loader_kwargs)

    # Only the container is converted here; resampling and mono-mixing happen
    # later during the librosa post-processing step.
    segment.export(output_path, format="wav")
def process_wav_to_standard_format(wav_path: str) -> Tuple[np.ndarray, int]:
    """Load a WAV file and return (mono float32 samples, TARGET_SAMPLE_RATE)."""
    # Load at the file's native rate first; librosa resamples better than pydub.
    samples, native_rate = librosa.load(wav_path, sr=None)

    if native_rate != TARGET_SAMPLE_RATE:
        samples = librosa.resample(samples, orig_sr=native_rate, target_sr=TARGET_SAMPLE_RATE)

    # Collapse any extra channel dimension to mono by averaging channels.
    if samples.ndim > 1:
        samples = np.mean(samples, axis=0)

    # Guarantee a flat float32 ndarray for the downstream tensor conversion.
    return np.asarray(samples, dtype=np.float32), TARGET_SAMPLE_RATE
def create_normalized_tensor(audio_data: np.ndarray) -> torch.Tensor:
    """Convert mono float audio samples into a normalized, model-ready tensor.

    Args:
        audio_data: 1D float32 waveform (as produced by
            ``process_wav_to_standard_format``).

    Returns:
        torch.Tensor: shape ``(1, num_samples)``, dtype ``TARGET_DTYPE``
        (bfloat16), layer-normalized and placed on GPU when available.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # torch.from_numpy + explicit cast replaces the legacy torch.Tensor()
    # constructor; using TARGET_DTYPE fixes the previously hard-coded
    # torch.bfloat16 so the dtype stays consistent with the module constant.
    data = torch.from_numpy(np.ascontiguousarray(audio_data)).to(TARGET_DTYPE)
    data = layer_norm(data, data.shape)
    # Add the leading batch dimension expected by the model.
    return data.unsqueeze(0).to(device)
def convert_media_to_wav(
    input_path: str,
    output_dir: Optional[str] = None,
    keep_temp_wav: bool = True
) -> Tuple[str, torch.Tensor]:
    """
    Convert a media file into a standardized WAV plus a normalized tensor.

    Args:
        input_path: Path to input media file
        output_dir: Directory for output WAV file (default: temp directory)
        keep_temp_wav: Accepted for API compatibility; the WAV is always kept here

    Returns:
        Tuple of (wav_file_path, normalized_tensor)

    Raises:
        ValueError: If file format is unsupported
        RuntimeError: If FFmpeg is not available
        FileNotFoundError: If input file doesn't exist
    """
    # Fail fast on a missing input file.
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path}")

    source_path = os.path.abspath(input_path)
    media_format = detect_media_format(source_path)

    # Default the output location to the system temp directory.
    target_dir = output_dir if output_dir is not None else tempfile.gettempdir()
    input_name = Path(source_path).stem
    output_wav_path = os.path.join(target_dir, f"{input_name}_converted.wav")

    # Step 1: container conversion via pydub/FFmpeg. WAV inputs are still
    # routed through pydub so their container is normalized too.
    convert_to_wav_with_pydub(source_path, output_wav_path, media_format)

    # Step 2: resample/mono-mix to the standard format with librosa.
    audio_data, sample_rate = process_wav_to_standard_format(output_wav_path)

    # Step 3: build the normalized tensor for the model.
    normalized_tensor = create_normalized_tensor(audio_data)

    # Step 4: overwrite the temp WAV with the fully processed audio.
    sf.write(output_wav_path, audio_data, sample_rate)

    return output_wav_path, normalized_tensor
def convert_media_to_wav_from_bytes(
    media_bytes: bytes,
    original_filename: str,
    output_dir: Optional[str] = None
) -> Tuple[str, torch.Tensor]:
    """
    Convert media from bytes to WAV file and tensor.

    Args:
        media_bytes: Raw media file bytes
        original_filename: Original filename, used only for extension-based
            format detection
        output_dir: Directory for output files

    Returns:
        Tuple of (wav_file_path, normalized_tensor)
    """
    # Persist the bytes to a temporary file so the path-based converter can run.
    input_extension = Path(original_filename).suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=input_extension) as temp_input:
        temp_input.write(media_bytes)
        temp_input_path = temp_input.name

    try:
        # Convert using the main function
        return convert_media_to_wav(temp_input_path, output_dir)
    finally:
        # Bug fix: previously the temp input file leaked whenever conversion
        # raised; cleanup now runs on both success and failure.
        os.unlink(temp_input_path)
# Utility function for getting audio info
|
| 217 |
+
def get_media_info(file_path: str) -> dict:
    """Return basic metadata (duration, rate, channels, width, format) for a media file."""
    verify_ffmpeg_installation()

    segment = AudioSegment.from_file(file_path)

    # pydub reports length in milliseconds; expose seconds to callers.
    info = {
        "duration_seconds": len(segment) / 1000.0,
        "frame_rate": segment.frame_rate,
        "channels": segment.channels,
        "sample_width": segment.sample_width,
        "format": detect_media_format(file_path),
    }
    return info
if __name__ == "__main__":
    # Simple CLI: convert a single media file and report the results.
    import sys

    if len(sys.argv) != 2:
        print("Usage: python convert_media_to_wav.py <input_file>")
        sys.exit(1)

    source_file = sys.argv[1]

    print(f"Converting {source_file}...")
    converted_wav, audio_tensor = convert_media_to_wav(source_file)

    for summary_line in (
        f"✓ WAV file: {converted_wav}",
        f"✓ Tensor shape: {audio_tensor.shape}",
        f"✓ Tensor dtype: {audio_tensor.dtype}",
        f"✓ Tensor device: {audio_tensor.device}",
    ):
        print(summary_line)

    # Report container-level metadata as well.
    media_info = get_media_info(source_file)
    print(f"✓ Media info: {media_info}")
server/download_models.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#@title download model /content/omniasr-transcriptions/server/download_models.sh
|
| 2 |
+
|
| 3 |
+
# %%writefile /content/omniasr-transcriptions/server/download_models.py
|
| 4 |
+
#!/usr/bin/env python3
|
| 5 |
+
"""
|
| 6 |
+
download_models.py
|
| 7 |
+
Ensures the MMS model files are downloaded into MODELS_DIR.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import urllib.request
|
| 12 |
+
import urllib.error
|
| 13 |
+
from tqdm.auto import tqdm
|
| 14 |
+
import sys
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def download_file(url: str, download_file_path: str, redownload: bool = False) -> bool:
    """Download a single file with urllib + tqdm progress bar.

    Args:
        url: Source URL to fetch.
        download_file_path: Destination path; parent directories are created.
        redownload: When True, delete any existing file and fetch it again.

    Returns:
        True on success (or when a non-empty file already exists and
        ``redownload`` is False); False on any network failure.
    """
    base_path = os.path.dirname(download_file_path)
    os.makedirs(base_path, exist_ok=True)

    # Skip if file already exists
    if os.path.exists(download_file_path):
        if redownload:
            os.remove(download_file_path)
            tqdm.write(f"♻️ Redownloading: {os.path.basename(download_file_path)}")
        elif os.path.getsize(download_file_path) > 0:
            tqdm.write(f"✔️ Skipped (already exists): {os.path.basename(download_file_path)}")
            return True
        # A zero-byte leftover falls through and is re-fetched below.

    # Probe the URL first so the progress bar can show a total size.
    # BUGFIX: use a context manager so the probe connection is closed —
    # the original leaked the response object returned by urlopen().
    try:
        with urllib.request.urlopen(url) as request:
            total = int(request.headers.get("Content-Length", 0))
    except urllib.error.URLError as e:
        print(f"❌ Error: Unable to open URL: {url}")
        print(f"Reason: {e.reason}")
        return False

    # Download with progress bar
    with tqdm(
        total=total,
        desc=os.path.basename(download_file_path),
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
    ) as progress:
        try:
            urllib.request.urlretrieve(
                url,
                download_file_path,
                # urlretrieve reports (block count, block size, total); we only
                # need the per-callback byte increment.
                reporthook=lambda count, block_size, total_size: progress.update(block_size),
            )
        except urllib.error.URLError as e:
            print(f"❌ Error: Failed to download {url}")
            print(f"Reason: {e.reason}")
            return False

    tqdm.write(f"⬇️ Downloaded: {os.path.basename(download_file_path)}")
    return True
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def main():
    """Ensure both MMS alignment model files exist under MODELS_DIR, downloading as needed."""
    # Use MODELS_DIR from environment variable or default
    models_dir = os.environ.get("MODELS_DIR", "./models")
    print(f"📁 Checking and downloading MMS models to: {models_dir}")

    # Check write permission
    parent = os.path.dirname(models_dir) or "."
    if not os.access(parent, os.W_OK):
        print(f"✗ No write permission to {models_dir}")
        sys.exit(1)

    # ✅ Define URLs and build full local paths here
    base_url = "https://dl.fbaipublicfiles.com/mms/torchaudio/ctc_alignment_mling_uroman"
    model_urls = {
        f"{base_url}/dictionary.txt":
            os.path.join(models_dir, "ctc_alignment_mling_uroman_model_dict.txt"),
        f"{base_url}/model.pt":
            os.path.join(models_dir, "ctc_alignment_mling_uroman_model.pt"),
    }

    for url, destination in model_urls.items():
        if not download_file(url, destination):
            print(f"✗ Failed to fetch: {os.path.basename(destination)}")
            sys.exit(1)

    print("✅ All model files are ready!")


# Runs at import time on purpose (notebook/Colab workflow); the guard below
# is deliberately commented out in this setup.
main()
# if __name__ == "__main__":
#     main()
|
server/env_vars.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#@title change model name at /content/omniasr-transcriptions/server/env_vars.py
# %%writefile /content/omniasr-transcriptions/server/env_vars.py
"""Centralized environment-variable configuration for the server."""
import logging
import os

# Logging level name, e.g. "DEBUG"/"INFO"/"WARNING" (see logging._nameToLevel)
log_level = os.environ.get("API_LOG_LEVEL", "INFO")
# BUGFIX: the original `_nameToLevel.get(log_level)` returned None for an
# unrecognized name, which breaks logging configuration downstream.
# Accept case-insensitive names and fall back to INFO.
API_LOG_LEVEL = logging._nameToLevel.get(log_level.upper(), logging.INFO)

# MMS Model Configuration
MODEL_NAME = os.environ.get("MODEL_NAME", "omniASR_LLM_1B")  # Model name for pipeline

# Audio Processing Configuration
# Whether to use audio chunking ("true"/"false", case-insensitive)
USE_CHUNKING = os.environ.get("USE_CHUNKING", "true").lower() == "true"
|
server/inference/align_utils.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#@title fix file path /content/omniasr-transcriptions/server/inference/align_utils.py
|
| 2 |
+
# %%writefile /content/omniasr-transcriptions/server/inference/align_utils.py
|
| 3 |
+
import math
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
import tempfile
|
| 7 |
+
import logging
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
|
| 10 |
+
import torch
|
| 11 |
+
from torchaudio.models import wav2vec2_model
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
# iso codes with specialized rules in uroman
# BUGFIX: the original comma-separated string listed "ell" and "eng" twice;
# the duplicates are removed. Membership tests (`iso in special_isos_uroman`)
# behave identically.
special_isos_uroman = [
    "ara", "bel", "bul", "deu", "ell", "eng", "fas", "grc", "heb", "kaz",
    "kir", "lav", "lit", "mkd", "mkd2", "oss", "pnt", "pus", "rus", "srp",
    "srp2", "tur", "uig", "ukr", "yid",
]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def normalize_uroman(text):
    """Lowercase *text*, keep only [a-z], apostrophes and spaces, and collapse runs of spaces."""
    lowered = text.lower()
    # Anything outside the romanized alphabet becomes a space...
    cleaned = re.sub(r"[^a-z' ]", " ", lowered)
    # ...and consecutive spaces collapse to one.
    collapsed = re.sub(r" +", " ", cleaned)
    return collapsed.strip()
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def get_uroman_tokens(norm_transcripts, uroman, iso=None):
    """Romanize transcripts with uroman and return normalized, space-separated character tokens.

    Args:
        norm_transcripts: List of transcript strings, one per line to romanize.
        uroman: A uroman object exposing ``romanize_file`` (project dependency).
        iso: Optional ISO language code; passed to uroman only when it has
            specialized rules for that code (see ``special_isos_uroman``).

    Returns:
        List of romanized strings, same length/order as ``norm_transcripts``,
        with each character space-separated and normalized via ``normalize_uroman``.
    """
    # BUGFIX: the original used two NamedTemporaryFile objects and reopened
    # them by name, leaving the handles open (and failing on Windows, where a
    # NamedTemporaryFile cannot be reopened while held). A TemporaryDirectory
    # gives deterministic cleanup of both files.
    with tempfile.TemporaryDirectory() as tmp_dir:
        in_path = os.path.join(tmp_dir, "uroman_input.txt")
        out_path = os.path.join(tmp_dir, "uroman_output.txt")

        with open(in_path, "w") as f:
            for t in norm_transcripts:
                f.write(t + "\n")

        uroman.romanize_file(
            input_filename=in_path,
            output_filename=out_path,
            lcode=iso if iso in special_isos_uroman else None,
        )

        outtexts = []
        with open(out_path) as f:
            for line in f:
                # Space-separate every character, then collapse whitespace.
                line = " ".join(line.strip())
                line = re.sub(r"\s+", " ", line).strip()
                outtexts.append(line)

    # uroman must emit exactly one output line per input line.
    assert len(outtexts) == len(norm_transcripts)
    return [normalize_uroman(ot) for ot in outtexts]
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@dataclass
class Segment:
    """A run of identical CTC frames carrying one emitted token."""

    label: str  # emitted token (a single uroman character or "<blank>")
    start: int  # first frame index of the run
    end: int    # last frame index; merge_repeats stores an inclusive end (i2 - 1)
                # NOTE(review): __repr__ prints half-open "[start, end)" notation,
                # which disagrees with that — confirm which convention consumers expect.

    def __repr__(self):
        return f"{self.label}: [{self.start:5d}, {self.end:5d})"

    @property
    def length(self):
        # Frame count under the half-open reading; one less than the
        # inclusive-end length.
        return self.end - self.start
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def merge_repeats(path, idx_to_token_map):
    """Collapse consecutive identical entries of *path* into labelled Segments.

    Each maximal run of equal token indices becomes one Segment whose start is
    the first frame of the run and whose end is the last frame (inclusive).
    """
    segments = []
    run_start = 0
    while run_start < len(path):
        # Advance run_end past every frame equal to the one at run_start.
        run_end = run_start
        while run_end < len(path) and path[run_end] == path[run_start]:
            run_end += 1
        segments.append(
            Segment(idx_to_token_map[path[run_start]], run_start, run_end - 1)
        )
        run_start = run_end
    return segments
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def time_to_frame(time):
    """Convert a time in seconds to a frame index at a 20 ms stride (50 frames/s)."""
    frames_per_second = 1000 / 20  # 20 ms stride
    return int(time * frames_per_second)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def load_model_dict():
    """Load the MMS CTC-alignment wav2vec2 model and its token dictionary.

    Both files are read from $MODELS_DIR (default "./models"):
      - ctc_alignment_mling_uroman_model.pt        (state dict)
      - ctc_alignment_mling_uroman_model_dict.txt  (one token per line)

    Returns:
        Tuple of (model, dictionary): the eval-mode wav2vec2 model on CPU and
        a token -> index mapping.

    Raises:
        FileNotFoundError: If either file is missing.
    """
    # Read MODELS_DIR once (the original re-read the env var a second time
    # before loading the dictionary — redundant).
    models_dir = os.environ.get("MODELS_DIR", "./models")
    model_path_name = os.path.join(models_dir, "ctc_alignment_mling_uroman_model.pt")

    logger.info("Loading model from models directory...")
    if not os.path.exists(model_path_name):
        raise FileNotFoundError(f"Model file not found at {model_path_name}")
    logger.info(f"Model found at: {model_path_name}")
    state_dict = torch.load(model_path_name, map_location="cpu")

    # Architecture must match the checkpoint exactly (MMS multilingual
    # uroman alignment model; 31 output tokens).
    model = wav2vec2_model(
        extractor_mode="layer_norm",
        extractor_conv_layer_config=[
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        extractor_conv_bias=True,
        encoder_embed_dim=1024,
        encoder_projection_dropout=0.0,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=24,
        encoder_num_heads=16,
        encoder_attention_dropout=0.0,
        encoder_ff_interm_features=4096,
        encoder_ff_interm_dropout=0.1,
        encoder_dropout=0.0,
        encoder_layer_norm_first=True,
        encoder_layer_drop=0.1,
        aux_num_out=31,
    )
    model.load_state_dict(state_dict)
    model.eval()

    dict_path_name = os.path.join(
        models_dir, "ctc_alignment_mling_uroman_model_dict.txt"
    )
    if not os.path.exists(dict_path_name):
        raise FileNotFoundError(f"Dictionary file not found at {dict_path_name}")
    logger.info(f"Dictionary found at: {dict_path_name}")
    # One token per line; line number becomes the token index.
    with open(dict_path_name) as f:
        dictionary = {line.strip(): i for i, line in enumerate(f)}

    return model, dictionary
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def get_spans(tokens, segments):
    """Group per-character CTC segments into one span of Segments per token.

    Args:
        tokens: uroman tokens, each a space-separated string of characters
            (as produced by get_uroman_tokens).
        segments: output of merge_repeats — one Segment per collapsed CTC
            frame run, including "<blank>" segments.

    Returns:
        A list with one list of Segments per token, padded on either side with
        part of the neighbouring "<blank>" (silence) segment: all of it at the
        audio edges, roughly half of it between adjacent tokens.
    """
    ltr_idx = 0        # index of the current character within the current token
    tokens_idx = 0     # index of the current token
    intervals = []     # (first_segment_idx, last_segment_idx) per token
    start, end = (0, 0)
    sil = "<blank>"
    # Pass 1: walk the segments and record which segment range each token covers.
    for seg_idx, seg in enumerate(segments):
        if tokens_idx == len(tokens):
            # All tokens consumed: only a trailing blank segment may remain.
            assert seg_idx == len(segments) - 1
            assert seg.label == "<blank>"
            continue
        cur_token = tokens[tokens_idx].split(" ")
        ltr = cur_token[ltr_idx]
        if seg.label == "<blank>":
            continue
        # Non-blank segments must line up with the expected character.
        assert seg.label == ltr
        if (ltr_idx) == 0:
            start = seg_idx
        if ltr_idx == len(cur_token) - 1:
            # Last character of the token: close its interval.
            ltr_idx = 0
            tokens_idx += 1
            intervals.append((start, seg_idx))
            # Empty tokens are assigned zero-width intervals at this position.
            while tokens_idx < len(tokens) and len(tokens[tokens_idx]) == 0:
                intervals.append((seg_idx, seg_idx))
                tokens_idx += 1
        else:
            ltr_idx += 1
    # Pass 2: expand each interval into Segments, absorbing neighbouring silence.
    spans = []
    for idx, (start, end) in enumerate(intervals):
        span = segments[start : end + 1]
        if start > 0:
            prev_seg = segments[start - 1]
            if prev_seg.label == sil:
                # First token takes the whole leading silence; interior tokens
                # take from its midpoint.
                pad_start = (
                    prev_seg.start
                    if (idx == 0)
                    else int((prev_seg.start + prev_seg.end) / 2)
                )
                span = [Segment(sil, pad_start, span[0].start)] + span
        if end + 1 < len(segments):
            next_seg = segments[end + 1]
            if next_seg.label == sil:
                # Last token takes the whole trailing silence; interior tokens
                # take up to its midpoint.
                pad_end = (
                    next_seg.end
                    if (idx == len(intervals) - 1)
                    else math.floor((next_seg.start + next_seg.end) / 2)
                )
                span = span + [Segment(sil, span[-1].end, pad_end)]
        spans.append(span)
    return spans
|
server/inference/audio_chunker.py
ADDED
|
@@ -0,0 +1,359 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torchaudio
|
| 3 |
+
import numpy as np
|
| 4 |
+
import logging
|
| 5 |
+
import tempfile
|
| 6 |
+
import os
|
| 7 |
+
import threading
|
| 8 |
+
from typing import List, Tuple, Dict, Optional, Any
|
| 9 |
+
import silero_vad
|
| 10 |
+
import soundfile as sf
|
| 11 |
+
import librosa
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
TARGET_CHUNK_DURATION = 30.0  # preferred chunk length for VAD-based chunking (seconds)
MIN_CHUNK_DURATION = 5.0      # static chunks shorter than this are dropped (seconds)
SAMPLE_RATE = 16000           # all audio is expected at 16 kHz before chunking
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class AudioChunker:
    """
    Handles audio chunking with different strategies:
    - 'none': Single chunk (no chunking)
    - 'vad': VAD-based intelligent chunking
    - 'static': Fixed-duration time-based chunking
    """

    # Process-wide singleton state: the Silero VAD model is loaded exactly once.
    _instance = None
    _instance_lock = threading.Lock()
    # Loaded Silero VAD model, or None if loading failed (VAD mode then
    # falls back to static chunking).
    vad_model: Optional[Any]

    def __new__(cls):
        """Return the process-wide singleton, loading the VAD model on first creation."""
        # Double-checked locking: cheap unlocked test first, re-test under the lock.
        if cls._instance is None:
            with cls._instance_lock:
                # Check again after acquiring lock as the value could have been set
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    # Only load VAD model here since this only runs once
                    cls._instance.vad_model = cls.load_vad_model()
        return cls._instance

    @staticmethod
    def load_vad_model():
        """Load silero VAD model with error handling."""
        try:
            logger.info("Loading Silero VAD model...")
            vad_model = silero_vad.load_silero_vad()
            logger.info("✓ VAD model loaded successfully")
            return vad_model
        except Exception as e:
            # Best-effort: a missing VAD model degrades to static chunking
            # rather than failing transcription outright.
            logger.error(f"Failed to load VAD model: {e}")
            logger.warning("VAD chunking will fall back to time-based chunking")
            return None

    @torch.inference_mode()
    def chunk_audio(self, audio_tensor: torch.Tensor, sample_rate: int = SAMPLE_RATE, mode: str = "vad", chunk_duration: float = 30.0) -> List[Dict]:
        """
        Chunk audio tensor using specified strategy.

        Args:
            audio_tensor: Audio tensor (1D waveform)
            sample_rate: Sample rate of the audio tensor
            mode: Chunking mode - 'none', 'vad', or 'static'
            chunk_duration: Target duration for static chunking (seconds)

        Returns:
            List of chunk info dicts with uniform format:
            - start_time: Start time in seconds
            - end_time: End time in seconds
            - duration: Duration in seconds
            - audio_data: Audio tensor for this chunk
            - sample_rate: Sample rate
            - chunk_index: Index of this chunk
        """
        logger.info(f"Chunking audio tensor: {audio_tensor.shape} at {sample_rate}Hz (mode: {mode})")

        try:
            # Assert tensor is already 1D (should be preprocessed by MediaTranscriptionProcessor)
            assert len(audio_tensor.shape) == 1, f"Expected 1D audio tensor, got shape {audio_tensor.shape}"

            # Assert sample rate is already 16kHz (should be preprocessed by MediaTranscriptionProcessor)
            assert sample_rate == SAMPLE_RATE, f"Expected {SAMPLE_RATE}Hz sample rate, got {sample_rate}Hz"

            # Route to appropriate chunking strategy
            if mode == "none":
                return self._create_single_chunk(audio_tensor, sample_rate)
            elif mode == "vad":
                if self.vad_model is not None:
                    return self._chunk_with_vad(audio_tensor)
                else:
                    logger.warning("VAD model not available, falling back to static chunking")
                    return self._chunk_static(audio_tensor, chunk_duration)
            elif mode == "static":
                return self._chunk_static(audio_tensor, chunk_duration)
            else:
                raise ValueError(f"Unknown chunking mode: {mode}")

        except Exception as e:
            # NOTE(review): this catch also swallows the assertions and the
            # ValueError raised above — the caller always receives chunks.
            logger.error(f"Error chunking audio tensor: {e}")
            # Ultimate fallback to single chunk
            return self._create_single_chunk(audio_tensor, sample_rate)

    def _create_single_chunk(self, waveform: torch.Tensor, sample_rate: int = SAMPLE_RATE) -> List[Dict]:
        """Create a single chunk containing the entire audio."""
        duration = len(waveform) / sample_rate

        return [{
            "start_time": 0.0,
            "end_time": duration,
            "duration": duration,
            "audio_data": waveform,
            "sample_rate": sample_rate,
            "chunk_index": 0,
        }]

    def _chunk_static(self, waveform: torch.Tensor, chunk_duration: float) -> List[Dict]:
        """Create fixed-duration chunks."""
        chunks = []
        total_samples = len(waveform)
        target_samples = int(chunk_duration * SAMPLE_RATE)

        start_sample = 0
        chunk_idx = 0

        while start_sample < total_samples:
            end_sample = min(start_sample + target_samples, total_samples)
            chunk_audio = waveform[start_sample:end_sample]
            duration = len(chunk_audio) / SAMPLE_RATE

            # Only add chunk if it meets minimum duration
            # NOTE(review): a trailing chunk shorter than MIN_CHUNK_DURATION is
            # silently dropped, so static mode may not cover the full audio.
            if duration >= MIN_CHUNK_DURATION:
                chunks.append({
                    "start_time": start_sample / SAMPLE_RATE,
                    "end_time": end_sample / SAMPLE_RATE,
                    "duration": duration,
                    "audio_data": chunk_audio,
                    "sample_rate": SAMPLE_RATE,
                    "chunk_index": chunk_idx,
                })
                chunk_idx += 1

            start_sample = end_sample

        logger.info(f"Created {len(chunks)} static chunks of ~{chunk_duration}s each")
        return chunks

    def _chunk_fallback(self, audio_path: str) -> List[Dict]:
        """Ultimate fallback - create single chunk using librosa (for file-based legacy method)."""
        try:
            logger.warning("Using librosa fallback for chunking")
            data, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
            waveform = torch.from_numpy(data)
            return self._create_single_chunk(waveform, SAMPLE_RATE)
        except Exception as e:
            logger.error(f"All chunking methods failed: {e}")
            return []

    def _chunk_with_vad(self, waveform: torch.Tensor) -> List[Dict]:
        """Chunk audio using VAD for speech detection with uniform return format."""
        try:
            # VAD model expects tensor on CPU
            vad_waveform = waveform.cpu() if waveform.is_cuda else waveform

            # Get speech timestamps using VAD
            speech_timestamps = silero_vad.get_speech_timestamps(
                vad_waveform,
                self.vad_model,
                sampling_rate=SAMPLE_RATE,
                min_speech_duration_ms=500,  # Minimum speech segment
                min_silence_duration_ms=300,  # Minimum silence to split
                window_size_samples=1536,
                speech_pad_ms=100,  # Padding around speech
            )

            logger.info(f"Found {len(speech_timestamps)} speech segments")

            # Create chunks based on speech segments and target duration
            # Pass original waveform (with device preserved) to chunk creation
            chunks = self._create_chunks_from_speech_segments(
                waveform, speech_timestamps
            )

            logger.info(f"Created {len(chunks)} audio chunks using VAD")
            return chunks

        except Exception as e:
            # Any VAD failure degrades to fixed-duration chunking.
            logger.error(f"VAD chunking failed: {e}")
            return self._chunk_static(waveform, TARGET_CHUNK_DURATION)

    def _create_chunks_from_speech_segments(
        self, waveform: torch.Tensor, speech_segments: List[Dict]
    ) -> List[Dict]:
        """Create chunks that respect speech boundaries and target duration with uniform format."""
        if not speech_segments:
            logger.warning(
                "No speech segments found, falling back to static chunking"
            )
            return self._chunk_static(waveform, TARGET_CHUNK_DURATION)

        chunks = []
        current_chunk_start = 0
        target_samples = int(TARGET_CHUNK_DURATION * SAMPLE_RATE)
        total_samples = len(waveform)
        chunk_idx = 0

        while current_chunk_start < total_samples:
            # Calculate target end for this chunk
            target_chunk_end = current_chunk_start + target_samples

            # If this would be the last chunk or close to it, just take the rest
            # (avoids a tiny final chunk below 30% of the target length).
            if target_chunk_end >= total_samples or (
                total_samples - target_chunk_end
            ) < (target_samples * 0.3):
                chunk_end = total_samples
            else:
                # Find the best place to end this chunk using VAD, but ensure continuous coverage
                chunk_end = self._find_best_chunk_end_continuous(
                    speech_segments,
                    current_chunk_start,
                    target_chunk_end,
                    total_samples,
                )

            # Create chunk with uniform format
            chunk_audio = waveform[current_chunk_start:chunk_end]
            duration = len(chunk_audio) / SAMPLE_RATE

            chunks.append({
                "start_time": current_chunk_start / SAMPLE_RATE,
                "end_time": chunk_end / SAMPLE_RATE,
                "duration": duration,
                "audio_data": chunk_audio,
                "sample_rate": SAMPLE_RATE,
                "chunk_index": chunk_idx,
            })

            logger.info(
                f"Created chunk {chunk_idx + 1}: {current_chunk_start/SAMPLE_RATE:.2f}s - {chunk_end/SAMPLE_RATE:.2f}s ({duration:.2f}s)"
            )
            chunk_idx += 1

            # Move to next chunk - IMPORTANT: start exactly where this chunk ended
            current_chunk_start = chunk_end

        # Verify total coverage
        total_audio_duration = len(waveform) / SAMPLE_RATE
        total_chunks_duration = sum(chunk["duration"] for chunk in chunks)
        logger.info(
            f"Audio chunking complete: {len(chunks)} chunks covering {total_chunks_duration:.2f}s of {total_audio_duration:.2f}s total audio"
        )

        if (
            abs(total_chunks_duration - total_audio_duration) > 0.01
        ):  # Allow 10ms tolerance
            logger.error(
                f"Duration mismatch: chunks={total_chunks_duration:.2f}s, original={total_audio_duration:.2f}s"
            )
        else:
            logger.info("✓ Perfect audio coverage achieved")

        return chunks

    def _find_best_chunk_end_continuous(
        self,
        speech_segments: List[Dict],
        chunk_start: int,
        target_end: int,
        total_samples: int,
    ) -> int:
        """Find the best place to end a chunk while ensuring continuous coverage."""

        # Don't go beyond the audio
        target_end = min(target_end, total_samples)

        # Look for a good break point within a reasonable window around target
        search_window = int(SAMPLE_RATE * 3)  # 3 second window
        search_start = max(chunk_start, target_end - search_window)
        search_end = min(total_samples, target_end + search_window)

        best_end = target_end
        best_score = 0

        # Look for speech segment boundaries within the search window
        for segment in speech_segments:
            segment_start = segment["start"]
            segment_end = segment["end"]

            # Check if segment end is in our search window
            if search_start <= segment_end <= search_end:
                # Score based on how close to target and if it's a good break point
                distance_score = 1.0 - abs(segment_end - target_end) / search_window

                # Prefer segment ends (natural pauses)
                boundary_score = 1.0

                total_score = distance_score * boundary_score

                if total_score > best_score:
                    best_score = total_score
                    best_end = segment_end

        # Ensure we don't go beyond audio bounds
        best_end = min(int(best_end), total_samples)

        # Ensure we make progress (don't end before we started)
        if best_end <= chunk_start:
            best_end = min(target_end, total_samples)

        return best_end

    def _find_best_chunk_end(
        self,
        speech_segments: List[Dict],
        start_idx: int,
        chunk_start: int,
        target_end: int,
    ) -> int:
        """Find the best place to end a chunk (at silence, near target duration)."""
        # NOTE(review): no caller is visible in this file; appears superseded by
        # _find_best_chunk_end_continuous — confirm before removing.

        best_end = target_end

        # Look for speech segments that could provide good break points
        for i in range(start_idx, len(speech_segments)):
            segment = speech_segments[i]
            segment_start = segment["start"]
            segment_end = segment["end"]

            # If segment starts after our target end, use the gap before it
            if segment_start > target_end:
                best_end = min(target_end, segment_start)
                break

            # If segment ends near our target, use the end of the segment
            if abs(segment_end - target_end) < SAMPLE_RATE * 5:  # Within 5 seconds
                best_end = segment_end
                break

            # If segment extends way past target, look for a good break point
            if segment_end > target_end + SAMPLE_RATE * 10:  # 10+ seconds past
                # Try to find a silence gap within the segment or use target
                best_end = target_end
                break

        return int(best_end)

    def save_chunk_to_file(self, chunk: Dict, output_path: str) -> str:
        """Save a chunk to a temporary audio file."""
        try:
            # Convert tensor to numpy if needed
            audio_data = chunk["audio_data"]
            if isinstance(audio_data, torch.Tensor):
                # Move to CPU first if on GPU, then convert to numpy
                audio_data = audio_data.cpu().numpy()

            # Save to file
            sf.write(output_path, audio_data, chunk["sample_rate"])
            return output_path

        except Exception as e:
            logger.error(f"Failed to save chunk to file: {e}")
            raise
|
server/inference/audio_reading_tools.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import soundfile as sf
|
| 5 |
+
import torch
|
| 6 |
+
from numpy.typing import NDArray
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# def wav_to_bytes(
|
| 10 |
+
# wav: torch.Tensor | NDArray, sample_rate: int = 16_000, format: str = "wav"
|
| 11 |
+
# ) -> NDArray[np.int8]:
|
| 12 |
+
# """Convert audio tensor to bytes using soundfile directly."""
|
| 13 |
+
# # Convert to numpy if torch tensor
|
| 14 |
+
# if isinstance(wav, torch.Tensor):
|
| 15 |
+
# if wav.is_cuda:
|
| 16 |
+
# wav = wav.cpu()
|
| 17 |
+
# # Convert to float32 first (numpy doesn't support bfloat16)
|
| 18 |
+
# if wav.dtype != torch.float32:
|
| 19 |
+
# wav = wav.float()
|
| 20 |
+
# wav = wav.numpy()
|
| 21 |
+
|
| 22 |
+
# # Ensure float32 dtype for numpy arrays
|
| 23 |
+
# if wav.dtype != np.float32:
|
| 24 |
+
# wav = wav.astype(np.float32)
|
| 25 |
+
|
| 26 |
+
# # Handle shape: soundfile expects (samples,) for mono or (samples, channels) for multi-channel
|
| 27 |
+
# if wav.ndim == 1:
|
| 28 |
+
# # Already correct shape for mono
|
| 29 |
+
# pass
|
| 30 |
+
# elif wav.ndim == 2:
|
| 31 |
+
# # If shape is (channels, samples), transpose to (samples, channels)
|
| 32 |
+
# if wav.shape[0] < wav.shape[1]:
|
| 33 |
+
# wav = wav.T
|
| 34 |
+
|
| 35 |
+
# # Create buffer and write using soundfile directly
|
| 36 |
+
# buffer = io.BytesIO()
|
| 37 |
+
|
| 38 |
+
# # Map format string to soundfile format
|
| 39 |
+
# sf_format = format.upper() if format.lower() in ['wav', 'flac', 'ogg'] else 'WAV'
|
| 40 |
+
# subtype = 'PCM_16' if sf_format == 'WAV' else None
|
| 41 |
+
|
| 42 |
+
# # Write to buffer
|
| 43 |
+
# sf.write(buffer, wav, sample_rate, format=sf_format, subtype=subtype)
|
| 44 |
+
|
| 45 |
+
# buffer.seek(0)
|
| 46 |
+
# return np.frombuffer(buffer.getvalue(), dtype=np.int8)
|
| 47 |
+
# # return buffer.read()
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def wav_to_bytes(wav: torch.Tensor | np.ndarray, sample_rate: int = 16000, format: str = "wav"):
    """Encode an audio signal as a 16-bit PCM WAV file and return its bytes.

    Args:
        wav: Audio samples as a torch tensor or numpy array. 1-D (mono) or
            2-D; singleton dimensions such as ``(1, N)`` are squeezed away,
            and a genuine channels-first ``(channels, samples)`` layout is
            transposed to the ``(samples, channels)`` layout soundfile needs.
        sample_rate: Sample rate in Hz written into the WAV header.
        format: Kept for interface compatibility; the output is always a WAV
            container with PCM_16 samples regardless of this value.

    Returns:
        NDArray[np.int8] view over the encoded file's raw bytes.

    Raises:
        ValueError: If the signal contains no samples.
    """
    # Convert torch tensors to float32 numpy. bfloat16 has no numpy
    # counterpart, so it must be promoted before .numpy().
    if isinstance(wav, torch.Tensor):
        wav = wav.detach().cpu()
        if wav.dtype == torch.bfloat16:
            wav = wav.to(torch.float32)
        elif wav.dtype != torch.float32:
            wav = wav.float()
        wav = wav.numpy()

    # Drop singleton dims, e.g. (1, N) -> (N,).
    if wav.ndim > 1:
        wav = wav.squeeze()
    # If the audio is truly multi-channel, squeeze() leaves it 2-D in
    # channels-first layout; soundfile expects (samples, channels), so
    # transpose before writing (matches the earlier implementation).
    if wav.ndim == 2 and wav.shape[0] < wav.shape[1]:
        wav = wav.T
    if wav.size == 0:
        raise ValueError("Empty audio segment passed to wav_to_bytes")

    # Clamp to the valid [-1, 1] range and scrub NaN/Inf before PCM_16
    # quantization, which would otherwise wrap or produce garbage samples.
    wav = wav.astype(np.float32)
    wav = np.nan_to_num(np.clip(wav, -1.0, 1.0))

    buffer = io.BytesIO()
    try:
        sf.write(buffer, wav, sample_rate, format="WAV", subtype="PCM_16")
    except Exception as e:
        print(f"[ERROR] soundfile write failed: {e}")
        raise

    buffer.seek(0)
    return np.frombuffer(buffer.getvalue(), dtype=np.int8)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
|
server/inference/audio_sentence_alignment.py
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
import gc
|
| 9 |
+
import io
|
| 10 |
+
import logging
|
| 11 |
+
import threading
|
| 12 |
+
from dataclasses import dataclass
|
| 13 |
+
from typing import Dict, List
|
| 14 |
+
|
| 15 |
+
import torch
|
| 16 |
+
import torchaudio
|
| 17 |
+
import torchaudio.functional as audio_F
|
| 18 |
+
|
| 19 |
+
from .align_utils import get_spans, load_model_dict, merge_repeats, time_to_frame
|
| 20 |
+
from .audio_reading_tools import wav_to_bytes
|
| 21 |
+
|
| 22 |
+
# Global logger for this module
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass(kw_only=True)
class AudioAlignmentConfig:
    """Configuration for the forced-alignment model (all fields keyword-only)."""

    # Optional override for the alignment model path; presumably unused today
    # (load_model_dict() is called without it — see FIXME in AudioAlignment).
    model_path_name: str = ""
    # Window length in seconds for each emission pass over the audio.
    emission_interval: int = 30
    # Format label passed along when converting aligned segments to bytes.
    audio_format: str = "flac"
    # When True, a <star> wildcard token is prepended for partial transcripts.
    use_star: bool = False
    # torch device string the alignment model runs on.
    device: str = "cuda"
| 34 |
+
|
| 35 |
+
class AudioAlignment:
    """Thread-safe singleton for audio-text alignment.

    Loads the forced-alignment acoustic model once (double-checked locking)
    and exposes :meth:`get_one_row_alignments`, which aligns a token sequence
    against an audio buffer and returns one audio segment per token.
    """

    _instance = None
    _lock = threading.Lock()

    # Stride values below are expressed in milliseconds (seconds * scale).
    scale: int = 1000

    def __new__(cls):
        if cls._instance is None:
            with cls._lock:
                # Double-check locking pattern: re-test under the lock so
                # only one thread ever constructs and initializes the model.
                if cls._instance is None:
                    cls._instance = super(AudioAlignment, cls).__new__(cls)
                    cls._instance._initialize()
        return cls._instance

    def _initialize(self):
        """Initialize the singleton instance (called only once)."""
        logger.info("Initializing AudioAlignment model...")

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        config = AudioAlignmentConfig(
            device=str(device),
            use_star=False,  # Set to False for standard alignment
        )

        self.config = config

        # FIXME: pass model name correctly
        logger.info("Loading forced alignment model and dictionary...")
        self.model, self.dictionary = load_model_dict()
        self.device = torch.device(config.device)
        self.model.to(self.device)

        if self.config.use_star:
            # Reserve an extra vocabulary slot for the <star> wildcard token.
            self.dictionary["<star>"] = len(self.dictionary)

        self.blank = self.dictionary["<blank>"]
        # id -> token lookup used when merging repeated alignment frames.
        self.inverse_dictionary = {v: k for k, v in self.dictionary.items()}

        logger.info(
            f"AudioAlignment model loaded successfully on device: {self.device}"
        )

    @torch.inference_mode()
    def generate_emissions(self, waveform: torch.Tensor, reading_sr):
        """Run the acoustic model over the waveform in fixed windows.

        The audio is processed in ``emission_interval``-second windows with
        10% context on each side; context frames are trimmed off afterwards
        so the concatenated emissions cover the waveform exactly once.

        Returns:
            Tuple of (log-softmax emissions tensor of shape [frames, vocab],
            stride in milliseconds of audio per emission frame).
        """
        emission_interval = self.config.emission_interval
        total_duration = waveform.size(1) / reading_sr

        emissions_arr = []

        i = 0
        while i < total_duration:
            segment_start_time, segment_end_time = (i, i + emission_interval)

            # 10% context on each side reduces boundary artifacts.
            context = emission_interval * 0.1
            input_start_time = max(segment_start_time - context, 0)
            input_end_time = min(segment_end_time + context, total_duration)
            waveform_split = waveform[
                :,
                int(reading_sr * input_start_time) : int(reading_sr * (input_end_time)),
            ]

            model_outs, _ = self.model(waveform_split)
            emissions_ = model_outs[0]
            emission_start_frame = time_to_frame(segment_start_time)
            emission_end_frame = time_to_frame(segment_end_time)
            offset = time_to_frame(input_start_time)

            # Trim the context frames so only this window's own frames remain.
            emissions_ = emissions_[
                emission_start_frame - offset : emission_end_frame - offset, :
            ]
            emissions_arr.append(emissions_)
            i += emission_interval

        emissions = torch.cat(emissions_arr, dim=0).squeeze()
        emissions = torch.log_softmax(emissions, dim=-1)

        # Milliseconds of audio represented by one emission frame.
        stride = float(waveform.size(1) * self.scale / emissions.size(0) / reading_sr)

        return emissions, stride

    @torch.inference_mode()
    def get_one_row_alignments(
        self, audio_arr, reading_sr, tokens: List[str]
    ) -> List[Dict]:
        """Internal method to perform forced alignment.

        Decodes ``audio_arr`` (raw file bytes or a numpy array of them),
        force-aligns ``tokens`` against the audio, and returns one dict per
        token span with start/end seconds, duration, and the trimmed
        segment's encoded audio bytes. Empty or unconvertible segments are
        skipped rather than raised.
        """
        # Accept either raw bytes or a numpy byte array from callers.
        buffer = audio_arr if isinstance(audio_arr, (bytes, bytearray)) else audio_arr.tobytes()
        waveform, audio_sf = torchaudio.load(io.BytesIO(buffer))
        waveform = waveform.to(self.device)
        assert audio_sf == reading_sr

        emissions, stride = self.generate_emissions(waveform, reading_sr)
        # Keep the (possibly large) waveform on CPU for slicing below.
        waveform = waveform.cpu()

        if self.config.use_star:
            # Append a zero-logit column for the <star> wildcard class.
            T, _ = emissions.size()
            emissions = torch.cat(
                [emissions, torch.zeros(T, 1, device=self.device)], dim=1
            )

        if self.config.use_star:
            tokens = ["<star>"] + tokens

        # Map characters to vocabulary ids, silently dropping unknown chars.
        token_indices = [
            self.dictionary[c]
            for c in " ".join(tokens).split(" ")
            if c in self.dictionary
        ]

        targets = torch.tensor(token_indices, dtype=torch.int32, device=self.device)

        input_lengths = torch.tensor(emissions.shape[0]).unsqueeze(-1)
        target_lengths = torch.tensor(targets.shape[0]).unsqueeze(-1)

        path, _ = audio_F.forced_align(
            emissions.unsqueeze(0),
            targets.unsqueeze(0),
            input_lengths,
            target_lengths,
            blank=self.blank,
        )
        path = path.squeeze().to("cpu").tolist()

        # Collapse consecutive repeated frame labels into token segments,
        # then group segments back into one span per input token.
        segments = merge_repeats(path, self.inverse_dictionary)

        spans = get_spans(tokens, segments)

        audio_segments = []
        for i, span in enumerate(spans):
            seg_start_idx, seg_end_idx = span[0].start, span[-1].end
            # stride is in ms per frame; divide by scale to get seconds.
            segment_start_sec = seg_start_idx * stride / self.scale
            segment_end_sec = seg_end_idx * stride / self.scale
            start_frame = int(segment_start_sec * reading_sr)
            end_frame = int(segment_end_sec * reading_sr)
            trimmed_waveform = waveform[:, start_frame:end_frame]

            # Skip empty or invalid audio segments instead of failing the row.
            if trimmed_waveform is None or trimmed_waveform.numel() == 0:
                continue

            try:
                audio_bytes = wav_to_bytes(trimmed_waveform, reading_sr, self.config.audio_format)
            except Exception as e:
                # Best-effort: a single bad segment should not abort the rest.
                continue

            audio_segments.append(
                {
                    "segment_start_sec": segment_start_sec,
                    "segment_end_sec": segment_end_sec,
                    "segment_duration": segment_end_sec - segment_start_sec,
                    "segment_audio_bytes": audio_bytes,
                }
            )

        return audio_segments
|
| 218 |
+
|
| 219 |
+
|
server/inference/mms_model_pipeline.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#@title fix import and path /content/omniasr-transcriptions/server/inference/mms_model_pipeline.py
|
| 2 |
+
# %%writefile /content/omniasr-transcriptions/server/inference/mms_model_pipeline.py
|
| 3 |
+
"""
|
| 4 |
+
Pipeline-based MMS Model using the official MMS library.
|
| 5 |
+
This implementation uses Wav2Vec2LlamaInferencePipeline to avoid Seq2SeqBatch complexity.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
import os
|
| 10 |
+
import torch
|
| 11 |
+
from typing import List, Dict, Any, Optional
|
| 12 |
+
# from omnilingual_asr.models.inference.pipeline import Wav2Vec2InferencePipeline
|
| 13 |
+
from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline
|
| 14 |
+
|
| 15 |
+
from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs
|
| 16 |
+
|
| 17 |
+
from inference.audio_reading_tools import wav_to_bytes
|
| 18 |
+
from env_vars import MODEL_NAME
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class MMSModel:
    """Pipeline-based MMS model wrapper using the official inference pipeline.

    Implemented as a process-wide singleton: the underlying ASR pipeline is
    loaded exactly once (guarded by ``_initialized``) and reused by callers.
    """

    _instance = None
    _initialized = False

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            logger.info("Creating new MMSModel singleton instance")
            cls._instance = super().__new__(cls)
        else:
            logger.info("Using existing MMSModel singleton instance")
        return cls._instance

    def __init__(self, model_card: Optional[str] = None, device=None):
        """
        Initialize the MMS model with the official pipeline.

        Args:
            model_card: Model card to use (omniASR_LLM_1B, omniASR_LLM_300M, etc.)
                        If None, uses MODEL_NAME from environment variables
            device: Device to use (torch.device object, "cuda", "cpu", etc.)
        """
        # __init__ runs on every MMSModel() call even for the cached
        # instance, so guard against re-loading the pipeline.
        if self._initialized:
            return

        # Fall back to the environment-configured model name.
        self.model_card = model_card or MODEL_NAME
        self.device = device

        # Load the pipeline eagerly so failures surface at construction time.
        self._load_pipeline()

        self._initialized = True

    def _load_pipeline(self):
        """Load the MMS pipeline during initialization.

        Raises:
            Exception: Re-raises any pipeline construction failure after
                logging it.
        """
        logger.info(f"Loading MMS pipeline: {self.model_card}")
        logger.info(f"Target device: {self.device}")

        # Debug FAIRSEQ2_CACHE_DIR environment variable (logged only; the
        # fairseq2 library reads it itself).
        fairseq2_cache_dir = os.environ.get('FAIRSEQ2_CACHE_DIR', "./models")
        logger.info(f"DEBUG: FAIRSEQ2_CACHE_DIR = {fairseq2_cache_dir}")

        try:
            # str() handles both torch.device objects and plain strings, so
            # the previous hasattr(self.device, 'type') ternary — whose two
            # branches were identical — is unnecessary.
            device_str = str(self.device)
            self.pipeline = ASRInferencePipeline(
                model_card=self.model_card,
                device=device_str
            )
            logger.info("✓ MMS pipeline loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load MMS pipeline: {e}")
            raise

    def transcribe_audio(self, audio_tensor: torch.Tensor, batch_size: int = 1, language_with_scripts: List[str] = None) -> List[Dict[str, Any]]:
        """
        Transcribe audio tensor using the MMS pipeline.

        Args:
            audio_tensor: Audio tensor (1D waveform) to transcribe
            batch_size: Batch size for processing
            language_with_scripts: List of language_with_scripts codes for transcription (3-letter ISO codes with script)
                                   If None, uses auto-detection

        Returns:
            List of transcription results

        Raises:
            Exception: Re-raises any transcription failure after logging it.
        """
        # Convert tensor to bytes for the pipeline.
        logger.info(f"Converting tensor (shape: {audio_tensor.shape}) to bytes")
        # Move to CPU first if on GPU.
        tensor_cpu = audio_tensor.cpu() if audio_tensor.is_cuda else audio_tensor
        # The pipeline consumes encoded audio bytes at 16 kHz.
        audio_bytes = wav_to_bytes(tensor_cpu, sample_rate=16000, format="wav")

        logger.info(f"Transcribing audio tensor with batch_size={batch_size}, language_with_scripts={language_with_scripts}")

        try:
            # The pipeline expects a list of audio-bytes items; lang is only
            # passed when the caller supplies it (otherwise auto-detection).
            if language_with_scripts is not None:
                transcriptions = self.pipeline.transcribe([audio_bytes], batch_size=batch_size, lang=language_with_scripts)
            else:
                transcriptions = self.pipeline.transcribe([audio_bytes], batch_size=batch_size)

            logger.info(f"✓ Successfully transcribed audio tensor")
            return transcriptions

        except Exception as e:
            logger.error(f"Transcription failed: {e}")
            raise

    @classmethod
    def get_instance(cls, model_card: Optional[str] = None, device=None):
        """
        Get the singleton instance of MMSModel.

        Args:
            model_card: Model card to use (omniASR_LLM_1B, omniASR_LLM_300M, etc.)
                        If None, uses MODEL_NAME from environment variables
            device: Device to use (torch.device object, "cuda", "cpu", etc.)

        Returns:
            MMSModel: The singleton instance
        """
        if cls._instance is None:
            cls._instance = cls(model_card=model_card, device=device)
        return cls._instance
|
server/inference/norm_config_module.py
ADDED
|
@@ -0,0 +1,276 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# type: ignore
"""Character-class building blocks for text normalization.

Each constant below is a fragment destined for a regex character class.
Raw strings such as r"\u200B" stay literal here; the ``re`` module
interprets the ``\\uXXXX`` escape when the assembled pattern is compiled.
"""
import os
import re

colon = ":"
comma = ","
exclamation_mark = "!"
period = re.escape(".")
question_mark = re.escape("?")
semicolon = ";"

left_curly_bracket = "{"
right_curly_bracket = "}"
quotation_mark = '"'

basic_punc = (
    period
    + question_mark
    + comma
    + colon
    + exclamation_mark
    + left_curly_bracket
    + right_curly_bracket
)

# General punc unicode block (0x2000-0x206F)
zero_width_space = r"\u200B"
zero_width_nonjoiner = r"\u200C"
left_to_right_mark = r"\u200E"
right_to_left_mark = r"\u200F"
left_to_right_embedding = r"\u202A"
pop_directional_formatting = r"\u202C"

# Here are some commonly ill-typed versions of apostrophe
right_single_quotation_mark = r"\u2019"
left_single_quotation_mark = r"\u2018"

# Language specific definitions
# Spanish
inverted_exclamation_mark = r"\u00A1"
inverted_question_mark = r"\u00BF"


# Hindi
# NOTE: non-raw string — this is the danda character itself, not an escape.
hindi_danda = "\u0964"

# Egyptian Arabic
# arabic_percent = r"\u066A"
arabic_comma = r"\u060C"
arabic_question_mark = r"\u061F"
arabic_semicolon = r"\u061B"
arabic_diacritics = r"\u064B-\u0652"


arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"


# Chinese
full_stop = r"\u3002"
full_comma = r"\uFF0C"
full_exclamation_mark = r"\uFF01"
full_question_mark = r"\uFF1F"
full_semicolon = r"\uFF1B"
full_colon = r"\uFF1A"
full_parentheses = r"\uFF08\uFF09"
quotation_mark_horizontal = r"\u300C-\u300F"
quotation_mark_vertical = r"\uFF41-\uFF44"
title_marks = r"\u3008-\u300B"
wavy_low_line = r"\uFE4F"
ellipsis = r"\u22EF"
enumeration_comma = r"\u3001"
hyphenation_point = r"\u2027"
forward_slash = r"\uFF0F"
wavy_dash = r"\uFF5E"
box_drawings_light_horizontal = r"\u2500"
fullwidth_low_line = r"\uFF3F"
chinese_punc = (
    full_stop
    + full_comma
    + full_exclamation_mark
    + full_question_mark
    + full_semicolon
    + full_colon
    + full_parentheses
    + quotation_mark_horizontal
    + quotation_mark_vertical
    + title_marks
    + wavy_low_line
    + ellipsis
    + enumeration_comma
    + hyphenation_point
    + forward_slash
    + wavy_dash
    + box_drawings_light_horizontal
    + fullwidth_low_line
)

# Armenian
armenian_apostrophe = r"\u055A"
emphasis_mark = r"\u055B"
# NOTE(review): this rebinding shadows the ASCII "!" defined above.
# basic_punc was built before this line, so it still contains "!", but any
# later use of `exclamation_mark` gets the Armenian character instead.
exclamation_mark = r"\u055C"
armenian_comma = r"\u055D"
armenian_question_mark = r"\u055E"
abbreviation_mark = r"\u055F"
armenian_full_stop = r"\u0589"
armenian_punc = (
    armenian_apostrophe
    + emphasis_mark
    + exclamation_mark
    + armenian_comma
    + armenian_question_mark
    + abbreviation_mark
    + armenian_full_stop
)

lesser_than_symbol = r"<"
greater_than_symbol = r">"

lesser_than_sign = r"\u003c"
greater_than_sign = r"\u003e"

# Presumably a literal non-breaking-space character; confirm encoding.
nbsp_written_form = r" "

# Quotation marks
left_double_quotes = r"\u201c"
right_double_quotes = r"\u201d"
left_double_angle = r"\u00ab"
right_double_angle = r"\u00bb"
left_single_angle = r"\u2039"
right_single_angle = r"\u203a"
low_double_quotes = r"\u201e"
low_single_quotes = r"\u201a"
high_double_quotes = r"\u201f"
high_single_quotes = r"\u201b"

all_punct_quotes = (
    left_double_quotes
    + right_double_quotes
    + left_double_angle
    + right_double_angle
    + left_single_angle
    + right_single_angle
    + low_double_quotes
    + low_single_quotes
    + high_double_quotes
    + high_single_quotes
    + right_single_quotation_mark
    + left_single_quotation_mark
)
# Character class matching apostrophe-like quotes, used by the mapping rule
# that normalizes word-internal quotes to a plain apostrophe.
mapping_quotes = (
    "["
    + high_single_quotes
    + right_single_quotation_mark
    + left_single_quotation_mark
    + "]"
)


# Digits

english_digits = r"\u0030-\u0039"
bengali_digits = r"\u09e6-\u09ef"
khmer_digits = r"\u17e0-\u17e9"
devanagari_digits = r"\u0966-\u096f"
oriya_digits = r"\u0b66-\u0b6f"
extended_arabic_indic_digits = r"\u06f0-\u06f9"
kayah_li_digits = r"\ua900-\ua909"
fullwidth_digits = r"\uff10-\uff19"
malayam_digits = r"\u0d66-\u0d6f"
myanmar_digits = r"\u1040-\u1049"
roman_numeral = r"\u2170-\u2179"
nominal_digit_shapes = r"\u206f"
|
| 173 |
+
|
| 174 |
+
# Load punctuations from MMS-lab data from the current directory.
# Each line of punctuations.lst is "<count>\t...<char>..." style data where the
# first tab-separated field is the punctuation character to strip.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Explicit encoding: the list contains non-ASCII punctuation, and the default
# locale encoding is platform-dependent (would break on e.g. Windows cp1252).
with open(os.path.join(current_dir, "punctuations.lst"), "r", encoding="utf-8") as punc_f:
    punc_list = punc_f.readlines()

# Assemble one regex-escaped run of every listed character; str.join avoids
# the quadratic string += loop.
punct_pattern = "".join(re.escape(punc.split("\t")[0]) for punc in punc_list)

# All digit ranges that should be treated uniformly during normalization.
shared_digits = (
    english_digits
    + bengali_digits
    + khmer_digits
    + devanagari_digits
    + oriya_digits
    + extended_arabic_indic_digits
    + kayah_li_digits
    + fullwidth_digits
    + malayam_digits
    + myanmar_digits
    + roman_numeral
    + nominal_digit_shapes
)

# Union of every punctuation class, plus the characters from punctuations.lst.
shared_punc_list = (
    basic_punc
    + all_punct_quotes
    + greater_than_sign
    + lesser_than_sign
    + inverted_question_mark
    + full_stop
    + semicolon
    + armenian_punc
    + inverted_exclamation_mark
    + arabic_comma
    + enumeration_comma
    + hindi_danda
    + quotation_mark
    + arabic_semicolon
    + arabic_question_mark
    + chinese_punc
    + punct_pattern
)

# Regex -> replacement rules applied before punctuation removal. The last
# rule rewrites a word-internal fancy quote to a plain apostrophe.
shared_mappping = {
    lesser_than_symbol: "",
    greater_than_symbol: "",
    nbsp_written_form: "",
    r"(\S+)" + mapping_quotes + r"(\S+)": r"\1'\2",
}

# Invisible/diacritic characters deleted outright.
shared_deletion_list = (
    left_to_right_mark
    + zero_width_nonjoiner
    + arabic_subscript_alef_and_inverted_damma
    + zero_width_space
    + arabic_diacritics
    + pop_directional_formatting
    + right_to_left_mark
    + left_to_right_embedding
)
|
| 236 |
+
|
| 237 |
+
# Per-language normalization configs. "*" is the default; language entries
# start from a shallow copy of it and override individual keys.
#
# IMPORTANT: dict.copy() is shallow, so nested values like "mapping" are
# SHARED between entries. Overriding a nested value must therefore rebuild
# it (see the Arabic entry) instead of mutating it in place — in-place
# mutation would leak the override into "*" and every other language.
# String-valued keys ("del_set", "punc_set") are safe to += because strings
# are immutable and rebinding only touches the copied dict.
norm_config = {
    "*": {
        "lower_case": True,
        "punc_set": shared_punc_list,
        "del_set": shared_deletion_list,
        "mapping": shared_mappping,
        "digit_set": shared_digits,
        "unicode_norm": "NFKC",
        "rm_diacritics": False,
    }
}

# =============== Mongolian ===============#

norm_config["mon"] = norm_config["*"].copy()
# add soft hyphen to punc list to match with fleurs
norm_config["mon"]["del_set"] += r"\u00AD"

norm_config["khk"] = norm_config["mon"].copy()

# =============== Hebrew ===============#

norm_config["heb"] = norm_config["*"].copy()
# add "HEBREW POINT" symbols to match with fleurs
norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"

# =============== Thai ===============#

norm_config["tha"] = norm_config["*"].copy()
# add "Zero width joiner" symbols to match with fleurs
norm_config["tha"]["punc_set"] += r"\u200D"

# =============== Arabic ===============#
norm_config["ara"] = norm_config["*"].copy()
# Fix: build a fresh mapping dict instead of mutating the shared one.
# Previously `norm_config["ara"]["mapping"]["ٱ"] = "ا"` wrote into
# shared_mappping, applying the alef-wasla rewrite to EVERY language.
norm_config["ara"]["mapping"] = {**shared_mappping, "ٱ": "ا"}
norm_config["arb"] = norm_config["ara"].copy()

# =============== Javanese ===============#
norm_config["jav"] = norm_config["*"].copy()
norm_config["jav"]["rm_diacritics"] = True
|
server/inference/punctuations.lst
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
7355 INVALID UNICODE 0x81
|
| 2 |
+
5265 INVALID UNICODE 0x90
|
| 3 |
+
75 INVALID UNICODE 0x8
|
| 4 |
+
31 INVALID UNICODE 0x8d
|
| 5 |
+
3 INVALID UNICODE 0x94
|
| 6 |
+
2 INVALID UNICODE 0x8f
|
| 7 |
+
2 INVALID UNICODE 0x1a
|
| 8 |
+
1 INVALID UNICODE 0x9d
|
| 9 |
+
1 INVALID UNICODE 0x93
|
| 10 |
+
1 INVALID UNICODE 0x92
|
| 11 |
+
8647 INVALID UNICODE 0xe295
|
| 12 |
+
6650 INVALID UNICODE 0xf21d
|
| 13 |
+
6234 INVALID UNICODE 0xf62d
|
| 14 |
+
4815 INVALID UNICODE 0xf173
|
| 15 |
+
4789 INVALID UNICODE 0xe514
|
| 16 |
+
4409 INVALID UNICODE 0xe293
|
| 17 |
+
3881 INVALID UNICODE 0xf523
|
| 18 |
+
3788 INVALID UNICODE 0xe233
|
| 19 |
+
2448 INVALID UNICODE 0xf50f
|
| 20 |
+
2177 INVALID UNICODE 0xe232
|
| 21 |
+
1955 INVALID UNICODE 0xea7b
|
| 22 |
+
1926 INVALID UNICODE 0xf172
|
| 23 |
+
973 INVALID UNICODE 0xe290
|
| 24 |
+
972 INVALID UNICODE 0xf519
|
| 25 |
+
661 INVALID UNICODE 0xe292
|
| 26 |
+
591 INVALID UNICODE 0xe328
|
| 27 |
+
509 INVALID UNICODE 0xe2fa
|
| 28 |
+
458 INVALID UNICODE 0xe234
|
| 29 |
+
446 INVALID UNICODE 0xe043
|
| 30 |
+
419 INVALID UNICODE 0xe040
|
| 31 |
+
399 INVALID UNICODE 0xe2fb
|
| 32 |
+
387 INVALID UNICODE 0xe32b
|
| 33 |
+
381 INVALID UNICODE 0xe236
|
| 34 |
+
374 INVALID UNICODE 0xf511
|
| 35 |
+
314 INVALID UNICODE 0xe517
|
| 36 |
+
296 INVALID UNICODE 0xe2fe
|
| 37 |
+
293 INVALID UNICODE 0xe492
|
| 38 |
+
291 INVALID UNICODE 0xf52d
|
| 39 |
+
289 INVALID UNICODE 0xe2fc
|
| 40 |
+
195 INVALID UNICODE 0xf521
|
| 41 |
+
190 INVALID UNICODE 0xe516
|
| 42 |
+
182 INVALID UNICODE 0xe041
|
| 43 |
+
178 INVALID UNICODE 0xf529
|
| 44 |
+
113 INVALID UNICODE 0xe2f9
|
| 45 |
+
87 INVALID UNICODE 0xe2d9
|
| 46 |
+
78 INVALID UNICODE 0xe32a
|
| 47 |
+
76 INVALID UNICODE 0xe291
|
| 48 |
+
74 INVALID UNICODE 0xe296
|
| 49 |
+
66 INVALID UNICODE 0xe518
|
| 50 |
+
52 INVALID UNICODE 0xe32c
|
| 51 |
+
46 INVALID UNICODE 0xe2db
|
| 52 |
+
41 INVALID UNICODE 0xe231
|
| 53 |
+
34 INVALID UNICODE 0xf522
|
| 54 |
+
33 INVALID UNICODE 0xf518
|
| 55 |
+
32 INVALID UNICODE 0xf513
|
| 56 |
+
27 INVALID UNICODE 0xe32d
|
| 57 |
+
25 INVALID UNICODE 0xe32e
|
| 58 |
+
23 INVALID UNICODE 0xe06b
|
| 59 |
+
15 INVALID UNICODE 0xea01
|
| 60 |
+
12 INVALID UNICODE 0xe294
|
| 61 |
+
11 INVALID UNICODE 0xe203
|
| 62 |
+
8 INVALID UNICODE 0xf218
|
| 63 |
+
7 INVALID UNICODE 0xe070
|
| 64 |
+
7 INVALID UNICODE 0xe013
|
| 65 |
+
5 INVALID UNICODE 0xe2de
|
| 66 |
+
4 INVALID UNICODE 0xe493
|
| 67 |
+
3 INVALID UNICODE 0xf7e8
|
| 68 |
+
3 INVALID UNICODE 0xf7d0
|
| 69 |
+
3 INVALID UNICODE 0xe313
|
| 70 |
+
2 INVALID UNICODE 0xe329
|
| 71 |
+
2 INVALID UNICODE 0xe06d
|
| 72 |
+
2 INVALID UNICODE 0xe003
|
| 73 |
+
1 INVALID UNICODE 0xf50e
|
| 74 |
+
1 INVALID UNICODE 0xf171
|
| 75 |
+
1 INVALID UNICODE 0xe01d
|
| 76 |
+
71 NOMINAL DIGIT SHAPES 0x206f
|
| 77 |
+
3 WORD JOINER 0x2060
|
| 78 |
+
― 126545 HORIZONTAL BAR 0x2015
|
| 79 |
+
־ 1028 HEBREW PUNCTUATION MAQAF 0x5be
|
| 80 |
+
) 98429 RIGHT PARENTHESIS 0x29
|
| 81 |
+
] 27108 RIGHT SQUARE BRACKET 0x5d
|
| 82 |
+
⌋ 1567 RIGHT FLOOR 0x230b
|
| 83 |
+
〕 97 RIGHT TORTOISE SHELL BRACKET 0x3015
|
| 84 |
+
】 36 RIGHT BLACK LENTICULAR BRACKET 0x3011
|
| 85 |
+
﴾ 14 ORNATE LEFT PARENTHESIS 0xfd3e
|
| 86 |
+
& 170517 AMPERSAND 0x26
|
| 87 |
+
། 106330 TIBETAN MARK SHAD 0xf0d
|
| 88 |
+
። 90203 ETHIOPIC FULL STOP 0x1362
|
| 89 |
+
፥ 60484 ETHIOPIC COLON 0x1365
|
| 90 |
+
༌ 60464 TIBETAN MARK DELIMITER TSHEG BSTAR 0xf0c
|
| 91 |
+
။ 51567 MYANMAR SIGN SECTION 0x104b
|
| 92 |
+
/ 46929 SOLIDUS 0x2f
|
| 93 |
+
၊ 38042 MYANMAR SIGN LITTLE SECTION 0x104a
|
| 94 |
+
· 37985 MIDDLE DOT 0xb7
|
| 95 |
+
‸ 36310 CARET 0x2038
|
| 96 |
+
* 34793 ASTERISK 0x2a
|
| 97 |
+
۔ 32432 ARABIC FULL STOP 0x6d4
|
| 98 |
+
፤ 31906 ETHIOPIC SEMICOLON 0x1364
|
| 99 |
+
၏ 21519 MYANMAR SYMBOL GENITIVE 0x104f
|
| 100 |
+
។ 20834 KHMER SIGN KHAN 0x17d4
|
| 101 |
+
꓾ 15773 LISU PUNCTUATION COMMA 0xa4fe
|
| 102 |
+
᙮ 13473 CANADIAN SYLLABICS FULL STOP 0x166e
|
| 103 |
+
꤯ 12892 KAYAH LI SIGN SHYA 0xa92f
|
| 104 |
+
⵰ 11478 TIFINAGH SEPARATOR MARK 0x2d70
|
| 105 |
+
꓿ 11118 LISU PUNCTUATION FULL STOP 0xa4ff
|
| 106 |
+
॥ 10763 DEVANAGARI DOUBLE DANDA 0x965
|
| 107 |
+
؞ 10403 ARABIC TRIPLE DOT PUNCTUATION MARK 0x61e
|
| 108 |
+
၍ 8936 MYANMAR SYMBOL COMPLETED 0x104d
|
| 109 |
+
· 8431 GREEK ANO TELEIA 0x387
|
| 110 |
+
† 7477 DAGGER 0x2020
|
| 111 |
+
၌ 6632 MYANMAR SYMBOL LOCATIVE 0x104c
|
| 112 |
+
፣ 5719 ETHIOPIC COMMA 0x1363
|
| 113 |
+
៖ 5528 KHMER SIGN CAMNUC PII KUUH 0x17d6
|
| 114 |
+
꤮ 4791 KAYAH LI SIGN CWI 0xa92e
|
| 115 |
+
※ 3439 REFERENCE MARK 0x203b
|
| 116 |
+
፦ 2727 ETHIOPIC PREFACE COLON 0x1366
|
| 117 |
+
• 1749 BULLET 0x2022
|
| 118 |
+
¶ 1507 PILCROW SIGN 0xb6
|
| 119 |
+
၎ 1386 MYANMAR SYMBOL AFOREMENTIONED 0x104e
|
| 120 |
+
﹖ 1224 SMALL QUESTION MARK 0xfe56
|
| 121 |
+
; 975 GREEK QUESTION MARK 0x37e
|
| 122 |
+
… 827 HORIZONTAL ELLIPSIS 0x2026
|
| 123 |
+
% 617 PERCENT SIGN 0x25
|
| 124 |
+
・ 468 KATAKANA MIDDLE DOT 0x30fb
|
| 125 |
+
༎ 306 TIBETAN MARK NYIS SHAD 0xf0e
|
| 126 |
+
‡ 140 DOUBLE DAGGER 0x2021
|
| 127 |
+
# 137 NUMBER SIGN 0x23
|
| 128 |
+
@ 125 COMMERCIAL AT 0x40
|
| 129 |
+
፡ 121 ETHIOPIC WORDSPACE 0x1361
|
| 130 |
+
៚ 55 KHMER SIGN KOOMUUT 0x17da
|
| 131 |
+
៕ 49 KHMER SIGN BARIYOOSAN 0x17d5
|
| 132 |
+
﹐ 10 SMALL COMMA 0xfe50
|
| 133 |
+
༅ 6 TIBETAN MARK CLOSING YIG MGO SGAB MA 0xf05
|
| 134 |
+
༄ 6 TIBETAN MARK INITIAL YIG MGO MDUN MA 0xf04
|
| 135 |
+
. 2 FULLWIDTH FULL STOP 0xff0e
|
| 136 |
+
﹗ 2 SMALL EXCLAMATION MARK 0xfe57
|
| 137 |
+
﹕ 2 SMALL COLON 0xfe55
|
| 138 |
+
‰ 2 PER MILLE SIGN 0x2030
|
| 139 |
+
・ 1 HALFWIDTH KATAKANA MIDDLE DOT 0xff65
|
| 140 |
+
( 98504 LEFT PARENTHESIS 0x28
|
| 141 |
+
[ 27245 LEFT SQUARE BRACKET 0x5b
|
| 142 |
+
⌊ 1567 LEFT FLOOR 0x230a
|
| 143 |
+
〔 95 LEFT TORTOISE SHELL BRACKET 0x3014
|
| 144 |
+
【 36 LEFT BLACK LENTICULAR BRACKET 0x3010
|
| 145 |
+
﴿ 14 ORNATE RIGHT PARENTHESIS 0xfd3f
|
| 146 |
+
_ 4851 LOW LINE 0x5f
|
| 147 |
+
$ 72 DOLLAR SIGN 0x24
|
| 148 |
+
€ 14 EURO SIGN 0x20ac
|
| 149 |
+
£ 2 POUND SIGN 0xa3
|
| 150 |
+
~ 27462 TILDE 0x7e
|
| 151 |
+
= 11450 EQUALS SIGN 0x3d
|
| 152 |
+
| 8430 VERTICAL LINE 0x7c
|
| 153 |
+
− 3971 MINUS SIGN 0x2212
|
| 154 |
+
≫ 1904 MUCH GREATER-THAN 0x226b
|
| 155 |
+
≪ 1903 MUCH LESS-THAN 0x226a
|
| 156 |
+
+ 1450 PLUS SIGN 0x2b
|
| 157 |
+
< 345 FULLWIDTH LESS-THAN SIGN 0xff1c
|
| 158 |
+
> 344 FULLWIDTH GREATER-THAN SIGN 0xff1e
|
| 159 |
+
¬ 5 NOT SIGN 0xac
|
| 160 |
+
× 4 MULTIPLICATION SIGN 0xd7
|
| 161 |
+
→ 2 RIGHTWARDS ARROW 0x2192
|
| 162 |
+
᙭ 537 CANADIAN SYLLABICS CHI SIGN 0x166d
|
| 163 |
+
° 499 DEGREE SIGN 0xb0
|
| 164 |
+
႟ 421 MYANMAR SYMBOL SHAN EXCLAMATION 0x109f
|
| 165 |
+
� 192 REPLACEMENT CHARACTER 0xfffd
|
| 166 |
+
⌟ 54 BOTTOM RIGHT CORNER 0x231f
|
| 167 |
+
⌞ 54 BOTTOM LEFT CORNER 0x231e
|
| 168 |
+
© 2 COPYRIGHT SIGN 0xa9
|
| 169 |
+
40 NARROW NO-BREAK SPACE 0x202f
|
| 170 |
+
1 SIX-PER-EM SPACE 0x2006
|
| 171 |
+
˜ 40261 SMALL TILDE 0x2dc
|
| 172 |
+
^ 6469 CIRCUMFLEX ACCENT 0x5e
|
| 173 |
+
¯ 20 MACRON 0xaf
|
| 174 |
+
ˇ 191442 CARON 0x2c7
|
| 175 |
+
ⁿ 38144 SUPERSCRIPT LATIN SMALL LETTER N 0x207f
|
| 176 |
+
ـ 9440 ARABIC TATWEEL 0x640
|
| 177 |
+
ๆ 6766 THAI CHARACTER MAIYAMOK 0xe46
|
| 178 |
+
ៗ 3310 KHMER SIGN LEK TOO 0x17d7
|
| 179 |
+
々 678 IDEOGRAPHIC ITERATION MARK 0x3005
|
| 180 |
+
ໆ 430 LAO KO LA 0xec6
|
| 181 |
+
ー 319 KATAKANA-HIRAGANA PROLONGED SOUND MARK 0x30fc
|
| 182 |
+
ⁱ 137 SUPERSCRIPT LATIN SMALL LETTER I 0x2071
|
| 183 |
+
৷ 11056 BENGALI CURRENCY NUMERATOR FOUR 0x9f7
|
| 184 |
+
⅓ 26 VULGAR FRACTION ONE THIRD 0x2153
|
| 185 |
+
½ 26 VULGAR FRACTION ONE HALF 0xbd
|
| 186 |
+
¼ 4 VULGAR FRACTION ONE QUARTER 0xbc
|
| 187 |
+
⅟ 1 FRACTION NUMERATOR ONE 0x215f
|
| 188 |
+
⁄ 57 FRACTION SLASH 0x2044
|
server/inference/text_normalization.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
import unicodedata

from . import norm_config_module

norm_config = norm_config_module.norm_config  # type: ignore

# Fields every per-language config must define; missing ones are backfilled
# from the "*" (default) config.
_REQUIRED_FIELDS = (
    "lower_case",
    "punc_set",
    "del_set",
    "mapping",
    "digit_set",
    "unicode_norm",
)


def text_normalize(
    text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False
):
    """Normalize *text* for the language identified by *iso_code*.

    Normalization lower-cases the text, applies per-language character
    mappings, replaces punctuation with spaces, deletes invisible characters
    (directionality marks, diacritics, ...), optionally removes digit-only
    words, and collapses extra whitespace.

    Args:
        text: The string to normalize.
        iso_code: ISO 639-3 language code used to select the per-language
            config; falls back to the "*" default when unknown.
        lower_case: If False, skip lower-casing even when the language
            config requests it.
        remove_numbers: If True, drop words consisting only of digits.
        remove_brackets: If True, drop all parenthesized text.  Text in
            parentheses that contains a digit is always dropped (it usually
            corresponds to verse references such as "(Sam 23:17)").

    Returns:
        The normalized string.
    """
    config = norm_config.get(iso_code, norm_config["*"])
    # Work on a copy: the original backfill loop wrote defaults directly
    # into the shared, module-level config dicts.
    config = dict(config)
    for field in _REQUIRED_FIELDS:
        config.setdefault(field, norm_config["*"][field])

    text = unicodedata.normalize(config["unicode_norm"], text)

    # Convert to lower case.
    if config["lower_case"] and lower_case:
        text = text.lower()

    # Brackets: always remove bracketed text with numbers in it; optionally
    # remove all bracketed text.
    text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
    if remove_brackets:
        text = re.sub(r"\([^\)]*\)", " ", text)

    # Apply per-language mappings (patterns are treated as regexes).
    for old, new in config["mapping"].items():
        text = re.sub(old, new, text)

    # Replace punctuation with a space so adjoining words stay separated.
    punct_pattern = r"[" + config["punc_set"] + "]"
    normalized_text = re.sub(punct_pattern, " ", text)

    # Delete invisible characters outright (no space inserted).
    delete_pattern = r"[" + config["del_set"] + "]"
    normalized_text = re.sub(delete_pattern, "", normalized_text)

    # Remove words containing only digits.  A digit-only word is a run of
    # digits bounded by start/end of string or whitespace; lookarounds keep
    # the surrounding whitespace so overlapping matches are still replaced.
    # This single pattern also covers a string that *is* one number, an
    # edge case the original three alternatives missed.
    if remove_numbers:
        digits_pattern = "[" + config["digit_set"] + "]+"
        complete_digit_pattern = (
            r"(?:^|(?<=\s))" + digits_pattern + r"(?=\s|$)"
        )
        normalized_text = re.sub(complete_digit_pattern, " ", normalized_text)

    if config["rm_diacritics"]:
        # Imported lazily: unidecode is only needed for the few languages
        # (e.g. Javanese) whose config sets rm_diacritics.
        from unidecode import unidecode

        normalized_text = unidecode(normalized_text)

    # Collapse whitespace runs and trim the ends.
    normalized_text = re.sub(r"\s+", " ", normalized_text).strip()

    return normalized_text
|
server/lang_dict.py
ADDED
|
@@ -0,0 +1,1675 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
lang_code={
|
| 2 |
+
"English (Latin)": "eng_Latn",
|
| 3 |
+
"Hindi (Devanagari)": "hin_Deva",
|
| 4 |
+
"Bengali (Bengali)": "ben_Beng",
|
| 5 |
+
"Arbëreshë Albanian (Latin)": "aae_Latn",
|
| 6 |
+
"Afade (Latin)": "aal_Latn",
|
| 7 |
+
"Abung (Latin)": "abb_Latn",
|
| 8 |
+
"Abidji (Latin)": "abi_Latn",
|
| 9 |
+
"Abkhazian (Cyrillic)": "abk_Cyrl",
|
| 10 |
+
"Abua (Latin)": "abn_Latn",
|
| 11 |
+
"Abellen Ayta (Latin)": "abp_Latn",
|
| 12 |
+
"Abron (Latin)": "abr_Latn",
|
| 13 |
+
"Ambulas (Latin)": "abs_Latn",
|
| 14 |
+
"Achagua (Latin)": "aca_Latn",
|
| 15 |
+
"Gikyode (Latin)": "acd_Latn",
|
| 16 |
+
"Achinese (Latin)": "ace_Latn",
|
| 17 |
+
"Saint Lucian Creole French (Latin)": "acf_Latn",
|
| 18 |
+
"Acholi (Latin)": "ach_Latn",
|
| 19 |
+
"Iraqi Arabic (Arabic)": "acm_Arab",
|
| 20 |
+
"Achang (Latin)": "acn_Latn",
|
| 21 |
+
"Achi (Latin)": "acr_Latn",
|
| 22 |
+
"Achuar-Shiwiar (Latin)": "acu_Latn",
|
| 23 |
+
"Hijazi Arabic (Arabic)": "acw_Arab",
|
| 24 |
+
"Adele (Latin)": "ade_Latn",
|
| 25 |
+
"Adhola (Latin)": "adh_Latn",
|
| 26 |
+
"Adioukrou (Latin)": "adj_Latn",
|
| 27 |
+
"Amdo Tibetan (Tibetan)": "adx_Tibt",
|
| 28 |
+
"Adyghe (Cyrillic)": "ady_Cyrl",
|
| 29 |
+
"Tunisian Arabic (Arabic)": "aeb_Arab",
|
| 30 |
+
"Saidi Arabic (Arabic)": "aec_Arab",
|
| 31 |
+
"Arem (Latin)": "aeu_Latn",
|
| 32 |
+
"Gulf Arabic (Arabic)": "afb_Arab",
|
| 33 |
+
"Eloyi (Latin)": "afo_Latn",
|
| 34 |
+
"Afrikaans (Latin)": "afr_Latn",
|
| 35 |
+
"Agarabi (Latin)": "agd_Latn",
|
| 36 |
+
"Angor (Latin)": "agg_Latn",
|
| 37 |
+
"Agariya (Latin)": "agn_Latn",
|
| 38 |
+
"Aguaruna (Latin)": "agr_Latn",
|
| 39 |
+
"Aguacateco (Latin)": "agu_Latn",
|
| 40 |
+
"Agul (Cyrillic)": "agx_Cyrl",
|
| 41 |
+
"Ahanta (Latin)": "aha_Latn",
|
| 42 |
+
"Akha (Latin)": "ahk_Latn",
|
| 43 |
+
"Igo (Latin)": "ahl_Latn",
|
| 44 |
+
"Arosi (Latin)": "ahs_Latn",
|
| 45 |
+
"Arosi (Latin)": "aia_Latn",
|
| 46 |
+
"Aja (Benin) (Latin)": "ajg_Latn",
|
| 47 |
+
"Akan (Latin)": "aka_Latn",
|
| 48 |
+
"Batak Angkola (Latin)": "akb_Latn",
|
| 49 |
+
"Akawaio (Latin)": "ake_Latn",
|
| 50 |
+
"Akpes (Latin)": "akp_Latn",
|
| 51 |
+
"Alago (Latin)": "ala_Latn",
|
| 52 |
+
"Alangan (Latin)": "alj_Latn",
|
| 53 |
+
"Gheg Albanian (Latin)": "aln_Latn",
|
| 54 |
+
"Larike-Wakasihu (Latin)": "alo_Latn",
|
| 55 |
+
"Alune (Latin)": "alp_Latn",
|
| 56 |
+
"Tosk Albanian (Latin)": "als_Latn",
|
| 57 |
+
"Southern Altai (Cyrillic)": "alt_Cyrl",
|
| 58 |
+
"Alur (Latin)": "alz_Latn",
|
| 59 |
+
"Amarasi (Latin)": "ame_Latn",
|
| 60 |
+
"Hamer-Banna (Latin)": "amf_Latn",
|
| 61 |
+
"Amharic (Ethiopic)": "amh_Ethi",
|
| 62 |
+
"Amis (Latin)": "ami_Latn",
|
| 63 |
+
"Amo (Latin)": "amk_Latn",
|
| 64 |
+
"Amanab (Latin)": "amu_Latn",
|
| 65 |
+
"Ngas (Latin)": "anc_Latn",
|
| 66 |
+
"Goemai (Latin)": "ank_Latn",
|
| 67 |
+
"Obolo (Latin)": "ann_Latn",
|
| 68 |
+
"Angika (Devanagari)": "anp_Deva",
|
| 69 |
+
"Anaang (Latin)": "anw_Latn",
|
| 70 |
+
"Anyin (Latin)": "any_Latn",
|
| 71 |
+
"A'ou (Latin)": "aom_Latn",
|
| 72 |
+
"Uab Meto (Latin)": "aoz_Latn",
|
| 73 |
+
"Sa'a (Latin)": "apb_Latn",
|
| 74 |
+
"North Levantine Arabic (Arabic)": "apc_Arab",
|
| 75 |
+
"Sudanese Arabic (Arabic)": "apd_Arab",
|
| 76 |
+
"A-Pucikwar (Latin)": "apr_Latn",
|
| 77 |
+
"Standard Arabic (Arabic)": "arb_Arab",
|
| 78 |
+
"Aragonese (Latin)": "arg_Latn",
|
| 79 |
+
"Arhâ (Latin)": "arl_Latn",
|
| 80 |
+
"Algerian Arabic (Arabic)": "arq_Arab",
|
| 81 |
+
"Najdi Arabic (Arabic)": "ars_Arab",
|
| 82 |
+
"Moroccan Arabic (Arabic)": "ary_Arab",
|
| 83 |
+
"Egyptian Arabic (Arabic)": "arz_Arab",
|
| 84 |
+
"Asu (Tanzania) (Latin)": "asa_Latn",
|
| 85 |
+
"Cishingini (Latin)": "asg_Latn",
|
| 86 |
+
"Assamese (Bengali)": "asm_Beng",
|
| 87 |
+
"Asturian (Latin)": "ast_Latn",
|
| 88 |
+
"Ata (Latin)": "ata_Latn",
|
| 89 |
+
"Atsi (Latin)": "atb_Latn",
|
| 90 |
+
"Atong (India) (Latin)": "atg_Latn",
|
| 91 |
+
"Ivbie North-Okpela-Arhe (Latin)": "ati_Latn",
|
| 92 |
+
"Atikamekw (Latin)": "atq_Latn",
|
| 93 |
+
"Avaric (Cyrillic)": "ava_Cyrl",
|
| 94 |
+
"Avikam (Latin)": "avn_Latn",
|
| 95 |
+
"Avokaya (Latin)": "avu_Latn",
|
| 96 |
+
"Awadhi (Devanagari)": "awa_Deva",
|
| 97 |
+
"Awa-Cuaiquer (Latin)": "awb_Latn",
|
| 98 |
+
"Arawum (Latin)": "awo_Latn",
|
| 99 |
+
"South Levantine Arabic (Arabic)": "ayl_Arab",
|
| 100 |
+
"Ayizo Gbe (Latin)": "ayo_Latn",
|
| 101 |
+
"North Mesopotamian Arabic (Arabic)": "ayp_Arab",
|
| 102 |
+
"Aymara (Latin)": "ayr_Latn",
|
| 103 |
+
"Mai Brat (Latin)": "ayz_Latn",
|
| 104 |
+
"Azerbaijani (Arabic)": "aze_Arab",
|
| 105 |
+
"Azerbaijani (Cyrillic)": "aze_Cyrl",
|
| 106 |
+
"Azerbaijani (Latin)": "aze_Latn",
|
| 107 |
+
"Ambele (Latin)": "azg_Latn",
|
| 108 |
+
"Highland Oaxaca Chontal (Latin)": "azz_Latn",
|
| 109 |
+
"Bagheli (Latin)": "bag_Latn",
|
| 110 |
+
"Bashkir (Cyrillic)": "bak_Cyrl",
|
| 111 |
+
"Bambara (Latin)": "bam_Latn",
|
| 112 |
+
"Balinese (Latin)": "ban_Latn",
|
| 113 |
+
"Waimaha (Latin)": "bao_Latn",
|
| 114 |
+
"Basa (Cameroon) (Latin)": "bas_Latn",
|
| 115 |
+
"Vengo (Latin)": "bav_Latn",
|
| 116 |
+
"Bambili-Bambui (Latin)": "bax_Latn",
|
| 117 |
+
"Barai (Latin)": "bba_Latn",
|
| 118 |
+
"Baeggu (Latin)": "bbb_Latn",
|
| 119 |
+
"Batak Toba (Latin)": "bbc_Latn",
|
| 120 |
+
"Ghomálá' (Latin)": "bbj_Latn",
|
| 121 |
+
"Babanki (Georgian)": "bbl_Geor",
|
| 122 |
+
"Northern Bobo Madaré (Latin)": "bbo_Latn",
|
| 123 |
+
"Kulung (Nigeria) (Latin)": "bbu_Latn",
|
| 124 |
+
"Southern Balochi (Arabic)": "bcc_Arab",
|
| 125 |
+
"Southern Balochi (Latin)": "bcc_Latn",
|
| 126 |
+
"Bainouk-Samik (Latin)": "bce_Latn",
|
| 127 |
+
"Baoulé (Latin)": "bci_Latn",
|
| 128 |
+
"Central Bikol (Latin)": "bcl_Latn",
|
| 129 |
+
"Bainouk-Gunyaamolo (Latin)": "bcs_Latn",
|
| 130 |
+
"Bana (Latin)": "bcw_Latn",
|
| 131 |
+
"Bannoni (Latin)": "bcy_Latn",
|
| 132 |
+
"Bainouk-Gunyaamolo (Latin)": "bcz_Latn",
|
| 133 |
+
"Bai (Latin)": "bda_Latn",
|
| 134 |
+
"Bade (Latin)": "bde_Latn",
|
| 135 |
+
"Balesin-Bisaya (Latin)": "bdg_Latn",
|
| 136 |
+
"Baka (South Sudan) (Latin)": "bdh_Latn",
|
| 137 |
+
"Burun (Latin)": "bdm_Latn",
|
| 138 |
+
"Bau (Latin)": "bdq_Latn",
|
| 139 |
+
"Oroko (Latin)": "bdu_Latn",
|
| 140 |
+
"Bebele (Latin)": "beb_Latn",
|
| 141 |
+
"Biali (Latin)": "beh_Latn",
|
| 142 |
+
"Belarusian (Cyrillic)": "bel_Cyrl",
|
| 143 |
+
"Bemba (Zambia) (Latin)": "bem_Latn",
|
| 144 |
+
"Bengali (Bengali)": "ben_Beng",
|
| 145 |
+
"Bila (Latin)": "bep_Latn",
|
| 146 |
+
"Betawi (Latin)": "bew_Latn",
|
| 147 |
+
"Yarawa (Latin)": "bex_Latn",
|
| 148 |
+
"Beba (Latin)": "bfa_Latn",
|
| 149 |
+
"Bafut (Latin)": "bfd_Latn",
|
| 150 |
+
"Beba (Latin)": "bfo_Latn",
|
| 151 |
+
"Balti (Arabic)": "bft_Arab",
|
| 152 |
+
"Bagheli (Devanagari)": "bfy_Deva",
|
| 153 |
+
"Pahari-Potwari (Devanagari)": "bfz_Deva",
|
| 154 |
+
"Haryanvi (Devanagari)": "bgc_Deva",
|
| 155 |
+
"Gwamhi-Wuri (Arabic)": "bgp_Arab",
|
| 156 |
+
"Bagri (Devanagari)": "bgq_Deva",
|
| 157 |
+
"Bauria (Latin)": "bgr_Latn",
|
| 158 |
+
"Gamo-Gofa-Dawro (Latin)": "bgt_Latn",
|
| 159 |
+
"Bhatri (Devanagari)": "bgw_Deva",
|
| 160 |
+
"Bharia (Devanagari)": "bha_Deva",
|
| 161 |
+
"Bhili (Devanagari)": "bhb_Deva",
|
| 162 |
+
"Bukhari (Cyrillic)": "bhh_Cyrl",
|
| 163 |
+
"Bhojpuri (Devanagari)": "bho_Deva",
|
| 164 |
+
"Bima (Latin)": "bhp_Latn",
|
| 165 |
+
"Bhattiyali (Devanagari)": "bht_Deva",
|
| 166 |
+
"Biangai (Latin)": "bhz_Latn",
|
| 167 |
+
"Bissa (Latin)": "bib_Latn",
|
| 168 |
+
"Bimoba (Latin)": "bim_Latn",
|
| 169 |
+
"Bislama (Latin)": "bis_Latn",
|
| 170 |
+
"B Eliot (Latin)": "biv_Latn",
|
| 171 |
+
"Badyara (Devanagari)": "bjj_Deva",
|
| 172 |
+
"Barok (Latin)": "bjk_Latn",
|
| 173 |
+
"Banjar (Latin)": "bjn_Latn",
|
| 174 |
+
"Binumarien (Latin)": "bjr_Latn",
|
| 175 |
+
"Bulu (Papua New Guinea) (Latin)": "bjt_Latn",
|
| 176 |
+
"Bedjond (Latin)": "bjv_Latn",
|
| 177 |
+
"Bakwé (Latin)": "bjw_Latn",
|
| 178 |
+
"Bariji (Latin)": "bjz_Latn",
|
| 179 |
+
"Binukid (Latin)": "bkd_Latn",
|
| 180 |
+
"Bakoko (Latin)": "bkh_Latn",
|
| 181 |
+
"Boki (Latin)": "bkm_Latn",
|
| 182 |
+
"Bekwarra (Latin)": "bkv_Latn",
|
| 183 |
+
"Bungku (Latin)": "bky_Latn",
|
| 184 |
+
"Bolia (Latin)": "ble_Latn",
|
| 185 |
+
"Baluan-Pam (Latin)": "blh_Latn",
|
| 186 |
+
"Tai Dam (Latin)": "blt_Latn",
|
| 187 |
+
"Mag-Indi Ayta (Latin)": "blx_Latn",
|
| 188 |
+
"Balantak (Latin)": "blz_Latn",
|
| 189 |
+
"Bembe (Latin)": "bmm_Latn",
|
| 190 |
+
"Biao Mon (Latin)": "bmq_Latn",
|
| 191 |
+
"Muinane (Latin)": "bmr_Latn",
|
| 192 |
+
"Bomwali (Latin)": "bmu_Latn",
|
| 193 |
+
"Bum (Latin)": "bmv_Latn",
|
| 194 |
+
"Bangi (Bengali)": "bng_Beng",
|
| 195 |
+
"Bonerif (Latin)": "bnm_Latn",
|
| 196 |
+
"Bontok (Latin)": "bnn_Latn",
|
| 197 |
+
"Bantoanon (Latin)": "bno_Latn",
|
| 198 |
+
"Bola (Papua New Guinea) (Latin)": "bnp_Latn",
|
| 199 |
+
"Bunun (Devanagari)": "bns_Deva",
|
| 200 |
+
"Bora (Latin)": "boa_Latn",
|
| 201 |
+
"Tibetan (Tibetan)": "bod_Tibt",
|
| 202 |
+
"Anjam (Latin)": "boj_Latn",
|
| 203 |
+
"Berom (Latin)": "bom_Latn",
|
| 204 |
+
"Borôro (Latin)": "bor_Latn",
|
| 205 |
+
"Bosnian (Latin)": "bos_Latn",
|
| 206 |
+
"Bonkiman (Latin)": "bou_Latn",
|
| 207 |
+
"Bongo (Latin)": "bov_Latn",
|
| 208 |
+
"Tuwuli (Latin)": "box_Latn",
|
| 209 |
+
"Barapasi (Latin)": "bpr_Latn",
|
| 210 |
+
"Banda-Banda (Latin)": "bps_Latn",
|
| 211 |
+
"Birgid (Latin)": "bqc_Latn",
|
| 212 |
+
"Baga Pokur (Latin)": "bqg_Latn",
|
| 213 |
+
"Bakhtiari (Arabic)": "bqi_Arab",
|
| 214 |
+
"Banda-Mbrès (Latin)": "bqj_Latn",
|
| 215 |
+
"Banda-Ndélé (Latin)": "bqp_Latn",
|
| 216 |
+
"Braj (Devanagari)": "bra_Deva",
|
| 217 |
+
"Breton (Latin)": "bre_Latn",
|
| 218 |
+
"Brahui (Arabic)": "brh_Arab",
|
| 219 |
+
"Bira (Congo) (Latin)": "bri_Latn",
|
| 220 |
+
"Burui (Latin)": "bru_Latn",
|
| 221 |
+
"Bodo (India) (Devanagari)": "brx_Deva",
|
| 222 |
+
"Basa (Nigeria) (Latin)": "bsc_Latn",
|
| 223 |
+
"Kati (Arabic)": "bsh_Arab",
|
| 224 |
+
"Bangolan (Latin)": "bsj_Latn",
|
| 225 |
+
"Burushaski (Latin)": "bsk_Latn",
|
| 226 |
+
"Bassa-Kontagora (Latin)": "bsq_Latn",
|
| 227 |
+
"Akoose (Latin)": "bss_Latn",
|
| 228 |
+
"Busami (Latin)": "bsy_Latn",
|
| 229 |
+
"Batak Dairi (Latin)": "btd_Latn",
|
| 230 |
+
"Batak Mandailing (Latin)": "btm_Latn",
|
| 231 |
+
"Ratte Buri (Latin)": "bts_Latn",
|
| 232 |
+
"Bete-Bendi (Latin)": "btt_Latn",
|
| 233 |
+
"Bateri (Arabic)": "btv_Arab",
|
| 234 |
+
"Batak Karo (Latin)": "btx_Latn",
|
| 235 |
+
"Budu (Latin)": "bud_Latn",
|
| 236 |
+
"Buginese (Latin)": "bug_Latn",
|
| 237 |
+
"Bulgarian (Cyrillic)": "bul_Cyrl",
|
| 238 |
+
"Bulu (Cameroon) (Latin)": "bum_Latn",
|
| 239 |
+
"Bulu (Cameroon) (Latin)": "buo_Latn",
|
| 240 |
+
"Bussa (Latin)": "bus_Latn",
|
| 241 |
+
"Bokobaru (Latin)": "bux_Latn",
|
| 242 |
+
"Bube (Latin)": "bvb_Latn",
|
| 243 |
+
"Baelelea (Latin)": "bvc_Latn",
|
| 244 |
+
"Buriat (Latin)": "bvz_Latn",
|
| 245 |
+
"Bwatoo (Latin)": "bwq_Latn",
|
| 246 |
+
"Bura-Pabir (Latin)": "bwr_Latn",
|
| 247 |
+
"Buli (Ghana) (Latin)": "bwu_Latn",
|
| 248 |
+
"Bilur (Latin)": "bxf_Latn",
|
| 249 |
+
"Buhutu (Latin)": "bxk_Latn",
|
| 250 |
+
"Tiéyaxo Bozo (Latin)": "byc_Latn",
|
| 251 |
+
"Bina (Nigeria) (Latin)": "byr_Latn",
|
| 252 |
+
"Bisa (Latin)": "bys_Latn",
|
| 253 |
+
"Batak (Latin)": "byv_Latn",
|
| 254 |
+
"Qaqet (Latin)": "byx_Latn",
|
| 255 |
+
"Blaan (Latin)": "bzh_Latn",
|
| 256 |
+
"Bisu (Thai)": "bzi_Thai",
|
| 257 |
+
"Jamaican Creole English (Latin)": "bzj_Latn",
|
| 258 |
+
"Boano (Sulawesi) (Latin)": "bzw_Latn",
|
| 259 |
+
"Chortí (Latin)": "caa_Latn",
|
| 260 |
+
"Garifuna (Latin)": "cab_Latn",
|
| 261 |
+
"Chuj (Latin)": "cac_Latn",
|
| 262 |
+
"Kaqchikel (Latin)": "cak_Latn",
|
| 263 |
+
"Carolinian (Latin)": "cap_Latn",
|
| 264 |
+
"Galibi Carib (Latin)": "car_Latn",
|
| 265 |
+
"Tsimané (Latin)": "cas_Latn",
|
| 266 |
+
"Catalan (Latin)": "cat_Latn",
|
| 267 |
+
"Cua (Latin)": "cax_Latn",
|
| 268 |
+
"Cabiyarí (Latin)": "cbc_Latn",
|
| 269 |
+
"Chachi (Latin)": "cbi_Latn",
|
| 270 |
+
"Carijona (Latin)": "cbr_Latn",
|
| 271 |
+
"Cashibo-Cacataibo (Latin)": "cbs_Latn",
|
| 272 |
+
"Chayahuita (Latin)": "cbt_Latn",
|
| 273 |
+
"Chachi (Latin)": "cbu_Latn",
|
| 274 |
+
"Kakua (Latin)": "cbv_Latn",
|
| 275 |
+
"Chopi (Latin)": "cce_Latn",
|
| 276 |
+
"Samba Daka (Latin)": "ccg_Latn",
|
| 277 |
+
"Chakma (Latin)": "cco_Latn",
|
| 278 |
+
"Churahi (Devanagari)": "cdj_Deva",
|
| 279 |
+
"Min Dong Chinese (Han)": "cdo_Hans",
|
| 280 |
+
"Cebuano (Latin)": "ceb_Latn",
|
| 281 |
+
"Cen Gbe (Latin)": "ceg_Latn",
|
| 282 |
+
"Cek pet (Latin)": "cek_Latn",
|
| 283 |
+
"Centúúm (Latin)": "cen_Latn",
|
| 284 |
+
"Czech (Latin)": "ces_Latn",
|
| 285 |
+
"Chafarruscas (Latin)": "cfa_Latn",
|
| 286 |
+
"Falam Chin (Latin)": "cfm_Latn",
|
| 287 |
+
"Chiga (Latin)": "cgg_Latn",
|
| 288 |
+
"Chiga (Latin)": "cgg_Latn",
|
| 289 |
+
"Chechen (Cyrillic)": "che_Cyrl",
|
| 290 |
+
"Chontal de Tabasco (Latin)": "chf_Latn",
|
| 291 |
+
"Chatino (Latin)": "chq_Latn",
|
| 292 |
+
"Chuvash (Cyrillic)": "chv_Cyrl",
|
| 293 |
+
"Ozumacín Chinantec (Latin)": "chz_Latn",
|
| 294 |
+
"Chokwe (Latin)": "cjk_Latn",
|
| 295 |
+
"Chamorro (Latin)": "cjo_Latn",
|
| 296 |
+
"Upper Chehalis (Latin)": "cjp_Latn",
|
| 297 |
+
"Shor (Cyrillic)": "cjs_Cyrl",
|
| 298 |
+
"Central Kurdish (Arabic)": "ckb_Arab",
|
| 299 |
+
"Cibak (Latin)": "ckl_Latn",
|
| 300 |
+
"Anufo (Latin)": "cko_Latn",
|
| 301 |
+
"Chak (Latin)": "ckr_Latn",
|
| 302 |
+
"Chukot (Cyrillic)": "ckt_Cyrl",
|
| 303 |
+
"Chukot (Latin)": "cky_Latn",
|
| 304 |
+
"Chala (Latin)": "cla_Latn",
|
| 305 |
+
"Lealao Chinantec (Latin)": "cle_Latn",
|
| 306 |
+
"Eastern Highland Chatino (Latin)": "cly_Latn",
|
| 307 |
+
"Mro-Khimi Chin (Latin)": "cme_Latn",
|
| 308 |
+
"Mandarin Chinese (Han)": "cmn_Hans",
|
| 309 |
+
"Mandarin Chinese (Han)": "cmn_Hant",
|
| 310 |
+
"Central Mnong (Khmer)": "cmo_Khmr",
|
| 311 |
+
"Central Mnong (Latin)": "cmo_Latn",
|
| 312 |
+
"Mro-Khimi Chin (Latin)": "cmr_Latn",
|
| 313 |
+
"Hakha Chin (Latin)": "cnh_Latn",
|
| 314 |
+
"Ashéninka Pajonal (Latin)": "cni_Latn",
|
| 315 |
+
"Lalana Chinantec (Latin)": "cnl_Latn",
|
| 316 |
+
"Northern Tlaxiaco Chatino (Latin)": "cnt_Latn",
|
| 317 |
+
"Cochimi (Latin)": "coe_Latn",
|
| 318 |
+
"Cofán (Latin)": "cof_Latn",
|
| 319 |
+
"Chong (Latin)": "cok_Latn",
|
| 320 |
+
"Cotoname (Latin)": "con_Latn",
|
| 321 |
+
"Cornish (Latin)": "cor_Latn",
|
| 322 |
+
"Caquinte (Latin)": "cot_Latn",
|
| 323 |
+
"Wamey (Latin)": "cou_Latn",
|
| 324 |
+
"Ponares (Latin)": "cpa_Latn",
|
| 325 |
+
"Ucayali-Yurúa Ashéninka (Latin)": "cpb_Latn",
|
| 326 |
+
"Pichis Ashéninka (Latin)": "cpu_Latn",
|
| 327 |
+
"Pu-Xian Chinese (Han)": "cpx_Hans",
|
| 328 |
+
"Ucayali-Yurúa Ashéninka (Latin)": "cpy_Latn",
|
| 329 |
+
"Crimean Tatar (Cyrillic)": "crh_Cyrl",
|
| 330 |
+
"Cree (Canadian Aboriginal Syllabics)": "crk_Cans",
|
| 331 |
+
"Cree (Latin)": "crk_Latn",
|
| 332 |
+
"El Nayar Cora (Latin)": "crn_Latn",
|
| 333 |
+
"Caramanta (Latin)": "crq_Latn",
|
| 334 |
+
"Seselwa Creole French (Latin)": "crs_Latn",
|
| 335 |
+
"Iyo'wujwa Chorote (Latin)": "crt_Latn",
|
| 336 |
+
"Carrier (Latin)": "csk_Latn",
|
| 337 |
+
"Southern Ping Chinese (Latin)": "cso_Latn",
|
| 338 |
+
"Northern Tlaxiaco Chatino (Latin)": "ctd_Latn",
|
| 339 |
+
"Tepinapa Chinantec (Latin)": "cte_Latn",
|
| 340 |
+
"Chittagonian (Bengali)": "ctg_Beng",
|
| 341 |
+
"Tataltepec Chatino (Latin)": "ctl_Latn",
|
| 342 |
+
"Tataltepec Chatino (Latin)": "cto_Latn",
|
| 343 |
+
"Wayanad Chetti (Latin)": "ctu_Latn",
|
| 344 |
+
"Cun (Latin)": "cuc_Latn",
|
| 345 |
+
"Culina (Latin)": "cui_Latn",
|
| 346 |
+
"Culina (Latin)": "cuk_Latn",
|
| 347 |
+
"Culina (Latin)": "cul_Latn",
|
| 348 |
+
"Teutila Cuicatec (Latin)": "cut_Latn",
|
| 349 |
+
"Chuka (Latin)": "cux_Latn",
|
| 350 |
+
"Chuwabu (Latin)": "cwa_Latn",
|
| 351 |
+
"Kwere (Latin)": "cwe_Latn",
|
| 352 |
+
"Nute (Latin)": "cwt_Latn",
|
| 353 |
+
"Cemuhî (Latin)": "cya_Latn",
|
| 354 |
+
"Welsh (Latin)": "cym_Latn",
|
| 355 |
+
"Dambi (Latin)": "daa_Latn",
|
| 356 |
+
"Dagbani (Latin)": "dag_Latn",
|
| 357 |
+
"Gwahatike (Latin)": "dah_Latn",
|
| 358 |
+
"Danish (Latin)": "dan_Latn",
|
| 359 |
+
"Dargwa (Cyrillic)": "dar_Cyrl",
|
| 360 |
+
"Taita (Latin)": "dav_Latn",
|
| 361 |
+
"Dabarre (Latin)": "dbd_Latn",
|
| 362 |
+
"Doga (Latin)": "dbj_Latn",
|
| 363 |
+
"Daba (Latin)": "dbq_Latn",
|
| 364 |
+
"Deccan (Arabic)": "dcc_Arab",
|
| 365 |
+
"Dendi (Nigeria) (Latin)": "ddn_Latn",
|
| 366 |
+
"Dedua (Latin)": "ded_Latn",
|
| 367 |
+
"Dezfuli (Latin)": "deg_Latn",
|
| 368 |
+
"Desano (Latin)": "des_Latn",
|
| 369 |
+
"German (Latin)": "deu_Latn",
|
| 370 |
+
"Dagaari Dioula (Latin)": "dga_Latn",
|
| 371 |
+
"Dghwede (Latin)": "dgh_Latn",
|
| 372 |
+
"Dugwor (Latin)": "dgi_Latn",
|
| 373 |
+
"Dakka (Latin)": "dgk_Latn",
|
| 374 |
+
"Dogri (macrolanguage) (Devanagari)": "dgo_Deva",
|
| 375 |
+
"Dogrib (Latin)": "dgr_Latn",
|
| 376 |
+
"Didinga (Devanagari)": "dhi_Deva",
|
| 377 |
+
"Digo (Latin)": "did_Latn",
|
| 378 |
+
"Digo (Latin)": "dig_Latn",
|
| 379 |
+
"Dilling (Latin)": "dik_Latn",
|
| 380 |
+
"Dilling (Latin)": "dip_Latn",
|
| 381 |
+
"Dhivehi (Thaana)": "div_Thaa",
|
| 382 |
+
"Zarma (Latin)": "dje_Latn",
|
| 383 |
+
"Jukun of Takum (Latin)": "djk_Latn",
|
| 384 |
+
"Domaaki (Arabic)": "dmk_Arab",
|
| 385 |
+
"Domaaki (Arabic)": "dml_Arab",
|
| 386 |
+
"Dan (Latin)": "dnj_Latn",
|
| 387 |
+
"Dan (Latin)": "dnt_Latn",
|
| 388 |
+
"Dan (Latin)": "dnw_Latn",
|
| 389 |
+
"Dom (Latin)": "dop_Latn",
|
| 390 |
+
"Dogosé (Latin)": "dos_Latn",
|
| 391 |
+
"Duruwa (Latin)": "dru_Latn",
|
| 392 |
+
"Lower Sorbian (Latin)": "dsb_Latn",
|
| 393 |
+
"Daasanach (Latin)": "dsh_Latn",
|
| 394 |
+
"Dusner (Latin)": "dtp_Latn",
|
| 395 |
+
"Toro So Dogon (Latin)": "dts_Latn",
|
| 396 |
+
"Dotyali (Devanagari)": "dty_Deva",
|
| 397 |
+
"Duala (Latin)": "dua_Latn",
|
| 398 |
+
"Duna (Latin)": "dug_Latn",
|
| 399 |
+
"Dutton World Speedwords (Latin)": "dwr_Latn",
|
| 400 |
+
"Dyiri (Latin)": "dyi_Latn",
|
| 401 |
+
"Dyola-Fonyi (Latin)": "dyo_Latn",
|
| 402 |
+
"Dyula (Latin)": "dyu_Latn",
|
| 403 |
+
"Dazaga (Latin)": "dzg_Latn",
|
| 404 |
+
"Dzongkha (Tibetan)": "dzo_Tibt",
|
| 405 |
+
"Embu (Latin)": "ebu_Latn",
|
| 406 |
+
"Epie (Latin)": "ego_Latn",
|
| 407 |
+
"Eipomek (Latin)": "eip_Latn",
|
| 408 |
+
"Askopan (Latin)": "eiv_Latn",
|
| 409 |
+
"Eka (Latin)": "eka_Latn",
|
| 410 |
+
"Standard Estonian (Latin)": "ekk_Latn",
|
| 411 |
+
"Eki (Latin)": "eko_Latn",
|
| 412 |
+
"Yace (Latin)": "ekr_Latn",
|
| 413 |
+
"Modern Greek (1453-) (Greek)": "ell_Grek",
|
| 414 |
+
"Modern Greek (1453-) (Greek, cypr1249)": "ell_Grek_cypr1249",
|
| 415 |
+
"Eleme (Latin)": "elm_Latn",
|
| 416 |
+
"Eman (Latin)": "emp_Latn",
|
| 417 |
+
"Enlhet (Latin)": "enb_Latn",
|
| 418 |
+
"English (Latin)": "eng_Latn",
|
| 419 |
+
"Enxet (Latin)": "enx_Latn",
|
| 420 |
+
"Esperanto (Latin)": "epo_Latn",
|
| 421 |
+
"Ese Ejja (Latin)": "ese_Latn",
|
| 422 |
+
"Esselen (Latin)": "ess_Latn",
|
| 423 |
+
"Central Yupik (Latin)": "esu_Latn",
|
| 424 |
+
"Eton (Vanuatu) (Latin)": "eto_Latn",
|
| 425 |
+
"Eton (Cameroon) (Latin)": "ets_Latn",
|
| 426 |
+
"Eton (Cameroon) (Latin)": "etu_Latn",
|
| 427 |
+
"Basque (Latin)": "eus_Latn",
|
| 428 |
+
"Even (Cyrillic)": "evn_Cyrl",
|
| 429 |
+
"Ewe (Latin)": "ewe_Latn",
|
| 430 |
+
"Ewondo (Latin)": "ewo_Latn",
|
| 431 |
+
"Eyak (Latin)": "eyo_Latn",
|
| 432 |
+
"Ezaa (Latin)": "eza_Latn",
|
| 433 |
+
"Fali (Latin)": "fal_Latn",
|
| 434 |
+
"Fang (Equatorial Guinea) (Latin)": "fan_Latn",
|
| 435 |
+
"Faroese (Latin)": "fao_Latn",
|
| 436 |
+
"Fasu (Latin)": "far_Latn",
|
| 437 |
+
"Persian (Arabic)": "fas_Arab",
|
| 438 |
+
"Fanti (Latin)": "fat_Latn",
|
| 439 |
+
"Faita (Latin)": "fia_Latn",
|
| 440 |
+
"Fijian (Latin)": "fij_Latn",
|
| 441 |
+
"Filipino (Latin)": "fil_Latn",
|
| 442 |
+
"Finnish (Latin)": "fin_Latn",
|
| 443 |
+
"Fipa (Latin)": "fip_Latn",
|
| 444 |
+
"Knaanic (Latin)": "fkk_Latn",
|
| 445 |
+
"Foau (Latin)": "flr_Latn",
|
| 446 |
+
"Fe'fe' (Latin)": "fmp_Latn",
|
| 447 |
+
"Far Western Muria (Devanagari)": "fmu_Deva",
|
| 448 |
+
"Fon (Latin)": "fon_Latn",
|
| 449 |
+
"French (Latin)": "fra_Latn",
|
| 450 |
+
"Fordata (Latin)": "frd_Latn",
|
| 451 |
+
"Western Frisian (Latin)": "fry_Latn",
|
| 452 |
+
"Fulah (Latin)": "fub_Latn",
|
| 453 |
+
"Pulaar (Latin)": "fuc_Latn",
|
| 454 |
+
"East Futuna (Latin)": "fue_Latn",
|
| 455 |
+
"Fulah (Latin)": "ful_Latn",
|
| 456 |
+
"Pulaar (Latin)": "fuq_Latn",
|
| 457 |
+
"Nigerian Fulfulde (Latin)": "fuv_Latn",
|
| 458 |
+
"Gagauz (Cyrillic)": "gag_Cyrl",
|
| 459 |
+
"Gagauz (Latin)": "gag_Latn",
|
| 460 |
+
"Gaina (Latin)": "gai_Latn",
|
| 461 |
+
"Gamkonora (Latin)": "gam_Latn",
|
| 462 |
+
"Kandawo (Telugu)": "gau_Telu",
|
| 463 |
+
"Gabri (Latin)": "gbi_Latn",
|
| 464 |
+
"Kaytetye (Devanagari)": "gbk_Deva",
|
| 465 |
+
"Garhwali (Devanagari)": "gbm_Deva",
|
| 466 |
+
"Gbari (Latin)": "gbo_Latn",
|
| 467 |
+
"Gbagyi (Latin)": "gbr_Latn",
|
| 468 |
+
"Gbagyi (Latin)": "gby_Latn",
|
| 469 |
+
"Alekano (Latin)": "gcc_Latn",
|
| 470 |
+
"Gade (Latin)": "gde_Latn",
|
| 471 |
+
"Guduf-Gava (Latin)": "gdf_Latn",
|
| 472 |
+
"Gengle (Latin)": "geb_Latn",
|
| 473 |
+
"Gebe (Latin)": "gej_Latn",
|
| 474 |
+
"Geser-Gorom (Latin)": "ges_Latn",
|
| 475 |
+
"Guria (Arabic)": "ggg_Arab",
|
| 476 |
+
"Gidar (Latin)": "gid_Latn",
|
| 477 |
+
"Gbazari (Arabic)": "gig_Arab",
|
| 478 |
+
"Gilbertese (Latin)": "gil_Latn",
|
| 479 |
+
"Gimi (Papua New Guinea) (Latin)": "giz_Latn",
|
| 480 |
+
"Kachi Koli (Arabic)": "gjk_Arab",
|
| 481 |
+
"Gunditjmara (Latin)": "gjn_Latn",
|
| 482 |
+
"Gujari (Arabic)": "gju_Arab",
|
| 483 |
+
"Gokana (Latin)": "gkn_Latn",
|
| 484 |
+
"Nanai (Cyrillic)": "gld_Cyrl",
|
| 485 |
+
"Irish (Latin)": "gle_Latn",
|
| 486 |
+
"Galician (Latin)": "glg_Latn",
|
| 487 |
+
"Gilaki (Arabic)": "glk_Arab",
|
| 488 |
+
"Manx (Latin)": "glv_Latn",
|
| 489 |
+
"Gula (Chad) (Latin)": "glw_Latn",
|
| 490 |
+
"Gamo (Latin)": "gmv_Latn",
|
| 491 |
+
"Gana (Latin)": "gna_Latn",
|
| 492 |
+
"Gondi (Latin)": "gnd_Latn",
|
| 493 |
+
"Ngangam (Latin)": "gng_Latn",
|
| 494 |
+
"Gofa (Latin)": "gof_Latn",
|
| 495 |
+
"Gogo (Latin)": "gog_Latn",
|
| 496 |
+
"Gola (Latin)": "gol_Latn",
|
| 497 |
+
"Goan Konkani (Devanagari)": "gom_Deva",
|
| 498 |
+
"Gorontalo (Latin)": "gor_Latn",
|
| 499 |
+
"Gor (Latin)": "gqr_Latn",
|
| 500 |
+
"Ancient Greek (to 1453) (Greek)": "grc_Grek",
|
| 501 |
+
"Gbiri-Niragu (Latin)": "gri_Latn",
|
| 502 |
+
"Guarani (Latin)": "grn_Latn",
|
| 503 |
+
"Garo (Bengali)": "grt_Beng",
|
| 504 |
+
"Guriaso (Latin)": "gsl_Latn",
|
| 505 |
+
"German Sign Language (Latin)": "gso_Latn",
|
| 506 |
+
"Guajajára (Latin)": "gub_Latn",
|
| 507 |
+
"Wayuu (Latin)": "guc_Latn",
|
| 508 |
+
"Yocoboué Dida (Latin)": "gud_Latn",
|
| 509 |
+
"Paraguayan Guaraní (Latin)": "gug_Latn",
|
| 510 |
+
"Guahibo (Latin)": "guh_Latn",
|
| 511 |
+
"Eastern Bolivian Guaraní (Latin)": "gui_Latn",
|
| 512 |
+
"Gujarati (Gujarati)": "guj_Gujr",
|
| 513 |
+
"Gumuz (Ethiopic)": "guk_Ethi",
|
| 514 |
+
"Gumuz (Latin)": "gum_Latn",
|
| 515 |
+
"Guro (Latin)": "guo_Latn",
|
| 516 |
+
"Guinau dan (Latin)": "guq_Latn",
|
| 517 |
+
"Farefare (Latin)": "gur_Latn",
|
| 518 |
+
"Farefare (Latin)": "guu_Latn",
|
| 519 |
+
"Gusilay (Latin)": "gux_Latn",
|
| 520 |
+
"Gusii (Latin)": "guz_Latn",
|
| 521 |
+
"Guanano (Latin)": "gvc_Latn",
|
| 522 |
+
"Gwanja (Latin)": "gvl_Latn",
|
| 523 |
+
"Kalami (Arabic)": "gwc_Arab",
|
| 524 |
+
"Gweno (Latin)": "gwe_Latn",
|
| 525 |
+
"Gwichʼin (Latin)": "gwi_Latn",
|
| 526 |
+
"Gwere (Latin)": "gwr_Latn",
|
| 527 |
+
"Gwere (Arabic)": "gwt_Arab",
|
| 528 |
+
"Guaymí (Latin)": "gym_Latn",
|
| 529 |
+
"Gyem (Latin)": "gyr_Latn",
|
| 530 |
+
"Geji (Latin)": "gyz_Latn",
|
| 531 |
+
"Hadiyya (Latin)": "had_Latn",
|
| 532 |
+
"Hanga (Latin)": "hag_Latn",
|
| 533 |
+
"Hahon (Latin)": "hah_Latn",
|
| 534 |
+
"Hakka Chinese (Latin)": "hak_Latn",
|
| 535 |
+
"Ha(Latin)": "hao_Latn",
|
| 536 |
+
"Hdi (Latin)": "hap_Latn",
|
| 537 |
+
"Haitian (Latin)": "hat_Latn",
|
| 538 |
+
"Hausa (Latin)": "hau_Latn",
|
| 539 |
+
"Hawaiian (Latin)": "haw_Latn",
|
| 540 |
+
"Haya (Latin)": "hay_Latn",
|
| 541 |
+
"Huba (Latin)": "hbb_Latn",
|
| 542 |
+
"Huichol (Latin)": "hch_Latn",
|
| 543 |
+
"Hebrew (Hebrew)": "heb_Hebr",
|
| 544 |
+
"Hehe (Latin)": "heh_Latn",
|
| 545 |
+
"Herero (Latin)": "her_Latn",
|
| 546 |
+
"Hiaitsiihi (Latin)": "hia_Latn",
|
| 547 |
+
"Fiji Hindi (Latin)": "hif_Latn",
|
| 548 |
+
"Higgi (Latin)": "hig_Latn",
|
| 549 |
+
"Hiligaynon (Latin)": "hil_Latn",
|
| 550 |
+
"Hindi (Devanagari)": "hin_Deva",
|
| 551 |
+
"Hkongso Chin (Latin)": "hkk_Latn",
|
| 552 |
+
"Halang (Latin)": "hla_Latn",
|
| 553 |
+
"Halia (Devanagari)": "hlb_Deva",
|
| 554 |
+
"Matu Chin (Latin)": "hlt_Latn",
|
| 555 |
+
"Chhattisgarhi (Devanagari)": "hne_Deva",
|
| 556 |
+
"Hän (Latin)": "hnn_Latn",
|
| 557 |
+
"Northern Hindko (Arabic)": "hno_Arab",
|
| 558 |
+
"Hunsrik (Latin)": "hns_Latn",
|
| 559 |
+
"Ho (Oriya)": "hoc_Orya",
|
| 560 |
+
"Croatian (Latin)": "hrv_Latn",
|
| 561 |
+
"Upper Sorbian (Latin)": "hsb_Latn",
|
| 562 |
+
"Hoti (Latin)": "hto_Latn",
|
| 563 |
+
"Huba (Latin)": "hub_Latn",
|
| 564 |
+
"Huave (Latin)": "hue_Latn",
|
| 565 |
+
"San Francisco Del Mar Huave (Latin)": "hui_Latn",
|
| 566 |
+
"Hula (Latin)": "hul_Latn",
|
| 567 |
+
"Hungarian (Latin)": "hun_Latn",
|
| 568 |
+
"Huastec (Latin)": "hus_Latn",
|
| 569 |
+
"Humla (Latin)": "huu_Latn",
|
| 570 |
+
"San Mateo Del Mar Huave (Latin)": "huv_Latn",
|
| 571 |
+
"Hulaulá (Latin)": "hux_Latn",
|
| 572 |
+
"Havanese (Latin)": "hvn_Latn",
|
| 573 |
+
"Hwana (Latin)": "hwo_Latn",
|
| 574 |
+
"Armenian (Armenian)": "hye_Armn",
|
| 575 |
+
"Western Armenian (Armenian)": "hyw_Armn",
|
| 576 |
+
"Iban (Latin)": "iba_Latn",
|
| 577 |
+
"Ibibio (Latin)": "ibb_Latn",
|
| 578 |
+
"Igbo (Latin)": "ibo_Latn",
|
| 579 |
+
"Etkywan (Latin)": "icr_Latn",
|
| 580 |
+
"Ido (Latin)": "ida_Latn",
|
| 581 |
+
"Idon (Latin)": "idd_Latn",
|
| 582 |
+
"Idoma (Latin)": "idu_Latn",
|
| 583 |
+
"Ifugao (Latin)": "ifa_Latn",
|
| 584 |
+
"Amganad Ifugao (Latin)": "ifb_Latn",
|
| 585 |
+
"Ifo (Latin)": "ife_Latn",
|
| 586 |
+
"Tuwali Ifugao (Latin)": "ifk_Latn",
|
| 587 |
+
"Mayoyao Ifugao (Latin)": "ifu_Latn",
|
| 588 |
+
"Keley-I Kallahan (Latin)": "ify_Latn",
|
| 589 |
+
"Igede (Latin)": "igl_Latn",
|
| 590 |
+
"Igala (Latin)": "ign_Latn",
|
| 591 |
+
"Ijaw (Latin)": "ijc_Latn",
|
| 592 |
+
"Biseni (Latin)": "ijn_Latn",
|
| 593 |
+
"Ika (Latin)": "ikk_Latn",
|
| 594 |
+
"Ikwere (Latin)": "ikw_Latn",
|
| 595 |
+
"Ila (Latin)": "ilb_Latn",
|
| 596 |
+
"Ilocano (Latin)": "ilo_Latn",
|
| 597 |
+
"Imbongu (Latin)": "imo_Latn",
|
| 598 |
+
"Interlingua (International Auxiliary Language Association) (Latin)": "ina_Latn",
|
| 599 |
+
"Inga (Latin)": "inb_Latn",
|
| 600 |
+
"Indonesian (Latin)": "ind_Latn",
|
| 601 |
+
"Iu Mien (Latin)": "iou_Latn",
|
| 602 |
+
"Ipili (Latin)": "ipi_Latn",
|
| 603 |
+
"Inupiaq (Latin)": "ipk_Latn",
|
| 604 |
+
"Iquito (Latin)": "iqw_Latn",
|
| 605 |
+
"Iresim (Latin)": "iri_Latn",
|
| 606 |
+
"Irarutu (Latin)": "irk_Latn",
|
| 607 |
+
"Isekiri (Latin)": "ish_Latn",
|
| 608 |
+
"Icelandic (Latin)": "isl_Latn",
|
| 609 |
+
"Isoko (Latin)": "iso_Latn",
|
| 610 |
+
"Italian (Latin)": "ita_Latn",
|
| 611 |
+
"Itelmen (Cyrillic)": "itl_Cyrl",
|
| 612 |
+
"Isekiri (Latin)": "its_Latn",
|
| 613 |
+
"Isekiri (Latin)": "itv_Latn",
|
| 614 |
+
"Ito (Latin)": "itw_Latn",
|
| 615 |
+
"Itzá (Latin)": "itz_Latn",
|
| 616 |
+
"Ixil (Latin)": "ixl_Latn",
|
| 617 |
+
"Izere (Latin)": "izr_Latn",
|
| 618 |
+
"Izii (Latin)": "izz_Latn",
|
| 619 |
+
"Jakaltek (Latin)": "jac_Latn",
|
| 620 |
+
"Yalahatan (Latin)": "jal_Latn",
|
| 621 |
+
"Jamaican Creole English (Latin)": "jam_Latn",
|
| 622 |
+
"Javanese (Latin)": "jav_Latn",
|
| 623 |
+
"Jambi Malay (Latin)": "jax_Latn",
|
| 624 |
+
"Jibu (Latin)": "jbu_Latn",
|
| 625 |
+
"Jerung (Latin)": "jen_Latn",
|
| 626 |
+
"Jicaque (Latin)": "jic_Latn",
|
| 627 |
+
"Jivaro (Latin)": "jiv_Latn",
|
| 628 |
+
"Machame (Latin)": "jmc_Latn",
|
| 629 |
+
"Zumbun (Latin)": "jmd_Latn",
|
| 630 |
+
"Jimi (Nigeria) (Latin)": "jmx_Latn",
|
| 631 |
+
"Japanese (Japanese)": "jpn_Jpan",
|
| 632 |
+
"Jaqaru (Latin)": "jqr_Latn",
|
| 633 |
+
"Jowulu (Latin)": "juk_Latn",
|
| 634 |
+
"Ju'hoan (Oriya)": "jun_Orya",
|
| 635 |
+
"Juang (Latin)": "juo_Latn",
|
| 636 |
+
"Wapan (Latin)": "jvn_Latn",
|
| 637 |
+
"Kara-Kalpak (Cyrillic)": "kaa_Cyrl",
|
| 638 |
+
"Kabyle (Latin)": "kab_Latn",
|
| 639 |
+
"Kachin (Latin)": "kac_Latn",
|
| 640 |
+
"Gayo (Latin)": "kai_Latn",
|
| 641 |
+
"Jju (Latin)": "kaj_Latn",
|
| 642 |
+
"Jju (Latin)": "kak_Latn",
|
| 643 |
+
"Kamba (Kenya) (Latin)": "kam_Latn",
|
| 644 |
+
"Kannada (Kannada)": "kan_Knda",
|
| 645 |
+
"Kanu (Latin)": "kao_Latn",
|
| 646 |
+
"Bezhta (Latin)": "kaq_Latn",
|
| 647 |
+
"Kashmiri (Arabic)": "kas_Arab",
|
| 648 |
+
"Georgian (Georgian)": "kat_Geor",
|
| 649 |
+
"Kadazan Dusun (Latin)": "kay_Latn",
|
| 650 |
+
"Kazakh (Cyrillic)": "kaz_Cyrl",
|
| 651 |
+
"Kabardian (Cyrillic)": "kbd_Cyrl",
|
| 652 |
+
"Kayan (Latin)": "kbl_Latn",
|
| 653 |
+
"Kande (Latin)": "kbo_Latn",
|
| 654 |
+
"Kabiye (Latin)": "kbp_Latn",
|
| 655 |
+
"Kabiye (Latin)": "kbq_Latn",
|
| 656 |
+
"Kafa (Latin)": "kbr_Latn",
|
| 657 |
+
"Kamo (Latin)": "kbt_Latn",
|
| 658 |
+
"Kikuyu (Latin)": "kby_Latn",
|
| 659 |
+
"Ket (Cyrillic)": "kca_Cyrl",
|
| 660 |
+
"Tyap (Latin)": "kcg_Latn",
|
| 661 |
+
"Kono (Nigeria) (Latin)": "kcn_Latn",
|
| 662 |
+
"Kutu (Latin)": "kcq_Latn",
|
| 663 |
+
"Kutu (Latin)": "kdc_Latn",
|
| 664 |
+
"Makonde (Latin)": "kde_Latn",
|
| 665 |
+
"Tem (Latin)": "kdh_Latn",
|
| 666 |
+
"Kumam (Latin)": "kdi_Latn",
|
| 667 |
+
"Kumam (Latin)": "kdj_Latn",
|
| 668 |
+
"Tsikimba (Latin)": "kdl_Latn",
|
| 669 |
+
"Kagulu (Latin)": "kdn_Latn",
|
| 670 |
+
"Kuy (Khmer)": "kdt_Khmr",
|
| 671 |
+
"Kepo' (Latin)": "kea_Latn",
|
| 672 |
+
"Kekchi (Latin)": "kek_Latn",
|
| 673 |
+
"Kenyang (Latin)": "ken_Latn",
|
| 674 |
+
"Kenyah (Latin)": "keo_Latn",
|
| 675 |
+
"Kera (Latin)": "ker_Latn",
|
| 676 |
+
"Kugbo (Latin)": "keu_Latn",
|
| 677 |
+
"Komi-Permyak (Telugu)": "key_Telu",
|
| 678 |
+
"Kukele (Latin)": "kez_Latn",
|
| 679 |
+
"Kobiana (Devanagari)": "kfb_Deva",
|
| 680 |
+
"Northwestern Kolami (Telugu)": "kff_Telu",
|
| 681 |
+
"Kuk (Devanagari)": "kfk_Deva",
|
| 682 |
+
"Kotaba (Devanagari)": "kfq_Deva",
|
| 683 |
+
"Koya (Gujarati)": "kfr_Gujr",
|
| 684 |
+
"Koro (India) (Latin)": "kfw_Latn",
|
| 685 |
+
"Kaili (Devanagari)": "kfx_Deva",
|
| 686 |
+
"Khasi (Latin)": "kha_Latn",
|
| 687 |
+
"Kham (Tibetan)": "khg_Tibt",
|
| 688 |
+
"Khalkha Mongolian (Cyrillic)": "khk_Cyrl",
|
| 689 |
+
"Khmer (Khmer)": "khm_Khmr",
|
| 690 |
+
"Koyra Chiini Songhay (Latin)": "khq_Latn",
|
| 691 |
+
"Khowar (Arabic)": "khw_Arab",
|
| 692 |
+
"Kim (Latin)": "kia_Latn",
|
| 693 |
+
"Koalib (Latin)": "kij_Latn",
|
| 694 |
+
"Kikuyu (Latin)": "kik_Latn",
|
| 695 |
+
"Kinyarwanda (Latin)": "kin_Latn",
|
| 696 |
+
"Kirghiz (Cyrillic)": "kir_Cyrl",
|
| 697 |
+
"Kitharaka (Latin)": "kix_Latn",
|
| 698 |
+
"Mlap (Latin)": "kjb_Latn",
|
| 699 |
+
"Coastal Konjo (Latin)": "kjc_Latn",
|
| 700 |
+
"Kisar (Latin)": "kje_Latn",
|
| 701 |
+
"Khmu (Latin)": "kjg_Latn",
|
| 702 |
+
"Khakas (Cyrillic)": "kjh_Cyrl",
|
| 703 |
+
"Khakas (Latin)": "kjk_Latn",
|
| 704 |
+
"Kagulu (Latin)": "kki_Latn",
|
| 705 |
+
"Kikuyu (Latin)": "kkj_Latn",
|
| 706 |
+
"Kalanguya (Devanagari)": "kle_Deva",
|
| 707 |
+
"Kalenjin (Latin)": "kln_Latn",
|
| 708 |
+
"Kulisusu (Latin)": "kls_Latn",
|
| 709 |
+
"Klao (Latin)": "klu_Latn",
|
| 710 |
+
"Maskelynes (Latin)": "klv_Latn",
|
| 711 |
+
"Tado (Latin)": "klw_Latn",
|
| 712 |
+
"Kama (Latin)": "kma_Latn",
|
| 713 |
+
"Kimbundu (Latin)": "kmd_Latn",
|
| 714 |
+
"Tanudan Kalinga (Latin)": "kml_Latn",
|
| 715 |
+
"Northern Kurdish (Arabic)": "kmr_Arab",
|
| 716 |
+
"Northern Kurdish (Cyrillic)": "kmr_Cyrl",
|
| 717 |
+
"Northern Kurdish (Latin)": "kmr_Latn",
|
| 718 |
+
"Kanite (Latin)": "kmu_Latn",
|
| 719 |
+
"Koma (Latin)": "kmy_Latn",
|
| 720 |
+
"Kanda (Latin)": "kna_Latn",
|
| 721 |
+
"Lubuagan Kalinga (Latin)": "knb_Latn",
|
| 722 |
+
"Central Kanuri (Latin)": "knc_Latn",
|
| 723 |
+
"Kankanaey (Latin)": "kne_Latn",
|
| 724 |
+
"Kutu (Latin)": "knf_Latn",
|
| 725 |
+
"Konda (Latin)": "knj_Latn",
|
| 726 |
+
"Kuranko (Latin)": "knk_Latn",
|
| 727 |
+
"Konkani (macrolanguage) (Devanagari)": "knn_Deva",
|
| 728 |
+
"Kono (Sierra Leone) (Latin)": "kno_Latn",
|
| 729 |
+
"Kongo (Latin)": "kog_Latn",
|
| 730 |
+
"Kol (Papua New Guinea) (Latin)": "kol_Latn",
|
| 731 |
+
"Konzo (Latin)": "koo_Latn",
|
| 732 |
+
"Korean (Hangul)": "kor_Hang",
|
| 733 |
+
"Kodia (Latin)": "kpo_Latn",
|
| 734 |
+
"Korupun-Sela (Latin)": "kpq_Latn",
|
| 735 |
+
"Kofei (Latin)": "kps_Latn",
|
| 736 |
+
"Komi-Zyrian (Cyrillic)": "kpv_Cyrl",
|
| 737 |
+
"Komi-Permyak (Cyrillic)": "kpy_Cyrl",
|
| 738 |
+
"Kofyar (Latin)": "kpz_Latn",
|
| 739 |
+
"Korafe-Yegha (Latin)": "kqe_Latn",
|
| 740 |
+
"Korafe-Yegha (Latin)": "kqo_Latn",
|
| 741 |
+
"Kimré (Latin)": "kqp_Latn",
|
| 742 |
+
"Kimaragang (Latin)": "kqr_Latn",
|
| 743 |
+
"Koyra Chiini Songhay (Ethiopic)": "kqy_Ethi",
|
| 744 |
+
"Karachay-Balkar (Cyrillic)": "krc_Cyrl",
|
| 745 |
+
"Krio (Latin)": "kri_Latn",
|
| 746 |
+
"Kinaray-A (Latin)": "krj_Latn",
|
| 747 |
+
"Karelian (Latin)": "krl_Latn",
|
| 748 |
+
"Sapo (Khmer)": "krr_Khmr",
|
| 749 |
+
"Gbaya (Sudan) (Latin)": "krs_Latn",
|
| 750 |
+
"Kurukh (Devanagari)": "kru_Deva",
|
| 751 |
+
"Tewa (Indonesia) (Latin)": "krx_Latn",
|
| 752 |
+
"Shambala (Latin)": "ksb_Latn",
|
| 753 |
+
"Kuanua (Latin)": "ksd_Latn",
|
| 754 |
+
"Bafia (Latin)": "ksf_Latn",
|
| 755 |
+
"Krisa (Latin)": "ksr_Latn",
|
| 756 |
+
"Kusasi (Latin)": "kss_Latn",
|
| 757 |
+
"Kham (Devanagari)": "ksz_Deva",
|
| 758 |
+
"Kambaata (Ethiopic)": "ktb_Ethi",
|
| 759 |
+
"Krumen (Latin)": "ktj_Latn",
|
| 760 |
+
"Kto (Latin)": "kto_Latn",
|
| 761 |
+
"Kuanyama (Latin)": "kua_Latn",
|
| 762 |
+
"Kutep (Latin)": "kub_Latn",
|
| 763 |
+
"Kuman (Papua New Guinea) (Latin)": "kue_Latn",
|
| 764 |
+
"Kushi (Latin)": "kuh_Latn",
|
| 765 |
+
"Kumyk (Cyrillic)": "kum_Cyrl",
|
| 766 |
+
"Kurdish (Arabic)": "kur_Arab",
|
| 767 |
+
"Kusaal (Latin)": "kus_Latn",
|
| 768 |
+
"Kutino (Latin)": "kvn_Latn",
|
| 769 |
+
"Kove (Latin)": "kvw_Latn",
|
| 770 |
+
"Komi (Arabic)": "kvx_Arab",
|
| 771 |
+
"Kutu (Latin)": "kwd_Latn",
|
| 772 |
+
"Kwara'ae (Latin)": "kwf_Latn",
|
| 773 |
+
"Awa-Cuaiquer (Latin)": "kwi_Latn",
|
| 774 |
+
"Kwak'wala (Latin)": "kwm_Latn",
|
| 775 |
+
"Kodia (Ethiopic)": "kxc_Ethi",
|
| 776 |
+
"Maninkakan, Kita (Latin)": "kxf_Latn",
|
| 777 |
+
"Kuanhua (Thai)": "kxm_Thai",
|
| 778 |
+
"Wadiyara Koli (Arabic)": "kxp_Arab",
|
| 779 |
+
"Kwaya (Latin)": "kyb_Latn",
|
| 780 |
+
"Kyaka (Latin)": "kyc_Latn",
|
| 781 |
+
"Karey (Latin)": "kyf_Latn",
|
| 782 |
+
"Keyagana (Latin)": "kyg_Latn",
|
| 783 |
+
"Kouya (Latin)": "kyo_Latn",
|
| 784 |
+
"Kwaya (Latin)": "kyq_Latn",
|
| 785 |
+
"Kayagar (Kayah Li)": "kyu_Kali",
|
| 786 |
+
"Kambaira (Latin)": "kyx_Latn",
|
| 787 |
+
"Kerewo (Latin)": "kyz_Latn",
|
| 788 |
+
"Kairiru (Latin)": "kzf_Latn",
|
| 789 |
+
"Kelabit (Latin)": "kzi_Latn",
|
| 790 |
+
"Lacandon (Latin)": "lac_Latn",
|
| 791 |
+
"Langi (Latin)": "lag_Latn",
|
| 792 |
+
"Lango (Uganda) (Latin)": "laj_Latn",
|
| 793 |
+
"Lamba (Latin)": "lam_Latn",
|
| 794 |
+
"Lao (Lao)": "lao_Laoo",
|
| 795 |
+
"Lama (Togo) (Latin)": "las_Latn",
|
| 796 |
+
"Latin (Latin)": "lat_Latn",
|
| 797 |
+
"Latvian (Latin)": "lav_Latn",
|
| 798 |
+
"Lavu (Latin)": "law_Latn",
|
| 799 |
+
"Lama (Myanmar) (Tibetan)": "lbj_Tibt",
|
| 800 |
+
"Lachi (Latin)": "lbw_Latn",
|
| 801 |
+
"Luchazi (Latin)": "lcm_Latn",
|
| 802 |
+
"Lola (Thai)": "lcp_Thai",
|
| 803 |
+
"Lidzonka (Latin)": "ldb_Latn",
|
| 804 |
+
"Leko (Latin)": "led_Latn",
|
| 805 |
+
"Lyélé (Latin)": "lee_Latn",
|
| 806 |
+
"Lefa (Latin)": "lef_Latn",
|
| 807 |
+
"Lembena (Latin)": "lem_Latn",
|
| 808 |
+
"Lense (Latin)": "lew_Latn",
|
| 809 |
+
"Lemio (Latin)": "lex_Latn",
|
| 810 |
+
"Lega-Shabunda (Latin)": "lgg_Latn",
|
| 811 |
+
"Laghu (Latin)": "lgl_Latn",
|
| 812 |
+
"Lahu (Latin)": "lhu_Latn",
|
| 813 |
+
"Lianshan Zhuang (Latin)": "lia_Latn",
|
| 814 |
+
"Likum (Latin)": "lid_Latn",
|
| 815 |
+
"Limbu (Devanagari)": "lif_Deva",
|
| 816 |
+
"Ligurian (Latin)": "lij_Latn",
|
| 817 |
+
"Lingala (Latin)": "lin_Latn",
|
| 818 |
+
"Liki (Latin)": "lip_Latn",
|
| 819 |
+
"Libinza (Latin)": "lir_Latn",
|
| 820 |
+
"Lisu (Lisu)": "lis_Lisu",
|
| 821 |
+
"Lithuanian (Latin)": "lit_Latn",
|
| 822 |
+
"Rampi (Latin)": "lje_Latn",
|
| 823 |
+
"Lampung Api (Latin)": "ljp_Latn",
|
| 824 |
+
"Lukabaras (Latin)": "lkb_Latn",
|
| 825 |
+
"Lakata (Latin)": "lke_Latn",
|
| 826 |
+
"Lilau (Latin)": "lla_Latn",
|
| 827 |
+
"Ladin (Latin, gherd)": "lld_Latn_gherd",
|
| 828 |
+
"Ladin (Latin, valbadia)": "lld_Latn_valbadia",
|
| 829 |
+
"Láá Láá Bwamu (Latin)": "llg_Latn",
|
| 830 |
+
"Lele (Guinea) (Latin)": "lln_Latn",
|
| 831 |
+
"Loma (Liberia) (Latin)": "lme_Latn",
|
| 832 |
+
"Lundayeh (Latin)": "lnd_Latn",
|
| 833 |
+
"Lango (South Sudan) (Latin)": "lns_Latn",
|
| 834 |
+
"Lundayeh (Latin)": "lnu_Latn",
|
| 835 |
+
"Loloda (Latin)": "loa_Latn",
|
| 836 |
+
"Lobi (Latin)": "lob_Latn",
|
| 837 |
+
"Loko (Latin)": "lok_Latn",
|
| 838 |
+
"Loma (Liberia) (Latin)": "lom_Latn",
|
| 839 |
+
"Loma (Liberia) (Latin)": "lon_Latn",
|
| 840 |
+
"Lobala (Latin)": "loq_Latn",
|
| 841 |
+
"Luri (Arabic)": "lrk_Arab",
|
| 842 |
+
"Lish (Latin)": "lsi_Latn",
|
| 843 |
+
"Sa'ban (Latin)": "lsm_Latn",
|
| 844 |
+
"Sa'ban (Arabic)": "lss_Arab",
|
| 845 |
+
"Latgalian (Latin)": "ltg_Latn",
|
| 846 |
+
"Lethu (Latin)": "lth_Latn",
|
| 847 |
+
"Lutachoni (Latin)": "lto_Latn",
|
| 848 |
+
"Luxembourgish (Latin)": "ltz_Latn",
|
| 849 |
+
"Luba-Lulua (Latin)": "lua_Latn",
|
| 850 |
+
"Aringa (Latin)": "luc_Latn",
|
| 851 |
+
"Ganda (Latin)": "lug_Latn",
|
| 852 |
+
"Luo (Kenya and Tanzania) (Latin)": "luo_Latn",
|
| 853 |
+
"Lushai (Latin)": "lus_Latn",
|
| 854 |
+
"Luwanga (Latin)": "lwg_Latn",
|
| 855 |
+
"Lwo (Latin)": "lwo_Latn",
|
| 856 |
+
"Lewo Eleng (Latin)": "lww_Latn",
|
| 857 |
+
"Laz (Latin)": "lzz_Latn",
|
| 858 |
+
"Maasai (Latin)": "maa_Latn",
|
| 859 |
+
"Yutanduchi Mixtec (Latin)": "mab_Latn",
|
| 860 |
+
"Madurese (Latin)": "mad_Latn",
|
| 861 |
+
"Mafa (Latin)": "maf_Latn",
|
| 862 |
+
"Magahi (Devanagari)": "mag_Deva",
|
| 863 |
+
"Marshallese (Latin)": "mah_Latn",
|
| 864 |
+
"Maithili (Devanagari)": "mai_Deva",
|
| 865 |
+
"Majhwar (Latin)": "maj_Latn",
|
| 866 |
+
"Makasar (Latin)": "mak_Latn",
|
| 867 |
+
"Malayalam (Malayalam)": "mal_Mlym",
|
| 868 |
+
"Mam (Latin)": "mam_Latn",
|
| 869 |
+
"Mamaindé (Latin)": "maq_Latn",
|
| 870 |
+
"Marathi (Devanagari)": "mar_Deva",
|
| 871 |
+
"Mazatec (Latin)": "mau_Latn",
|
| 872 |
+
"Sateré-Mawé (Latin)": "maw_Latn",
|
| 873 |
+
"North Moluccan Malay (Latin)": "max_Latn",
|
| 874 |
+
"Central Mazahua (Latin)": "maz_Latn",
|
| 875 |
+
"Western Bukidnon Manobo (Latin)": "mbb_Latn",
|
| 876 |
+
"Macushi (Latin)": "mbc_Latn",
|
| 877 |
+
"Duna (Latin)": "mbh_Latn",
|
| 878 |
+
"Ilianen Manobo (Latin)": "mbj_Latn",
|
| 879 |
+
"Matigsalug Manobo (Latin)": "mbt_Latn",
|
| 880 |
+
"Mbo (Cameroon) (Latin)": "mbu_Latn",
|
| 881 |
+
"Macuna (Latin)": "mca_Latn",
|
| 882 |
+
"Machiguenga (Latin)": "mcb_Latn",
|
| 883 |
+
"Bitur (Latin)": "mcd_Latn",
|
| 884 |
+
"Matsés (Latin)": "mcf_Latn",
|
| 885 |
+
"Mixe (Latin)": "mco_Latn",
|
| 886 |
+
"Ese (Latin)": "mcp_Latn",
|
| 887 |
+
"M seri (Latin)": "mcq_Latn",
|
| 888 |
+
"Mambai (Latin)": "mcu_Latn",
|
| 889 |
+
"Mpiemo (Latin)": "mcx_Latn",
|
| 890 |
+
"Mada (Nigeria) (Latin)": "mda_Latn",
|
| 891 |
+
"Morigi (Latin)": "mdd_Latn",
|
| 892 |
+
"Mbosi (Latin)": "mdv_Latn",
|
| 893 |
+
"Male (Ethiopia) (Ethiopic)": "mdy_Ethi",
|
| 894 |
+
"Medumba (Latin)": "med_Latn",
|
| 895 |
+
"Melpa (Latin)": "mee_Latn",
|
| 896 |
+
"Southwestern Tlaxiaco Mixtec (Latin)": "meh_Latn",
|
| 897 |
+
"Midob (Latin)": "mej_Latn",
|
| 898 |
+
"Mekeo (Latin)": "mek_Latn",
|
| 899 |
+
"Central Melanau (Latin)": "mel_Latn",
|
| 900 |
+
"Mende (Liberia) (Latin)": "men_Latn",
|
| 901 |
+
"Merey (Latin)": "meq_Latn",
|
| 902 |
+
"Meru (Latin)": "mer_Latn",
|
| 903 |
+
"Mato (Latin)": "met_Latn",
|
| 904 |
+
"Motu (Latin)": "meu_Latn",
|
| 905 |
+
"Mano (Latin)": "mev_Latn",
|
| 906 |
+
"Morisyen (Latin)": "mfe_Latn",
|
| 907 |
+
"Mefele (Latin)": "mfh_Latn",
|
| 908 |
+
"Mefele (Latin)": "mfi_Latn",
|
| 909 |
+
"Mogofin (Latin)": "mfk_Latn",
|
| 910 |
+
"Cross River Mbembe (Latin)": "mfm_Latn",
|
| 911 |
+
"Mefele (Latin)": "mfn_Latn",
|
| 912 |
+
"Mbe (Latin)": "mfo_Latn",
|
| 913 |
+
"Marghi South (Latin)": "mfq_Latn",
|
| 914 |
+
"Marghi (Latin)": "mfv_Latn",
|
| 915 |
+
"Pahi (Latin)": "mfy_Latn",
|
| 916 |
+
"Melo (Latin)": "mfz_Latn",
|
| 917 |
+
"Maguindanaon (Latin)": "mgd_Latn",
|
| 918 |
+
"Mpade (Latin)": "mge_Latn",
|
| 919 |
+
"Monguor (Latin)": "mgg_Latn",
|
| 920 |
+
"Makhuwa-Meetto (Latin)": "mgh_Latn",
|
| 921 |
+
"Laua (Latin)": "mgi_Latn",
|
| 922 |
+
"Meta' (Latin)": "mgo_Latn",
|
| 923 |
+
"Ma'di (Latin)": "mhi_Latn",
|
| 924 |
+
"Mouk-Aria (Latin)": "mhk_Latn",
|
| 925 |
+
"Mari (Russia) (Cyrillic)": "mhr_Cyrl",
|
| 926 |
+
"Mundat (Latin)": "mhu_Latn",
|
| 927 |
+
"Maru (Latin)": "mhx_Latn",
|
| 928 |
+
"Ma'di (Latin)": "mhy_Latn",
|
| 929 |
+
"Atatláhuca Mixtec (Latin)": "mib_Latn",
|
| 930 |
+
"Mi'kmaq (Latin)": "mie_Latn",
|
| 931 |
+
"Mofu-Gudur (Latin)": "mif_Latn",
|
| 932 |
+
"San Miguel El Grande Mixtec (Latin)": "mig_Latn",
|
| 933 |
+
"Chayuco Mixtec (Latin)": "mih_Latn",
|
| 934 |
+
"Peñoles Mixtec (Latin)": "mil_Latn",
|
| 935 |
+
"Alacatlatzala Mixtec (Latin)": "mim_Latn",
|
| 936 |
+
"Minangkabau (Latin)": "min_Latn",
|
| 937 |
+
"Pinotepa Nacional Mixtec (Latin)": "mio_Latn",
|
| 938 |
+
"Apasco-Apoala Mixtec (Latin)": "mip_Latn",
|
| 939 |
+
"Mískito (Latin)": "miq_Latn",
|
| 940 |
+
"Mískito (Latin)": "mit_Latn",
|
| 941 |
+
"Southern Puebla Mixtec (Latin)": "miu_Latn",
|
| 942 |
+
"Akoye (Latin)": "miy_Latn",
|
| 943 |
+
"Coatzospan Mixtec (Latin)": "miz_Latn",
|
| 944 |
+
"Mali (Devanagari)": "mjl_Deva",
|
| 945 |
+
"Malavedan (Malayalam)": "mjv_Mlym",
|
| 946 |
+
"Macedonian (Cyrillic)": "mkd_Cyrl",
|
| 947 |
+
"Mokole (Benin) (Latin)": "mkf_Latn",
|
| 948 |
+
"Dhatki (Arabic)": "mki_Arab",
|
| 949 |
+
"Mokole (Benin) (Latin)": "mkl_Latn",
|
| 950 |
+
"Mokole (Benin) (Latin)": "mkn_Latn",
|
| 951 |
+
"Malagasy (Latin)": "mlg_Latn",
|
| 952 |
+
"Maltese (Latin)": "mlq_Latn",
|
| 953 |
+
"Maltese (Latin)": "mlt_Latn",
|
| 954 |
+
"Mamanwa (Latin)": "mmc_Latn",
|
| 955 |
+
"Michoacán Mazahua (Latin)": "mmg_Latn",
|
| 956 |
+
"Maonan (Latin)": "mnb_Latn",
|
| 957 |
+
"Montenegrin (Latin)": "mne_Latn",
|
| 958 |
+
"Mundani (Latin)": "mnf_Latn",
|
| 959 |
+
"Manipuri (Bengali)": "mni_Beng",
|
| 960 |
+
"Maninka (Latin)": "mnk_Latn",
|
| 961 |
+
"Mon (Myanmar)": "mnw_Mymr",
|
| 962 |
+
"Manikion (Latin)": "mnx_Latn",
|
| 963 |
+
"Mwan (Latin)": "moa_Latn",
|
| 964 |
+
"Mogholi (Latin)": "mog_Latn",
|
| 965 |
+
"Mongolian (Cyrillic)": "mon_Cyrl",
|
| 966 |
+
"Mopán Maya (Latin)": "mop_Latn",
|
| 967 |
+
"Mor (New Guinea) (Latin)": "mor_Latn",
|
| 968 |
+
"Mossi (Latin)": "mos_Latn",
|
| 969 |
+
"Tucunaca (Latin)": "mox_Latn",
|
| 970 |
+
"Mukulu (Latin)": "moz_Latn",
|
| 971 |
+
"Mpompon (Latin)": "mpg_Latn",
|
| 972 |
+
"Yosondúa Mixtec (Latin)": "mpm_Latn",
|
| 973 |
+
"Mapidian (Latin)": "mpp_Latn",
|
| 974 |
+
"Mixtec (Latin)": "mpx_Latn",
|
| 975 |
+
"Malas (Latin)": "mqb_Latn",
|
| 976 |
+
"Mangole (Latin)": "mqf_Latn",
|
| 977 |
+
"Minokok (Latin)": "mqj_Latn",
|
| 978 |
+
"Mumuye (Latin)": "mqn_Latn",
|
| 979 |
+
"Manggarai (Latin)": "mqy_Latn",
|
| 980 |
+
"Maori (Latin)": "mri_Latn",
|
| 981 |
+
"Western Mari (Cyrillic)": "mrj_Cyrl",
|
| 982 |
+
"Western Magar (Devanagari)": "mrr_Deva",
|
| 983 |
+
"Maranao (Latin)": "mrt_Latn",
|
| 984 |
+
"Maru (Latin)": "mrw_Latn",
|
| 985 |
+
"Masaba (Latin)": "msh_Latn",
|
| 986 |
+
"Sabah Malay (Latin)": "msi_Latn",
|
| 987 |
+
"Mswahili (Latin)": "msw_Latn",
|
| 988 |
+
"Malay (macrolanguage) (Latin)": "msy_Latn",
|
| 989 |
+
"Mator-Taygi-Karagas (Latin)": "mtd_Latn",
|
| 990 |
+
"Binukidnon (Latin)": "mtj_Latn",
|
| 991 |
+
"Yosondúa Mixtec (Latin)": "mto_Latn",
|
| 992 |
+
"Totontepec Mixe (Devanagari)": "mtr_Deva",
|
| 993 |
+
"Tututepec Mixtec (Latin)": "mtu_Latn",
|
| 994 |
+
"Tututepec Mixtec (Latin)": "mtx_Latn",
|
| 995 |
+
"Mundang (Latin)": "mua_Latn",
|
| 996 |
+
"Mubi (Latin)": "mug_Latn",
|
| 997 |
+
"Mündü (Latin)": "muh_Latn",
|
| 998 |
+
"Musi (Latin)": "mui_Latn",
|
| 999 |
+
"Majhwar (Devanagari)": "mup_Deva",
|
| 1000 |
+
"Murle (Latin)": "mur_Latn",
|
| 1001 |
+
"Muthuvan (Malayalam)": "muv_Mlym",
|
| 1002 |
+
"Muyang (Latin)": "muy_Latn",
|
| 1003 |
+
"Marwari (Arabic)": "mve_Arab",
|
| 1004 |
+
"Marwari (Arabic)": "mvp_Latn",
|
| 1005 |
+
"Marwari (Arabic)": "mvy_Arab",
|
| 1006 |
+
"Mwanga (Tanzania) (Latin)": "mwq_Latn",
|
| 1007 |
+
"Mwera (Tanzania) (Latin)": "mwv_Latn",
|
| 1008 |
+
"Metlatónoc Mixtec (Latin)": "mxb_Latn",
|
| 1009 |
+
"Juxtlahuaca Mixtec (Latin)": "mxq_Latn",
|
| 1010 |
+
"Silacayoapan Mixtec (Latin)": "mxs_Latn",
|
| 1011 |
+
"Tezoatlán Mixtec (Latin)": "mxt_Latn",
|
| 1012 |
+
"Metlatónoc Mixtec (Latin)": "mxu_Latn",
|
| 1013 |
+
"Northwestern Ojibwa (Latin)": "mxv_Latn",
|
| 1014 |
+
"Metlatónoc Mixtec (Latin)": "mxy_Latn",
|
| 1015 |
+
"Burmese (Myanmar)": "mya_Mymr",
|
| 1016 |
+
"Mbay (Latin)": "myb_Latn",
|
| 1017 |
+
"Myene (Latin)": "myk_Latn",
|
| 1018 |
+
"Erzya (Cyrillic)": "myv_Cyrl",
|
| 1019 |
+
"Masa (Chad) (Latin)": "myx_Latn",
|
| 1020 |
+
"Macuna (Latin)": "myy_Latn",
|
| 1021 |
+
"Santa María Zacatepec Mixtec (Latin)": "mza_Latn",
|
| 1022 |
+
"Berber languages (Latin)": "mzi_Latn",
|
| 1023 |
+
"Mazatlán Mixe (Latin)": "mzj_Latn",
|
| 1024 |
+
"Mazatlán Mixe (Latin)": "mzk_Latn",
|
| 1025 |
+
"Mazatlán Mixe (Latin)": "mzl_Latn",
|
| 1026 |
+
"Mumuye (Latin)": "mzm_Latn",
|
| 1027 |
+
"Manado Malay (Latin)": "mzw_Latn",
|
| 1028 |
+
"Nimanbur (Latin)": "nab_Latn",
|
| 1029 |
+
"Naga languages (Latin)": "nag_Latn",
|
| 1030 |
+
"Nalik (Latin)": "nal_Latn",
|
| 1031 |
+
"Min Nan Chinese (Latin)": "nan_Latn",
|
| 1032 |
+
"Neapolitan (Latin)": "nap_Latn",
|
| 1033 |
+
"Coatepec Nahuatl (Latin)": "nas_Latn",
|
| 1034 |
+
"Nawuri (Latin)": "naw_Latn",
|
| 1035 |
+
"Nyemba (Latin)": "nbh_Latn",
|
| 1036 |
+
"Chang Naga (Latin)": "nca_Latn",
|
| 1037 |
+
"Notsi (Latin)": "ncf_Latn",
|
| 1038 |
+
"Central Huasteca Nahuatl (Latin)": "nch_Latn",
|
| 1039 |
+
"Central Puebla Nahuatl (Latin)": "ncj_Latn",
|
| 1040 |
+
"Michoacán Nahuatl (Latin)": "ncl_Latn",
|
| 1041 |
+
"N eko (Latin)": "nco_Latn",
|
| 1042 |
+
"Nahuatl languages (Latin)": "ncu_Latn",
|
| 1043 |
+
"Morelos Nahuatl (Latin)": "ncx_Latn",
|
| 1044 |
+
"Ndogo (Latin)": "ndi_Latn",
|
| 1045 |
+
"Ndjuká (Latin)": "ndj_Latn",
|
| 1046 |
+
"Ndonga (Latin)": "ndo_Latn",
|
| 1047 |
+
"Ndo (Latin)": "ndp_Latn",
|
| 1048 |
+
"Ndut (Latin)": "ndv_Latn",
|
| 1049 |
+
"Lutos (Latin)": "ndy_Latn",
|
| 1050 |
+
"Ndogo (Latin)": "ndz_Latn",
|
| 1051 |
+
"Toura (Côte d'Ivoire) (Latin)": "neb_Latn",
|
| 1052 |
+
"Nepali (Devanagari)": "nep_Deva",
|
| 1053 |
+
"Newari (Devanagari)": "new_Deva",
|
| 1054 |
+
"Ngbaka'ma'bo (Latin)": "nfa_Latn",
|
| 1055 |
+
"Nefamese (Latin)": "nfr_Latn",
|
| 1056 |
+
"Ngad'a (Latin)": "nga_Latn",
|
| 1057 |
+
"Ngemba (Latin)": "ngi_Latn",
|
| 1058 |
+
"Lomwe (Latin)": "ngl_Latn",
|
| 1059 |
+
"Ngulu (Latin)": "ngp_Latn",
|
| 1060 |
+
"Guerrero Nahuatl (Latin)": "ngu_Latn",
|
| 1061 |
+
"Eastern Huasteca Nahuatl (Latin)": "nhe_Latn",
|
| 1062 |
+
"Ngiyambaa (Latin)": "nhg_Latn",
|
| 1063 |
+
"Zacatlán-Ahuacatlán-Tepetzintla Nahuatl (Latin)": "nhi_Latn",
|
| 1064 |
+
"Nahari (Latin)": "nhn_Latn",
|
| 1065 |
+
"Tetelcingo Nahuatl (Latin)": "nhq_Latn",
|
| 1066 |
+
"Orizaba Nahuatl (Latin)": "nhu_Latn",
|
| 1067 |
+
"Western Huasteca Nahuatl (Latin)": "nhw_Latn",
|
| 1068 |
+
"Tabasco Nahuatl (Latin)": "nhx_Latn",
|
| 1069 |
+
"Ometepec Nahuatl (Latin)": "nhy_Latn",
|
| 1070 |
+
"Nias (Latin)": "nia_Latn",
|
| 1071 |
+
"Ngaju (Latin)": "nij_Latn",
|
| 1072 |
+
"Nimi (Latin)": "nim_Latn",
|
| 1073 |
+
"Ninzo (Latin)": "nin_Latn",
|
| 1074 |
+
"Nganasan (Latin)": "nja_Latn",
|
| 1075 |
+
"Nkonya (Latin)": "nko_Latn",
|
| 1076 |
+
"Ngombale (Latin)": "nla_Latn",
|
| 1077 |
+
"Ná-Meo (Latin)": "nlc_Latn",
|
| 1078 |
+
"Dutch (Latin)": "nld_Latn",
|
| 1079 |
+
"Gela (Latin)": "nlg_Latn",
|
| 1080 |
+
"Ninia Yali (Latin)": "nlk_Latn",
|
| 1081 |
+
"Orizaba Nahuatl (Latin)": "nlv_Latn",
|
| 1082 |
+
"Nyamwezi (Latin)": "nmg_Latn",
|
| 1083 |
+
"Nyamwezi (Latin)": "nmz_Latn",
|
| 1084 |
+
"Norwegian Nynorsk (Latin)": "nnb_Latn",
|
| 1085 |
+
"Ngiemboon (Latin)": "nnh_Latn",
|
| 1086 |
+
"Ngen (Latin)": "nnq_Latn",
|
| 1087 |
+
"Nuni (Latin)": "nnw_Latn",
|
| 1088 |
+
"Nocamán (Latin)": "noa_Latn",
|
| 1089 |
+
"Norwegian Bokmål (Latin)": "nob_Latn",
|
| 1090 |
+
"Northern Thai (Thai)": "nod_Thai",
|
| 1091 |
+
"Nimadi (Devanagari)": "noe_Deva",
|
| 1092 |
+
"Nogai (Cyrillic)": "nog_Cyrl",
|
| 1093 |
+
"Nomatsiguenga (Latin)": "not_Latn",
|
| 1094 |
+
"Nupoid languages (Latin)": "npl_Latn",
|
| 1095 |
+
"Napu (Latin)": "npy_Latn",
|
| 1096 |
+
"Northern Sotho (Latin)": "nso_Latn",
|
| 1097 |
+
"Nisenan (Latin)": "nst_Latn",
|
| 1098 |
+
"Nisu (Latin)": "nsu_Latn",
|
| 1099 |
+
"Naga languages (Latin)": "ntm_Latn",
|
| 1100 |
+
"Ntrubo (Latin)": "ntr_Latn",
|
| 1101 |
+
"Nobsalan (Latin)": "nuj_Latn",
|
| 1102 |
+
"Nung (Viet Nam) (Latin)": "nup_Latn",
|
| 1103 |
+
"Nuer (Latin)": "nus_Latn",
|
| 1104 |
+
"Nuu-chah-nulth (Latin)": "nuz_Latn",
|
| 1105 |
+
"Nyabwa (Latin)": "nwb_Latn",
|
| 1106 |
+
"Naxi (Latin)": "nxq_Latn",
|
| 1107 |
+
"Nyanja (Latin)": "nya_Latn",
|
| 1108 |
+
"Nyanga-li (Latin)": "nyf_Latn",
|
| 1109 |
+
"Nyankole (Latin)": "nyn_Latn",
|
| 1110 |
+
"Nyoro (Latin)": "nyo_Latn",
|
| 1111 |
+
"Nyulnyul (Latin)": "nyu_Latn",
|
| 1112 |
+
"Nyulnyul (Latin)": "nyy_Latn",
|
| 1113 |
+
"Nzima (Latin)": "nzi_Latn",
|
| 1114 |
+
"Obo Manobo (Latin)": "obo_Latn",
|
| 1115 |
+
"Occitan (post 1500) (Latin)": "oci_Latn",
|
| 1116 |
+
"Ormuri (Arabic)": "odk_Arab",
|
| 1117 |
+
"Odual (Latin)": "odu_Latn",
|
| 1118 |
+
"Ogoniland (Latin)": "ogo_Latn",
|
| 1119 |
+
"Ojibwa (Canadian Aboriginal Syllabics)": "ojb_Cans",
|
| 1120 |
+
"Ojibwa (Latin)": "ojb_Latn",
|
| 1121 |
+
"Oku (Latin)": "oku_Latn",
|
| 1122 |
+
"Mochi (Latin)": "old_Latn",
|
| 1123 |
+
"Omejes (Latin)": "omw_Latn",
|
| 1124 |
+
"Obo Manobo (Latin)": "onb_Latn",
|
| 1125 |
+
"Tohono O'odham (Latin)": "ood_Latn",
|
| 1126 |
+
"Oroqen (Latin)": "orc_Latn",
|
| 1127 |
+
"Oromo (Latin)": "orm_Latn",
|
| 1128 |
+
"Ormuri (Arabic)": "oru_Arab",
|
| 1129 |
+
"Oriya (Oriya)": "ory_Orya",
|
| 1130 |
+
"Ossetian (Cyrillic)": "oss_Cyrl",
|
| 1131 |
+
"Otomi (Latin)": "ote_Latn",
|
| 1132 |
+
"Otomi (Latin)": "otq_Latn",
|
| 1133 |
+
"Old Turkish (Latin)": "ozm_Latn",
|
| 1134 |
+
"Páez (Latin)": "pab_Latn",
|
| 1135 |
+
"Pareci (Latin)": "pad_Latn",
|
| 1136 |
+
"Pangasinan (Latin)": "pag_Latn",
|
| 1137 |
+
"Pampanga (Latin)": "pam_Latn",
|
| 1138 |
+
"Panjabi (Gurmukhi)": "pan_Guru",
|
| 1139 |
+
"Northern Paiute (Latin)": "pao_Latn",
|
| 1140 |
+
"Papiamento (Latin)": "pap_Latn",
|
| 1141 |
+
"Palauan (Latin)": "pau_Latn",
|
| 1142 |
+
"Pangwa (Latin)": "pbb_Latn",
|
| 1143 |
+
"Patamona (Latin)": "pbc_Latn",
|
| 1144 |
+
"Mezontla Popoloca (Latin)": "pbi_Latn",
|
| 1145 |
+
"Parkwa (Latin)": "pbs_Latn",
|
| 1146 |
+
"Southern Pashto (Arabic)": "pbt_Arab",
|
| 1147 |
+
"Northern Pashto (Arabic)": "pbu_Arab",
|
| 1148 |
+
"Ruching Palaung (Thai)": "pce_Thai",
|
| 1149 |
+
"Nigerian Pidgin (Latin)": "pcm_Latn",
|
| 1150 |
+
"Pardhan (Latin)": "pex_Latn",
|
| 1151 |
+
"Eastern Pomo (Latin)": "pez_Latn",
|
| 1152 |
+
"Pahi (Arabic)": "phl_Arab",
|
| 1153 |
+
"Phuan (Arabic)": "phr_Arab",
|
| 1154 |
+
"Pima Bajo (Latin)": "pib_Latn",
|
| 1155 |
+
"Yinjtjiparnti (Latin)": "pil_Latn",
|
| 1156 |
+
"Piapoco (Latin)": "pip_Latn",
|
| 1157 |
+
"Piratapuyo (Latin)": "pir_Latn",
|
| 1158 |
+
"Pijin (Latin)": "pis_Latn",
|
| 1159 |
+
"Pitta Pitta (Latin)": "piy_Latn",
|
| 1160 |
+
"Pijao (Latin)": "pjt_Latn",
|
| 1161 |
+
"Pokomo (Latin)": "pkb_Latn",
|
| 1162 |
+
"Pökoot (Latin)": "pko_Latn",
|
| 1163 |
+
"Shwe Palaung (Arabic)": "plk_Arab",
|
| 1164 |
+
"Central Pame (Latin)": "pls_Latn",
|
| 1165 |
+
"Malagasy, Plateau (Latin)": "plt_Latn",
|
| 1166 |
+
"Polonombauk (Latin)": "plw_Latn",
|
| 1167 |
+
"Piemontese (Latin)": "pmf_Latn",
|
| 1168 |
+
"Piemontese (Latin)": "pmq_Latn",
|
| 1169 |
+
"Piemontese (Latin)": "pms_Latn",
|
| 1170 |
+
"Pamona (Latin)": "pmy_Latn",
|
| 1171 |
+
"Western Panjabi (Arabic)": "pnb_Arab",
|
| 1172 |
+
"Penesak (Latin)": "pne_Latn",
|
| 1173 |
+
"Pinyin (Latin)": "pny_Latn",
|
| 1174 |
+
"Ponares (Latin)": "poc_Latn",
|
| 1175 |
+
"Poqomam (Latin)": "poe_Latn",
|
| 1176 |
+
"Poqomchi' (Latin)": "poh_Latn",
|
| 1177 |
+
"Pokangá (Latin)": "poi_Latn",
|
| 1178 |
+
"Polish (Latin)": "pol_Latn",
|
| 1179 |
+
"Portuguese (Latin)": "por_Latn",
|
| 1180 |
+
"Pémono (Latin)": "pov_Latn",
|
| 1181 |
+
"Puelche (Latin)": "pow_Latn",
|
| 1182 |
+
"Puelche (Latin)": "poy_Latn",
|
| 1183 |
+
"Paipai (Latin)": "ppk_Latn",
|
| 1184 |
+
"San Luís Temalacayuca Popoloca (Latin)": "pps_Latn",
|
| 1185 |
+
"Pa'o (Latin)": "prf_Latn",
|
| 1186 |
+
"Parauk (Latin)": "prk_Latn",
|
| 1187 |
+
"Parsi-Dari (Latin)": "prq_Latn",
|
| 1188 |
+
"Phai (Thai)": "prt_Thai",
|
| 1189 |
+
"Pai Tavytera (Latin)": "pse_Latn",
|
| 1190 |
+
"Kaulong (Latin)": "pss_Latn",
|
| 1191 |
+
"Central Pashto (Arabic)": "pst_Arab",
|
| 1192 |
+
"Patuá (Latin)": "ptu_Latn",
|
| 1193 |
+
"Punan Merap (Latin)": "pua_Latn",
|
| 1194 |
+
"Punan Merap (Latin)": "pui_Latn",
|
| 1195 |
+
"Pushto (Arabic)": "pus_Arab",
|
| 1196 |
+
"Pangwali (Latin)": "pwg_Latn",
|
| 1197 |
+
"Paiwan (Latin)": "pwn_Latn",
|
| 1198 |
+
"Pwo Western Karen (Thai)": "pww_Thai",
|
| 1199 |
+
"Quetzaltepec Mixe (Latin)": "pxm_Latn",
|
| 1200 |
+
"Bikol (Latin)": "qub_Latn",
|
| 1201 |
+
"K'iche' (Latin)": "quc_Latn",
|
| 1202 |
+
"Lambayeque Quechua (Latin)": "quf_Latn",
|
| 1203 |
+
"Chimborazo Highland Quichua (Latin)": "qug_Latn",
|
| 1204 |
+
"South Bolivian Quechua (Latin)": "quh_Latn",
|
| 1205 |
+
"North Bolivian Quechua (Latin)": "qul_Latn",
|
| 1206 |
+
"Sipacapense (Latin)": "qum_Latn",
|
| 1207 |
+
"Panao Huánuco Quechua (Latin)": "qup_Latn",
|
| 1208 |
+
"Yanahuanca Pasco Quechua (Latin)": "qur_Latn",
|
| 1209 |
+
"Southern Pastaza Quechua (Latin)": "qus_Latn",
|
| 1210 |
+
"Quechua (Latin)": "quv_Latn",
|
| 1211 |
+
"Quechua (Latin)": "quw_Latn",
|
| 1212 |
+
"Quechua (Latin)": "qux_Latn",
|
| 1213 |
+
"Ayacucho Quechua (Latin)": "quy_Latn",
|
| 1214 |
+
"Cusco Quechua (Latin)": "quz_Latn",
|
| 1215 |
+
"Ambo-Pasco Quechua (Latin)": "qva_Latn",
|
| 1216 |
+
"Cajamarca Quechua (Latin)": "qvc_Latn",
|
| 1217 |
+
"Eastern Apurímac Quechua (Latin)": "qve_Latn",
|
| 1218 |
+
"Huallaga Huánuco Quechua (Latin)": "qvh_Latn",
|
| 1219 |
+
"Imbabura Highland Quichua (Latin)": "qvi_Latn",
|
| 1220 |
+
"Loja Highland Quichua (Latin)": "qvj_Latn",
|
| 1221 |
+
"Cajatambo North Lima Quechua (Latin)": "qvl_Latn",
|
| 1222 |
+
"Margos-Yarowilca-Lauricocha Quechua (Latin)": "qvm_Latn",
|
| 1223 |
+
"North Junín Quechua (Latin)": "qvn_Latn",
|
| 1224 |
+
"Napo Lowland Quechua (Latin)": "qvo_Latn",
|
| 1225 |
+
"San Martín Quechua (Latin)": "qvs_Latn",
|
| 1226 |
+
"Huaylla Wanca Quechua (Latin)": "qvw_Latn",
|
| 1227 |
+
"Yauyos Quechua (Latin)": "qvz_Latn",
|
| 1228 |
+
"Corongo Ancash Quechua (Latin)": "qwa_Latn",
|
| 1229 |
+
"Huaylas Ancash Quechua (Latin)": "qwh_Latn",
|
| 1230 |
+
"Sihuas Ancash Quechua (Latin)": "qws_Latn",
|
| 1231 |
+
"Chiquián Ancash Quechua (Latin)": "qxa_Latn",
|
| 1232 |
+
"Southern Conchucos Ancash Quechua (Latin)": "qxh_Latn",
|
| 1233 |
+
"Northern Conchucos Ancash Quechua (Latin)": "qxl_Latn",
|
| 1234 |
+
"Puno Quechua (Latin)": "qxn_Latn",
|
| 1235 |
+
"Southern Pastaza Quechua (Latin)": "qxo_Latn",
|
| 1236 |
+
"Puno Quechua (Latin)": "qxp_Latn",
|
| 1237 |
+
"Pacaraos Quechua (Latin)": "qxr_Latn",
|
| 1238 |
+
"Santa Ana de Tusi Pasco Quechua (Latin)": "qxt_Latn",
|
| 1239 |
+
"Arequipa-La Unión Quechua (Latin)": "qxu_Latn",
|
| 1240 |
+
"Jauja Wanca Quechua (Latin)": "qxw_Latn",
|
| 1241 |
+
"Rāga (Latin)": "rag_Latn",
|
| 1242 |
+
"Rahambuu (Bengali)": "rah_Beng",
|
| 1243 |
+
"Ramoaaina (Latin)": "rai_Latn",
|
| 1244 |
+
"Rapa Nui (Latin)": "rap_Latn",
|
| 1245 |
+
"Rawang (Devanagari)": "rav_Deva",
|
| 1246 |
+
"Rawang (Latin)": "raw_Latn",
|
| 1247 |
+
"Rejang (Latin)": "rej_Latn",
|
| 1248 |
+
"Rendille (Latin)": "rel_Latn",
|
| 1249 |
+
"Raguile (Latin)": "rgu_Latn",
|
| 1250 |
+
"Rohingya (Latin)": "rhg_Latn",
|
| 1251 |
+
"Tarifit (Arabic)": "rif_Arab",
|
| 1252 |
+
"Tarifit (Latin)": "rif_Latn",
|
| 1253 |
+
"Riang (India) (Latin)": "rim_Latn",
|
| 1254 |
+
"Riang (India) (Devanagari)": "rjs_Deva",
|
| 1255 |
+
"Rangpuri (Bengali)": "rkt_Beng",
|
| 1256 |
+
"Carpathian Romani (Cyrillic)": "rmc_Cyrl",
|
| 1257 |
+
"Carpathian Romani (Latin)": "rmc_Latn",
|
| 1258 |
+
"Traveller Norwegian (Latin)": "rmo_Latn",
|
| 1259 |
+
"Romany (Cyrillic)": "rmy_Cyrl",
|
| 1260 |
+
"Romany (Latin)": "rmy_Latn",
|
| 1261 |
+
"Roon (Latin)": "rng_Latn",
|
| 1262 |
+
"Roon (Latin)": "rnl_Latn",
|
| 1263 |
+
"Tae' (Latin)": "rob_Latn",
|
| 1264 |
+
"Rombo (Latin)": "rof_Latn",
|
| 1265 |
+
"Romansh (Latin, surs1244)": "roh_Latn_surs1244",
|
| 1266 |
+
"Romblomanon (Latin)": "rol_Latn",
|
| 1267 |
+
"Romanian (Latin)": "ron_Latn",
|
| 1268 |
+
"Rongga (Latin)": "roo_Latn",
|
| 1269 |
+
"Kriol (Latin)": "rop_Latn",
|
| 1270 |
+
"Rotokas (Latin)": "rro_Latn",
|
| 1271 |
+
"Rathawi (Latin)": "rth_Latn",
|
| 1272 |
+
"Rusyn (Latin)": "rub_Latn",
|
| 1273 |
+
"Ruuli (Latin)": "ruc_Latn",
|
| 1274 |
+
"Rufiji (Latin)": "ruf_Latn",
|
| 1275 |
+
"Ruga (Latin)": "rug_Latn",
|
| 1276 |
+
"Rundi (Latin)": "run_Latn",
|
| 1277 |
+
"Russian (Cyrillic)": "rus_Cyrl",
|
| 1278 |
+
"Mbwela (Latin)": "rwm_Latn",
|
| 1279 |
+
"Marwari (India) (Devanagari)": "rwr_Deva",
|
| 1280 |
+
"Saba (Latin)": "sab_Latn",
|
| 1281 |
+
"Sango (Latin)": "sag_Latn",
|
| 1282 |
+
"Yakut (Cyrillic)": "sah_Cyrl",
|
| 1283 |
+
"Sahu (Latin)": "saj_Latn",
|
| 1284 |
+
"Samburu (Latin)": "saq_Latn",
|
| 1285 |
+
"Sasak (Latin)": "sas_Latn",
|
| 1286 |
+
"Sause (Latin)": "sau_Latn",
|
| 1287 |
+
"Sayula Popoluca (Latin)": "say_Latn",
|
| 1288 |
+
"Ngambay (Latin)": "sba_Latn",
|
| 1289 |
+
"Simbo (Latin)": "sbd_Latn",
|
| 1290 |
+
"Sagala (Latin)": "sbl_Latn",
|
| 1291 |
+
"Sindhi Bhil (Arabic)": "sbn_Arab",
|
| 1292 |
+
"Sangu (Tanzania) (Latin)": "sbp_Latn",
|
| 1293 |
+
"Sangu (Gabon) (Latin)": "sch_Latn",
|
| 1294 |
+
"Sadri (Devanagari)": "sck_Deva",
|
| 1295 |
+
"Shina (Arabic)": "scl_Arab",
|
| 1296 |
+
"Sicilian (Latin)": "scn_Latn",
|
| 1297 |
+
"Scots (Latin)": "sco_Latn",
|
| 1298 |
+
"Sandawe (Latin)": "sda_Latn",
|
| 1299 |
+
"Sardo-logudorese (Latin)": "sdo_Latn",
|
| 1300 |
+
"Semai (Latin)": "sea_Latn",
|
| 1301 |
+
"Sena (Latin)": "seh_Latn",
|
| 1302 |
+
"Sena (Latin)": "sei_Latn",
|
| 1303 |
+
"Serrano (Latin)": "ses_Latn",
|
| 1304 |
+
"Serrano (Latin)": "sey_Latn",
|
| 1305 |
+
"Sangu (Gabon) (Latin)": "sgb_Latn",
|
| 1306 |
+
"Surgujia (Devanagari)": "sgj_Deva",
|
| 1307 |
+
"Suri (Ethiopic)": "sgw_Ethi",
|
| 1308 |
+
"Tachelhit (Latin)": "shi_Latn",
|
| 1309 |
+
"Sheko (Latin)": "shk_Latn",
|
| 1310 |
+
"Shan (Myanmar)": "shn_Mymr",
|
| 1311 |
+
"Shanga (Latin)": "sho_Latn",
|
| 1312 |
+
"Sala (Latin)": "shp_Latn",
|
| 1313 |
+
"Sidamo (Latin)": "sid_Latn",
|
| 1314 |
+
"Serrano (Latin)": "sig_Latn",
|
| 1315 |
+
"Tumulung Sisaala (Latin)": "sil_Latn",
|
| 1316 |
+
"Sinhala (Sinhala)": "sin_Sinh",
|
| 1317 |
+
"Sikkimese (Tibetan)": "sip_Tibt",
|
| 1318 |
+
"Siwa (Latin)": "siw_Latn",
|
| 1319 |
+
"Soli (Latin)": "sja_Latn",
|
| 1320 |
+
"Simaa (Latin)": "sjm_Latn",
|
| 1321 |
+
"Surjapuri (Devanagari)": "sjp_Deva",
|
| 1322 |
+
"Siar-Lak (Latin)": "sjr_Latn",
|
| 1323 |
+
"Seke (Vanuatu) (Latin)": "skg_Latn",
|
| 1324 |
+
"Saraiki (Arabic)": "skr_Arab",
|
| 1325 |
+
"Sáliba (Latin)": "sld_Latn",
|
| 1326 |
+
"Slovak (Latin)": "slk_Latn",
|
| 1327 |
+
"Selaru (Latin)": "slu_Latn",
|
| 1328 |
+
"Slovenian (Latin)": "slv_Latn",
|
| 1329 |
+
"Sama (Latin)": "sml_Latn",
|
| 1330 |
+
"Samoan (Latin)": "smo_Latn",
|
| 1331 |
+
"Shona (Latin)": "sna_Latn",
|
| 1332 |
+
"Sanga (Nigeria) (Latin)": "snc_Latn",
|
| 1333 |
+
"Sindhi (Arabic)": "snd_Arab",
|
| 1334 |
+
"Bau Bidayuh (Latin)": "sne_Latn",
|
| 1335 |
+
"Soninke (Latin)": "snk_Latn",
|
| 1336 |
+
"Siona (Latin)": "snn_Latn",
|
| 1337 |
+
"Siane (Latin)": "snp_Latn",
|
| 1338 |
+
"Sauk (Latin)": "snv_Latn",
|
| 1339 |
+
"Sauk (Latin)": "snw_Latn",
|
| 1340 |
+
"Solos (Latin)": "sol_Latn",
|
| 1341 |
+
"Somali (Latin)": "som_Latn",
|
| 1342 |
+
"Songe (Latin)": "soy_Latn",
|
| 1343 |
+
"Spanish (Latin)": "spa_Latn",
|
| 1344 |
+
"Sian (Latin)": "spp_Latn",
|
| 1345 |
+
"Saponi (Latin)": "sps_Latn",
|
| 1346 |
+
"Sabaot (Latin)": "spy_Latn",
|
| 1347 |
+
"Sardinian (Latin)": "src_Latn",
|
| 1348 |
+
"Sardinian (Latin)": "srd_Latn",
|
| 1349 |
+
"Sera (Latin)": "sri_Latn",
|
| 1350 |
+
"Saramaccan (Latin)": "srm_Latn",
|
| 1351 |
+
"Sranan Tongo (Latin)": "srn_Latn",
|
| 1352 |
+
"Sarsuti (Latin)": "sro_Latn",
|
| 1353 |
+
"Serbian (Cyrillic)": "srp_Cyrl",
|
| 1354 |
+
"Serer (Latin)": "srr_Latn",
|
| 1355 |
+
"Seraiki (Devanagari)": "srx_Deva",
|
| 1356 |
+
"Siri (Arabic)": "ssi_Arab",
|
| 1357 |
+
"Seta (Latin)": "ste_Latn",
|
| 1358 |
+
"Sateré-Mawé (Latin)": "stn_Latn",
|
| 1359 |
+
"Stieng (Latin)": "stp_Latn",
|
| 1360 |
+
"Sua (Latin)": "sua_Latn",
|
| 1361 |
+
"Suku (Latin)": "suc_Latn",
|
| 1362 |
+
"Sukuma (Latin)": "suk_Latn",
|
| 1363 |
+
"Sundanese (Latin)": "sun_Latn",
|
| 1364 |
+
"Suri (Latin)": "sur_Latn",
|
| 1365 |
+
"Susu (Latin)": "sus_Latn",
|
| 1366 |
+
"Susu (Latin)": "suv_Latn",
|
| 1367 |
+
"Sunwar (Devanagari)": "suz_Deva",
|
| 1368 |
+
"Svan (Georgian)": "sva_Geor",
|
| 1369 |
+
"Swedish (Latin)": "swe_Latn",
|
| 1370 |
+
"Swahili (macrolanguage) (Latin)": "swh_Latn",
|
| 1371 |
+
"Seraiki (Devanagari)": "swv_Deva",
|
| 1372 |
+
"Sumbwa (Latin)": "sxb_Latn",
|
| 1373 |
+
"Sicanian (Latin)": "sxn_Latn",
|
| 1374 |
+
"Sighu (Latin)": "sya_Latn",
|
| 1375 |
+
"Sylheti (Latin)": "syl_Latn",
|
| 1376 |
+
"Saurashtra (Latin)": "sza_Latn",
|
| 1377 |
+
"Saurashtra (Latin)": "szy_Latn",
|
| 1378 |
+
"Tuma-Irumu (Latin)": "tac_Latn",
|
| 1379 |
+
"Tajio (Devanagari)": "taj_Deva",
|
| 1380 |
+
"Tamil (Tamil)": "tam_Taml",
|
| 1381 |
+
"Tana (Latin)": "tan_Latn",
|
| 1382 |
+
"Tangale (Latin)": "tao_Latn",
|
| 1383 |
+
"Taabwa (Latin)": "tap_Latn",
|
| 1384 |
+
"Tarahumara (Latin)": "taq_Latn",
|
| 1385 |
+
"Central Tarahumara (Latin)": "tar_Latn",
|
| 1386 |
+
"Tatar (Cyrillic)": "tat_Cyrl",
|
| 1387 |
+
"Tatuyo (Latin)": "tav_Latn",
|
| 1388 |
+
"Tay (Latin)": "tay_Latn",
|
| 1389 |
+
"Taliabu (Latin)": "tbc_Latn",
|
| 1390 |
+
"Kbo (Latin)": "tbf_Latn",
|
| 1391 |
+
"Tairora (Latin)": "tbg_Latn",
|
| 1392 |
+
"Tboli (Latin)": "tbk_Latn",
|
| 1393 |
+
"Tboli (Latin)": "tbl_Latn",
|
| 1394 |
+
"Tagbu (Latin)": "tby_Latn",
|
| 1395 |
+
"Ditammari (Latin)": "tbz_Latn",
|
| 1396 |
+
"Ticuna (Latin)": "tca_Latn",
|
| 1397 |
+
"Datooga (Latin)": "tcc_Latn",
|
| 1398 |
+
"Malagasy, Tsimihety (Latin)": "tcf_Latn",
|
| 1399 |
+
"Tulu (Malayalam)": "tcy_Mlym",
|
| 1400 |
+
"Are'are (Latin)": "tcz_Latn",
|
| 1401 |
+
"Tidong (Latin)": "tdj_Latn",
|
| 1402 |
+
"Tandaganon (Latin)": "tdn_Latn",
|
| 1403 |
+
"Tandroy-Mahafaly Malagasy (Latin)": "tdx_Latn",
|
| 1404 |
+
"Tepo Krumen (Latin)": "ted_Latn",
|
| 1405 |
+
"Teressa (Latin)": "tee_Latn",
|
| 1406 |
+
"Telugu (Telugu)": "tel_Telu",
|
| 1407 |
+
"Timne (Latin)": "tem_Latn",
|
| 1408 |
+
"Teso (Latin)": "teo_Latn",
|
| 1409 |
+
"Teso (Latin)": "ter_Latn",
|
| 1410 |
+
"Tewa (USA) (Latin)": "tew_Latn",
|
| 1411 |
+
"Tennet (Latin)": "tex_Latn",
|
| 1412 |
+
"Terik (Latin)": "tfr_Latn",
|
| 1413 |
+
"Ternate (Latin)": "tgc_Latn",
|
| 1414 |
+
"Togoyo (Latin)": "tgj_Latn",
|
| 1415 |
+
"Tajik (Cyrillic)": "tgk_Cyrl",
|
| 1416 |
+
"Tagalog (Latin)": "tgl_Latn",
|
| 1417 |
+
"Togoyo (Latin)": "tgo_Latn",
|
| 1418 |
+
"Togoyo (Latin)": "tgp_Latn",
|
| 1419 |
+
"Thai (Thai)": "tha_Thai",
|
| 1420 |
+
"Tharu (Devanagari)": "the_Deva",
|
| 1421 |
+
"Tho (Latin)": "thk_Latn",
|
| 1422 |
+
"Tharu (Devanagari)": "thl_Deva",
|
| 1423 |
+
"Tharu (Devanagari)": "thq_Deva",
|
| 1424 |
+
"Tharu (Devanagari)": "thr_Deva",
|
| 1425 |
+
"Thangmi (Tifinagh)": "thv_Tfng",
|
| 1426 |
+
"Tigre (Ethiopic)": "tig_Ethi",
|
| 1427 |
+
"Timugon Murut (Latin)": "tih_Latn",
|
| 1428 |
+
"Tii (Latin)": "tik_Latn",
|
| 1429 |
+
"Tillamook (Latin)": "tio_Latn",
|
| 1430 |
+
"Tigrinya (Ethiopic)": "tir_Ethi",
|
| 1431 |
+
"Masaka (Latin)": "tkg_Latn",
|
| 1432 |
+
"Tukumanféd (Latin)": "tkr_Latn",
|
| 1433 |
+
"Takpa (Devanagari)": "tkt_Deva",
|
| 1434 |
+
"Tobo-Kube (Latin)": "tlb_Latn",
|
| 1435 |
+
"Tlingit (Latin)": "tli_Latn",
|
| 1436 |
+
"Talysh (Latin)": "tlj_Latn",
|
| 1437 |
+
"Taloki (Latin)": "tlp_Latn",
|
| 1438 |
+
"Talysh (Latin)": "tly_Latn",
|
| 1439 |
+
"Tumak (Latin)": "tmc_Latn",
|
| 1440 |
+
"Toba-Maskoy (Latin)": "tmf_Latn",
|
| 1441 |
+
"Tasmate (Latin)": "tna_Latn",
|
| 1442 |
+
"Tonga (Nyasa) (Latin)": "tng_Latn",
|
| 1443 |
+
"Tenis (Latin)": "tnk_Latn",
|
| 1444 |
+
"Tonsawang (Latin)": "tnn_Latn",
|
| 1445 |
+
"Tontemboan (Latin)": "tnp_Latn",
|
| 1446 |
+
"Ménik (Latin)": "tnr_Latn",
|
| 1447 |
+
"Tenino (Latin)": "tnt_Latn",
|
| 1448 |
+
"Toba (Latin)": "tob_Latn",
|
| 1449 |
+
"Coyutla Totonac (Latin)": "toc_Latn",
|
| 1450 |
+
"Toma (Latin)": "toh_Latn",
|
| 1451 |
+
"Toki Pona (Latin)": "tok_Latn",
|
| 1452 |
+
"Tomini (Latin)": "tom_Latn",
|
| 1453 |
+
"Xicotepec De Juárez Totonac (Latin)": "top_Latn",
|
| 1454 |
+
"Tukumanféd (Latin)": "tos_Latn",
|
| 1455 |
+
"Tok Pisin (Latin)": "tpi_Latn",
|
| 1456 |
+
"Tukumanféd (Latin)": "tpl_Latn",
|
| 1457 |
+
"Tampulma (Latin)": "tpm_Latn",
|
| 1458 |
+
"Tukumanféd (Latin)": "tpp_Latn",
|
| 1459 |
+
"Tukumanféd (Latin)": "tpt_Latn",
|
| 1460 |
+
"Tukumanféd (Latin)": "tpz_Latn",
|
| 1461 |
+
"Tukumanféd (Latin)": "tqp_Latn",
|
| 1462 |
+
"Trio (Latin)": "trc_Latn",
|
| 1463 |
+
"Turi (Latin)": "tri_Latn",
|
| 1464 |
+
"Torona (Latin)": "trn_Latn",
|
| 1465 |
+
"Trumai (Latin)": "trp_Latn",
|
| 1466 |
+
"Tregami (Latin)": "trq_Latn",
|
| 1467 |
+
"Tirahi (Latin)": "trs_Latn",
|
| 1468 |
+
"Trukhmen (Latin)": "trv_Latn",
|
| 1469 |
+
"Torwali (Arabic)": "trw_Arab",
|
| 1470 |
+
"Tswana (Latin)": "tsn_Latn",
|
| 1471 |
+
"Tsonga (Latin)": "tso_Latn",
|
| 1472 |
+
"Tsuvan (Latin)": "tsz_Latn",
|
| 1473 |
+
"Tswa (Latin)": "ttc_Latn",
|
| 1474 |
+
"Tutelo (Latin)": "tte_Latn",
|
| 1475 |
+
"Tooro (Latin)": "ttj_Latn",
|
| 1476 |
+
"Tawallammat Tamajaq (Tifinagh)": "ttq_Tfng",
|
| 1477 |
+
"Tutoro (Latin)": "ttr_Latn",
|
| 1478 |
+
"Wotu (Latin)": "ttu_Latn",
|
| 1479 |
+
"Tübatulabal (Latin)": "tue_Latn",
|
| 1480 |
+
"Tübatulabal (Latin)": "tuf_Latn",
|
| 1481 |
+
"Tugutil (Latin)": "tui_Latn",
|
| 1482 |
+
"Turkmen (Arabic)": "tuk_Arab",
|
| 1483 |
+
"Turkmen (Latin)": "tuk_Latn",
|
| 1484 |
+
"Tula (Latin)": "tul_Latn",
|
| 1485 |
+
"Tumbuka (Latin)": "tuo_Latn",
|
| 1486 |
+
"Tedaga (Latin)": "tuq_Latn",
|
| 1487 |
+
"Turkish (Latin)": "tur_Latn",
|
| 1488 |
+
"Tuxináwa (Latin)": "tuv_Latn",
|
| 1489 |
+
"Tuxináwa (Latin)": "tuy_Latn",
|
| 1490 |
+
"Tungus languages (Latin)": "tvo_Latn",
|
| 1491 |
+
"Tungus languages (Latin)": "tvu_Latn",
|
| 1492 |
+
"Tungus languages (Latin)": "tvw_Latn",
|
| 1493 |
+
"Tawbuid (Latin)": "twb_Latn",
|
| 1494 |
+
"Twents (Latin)": "twe_Latn",
|
| 1495 |
+
"Tungus languages (Latin)": "twu_Latn",
|
| 1496 |
+
"Tewe (Latin)": "txa_Latn",
|
| 1497 |
+
"Tombonuo (Latin)": "txq_Latn",
|
| 1498 |
+
"Tartessian (Latin)": "txs_Latn",
|
| 1499 |
+
"Kayapó (Latin)": "txu_Latn",
|
| 1500 |
+
"Tanosy Malagasy (Latin)": "txy_Latn",
|
| 1501 |
+
"Tauya (Latin)": "tye_Latn",
|
| 1502 |
+
"Tzeltal (Latin)": "tzh_Latn",
|
| 1503 |
+
"Tz'utujil (Latin)": "tzj_Latn",
|
| 1504 |
+
"Tzotzil (Latin)": "tzo_Latn",
|
| 1505 |
+
"Ubi (Latin)": "ubl_Latn",
|
| 1506 |
+
"Ubang (Latin)": "ubu_Latn",
|
| 1507 |
+
"Ujir (Latin)": "udl_Latn",
|
| 1508 |
+
"Udmurt (Cyrillic)": "udm_Cyrl",
|
| 1509 |
+
"Uduk (Latin)": "udu_Latn",
|
| 1510 |
+
"Uighur (Arabic)": "uig_Arab",
|
| 1511 |
+
"Uighur (Cyrillic)": "uig_Cyrl",
|
| 1512 |
+
"Ukuriguma (Oriya)": "uki_Orya",
|
| 1513 |
+
"Ukrainian (Cyrillic)": "ukr_Cyrl",
|
| 1514 |
+
"Ukuriguma (Latin)": "ukv_Latn",
|
| 1515 |
+
"Umbundu (Latin)": "umb_Latn",
|
| 1516 |
+
"Uripiv-Wala-Rano-Atchin (Latin)": "upv_Latn",
|
| 1517 |
+
"Ura (Vanuatu) (Latin)": "ura_Latn",
|
| 1518 |
+
"Urubú-Kaapor (Latin)": "urb_Latn",
|
| 1519 |
+
"Urdu (Arabic)": "urd_Arab",
|
| 1520 |
+
"Urdu (Devanagari)": "urd_Deva",
|
| 1521 |
+
"Urdu (Latin)": "urd_Latn",
|
| 1522 |
+
"Urhobo (Latin)": "urh_Latn",
|
| 1523 |
+
"Urak Lawoi' (Thai)": "urk_Thai",
|
| 1524 |
+
"Urat (Latin)": "urt_Latn",
|
| 1525 |
+
"Uru (Latin)": "ury_Latn",
|
| 1526 |
+
"Ushojo (Arabic)": "ush_Arab",
|
| 1527 |
+
"Uspanteco (Latin)": "usp_Latn",
|
| 1528 |
+
"Uzbek (Cyrillic)": "uzb_Cyrl",
|
| 1529 |
+
"Uzbek (Latin)": "uzb_Latn",
|
| 1530 |
+
"Northern Uzbek (Latin)": "uzn_Latn",
|
| 1531 |
+
"Vagla (Latin)": "vag_Latn",
|
| 1532 |
+
"Varhadi-Nagpuri (Devanagari)": "vah_Deva",
|
| 1533 |
+
"Vehes (Latin)": "vai_Latn",
|
| 1534 |
+
"Varli (Latin)": "var_Latn",
|
| 1535 |
+
"Veluws (Latin)": "ver_Latn",
|
| 1536 |
+
"Vinde (Latin)": "vid_Latn",
|
| 1537 |
+
"Vietnamese (Latin)": "vie_Latn",
|
| 1538 |
+
"Vili (Latin)": "vif_Latn",
|
| 1539 |
+
"Viemo (Latin)": "vmc_Latn",
|
| 1540 |
+
"Juxtlahuaca Mixtec (Latin)": "vmj_Latn",
|
| 1541 |
+
"Mitlatongo Mixtec (Latin)": "vmm_Latn",
|
| 1542 |
+
"Soyaltepec Mazatec (Latin)": "vmp_Latn",
|
| 1543 |
+
"Makhuwa (Latin)": "vmw_Latn",
|
| 1544 |
+
"Soyaltepec Mazatec (Latin)": "vmy_Latn",
|
| 1545 |
+
"Soyaltepec Mazatec (Latin)": "vmz_Latn",
|
| 1546 |
+
"Võro (Latin)": "vro_Latn",
|
| 1547 |
+
"Vunjo (Latin)": "vun_Latn",
|
| 1548 |
+
"Vute (Latin)": "vut_Latn",
|
| 1549 |
+
"Wolaytta (Ethiopic)": "wal_Ethi",
|
| 1550 |
+
"Wolaytta (Latin)": "wal_Latn",
|
| 1551 |
+
"Wapishana (Latin)": "wap_Latn",
|
| 1552 |
+
"Waray (Philippines) (Latin)": "war_Latn",
|
| 1553 |
+
"Walla Walla (Latin)": "waw_Latn",
|
| 1554 |
+
"Wayana (Latin)": "way_Latn",
|
| 1555 |
+
"Warao (Latin)": "wba_Latn",
|
| 1556 |
+
"Wakhi (Latin)": "wbl_Latn",
|
| 1557 |
+
"Wagdi (Devanagari)": "wbr_Deva",
|
| 1558 |
+
"Waci Gbe (Latin)": "wci_Latn",
|
| 1559 |
+
"Wè Western (Latin)": "weo_Latn",
|
| 1560 |
+
"Wewaw (Latin)": "wes_Latn",
|
| 1561 |
+
"Wajan (Latin)": "wja_Latn",
|
| 1562 |
+
"Warji (Latin)": "wji_Latn",
|
| 1563 |
+
"Walloon (Latin)": "wlo_Latn",
|
| 1564 |
+
"Wolio (Latin)": "wlx_Latn",
|
| 1565 |
+
"Womo (Latin)": "wmw_Latn",
|
| 1566 |
+
"Wobé (Latin)": "wob_Latn",
|
| 1567 |
+
"Wolof (Latin)": "wof_Latn",
|
| 1568 |
+
"Wolof (Latin)": "wol_Latn",
|
| 1569 |
+
"Wagdi (Telugu)": "wsg_Telu",
|
| 1570 |
+
"Wassa (Latin)": "wwa_Latn",
|
| 1571 |
+
"Kalmyk (Cyrillic)": "xal_Cyrl",
|
| 1572 |
+
"Kayan Mahakam (Latin)": "xdy_Latn",
|
| 1573 |
+
"Xerénte (Latin)": "xed_Latn",
|
| 1574 |
+
"Xerénte (Latin)": "xer_Latn",
|
| 1575 |
+
"Khetrani (Arabic)": "xhe_Arab",
|
| 1576 |
+
"Xhosa (Latin)": "xho_Latn",
|
| 1577 |
+
"Kalkoti (Arabic)": "xka_Arab",
|
| 1578 |
+
"Kalkoti (Latin)": "xkl_Latn",
|
| 1579 |
+
"Mingrelian (Georgian)": "xmf_Geor",
|
| 1580 |
+
"Malay (macrolanguage), Malaccan (Latin)": "xmm_Latn",
|
| 1581 |
+
"Mean (Latin)": "xmv_Latn",
|
| 1582 |
+
"Kenyan Sign Language (Latin)": "xnj_Latn",
|
| 1583 |
+
"Kanjar (Devanagari)": "xnr_Deva",
|
| 1584 |
+
"Xhosa (Latin)": "xog_Latn",
|
| 1585 |
+
"Komo (Sudan) (Latin)": "xon_Latn",
|
| 1586 |
+
"Kpelle (Latin)": "xpe_Latn",
|
| 1587 |
+
"Karahawyana (Latin)": "xrb_Latn",
|
| 1588 |
+
"Samberigi (Latin)": "xsb_Latn",
|
| 1589 |
+
"Samberigi (Latin)": "xsm_Latn",
|
| 1590 |
+
"Sherpa (Devanagari)": "xsr_Deva",
|
| 1591 |
+
"Sukur (Latin)": "xsu_Latn",
|
| 1592 |
+
"Alcozauca Mixtec (Latin)": "xta_Latn",
|
| 1593 |
+
"Diuxi-Tilantongo Mixtec (Latin)": "xtd_Latn",
|
| 1594 |
+
"Ketengban (Latin)": "xte_Latn",
|
| 1595 |
+
"Sino-Tibetan languages (Latin)": "xti_Latn",
|
| 1596 |
+
"Tidaá Mixtec (Latin)": "xtm_Latn",
|
| 1597 |
+
"Diuxi-Tilantongo Mixtec (Latin)": "xtn_Latn",
|
| 1598 |
+
"Cuyamecalco Mixtec (Latin)": "xtu_Latn",
|
| 1599 |
+
"Alcozauca Mixtec (Tamil)": "xua_Taml",
|
| 1600 |
+
"Kuo (Latin)": "xuo_Latn",
|
| 1601 |
+
"Yaminahua (Latin)": "yaa_Latn",
|
| 1602 |
+
"Yagua (Latin)": "yad_Latn",
|
| 1603 |
+
"Yalunka (Latin)": "yal_Latn",
|
| 1604 |
+
"Yamba (Latin)": "yam_Latn",
|
| 1605 |
+
"Yao (Latin)": "yao_Latn",
|
| 1606 |
+
"Yagua (Latin)": "yaq_Latn",
|
| 1607 |
+
"Yagua (Latin)": "yas_Latn",
|
| 1608 |
+
"Yagua (Latin)": "yat_Latn",
|
| 1609 |
+
"Yavanawa (Latin)": "yav_Latn",
|
| 1610 |
+
"Yei (Latin)": "yay_Latn",
|
| 1611 |
+
"Yazgulyam (Latin)": "yaz_Latn",
|
| 1612 |
+
"Yala (Latin)": "yba_Latn",
|
| 1613 |
+
"Yemba (Latin)": "ybb_Latn",
|
| 1614 |
+
"Yucatec Maya Sign Language (Latin)": "ycl_Latn",
|
| 1615 |
+
"Yucuna (Latin)": "ycn_Latn",
|
| 1616 |
+
"Yiddish (Hebrew)": "ydd_Hebr",
|
| 1617 |
+
"Yidgha (Arabic)": "ydg_Arab",
|
| 1618 |
+
"Yennu (Malayalam)": "yea_Mlym",
|
| 1619 |
+
"Yenisei Say (Latin)": "yer_Latn",
|
| 1620 |
+
"Yeskwa (Latin)": "yes_Latn",
|
| 1621 |
+
"Yaka (Congo) (Latin)": "yka_Latn",
|
| 1622 |
+
"Yalo (Latin)": "yli_Latn",
|
| 1623 |
+
"Yoruba (Latin)": "yor_Latn",
|
| 1624 |
+
"Yarí (Latin)": "yre_Latn",
|
| 1625 |
+
"Yucateco (Latin)": "yua_Latn",
|
| 1626 |
+
"Yue Chinese (Han)": "yue_Hans",
|
| 1627 |
+
"Yue Chinese (Han)": "yue_Hant",
|
| 1628 |
+
"Yuracare (Latin)": "yuz_Latn",
|
| 1629 |
+
"Yawa (Latin)": "yva_Latn",
|
| 1630 |
+
"Zapotec (Latin)": "zaa_Latn",
|
| 1631 |
+
"Zapotec (Latin)": "zab_Latn",
|
| 1632 |
+
"Ocotlán Zapotec (Latin)": "zac_Latn",
|
| 1633 |
+
"Cajonos Zapotec (Latin)": "zad_Latn",
|
| 1634 |
+
"Yareni Zapotec (Latin)": "zae_Latn",
|
| 1635 |
+
"Isthmus Zapotec (Latin)": "zai_Latn",
|
| 1636 |
+
"Miahuatlán Zapotec (Latin)": "zam_Latn",
|
| 1637 |
+
"Ozolotepec Zapotec (Latin)": "zao_Latn",
|
| 1638 |
+
"Aloápam Zapotec (Latin)": "zaq_Latn",
|
| 1639 |
+
"Rincón Zapotec (Latin)": "zar_Latn",
|
| 1640 |
+
"Santo Domingo Albarradas Zapotec (Latin)": "zas_Latn",
|
| 1641 |
+
"Yatzachi Zapotec (Latin)": "zav_Latn",
|
| 1642 |
+
"Zay (Latin)": "zaw_Latn",
|
| 1643 |
+
"Choapan Zapotec (Latin)": "zca_Latn",
|
| 1644 |
+
"Zhigulevsk (Latin)": "zga_Latn",
|
| 1645 |
+
"Zimza (Latin)": "zim_Latn",
|
| 1646 |
+
"Zinza (Latin)": "ziw_Latn",
|
| 1647 |
+
"Zialo (Latin)": "zmz_Latn",
|
| 1648 |
+
"Zande (macrolanguage) (Latin)": "zne_Latn",
|
| 1649 |
+
"Zoque (Latin)": "zoc_Latn",
|
| 1650 |
+
"Zoque (Latin)": "zoh_Latn",
|
| 1651 |
+
"Zoque (Latin)": "zor_Latn",
|
| 1652 |
+
"Zoque (Latin)": "zos_Latn",
|
| 1653 |
+
"Coatecas Altas Zapotec (Latin)": "zpc_Latn",
|
| 1654 |
+
"Guevea De Humboldt Zapotec (Latin)": "zpg_Latn",
|
| 1655 |
+
"Santa María Quiegolani Zapotec (Latin)": "zpi_Latn",
|
| 1656 |
+
"Lachixío Zapotec (Latin)": "zpl_Latn",
|
| 1657 |
+
"Mixtepec Zapotec (Latin)": "zpm_Latn",
|
| 1658 |
+
"Choapan Zapotec (Latin)": "zpo_Latn",
|
| 1659 |
+
"El Alto Zapotec (Latin)": "zpt_Latn",
|
| 1660 |
+
"San Vicente Coatlán Zapotec (Latin)": "zpv_Latn",
|
| 1661 |
+
"Chichicapan Zapotec (Latin)": "zpy_Latn",
|
| 1662 |
+
"Mazaltepec Zapotec (Latin)": "zpz_Latn",
|
| 1663 |
+
"Standard Malay (Latin)": "zsm_Latn",
|
| 1664 |
+
"Tlacolulita Zapotec (Latin)": "ztg_Latn",
|
| 1665 |
+
"Tataltepec Zapotec (Latin)": "ztn_Latn",
|
| 1666 |
+
"Tilquiapan Zapotec (Latin)": "ztp_Latn",
|
| 1667 |
+
"Quiavicuzas Zapotec (Latin)": "ztq_Latn",
|
| 1668 |
+
"Samo (Latin)": "zts_Latn",
|
| 1669 |
+
"Samo (Latin)": "ztu_Latn",
|
| 1670 |
+
"Yalálag Zapotec (Latin)": "zty_Latn",
|
| 1671 |
+
"Zulu (Latin)": "zul_Latn",
|
| 1672 |
+
"Yongbei Zhuang (Latin)": "zyb_Latn",
|
| 1673 |
+
"Yongbei Zhuang (Latin)": "zyp_Latn",
|
| 1674 |
+
"Zhuang (Latin)": "zza_Latn"
|
| 1675 |
+
}
|
server/media_transcription_processor.py
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Media Transcription Processor
|
| 3 |
+
|
| 4 |
+
Pipeline-focused transcription processor that maintains state through processing stages
|
| 5 |
+
while exposing intermediate results for flexibility and ensuring proper resource cleanup.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import base64
|
| 9 |
+
import logging
|
| 10 |
+
import os
|
| 11 |
+
from typing import Dict, List, Optional
|
| 12 |
+
|
| 13 |
+
import numpy as np
|
| 14 |
+
import torch
|
| 15 |
+
from audio_transcription import transcribe_full_audio_with_chunking
|
| 16 |
+
from convert_media_to_wav import convert_media_to_wav_from_bytes
|
| 17 |
+
from inference.audio_reading_tools import wav_to_bytes
|
| 18 |
+
from transcription_status import transcription_status
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class MediaTranscriptionProcessor:
    """
    Pipeline-focused transcription processor that maintains state through processing stages
    while exposing intermediate results for flexibility and ensuring proper resource cleanup.

    Stages are lazy: ``convert_media()`` normalizes the uploaded media to a 16 kHz WAV
    plus an audio tensor, and ``transcribe_full_pipeline()`` runs chunked ASR over it.
    Use the class as a context manager so temporary files are always removed.
    """

    # Maximum duration (in seconds) before a transcription is considered stuck
    # MAX_TRANSCRIPTION_DURATION = 120  # 2 minutes

    # For long meetings (1 hour max)
    # MAX_TRANSCRIPTION_DURATION = 3600

    # Or disable timeout entirely
    MAX_TRANSCRIPTION_DURATION = float("inf")

    def __init__(self, media_bytes: bytes, filename: str, language_with_script: str = None):
        """Initialize processor with media data and metadata.

        Args:
            media_bytes: Raw bytes of the uploaded media file.
            filename: Original filename; used only for logging and status reporting.
            language_with_script: Optional language id such as ``"eng_Latn"``,
                forwarded unchanged to the ASR pipeline.
        """
        # Core input data
        self.media_bytes = media_bytes
        self.original_filename = filename
        self.language_with_script = language_with_script

        # Processing state - lazy loaded
        self._temp_wav_path: Optional[str] = None
        self._audio_tensor: Optional[torch.Tensor] = None
        self._audio_numpy: Optional[np.ndarray] = None
        self._sample_rate: int = 16000  # pipeline-wide fixed sample rate
        self._duration: Optional[float] = None
        self._chunks: Optional[List] = None
        self._transcription_results: Optional[Dict] = None
        self._error: Optional[str] = None

        # Resource tracking for cleanup
        self._temp_files: List[str] = []
        self._cleanup_performed = False

        # Transcription status management
        self._status_initialized = False

    def start_transcription(self):
        """Initialize transcription status tracking (idempotent)."""
        if not self._status_initialized:
            transcription_status.start_transcription("transcribe", self.original_filename)
            self._status_initialized = True

    def update_progress(self, progress: float):
        """Update transcription progress (0.0 to 1.0)."""
        transcription_status.update_progress(progress)

    @staticmethod
    def is_server_busy() -> bool:
        """
        Check if the server is currently busy with another transcription.

        This method includes timeout handling - if a transcription has been
        running too long, it will be force-finished.
        """
        status = MediaTranscriptionProcessor.get_server_status()
        return status.get("is_busy", False)

    @staticmethod
    def get_server_status() -> dict:
        """
        Get current server transcription status with timeout handling.

        If a transcription has been running longer than MAX_TRANSCRIPTION_DURATION,
        it will be force-finished to prevent the server from being stuck indefinitely.
        """
        status = transcription_status.get_status()

        # Check if transcription has been running too long
        if (status.get("is_busy", False) and
                status.get("duration_seconds", 0) > MediaTranscriptionProcessor.MAX_TRANSCRIPTION_DURATION):

            logger = logging.getLogger(__name__)
            logger.warning(
                f"Force-finishing stuck transcription after {status.get('duration_seconds', 0):.1f}s "
                f"(max: {MediaTranscriptionProcessor.MAX_TRANSCRIPTION_DURATION}s). "
                f"Operation: {status.get('current_operation')}, "
                f"File: {status.get('current_filename')}"
            )

            # Force finish the transcription
            transcription_status.finish_transcription()

            # Get updated status
            status = transcription_status.get_status()
            status["force_finished"] = True
            status["reason"] = f"Transcription exceeded maximum duration of {MediaTranscriptionProcessor.MAX_TRANSCRIPTION_DURATION}s"

        return status

    def convert_media(self) -> 'MediaTranscriptionProcessor':
        """
        Stage 1: Convert media to standardized audio format.

        Returns:
            Self for method chaining

        Raises:
            RuntimeError: if ffmpeg/decoding fails; the original exception text
                is preserved in the message and in ``self._error``.
        """
        if self._temp_wav_path is not None:
            # Already converted
            return self

        logger = logging.getLogger(__name__)
        logger.info(f"Converting media file: {self.original_filename}")

        # Update progress if status is initialized
        if self._status_initialized:
            self.update_progress(0.1)

        try:
            # Convert media bytes to WAV and tensor
            temp_wav_path, audio_tensor = convert_media_to_wav_from_bytes(
                self.media_bytes, self.original_filename
            )

            # Store results and track temp file for later cleanup
            self._temp_wav_path = temp_wav_path
            self._audio_tensor = audio_tensor
            self._temp_files.append(temp_wav_path)

            # Calculate duration from tensor.
            # NOTE(review): assumes a 1-D tensor of samples; confirm the shape
            # returned by convert_media_to_wav_from_bytes.
            if audio_tensor is not None:
                self._duration = len(audio_tensor) / self._sample_rate

            # Guard the format spec: self._duration stays None when no tensor
            # was returned, and formatting None with ":.2f" would raise.
            duration_text = (
                f"{self._duration:.2f}s" if self._duration is not None else "unknown duration"
            )
            logger.info(f"Media conversion completed: {self.original_filename} -> {duration_text}")

            # Update progress if status is initialized
            if self._status_initialized:
                self.update_progress(0.2)

        except Exception as e:
            logger.error(f"Media conversion failed for {self.original_filename}: {str(e)}")

            # Provide user-friendly error message based on the error type
            if "ffmpeg returned error code" in str(e).lower():
                error_msg = (
                    f"Audio/video conversion failed for '{self.original_filename}'. "
                    f"The file may have an unsupported audio codec or be corrupted. "
                    f"Please try converting the file to a standard format (MP3, WAV, MP4) before uploading. "
                    f"For best results, use files with common codecs: "
                    f"Audio - AAC, MP3, PCM, FLAC; Video - H.264/AAC (MP4), standard codecs. "
                    f"Avoid proprietary, DRM-protected, or very old codec variants."
                )
            else:
                error_msg = f"Failed to process media file '{self.original_filename}'"

            error_msg += f"\nTechnical Details: {str(e)}"

            # Store the error for later retrieval
            self._error = error_msg
            raise RuntimeError(error_msg)

        return self

    def get_wav_path(self) -> str:
        """Get the temporary WAV file path (converts media if needed)."""
        if self._temp_wav_path is None:
            self.convert_media()
        return self._temp_wav_path

    def get_audio_tensor(self) -> torch.Tensor:
        """Get standardized audio tensor (converts media if needed)."""
        if self._audio_tensor is None:
            self.convert_media()
        return self._audio_tensor

    def get_audio_numpy(self) -> np.ndarray:
        """Get audio as numpy array (converted from tensor if needed)."""
        if self._audio_numpy is None:
            tensor = self.get_audio_tensor()
            if tensor is not None:
                # Convert to numpy, handling different tensor types
                if hasattr(tensor, 'cpu'):
                    self._audio_numpy = tensor.cpu().numpy()
                else:
                    self._audio_numpy = tensor.numpy()
            else:
                self._audio_numpy = np.array([])
        return self._audio_numpy

    @property
    def duration(self) -> float:
        """Get audio duration in seconds (0.0 when conversion produced no tensor)."""
        if self._duration is None:
            self.convert_media()
        return self._duration or 0.0

    @property
    def sample_rate(self) -> int:
        """Get audio sample rate."""
        return self._sample_rate

    def transcribe_full_pipeline(self) -> 'MediaTranscriptionProcessor':
        """
        Stage 2: Run the complete transcription pipeline with chunking.

        Returns:
            Self for method chaining
        """
        if self._transcription_results is not None:
            # Already transcribed
            return self

        logger = logging.getLogger(__name__)

        # Ensure media is converted
        wav_path = self.get_wav_path()

        logger.info(f"Starting transcription pipeline for: {self.original_filename}")

        # Get the preprocessed audio tensor instead of just the WAV path
        audio_tensor = self.get_audio_tensor()

        # Run the full transcription with chunking using the tensor
        self._transcription_results = transcribe_full_audio_with_chunking(
            audio_tensor=audio_tensor,
            sample_rate=self._sample_rate,
            language_with_script=self.language_with_script,
        )

        logger.info(f"Transcription completed: {self._transcription_results.get('num_chunks', 0)} chunks")

        # Update progress if status is initialized
        if self._status_initialized:
            self.update_progress(0.9)

        return self

    def get_results(self, include_preprocessed_audio: bool = False) -> Dict:
        """
        Get final transcription results (runs transcription if needed).

        Args:
            include_preprocessed_audio: Whether to include base64-encoded preprocessed WAV data

        Returns:
            Complete transcription results dictionary, optionally with preprocessed audio
        """
        if self._transcription_results is None:
            self.transcribe_full_pipeline()

        results = self._transcription_results or {}

        # Add preprocessed audio data if requested
        if include_preprocessed_audio and self._audio_tensor is not None:
            try:
                # Convert the preprocessed tensor to WAV bytes
                audio_tensor_cpu = self._audio_tensor.cpu() if self._audio_tensor.is_cuda else self._audio_tensor
                wav_bytes = wav_to_bytes(audio_tensor_cpu, sample_rate=self._sample_rate, format="wav")

                # Encode as base64.
                # NOTE(review): wav_to_bytes appears to return a buffer exposing
                # .tobytes() (numpy-like), not plain bytes - confirm.
                audio_data_b64 = base64.b64encode(wav_bytes.tobytes()).decode('utf-8')

                results["preprocessed_audio"] = {
                    "data": audio_data_b64,
                    "format": "wav",
                    "sample_rate": self._sample_rate,
                    "duration": self.duration,
                    "size_bytes": len(wav_bytes)
                }

                logging.getLogger(__name__).info(f"Added preprocessed audio data: {len(wav_bytes)} bytes")

            except Exception as e:
                # Best-effort extra payload: transcription results are still returned.
                logging.getLogger(__name__).warning(f"Failed to include preprocessed audio data: {e}")

        return results

    def cleanup(self):
        """Clean up all temporary files and resources (idempotent)."""
        if self._cleanup_performed:
            return

        logger = logging.getLogger(__name__)

        # Clean up temporary files
        for temp_file in self._temp_files:
            try:
                if os.path.exists(temp_file):
                    os.unlink(temp_file)
                    logger.debug(f"Cleaned up temp file: {temp_file}")
            except Exception as e:
                logger.warning(f"Failed to clean up temp file {temp_file}: {e}")

        # Finish transcription status - always call to ensure we don't get stuck
        # It's better to be safe than risk leaving the server in a busy state
        transcription_status.finish_transcription()
        self._status_initialized = False

        # Clear references to help garbage collection
        self._audio_tensor = None
        self._audio_numpy = None
        self._transcription_results = None
        self._chunks = None
        self._temp_files.clear()

        self._cleanup_performed = True
        logger.debug(f"Cleanup completed for: {self.original_filename}")

    def __enter__(self) -> 'MediaTranscriptionProcessor':
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit - ensures cleanup."""
        self.cleanup()

    def __del__(self):
        """Destructor - final cleanup attempt.

        Uses getattr with a True default so a partially constructed instance
        (``__init__`` raised before setting ``_cleanup_performed``) does not
        raise AttributeError here, and swallows cleanup errors because
        exceptions must not escape a finalizer during interpreter shutdown.
        """
        if not getattr(self, "_cleanup_performed", True):
            try:
                self.cleanup()
            except Exception:
                pass
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
flask==3.0.0
|
| 2 |
+
flask-cors==4.0.0
|
| 3 |
+
gunicorn==21.2.0
|
| 4 |
+
|
| 5 |
+
# Audio processing
|
| 6 |
+
torchaudio<=2.8.0
|
| 7 |
+
torchcodec
|
| 8 |
+
librosa==0.10.1
|
| 9 |
+
soundfile==0.12.1
|
| 10 |
+
audioread>=3.0.0
|
| 11 |
+
pydub>=0.25.1
|
| 12 |
+
|
| 13 |
+
# VAD and audio chunking
|
| 14 |
+
silero-vad>=4.0.0
|
| 15 |
+
onnxruntime>=1.12.0
|
| 16 |
+
|
| 17 |
+
# Text processing
|
| 18 |
+
uroman
|
| 19 |
+
|
| 20 |
+
# Data structures and utilities
|
| 21 |
+
# dataclasses  # stdlib since Python 3.7; the PyPI backport shadows the stdlib module on modern interpreters
|
| 22 |
+
pandas
|
| 23 |
+
xxhash
|
| 24 |
+
requests==2.31.0
|
server/subtitle.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import os
|
| 3 |
+
def convert_time_to_srt_format(seconds):
|
| 4 |
+
"""Converts seconds to the standard SRT time format (HH:MM:SS,ms)."""
|
| 5 |
+
hours = int(seconds // 3600)
|
| 6 |
+
minutes = int((seconds % 3600) // 60)
|
| 7 |
+
secs = int(seconds % 60)
|
| 8 |
+
milliseconds = round((seconds - int(seconds)) * 1000)
|
| 9 |
+
|
| 10 |
+
if milliseconds == 1000:
|
| 11 |
+
milliseconds = 0
|
| 12 |
+
secs += 1
|
| 13 |
+
if secs == 60:
|
| 14 |
+
secs, minutes = 0, minutes + 1
|
| 15 |
+
if minutes == 60:
|
| 16 |
+
minutes, hours = 0, hours + 1
|
| 17 |
+
|
| 18 |
+
return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"
|
| 19 |
+
|
| 20 |
+
def word_level_srt(words_timestamp, srt_path="word_level_subtitle.srt", shorts=False):
    """Generates an SRT file with one word per subtitle entry."""
    strip_punct = re.compile(r'[.,!?;:"\–—_~^+*|]')
    with open(srt_path, 'w', encoding='utf-8') as out:
        index = 1
        for word_info in words_timestamp:
            begin = convert_time_to_srt_format(word_info['start'])
            finish = convert_time_to_srt_format(word_info['end'])
            # Strip punctuation glyphs so the on-screen word stays clean.
            token = strip_punct.sub('', word_info['word'])
            if token.strip().lower() == 'i':
                token = "I"
            if not shorts:
                token = token.replace("-", "")
            out.write(f"{index}\n{begin} --> {finish}\n{token}\n\n")
            index += 1
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def split_line_by_char_limit(text, max_chars_per_line=38):
    """Splits a string into multiple lines based on a character limit."""
    lines = []
    current = ""
    for token in text.split():
        if not current:
            # First word of a line always goes in, even if it exceeds the limit.
            current = token
        elif len(current) + 1 + len(token) <= max_chars_per_line:
            current = f"{current} {token}"
        else:
            lines.append(current)
            current = token
    if current:
        lines.append(current)
    return lines
|
| 50 |
+
|
| 51 |
+
def merge_punctuation_glitches(subtitles):
    """Cleans up punctuation artifacts at the boundaries of subtitle entries."""
    if not subtitles:
        return []

    leading_punct = re.compile(r'^([,.:;!?]+)(\s*)(.+)')
    cleaned = [subtitles[0]]
    for curr in subtitles[1:]:
        prev = cleaned[-1]

        prev_text = prev["text"].rstrip()
        curr_text = curr["text"].lstrip()

        # Move stray leading punctuation onto the end of the previous cue.
        hit = leading_punct.match(curr_text)
        if hit:
            punct, _, rest = hit.groups()
            if not prev_text.endswith(tuple(punct)):
                prev["text"] = prev_text + punct
                curr_text = rest.strip()

        # Drop quote/colon glyphs that read badly on screen.
        for glyph in ('"', '“', '”', ';', ':'):
            curr_text = curr_text.replace(glyph, '')
        curr_text = curr_text.strip()

        # A cue that is now empty or punctuation-only is absorbed into the
        # previous one by extending its end time.
        if not curr_text or re.fullmatch(r'[.,!?]+', curr_text):
            prev["end"] = curr["end"]
            continue

        curr["text"] = curr_text
        prev["text"] = prev["text"].replace('"', '').replace('“', '').replace('”', '')
        cleaned.append(curr)

    return cleaned
|
| 85 |
+
|
| 86 |
+
import json
|
| 87 |
+
def write_sentence_srt(
    word_level_timestamps, output_file="subtitles_professional.srt", max_lines=2,
    max_duration_s=7.0, max_chars_per_line=38, hard_pause_threshold=0.5,
    merge_pause_threshold=0.4
):
    """Creates professional-grade SRT files and a corresponding timestamp.json file.

    Args:
        word_level_timestamps: list of {"word", "start", "end"} dicts in time order.
        output_file: SRT path; the JSON sidecar uses the same name with .json.
        max_lines: maximum rendered lines per subtitle cue.
        max_duration_s: maximum on-screen duration of one cue, in seconds.
        max_chars_per_line: character budget used by split_line_by_char_limit.
        hard_pause_threshold: a silence (s) at or above this starts a new cue.
        merge_pause_threshold: a single-word cue following a gap below this is
            merged back into the previous cue.

    Returns:
        Path of the written JSON sidecar, or None when the input is empty.
    """
    if not word_level_timestamps:
        return

    # Phase 1: Generate draft subtitles based on timing and length rules
    draft_subtitles = []
    i = 0
    while i < len(word_level_timestamps):
        start_time = word_level_timestamps[i]["start"]

        # We'll now store the full word objects, not just the text
        current_word_objects = []

        j = i
        while j < len(word_level_timestamps):
            entry = word_level_timestamps[j]

            # Create potential text from the word objects
            potential_words = [w["word"] for w in current_word_objects] + [entry["word"]]
            potential_text = " ".join(potential_words)

            # Stop growing the cue when it would overflow the line budget or run too long.
            if len(split_line_by_char_limit(potential_text, max_chars_per_line)) > max_lines: break
            if (entry["end"] - start_time) > max_duration_s and current_word_objects: break

            if j > i:
                prev_entry = word_level_timestamps[j-1]
                pause = entry["start"] - prev_entry["end"]
                # Long silences and sentence-final punctuation both end a cue.
                if pause >= hard_pause_threshold: break
                if prev_entry["word"].endswith(('.','!','?')): break

            # Append the full word object
            current_word_objects.append(entry)
            j += 1

        # Guarantee forward progress: a single over-long word still becomes a cue.
        if not current_word_objects:
            current_word_objects.append(word_level_timestamps[i])
            j = i + 1

        text = " ".join([w["word"] for w in current_word_objects])
        end_time = word_level_timestamps[j - 1]["end"]

        # Include the list of word objects in our draft subtitle
        draft_subtitles.append({
            "start": start_time,
            "end": end_time,
            "text": text,
            "words": current_word_objects
        })
        i = j

    # Phase 2: Post-process to merge single-word "orphan" subtitles
    if not draft_subtitles: return
    final_subtitles = [draft_subtitles[0]]
    for k in range(1, len(draft_subtitles)):
        prev_sub = final_subtitles[-1]
        current_sub = draft_subtitles[k]
        is_orphan = len(current_sub["text"].split()) == 1
        pause_from_prev = current_sub["start"] - prev_sub["end"]

        if is_orphan and pause_from_prev < merge_pause_threshold:
            merged_text = prev_sub["text"] + " " + current_sub["text"]
            # Only merge when the combined text still fits the line budget.
            if len(split_line_by_char_limit(merged_text, max_chars_per_line)) <= max_lines:
                prev_sub["text"] = merged_text
                prev_sub["end"] = current_sub["end"]

                # Merge the word-level data as well
                prev_sub["words"].extend(current_sub["words"])
                continue

        final_subtitles.append(current_sub)

    final_subtitles = merge_punctuation_glitches(final_subtitles)

    # This dictionary will hold the data for our JSON file
    timestamps_data = {}

    # Phase 3: Write the final SRT file (and prepare JSON data)
    with open(output_file, "w", encoding="utf-8") as f:
        for idx, sub in enumerate(final_subtitles, start=1):
            # --- SRT Writing (Unchanged) ---
            text = sub["text"].replace(" ,", ",").replace(" .", ".")
            formatted_lines = split_line_by_char_limit(text, max_chars_per_line)
            start_time_str = convert_time_to_srt_format(sub['start'])
            end_time_str = convert_time_to_srt_format(sub['end'])

            f.write(f"{idx}\n")
            f.write(f"{start_time_str} --> {end_time_str}\n")
            f.write("\n".join(formatted_lines) + "\n\n")

            # Create the list of word dictionaries for the current subtitle
            word_data = []
            for word_obj in sub["words"]:
                word_data.append({
                    "word": word_obj["word"],
                    "start": convert_time_to_srt_format(word_obj["start"]),
                    "end": convert_time_to_srt_format(word_obj["end"])
                })

            # Add the complete entry to our main dictionary
            timestamps_data[str(idx)] = {
                "text": "\n".join(formatted_lines),
                "start": start_time_str,
                "end": end_time_str,
                "words": word_data
            }

    # Write the collected data to the JSON file
    json_output_file = output_file.replace(".srt",".json")
    with open(json_output_file, "w", encoding="utf-8") as f_json:
        json.dump(timestamps_data, f_json, indent=4, ensure_ascii=False)

    # print(f"Successfully generated SRT file: {output_file}")
    # print(f"Successfully generated JSON file: {json_output_file}")
    return json_output_file
|
| 206 |
+
def make_subtitle(word_level_timestamps, file_path):
    """Write word-, sentence-, and shorts-style SRT files for *file_path*.

    Args:
        word_level_timestamps: List of {"word", "start", "end"} dicts in time order.
        file_path: Source media path; only its basename is used to name output files
            under ./subtitles/.

    Returns:
        Tuple of (sentence_srt_file, word_level_srt_file, shorts_srt_file) paths.
    """
    os.makedirs("./subtitles/", exist_ok=True)
    file_name = os.path.splitext(os.path.basename(file_path))[0]

    word_level_srt_file = f"./subtitles/{file_name}_subtitle_words.srt"
    sentence_srt_file = f"./subtitles/{file_name}_subtitle_sentences.srt"
    shorts_srt_file = f"./subtitles/{file_name}_subtitle_shorts.srt"

    word_level_srt(
        word_level_timestamps,
        srt_path=word_level_srt_file,
        shorts=False,
    )

    # write_sentence_srt also emits a .json sidecar derived from the .srt name;
    # the returned json paths were previously bound to unused locals, so the
    # calls are kept for their file-writing side effects only.
    write_sentence_srt(
        word_level_timestamps,
        output_file=sentence_srt_file,
        max_lines=2,
        max_duration_s=7.0,
        max_chars_per_line=38,
        hard_pause_threshold=0.5,
        merge_pause_threshold=0.4,
    )

    write_sentence_srt(
        word_level_timestamps,
        output_file=shorts_srt_file,
        max_lines=1,
        max_duration_s=2.0,
        max_chars_per_line=17,
    )
    return sentence_srt_file, word_level_srt_file, shorts_srt_file
|
server/transcription_status.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import threading
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from typing import Dict
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)


class TranscriptionStatus:
    """Simple transcription status tracker"""

    def __init__(self):
        # All fields are guarded by self.lock for cross-thread consistency.
        self.lock = threading.Lock()
        self.is_busy = False
        self.current_operation = None
        self.current_filename = None
        self.started_at = None
        self.progress = 0.0
        self.total_completed = 0

    def start_transcription(self, operation_type: str, filename: str = None):
        """Mark transcription as started"""
        with self.lock:
            self.is_busy = True
            self.current_operation = operation_type
            self.current_filename = filename
            self.started_at = datetime.now()
            self.progress = 0.0
            logger.info(f"Started {operation_type} transcription for {filename or 'unknown file'}")

    def update_progress(self, progress: float):
        """Update transcription progress (0.0 to 1.0)"""
        with self.lock:
            # Clamp into [0.0, 1.0].
            self.progress = min(1.0, max(0.0, progress))

    def finish_transcription(self):
        """Mark transcription as finished"""
        with self.lock:
            self.is_busy = False
            self.current_operation = None
            self.current_filename = None
            self.started_at = None
            self.progress = 0.0
            self.total_completed += 1
            logger.info("Transcription finished")

    def get_status(self) -> Dict:
        """Get current status for API response"""
        with self.lock:
            snapshot = {
                "is_busy": self.is_busy,
                "total_completed": self.total_completed,
            }

            if self.is_busy:
                if self.started_at:
                    elapsed = (datetime.now() - self.started_at).total_seconds()
                else:
                    elapsed = 0
                snapshot["current_operation"] = self.current_operation
                snapshot["current_filename"] = self.current_filename
                snapshot["progress"] = self.progress
                snapshot["duration_seconds"] = round(elapsed, 1)

            return snapshot


# Global status instance
transcription_status = TranscriptionStatus()
|
server/transcriptions_blueprint.py
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import tempfile
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
from audio_transcription import perform_forced_alignment
|
| 8 |
+
from media_transcription_processor import MediaTranscriptionProcessor
|
| 9 |
+
from transcription_status import transcription_status
|
| 10 |
+
from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs
|
| 11 |
+
|
| 12 |
+
from env_vars import API_LOG_LEVEL, MODEL_NAME
|
| 13 |
+
from flask import Blueprint, jsonify, request, send_file
|
| 14 |
+
from video_utils import check_ffmpeg_available, combine_video_with_subtitles
|
| 15 |
+
|
| 16 |
+
# Flask blueprint grouping all transcription-related HTTP endpoints.
transcriptions_blueprint = Blueprint(
    "transcriptions_blueprint",
    __name__,
)

# Module logger; verbosity is controlled by the API_LOG_LEVEL env setting.
logger = logging.getLogger(__name__)
logger.level = API_LOG_LEVEL
# Keep the noisy AWS SDK loggers at the same configured level.
logging.getLogger("boto3").setLevel(API_LOG_LEVEL)
logging.getLogger("botocore").setLevel(API_LOG_LEVEL)

# NOTE(review): not referenced anywhere in this module — presumably meant to
# gate short-form vs. long-form audio handling; confirm before removing.
MAX_SHORTFORM_DURATION = 10  # seconds
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@transcriptions_blueprint.route("/health")
|
| 30 |
+
def health():
|
| 31 |
+
"""Comprehensive health check endpoint"""
|
| 32 |
+
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
| 33 |
+
cuda_available = torch.cuda.is_available()
|
| 34 |
+
ffmpeg_available = check_ffmpeg_available()
|
| 35 |
+
|
| 36 |
+
# Get transcription status
|
| 37 |
+
transcription_info = MediaTranscriptionProcessor.get_server_status()
|
| 38 |
+
|
| 39 |
+
# Get GPU details if CUDA is available
|
| 40 |
+
gpu_info = {}
|
| 41 |
+
if cuda_available:
|
| 42 |
+
gpu_info = {
|
| 43 |
+
"gpu_count": torch.cuda.device_count(),
|
| 44 |
+
"current_device": torch.cuda.current_device(),
|
| 45 |
+
"gpu_name": (
|
| 46 |
+
torch.cuda.get_device_name(0)
|
| 47 |
+
if torch.cuda.device_count() > 0
|
| 48 |
+
else "Unknown"
|
| 49 |
+
),
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
# Add GPU memory information
|
| 53 |
+
try:
|
| 54 |
+
current_device = torch.cuda.current_device()
|
| 55 |
+
memory_allocated = torch.cuda.memory_allocated(current_device)
|
| 56 |
+
memory_reserved = torch.cuda.memory_reserved(current_device)
|
| 57 |
+
memory_total = torch.cuda.get_device_properties(current_device).total_memory
|
| 58 |
+
|
| 59 |
+
gpu_info.update(
|
| 60 |
+
{
|
| 61 |
+
"gpu_memory_allocated_mb": round(memory_allocated / 1024 / 1024, 1),
|
| 62 |
+
"gpu_memory_reserved_mb": round(memory_reserved / 1024 / 1024, 1),
|
| 63 |
+
"gpu_memory_total_mb": round(memory_total / 1024 / 1024, 1),
|
| 64 |
+
"gpu_memory_free_mb": round(
|
| 65 |
+
(memory_total - memory_reserved) / 1024 / 1024, 1
|
| 66 |
+
),
|
| 67 |
+
}
|
| 68 |
+
)
|
| 69 |
+
except Exception as e:
|
| 70 |
+
logger.warning(f"Could not get GPU memory info: {e}")
|
| 71 |
+
|
| 72 |
+
return {
|
| 73 |
+
"status": "healthy",
|
| 74 |
+
"message": "MMS Transcription API is running",
|
| 75 |
+
"version": "1.0.0",
|
| 76 |
+
"service": "mms-transcription",
|
| 77 |
+
"device": str(device),
|
| 78 |
+
"cuda_available": cuda_available,
|
| 79 |
+
"ffmpeg_available": ffmpeg_available,
|
| 80 |
+
"transcription_status": transcription_info,
|
| 81 |
+
**gpu_info,
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
@transcriptions_blueprint.route("/supported-languages")
|
| 86 |
+
def get_supported_languages():
|
| 87 |
+
"""Get list of supported languages for transcription"""
|
| 88 |
+
try:
|
| 89 |
+
return jsonify({
|
| 90 |
+
"supported_languages": supported_langs,
|
| 91 |
+
})
|
| 92 |
+
except Exception as e:
|
| 93 |
+
logger.error(f"Error getting supported languages: {str(e)}")
|
| 94 |
+
return jsonify({
|
| 95 |
+
"error": "Could not retrieve supported languages",
|
| 96 |
+
"message": str(e)
|
| 97 |
+
}), 500
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
@transcriptions_blueprint.route("/status")
|
| 101 |
+
def get_transcription_status():
|
| 102 |
+
"""Get current transcription status"""
|
| 103 |
+
return jsonify(MediaTranscriptionProcessor.get_server_status())
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
@transcriptions_blueprint.route("/transcribe", methods=["POST"])
|
| 107 |
+
def transcribe_audio():
|
| 108 |
+
"""Transcribe media using the MMS model with intelligent chunking for all audio/video files"""
|
| 109 |
+
try:
|
| 110 |
+
# Check if server is busy
|
| 111 |
+
if MediaTranscriptionProcessor.is_server_busy():
|
| 112 |
+
status = MediaTranscriptionProcessor.get_server_status()
|
| 113 |
+
return (
|
| 114 |
+
jsonify(
|
| 115 |
+
{
|
| 116 |
+
"error": "Server is currently processing another transcription",
|
| 117 |
+
"status": "busy",
|
| 118 |
+
"current_operation": status.get("current_operation"),
|
| 119 |
+
}
|
| 120 |
+
),
|
| 121 |
+
503,
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
# Check if media file is provided
|
| 125 |
+
if "media" not in request.files:
|
| 126 |
+
return jsonify({"error": "No media file provided"}), 400
|
| 127 |
+
|
| 128 |
+
media_file = request.files["media"]
|
| 129 |
+
if media_file.filename == "":
|
| 130 |
+
return jsonify({"error": "No file selected"}), 400
|
| 131 |
+
|
| 132 |
+
# Get optional language parameter
|
| 133 |
+
language_with_script = request.form.get("language", None)
|
| 134 |
+
|
| 135 |
+
if language_with_script:
|
| 136 |
+
logger.info(f"Language specified: {language_with_script}")
|
| 137 |
+
else:
|
| 138 |
+
logger.info("No language specified, using auto-detection")
|
| 139 |
+
|
| 140 |
+
# Get optional include_preprocessed parameter (from form data or query string)
|
| 141 |
+
include_preprocessed = (
|
| 142 |
+
request.form.get("include_preprocessed", "false").lower() == "true" or
|
| 143 |
+
request.args.get("include_preprocessed", "false").lower() == "true"
|
| 144 |
+
)
|
| 145 |
+
if include_preprocessed:
|
| 146 |
+
logger.info("Preprocessed audio will be included in response")
|
| 147 |
+
|
| 148 |
+
# Mark as busy and start transcription
|
| 149 |
+
# This will be handled by the processor
|
| 150 |
+
|
| 151 |
+
# Read file bytes once
|
| 152 |
+
media_bytes = media_file.read()
|
| 153 |
+
|
| 154 |
+
try:
|
| 155 |
+
# Use the MediaTranscriptionProcessor with context manager for automatic cleanup
|
| 156 |
+
with MediaTranscriptionProcessor(media_bytes, media_file.filename, language_with_script) as processor:
|
| 157 |
+
# Start transcription status tracking
|
| 158 |
+
processor.start_transcription()
|
| 159 |
+
|
| 160 |
+
# Stage 1: Convert media (this also calculates duration and updates progress)
|
| 161 |
+
processor.convert_media()
|
| 162 |
+
logger.info(f"Media conversion completed for: {media_file.filename}")
|
| 163 |
+
|
| 164 |
+
# Stage 2: Run full transcription pipeline (this also updates progress)
|
| 165 |
+
processor.transcribe_full_pipeline()
|
| 166 |
+
|
| 167 |
+
# Get final results with optional preprocessed audio
|
| 168 |
+
results = processor.get_results(include_preprocessed_audio=include_preprocessed)
|
| 169 |
+
|
| 170 |
+
logger.info(f"Transcription completed: {results.get('num_chunks', 0)} chunks")
|
| 171 |
+
|
| 172 |
+
# Format response
|
| 173 |
+
response = {
|
| 174 |
+
"transcription": results.get("transcription", ""),
|
| 175 |
+
"aligned_segments": results.get("aligned_segments", []),
|
| 176 |
+
"chunks": results.get("chunks", []),
|
| 177 |
+
"total_duration": results.get("total_duration", 0.0),
|
| 178 |
+
"num_chunks": results.get("num_chunks", 0),
|
| 179 |
+
"num_segments": results.get("num_segments", 0),
|
| 180 |
+
"model": MODEL_NAME,
|
| 181 |
+
"device": str(torch.device("cuda:0" if torch.cuda.is_available() else "cpu")),
|
| 182 |
+
"status": results.get("status", "success"),
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
# Add preprocessed audio if it was included in results
|
| 186 |
+
if "preprocessed_audio" in results:
|
| 187 |
+
response["preprocessed_audio"] = results["preprocessed_audio"]
|
| 188 |
+
|
| 189 |
+
if "error" in results:
|
| 190 |
+
response["error"] = results["error"]
|
| 191 |
+
logger.error(f"Transcription response with error: {response}")
|
| 192 |
+
return jsonify(response), 500
|
| 193 |
+
|
| 194 |
+
# Print out the complete response for debugging
|
| 195 |
+
logger.info("=== TRANSCRIBE RESPONSE ===")
|
| 196 |
+
# logger.info(f"Full response: {json.dumps(response, indent=2)}")
|
| 197 |
+
logger.info("=== END TRANSCRIBE RESPONSE ===")
|
| 198 |
+
|
| 199 |
+
return jsonify(response)
|
| 200 |
+
# Context manager automatically handles cleanup and status finalization here
|
| 201 |
+
|
| 202 |
+
except Exception as e:
|
| 203 |
+
logger.error(f"Media conversion/transcription error: {str(e)}")
|
| 204 |
+
return jsonify({"error": f"Media processing failed: {str(e)}"}), 500
|
| 205 |
+
|
| 206 |
+
except Exception as e:
|
| 207 |
+
logger.error(f"Transcription error: {str(e)}")
|
| 208 |
+
return jsonify({"error": f"Transcription failed: {str(e)}"}), 500
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
@transcriptions_blueprint.route("/combine-video-subtitles", methods=["POST"])
|
| 212 |
+
def combine_video_subtitles():
|
| 213 |
+
"""Combine video with subtitles using FFmpeg"""
|
| 214 |
+
try:
|
| 215 |
+
# Check if server is busy
|
| 216 |
+
if MediaTranscriptionProcessor.is_server_busy():
|
| 217 |
+
status = MediaTranscriptionProcessor.get_server_status()
|
| 218 |
+
return (
|
| 219 |
+
jsonify(
|
| 220 |
+
{
|
| 221 |
+
"error": "Server is currently processing another request",
|
| 222 |
+
"status": "busy",
|
| 223 |
+
"current_operation": status.get("current_operation"),
|
| 224 |
+
}
|
| 225 |
+
),
|
| 226 |
+
503,
|
| 227 |
+
)
|
| 228 |
+
|
| 229 |
+
# Check required fields
|
| 230 |
+
if "video" not in request.files:
|
| 231 |
+
return jsonify({"error": "No video file provided"}), 400
|
| 232 |
+
|
| 233 |
+
if "subtitles" not in request.form:
|
| 234 |
+
return jsonify({"error": "No subtitles provided"}), 400
|
| 235 |
+
|
| 236 |
+
video_file = request.files["video"]
|
| 237 |
+
subtitles = request.form["subtitles"]
|
| 238 |
+
|
| 239 |
+
if video_file.filename == "":
|
| 240 |
+
return jsonify({"error": "No video file selected"}), 400
|
| 241 |
+
|
| 242 |
+
# Get optional parameters
|
| 243 |
+
subtitle_format = request.form.get("format", "srt") # srt or webvtt
|
| 244 |
+
output_format = request.form.get("output_format", "mp4") # mp4 or mkv
|
| 245 |
+
language = request.form.get("language", "eng")
|
| 246 |
+
|
| 247 |
+
# Mark as busy and start processing
|
| 248 |
+
transcription_status.start_transcription("combine_video", video_file.filename)
|
| 249 |
+
|
| 250 |
+
try:
|
| 251 |
+
transcription_status.update_progress(0.1)
|
| 252 |
+
|
| 253 |
+
# Save the uploaded video file to a temporary location
|
| 254 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(video_file.filename)[1]) as temp_video:
|
| 255 |
+
video_file.save(temp_video.name)
|
| 256 |
+
temp_video_path = temp_video.name
|
| 257 |
+
|
| 258 |
+
transcription_status.update_progress(0.3)
|
| 259 |
+
|
| 260 |
+
try:
|
| 261 |
+
# Combine video with subtitles using video_utils function
|
| 262 |
+
output_path = combine_video_with_subtitles(
|
| 263 |
+
temp_video_path, subtitles, subtitle_format, output_format, language
|
| 264 |
+
)
|
| 265 |
+
|
| 266 |
+
transcription_status.update_progress(0.9)
|
| 267 |
+
|
| 268 |
+
logger.info(f"Video combination completed: {output_path}")
|
| 269 |
+
|
| 270 |
+
# Return the combined video file
|
| 271 |
+
return send_file(
|
| 272 |
+
output_path,
|
| 273 |
+
as_attachment=True,
|
| 274 |
+
download_name=f"{video_file.filename.rsplit('.', 1)[0]}_with_subtitles.{output_format}",
|
| 275 |
+
mimetype=f"video/{output_format}",
|
| 276 |
+
)
|
| 277 |
+
|
| 278 |
+
finally:
|
| 279 |
+
# Clean up temporary video file
|
| 280 |
+
try:
|
| 281 |
+
os.unlink(temp_video_path)
|
| 282 |
+
except OSError:
|
| 283 |
+
pass
|
| 284 |
+
|
| 285 |
+
finally:
|
| 286 |
+
# Mark transcription as finished
|
| 287 |
+
transcription_status.finish_transcription()
|
| 288 |
+
|
| 289 |
+
except Exception as e:
|
| 290 |
+
transcription_status.finish_transcription()
|
| 291 |
+
logger.error(f"Video combination error: {str(e)}")
|
| 292 |
+
return jsonify({"error": f"Video combination failed: {str(e)}"}), 500
|
server/video_utils.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import subprocess
|
| 5 |
+
import tempfile
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def combine_video_with_subtitles(
    video_file_path: str,
    subtitle_content: str,
    subtitle_format: str = "srt",
    output_format: str = "mp4",
    language: str = "eng",
) -> str:
    """
    Combine video file with subtitle content using FFmpeg.

    Args:
        video_file_path: Path to the input video file
        subtitle_content: String content of the subtitles (SRT or WebVTT)
        subtitle_format: Format of subtitles ("srt" or "webvtt")
        output_format: Output container format ("mp4" or "mkv")
        language: Language code for subtitle track

    Returns:
        Path to the output video file with embedded subtitles

    Raises:
        RuntimeError: If FFmpeg is missing or the mux fails.
    """

    # Write subtitle content to a temp file so FFmpeg can consume it.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=f".{subtitle_format}", delete=False
    ) as sub_file:
        sub_file.write(subtitle_content)
        subtitle_file_path = sub_file.name

    # Output sits next to the input, with a descriptive suffix.
    input_path = Path(video_file_path)
    output_path = (
        input_path.parent / f"{input_path.stem}_with_subtitles.{output_format}"
    )

    try:
        if output_format.lower() == "mkv":
            # MKV has better subtitle support: it can carry SRT/WebVTT natively.
            if subtitle_format.lower() == "webvtt":
                codec = "webvtt"
            else:
                codec = "srt"

            cmd = [
                "ffmpeg",
                "-y",  # overwrite output file
                "-i",
                video_file_path,
                "-i",
                subtitle_file_path,
                "-c:v",
                "copy",  # Copy video stream
                "-c:a",
                "copy",  # Copy audio stream
                "-c:s",
                codec,  # Subtitle codec
                "-metadata:s:s:0",
                f"language={language}",
                str(output_path),
            ]
        else:
            # MP4 format: subtitles must be transcoded to mov_text.
            cmd = [
                "ffmpeg",
                "-y",
                "-i",
                video_file_path,
                "-i",
                subtitle_file_path,
                "-c:v",
                "copy",  # Copy video stream
                "-c:a",
                "copy",  # Copy audio stream
                "-c:s:0",
                "mov_text",  # MP4 subtitle format
                "-map",
                "0:v",  # Map video from first input
                "-map",
                # Bug fix: "0:a?" (optional) instead of "0:a" — a hard audio
                # map makes FFmpeg fail outright on videos with no audio track.
                "0:a?",
                "-map",
                "1:s",  # Map subtitles from second input
                "-metadata:s:s:0",
                f"language={language}",
                "-disposition:s:0",
                "default",  # Make subtitles default
                str(output_path),
            ]

        # Execute FFmpeg command.
        logger.info(f"Executing FFmpeg command: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)

        # Log FFmpeg output for debugging.
        if result.stdout:
            logger.debug(f"FFmpeg stdout: {result.stdout}")
        if result.stderr:
            logger.debug(f"FFmpeg stderr: {result.stderr}")

        logger.info(f"FFmpeg completed successfully, output file: {output_path}")

        return str(output_path)

    except subprocess.CalledProcessError as e:
        # Chain the original error so stderr and exit code stay inspectable.
        raise RuntimeError(f"FFmpeg failed: {e.stderr}") from e
    except FileNotFoundError as e:
        raise RuntimeError("FFmpeg not found. Please install FFmpeg.") from e
    finally:
        # Clean up temporary subtitle file.
        try:
            os.unlink(subtitle_file_path)
        except OSError:
            pass
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def check_ffmpeg_available() -> bool:
    """Check if FFmpeg is available on the system."""
    # Probe by invoking "ffmpeg -version"; a missing binary raises
    # FileNotFoundError and a broken install exits non-zero.
    try:
        subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False
    return True
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def extract_audio_from_video(video_file_path: str, output_audio_path: str = None) -> str:
    """
    Extract audio from video file using FFmpeg.

    Args:
        video_file_path: Path to the input video file
        output_audio_path: Path for output audio file (optional; defaults to
            the input path with a .wav suffix)

    Returns:
        Path to the extracted audio file

    Raises:
        RuntimeError: If FFmpeg is missing or extraction fails.
    """
    if not check_ffmpeg_available():
        raise RuntimeError("FFmpeg not found. Please install FFmpeg.")

    # Generate output filename if not provided.
    if output_audio_path is None:
        input_path = Path(video_file_path)
        output_audio_path = str(input_path.with_suffix('.wav'))

    try:
        # FFmpeg command to extract audio:
        # -vn: disable video stream
        # -acodec pcm_s16le: use 16-bit PCM encoding
        # -ar 16000: set sample rate to 16kHz (optimal for speech recognition)
        # -ac 1: mono audio (single channel)
        cmd = [
            "ffmpeg",
            "-i", video_file_path,
            "-vn",  # No video
            "-acodec", "pcm_s16le",  # 16-bit PCM
            "-ar", "16000",  # 16kHz sample rate
            "-ac", "1",  # Mono
            "-y",  # Overwrite output file if it exists
            output_audio_path
        ]

        # check=True raises on non-zero exit; the captured result was unused,
        # so the binding has been dropped.
        subprocess.run(cmd, capture_output=True, text=True, check=True)
        logger.info(f"Audio extracted successfully to: {output_audio_path}")
        return output_audio_path

    except subprocess.CalledProcessError as e:
        # Chain the original error so stderr and exit code stay inspectable.
        raise RuntimeError(f"FFmpeg audio extraction failed: {e.stderr}") from e
    except FileNotFoundError as e:
        raise RuntimeError("FFmpeg not found. Please install FFmpeg.") from e
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def get_video_info(video_file_path: str) -> dict:
    """Get basic information about a video file.

    Runs ffprobe and parses its JSON output; any probe or parse failure
    (missing ffprobe, bad file, malformed output) yields an empty dict.
    """
    probe_cmd = [
        "ffprobe",
        "-v",
        "quiet",
        "-print_format",
        "json",
        "-show_format",
        "-show_streams",
        video_file_path,
    ]

    try:
        completed = subprocess.run(
            probe_cmd, capture_output=True, text=True, check=True
        )
        return json.loads(completed.stdout)
    except (subprocess.CalledProcessError, FileNotFoundError, json.JSONDecodeError):
        return {}
|