NeuralFalcon committed on
Commit
92e075b
·
verified ·
1 Parent(s): 6a0cd27

Upload 21 files

Browse files
app.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import os
4
+ import warnings
5
+ import sys
6
+ import os
7
+ fix_import=f"{os.getcwd()}/server"
8
+ sys.path.append(fix_import)
9
+ from inference.audio_chunker import AudioChunker
10
+ from inference.audio_sentence_alignment import AudioAlignment
11
+ from inference.mms_model_pipeline import MMSModel
12
+ from media_transcription_processor import MediaTranscriptionProcessor
13
+ from subtitle import make_subtitle
14
+ from lang_dict import lang_code
15
+ import download_models
16
+
17
+ # warnings.filterwarnings("ignore", category=UserWarning, module="torchaudio")
18
+ warnings.filterwarnings(
19
+ "ignore",
20
+ message=".*torchaudio.functional._alignment.forced_align.*",
21
+ category=UserWarning
22
+ )
23
+
24
+
25
+ # ---- Setup Model Globals ----
26
+ _model_loaded = False
27
+ _model_loading = False
28
+
29
+ # ---- Initialize model ----
30
def load_model(model_name="omniASR_LLM_1B"):
    """Load the MMS model and its helper singletons once per process.

    Args:
        model_name (str): omniASR model card to load (e.g. "omniASR_LLM_1B").

    Notes:
        Module-level flags make repeated calls no-ops while a load is in
        progress or after it has completed.
    """
    global _model_loaded, _model_loading
    if _model_loaded or _model_loading:
        return

    _model_loading = True
    print(f"🔄 Loading {model_name} model...")
    try:
        # Instantiate the singleton helpers used later by the pipeline.
        AudioChunker()
        AudioAlignment()

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        MMSModel(model_card=model_name, device=device)

        _model_loaded = True
        print("✅ Model loaded successfully.")
    finally:
        # Bug fix: always clear the in-progress flag. Previously an exception
        # during loading left _model_loading=True forever, so every later
        # call returned immediately and the model could never be retried.
        _model_loading = False
48
+
49
+
50
+ # ---- Transcription function ----
51
def media_transcription(file_path, lang_code="eng_Latn"):
    """Transcribe a media file and build subtitle files for it.

    Args:
        file_path: Path to the uploaded/recorded media file.
        lang_code: Language-with-script code (e.g. "eng_Latn").

    Returns:
        Tuple of (transcription text, sentence-level SRT path,
        word-level SRT path, shorts SRT path).
    """
    with open(file_path, "rb") as media_file:
        payload = media_file.read()

    processor = MediaTranscriptionProcessor(
        media_bytes=payload,
        filename=file_path,
        language_with_script=lang_code,
    )
    processor.convert_media()
    processor.transcribe_full_pipeline()
    results = processor.get_results()

    # Reshape aligned segments into the {word, start, end} records that
    # make_subtitle() expects.
    word_level_timestamps = [
        {"word": seg['text'], "start": seg['start'], "end": seg['end']}
        for seg in results.get('aligned_segments', [])
    ]

    sentence_srt, word_level_srt, shorts_srt = make_subtitle(
        word_level_timestamps, file_path
    )
    return results['transcription'], sentence_srt, word_level_srt, shorts_srt
74
+
75
+
76
+
77
def transcribe_interface(audio, selected_lang):
    """Gradio callback: transcribe the given audio and return subtitles.

    Args:
        audio: Filepath supplied by the gr.Audio component (None if empty).
        selected_lang: Display name chosen in the language dropdown.

    Returns:
        Tuple of (transcription text or error message, sentence SRT,
        word-level SRT, shorts SRT) — file slots are None on error.
    """
    if audio is None:
        return "Please upload or record audio.", None, None, None

    file_path = audio

    # Bug fix: the dict lookup used to sit outside the try block, so an
    # unknown dropdown value raised an uncaught KeyError instead of showing
    # a friendly message in the UI.
    find_lang_code = lang_code.get(selected_lang)
    if find_lang_code is None:
        return f"❌ Error: unknown language '{selected_lang}'", None, None, None

    try:
        transcription, sentence_srt, word_level_srt, shorts_srt = media_transcription(file_path, find_lang_code)
        return transcription, sentence_srt, word_level_srt, shorts_srt
    except Exception as e:
        return f"❌ Error: {e}", None, None, None
93
+
94
+
95
+
96
def ui():
    """Build and return the Gradio Blocks interface for the ASR demo.

    Layout: left column takes audio input (mic or upload) plus a language
    dropdown; right column shows the transcription and an accordion with
    the three downloadable subtitle files.
    """
    # Dropdown choices come from the language display names in lang_dict.
    lang_list = list(lang_code.keys())
    custom_css = """.gradio-container { font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, sans-serif; }"""
    with gr.Blocks(theme=gr.themes.Soft(),css=custom_css) as demo:
        gr.HTML("""
        <div style="text-align: center; margin: 20px auto; max-width: 800px;">
        <h1 style="font-size: 2.5em; margin-bottom: 10px;">Meta Omnilingual ASR</h1>
        <a href="https://github.com/NeuralFalconYT/omnilingual-asr-colab" target="_blank" style="display: inline-block; padding: 10px 20px; background-color: #4285F4; color: white; border-radius: 6px; text-decoration: none; font-size: 1em;">😇 Run on Google Colab</a>
        </div>
        """)

        with gr.Row():
            with gr.Column():
                # type="filepath" so the callback receives a path, matching
                # media_transcription's file-based API.
                audio_input = gr.Audio(sources=[ "microphone","upload"], type="filepath", label="🎙 Upload or Record Audio")
                language_dropdown = gr.Dropdown(
                    choices=lang_list,
                    value=lang_list[0],
                    label="🌐 Select Language"
                )
                transcribe_btn = gr.Button("🚀 Transcribe")
            with gr.Column():
                transcription_output = gr.Textbox(label="Transcription", lines=8,show_copy_button=True)
                # Collapsed by default; alignment-derived subtitles are
                # approximate, hence the "(Not Accurate)" label.
                with gr.Accordion("🎬 Subtitle (Not Accurate)", open=False):
                    sentence_srt_out = gr.File(label="Sentence-level Subtitle File")
                    word_srt_out = gr.File(label="Word-level Subtitle File")
                    shorts_srt_out = gr.File(label="Shorts Subtitle File")

        transcribe_btn.click(
            fn=transcribe_interface,
            inputs=[audio_input, language_dropdown],
            outputs=[transcription_output, sentence_srt_out, word_srt_out, shorts_srt_out]
        )

    return demo
130
+
131
+
132
+
133
+
134
import click

@click.command()
@click.option(
    "--debug",
    is_flag=True,
    default=False,
    help="Enable debug mode (shows detailed logs)."
)
@click.option(
    "--share",
    is_flag=True,
    default=False,
    help="Create a public Gradio share link (for Colab or remote usage)."
)
@click.option(
    "--model",
    default="omniASR_LLM_1B",
    type=click.Choice([
        "omniASR_CTC_300M",
        "omniASR_CTC_1B",
        "omniASR_CTC_3B",
        "omniASR_CTC_7B",
        "omniASR_LLM_300M",
        "omniASR_LLM_1B",
        "omniASR_LLM_3B",
        "omniASR_LLM_7B",
        "omniASR_LLM_7B_ZS",
    ]),
    help="Choose the OmniASR model to load."
)
def main(debug, share, model):
    """Universal CLI entry point for omniASR transcription UI."""
    print(f"\n🚀 Starting omniASR UI with model: {model}")
    # Load model weights once before building the interface so the first
    # transcription request does not pay the load cost.
    load_model(model)
    # Build the Blocks UI and launch with a request queue enabled.
    demo = ui()
    demo.queue().launch(share=share, debug=debug)

if __name__ == "__main__":
    main()
178
+
179
+
server/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ models/
server/audio_transcription.py ADDED
@@ -0,0 +1,867 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ # Standard library imports
4
+ import logging
5
+ import os
6
+ import tempfile
7
+ from typing import Dict, List, Optional, Tuple
8
+
9
+ # Third-party imports
10
+ import librosa
11
+ import numpy as np
12
+ import soundfile as sf
13
+ import torch
14
+ import uroman
15
+
16
+ # fairseq2 imports
17
+ from inference.align_utils import get_uroman_tokens
18
+ from inference.audio_chunker import AudioChunker
19
+
20
+ from inference.audio_reading_tools import wav_to_bytes
21
+
22
+ # Import AudioAlignment and its config classes
23
+ from inference.audio_sentence_alignment import AudioAlignment
24
+ from inference.mms_model_pipeline import MMSModel
25
+ from inference.text_normalization import text_normalize
26
+ from transcription_status import transcription_status
27
+ from env_vars import USE_CHUNKING
28
+
29
+ # Constants
30
+ SAMPLE_RATE = 16000
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
def transcribe_single_chunk(audio_tensor: torch.Tensor, sample_rate: int = 16000, language_with_script: str = None):
    """
    Basic transcription pipeline for a single audio chunk using MMS model pipeline.
    This is the lowest-level transcription function that handles individual audio segments.

    Args:
        audio_tensor (torch.Tensor): Audio tensor (1D waveform)
        sample_rate (int): Sample rate of the audio tensor (currently unused
            here; kept for API symmetry with the callers)
        language_with_script (str): language code for transcription
            (3-letter ISO codes like "eng", "spa") with script suffix

    Returns:
        str: Transcribed text ("" when the model produces no output)

    Raises:
        Exception: re-raises any pipeline error after logging it.
    """
    logger.info("Starting complete audio transcription pipeline...")

    try:
        logger.info("Using pipeline transcription...")
        # Use the singleton model instance
        model = MMSModel.get_instance()

        # Transcribe using pipeline - wrap the language code in a list as
        # expected by the batch API.
        lang_list = [language_with_script] if language_with_script else None
        results = model.transcribe_audio(audio_tensor, batch_size=1, language_with_scripts=lang_list)

        # Bug fix: an empty result list previously fell through to
        # str({}) and this function returned the literal string "{}".
        if not results:
            logger.warning("Pipeline returned no results")
            return ""
        result = results[0]

        # Convert pipeline result to plain text regardless of result shape.
        if isinstance(result, dict) and 'text' in result:
            transcription_text = result['text']
        elif isinstance(result, str):
            transcription_text = result
        else:
            transcription_text = str(result)

        if not transcription_text.strip():
            logger.warning("Pipeline returned empty transcription")
            return ""

        logger.info(f"✓ Pipeline transcription successful: '{transcription_text}'")
        return transcription_text

    except Exception as e:
        logger.error(f"Error in transcription pipeline: {str(e)}", exc_info=True)
        raise
81
+
82
+
83
def perform_forced_alignment(
    audio_tensor: torch.Tensor,
    transcription_tokens: List[str],
    device,
    sample_rate: int = 16000,
) -> List[Dict]:
    """
    Perform forced alignment using the AudioAlignment class.

    Romanizes the transcription into character-level tokens, aligns them
    against the audio, then groups the character timings back into
    word-level segments. Falls back to uniform timestamps if alignment
    fails entirely.

    Args:
        audio_tensor (torch.Tensor): Audio tensor (1D waveform)
        transcription_tokens (List[str]): Word tokens from transcription
        device: Device for computation (unused here; kept for API
            compatibility with callers)
        sample_rate (int): Audio sample rate

    Returns:
        List[Dict]: Segments with "text", "start", "end", "duration" keys.
    """
    try:
        logger.info("Starting forced alignment with audio tensor")
        logger.info(f"Audio shape: {audio_tensor.shape}, sample_rate: {sample_rate}")
        logger.info(f"Tokens to align: {transcription_tokens}")

        # Normalize input to a flat float tensor on the CPU.
        if hasattr(audio_tensor, "cpu"):
            alignment_tensor = audio_tensor.float()
        else:
            alignment_tensor = torch.from_numpy(audio_tensor).float()
        if len(alignment_tensor.shape) > 1:
            alignment_tensor = alignment_tensor.flatten()
        # Move to CPU first to avoid CUDA-tensor-to-numpy conversion errors.
        audio_tensor_cpu = alignment_tensor.cpu() if alignment_tensor.is_cuda else alignment_tensor

        # AudioAlignment consumes encoded audio bytes, not raw samples.
        audio_arr = wav_to_bytes(audio_tensor_cpu, sample_rate=sample_rate, format="wav")
        logger.info(f"Converted audio to bytes: {len(audio_arr)} bytes")

        # Bug fix: create the romanizer before the try block below. It is
        # also needed by the per-word boundary loop further down; previously
        # it was only bound inside the try, so if preprocessing failed the
        # later loop raised NameError and alignment degraded to the uniform
        # fallback.
        uroman_instance = uroman.Uroman()

        # Preprocess tokens for the MMS alignment model (same approach as
        # TextRomanizer): normalize, romanize, split into character tokens.
        try:
            transcription_text = " ".join(transcription_tokens)
            normalized_text = text_normalize(transcription_text.strip(), "en")
            uroman_tokens_str = get_uroman_tokens(
                [normalized_text], uroman_instance, "en"
            )[0]
            alignment_tokens = uroman_tokens_str.split()

            logger.info(f"Original text: '{transcription_text}'")
            logger.info(f"Normalized text: '{normalized_text}'")
            logger.info(
                f"Alignment tokens (count={len(alignment_tokens)}): {alignment_tokens[:20]}..."
            )
        except Exception as e:
            logger.warning(
                f"Failed to preprocess tokens with TextRomanizer approach: {e}"
            )
            logger.exception("Full error traceback:")
            # Fallback: simple character-level tokenization (spaces kept).
            transcription_text = " ".join(transcription_tokens).lower()
            alignment_tokens = list(transcription_text)
            logger.info(f"Using fallback character tokens: {alignment_tokens[:20]}...")

        logger.info(
            f"Using {len(alignment_tokens)} alignment tokens for forced alignment"
        )

        alignment = AudioAlignment()

        # Warn about RTL characters (Arabic, Hebrew, ...) — these have
        # triggered LTR assertions in the aligner before.
        rtl_chars = []
        for i, token in enumerate(alignment_tokens):
            for char in str(token):
                if (
                    "\u0590" <= char <= "\u08ff"
                    or "\ufb1d" <= char <= "\ufdff"
                    or "\ufe70" <= char <= "\ufeff"
                ):
                    rtl_chars.append((i, token, char, ord(char)))
        if rtl_chars:
            logger.warning(f"Found RTL characters in tokens: {rtl_chars[:10]}...")

        try:
            audio_segments = alignment.get_one_row_alignments(
                audio_arr, sample_rate, alignment_tokens
            )
        except Exception as alignment_error:
            logger.error(f"Alignment failed with error: {alignment_error}")
            logger.error(f"Error type: {type(alignment_error)}")

            if "ltr" in str(alignment_error).lower():
                logger.error("LTR assertion error detected; retrying with ASCII-only tokens")
                ascii_tokens = []
                for token in alignment_tokens:
                    # Keep only ASCII characters.
                    ascii_token = "".join(c for c in str(token) if ord(c) < 128)
                    if ascii_token:
                        ascii_tokens.append(ascii_token)
                logger.info(
                    f"ASCII tokens (count={len(ascii_tokens)}): {ascii_tokens[:20]}..."
                )
                try:
                    # Bug fix: the retry previously omitted sample_rate
                    # (get_one_row_alignments(audio_arr, ascii_tokens)),
                    # which made the ASCII fallback fail unconditionally.
                    audio_segments = alignment.get_one_row_alignments(
                        audio_arr, sample_rate, ascii_tokens
                    )
                    alignment_tokens = ascii_tokens  # Update for later use
                    logger.info("ASCII fallback successful!")
                except Exception as ascii_error:
                    logger.error(f"ASCII fallback also failed: {ascii_error}")
                    raise alignment_error
            else:
                raise

        logger.info(
            f"Alignment completed, got {len(audio_segments)} character segments"
        )

        if not audio_segments or not transcription_tokens:
            logger.warning("No audio segments or transcription tokens available")
            return []

        # Segment dict keys produced by audio_sentence_alignment.
        start_key, duration_key = "segment_start_sec", "segment_duration"
        last_segment = audio_segments[-1]
        total_audio_duration = last_segment.get(start_key, 0) + last_segment.get(
            duration_key, 0
        )
        logger.info(
            f"Total audio duration from segments: {total_audio_duration:.3f}s"
        )

        # Space-free transcription, used for the proportional fallbacks below.
        transcription_text = "".join(transcription_tokens)

        # Map each word to its span in the romanized token sequence so the
        # character-level timings can be grouped back into words.
        word_boundaries = []
        alignment_pos = 0
        for word in transcription_tokens:
            try:
                normalized_word = text_normalize(word.strip(), "en")
                uroman_word_str = get_uroman_tokens([normalized_word], uroman_instance, "en")[0]
                romanized_word_tokens = uroman_word_str.split()
                word_start = alignment_pos
                word_end = alignment_pos + len(romanized_word_tokens)
            except Exception as e:
                logger.warning(f"Failed to romanize word '{word}': {e}")
                # Fallback: estimate the span from the character-length ratio.
                estimated_length = max(1, int(len(word) * len(alignment_tokens) / len(transcription_text)))
                word_start = alignment_pos
                word_end = min(alignment_pos + estimated_length, len(alignment_tokens))
            word_boundaries.append((word_start, word_end))
            alignment_pos = word_end

        logger.info(f"Word boundaries (romanized): {word_boundaries[:5]}...")
        logger.info(f"Total alignment tokens used: {alignment_pos}/{len(alignment_tokens)}")

        # Convert character-level segments to word-level segments using the
        # actual alignment timings (preserves natural silences).
        aligned_segments = []
        for word_idx, (word, (word_start, word_end)) in enumerate(
            zip(transcription_tokens, word_boundaries)
        ):
            start_idx = max(0, min(word_start, len(audio_segments) - 1))
            end_idx = min(word_end, len(audio_segments))
            word_segments = audio_segments[start_idx:end_idx]

            if word_segments:
                start_times = [seg.get(start_key, 0) for seg in word_segments]
                end_times = [
                    seg.get(start_key, 0) + seg.get(duration_key, 0)
                    for seg in word_segments
                ]
                start_time = min(start_times)
                end_time = max(end_times)
                duration = end_time - start_time

                # Enforce a 50 ms minimum word duration.
                if duration < 0.05:
                    duration = 0.05
                    end_time = start_time + duration
            else:
                logger.warning(
                    f"No segments found for word '{word}' at position {word_start}-{word_end}"
                )
                # Fallback: proportional timing from the word's character span.
                if total_audio_duration > 0 and len(transcription_text) > 0:
                    start_time = (word_start / len(transcription_text)) * total_audio_duration
                    end_time = (word_end / len(transcription_text)) * total_audio_duration
                    duration = end_time - start_time
                else:
                    # Ultimate fallback: fixed half-second slots.
                    duration = 0.5
                    start_time = word_idx * duration
                    end_time = start_time + duration

            aligned_segments.append(
                {
                    "text": word,
                    "start": start_time,
                    "end": end_time,
                    "duration": duration,
                }
            )

        # Fix overlaps only; natural gaps (silence) between words are kept.
        for i in range(1, len(aligned_segments)):
            prev_end = aligned_segments[i - 1]["end"]
            if aligned_segments[i]["start"] < prev_end:
                aligned_segments[i]["start"] = prev_end
                aligned_segments[i]["duration"] = (
                    aligned_segments[i]["end"] - aligned_segments[i]["start"]
                )

        logger.info(f"Forced alignment completed: {len(aligned_segments)} segments")
        return aligned_segments

    except Exception as e:
        logger.error(f"Error in forced alignment: {str(e)}", exc_info=True)

        # Fallback: uniform timestamps spread over the audio duration.
        logger.info("Using fallback uniform timestamps")
        try:
            total_duration = (
                len(audio_tensor) / sample_rate
                if len(audio_tensor) > 0
                else len(transcription_tokens) * 0.5
            )
        except Exception:
            total_duration = len(transcription_tokens) * 0.5

        segment_duration = (
            total_duration / len(transcription_tokens) if transcription_tokens else 1.0
        )
        fallback_segments = [
            {
                "text": token,
                "start": i * segment_duration,
                "end": (i + 1) * segment_duration,
                "duration": segment_duration,
            }
            for i, token in enumerate(transcription_tokens)
        ]
        logger.info(
            f"Using fallback uniform timestamps: {len(fallback_segments)} segments"
        )
        return fallback_segments
505
+
506
+
507
def transcribe_with_word_alignment(audio_tensor: torch.Tensor, sample_rate: int = 16000, language_with_script: str = None) -> Dict:
    """
    Transcription pipeline that adds word-level timing via forced alignment.

    Args:
        audio_tensor (torch.Tensor): Audio tensor (1D waveform)
        sample_rate (int): Sample rate of the audio tensor
        language_with_script (str): language code with script suffix
            (3-letter ISO codes like "eng", "spa")

    Returns:
        Dict with "transcription", "tokens", "aligned_segments" and
        "total_duration" keys; "num_segments" on success, or an
        "alignment_error"/"error" key on failure.
    """
    transcription_text = None
    try:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Get the plain transcription first.
        transcription_text = transcribe_single_chunk(
            audio_tensor, sample_rate=sample_rate, language_with_script=language_with_script
        )

        if not transcription_text:
            return {
                "transcription": "",
                "tokens": [],
                "aligned_segments": [],
                "total_duration": 0.0,
            }

        # Tokenize the transcription for alignment.
        tokens = transcription_text.split()

        logger.info("Performing forced alignment with original audio tensor...")
        aligned_segments = perform_forced_alignment(audio_tensor, tokens, device, sample_rate)

        total_duration = aligned_segments[-1]["end"] if aligned_segments else 0.0

        result = {
            "transcription": transcription_text,
            "tokens": tokens,
            "aligned_segments": aligned_segments,
            "total_duration": total_duration,
            "num_segments": len(aligned_segments),
        }
        logger.info(
            f"Transcription with alignment completed: {len(aligned_segments)} segments, {total_duration:.2f}s total"
        )
        return result

    except Exception as e:
        logger.error(f"Error in transcription with alignment: {str(e)}", exc_info=True)
        # Fall back to plain transcription without timing information.
        try:
            # Improvement: only re-run the expensive model call if the
            # failure happened before transcription completed (previously
            # the model was always invoked a second time here).
            if transcription_text is None:
                transcription_text = transcribe_single_chunk(
                    audio_tensor, sample_rate=sample_rate, language_with_script=language_with_script
                )
            tokens = transcription_text.split() if transcription_text else []

            return {
                "transcription": transcription_text,
                "tokens": tokens,
                "aligned_segments": [],
                "total_duration": 0.0,
                "alignment_error": str(e),
            }
        except Exception as e2:
            logger.error(f"Error in fallback transcription: {str(e2)}", exc_info=True)
            return {
                "transcription": "",
                "tokens": [],
                "aligned_segments": [],
                "total_duration": 0.0,
                "error": str(e2),
            }
582
+
583
+
584
def _validate_and_adjust_segments(
    aligned_segments: List[Dict],
    chunk_start_time: float,
    chunk_audio_tensor: torch.Tensor,
    chunk_sample_rate: int,
    chunk_duration: float,
    chunk_index: int
) -> List[Dict]:
    """
    Private helper function to validate and adjust segment timestamps to global timeline.

    Args:
        aligned_segments: Raw segments from forced alignment (local chunk timeline);
            each is a dict with at least "text", "start" and "end" keys
        chunk_start_time: Start time of this chunk in global timeline
        chunk_audio_tensor: Audio tensor for this chunk (to get actual duration)
        chunk_sample_rate: Sample rate of the chunk
        chunk_duration: Reported duration of the chunk (only used as a fallback
            when the chunk tensor is empty)
        chunk_index: Index of this chunk for debugging

    Returns:
        List of validated segments with global timeline timestamps
    """
    adjusted_segments = []

    # Get the actual audio duration from the chunk tensor instead of the potentially incorrect chunk duration
    actual_chunk_duration = len(chunk_audio_tensor) / chunk_sample_rate if len(chunk_audio_tensor) > 0 else chunk_duration

    for segment in aligned_segments:
        original_start = segment["start"]
        original_end = segment["end"]

        # Validate that segment timestamps are within chunk boundaries
        if original_start < 0:
            logger.warning(
                f"Segment '{segment['text']}' has negative start time {original_start:.3f}s, clipping to 0"
            )
            original_start = 0

        if original_end > actual_chunk_duration + 1.0:  # Allow 1s buffer for alignment errors
            logger.warning(
                f"Segment '{segment['text']}' end time {original_end:.3f}s exceeds actual chunk duration {actual_chunk_duration:.3f}s, clipping"
            )
            original_end = actual_chunk_duration

        if original_start >= original_end:
            logger.warning(
                f"Segment '{segment['text']}' has invalid timing {original_start:.3f}s-{original_end:.3f}s, using fallback"
            )
            # Use proportional timing based on segment position using actual chunk duration.
            # The position within the chunk is how many segments we have already emitted.
            segment_index = len(adjusted_segments)
            total_segments = len(aligned_segments)
            if total_segments > 0:  # always true inside this loop; kept as a defensive guard
                segment_proportion = segment_index / total_segments
                next_proportion = (segment_index + 1) / total_segments
                original_start = segment_proportion * actual_chunk_duration
                original_end = next_proportion * actual_chunk_duration
            else:
                original_start = 0
                original_end = 0.5

        # Create segment with absolute timeline
        adjusted_segment = {
            "text": segment["text"],
            "start": original_start + chunk_start_time,  # Global timeline
            "end": original_end + chunk_start_time,  # Global timeline
            "duration": original_end - original_start,
            "chunk_index": chunk_index,
            "original_start": original_start,  # Local chunk time
            "original_end": original_end,  # Local chunk time
        }

        adjusted_segments.append(adjusted_segment)

        logger.debug(
            f"Segment '{segment['text']}': {original_start:.3f}-{original_end:.3f} -> {adjusted_segment['start']:.3f}-{adjusted_segment['end']:.3f}"
        )

    logger.info(
        f"Adjusted {len(adjusted_segments)} segments to absolute timeline (chunk starts at {chunk_start_time:.2f}s)"
    )

    return adjusted_segments
666
+
667
+
668
def transcribe_full_audio_with_chunking(
    audio_tensor: torch.Tensor, sample_rate: int = 16000, chunk_duration: float = 30.0, language_with_script: str = None, progress_callback=None
) -> Dict:
    """
    Complete audio transcription pipeline that handles any length audio with intelligent chunking.
    This is the full-featured transcription function that can process both short and long audio files.

    Chunking mode is controlled by USE_CHUNKING environment variable:
    - USE_CHUNKING=false: No chunking (single chunk mode)
    - USE_CHUNKING=true (default): VAD-based intelligent chunking

    Args:
        audio_tensor: Audio tensor (1D waveform)
        sample_rate: Sample rate of the audio tensor
        chunk_duration: Target chunk duration in seconds (for static chunking)
        language_with_script: {Language code}_{script} for transcription
        progress_callback: Optional callback for progress updates.
            NOTE(review): currently unused — progress is reported via
            transcription_status.update_progress() instead; confirm intended.

    Returns:
        Dict with full transcription and segment information including word-level timestamps.
        On failure, returns a dict with empty fields and an "error" key instead of raising.
    """

    try:
        logger.info(f"Starting long-form transcription: tensor shape {audio_tensor.shape} at {sample_rate}Hz")
        logger.info(f"USE_CHUNKING = {USE_CHUNKING}")

        # Initialize chunker (singleton — the VAD model is only loaded once)
        chunker = AudioChunker()

        # Determine chunking mode based on USE_CHUNKING setting
        chunking_mode = "vad" if USE_CHUNKING else "none"

        # Chunk the audio using the new unified interface
        # Ensure tensor is 1D before chunking (squeeze any extra dimensions)
        if len(audio_tensor.shape) > 1:
            logger.info(f"Squeezing audio tensor from {audio_tensor.shape} to 1D")
            audio_tensor_1d = audio_tensor.squeeze()
        else:
            audio_tensor_1d = audio_tensor

        chunks = chunker.chunk_audio(audio_tensor_1d, sample_rate=sample_rate, mode=chunking_mode, chunk_duration=chunk_duration)

        if not chunks:
            logger.warning("No audio chunks created")
            return {
                "transcription": "",
                "chunks": [],
                "total_duration": 0.0,
                "error": "No audio content detected",
            }

        logger.info(f"Processing {len(chunks)} audio chunks (mode: {chunking_mode})")

        # Validate chunk continuity (log-only sanity check; nothing is corrected here)
        for i, chunk in enumerate(chunks):
            logger.info(
                f"Chunk {i+1}: {chunk['start_time']:.2f}s - {chunk['end_time']:.2f}s ({chunk['duration']:.2f}s)"
            )
            if i > 0:
                prev_end = chunks[i - 1]["end_time"]
                current_start = chunk["start_time"]
                gap = current_start - prev_end
                if abs(gap) > 0.1:  # More than 100ms gap/overlap
                    logger.warning(
                        f"Gap/overlap between chunks {i} and {i+1}: {gap:.3f}s"
                    )

        # Process each chunk - now all chunks have uniform format!
        all_segments = []
        full_transcription_parts = []
        total_duration = 0.0
        chunk_details = []

        for i, chunk in enumerate(chunks):
            logger.info(
                f"Processing chunk {i+1}/{len(chunks)} ({chunk['duration']:.1f}s, {chunk['start_time']:.1f}s-{chunk['end_time']:.1f}s)"
            )

            try:
                # Process this chunk using tensor-based transcription pipeline
                # Use the chunk's audio_data tensor directly - no more file operations!
                chunk_audio_tensor = chunk["audio_data"]
                chunk_sample_rate = chunk["sample_rate"]

                chunk_result = transcribe_with_word_alignment(
                    audio_tensor=chunk_audio_tensor,
                    sample_rate=chunk_sample_rate,
                    language_with_script=language_with_script
                )

                # Process alignment results - uniform handling for all chunk types
                chunk_segments = []
                chunk_start_time = chunk["start_time"]
                # NOTE(review): this rebinds the `chunk_duration` parameter with the
                # per-chunk duration; the original target value is no longer needed
                # past the chunker call above, but the shadowing is easy to misread.
                chunk_duration = chunk["duration"]

                if chunk_result.get("aligned_segments"):
                    logger.info(
                        f"Chunk {i+1} has {len(chunk_result['aligned_segments'])} segments"
                    )

                    chunk_segments = _validate_and_adjust_segments(
                        aligned_segments=chunk_result["aligned_segments"],
                        chunk_start_time=chunk_start_time,
                        chunk_audio_tensor=chunk_audio_tensor,
                        chunk_sample_rate=chunk_sample_rate,
                        chunk_duration=chunk_duration,
                        chunk_index=i
                    )

                    all_segments.extend(chunk_segments)
                    logger.info(f"Chunk {i+1} processed {len(chunk_segments)} valid segments")

                # Add to full transcription
                chunk_transcription = ""
                if chunk_result.get("transcription"):
                    chunk_transcription = chunk_result["transcription"]
                    full_transcription_parts.append(chunk_transcription)

                # Store detailed chunk information
                chunk_detail = {
                    "chunk_index": i,
                    "start_time": chunk["start_time"],
                    "end_time": chunk["end_time"],
                    "duration": chunk["duration"],
                    "transcription": chunk_transcription,
                    "num_segments": len(chunk_segments),
                    "segments": chunk_segments,
                }
                chunk_details.append(chunk_detail)

                total_duration = max(total_duration, chunk["end_time"])

                # Update progress linearly from 0.1 to 0.9 based on chunk processing
                progress = 0.1 + (0.8 * (i + 1) / len(chunks))
                transcription_status.update_progress(progress)

                logger.info(
                    f"Chunk {i+1} processed: '{chunk_transcription}' ({len(chunk_segments)} segments)"
                )

            except Exception as chunk_error:
                # Per-chunk failures are tolerated: the chunk is simply absent
                # from the combined output.
                logger.error(f"Error processing chunk {i+1}: {chunk_error}")
                # Continue with next chunk

        # Combine results
        full_transcription = " ".join(full_transcription_parts)

        # Validate segment continuity (log-only)
        logger.info("Validating segment continuity...")
        for i in range(1, len(all_segments)):
            prev_end = all_segments[i - 1]["end"]
            current_start = all_segments[i]["start"]
            gap = current_start - prev_end
            if abs(gap) > 1.0:  # More than 1 second gap
                logger.warning(f"Large gap between segments {i-1} and {i}: {gap:.3f}s")

        result = {
            "transcription": full_transcription,
            "aligned_segments": all_segments,
            "chunks": [
                {
                    "chunk_index": chunk_detail["chunk_index"],
                    "start_time": chunk_detail["start_time"],
                    "end_time": chunk_detail["end_time"],
                    "duration": chunk_detail["duration"],
                    "transcription": chunk_detail["transcription"],
                    "num_segments": chunk_detail["num_segments"],
                }
                for chunk_detail in chunk_details
            ],
            "chunk_details": chunk_details,  # Full details including segments per chunk
            "total_duration": total_duration,
            "num_chunks": len(chunks),
            "num_segments": len(all_segments),
            "status": "success",
        }

        logger.info(
            f"Long-form transcription completed: {len(chunks)} chunks, {total_duration:.1f}s total"
        )
        logger.info(f"Total segments: {len(all_segments)}")

        # Log chunk timing summary
        for chunk_detail in chunk_details:
            logger.info(
                f"Chunk {chunk_detail['chunk_index']}: {chunk_detail['start_time']:.2f}-{chunk_detail['end_time']:.2f}s, {chunk_detail['num_segments']} segments"
            )

        return result

    except Exception as e:
        logger.error(f"Error in long-form transcription: {str(e)}", exc_info=True)
        return {
            "transcription": "",
            "chunks": [],
            "total_duration": 0.0,
            "error": str(e),
        }
866
+
867
+
server/convert_media_to_wav.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Media to WAV Converter Module
3
+
4
+ Converts various media formats (m4a, mp3, mp4, etc.) to standardized WAV files
5
+ and PyTorch tensors for audio transcription pipelines.
6
+
7
+ Standardization:
8
+ - 16kHz sample rate
9
+ - Mono channel (merged if multi-channel)
10
+ - Layer normalized
11
+ - bfloat16 dtype tensor
12
+ - Fail-fast error handling
13
+ """
14
+
15
+ import os
16
+ import tempfile
17
+ from pathlib import Path
18
+ from typing import Tuple, Union, Optional
19
+
20
+ import librosa
21
+ import numpy as np
22
+ import soundfile as sf
23
+ import torch
24
+ import torch.nn.functional as F
25
+ from pydub import AudioSegment
26
+ from pydub.utils import which
27
+
28
+
29
+ # Constants
30
+ TARGET_SAMPLE_RATE = 16000
31
+ TARGET_DTYPE = torch.bfloat16
32
+
33
+
34
def verify_ffmpeg_installation():
    """Raise RuntimeError if the ffmpeg binary is not available on PATH."""
    ffmpeg_binary = which("ffmpeg")
    if ffmpeg_binary:
        return
    raise RuntimeError(
        "FFmpeg not found. Please install FFmpeg for media format support. "
        "On Ubuntu: sudo apt install ffmpeg"
    )
41
+
42
+
43
def layer_norm(tensor: torch.Tensor, shape: torch.Size) -> torch.Tensor:
    """Standardize *tensor* to zero mean and (if possible) unit variance.

    Despite the name, this is a global standardization over the whole tensor,
    not a per-feature layer norm; *shape* is accepted for API compatibility
    but unused. When the standard deviation is zero the tensor is only
    mean-centered to avoid division by zero.
    """
    centered = tensor - tensor.mean()
    spread = tensor.std()
    if spread != 0:
        centered = centered / spread
    return centered
51
+
52
+
53
def detect_media_format(file_path: str) -> str:
    """Return a lowercase media format name derived from the file extension."""
    extension = Path(file_path).suffix.lower()

    known_formats = {
        '.wav': 'wav',
        '.mp3': 'mp3',
        '.m4a': 'm4a',
        '.aac': 'aac',
        '.flac': 'flac',
        '.ogg': 'ogg',
        '.wma': 'wma',
        '.mp4': 'mp4',
        '.avi': 'avi',
        '.mov': 'mov',
        '.mkv': 'mkv',
    }

    if extension in known_formats:
        return known_formats[extension]

    # Unknown extension: pass it through without the leading dot and let
    # downstream processing produce a detailed error message if unsupported.
    return extension[1:] if extension.startswith('.') else extension
75
+
76
+
77
def convert_to_wav_with_pydub(input_path: str, output_path: str, format_hint: str = None):
    """Decode *input_path* with pydub (FFmpeg backend) and write it as WAV."""
    verify_ffmpeg_installation()

    # Honour an explicit format hint; otherwise let pydub sniff the container.
    loader_kwargs = {"format": format_hint} if format_hint else {}
    segment = AudioSegment.from_file(input_path, **loader_kwargs)

    # Only the container conversion happens here; resampling/normalization is
    # done later by librosa in process_wav_to_standard_format.
    segment.export(output_path, format="wav")
91
+
92
+
93
def process_wav_to_standard_format(wav_path: str) -> Tuple[np.ndarray, int]:
    """Load a WAV file and return (mono float32 samples at 16 kHz, 16000)."""
    # Load at the file's native rate first; librosa resamples more accurately
    # than pydub, so resampling happens here rather than during export.
    samples, native_sr = librosa.load(wav_path, sr=None)

    if native_sr != TARGET_SAMPLE_RATE:
        samples = librosa.resample(samples, orig_sr=native_sr, target_sr=TARGET_SAMPLE_RATE)

    # Collapse any remaining channel dimension down to mono by averaging.
    if len(samples.shape) > 1:
        samples = np.mean(samples, axis=0)

    # Guarantee a flat float32 array for the tensor-conversion step.
    samples = np.asarray(samples, dtype=np.float32)

    return samples, TARGET_SAMPLE_RATE
111
+
112
+
113
def create_normalized_tensor(audio_data: np.ndarray) -> torch.Tensor:
    """Turn float32 samples into a (1, n) bfloat16 tensor, standardized and
    moved to the GPU when one is available."""
    target_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    tensor = torch.Tensor(audio_data).to(torch.bfloat16)
    tensor = layer_norm(tensor, tensor.shape)

    # Add a leading batch dimension before handing off to the model.
    return tensor.unsqueeze(0).to(target_device)
122
+
123
+
124
def convert_media_to_wav(
    input_path: str,
    output_dir: Optional[str] = None,
    keep_temp_wav: bool = True
) -> Tuple[str, torch.Tensor]:
    """
    Convert media file to standardized WAV file and normalized tensor.

    Args:
        input_path: Path to input media file
        output_dir: Directory for output WAV file (default: temp directory)
        keep_temp_wav: Whether to keep the temporary WAV file.
            NOTE(review): currently unused — the WAV file is always kept;
            confirm whether deletion was intended.

    Returns:
        Tuple of (wav_file_path, normalized_tensor)

    Raises:
        ValueError: If file format is unsupported
        RuntimeError: If FFmpeg is not available
        FileNotFoundError: If input file doesn't exist
    """

    # Validate input file
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path}")

    input_path = os.path.abspath(input_path)

    # Detect format from the file extension
    media_format = detect_media_format(input_path)

    # Setup output path
    if output_dir is None:
        output_dir = tempfile.gettempdir()

    # Create output filename (same stem, "_converted.wav" suffix)
    input_name = Path(input_path).stem
    output_wav_path = os.path.join(output_dir, f"{input_name}_converted.wav")

    # Step 1: Convert to WAV using pydub (handles format conversion)
    if media_format == 'wav':
        # Already WAV, but still process through pydub to normalize format
        convert_to_wav_with_pydub(input_path, output_wav_path, 'wav')
    else:
        # Convert from other format to WAV
        convert_to_wav_with_pydub(input_path, output_wav_path, media_format)

    # Step 2: Process WAV to standard format (16 kHz mono float32) using librosa
    audio_data, sample_rate = process_wav_to_standard_format(output_wav_path)

    # Step 3: Create normalized (1, n) bfloat16 tensor
    normalized_tensor = create_normalized_tensor(audio_data)

    # Step 4: Save the processed audio back to WAV file
    # Overwrite the temp WAV with the processed version
    sf.write(output_wav_path, audio_data, sample_rate)

    return output_wav_path, normalized_tensor
182
+
183
+
184
def convert_media_to_wav_from_bytes(
    media_bytes: bytes,
    original_filename: str,
    output_dir: Optional[str] = None
) -> Tuple[str, torch.Tensor]:
    """
    Convert media from bytes to WAV file and tensor.

    Args:
        media_bytes: Raw media file bytes
        original_filename: Original filename for format detection
        output_dir: Directory for output files

    Returns:
        Tuple of (wav_file_path, normalized_tensor)
    """

    # Persist the bytes so pydub/FFmpeg can read them; keep the original
    # extension because downstream format detection relies on it.
    input_extension = Path(original_filename).suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=input_extension) as temp_input:
        temp_input.write(media_bytes)
        temp_input_path = temp_input.name

    try:
        # Convert using the main function
        wav_path, tensor = convert_media_to_wav(temp_input_path, output_dir)
    finally:
        # Clean up the temporary input file even when conversion raises
        # (the original implementation leaked the file on error).
        os.unlink(temp_input_path)

    return wav_path, tensor
214
+
215
+
216
# Utility function for getting audio info
def get_media_info(file_path: str) -> dict:
    """Return basic container/stream properties for *file_path*."""
    verify_ffmpeg_installation()

    segment = AudioSegment.from_file(file_path)

    info = {
        "duration_seconds": len(segment) / 1000.0,  # pydub lengths are in ms
        "frame_rate": segment.frame_rate,
        "channels": segment.channels,
        "sample_width": segment.sample_width,
        "format": detect_media_format(file_path),
    }
    return info
230
+
231
+
232
if __name__ == "__main__":
    # Example usage
    import sys

    if len(sys.argv) != 2:
        print("Usage: python convert_media_to_wav.py <input_file>")
        sys.exit(1)

    input_file = sys.argv[1]
    print(f"Converting {input_file}...")

    wav_path, tensor = convert_media_to_wav(input_file)
    for label, value in (
        ("WAV file", wav_path),
        ("Tensor shape", tensor.shape),
        ("Tensor dtype", tensor.dtype),
        ("Tensor device", tensor.device),
    ):
        print(f"✓ {label}: {value}")

    # Show media info
    info = get_media_info(input_file)
    print(f"✓ Media info: {info}")
server/download_models.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #@title download model /content/omniasr-transcriptions/server/download_models.sh
2
+
3
+ # %%writefile /content/omniasr-transcriptions/server/download_models.py
4
+ #!/usr/bin/env python3
5
+ """
6
+ download_models.py
7
+ Ensures the MMS model files are downloaded into MODELS_DIR.
8
+ """
9
+
10
+ import os
11
+ import urllib.request
12
+ import urllib.error
13
+ from tqdm.auto import tqdm
14
+ import sys
15
+
16
+
17
def download_file(url: str, download_file_path: str, redownload: bool = False) -> bool:
    """Download a single file with urllib + tqdm progress bar.

    Args:
        url: Source URL.
        download_file_path: Destination path; parent directories are created.
        redownload: Force a fresh download even if the file already exists.

    Returns:
        True on success (or when the file already exists), False on failure.
    """
    base_path = os.path.dirname(download_file_path)
    os.makedirs(base_path, exist_ok=True)

    # Skip if a non-empty file already exists, unless a redownload was forced
    if os.path.exists(download_file_path):
        if redownload:
            os.remove(download_file_path)
            tqdm.write(f"♻️ Redownloading: {os.path.basename(download_file_path)}")
        elif os.path.getsize(download_file_path) > 0:
            tqdm.write(f"✔️ Skipped (already exists): {os.path.basename(download_file_path)}")
            return True

    # Fetch Content-Length for the progress bar; use a context manager so the
    # connection is closed (the original left the response object open).
    try:
        with urllib.request.urlopen(url) as request:
            total = int(request.headers.get("Content-Length", 0))
    except urllib.error.URLError as e:
        print(f"❌ Error: Unable to open URL: {url}")
        print(f"Reason: {e.reason}")
        return False

    # Download with progress bar
    with tqdm(
        total=total,
        desc=os.path.basename(download_file_path),
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
    ) as progress:
        def _report(count, block_size, total_size):
            # Clamp updates so the bar never overshoots `total` on the final
            # (partial) block.
            if total:
                progress.update(min(block_size, max(total - progress.n, 0)))
            else:
                progress.update(block_size)

        try:
            urllib.request.urlretrieve(url, download_file_path, reporthook=_report)
        except urllib.error.URLError as e:
            print(f"❌ Error: Failed to download {url}")
            print(f"Reason: {e.reason}")
            return False

    tqdm.write(f"⬇️ Downloaded: {os.path.basename(download_file_path)}")
    return True
61
+
62
+
63
def main():
    """Ensure both MMS alignment model files exist under MODELS_DIR."""
    # Use MODELS_DIR from environment variable or default
    MODELS_DIR = os.environ.get("MODELS_DIR", "./models")
    print(f"📁 Checking and downloading MMS models to: {MODELS_DIR}")

    # Abort early if we cannot write to the target location
    if not os.access(os.path.dirname(MODELS_DIR) or ".", os.W_OK):
        print(f"✗ No write permission to {MODELS_DIR}")
        sys.exit(1)

    # Remote URLs paired with their local destination paths
    targets = [
        (
            "https://dl.fbaipublicfiles.com/mms/torchaudio/ctc_alignment_mling_uroman/dictionary.txt",
            os.path.join(MODELS_DIR, "ctc_alignment_mling_uroman_model_dict.txt"),
        ),
        (
            "https://dl.fbaipublicfiles.com/mms/torchaudio/ctc_alignment_mling_uroman/model.pt",
            os.path.join(MODELS_DIR, "ctc_alignment_mling_uroman_model.pt"),
        ),
    ]

    for url, full_path in targets:
        if not download_file(url, full_path):
            print(f"✗ Failed to fetch: {os.path.basename(full_path)}")
            sys.exit(1)

    print("✅ All model files are ready!")
88
+
89
# NOTE: main() runs at import time on purpose — app.py does
# `import download_models` to trigger the model download as a side effect.
main()
# if __name__ == "__main__":
#     main()
server/env_vars.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#@title change model name at /content/omniasr-transcriptions/server/env_vars.py
# %%writefile /content/omniasr-transcriptions/server/env_vars.py
"""Central configuration, read from environment variables at import time."""
import logging
import os

# Logging level name, e.g. "DEBUG" or "INFO" (see logging._nameToLevel).
log_level = os.environ.get("API_LOG_LEVEL", "INFO")
# Accept any casing and fall back to INFO for unknown level names; the
# original returned None for invalid values, which breaks any later
# logging call that expects an int level.
API_LOG_LEVEL = logging._nameToLevel.get(log_level.upper(), logging.INFO)

# MMS Model Configuration
MODEL_NAME = os.environ.get("MODEL_NAME", "omniASR_LLM_1B")  # Model name for pipeline

# Audio Processing Configuration
USE_CHUNKING = os.environ.get("USE_CHUNKING", "true").lower() == "true"  # Whether to use audio chunking
server/inference/align_utils.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #@title fix file path /content/omniasr-transcriptions/server/inference/align_utils.py
2
+ # %%writefile /content/omniasr-transcriptions/server/inference/align_utils.py
3
+ import math
4
+ import os
5
+ import re
6
+ import tempfile
7
+ import logging
8
+ from dataclasses import dataclass
9
+
10
+ import torch
11
+ from torchaudio.models import wav2vec2_model
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
# iso codes with specialized rules in uroman
# (order and duplicates preserved from the upstream list; used only for
# membership tests in get_uroman_tokens)
special_isos_uroman = [
    "ara", "bel", "bul", "deu", "ell", "eng", "fas", "grc", "ell", "eng",
    "heb", "kaz", "kir", "lav", "lit", "mkd", "mkd2", "oss", "pnt", "pus",
    "rus", "srp", "srp2", "tur", "uig", "ukr", "yid",
]
20
+
21
+
22
def normalize_uroman(text):
    """Lowercase *text*, strip everything but [a-z' ], collapse space runs."""
    lowered = text.lower()
    letters_only = re.sub("([^a-z' ])", " ", lowered)
    collapsed = re.sub(" +", " ", letters_only)
    return collapsed.strip()
27
+
28
+
29
def get_uroman_tokens(norm_transcripts, uroman, iso=None):
    """Romanize transcripts with uroman and return normalized token strings.

    Args:
        norm_transcripts: List of normalized transcript strings (one per line).
        uroman: Romanizer object exposing ``romanize_file``.
        iso: Optional ISO code; forwarded to uroman only for languages listed
            in ``special_isos_uroman``.

    Returns:
        List of romanized strings (individual characters separated by single
        spaces), same length and order as ``norm_transcripts``.
    """
    # uroman works file-to-file, so round-trip through two temp files.
    # NOTE(review): the files are re-opened by name while still open, which
    # works on POSIX but not on Windows.
    tf = tempfile.NamedTemporaryFile()
    tf2 = tempfile.NamedTemporaryFile()
    with open(tf.name, "w") as f:
        for t in norm_transcripts:
            f.write(t + "\n")
    uroman.romanize_file(
        input_filename=tf.name,
        output_filename=tf2.name,
        lcode=iso if iso in special_isos_uroman else None,
    )
    outtexts = []
    with open(tf2.name) as f:
        for line in f:
            # Space-separate every character, then collapse any runs of
            # whitespace this introduces.
            line = " ".join(line.strip())
            line = re.sub(r"\s+", " ", line).strip()
            outtexts.append(line)
    assert len(outtexts) == len(norm_transcripts)
    uromans = []
    for ot in outtexts:
        uromans.append(normalize_uroman(ot))
    return uromans
51
+
52
+
53
@dataclass
class Segment:
    """A labelled frame interval produced by CTC alignment."""

    label: str
    start: int
    end: int

    def __repr__(self):
        return f"{self.label}: [{self.start:5d}, {self.end:5d})"

    @property
    def length(self):
        # Frame span covered by this segment.
        return self.end - self.start


def merge_repeats(path, idx_to_token_map):
    """Collapse runs of identical frame labels in *path* into Segments."""
    segments = []
    left = 0
    while left < len(path):
        # Advance `right` past the run of frames that share path[left]'s label.
        right = left
        while right < len(path) and path[left] == path[right]:
            right += 1
        segments.append(Segment(idx_to_token_map[path[left]], left, right - 1))
        left = right
    return segments
76
+
77
+
78
def time_to_frame(time):
    """Convert a time in seconds to a frame index at a 20 ms frame stride."""
    frames_per_sec = 1000 / 20  # 20 ms stride => 50 frames per second
    return int(time * frames_per_sec)
82
+
83
+
84
def load_model_dict():
    """Load the MMS forced-alignment wav2vec2 model and its token dictionary.

    Both files are expected under MODELS_DIR (env var, default "./models"):
    ``ctc_alignment_mling_uroman_model.pt`` and
    ``ctc_alignment_mling_uroman_model_dict.txt``.

    Returns:
        Tuple of (wav2vec2 model in eval mode, {token: row_index} dict).

    Raises:
        FileNotFoundError: If either the checkpoint or the dictionary is missing.
    """
    # Use models directory from environment variable
    models_dir = os.environ.get("MODELS_DIR", "./models")
    model_path_name = os.path.join(models_dir, "ctc_alignment_mling_uroman_model.pt")

    logger.info("Loading model from models directory...")
    if not os.path.exists(model_path_name):
        raise FileNotFoundError(f"Model file not found at {model_path_name}")
    logger.info(f"Model found at: {model_path_name}")
    # NOTE(review): torch.load without weights_only=True unpickles arbitrary
    # objects — only point MODELS_DIR at trusted checkpoints.
    state_dict = torch.load(model_path_name, map_location="cpu")

    # Architecture constants mirror the published MMS alignment checkpoint;
    # they must match the state_dict exactly for load_state_dict to succeed.
    model = wav2vec2_model(
        extractor_mode="layer_norm",
        extractor_conv_layer_config=[
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        extractor_conv_bias=True,
        encoder_embed_dim=1024,
        encoder_projection_dropout=0.0,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=24,
        encoder_num_heads=16,
        encoder_attention_dropout=0.0,
        encoder_ff_interm_features=4096,
        encoder_ff_interm_dropout=0.1,
        encoder_dropout=0.0,
        encoder_layer_norm_first=True,
        encoder_layer_drop=0.1,
        aux_num_out=31,
    )
    model.load_state_dict(state_dict)
    model.eval()

    # Use models directory from environment variable
    models_dir = os.environ.get("MODELS_DIR", "./models")
    dict_path_name = os.path.join(
        models_dir, "ctc_alignment_mling_uroman_model_dict.txt"
    )
    if not os.path.exists(dict_path_name):
        raise FileNotFoundError(f"Dictionary file not found at {dict_path_name}")
    logger.info(f"Dictionary found at: {dict_path_name}")
    dictionary = {}
    # One token per line; the line number becomes the token's index.
    with open(dict_path_name) as f:
        dictionary = {l.strip(): i for i, l in enumerate(f.readlines())}

    return model, dictionary
137
+
138
+
139
def get_spans(tokens, segments):
    """Group frame-level CTC segments into one span per uroman token.

    Args:
        tokens: List of uroman tokens, each a space-separated string of
            letters (the output of ``get_uroman_tokens``).
        segments: List of ``Segment`` objects from ``merge_repeats``,
            including "<blank>" separator segments.

    Returns:
        One list of ``Segment`` objects per token, with half of any adjacent
        silence padded onto the span edges (full silence at utterance edges).
    """
    ltr_idx = 0
    tokens_idx = 0
    intervals = []
    start, end = (0, 0)
    sil = "<blank>"
    # Pass 1: walk each token's letters against the segment labels to find
    # the (start, end) segment-index interval covered by every token.
    for seg_idx, seg in enumerate(segments):
        if tokens_idx == len(tokens):
            # All tokens consumed: only a trailing blank may remain.
            assert seg_idx == len(segments) - 1
            assert seg.label == "<blank>"
            continue
        cur_token = tokens[tokens_idx].split(" ")
        ltr = cur_token[ltr_idx]
        if seg.label == "<blank>":
            continue
        assert seg.label == ltr
        if (ltr_idx) == 0:
            start = seg_idx
        if ltr_idx == len(cur_token) - 1:
            # Finished the current token; record its interval.
            ltr_idx = 0
            tokens_idx += 1
            intervals.append((start, seg_idx))
            # Empty tokens are given zero-length intervals at this position.
            while tokens_idx < len(tokens) and len(tokens[tokens_idx]) == 0:
                intervals.append((seg_idx, seg_idx))
                tokens_idx += 1
        else:
            ltr_idx += 1
    # Pass 2: expand intervals into Segment lists, padding with neighbouring
    # silence so the spans cover the audio between tokens as well.
    spans = []
    for idx, (start, end) in enumerate(intervals):
        span = segments[start : end + 1]
        if start > 0:
            prev_seg = segments[start - 1]
            if prev_seg.label == sil:
                # First span takes the whole leading silence; interior spans
                # take only the later half of the shared silence.
                pad_start = (
                    prev_seg.start
                    if (idx == 0)
                    else int((prev_seg.start + prev_seg.end) / 2)
                )
                span = [Segment(sil, pad_start, span[0].start)] + span
        if end + 1 < len(segments):
            next_seg = segments[end + 1]
            if next_seg.label == sil:
                # Last span takes the whole trailing silence; interior spans
                # take only the earlier half.
                pad_end = (
                    next_seg.end
                    if (idx == len(intervals) - 1)
                    else math.floor((next_seg.start + next_seg.end) / 2)
                )
                span = span + [Segment(sil, span[-1].end, pad_end)]
        spans.append(span)
    return spans
server/inference/audio_chunker.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio
3
+ import numpy as np
4
+ import logging
5
+ import tempfile
6
+ import os
7
+ import threading
8
+ from typing import List, Tuple, Dict, Optional, Any
9
+ import silero_vad
10
+ import soundfile as sf
11
+ import librosa
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ TARGET_CHUNK_DURATION = 30.0
16
+ MIN_CHUNK_DURATION = 5.0
17
+ SAMPLE_RATE = 16000
18
+
19
+
20
class AudioChunker:
    """
    Handles audio chunking with different strategies:
    - 'none': Single chunk (no chunking)
    - 'vad': VAD-based intelligent chunking
    - 'static': Fixed-duration time-based chunking

    Implemented as a thread-safe singleton so the Silero VAD model is loaded
    only once per process.
    """

    # Process-wide singleton instance, populated lazily by __new__.
    _instance = None
    # Serializes first-time construction across threads.
    _instance_lock = threading.Lock()
    # Silero VAD model loaded during construction; None if loading failed.
    vad_model: Optional[Any]
31
+
32
+ def __new__(cls):
33
+ if cls._instance is None:
34
+ with cls._instance_lock:
35
+ # Check again after acquiring lock as the value could have been set
36
+ if cls._instance is None:
37
+ cls._instance = super().__new__(cls)
38
+ # Only load VAD model here since this only runs once
39
+ cls._instance.vad_model = cls.load_vad_model()
40
+ return cls._instance
41
+
42
+ @staticmethod
43
+ def load_vad_model():
44
+ """Load silero VAD model with error handling."""
45
+ try:
46
+ logger.info("Loading Silero VAD model...")
47
+ vad_model = silero_vad.load_silero_vad()
48
+ logger.info("✓ VAD model loaded successfully")
49
+ return vad_model
50
+ except Exception as e:
51
+ logger.error(f"Failed to load VAD model: {e}")
52
+ logger.warning("VAD chunking will fall back to time-based chunking")
53
+ return None
54
+
55
    @torch.inference_mode()
    def chunk_audio(self, audio_tensor: torch.Tensor, sample_rate: int = SAMPLE_RATE, mode: str = "vad", chunk_duration: float = 30.0) -> List[Dict]:
        """
        Chunk audio tensor using specified strategy.

        Args:
            audio_tensor: Audio tensor (1D waveform)
            sample_rate: Sample rate of the audio tensor
            mode: Chunking mode - 'none', 'vad', or 'static'
            chunk_duration: Target duration for static chunking (seconds)

        Returns:
            List of chunk info dicts with uniform format:
            - start_time: Start time in seconds
            - end_time: End time in seconds
            - duration: Duration in seconds
            - audio_data: Audio tensor for this chunk
            - sample_rate: Sample rate
            - chunk_index: Index of this chunk

        Note:
            Never raises: any failure (including the input-contract asserts
            and an unknown ``mode``) is caught below and degrades to a single
            chunk covering the whole input.
        """
        logger.info(f"Chunking audio tensor: {audio_tensor.shape} at {sample_rate}Hz (mode: {mode})")

        try:
            # Assert tensor is already 1D (should be preprocessed by MediaTranscriptionProcessor)
            # NOTE(review): asserts are stripped under `python -O`; consider
            # explicit raises if these input contracts must always hold.
            assert len(audio_tensor.shape) == 1, f"Expected 1D audio tensor, got shape {audio_tensor.shape}"

            # Assert sample rate is already 16kHz (should be preprocessed by MediaTranscriptionProcessor)
            assert sample_rate == SAMPLE_RATE, f"Expected {SAMPLE_RATE}Hz sample rate, got {sample_rate}Hz"

            # Route to appropriate chunking strategy
            if mode == "none":
                return self._create_single_chunk(audio_tensor, sample_rate)
            elif mode == "vad":
                if self.vad_model is not None:
                    return self._chunk_with_vad(audio_tensor)
                else:
                    # VAD model failed to load at construction time
                    logger.warning("VAD model not available, falling back to static chunking")
                    return self._chunk_static(audio_tensor, chunk_duration)
            elif mode == "static":
                return self._chunk_static(audio_tensor, chunk_duration)
            else:
                raise ValueError(f"Unknown chunking mode: {mode}")

        except Exception as e:
            logger.error(f"Error chunking audio tensor: {e}")
            # Ultimate fallback to single chunk
            return self._create_single_chunk(audio_tensor, sample_rate)
102
+
103
+ def _create_single_chunk(self, waveform: torch.Tensor, sample_rate: int = SAMPLE_RATE) -> List[Dict]:
104
+ """Create a single chunk containing the entire audio."""
105
+ duration = len(waveform) / sample_rate
106
+
107
+ return [{
108
+ "start_time": 0.0,
109
+ "end_time": duration,
110
+ "duration": duration,
111
+ "audio_data": waveform,
112
+ "sample_rate": sample_rate,
113
+ "chunk_index": 0,
114
+ }]
115
+
116
+ def _chunk_static(self, waveform: torch.Tensor, chunk_duration: float) -> List[Dict]:
117
+ """Create fixed-duration chunks."""
118
+ chunks = []
119
+ total_samples = len(waveform)
120
+ target_samples = int(chunk_duration * SAMPLE_RATE)
121
+
122
+ start_sample = 0
123
+ chunk_idx = 0
124
+
125
+ while start_sample < total_samples:
126
+ end_sample = min(start_sample + target_samples, total_samples)
127
+ chunk_audio = waveform[start_sample:end_sample]
128
+ duration = len(chunk_audio) / SAMPLE_RATE
129
+
130
+ # Only add chunk if it meets minimum duration
131
+ if duration >= MIN_CHUNK_DURATION:
132
+ chunks.append({
133
+ "start_time": start_sample / SAMPLE_RATE,
134
+ "end_time": end_sample / SAMPLE_RATE,
135
+ "duration": duration,
136
+ "audio_data": chunk_audio,
137
+ "sample_rate": SAMPLE_RATE,
138
+ "chunk_index": chunk_idx,
139
+ })
140
+ chunk_idx += 1
141
+
142
+ start_sample = end_sample
143
+
144
+ logger.info(f"Created {len(chunks)} static chunks of ~{chunk_duration}s each")
145
+ return chunks
146
+
147
+ def _chunk_fallback(self, audio_path: str) -> List[Dict]:
148
+ """Ultimate fallback - create single chunk using librosa (for file-based legacy method)."""
149
+ try:
150
+ logger.warning("Using librosa fallback for chunking")
151
+ data, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
152
+ waveform = torch.from_numpy(data)
153
+ return self._create_single_chunk(waveform, SAMPLE_RATE)
154
+ except Exception as e:
155
+ logger.error(f"All chunking methods failed: {e}")
156
+ return []
157
+ def _chunk_with_vad(self, waveform: torch.Tensor) -> List[Dict]:
158
+ """Chunk audio using VAD for speech detection with uniform return format."""
159
+ try:
160
+ # VAD model expects tensor on CPU
161
+ vad_waveform = waveform.cpu() if waveform.is_cuda else waveform
162
+
163
+ # Get speech timestamps using VAD
164
+ speech_timestamps = silero_vad.get_speech_timestamps(
165
+ vad_waveform,
166
+ self.vad_model,
167
+ sampling_rate=SAMPLE_RATE,
168
+ min_speech_duration_ms=500, # Minimum speech segment
169
+ min_silence_duration_ms=300, # Minimum silence to split
170
+ window_size_samples=1536,
171
+ speech_pad_ms=100, # Padding around speech
172
+ )
173
+
174
+ logger.info(f"Found {len(speech_timestamps)} speech segments")
175
+
176
+ # Create chunks based on speech segments and target duration
177
+ # Pass original waveform (with device preserved) to chunk creation
178
+ chunks = self._create_chunks_from_speech_segments(
179
+ waveform, speech_timestamps
180
+ )
181
+
182
+ logger.info(f"Created {len(chunks)} audio chunks using VAD")
183
+ return chunks
184
+
185
+ except Exception as e:
186
+ logger.error(f"VAD chunking failed: {e}")
187
+ return self._chunk_static(waveform, TARGET_CHUNK_DURATION)
188
+ def _create_chunks_from_speech_segments(
189
+ self, waveform: torch.Tensor, speech_segments: List[Dict]
190
+ ) -> List[Dict]:
191
+ """Create chunks that respect speech boundaries and target duration with uniform format."""
192
+ if not speech_segments:
193
+ logger.warning(
194
+ "No speech segments found, falling back to static chunking"
195
+ )
196
+ return self._chunk_static(waveform, TARGET_CHUNK_DURATION)
197
+
198
+ chunks = []
199
+ current_chunk_start = 0
200
+ target_samples = int(TARGET_CHUNK_DURATION * SAMPLE_RATE)
201
+ total_samples = len(waveform)
202
+ chunk_idx = 0
203
+
204
+ while current_chunk_start < total_samples:
205
+ # Calculate target end for this chunk
206
+ target_chunk_end = current_chunk_start + target_samples
207
+
208
+ # If this would be the last chunk or close to it, just take the rest
209
+ if target_chunk_end >= total_samples or (
210
+ total_samples - target_chunk_end
211
+ ) < (target_samples * 0.3):
212
+ chunk_end = total_samples
213
+ else:
214
+ # Find the best place to end this chunk using VAD, but ensure continuous coverage
215
+ chunk_end = self._find_best_chunk_end_continuous(
216
+ speech_segments,
217
+ current_chunk_start,
218
+ target_chunk_end,
219
+ total_samples,
220
+ )
221
+
222
+ # Create chunk with uniform format
223
+ chunk_audio = waveform[current_chunk_start:chunk_end]
224
+ duration = len(chunk_audio) / SAMPLE_RATE
225
+
226
+ chunks.append({
227
+ "start_time": current_chunk_start / SAMPLE_RATE,
228
+ "end_time": chunk_end / SAMPLE_RATE,
229
+ "duration": duration,
230
+ "audio_data": chunk_audio,
231
+ "sample_rate": SAMPLE_RATE,
232
+ "chunk_index": chunk_idx,
233
+ })
234
+
235
+ logger.info(
236
+ f"Created chunk {chunk_idx + 1}: {current_chunk_start/SAMPLE_RATE:.2f}s - {chunk_end/SAMPLE_RATE:.2f}s ({duration:.2f}s)"
237
+ )
238
+ chunk_idx += 1
239
+
240
+ # Move to next chunk - IMPORTANT: start exactly where this chunk ended
241
+ current_chunk_start = chunk_end
242
+
243
+ # Verify total coverage
244
+ total_audio_duration = len(waveform) / SAMPLE_RATE
245
+ total_chunks_duration = sum(chunk["duration"] for chunk in chunks)
246
+ logger.info(
247
+ f"Audio chunking complete: {len(chunks)} chunks covering {total_chunks_duration:.2f}s of {total_audio_duration:.2f}s total audio"
248
+ )
249
+
250
+ if (
251
+ abs(total_chunks_duration - total_audio_duration) > 0.01
252
+ ): # Allow 10ms tolerance
253
+ logger.error(
254
+ f"Duration mismatch: chunks={total_chunks_duration:.2f}s, original={total_audio_duration:.2f}s"
255
+ )
256
+ else:
257
+ logger.info("✓ Perfect audio coverage achieved")
258
+
259
+ return chunks
260
+
261
+ def _find_best_chunk_end_continuous(
262
+ self,
263
+ speech_segments: List[Dict],
264
+ chunk_start: int,
265
+ target_end: int,
266
+ total_samples: int,
267
+ ) -> int:
268
+ """Find the best place to end a chunk while ensuring continuous coverage."""
269
+
270
+ # Don't go beyond the audio
271
+ target_end = min(target_end, total_samples)
272
+
273
+ # Look for a good break point within a reasonable window around target
274
+ search_window = int(SAMPLE_RATE * 3) # 3 second window
275
+ search_start = max(chunk_start, target_end - search_window)
276
+ search_end = min(total_samples, target_end + search_window)
277
+
278
+ best_end = target_end
279
+ best_score = 0
280
+
281
+ # Look for speech segment boundaries within the search window
282
+ for segment in speech_segments:
283
+ segment_start = segment["start"]
284
+ segment_end = segment["end"]
285
+
286
+ # Check if segment end is in our search window
287
+ if search_start <= segment_end <= search_end:
288
+ # Score based on how close to target and if it's a good break point
289
+ distance_score = 1.0 - abs(segment_end - target_end) / search_window
290
+
291
+ # Prefer segment ends (natural pauses)
292
+ boundary_score = 1.0
293
+
294
+ total_score = distance_score * boundary_score
295
+
296
+ if total_score > best_score:
297
+ best_score = total_score
298
+ best_end = segment_end
299
+
300
+ # Ensure we don't go beyond audio bounds
301
+ best_end = min(int(best_end), total_samples)
302
+
303
+ # Ensure we make progress (don't end before we started)
304
+ if best_end <= chunk_start:
305
+ best_end = min(target_end, total_samples)
306
+
307
+ return best_end
308
+
309
+ def _find_best_chunk_end(
310
+ self,
311
+ speech_segments: List[Dict],
312
+ start_idx: int,
313
+ chunk_start: int,
314
+ target_end: int,
315
+ ) -> int:
316
+ """Find the best place to end a chunk (at silence, near target duration)."""
317
+
318
+ best_end = target_end
319
+
320
+ # Look for speech segments that could provide good break points
321
+ for i in range(start_idx, len(speech_segments)):
322
+ segment = speech_segments[i]
323
+ segment_start = segment["start"]
324
+ segment_end = segment["end"]
325
+
326
+ # If segment starts after our target end, use the gap before it
327
+ if segment_start > target_end:
328
+ best_end = min(target_end, segment_start)
329
+ break
330
+
331
+ # If segment ends near our target, use the end of the segment
332
+ if abs(segment_end - target_end) < SAMPLE_RATE * 5: # Within 5 seconds
333
+ best_end = segment_end
334
+ break
335
+
336
+ # If segment extends way past target, look for a good break point
337
+ if segment_end > target_end + SAMPLE_RATE * 10: # 10+ seconds past
338
+ # Try to find a silence gap within the segment or use target
339
+ best_end = target_end
340
+ break
341
+
342
+ return int(best_end)
343
+
344
+ def save_chunk_to_file(self, chunk: Dict, output_path: str) -> str:
345
+ """Save a chunk to a temporary audio file."""
346
+ try:
347
+ # Convert tensor to numpy if needed
348
+ audio_data = chunk["audio_data"]
349
+ if isinstance(audio_data, torch.Tensor):
350
+ # Move to CPU first if on GPU, then convert to numpy
351
+ audio_data = audio_data.cpu().numpy()
352
+
353
+ # Save to file
354
+ sf.write(output_path, audio_data, chunk["sample_rate"])
355
+ return output_path
356
+
357
+ except Exception as e:
358
+ logger.error(f"Failed to save chunk to file: {e}")
359
+ raise
server/inference/audio_reading_tools.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+
3
+ import numpy as np
4
+ import soundfile as sf
5
+ import torch
6
+ from numpy.typing import NDArray
7
+
8
+
9
+ # def wav_to_bytes(
10
+ # wav: torch.Tensor | NDArray, sample_rate: int = 16_000, format: str = "wav"
11
+ # ) -> NDArray[np.int8]:
12
+ # """Convert audio tensor to bytes using soundfile directly."""
13
+ # # Convert to numpy if torch tensor
14
+ # if isinstance(wav, torch.Tensor):
15
+ # if wav.is_cuda:
16
+ # wav = wav.cpu()
17
+ # # Convert to float32 first (numpy doesn't support bfloat16)
18
+ # if wav.dtype != torch.float32:
19
+ # wav = wav.float()
20
+ # wav = wav.numpy()
21
+
22
+ # # Ensure float32 dtype for numpy arrays
23
+ # if wav.dtype != np.float32:
24
+ # wav = wav.astype(np.float32)
25
+
26
+ # # Handle shape: soundfile expects (samples,) for mono or (samples, channels) for multi-channel
27
+ # if wav.ndim == 1:
28
+ # # Already correct shape for mono
29
+ # pass
30
+ # elif wav.ndim == 2:
31
+ # # If shape is (channels, samples), transpose to (samples, channels)
32
+ # if wav.shape[0] < wav.shape[1]:
33
+ # wav = wav.T
34
+
35
+ # # Create buffer and write using soundfile directly
36
+ # buffer = io.BytesIO()
37
+
38
+ # # Map format string to soundfile format
39
+ # sf_format = format.upper() if format.lower() in ['wav', 'flac', 'ogg'] else 'WAV'
40
+ # subtype = 'PCM_16' if sf_format == 'WAV' else None
41
+
42
+ # # Write to buffer
43
+ # sf.write(buffer, wav, sample_rate, format=sf_format, subtype=subtype)
44
+
45
+ # buffer.seek(0)
46
+ # return np.frombuffer(buffer.getvalue(), dtype=np.int8)
47
+ # # return buffer.read()
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
def wav_to_bytes(wav: torch.Tensor | np.ndarray, sample_rate: int = 16000, format: str = "wav"):
    """Encode audio as 16-bit PCM WAV and return it as an int8 numpy buffer.

    Args:
        wav: Audio samples as a torch tensor or numpy array, values in [-1, 1].
        sample_rate: Sample rate written into the WAV header.
        format: Accepted for API compatibility but ignored — output is always
            WAV/PCM_16 (the earlier commented-out implementation honored it).

    Returns:
        np.ndarray of dtype int8 holding the encoded WAV bytes.

    Raises:
        ValueError: If the (squeezed) audio is empty.
    """
    # Torch -> numpy; bfloat16 has no numpy equivalent, so normalize to float32
    # while still a torch tensor.
    if isinstance(wav, torch.Tensor):
        wav = wav.detach().cpu()
        if wav.dtype != torch.float32:
            wav = wav.to(torch.float32)
        wav = wav.numpy()

    # Drop size-1 dims: (1, N) becomes (N,).
    if wav.ndim > 1:
        wav = wav.squeeze()
    # True multi-channel audio survives squeeze as 2-D. soundfile expects
    # (samples, channels), so transpose a channel-major (channels, samples)
    # layout — previously such input reached sf.write in the wrong orientation.
    if wav.ndim == 2 and wav.shape[0] < wav.shape[1]:
        wav = wav.T

    if wav.size == 0:
        raise ValueError("Empty audio segment passed to wav_to_bytes")

    # Clamp to the valid PCM range and scrub NaN/Inf before quantization.
    wav = np.nan_to_num(np.clip(wav.astype(np.float32), -1.0, 1.0))

    buffer = io.BytesIO()
    try:
        sf.write(buffer, wav, sample_rate, format="WAV", subtype="PCM_16")
    except Exception as e:
        print(f"[ERROR] soundfile write failed: {e}")
        raise

    buffer.seek(0)
    return np.frombuffer(buffer.getvalue(), dtype=np.int8)
87
+
88
+
89
+
server/inference/audio_sentence_alignment.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
8
+ import gc
9
+ import io
10
+ import logging
11
+ import threading
12
+ from dataclasses import dataclass
13
+ from typing import Dict, List
14
+
15
+ import torch
16
+ import torchaudio
17
+ import torchaudio.functional as audio_F
18
+
19
+ from .align_utils import get_spans, load_model_dict, merge_repeats, time_to_frame
20
+ from .audio_reading_tools import wav_to_bytes
21
+
22
+ # Global logger for this module
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ @dataclass(kw_only=True)
27
+ class AudioAlignmentConfig:
28
+ model_path_name: str = ""
29
+ emission_interval: int = 30
30
+ audio_format: str = "flac"
31
+ use_star: bool = False
32
+ device: str = "cuda"
33
+
34
+
35
class AudioAlignment:
    """Thread-safe singleton for audio-text alignment.

    Wraps an alignment model (loaded via ``load_model_dict``) and
    torchaudio's ``forced_align`` to cut one utterance into per-token audio
    segments with start/end times.
    """

    _instance = None            # singleton instance
    _lock = threading.Lock()    # guards first-time construction

    # Time scaling factor: strides/offsets are expressed in 1/scale-second
    # units (milliseconds with scale=1000).
    scale: int = 1000

    def __new__(cls):
        if cls._instance is None:
            with cls._lock:
                # Double-check locking pattern
                if cls._instance is None:
                    cls._instance = super(AudioAlignment, cls).__new__(cls)
                    cls._instance._initialize()
        return cls._instance

    def _initialize(self) -> None:
        """Initialize the singleton instance (called only once).

        Loads the alignment model + character dictionary and moves the model
        to CUDA when available, otherwise CPU.
        """
        logger.info("Initializing AudioAlignment model...")

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        config = AudioAlignmentConfig(
            device=str(device),
            use_star=False,  # Set to False for standard alignment
        )

        self.config = config

        # FIXME: pass model name correctly (config.model_path_name is unused)
        logger.info("Loading forced alignment model and dictionary...")
        self.model, self.dictionary = load_model_dict()
        self.device = torch.device(config.device)
        self.model.to(self.device)

        if self.config.use_star:
            # Append a wildcard token at the end of the dictionary.
            self.dictionary["<star>"] = len(self.dictionary)

        # CTC blank index, plus an index -> character reverse lookup.
        self.blank = self.dictionary["<blank>"]
        self.inverse_dictionary = {v: k for k, v in self.dictionary.items()}

        logger.info(
            f"AudioAlignment model loaded successfully on device: {self.device}"
        )

    @torch.inference_mode()
    def generate_emissions(self, waveform: torch.Tensor, reading_sr):
        """Run the model over the waveform in windows; return frame emissions.

        Processes ``emission_interval``-second windows with 10% context on
        each side (the context frames are trimmed back out so consecutive
        windows concatenate seamlessly), then log-softmaxes the result.

        Args:
            waveform: (channels, samples) audio tensor on ``self.device``.
            reading_sr: Sample rate of ``waveform``.

        Returns:
            (emissions, stride): per-frame log-probabilities, and the
            duration of one emission frame in 1/scale-second units.
        """
        emission_interval = self.config.emission_interval
        total_duration = waveform.size(1) / reading_sr

        emissions_arr = []

        i = 0
        while i < total_duration:
            segment_start_time, segment_end_time = (i, i + emission_interval)

            # 10% context on either side stabilizes window-boundary frames.
            context = emission_interval * 0.1
            input_start_time = max(segment_start_time - context, 0)
            input_end_time = min(segment_end_time + context, total_duration)
            waveform_split = waveform[
                :,
                int(reading_sr * input_start_time) : int(reading_sr * (input_end_time)),
            ]

            model_outs, _ = self.model(waveform_split)
            emissions_ = model_outs[0]
            # Trim the context frames from the model output.
            emission_start_frame = time_to_frame(segment_start_time)
            emission_end_frame = time_to_frame(segment_end_time)
            offset = time_to_frame(input_start_time)

            emissions_ = emissions_[
                emission_start_frame - offset : emission_end_frame - offset, :
            ]
            emissions_arr.append(emissions_)
            i += emission_interval

        emissions = torch.cat(emissions_arr, dim=0).squeeze()
        emissions = torch.log_softmax(emissions, dim=-1)

        # Duration of one emission frame, in 1/scale-second units.
        stride = float(waveform.size(1) * self.scale / emissions.size(0) / reading_sr)

        return emissions, stride

    @torch.inference_mode()
    def get_one_row_alignments(
        self, audio_arr, reading_sr, tokens: List[str]
    ) -> List[Dict]:
        """Force-align ``tokens`` against one audio row; slice per token span.

        Args:
            audio_arr: Encoded audio as bytes/bytearray, or any object
                exposing ``tobytes()`` (e.g. a numpy byte array).
            reading_sr: Expected sample rate; asserted against the decoded audio.
            tokens: Token strings; characters absent from the model
                dictionary are silently dropped from the targets.

        Returns:
            One dict per aligned span: start/end/duration in seconds plus the
            segment re-encoded via ``wav_to_bytes``. Empty or unencodable
            segments are skipped rather than failing the whole row.
        """
        # Accept raw bytes directly, or anything convertible via tobytes().
        buffer = audio_arr if isinstance(audio_arr, (bytes, bytearray)) else audio_arr.tobytes()
        waveform, audio_sf = torchaudio.load(io.BytesIO(buffer))
        waveform = waveform.to(self.device)
        assert audio_sf == reading_sr

        emissions, stride = self.generate_emissions(waveform, reading_sr)
        # Segments are sliced on CPU below; free the GPU copy.
        waveform = waveform.cpu()

        if self.config.use_star:
            # Extra zero emission column for the <star> wildcard token.
            T, _ = emissions.size()
            emissions = torch.cat(
                [emissions, torch.zeros(T, 1, device=self.device)], dim=1
            )

        if self.config.use_star:
            tokens = ["<star>"] + tokens

        # Map characters to dictionary indices; unknown characters dropped.
        token_indices = [
            self.dictionary[c]
            for c in " ".join(tokens).split(" ")
            if c in self.dictionary
        ]

        targets = torch.tensor(token_indices, dtype=torch.int32, device=self.device)

        input_lengths = torch.tensor(emissions.shape[0]).unsqueeze(-1)
        target_lengths = torch.tensor(targets.shape[0]).unsqueeze(-1)

        # CTC forced alignment over the full emission sequence.
        path, _ = audio_F.forced_align(
            emissions.unsqueeze(0),
            targets.unsqueeze(0),
            input_lengths,
            target_lengths,
            blank=self.blank,
        )
        path = path.squeeze().to("cpu").tolist()

        # Collapse repeated frame labels, then group segments per token.
        segments = merge_repeats(path, self.inverse_dictionary)

        spans = get_spans(tokens, segments)

        audio_segments = []
        for i, span in enumerate(spans):
            seg_start_idx, seg_end_idx = span[0].start, span[-1].end
            # Frame indices -> seconds (stride is in 1/scale-second units).
            segment_start_sec = seg_start_idx * stride / self.scale
            segment_end_sec = seg_end_idx * stride / self.scale
            start_frame = int(segment_start_sec * reading_sr)
            end_frame = int(segment_end_sec * reading_sr)
            trimmed_waveform = waveform[:, start_frame:end_frame]

            # Skip empty or invalid audio segments rather than failing the row.
            if trimmed_waveform is None or trimmed_waveform.numel() == 0:
                continue

            try:
                audio_bytes = wav_to_bytes(trimmed_waveform, reading_sr, self.config.audio_format)
            except Exception as e:
                # Best-effort: drop segments that cannot be re-encoded.
                continue

            audio_segments.append(
                {
                    "segment_start_sec": segment_start_sec,
                    "segment_end_sec": segment_end_sec,
                    "segment_duration": segment_end_sec - segment_start_sec,
                    "segment_audio_bytes": audio_bytes,
                }
            )

        return audio_segments
218
+
219
+
server/inference/mms_model_pipeline.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #@title fix import and path /content/omniasr-transcriptions/server/inference/mms_model_pipeline.py
2
+ # %%writefile /content/omniasr-transcriptions/server/inference/mms_model_pipeline.py
3
+ """
4
+ Pipeline-based MMS Model using the official MMS library.
5
+ This implementation uses Wav2Vec2LlamaInferencePipeline to avoid Seq2SeqBatch complexity.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import torch
11
+ from typing import List, Dict, Any, Optional
12
+ # from omnilingual_asr.models.inference.pipeline import Wav2Vec2InferencePipeline
13
+ from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline
14
+
15
+ from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs
16
+
17
+ from inference.audio_reading_tools import wav_to_bytes
18
+ from env_vars import MODEL_NAME
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class MMSModel:
    """Singleton wrapper around the official omnilingual-ASR inference pipeline.

    The pipeline is loaded eagerly on first construction; any later
    construction returns the same fully-initialized instance.
    """

    _instance = None      # the singleton instance
    _initialized = False  # guards __init__ from re-running on reuse

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            logger.info("Creating new MMSModel singleton instance")
            cls._instance = super().__new__(cls)
        else:
            logger.info("Using existing MMSModel singleton instance")
        return cls._instance

    def __init__(self, model_card: Optional[str] = None, device=None):
        """
        Initialize the MMS model with the official pipeline.

        Args:
            model_card: Model card to use (omniASR_LLM_1B, omniASR_LLM_300M,
                etc.). If None, uses MODEL_NAME from environment variables.
            device: Device to use (torch.device object, "cuda", "cpu", etc.)
        """
        # __init__ runs on every construction of a singleton — bail out
        # after the first successful initialization.
        if self._initialized:
            return

        self.model_card = model_card or MODEL_NAME
        self.device = device

        # Load eagerly so configuration failures surface at startup.
        self._load_pipeline()

        self._initialized = True

    def _load_pipeline(self):
        """Instantiate the ASR inference pipeline on the configured device."""
        logger.info(f"Loading MMS pipeline: {self.model_card}")
        logger.info(f"Target device: {self.device}")

        # Default the fairseq2 cache to ./models when the env var is unset.
        fairseq2_cache_dir = os.environ.get('FAIRSEQ2_CACHE_DIR', "./models")
        logger.info(f"DEBUG: FAIRSEQ2_CACHE_DIR = {fairseq2_cache_dir}")

        try:
            # str() handles both torch.device objects and plain strings.
            # (The previous hasattr(self.device, 'type') conditional produced
            # the same value on both branches — dead logic, removed.)
            device_str = str(self.device)
            self.pipeline = ASRInferencePipeline(
                model_card=self.model_card,
                device=device_str
            )
            logger.info("✓ MMS pipeline loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load MMS pipeline: {e}")
            raise

    def transcribe_audio(self, audio_tensor: torch.Tensor, batch_size: int = 1, language_with_scripts: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """
        Transcribe audio tensor using the MMS pipeline.

        Args:
            audio_tensor: Audio tensor (1D waveform) to transcribe.
            batch_size: Batch size for processing.
            language_with_scripts: Language codes (3-letter ISO with script)
                for transcription; None enables auto-detection.

        Returns:
            List of transcription results from the pipeline.

        Raises:
            Whatever the pipeline raises on failure (logged and re-raised).
        """
        logger.info(f"Converting tensor (shape: {audio_tensor.shape}) to bytes")
        # GPU tensors must hop to CPU before serialization.
        tensor_cpu = audio_tensor.cpu() if audio_tensor.is_cuda else audio_tensor
        # Encode as 16 kHz WAV bytes, the format the pipeline consumes.
        audio_bytes = wav_to_bytes(tensor_cpu, sample_rate=16000, format="wav")

        logger.info(f"Transcribing audio tensor with batch_size={batch_size}, language_with_scripts={language_with_scripts}")

        try:
            # Pass `lang` only when explicitly provided (None triggers
            # the pipeline's auto-detection path).
            if language_with_scripts is not None:
                transcriptions = self.pipeline.transcribe([audio_bytes], batch_size=batch_size, lang=language_with_scripts)
            else:
                transcriptions = self.pipeline.transcribe([audio_bytes], batch_size=batch_size)

            logger.info("✓ Successfully transcribed audio tensor")
            return transcriptions

        except Exception as e:
            logger.error(f"Transcription failed: {e}")
            raise

    @classmethod
    def get_instance(cls, model_card: Optional[str] = None, device=None):
        """
        Get (or lazily create) the singleton instance of MMSModel.

        Args:
            model_card: Model card to use; None falls back to MODEL_NAME.
            device: Device to use (torch.device object or string).

        Returns:
            MMSModel: The singleton instance.
        """
        if cls._instance is None:
            cls._instance = cls(model_card=model_card, device=device)
        return cls._instance
server/inference/norm_config_module.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # type: ignore
2
+ import os
3
+ import re
4
+
5
# --- Basic ASCII punctuation (regex-escaped where the char is a regex metachar) ---
colon = ":"
comma = ","
exclamation_mark = "!"
period = re.escape(".")
question_mark = re.escape("?")
semicolon = ";"

left_curly_bracket = "{"
right_curly_bracket = "}"
quotation_mark = '"'

basic_punc = (
    period
    + question_mark
    + comma
    + colon
    + exclamation_mark
    + left_curly_bracket
    + right_curly_bracket
)

# General punc unicode block (0x2000-0x206F)
zero_width_space = r"\u200B"
zero_width_nonjoiner = r"\u200C"
left_to_right_mark = r"\u200E"
right_to_left_mark = r"\u200F"
left_to_right_embedding = r"\u202A"
pop_directional_formatting = r"\u202C"

# Commonly ill-typed stand-ins for the apostrophe.
right_single_quotation_mark = r"\u2019"
left_single_quotation_mark = r"\u2018"

# --- Language-specific definitions ---
# Spanish
inverted_exclamation_mark = r"\u00A1"
inverted_question_mark = r"\u00BF"


# Hindi
hindi_danda = "\u0964"

# Egyptian Arabic
arabic_comma = r"\u060C"
arabic_question_mark = r"\u061F"
arabic_semicolon = r"\u061B"
arabic_diacritics = r"\u064B-\u0652"


arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"


# Chinese (fullwidth forms and CJK punctuation)
full_stop = r"\u3002"
full_comma = r"\uFF0C"
full_exclamation_mark = r"\uFF01"
full_question_mark = r"\uFF1F"
full_semicolon = r"\uFF1B"
full_colon = r"\uFF1A"
full_parentheses = r"\uFF08\uFF09"
quotation_mark_horizontal = r"\u300C-\u300F"
quotation_mark_vertical = r"\uFF41-\uFF44"
title_marks = r"\u3008-\u300B"
wavy_low_line = r"\uFE4F"
ellipsis = r"\u22EF"
enumeration_comma = r"\u3001"
hyphenation_point = r"\u2027"
forward_slash = r"\uFF0F"
wavy_dash = r"\uFF5E"
box_drawings_light_horizontal = r"\u2500"
fullwidth_low_line = r"\uFF3F"
chinese_punc = (
    full_stop
    + full_comma
    + full_exclamation_mark
    + full_question_mark
    + full_semicolon
    + full_colon
    + full_parentheses
    + quotation_mark_horizontal
    + quotation_mark_vertical
    + title_marks
    + wavy_low_line
    + ellipsis
    + enumeration_comma
    + hyphenation_point
    + forward_slash
    + wavy_dash
    + box_drawings_light_horizontal
    + fullwidth_low_line
)

# Armenian
armenian_apostrophe = r"\u055A"
emphasis_mark = r"\u055B"
# NOTE: this rebinds the ASCII `exclamation_mark` defined above; basic_punc
# was already built, so only code after this point sees the Armenian mark.
exclamation_mark = r"\u055C"
armenian_comma = r"\u055D"
armenian_question_mark = r"\u055E"
abbreviation_mark = r"\u055F"
armenian_full_stop = r"\u0589"
armenian_punc = (
    armenian_apostrophe
    + emphasis_mark
    + exclamation_mark
    + armenian_comma
    + armenian_question_mark
    + abbreviation_mark
    + armenian_full_stop
)

# HTML entity leftovers to strip from scraped text.
lesser_than_symbol = r"&lt;"
greater_than_symbol = r"&gt;"

lesser_than_sign = r"\u003c"
greater_than_sign = r"\u003e"

nbsp_written_form = r"&nbsp"

# Quotation marks
left_double_quotes = r"\u201c"
right_double_quotes = r"\u201d"
left_double_angle = r"\u00ab"
right_double_angle = r"\u00bb"
left_single_angle = r"\u2039"
right_single_angle = r"\u203a"
low_double_quotes = r"\u201e"
low_single_quotes = r"\u201a"
high_double_quotes = r"\u201f"
high_single_quotes = r"\u201b"

all_punct_quotes = (
    left_double_quotes
    + right_double_quotes
    + left_double_angle
    + right_double_angle
    + left_single_angle
    + right_single_angle
    + low_double_quotes
    + low_single_quotes
    + high_double_quotes
    + high_single_quotes
    + right_single_quotation_mark
    + left_single_quotation_mark
)
# Character class used to normalize in-word curly quotes to an apostrophe.
mapping_quotes = (
    "["
    + high_single_quotes
    + right_single_quotation_mark
    + left_single_quotation_mark
    + "]"
)


# Digit ranges per script (used to build the shared digit character class).
english_digits = r"\u0030-\u0039"
bengali_digits = r"\u09e6-\u09ef"
khmer_digits = r"\u17e0-\u17e9"
devanagari_digits = r"\u0966-\u096f"
oriya_digits = r"\u0b66-\u0b6f"
extended_arabic_indic_digits = r"\u06f0-\u06f9"
kayah_li_digits = r"\ua900-\ua909"
fullwidth_digits = r"\uff10-\uff19"
malayam_digits = r"\u0d66-\u0d6f"
myanmar_digits = r"\u1040-\u1049"
roman_numeral = r"\u2170-\u2179"
nominal_digit_shapes = r"\u206f"

# Load punctuations from MMS-lab data shipped next to this module.
current_dir = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(current_dir, "punctuations.lst"), "r") as punc_f:
    punc_list = punc_f.readlines()

punct_pattern = r""
for punc in punc_list:
    # The first character in each tab-separated line is the punc to remove.
    punct_pattern += re.escape(punc.split("\t")[0])

shared_digits = (
    english_digits
    + bengali_digits
    + khmer_digits
    + devanagari_digits
    + oriya_digits
    + extended_arabic_indic_digits
    + kayah_li_digits
    + fullwidth_digits
    + malayam_digits
    + myanmar_digits
    + roman_numeral
    + nominal_digit_shapes
)

shared_punc_list = (
    basic_punc
    + all_punct_quotes
    + greater_than_sign
    + lesser_than_sign
    + inverted_question_mark
    + full_stop
    + semicolon
    + armenian_punc
    + inverted_exclamation_mark
    + arabic_comma
    + enumeration_comma
    + hindi_danda
    + quotation_mark
    + arabic_semicolon
    + arabic_question_mark
    + chinese_punc
    + punct_pattern
)

# NOTE: the exported name `shared_mappping` is a historical typo — keep it,
# since other modules may import it by this spelling.
shared_mappping = {
    lesser_than_symbol: "",
    greater_than_symbol: "",
    nbsp_written_form: "",
    r"(\S+)" + mapping_quotes + r"(\S+)": r"\1'\2",
}

shared_deletion_list = (
    left_to_right_mark
    + zero_width_nonjoiner
    + arabic_subscript_alef_and_inverted_damma
    + zero_width_space
    + arabic_diacritics
    + pop_directional_formatting
    + right_to_left_mark
    + left_to_right_embedding
)

# Default ("*") normalization config; per-language entries below override it.
norm_config = {
    "*": {
        "lower_case": True,
        "punc_set": shared_punc_list,
        "del_set": shared_deletion_list,
        "mapping": shared_mappping,
        "digit_set": shared_digits,
        "unicode_norm": "NFKC",
        "rm_diacritics": False,
    }
}

# =============== Mongolian ===============#

norm_config["mon"] = norm_config["*"].copy()
# add soft hyphen to punc list to match with fleurs
# (+= on a str value rebinds only this entry; the shared default is untouched)
norm_config["mon"]["del_set"] += r"\u00AD"

norm_config["khk"] = norm_config["mon"].copy()

# =============== Hebrew ===============#

norm_config["heb"] = norm_config["*"].copy()
# add "HEBREW POINT" symbols to match with fleurs
norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"

# =============== Thai ===============#

norm_config["tha"] = norm_config["*"].copy()
# add "Zero width joiner" symbols to match with fleurs
norm_config["tha"]["punc_set"] += r"\u200D"

# =============== Arabic ===============#
norm_config["ara"] = norm_config["*"].copy()
# NOTE(review): .copy() is shallow, so this "mapping" dict is the same object
# as shared_mappping used by every config — this ٱ→ا rule therefore applies to
# all languages, not just Arabic. Confirm whether that is intended.
norm_config["ara"]["mapping"]["ٱ"] = "ا"
norm_config["arb"] = norm_config["ara"].copy()

# =============== Javanese ===============#
norm_config["jav"] = norm_config["*"].copy()
norm_config["jav"]["rm_diacritics"] = True
server/inference/punctuations.lst ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+  7355 INVALID UNICODE 0x81
2
+  5265 INVALID UNICODE 0x90
3
+  75 INVALID UNICODE 0x8
4
+  31 INVALID UNICODE 0x8d
5
+ ” 3 INVALID UNICODE 0x94
6
+  2 INVALID UNICODE 0x8f
7
+  2 INVALID UNICODE 0x1a
8
+  1 INVALID UNICODE 0x9d
9
+ “ 1 INVALID UNICODE 0x93
10
+ ’ 1 INVALID UNICODE 0x92
11
+  8647 INVALID UNICODE 0xe295
12
+  6650 INVALID UNICODE 0xf21d
13
+  6234 INVALID UNICODE 0xf62d
14
+  4815 INVALID UNICODE 0xf173
15
+  4789 INVALID UNICODE 0xe514
16
+  4409 INVALID UNICODE 0xe293
17
+  3881 INVALID UNICODE 0xf523
18
+  3788 INVALID UNICODE 0xe233
19
+  2448 INVALID UNICODE 0xf50f
20
+  2177 INVALID UNICODE 0xe232
21
+  1955 INVALID UNICODE 0xea7b
22
+  1926 INVALID UNICODE 0xf172
23
+  973 INVALID UNICODE 0xe290
24
+  972 INVALID UNICODE 0xf519
25
+  661 INVALID UNICODE 0xe292
26
+  591 INVALID UNICODE 0xe328
27
+  509 INVALID UNICODE 0xe2fa
28
+  458 INVALID UNICODE 0xe234
29
+  446 INVALID UNICODE 0xe043
30
+  419 INVALID UNICODE 0xe040
31
+  399 INVALID UNICODE 0xe2fb
32
+  387 INVALID UNICODE 0xe32b
33
+  381 INVALID UNICODE 0xe236
34
+  374 INVALID UNICODE 0xf511
35
+  314 INVALID UNICODE 0xe517
36
+  296 INVALID UNICODE 0xe2fe
37
+  293 INVALID UNICODE 0xe492
38
+  291 INVALID UNICODE 0xf52d
39
+  289 INVALID UNICODE 0xe2fc
40
+  195 INVALID UNICODE 0xf521
41
+  190 INVALID UNICODE 0xe516
42
+  182 INVALID UNICODE 0xe041
43
+  178 INVALID UNICODE 0xf529
44
+  113 INVALID UNICODE 0xe2f9
45
+  87 INVALID UNICODE 0xe2d9
46
+  78 INVALID UNICODE 0xe32a
47
+  76 INVALID UNICODE 0xe291
48
+  74 INVALID UNICODE 0xe296
49
+  66 INVALID UNICODE 0xe518
50
+  52 INVALID UNICODE 0xe32c
51
+  46 INVALID UNICODE 0xe2db
52
+  41 INVALID UNICODE 0xe231
53
+  34 INVALID UNICODE 0xf522
54
+  33 INVALID UNICODE 0xf518
55
+  32 INVALID UNICODE 0xf513
56
+  27 INVALID UNICODE 0xe32d
57
+  25 INVALID UNICODE 0xe32e
58
+  23 INVALID UNICODE 0xe06b
59
+  15 INVALID UNICODE 0xea01
60
+  12 INVALID UNICODE 0xe294
61
+  11 INVALID UNICODE 0xe203
62
+  8 INVALID UNICODE 0xf218
63
+  7 INVALID UNICODE 0xe070
64
+  7 INVALID UNICODE 0xe013
65
+  5 INVALID UNICODE 0xe2de
66
+  4 INVALID UNICODE 0xe493
67
+  3 INVALID UNICODE 0xf7e8
68
+  3 INVALID UNICODE 0xf7d0
69
+  3 INVALID UNICODE 0xe313
70
+  2 INVALID UNICODE 0xe329
71
+  2 INVALID UNICODE 0xe06d
72
+  2 INVALID UNICODE 0xe003
73
+  1 INVALID UNICODE 0xf50e
74
+  1 INVALID UNICODE 0xf171
75
+  1 INVALID UNICODE 0xe01d
76
+  71 NOMINAL DIGIT SHAPES 0x206f
77
+ ⁠ 3 WORD JOINER 0x2060
78
+ ― 126545 HORIZONTAL BAR 0x2015
79
+ ־ 1028 HEBREW PUNCTUATION MAQAF 0x5be
80
+ ) 98429 RIGHT PARENTHESIS 0x29
81
+ ] 27108 RIGHT SQUARE BRACKET 0x5d
82
+ ⌋ 1567 RIGHT FLOOR 0x230b
83
+ 〕 97 RIGHT TORTOISE SHELL BRACKET 0x3015
84
+ 】 36 RIGHT BLACK LENTICULAR BRACKET 0x3011
85
+ ﴾ 14 ORNATE LEFT PARENTHESIS 0xfd3e
86
+ & 170517 AMPERSAND 0x26
87
+ ། 106330 TIBETAN MARK SHAD 0xf0d
88
+ ። 90203 ETHIOPIC FULL STOP 0x1362
89
+ ፥ 60484 ETHIOPIC COLON 0x1365
90
+ ༌ 60464 TIBETAN MARK DELIMITER TSHEG BSTAR 0xf0c
91
+ ။ 51567 MYANMAR SIGN SECTION 0x104b
92
+ / 46929 SOLIDUS 0x2f
93
+ ၊ 38042 MYANMAR SIGN LITTLE SECTION 0x104a
94
+ · 37985 MIDDLE DOT 0xb7
95
+ ‸ 36310 CARET 0x2038
96
+ * 34793 ASTERISK 0x2a
97
+ ۔ 32432 ARABIC FULL STOP 0x6d4
98
+ ፤ 31906 ETHIOPIC SEMICOLON 0x1364
99
+ ၏ 21519 MYANMAR SYMBOL GENITIVE 0x104f
100
+ ។ 20834 KHMER SIGN KHAN 0x17d4
101
+ ꓾ 15773 LISU PUNCTUATION COMMA 0xa4fe
102
+ ᙮ 13473 CANADIAN SYLLABICS FULL STOP 0x166e
103
+ ꤯ 12892 KAYAH LI SIGN SHYA 0xa92f
104
+ ⵰ 11478 TIFINAGH SEPARATOR MARK 0x2d70
105
+ ꓿ 11118 LISU PUNCTUATION FULL STOP 0xa4ff
106
+ ॥ 10763 DEVANAGARI DOUBLE DANDA 0x965
107
+ ؞ 10403 ARABIC TRIPLE DOT PUNCTUATION MARK 0x61e
108
+ ၍ 8936 MYANMAR SYMBOL COMPLETED 0x104d
109
+ · 8431 GREEK ANO TELEIA 0x387
110
+ † 7477 DAGGER 0x2020
111
+ ၌ 6632 MYANMAR SYMBOL LOCATIVE 0x104c
112
+ ፣ 5719 ETHIOPIC COMMA 0x1363
113
+ ៖ 5528 KHMER SIGN CAMNUC PII KUUH 0x17d6
114
+ ꤮ 4791 KAYAH LI SIGN CWI 0xa92e
115
+ ※ 3439 REFERENCE MARK 0x203b
116
+ ፦ 2727 ETHIOPIC PREFACE COLON 0x1366
117
+ • 1749 BULLET 0x2022
118
+ ¶ 1507 PILCROW SIGN 0xb6
119
+ ၎ 1386 MYANMAR SYMBOL AFOREMENTIONED 0x104e
120
+ ﹖ 1224 SMALL QUESTION MARK 0xfe56
121
+ ; 975 GREEK QUESTION MARK 0x37e
122
+ … 827 HORIZONTAL ELLIPSIS 0x2026
123
+ % 617 PERCENT SIGN 0x25
124
+ ・ 468 KATAKANA MIDDLE DOT 0x30fb
125
+ ༎ 306 TIBETAN MARK NYIS SHAD 0xf0e
126
+ ‡ 140 DOUBLE DAGGER 0x2021
127
+ # 137 NUMBER SIGN 0x23
128
+ @ 125 COMMERCIAL AT 0x40
129
+ ፡ 121 ETHIOPIC WORDSPACE 0x1361
130
+ ៚ 55 KHMER SIGN KOOMUUT 0x17da
131
+ ៕ 49 KHMER SIGN BARIYOOSAN 0x17d5
132
+ ﹐ 10 SMALL COMMA 0xfe50
133
+ ༅ 6 TIBETAN MARK CLOSING YIG MGO SGAB MA 0xf05
134
+ ༄ 6 TIBETAN MARK INITIAL YIG MGO MDUN MA 0xf04
135
+ . 2 FULLWIDTH FULL STOP 0xff0e
136
+ ﹗ 2 SMALL EXCLAMATION MARK 0xfe57
137
+ ﹕ 2 SMALL COLON 0xfe55
138
+ ‰ 2 PER MILLE SIGN 0x2030
139
+ ・ 1 HALFWIDTH KATAKANA MIDDLE DOT 0xff65
140
+ ( 98504 LEFT PARENTHESIS 0x28
141
+ [ 27245 LEFT SQUARE BRACKET 0x5b
142
+ ⌊ 1567 LEFT FLOOR 0x230a
143
+ 〔 95 LEFT TORTOISE SHELL BRACKET 0x3014
144
+ 【 36 LEFT BLACK LENTICULAR BRACKET 0x3010
145
+ ﴿ 14 ORNATE RIGHT PARENTHESIS 0xfd3f
146
+ _ 4851 LOW LINE 0x5f
147
+ $ 72 DOLLAR SIGN 0x24
148
+ € 14 EURO SIGN 0x20ac
149
+ £ 2 POUND SIGN 0xa3
150
+ ~ 27462 TILDE 0x7e
151
+ = 11450 EQUALS SIGN 0x3d
152
+ | 8430 VERTICAL LINE 0x7c
153
+ − 3971 MINUS SIGN 0x2212
154
+ ≫ 1904 MUCH GREATER-THAN 0x226b
155
+ ≪ 1903 MUCH LESS-THAN 0x226a
156
+ + 1450 PLUS SIGN 0x2b
157
+ < 345 FULLWIDTH LESS-THAN SIGN 0xff1c
158
+ > 344 FULLWIDTH GREATER-THAN SIGN 0xff1e
159
+ ¬ 5 NOT SIGN 0xac
160
+ × 4 MULTIPLICATION SIGN 0xd7
161
+ → 2 RIGHTWARDS ARROW 0x2192
162
+ ᙭ 537 CANADIAN SYLLABICS CHI SIGN 0x166d
163
+ ° 499 DEGREE SIGN 0xb0
164
+ ႟ 421 MYANMAR SYMBOL SHAN EXCLAMATION 0x109f
165
+ � 192 REPLACEMENT CHARACTER 0xfffd
166
+ ⌟ 54 BOTTOM RIGHT CORNER 0x231f
167
+ ⌞ 54 BOTTOM LEFT CORNER 0x231e
168
+ © 2 COPYRIGHT SIGN 0xa9
169
+   40 NARROW NO-BREAK SPACE 0x202f
170
+   1 SIX-PER-EM SPACE 0x2006
171
+ ˜ 40261 SMALL TILDE 0x2dc
172
+ ^ 6469 CIRCUMFLEX ACCENT 0x5e
173
+ ¯ 20 MACRON 0xaf
174
+ ˇ 191442 CARON 0x2c7
175
+ ⁿ 38144 SUPERSCRIPT LATIN SMALL LETTER N 0x207f
176
+ ـ 9440 ARABIC TATWEEL 0x640
177
+ ๆ 6766 THAI CHARACTER MAIYAMOK 0xe46
178
+ ៗ 3310 KHMER SIGN LEK TOO 0x17d7
179
+ 々 678 IDEOGRAPHIC ITERATION MARK 0x3005
180
+ ໆ 430 LAO KO LA 0xec6
181
+ ー 319 KATAKANA-HIRAGANA PROLONGED SOUND MARK 0x30fc
182
+ ⁱ 137 SUPERSCRIPT LATIN SMALL LETTER I 0x2071
183
+ ৷ 11056 BENGALI CURRENCY NUMERATOR FOUR 0x9f7
184
+ ⅓ 26 VULGAR FRACTION ONE THIRD 0x2153
185
+ ½ 26 VULGAR FRACTION ONE HALF 0xbd
186
+ ¼ 4 VULGAR FRACTION ONE QUARTER 0xbc
187
+ ⅟ 1 FRACTION NUMERATOR ONE 0x215f
188
+ ⁄ 57 FRACTION SLASH 0x2044
server/inference/text_normalization.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import unicodedata
3
+
4
+ from . import norm_config_module
5
+
6
+ norm_config = norm_config_module.norm_config # type: ignore
7
+
8
+
9
def text_normalize(
    text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False
):
    """Normalize *text* for the language identified by *iso_code*.

    The pipeline, in order: Unicode normalization, optional lower-casing,
    bracket stripping, regex mappings, replacing punctuation with spaces,
    deleting characters in the language's delete set, removing digit-only
    words, optional diacritic removal, and whitespace collapsing.

    Args:
        text: The string to be normalized.
        iso_code: ISO language code used to look up the per-language
            config in ``norm_config``; unknown codes fall back to the
            ``"*"`` (default) config.
        lower_case: If False, skip lower-casing even when the language
            config requests it.
        remove_numbers: If True, remove words that consist only of digits.
        remove_brackets: If True, remove every parenthesized span.
            Spans containing a digit are always removed regardless
            (usually references such as "(Sam 23:17)").

    Returns:
        The normalized string.
    """
    # BUG FIX: the original fetched the per-language dict and then wrote
    # missing defaults back into it (`config[field] = ...`), silently
    # mutating the shared norm_config entries across calls. Merge over the
    # defaults into a fresh dict instead; this also supplies every default
    # field (including "rm_diacritics", which the original never backfilled
    # and would KeyError on if a language config lacked it).
    config = {**norm_config["*"], **norm_config.get(iso_code, {})}

    text = unicodedata.normalize(config["unicode_norm"], text)

    # Convert to lower case only when both the config and the caller allow it.
    if config["lower_case"] and lower_case:
        text = text.lower()

    # Always remove text inside brackets that contains a digit; this usually
    # corresponds to verse references such as "(Sam 23:17)".
    text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
    if remove_brackets:
        text = re.sub(r"\([^\)]*\)", " ", text)

    # Apply regex mappings (keys are patterns, values are replacements).
    for pattern, replacement in config["mapping"].items():
        text = re.sub(pattern, replacement, text)

    # Replace punctuation with spaces.
    punct_pattern = r"[" + config["punc_set"] + "]"
    normalized_text = re.sub(punct_pattern, " ", text)

    # Delete characters in the delete set outright (no space left behind).
    delete_pattern = r"[" + config["del_set"] + "]"
    normalized_text = re.sub(delete_pattern, "", normalized_text)

    # Remove words containing only digits.
    # Three cases: (a) the text starts with a number, (b) a number appears in
    # the middle, (c) the text ends with a number. Lookaround patterns check
    # the surrounding whitespace without consuming it, so overlapping matches
    # can all be replaced.
    if remove_numbers:
        digits_pattern = "[" + config["digit_set"] + "]+"
        complete_digit_pattern = (
            r"^"
            + digits_pattern
            + r"(?=\s)|(?<=\s)"
            + digits_pattern
            + r"(?=\s)|(?<=\s)"
            + digits_pattern
            + "$"
        )
        normalized_text = re.sub(complete_digit_pattern, " ", normalized_text)

    # config.get keeps languages whose config predates the flag working.
    if config.get("rm_diacritics", False):
        # Lazy import: unidecode is an optional dependency, only needed for
        # languages that strip diacritics (e.g. Javanese).
        from unidecode import unidecode

        normalized_text = unidecode(normalized_text)

    # Collapse runs of whitespace and trim the ends.
    normalized_text = re.sub(r"\s+", " ", normalized_text).strip()

    return normalized_text
server/lang_dict.py ADDED
@@ -0,0 +1,1675 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ lang_code={
2
+ "English (Latin)": "eng_Latn",
3
+ "Hindi (Devanagari)": "hin_Deva",
4
+ "Bengali (Bengali)": "ben_Beng",
5
+ "Arbëreshë Albanian (Latin)": "aae_Latn",
6
+ "Afade (Latin)": "aal_Latn",
7
+ "Abung (Latin)": "abb_Latn",
8
+ "Abidji (Latin)": "abi_Latn",
9
+ "Abkhazian (Cyrillic)": "abk_Cyrl",
10
+ "Abua (Latin)": "abn_Latn",
11
+ "Abellen Ayta (Latin)": "abp_Latn",
12
+ "Abron (Latin)": "abr_Latn",
13
+ "Ambulas (Latin)": "abs_Latn",
14
+ "Achagua (Latin)": "aca_Latn",
15
+ "Gikyode (Latin)": "acd_Latn",
16
+ "Achinese (Latin)": "ace_Latn",
17
+ "Saint Lucian Creole French (Latin)": "acf_Latn",
18
+ "Acholi (Latin)": "ach_Latn",
19
+ "Iraqi Arabic (Arabic)": "acm_Arab",
20
+ "Achang (Latin)": "acn_Latn",
21
+ "Achi (Latin)": "acr_Latn",
22
+ "Achuar-Shiwiar (Latin)": "acu_Latn",
23
+ "Hijazi Arabic (Arabic)": "acw_Arab",
24
+ "Adele (Latin)": "ade_Latn",
25
+ "Adhola (Latin)": "adh_Latn",
26
+ "Adioukrou (Latin)": "adj_Latn",
27
+ "Amdo Tibetan (Tibetan)": "adx_Tibt",
28
+ "Adyghe (Cyrillic)": "ady_Cyrl",
29
+ "Tunisian Arabic (Arabic)": "aeb_Arab",
30
+ "Saidi Arabic (Arabic)": "aec_Arab",
31
+ "Arem (Latin)": "aeu_Latn",
32
+ "Gulf Arabic (Arabic)": "afb_Arab",
33
+ "Eloyi (Latin)": "afo_Latn",
34
+ "Afrikaans (Latin)": "afr_Latn",
35
+ "Agarabi (Latin)": "agd_Latn",
36
+ "Angor (Latin)": "agg_Latn",
37
+ "Agariya (Latin)": "agn_Latn",
38
+ "Aguaruna (Latin)": "agr_Latn",
39
+ "Aguacateco (Latin)": "agu_Latn",
40
+ "Agul (Cyrillic)": "agx_Cyrl",
41
+ "Ahanta (Latin)": "aha_Latn",
42
+ "Akha (Latin)": "ahk_Latn",
43
+ "Igo (Latin)": "ahl_Latn",
44
+ "Arosi (Latin)": "ahs_Latn",
45
+ "Arosi (Latin)": "aia_Latn",
46
+ "Aja (Benin) (Latin)": "ajg_Latn",
47
+ "Akan (Latin)": "aka_Latn",
48
+ "Batak Angkola (Latin)": "akb_Latn",
49
+ "Akawaio (Latin)": "ake_Latn",
50
+ "Akpes (Latin)": "akp_Latn",
51
+ "Alago (Latin)": "ala_Latn",
52
+ "Alangan (Latin)": "alj_Latn",
53
+ "Gheg Albanian (Latin)": "aln_Latn",
54
+ "Larike-Wakasihu (Latin)": "alo_Latn",
55
+ "Alune (Latin)": "alp_Latn",
56
+ "Tosk Albanian (Latin)": "als_Latn",
57
+ "Southern Altai (Cyrillic)": "alt_Cyrl",
58
+ "Alur (Latin)": "alz_Latn",
59
+ "Amarasi (Latin)": "ame_Latn",
60
+ "Hamer-Banna (Latin)": "amf_Latn",
61
+ "Amharic (Ethiopic)": "amh_Ethi",
62
+ "Amis (Latin)": "ami_Latn",
63
+ "Amo (Latin)": "amk_Latn",
64
+ "Amanab (Latin)": "amu_Latn",
65
+ "Ngas (Latin)": "anc_Latn",
66
+ "Goemai (Latin)": "ank_Latn",
67
+ "Obolo (Latin)": "ann_Latn",
68
+ "Angika (Devanagari)": "anp_Deva",
69
+ "Anaang (Latin)": "anw_Latn",
70
+ "Anyin (Latin)": "any_Latn",
71
+ "A'ou (Latin)": "aom_Latn",
72
+ "Uab Meto (Latin)": "aoz_Latn",
73
+ "Sa'a (Latin)": "apb_Latn",
74
+ "North Levantine Arabic (Arabic)": "apc_Arab",
75
+ "Sudanese Arabic (Arabic)": "apd_Arab",
76
+ "A-Pucikwar (Latin)": "apr_Latn",
77
+ "Standard Arabic (Arabic)": "arb_Arab",
78
+ "Aragonese (Latin)": "arg_Latn",
79
+ "Arhâ (Latin)": "arl_Latn",
80
+ "Algerian Arabic (Arabic)": "arq_Arab",
81
+ "Najdi Arabic (Arabic)": "ars_Arab",
82
+ "Moroccan Arabic (Arabic)": "ary_Arab",
83
+ "Egyptian Arabic (Arabic)": "arz_Arab",
84
+ "Asu (Tanzania) (Latin)": "asa_Latn",
85
+ "Cishingini (Latin)": "asg_Latn",
86
+ "Assamese (Bengali)": "asm_Beng",
87
+ "Asturian (Latin)": "ast_Latn",
88
+ "Ata (Latin)": "ata_Latn",
89
+ "Atsi (Latin)": "atb_Latn",
90
+ "Atong (India) (Latin)": "atg_Latn",
91
+ "Ivbie North-Okpela-Arhe (Latin)": "ati_Latn",
92
+ "Atikamekw (Latin)": "atq_Latn",
93
+ "Avaric (Cyrillic)": "ava_Cyrl",
94
+ "Avikam (Latin)": "avn_Latn",
95
+ "Avokaya (Latin)": "avu_Latn",
96
+ "Awadhi (Devanagari)": "awa_Deva",
97
+ "Awa-Cuaiquer (Latin)": "awb_Latn",
98
+ "Arawum (Latin)": "awo_Latn",
99
+ "South Levantine Arabic (Arabic)": "ayl_Arab",
100
+ "Ayizo Gbe (Latin)": "ayo_Latn",
101
+ "North Mesopotamian Arabic (Arabic)": "ayp_Arab",
102
+ "Aymara (Latin)": "ayr_Latn",
103
+ "Mai Brat (Latin)": "ayz_Latn",
104
+ "Azerbaijani (Arabic)": "aze_Arab",
105
+ "Azerbaijani (Cyrillic)": "aze_Cyrl",
106
+ "Azerbaijani (Latin)": "aze_Latn",
107
+ "Ambele (Latin)": "azg_Latn",
108
+ "Highland Oaxaca Chontal (Latin)": "azz_Latn",
109
+ "Bagheli (Latin)": "bag_Latn",
110
+ "Bashkir (Cyrillic)": "bak_Cyrl",
111
+ "Bambara (Latin)": "bam_Latn",
112
+ "Balinese (Latin)": "ban_Latn",
113
+ "Waimaha (Latin)": "bao_Latn",
114
+ "Basa (Cameroon) (Latin)": "bas_Latn",
115
+ "Vengo (Latin)": "bav_Latn",
116
+ "Bambili-Bambui (Latin)": "bax_Latn",
117
+ "Barai (Latin)": "bba_Latn",
118
+ "Baeggu (Latin)": "bbb_Latn",
119
+ "Batak Toba (Latin)": "bbc_Latn",
120
+ "Ghomálá' (Latin)": "bbj_Latn",
121
+ "Babanki (Georgian)": "bbl_Geor",
122
+ "Northern Bobo Madaré (Latin)": "bbo_Latn",
123
+ "Kulung (Nigeria) (Latin)": "bbu_Latn",
124
+ "Southern Balochi (Arabic)": "bcc_Arab",
125
+ "Southern Balochi (Latin)": "bcc_Latn",
126
+ "Bainouk-Samik (Latin)": "bce_Latn",
127
+ "Baoulé (Latin)": "bci_Latn",
128
+ "Central Bikol (Latin)": "bcl_Latn",
129
+ "Bainouk-Gunyaamolo (Latin)": "bcs_Latn",
130
+ "Bana (Latin)": "bcw_Latn",
131
+ "Bannoni (Latin)": "bcy_Latn",
132
+ "Bainouk-Gunyaamolo (Latin)": "bcz_Latn",
133
+ "Bai (Latin)": "bda_Latn",
134
+ "Bade (Latin)": "bde_Latn",
135
+ "Balesin-Bisaya (Latin)": "bdg_Latn",
136
+ "Baka (South Sudan) (Latin)": "bdh_Latn",
137
+ "Burun (Latin)": "bdm_Latn",
138
+ "Bau (Latin)": "bdq_Latn",
139
+ "Oroko (Latin)": "bdu_Latn",
140
+ "Bebele (Latin)": "beb_Latn",
141
+ "Biali (Latin)": "beh_Latn",
142
+ "Belarusian (Cyrillic)": "bel_Cyrl",
143
+ "Bemba (Zambia) (Latin)": "bem_Latn",
144
+ "Bengali (Bengali)": "ben_Beng",
145
+ "Bila (Latin)": "bep_Latn",
146
+ "Betawi (Latin)": "bew_Latn",
147
+ "Yarawa (Latin)": "bex_Latn",
148
+ "Beba (Latin)": "bfa_Latn",
149
+ "Bafut (Latin)": "bfd_Latn",
150
+ "Beba (Latin)": "bfo_Latn",
151
+ "Balti (Arabic)": "bft_Arab",
152
+ "Bagheli (Devanagari)": "bfy_Deva",
153
+ "Pahari-Potwari (Devanagari)": "bfz_Deva",
154
+ "Haryanvi (Devanagari)": "bgc_Deva",
155
+ "Gwamhi-Wuri (Arabic)": "bgp_Arab",
156
+ "Bagri (Devanagari)": "bgq_Deva",
157
+ "Bauria (Latin)": "bgr_Latn",
158
+ "Gamo-Gofa-Dawro (Latin)": "bgt_Latn",
159
+ "Bhatri (Devanagari)": "bgw_Deva",
160
+ "Bharia (Devanagari)": "bha_Deva",
161
+ "Bhili (Devanagari)": "bhb_Deva",
162
+ "Bukhari (Cyrillic)": "bhh_Cyrl",
163
+ "Bhojpuri (Devanagari)": "bho_Deva",
164
+ "Bima (Latin)": "bhp_Latn",
165
+ "Bhattiyali (Devanagari)": "bht_Deva",
166
+ "Biangai (Latin)": "bhz_Latn",
167
+ "Bissa (Latin)": "bib_Latn",
168
+ "Bimoba (Latin)": "bim_Latn",
169
+ "Bislama (Latin)": "bis_Latn",
170
+ "B Eliot (Latin)": "biv_Latn",
171
+ "Badyara (Devanagari)": "bjj_Deva",
172
+ "Barok (Latin)": "bjk_Latn",
173
+ "Banjar (Latin)": "bjn_Latn",
174
+ "Binumarien (Latin)": "bjr_Latn",
175
+ "Bulu (Papua New Guinea) (Latin)": "bjt_Latn",
176
+ "Bedjond (Latin)": "bjv_Latn",
177
+ "Bakwé (Latin)": "bjw_Latn",
178
+ "Bariji (Latin)": "bjz_Latn",
179
+ "Binukid (Latin)": "bkd_Latn",
180
+ "Bakoko (Latin)": "bkh_Latn",
181
+ "Boki (Latin)": "bkm_Latn",
182
+ "Bekwarra (Latin)": "bkv_Latn",
183
+ "Bungku (Latin)": "bky_Latn",
184
+ "Bolia (Latin)": "ble_Latn",
185
+ "Baluan-Pam (Latin)": "blh_Latn",
186
+ "Tai Dam (Latin)": "blt_Latn",
187
+ "Mag-Indi Ayta (Latin)": "blx_Latn",
188
+ "Balantak (Latin)": "blz_Latn",
189
+ "Bembe (Latin)": "bmm_Latn",
190
+ "Biao Mon (Latin)": "bmq_Latn",
191
+ "Muinane (Latin)": "bmr_Latn",
192
+ "Bomwali (Latin)": "bmu_Latn",
193
+ "Bum (Latin)": "bmv_Latn",
194
+ "Bangi (Bengali)": "bng_Beng",
195
+ "Bonerif (Latin)": "bnm_Latn",
196
+ "Bontok (Latin)": "bnn_Latn",
197
+ "Bantoanon (Latin)": "bno_Latn",
198
+ "Bola (Papua New Guinea) (Latin)": "bnp_Latn",
199
+ "Bunun (Devanagari)": "bns_Deva",
200
+ "Bora (Latin)": "boa_Latn",
201
+ "Tibetan (Tibetan)": "bod_Tibt",
202
+ "Anjam (Latin)": "boj_Latn",
203
+ "Berom (Latin)": "bom_Latn",
204
+ "Borôro (Latin)": "bor_Latn",
205
+ "Bosnian (Latin)": "bos_Latn",
206
+ "Bonkiman (Latin)": "bou_Latn",
207
+ "Bongo (Latin)": "bov_Latn",
208
+ "Tuwuli (Latin)": "box_Latn",
209
+ "Barapasi (Latin)": "bpr_Latn",
210
+ "Banda-Banda (Latin)": "bps_Latn",
211
+ "Birgid (Latin)": "bqc_Latn",
212
+ "Baga Pokur (Latin)": "bqg_Latn",
213
+ "Bakhtiari (Arabic)": "bqi_Arab",
214
+ "Banda-Mbrès (Latin)": "bqj_Latn",
215
+ "Banda-Ndélé (Latin)": "bqp_Latn",
216
+ "Braj (Devanagari)": "bra_Deva",
217
+ "Breton (Latin)": "bre_Latn",
218
+ "Brahui (Arabic)": "brh_Arab",
219
+ "Bira (Congo) (Latin)": "bri_Latn",
220
+ "Burui (Latin)": "bru_Latn",
221
+ "Bodo (India) (Devanagari)": "brx_Deva",
222
+ "Basa (Nigeria) (Latin)": "bsc_Latn",
223
+ "Kati (Arabic)": "bsh_Arab",
224
+ "Bangolan (Latin)": "bsj_Latn",
225
+ "Burushaski (Latin)": "bsk_Latn",
226
+ "Bassa-Kontagora (Latin)": "bsq_Latn",
227
+ "Akoose (Latin)": "bss_Latn",
228
+ "Busami (Latin)": "bsy_Latn",
229
+ "Batak Dairi (Latin)": "btd_Latn",
230
+ "Batak Mandailing (Latin)": "btm_Latn",
231
+ "Ratte Buri (Latin)": "bts_Latn",
232
+ "Bete-Bendi (Latin)": "btt_Latn",
233
+ "Bateri (Arabic)": "btv_Arab",
234
+ "Batak Karo (Latin)": "btx_Latn",
235
+ "Budu (Latin)": "bud_Latn",
236
+ "Buginese (Latin)": "bug_Latn",
237
+ "Bulgarian (Cyrillic)": "bul_Cyrl",
238
+ "Bulu (Cameroon) (Latin)": "bum_Latn",
239
+ "Bulu (Cameroon) (Latin)": "buo_Latn",
240
+ "Bussa (Latin)": "bus_Latn",
241
+ "Bokobaru (Latin)": "bux_Latn",
242
+ "Bube (Latin)": "bvb_Latn",
243
+ "Baelelea (Latin)": "bvc_Latn",
244
+ "Buriat (Latin)": "bvz_Latn",
245
+ "Bwatoo (Latin)": "bwq_Latn",
246
+ "Bura-Pabir (Latin)": "bwr_Latn",
247
+ "Buli (Ghana) (Latin)": "bwu_Latn",
248
+ "Bilur (Latin)": "bxf_Latn",
249
+ "Buhutu (Latin)": "bxk_Latn",
250
+ "Tiéyaxo Bozo (Latin)": "byc_Latn",
251
+ "Bina (Nigeria) (Latin)": "byr_Latn",
252
+ "Bisa (Latin)": "bys_Latn",
253
+ "Batak (Latin)": "byv_Latn",
254
+ "Qaqet (Latin)": "byx_Latn",
255
+ "Blaan (Latin)": "bzh_Latn",
256
+ "Bisu (Thai)": "bzi_Thai",
257
+ "Jamaican Creole English (Latin)": "bzj_Latn",
258
+ "Boano (Sulawesi) (Latin)": "bzw_Latn",
259
+ "Chortí (Latin)": "caa_Latn",
260
+ "Garifuna (Latin)": "cab_Latn",
261
+ "Chuj (Latin)": "cac_Latn",
262
+ "Kaqchikel (Latin)": "cak_Latn",
263
+ "Carolinian (Latin)": "cap_Latn",
264
+ "Galibi Carib (Latin)": "car_Latn",
265
+ "Tsimané (Latin)": "cas_Latn",
266
+ "Catalan (Latin)": "cat_Latn",
267
+ "Cua (Latin)": "cax_Latn",
268
+ "Cabiyarí (Latin)": "cbc_Latn",
269
+ "Chachi (Latin)": "cbi_Latn",
270
+ "Carijona (Latin)": "cbr_Latn",
271
+ "Cashibo-Cacataibo (Latin)": "cbs_Latn",
272
+ "Chayahuita (Latin)": "cbt_Latn",
273
+ "Chachi (Latin)": "cbu_Latn",
274
+ "Kakua (Latin)": "cbv_Latn",
275
+ "Chopi (Latin)": "cce_Latn",
276
+ "Samba Daka (Latin)": "ccg_Latn",
277
+ "Chakma (Latin)": "cco_Latn",
278
+ "Churahi (Devanagari)": "cdj_Deva",
279
+ "Min Dong Chinese (Han)": "cdo_Hans",
280
+ "Cebuano (Latin)": "ceb_Latn",
281
+ "Cen Gbe (Latin)": "ceg_Latn",
282
+ "Cek pet (Latin)": "cek_Latn",
283
+ "Centúúm (Latin)": "cen_Latn",
284
+ "Czech (Latin)": "ces_Latn",
285
+ "Chafarruscas (Latin)": "cfa_Latn",
286
+ "Falam Chin (Latin)": "cfm_Latn",
287
+ "Chiga (Latin)": "cgg_Latn",
288
+ "Chiga (Latin)": "cgg_Latn",
289
+ "Chechen (Cyrillic)": "che_Cyrl",
290
+ "Chontal de Tabasco (Latin)": "chf_Latn",
291
+ "Chatino (Latin)": "chq_Latn",
292
+ "Chuvash (Cyrillic)": "chv_Cyrl",
293
+ "Ozumacín Chinantec (Latin)": "chz_Latn",
294
+ "Chokwe (Latin)": "cjk_Latn",
295
+ "Chamorro (Latin)": "cjo_Latn",
296
+ "Upper Chehalis (Latin)": "cjp_Latn",
297
+ "Shor (Cyrillic)": "cjs_Cyrl",
298
+ "Central Kurdish (Arabic)": "ckb_Arab",
299
+ "Cibak (Latin)": "ckl_Latn",
300
+ "Anufo (Latin)": "cko_Latn",
301
+ "Chak (Latin)": "ckr_Latn",
302
+ "Chukot (Cyrillic)": "ckt_Cyrl",
303
+ "Chukot (Latin)": "cky_Latn",
304
+ "Chala (Latin)": "cla_Latn",
305
+ "Lealao Chinantec (Latin)": "cle_Latn",
306
+ "Eastern Highland Chatino (Latin)": "cly_Latn",
307
+ "Mro-Khimi Chin (Latin)": "cme_Latn",
308
+ "Mandarin Chinese (Han)": "cmn_Hans",
309
+ "Mandarin Chinese (Han)": "cmn_Hant",
310
+ "Central Mnong (Khmer)": "cmo_Khmr",
311
+ "Central Mnong (Latin)": "cmo_Latn",
312
+ "Mro-Khimi Chin (Latin)": "cmr_Latn",
313
+ "Hakha Chin (Latin)": "cnh_Latn",
314
+ "Ashéninka Pajonal (Latin)": "cni_Latn",
315
+ "Lalana Chinantec (Latin)": "cnl_Latn",
316
+ "Northern Tlaxiaco Chatino (Latin)": "cnt_Latn",
317
+ "Cochimi (Latin)": "coe_Latn",
318
+ "Cofán (Latin)": "cof_Latn",
319
+ "Chong (Latin)": "cok_Latn",
320
+ "Cotoname (Latin)": "con_Latn",
321
+ "Cornish (Latin)": "cor_Latn",
322
+ "Caquinte (Latin)": "cot_Latn",
323
+ "Wamey (Latin)": "cou_Latn",
324
+ "Ponares (Latin)": "cpa_Latn",
325
+ "Ucayali-Yurúa Ashéninka (Latin)": "cpb_Latn",
326
+ "Pichis Ashéninka (Latin)": "cpu_Latn",
327
+ "Pu-Xian Chinese (Han)": "cpx_Hans",
328
+ "Ucayali-Yurúa Ashéninka (Latin)": "cpy_Latn",
329
+ "Crimean Tatar (Cyrillic)": "crh_Cyrl",
330
+ "Cree (Canadian Aboriginal Syllabics)": "crk_Cans",
331
+ "Cree (Latin)": "crk_Latn",
332
+ "El Nayar Cora (Latin)": "crn_Latn",
333
+ "Caramanta (Latin)": "crq_Latn",
334
+ "Seselwa Creole French (Latin)": "crs_Latn",
335
+ "Iyo'wujwa Chorote (Latin)": "crt_Latn",
336
+ "Carrier (Latin)": "csk_Latn",
337
+ "Southern Ping Chinese (Latin)": "cso_Latn",
338
+ "Northern Tlaxiaco Chatino (Latin)": "ctd_Latn",
339
+ "Tepinapa Chinantec (Latin)": "cte_Latn",
340
+ "Chittagonian (Bengali)": "ctg_Beng",
341
+ "Tataltepec Chatino (Latin)": "ctl_Latn",
342
+ "Tataltepec Chatino (Latin)": "cto_Latn",
343
+ "Wayanad Chetti (Latin)": "ctu_Latn",
344
+ "Cun (Latin)": "cuc_Latn",
345
+ "Culina (Latin)": "cui_Latn",
346
+ "Culina (Latin)": "cuk_Latn",
347
+ "Culina (Latin)": "cul_Latn",
348
+ "Teutila Cuicatec (Latin)": "cut_Latn",
349
+ "Chuka (Latin)": "cux_Latn",
350
+ "Chuwabu (Latin)": "cwa_Latn",
351
+ "Kwere (Latin)": "cwe_Latn",
352
+ "Nute (Latin)": "cwt_Latn",
353
+ "Cemuhî (Latin)": "cya_Latn",
354
+ "Welsh (Latin)": "cym_Latn",
355
+ "Dambi (Latin)": "daa_Latn",
356
+ "Dagbani (Latin)": "dag_Latn",
357
+ "Gwahatike (Latin)": "dah_Latn",
358
+ "Danish (Latin)": "dan_Latn",
359
+ "Dargwa (Cyrillic)": "dar_Cyrl",
360
+ "Taita (Latin)": "dav_Latn",
361
+ "Dabarre (Latin)": "dbd_Latn",
362
+ "Doga (Latin)": "dbj_Latn",
363
+ "Daba (Latin)": "dbq_Latn",
364
+ "Deccan (Arabic)": "dcc_Arab",
365
+ "Dendi (Nigeria) (Latin)": "ddn_Latn",
366
+ "Dedua (Latin)": "ded_Latn",
367
+ "Dezfuli (Latin)": "deg_Latn",
368
+ "Desano (Latin)": "des_Latn",
369
+ "German (Latin)": "deu_Latn",
370
+ "Dagaari Dioula (Latin)": "dga_Latn",
371
+ "Dghwede (Latin)": "dgh_Latn",
372
+ "Dugwor (Latin)": "dgi_Latn",
373
+ "Dakka (Latin)": "dgk_Latn",
374
+ "Dogri (macrolanguage) (Devanagari)": "dgo_Deva",
375
+ "Dogrib (Latin)": "dgr_Latn",
376
+ "Didinga (Devanagari)": "dhi_Deva",
377
+ "Digo (Latin)": "did_Latn",
378
+ "Digo (Latin)": "dig_Latn",
379
+ "Dilling (Latin)": "dik_Latn",
380
+ "Dilling (Latin)": "dip_Latn",
381
+ "Dhivehi (Thaana)": "div_Thaa",
382
+ "Zarma (Latin)": "dje_Latn",
383
+ "Jukun of Takum (Latin)": "djk_Latn",
384
+ "Domaaki (Arabic)": "dmk_Arab",
385
+ "Domaaki (Arabic)": "dml_Arab",
386
+ "Dan (Latin)": "dnj_Latn",
387
+ "Dan (Latin)": "dnt_Latn",
388
+ "Dan (Latin)": "dnw_Latn",
389
+ "Dom (Latin)": "dop_Latn",
390
+ "Dogosé (Latin)": "dos_Latn",
391
+ "Duruwa (Latin)": "dru_Latn",
392
+ "Lower Sorbian (Latin)": "dsb_Latn",
393
+ "Daasanach (Latin)": "dsh_Latn",
394
+ "Dusner (Latin)": "dtp_Latn",
395
+ "Toro So Dogon (Latin)": "dts_Latn",
396
+ "Dotyali (Devanagari)": "dty_Deva",
397
+ "Duala (Latin)": "dua_Latn",
398
+ "Duna (Latin)": "dug_Latn",
399
+ "Dutton World Speedwords (Latin)": "dwr_Latn",
400
+ "Dyiri (Latin)": "dyi_Latn",
401
+ "Dyola-Fonyi (Latin)": "dyo_Latn",
402
+ "Dyula (Latin)": "dyu_Latn",
403
+ "Dazaga (Latin)": "dzg_Latn",
404
+ "Dzongkha (Tibetan)": "dzo_Tibt",
405
+ "Embu (Latin)": "ebu_Latn",
406
+ "Epie (Latin)": "ego_Latn",
407
+ "Eipomek (Latin)": "eip_Latn",
408
+ "Askopan (Latin)": "eiv_Latn",
409
+ "Eka (Latin)": "eka_Latn",
410
+ "Standard Estonian (Latin)": "ekk_Latn",
411
+ "Eki (Latin)": "eko_Latn",
412
+ "Yace (Latin)": "ekr_Latn",
413
+ "Modern Greek (1453-) (Greek)": "ell_Grek",
414
+ "Modern Greek (1453-) (Greek, cypr1249)": "ell_Grek_cypr1249",
415
+ "Eleme (Latin)": "elm_Latn",
416
+ "Eman (Latin)": "emp_Latn",
417
+ "Enlhet (Latin)": "enb_Latn",
418
+ "English (Latin)": "eng_Latn",
419
+ "Enxet (Latin)": "enx_Latn",
420
+ "Esperanto (Latin)": "epo_Latn",
421
+ "Ese Ejja (Latin)": "ese_Latn",
422
+ "Esselen (Latin)": "ess_Latn",
423
+ "Central Yupik (Latin)": "esu_Latn",
424
+ "Eton (Vanuatu) (Latin)": "eto_Latn",
425
+ "Eton (Cameroon) (Latin)": "ets_Latn",
426
+ "Eton (Cameroon) (Latin)": "etu_Latn",
427
+ "Basque (Latin)": "eus_Latn",
428
+ "Even (Cyrillic)": "evn_Cyrl",
429
+ "Ewe (Latin)": "ewe_Latn",
430
+ "Ewondo (Latin)": "ewo_Latn",
431
+ "Eyak (Latin)": "eyo_Latn",
432
+ "Ezaa (Latin)": "eza_Latn",
433
+ "Fali (Latin)": "fal_Latn",
434
+ "Fang (Equatorial Guinea) (Latin)": "fan_Latn",
435
+ "Faroese (Latin)": "fao_Latn",
436
+ "Fasu (Latin)": "far_Latn",
437
+ "Persian (Arabic)": "fas_Arab",
438
+ "Fanti (Latin)": "fat_Latn",
439
+ "Faita (Latin)": "fia_Latn",
440
+ "Fijian (Latin)": "fij_Latn",
441
+ "Filipino (Latin)": "fil_Latn",
442
+ "Finnish (Latin)": "fin_Latn",
443
+ "Fipa (Latin)": "fip_Latn",
444
+ "Knaanic (Latin)": "fkk_Latn",
445
+ "Foau (Latin)": "flr_Latn",
446
+ "Fe'fe' (Latin)": "fmp_Latn",
447
+ "Far Western Muria (Devanagari)": "fmu_Deva",
448
+ "Fon (Latin)": "fon_Latn",
449
+ "French (Latin)": "fra_Latn",
450
+ "Fordata (Latin)": "frd_Latn",
451
+ "Western Frisian (Latin)": "fry_Latn",
452
+ "Fulah (Latin)": "fub_Latn",
453
+ "Pulaar (Latin)": "fuc_Latn",
454
+ "East Futuna (Latin)": "fue_Latn",
455
+ "Fulah (Latin)": "ful_Latn",
456
+ "Pulaar (Latin)": "fuq_Latn",
457
+ "Nigerian Fulfulde (Latin)": "fuv_Latn",
458
+ "Gagauz (Cyrillic)": "gag_Cyrl",
459
+ "Gagauz (Latin)": "gag_Latn",
460
+ "Gaina (Latin)": "gai_Latn",
461
+ "Gamkonora (Latin)": "gam_Latn",
462
+ "Kandawo (Telugu)": "gau_Telu",
463
+ "Gabri (Latin)": "gbi_Latn",
464
+ "Kaytetye (Devanagari)": "gbk_Deva",
465
+ "Garhwali (Devanagari)": "gbm_Deva",
466
+ "Gbari (Latin)": "gbo_Latn",
467
+ "Gbagyi (Latin)": "gbr_Latn",
468
+ "Gbagyi (Latin)": "gby_Latn",
469
+ "Alekano (Latin)": "gcc_Latn",
470
+ "Gade (Latin)": "gde_Latn",
471
+ "Guduf-Gava (Latin)": "gdf_Latn",
472
+ "Gengle (Latin)": "geb_Latn",
473
+ "Gebe (Latin)": "gej_Latn",
474
+ "Geser-Gorom (Latin)": "ges_Latn",
475
+ "Guria (Arabic)": "ggg_Arab",
476
+ "Gidar (Latin)": "gid_Latn",
477
+ "Gbazari (Arabic)": "gig_Arab",
478
+ "Gilbertese (Latin)": "gil_Latn",
479
+ "Gimi (Papua New Guinea) (Latin)": "giz_Latn",
480
+ "Kachi Koli (Arabic)": "gjk_Arab",
481
+ "Gunditjmara (Latin)": "gjn_Latn",
482
+ "Gujari (Arabic)": "gju_Arab",
483
+ "Gokana (Latin)": "gkn_Latn",
484
+ "Nanai (Cyrillic)": "gld_Cyrl",
485
+ "Irish (Latin)": "gle_Latn",
486
+ "Galician (Latin)": "glg_Latn",
487
+ "Gilaki (Arabic)": "glk_Arab",
488
+ "Manx (Latin)": "glv_Latn",
489
+ "Gula (Chad) (Latin)": "glw_Latn",
490
+ "Gamo (Latin)": "gmv_Latn",
491
+ "Gana (Latin)": "gna_Latn",
492
+ "Gondi (Latin)": "gnd_Latn",
493
+ "Ngangam (Latin)": "gng_Latn",
494
+ "Gofa (Latin)": "gof_Latn",
495
+ "Gogo (Latin)": "gog_Latn",
496
+ "Gola (Latin)": "gol_Latn",
497
+ "Goan Konkani (Devanagari)": "gom_Deva",
498
+ "Gorontalo (Latin)": "gor_Latn",
499
+ "Gor (Latin)": "gqr_Latn",
500
+ "Ancient Greek (to 1453) (Greek)": "grc_Grek",
501
+ "Gbiri-Niragu (Latin)": "gri_Latn",
502
+ "Guarani (Latin)": "grn_Latn",
503
+ "Garo (Bengali)": "grt_Beng",
504
+ "Guriaso (Latin)": "gsl_Latn",
505
+ "German Sign Language (Latin)": "gso_Latn",
506
+ "Guajajára (Latin)": "gub_Latn",
507
+ "Wayuu (Latin)": "guc_Latn",
508
+ "Yocoboué Dida (Latin)": "gud_Latn",
509
+ "Paraguayan Guaraní (Latin)": "gug_Latn",
510
+ "Guahibo (Latin)": "guh_Latn",
511
+ "Eastern Bolivian Guaraní (Latin)": "gui_Latn",
512
+ "Gujarati (Gujarati)": "guj_Gujr",
513
+ "Gumuz (Ethiopic)": "guk_Ethi",
514
+ "Gumuz (Latin)": "gum_Latn",
515
+ "Guro (Latin)": "guo_Latn",
516
+ "Guinau dan (Latin)": "guq_Latn",
517
+ "Farefare (Latin)": "gur_Latn",
518
+ "Farefare (Latin)": "guu_Latn",
519
+ "Gusilay (Latin)": "gux_Latn",
520
+ "Gusii (Latin)": "guz_Latn",
521
+ "Guanano (Latin)": "gvc_Latn",
522
+ "Gwanja (Latin)": "gvl_Latn",
523
+ "Kalami (Arabic)": "gwc_Arab",
524
+ "Gweno (Latin)": "gwe_Latn",
525
+ "Gwichʼin (Latin)": "gwi_Latn",
526
+ "Gwere (Latin)": "gwr_Latn",
527
+ "Gwere (Arabic)": "gwt_Arab",
528
+ "Guaymí (Latin)": "gym_Latn",
529
+ "Gyem (Latin)": "gyr_Latn",
530
+ "Geji (Latin)": "gyz_Latn",
531
+ "Hadiyya (Latin)": "had_Latn",
532
+ "Hanga (Latin)": "hag_Latn",
533
+ "Hahon (Latin)": "hah_Latn",
534
+ "Hakka Chinese (Latin)": "hak_Latn",
535
+ "Ha(Latin)": "hao_Latn",
536
+ "Hdi (Latin)": "hap_Latn",
537
+ "Haitian (Latin)": "hat_Latn",
538
+ "Hausa (Latin)": "hau_Latn",
539
+ "Hawaiian (Latin)": "haw_Latn",
540
+ "Haya (Latin)": "hay_Latn",
541
+ "Huba (Latin)": "hbb_Latn",
542
+ "Huichol (Latin)": "hch_Latn",
543
+ "Hebrew (Hebrew)": "heb_Hebr",
544
+ "Hehe (Latin)": "heh_Latn",
545
+ "Herero (Latin)": "her_Latn",
546
+ "Hiaitsiihi (Latin)": "hia_Latn",
547
+ "Fiji Hindi (Latin)": "hif_Latn",
548
+ "Higgi (Latin)": "hig_Latn",
549
+ "Hiligaynon (Latin)": "hil_Latn",
550
+ "Hindi (Devanagari)": "hin_Deva",
551
+ "Hkongso Chin (Latin)": "hkk_Latn",
552
+ "Halang (Latin)": "hla_Latn",
553
+ "Halia (Devanagari)": "hlb_Deva",
554
+ "Matu Chin (Latin)": "hlt_Latn",
555
+ "Chhattisgarhi (Devanagari)": "hne_Deva",
556
+ "Hän (Latin)": "hnn_Latn",
557
+ "Northern Hindko (Arabic)": "hno_Arab",
558
+ "Hunsrik (Latin)": "hns_Latn",
559
+ "Ho (Oriya)": "hoc_Orya",
560
+ "Croatian (Latin)": "hrv_Latn",
561
+ "Upper Sorbian (Latin)": "hsb_Latn",
562
+ "Hoti (Latin)": "hto_Latn",
563
+ "Huba (Latin)": "hub_Latn",
564
+ "Huave (Latin)": "hue_Latn",
565
+ "San Francisco Del Mar Huave (Latin)": "hui_Latn",
566
+ "Hula (Latin)": "hul_Latn",
567
+ "Hungarian (Latin)": "hun_Latn",
568
+ "Huastec (Latin)": "hus_Latn",
569
+ "Humla (Latin)": "huu_Latn",
570
+ "San Mateo Del Mar Huave (Latin)": "huv_Latn",
571
+ "Hulaulá (Latin)": "hux_Latn",
572
+ "Havanese (Latin)": "hvn_Latn",
573
+ "Hwana (Latin)": "hwo_Latn",
574
+ "Armenian (Armenian)": "hye_Armn",
575
+ "Western Armenian (Armenian)": "hyw_Armn",
576
+ "Iban (Latin)": "iba_Latn",
577
+ "Ibibio (Latin)": "ibb_Latn",
578
+ "Igbo (Latin)": "ibo_Latn",
579
+ "Etkywan (Latin)": "icr_Latn",
580
+ "Ido (Latin)": "ida_Latn",
581
+ "Idon (Latin)": "idd_Latn",
582
+ "Idoma (Latin)": "idu_Latn",
583
+ "Ifugao (Latin)": "ifa_Latn",
584
+ "Amganad Ifugao (Latin)": "ifb_Latn",
585
+ "Ifo (Latin)": "ife_Latn",
586
+ "Tuwali Ifugao (Latin)": "ifk_Latn",
587
+ "Mayoyao Ifugao (Latin)": "ifu_Latn",
588
+ "Keley-I Kallahan (Latin)": "ify_Latn",
589
+ "Igede (Latin)": "igl_Latn",
590
+ "Igala (Latin)": "ign_Latn",
591
+ "Ijaw (Latin)": "ijc_Latn",
592
+ "Biseni (Latin)": "ijn_Latn",
593
+ "Ika (Latin)": "ikk_Latn",
594
+ "Ikwere (Latin)": "ikw_Latn",
595
+ "Ila (Latin)": "ilb_Latn",
596
+ "Ilocano (Latin)": "ilo_Latn",
597
+ "Imbongu (Latin)": "imo_Latn",
598
+ "Interlingua (International Auxiliary Language Association) (Latin)": "ina_Latn",
599
+ "Inga (Latin)": "inb_Latn",
600
+ "Indonesian (Latin)": "ind_Latn",
601
+ "Iu Mien (Latin)": "iou_Latn",
602
+ "Ipili (Latin)": "ipi_Latn",
603
+ "Inupiaq (Latin)": "ipk_Latn",
604
+ "Iquito (Latin)": "iqw_Latn",
605
+ "Iresim (Latin)": "iri_Latn",
606
+ "Irarutu (Latin)": "irk_Latn",
607
+ "Isekiri (Latin)": "ish_Latn",
608
+ "Icelandic (Latin)": "isl_Latn",
609
+ "Isoko (Latin)": "iso_Latn",
610
+ "Italian (Latin)": "ita_Latn",
611
+ "Itelmen (Cyrillic)": "itl_Cyrl",
612
+ "Isekiri (Latin)": "its_Latn",
613
+ "Isekiri (Latin)": "itv_Latn",
614
+ "Ito (Latin)": "itw_Latn",
615
+ "Itzá (Latin)": "itz_Latn",
616
+ "Ixil (Latin)": "ixl_Latn",
617
+ "Izere (Latin)": "izr_Latn",
618
+ "Izii (Latin)": "izz_Latn",
619
+ "Jakaltek (Latin)": "jac_Latn",
620
+ "Yalahatan (Latin)": "jal_Latn",
621
+ "Jamaican Creole English (Latin)": "jam_Latn",
622
+ "Javanese (Latin)": "jav_Latn",
623
+ "Jambi Malay (Latin)": "jax_Latn",
624
+ "Jibu (Latin)": "jbu_Latn",
625
+ "Jerung (Latin)": "jen_Latn",
626
+ "Jicaque (Latin)": "jic_Latn",
627
+ "Jivaro (Latin)": "jiv_Latn",
628
+ "Machame (Latin)": "jmc_Latn",
629
+ "Zumbun (Latin)": "jmd_Latn",
630
+ "Jimi (Nigeria) (Latin)": "jmx_Latn",
631
+ "Japanese (Japanese)": "jpn_Jpan",
632
+ "Jaqaru (Latin)": "jqr_Latn",
633
+ "Jowulu (Latin)": "juk_Latn",
634
+ "Ju'hoan (Oriya)": "jun_Orya",
635
+ "Juang (Latin)": "juo_Latn",
636
+ "Wapan (Latin)": "jvn_Latn",
637
+ "Kara-Kalpak (Cyrillic)": "kaa_Cyrl",
638
+ "Kabyle (Latin)": "kab_Latn",
639
+ "Kachin (Latin)": "kac_Latn",
640
+ "Gayo (Latin)": "kai_Latn",
641
+ "Jju (Latin)": "kaj_Latn",
642
+ "Jju (Latin)": "kak_Latn",
643
+ "Kamba (Kenya) (Latin)": "kam_Latn",
644
+ "Kannada (Kannada)": "kan_Knda",
645
+ "Kanu (Latin)": "kao_Latn",
646
+ "Bezhta (Latin)": "kaq_Latn",
647
+ "Kashmiri (Arabic)": "kas_Arab",
648
+ "Georgian (Georgian)": "kat_Geor",
649
+ "Kadazan Dusun (Latin)": "kay_Latn",
650
+ "Kazakh (Cyrillic)": "kaz_Cyrl",
651
+ "Kabardian (Cyrillic)": "kbd_Cyrl",
652
+ "Kayan (Latin)": "kbl_Latn",
653
+ "Kande (Latin)": "kbo_Latn",
654
+ "Kabiye (Latin)": "kbp_Latn",
655
+ "Kabiye (Latin)": "kbq_Latn",
656
+ "Kafa (Latin)": "kbr_Latn",
657
+ "Kamo (Latin)": "kbt_Latn",
658
+ "Kikuyu (Latin)": "kby_Latn",
659
+ "Ket (Cyrillic)": "kca_Cyrl",
660
+ "Tyap (Latin)": "kcg_Latn",
661
+ "Kono (Nigeria) (Latin)": "kcn_Latn",
662
+ "Kutu (Latin)": "kcq_Latn",
663
+ "Kutu (Latin)": "kdc_Latn",
664
+ "Makonde (Latin)": "kde_Latn",
665
+ "Tem (Latin)": "kdh_Latn",
666
+ "Kumam (Latin)": "kdi_Latn",
667
+ "Kumam (Latin)": "kdj_Latn",
668
+ "Tsikimba (Latin)": "kdl_Latn",
669
+ "Kagulu (Latin)": "kdn_Latn",
670
+ "Kuy (Khmer)": "kdt_Khmr",
671
+ "Kepo' (Latin)": "kea_Latn",
672
+ "Kekchi (Latin)": "kek_Latn",
673
+ "Kenyang (Latin)": "ken_Latn",
674
+ "Kenyah (Latin)": "keo_Latn",
675
+ "Kera (Latin)": "ker_Latn",
676
+ "Kugbo (Latin)": "keu_Latn",
677
+ "Komi-Permyak (Telugu)": "key_Telu",
678
+ "Kukele (Latin)": "kez_Latn",
679
+ "Kobiana (Devanagari)": "kfb_Deva",
680
+ "Northwestern Kolami (Telugu)": "kff_Telu",
681
+ "Kuk (Devanagari)": "kfk_Deva",
682
+ "Kotaba (Devanagari)": "kfq_Deva",
683
+ "Koya (Gujarati)": "kfr_Gujr",
684
+ "Koro (India) (Latin)": "kfw_Latn",
685
+ "Kaili (Devanagari)": "kfx_Deva",
686
+ "Khasi (Latin)": "kha_Latn",
687
+ "Kham (Tibetan)": "khg_Tibt",
688
+ "Khalkha Mongolian (Cyrillic)": "khk_Cyrl",
689
+ "Khmer (Khmer)": "khm_Khmr",
690
+ "Koyra Chiini Songhay (Latin)": "khq_Latn",
691
+ "Khowar (Arabic)": "khw_Arab",
692
+ "Kim (Latin)": "kia_Latn",
693
+ "Koalib (Latin)": "kij_Latn",
694
+ "Kikuyu (Latin)": "kik_Latn",
695
+ "Kinyarwanda (Latin)": "kin_Latn",
696
+ "Kirghiz (Cyrillic)": "kir_Cyrl",
697
+ "Kitharaka (Latin)": "kix_Latn",
698
+ "Mlap (Latin)": "kjb_Latn",
699
+ "Coastal Konjo (Latin)": "kjc_Latn",
700
+ "Kisar (Latin)": "kje_Latn",
701
+ "Khmu (Latin)": "kjg_Latn",
702
+ "Khakas (Cyrillic)": "kjh_Cyrl",
703
+ "Khakas (Latin)": "kjk_Latn",
704
+ "Kagulu (Latin)": "kki_Latn",
705
+ "Kikuyu (Latin)": "kkj_Latn",
706
+ "Kalanguya (Devanagari)": "kle_Deva",
707
+ "Kalenjin (Latin)": "kln_Latn",
708
+ "Kulisusu (Latin)": "kls_Latn",
709
+ "Klao (Latin)": "klu_Latn",
710
+ "Maskelynes (Latin)": "klv_Latn",
711
+ "Tado (Latin)": "klw_Latn",
712
+ "Kama (Latin)": "kma_Latn",
713
+ "Kimbundu (Latin)": "kmd_Latn",
714
+ "Tanudan Kalinga (Latin)": "kml_Latn",
715
+ "Northern Kurdish (Arabic)": "kmr_Arab",
716
+ "Northern Kurdish (Cyrillic)": "kmr_Cyrl",
717
+ "Northern Kurdish (Latin)": "kmr_Latn",
718
+ "Kanite (Latin)": "kmu_Latn",
719
+ "Koma (Latin)": "kmy_Latn",
720
+ "Kanda (Latin)": "kna_Latn",
721
+ "Lubuagan Kalinga (Latin)": "knb_Latn",
722
+ "Central Kanuri (Latin)": "knc_Latn",
723
+ "Kankanaey (Latin)": "kne_Latn",
724
+ "Kutu (Latin)": "knf_Latn",
725
+ "Konda (Latin)": "knj_Latn",
726
+ "Kuranko (Latin)": "knk_Latn",
727
+ "Konkani (macrolanguage) (Devanagari)": "knn_Deva",
728
+ "Kono (Sierra Leone) (Latin)": "kno_Latn",
729
+ "Kongo (Latin)": "kog_Latn",
730
+ "Kol (Papua New Guinea) (Latin)": "kol_Latn",
731
+ "Konzo (Latin)": "koo_Latn",
732
+ "Korean (Hangul)": "kor_Hang",
733
+ "Kodia (Latin)": "kpo_Latn",
734
+ "Korupun-Sela (Latin)": "kpq_Latn",
735
+ "Kofei (Latin)": "kps_Latn",
736
+ "Komi-Zyrian (Cyrillic)": "kpv_Cyrl",
737
+ "Komi-Permyak (Cyrillic)": "kpy_Cyrl",
738
+ "Kofyar (Latin)": "kpz_Latn",
739
+ "Korafe-Yegha (Latin)": "kqe_Latn",
740
+ "Korafe-Yegha (Latin)": "kqo_Latn",
741
+ "Kimré (Latin)": "kqp_Latn",
742
+ "Kimaragang (Latin)": "kqr_Latn",
743
+ "Koyra Chiini Songhay (Ethiopic)": "kqy_Ethi",
744
+ "Karachay-Balkar (Cyrillic)": "krc_Cyrl",
745
+ "Krio (Latin)": "kri_Latn",
746
+ "Kinaray-A (Latin)": "krj_Latn",
747
+ "Karelian (Latin)": "krl_Latn",
748
+ "Sapo (Khmer)": "krr_Khmr",
749
+ "Gbaya (Sudan) (Latin)": "krs_Latn",
750
+ "Kurukh (Devanagari)": "kru_Deva",
751
+ "Tewa (Indonesia) (Latin)": "krx_Latn",
752
+ "Shambala (Latin)": "ksb_Latn",
753
+ "Kuanua (Latin)": "ksd_Latn",
754
+ "Bafia (Latin)": "ksf_Latn",
755
+ "Krisa (Latin)": "ksr_Latn",
756
+ "Kusasi (Latin)": "kss_Latn",
757
+ "Kham (Devanagari)": "ksz_Deva",
758
+ "Kambaata (Ethiopic)": "ktb_Ethi",
759
+ "Krumen (Latin)": "ktj_Latn",
760
+ "Kto (Latin)": "kto_Latn",
761
+ "Kuanyama (Latin)": "kua_Latn",
762
+ "Kutep (Latin)": "kub_Latn",
763
+ "Kuman (Papua New Guinea) (Latin)": "kue_Latn",
764
+ "Kushi (Latin)": "kuh_Latn",
765
+ "Kumyk (Cyrillic)": "kum_Cyrl",
766
+ "Kurdish (Arabic)": "kur_Arab",
767
+ "Kusaal (Latin)": "kus_Latn",
768
+ "Kutino (Latin)": "kvn_Latn",
769
+ "Kove (Latin)": "kvw_Latn",
770
+ "Komi (Arabic)": "kvx_Arab",
771
+ "Kutu (Latin)": "kwd_Latn",
772
+ "Kwara'ae (Latin)": "kwf_Latn",
773
+ "Awa-Cuaiquer (Latin)": "kwi_Latn",
774
+ "Kwak'wala (Latin)": "kwm_Latn",
775
+ "Kodia (Ethiopic)": "kxc_Ethi",
776
+ "Maninkakan, Kita (Latin)": "kxf_Latn",
777
+ "Kuanhua (Thai)": "kxm_Thai",
778
+ "Wadiyara Koli (Arabic)": "kxp_Arab",
779
+ "Kwaya (Latin)": "kyb_Latn",
780
+ "Kyaka (Latin)": "kyc_Latn",
781
+ "Karey (Latin)": "kyf_Latn",
782
+ "Keyagana (Latin)": "kyg_Latn",
783
+ "Kouya (Latin)": "kyo_Latn",
784
+ "Kwaya (Latin)": "kyq_Latn",
785
+ "Kayagar (Kayah Li)": "kyu_Kali",
786
+ "Kambaira (Latin)": "kyx_Latn",
787
+ "Kerewo (Latin)": "kyz_Latn",
788
+ "Kairiru (Latin)": "kzf_Latn",
789
+ "Kelabit (Latin)": "kzi_Latn",
790
+ "Lacandon (Latin)": "lac_Latn",
791
+ "Langi (Latin)": "lag_Latn",
792
+ "Lango (Uganda) (Latin)": "laj_Latn",
793
+ "Lamba (Latin)": "lam_Latn",
794
+ "Lao (Lao)": "lao_Laoo",
795
+ "Lama (Togo) (Latin)": "las_Latn",
796
+ "Latin (Latin)": "lat_Latn",
797
+ "Latvian (Latin)": "lav_Latn",
798
+ "Lavu (Latin)": "law_Latn",
799
+ "Lama (Myanmar) (Tibetan)": "lbj_Tibt",
800
+ "Lachi (Latin)": "lbw_Latn",
801
+ "Luchazi (Latin)": "lcm_Latn",
802
+ "Lola (Thai)": "lcp_Thai",
803
+ "Lidzonka (Latin)": "ldb_Latn",
804
+ "Leko (Latin)": "led_Latn",
805
+ "Lyélé (Latin)": "lee_Latn",
806
+ "Lefa (Latin)": "lef_Latn",
807
+ "Lembena (Latin)": "lem_Latn",
808
+ "Lense (Latin)": "lew_Latn",
809
+ "Lemio (Latin)": "lex_Latn",
810
+ "Lega-Shabunda (Latin)": "lgg_Latn",
811
+ "Laghu (Latin)": "lgl_Latn",
812
+ "Lahu (Latin)": "lhu_Latn",
813
+ "Lianshan Zhuang (Latin)": "lia_Latn",
814
+ "Likum (Latin)": "lid_Latn",
815
+ "Limbu (Devanagari)": "lif_Deva",
816
+ "Ligurian (Latin)": "lij_Latn",
817
+ "Lingala (Latin)": "lin_Latn",
818
+ "Liki (Latin)": "lip_Latn",
819
+ "Libinza (Latin)": "lir_Latn",
820
+ "Lisu (Lisu)": "lis_Lisu",
821
+ "Lithuanian (Latin)": "lit_Latn",
822
+ "Rampi (Latin)": "lje_Latn",
823
+ "Lampung Api (Latin)": "ljp_Latn",
824
+ "Lukabaras (Latin)": "lkb_Latn",
825
+ "Lakata (Latin)": "lke_Latn",
826
+ "Lilau (Latin)": "lla_Latn",
827
+ "Ladin (Latin, gherd)": "lld_Latn_gherd",
828
+ "Ladin (Latin, valbadia)": "lld_Latn_valbadia",
829
+ "Láá Láá Bwamu (Latin)": "llg_Latn",
830
+ "Lele (Guinea) (Latin)": "lln_Latn",
831
+ "Loma (Liberia) (Latin)": "lme_Latn",
832
+ "Lundayeh (Latin)": "lnd_Latn",
833
+ "Lango (South Sudan) (Latin)": "lns_Latn",
834
+ "Lundayeh (Latin)": "lnu_Latn",
835
+ "Loloda (Latin)": "loa_Latn",
836
+ "Lobi (Latin)": "lob_Latn",
837
+ "Loko (Latin)": "lok_Latn",
838
+ "Loma (Liberia) (Latin)": "lom_Latn",
839
+ "Loma (Liberia) (Latin)": "lon_Latn",
840
+ "Lobala (Latin)": "loq_Latn",
841
+ "Luri (Arabic)": "lrk_Arab",
842
+ "Lish (Latin)": "lsi_Latn",
843
+ "Sa'ban (Latin)": "lsm_Latn",
844
+ "Sa'ban (Arabic)": "lss_Arab",
845
+ "Latgalian (Latin)": "ltg_Latn",
846
+ "Lethu (Latin)": "lth_Latn",
847
+ "Lutachoni (Latin)": "lto_Latn",
848
+ "Luxembourgish (Latin)": "ltz_Latn",
849
+ "Luba-Lulua (Latin)": "lua_Latn",
850
+ "Aringa (Latin)": "luc_Latn",
851
+ "Ganda (Latin)": "lug_Latn",
852
+ "Luo (Kenya and Tanzania) (Latin)": "luo_Latn",
853
+ "Lushai (Latin)": "lus_Latn",
854
+ "Luwanga (Latin)": "lwg_Latn",
855
+ "Lwo (Latin)": "lwo_Latn",
856
+ "Lewo Eleng (Latin)": "lww_Latn",
857
+ "Laz (Latin)": "lzz_Latn",
858
+ "Maasai (Latin)": "maa_Latn",
859
+ "Yutanduchi Mixtec (Latin)": "mab_Latn",
860
+ "Madurese (Latin)": "mad_Latn",
861
+ "Mafa (Latin)": "maf_Latn",
862
+ "Magahi (Devanagari)": "mag_Deva",
863
+ "Marshallese (Latin)": "mah_Latn",
864
+ "Maithili (Devanagari)": "mai_Deva",
865
+ "Majhwar (Latin)": "maj_Latn",
866
+ "Makasar (Latin)": "mak_Latn",
867
+ "Malayalam (Malayalam)": "mal_Mlym",
868
+ "Mam (Latin)": "mam_Latn",
869
+ "Mamaindé (Latin)": "maq_Latn",
870
+ "Marathi (Devanagari)": "mar_Deva",
871
+ "Mazatec (Latin)": "mau_Latn",
872
+ "Sateré-Mawé (Latin)": "maw_Latn",
873
+ "North Moluccan Malay (Latin)": "max_Latn",
874
+ "Central Mazahua (Latin)": "maz_Latn",
875
+ "Western Bukidnon Manobo (Latin)": "mbb_Latn",
876
+ "Macushi (Latin)": "mbc_Latn",
877
+ "Duna (Latin)": "mbh_Latn",
878
+ "Ilianen Manobo (Latin)": "mbj_Latn",
879
+ "Matigsalug Manobo (Latin)": "mbt_Latn",
880
+ "Mbo (Cameroon) (Latin)": "mbu_Latn",
881
+ "Macuna (Latin)": "mca_Latn",
882
+ "Machiguenga (Latin)": "mcb_Latn",
883
+ "Bitur (Latin)": "mcd_Latn",
884
+ "Matsés (Latin)": "mcf_Latn",
885
+ "Mixe (Latin)": "mco_Latn",
886
+ "Ese (Latin)": "mcp_Latn",
887
+ "M seri (Latin)": "mcq_Latn",
888
+ "Mambai (Latin)": "mcu_Latn",
889
+ "Mpiemo (Latin)": "mcx_Latn",
890
+ "Mada (Nigeria) (Latin)": "mda_Latn",
891
+ "Morigi (Latin)": "mdd_Latn",
892
+ "Mbosi (Latin)": "mdv_Latn",
893
+ "Male (Ethiopia) (Ethiopic)": "mdy_Ethi",
894
+ "Medumba (Latin)": "med_Latn",
895
+ "Melpa (Latin)": "mee_Latn",
896
+ "Southwestern Tlaxiaco Mixtec (Latin)": "meh_Latn",
897
+ "Midob (Latin)": "mej_Latn",
898
+ "Mekeo (Latin)": "mek_Latn",
899
+ "Central Melanau (Latin)": "mel_Latn",
900
+ "Mende (Liberia) (Latin)": "men_Latn",
901
+ "Merey (Latin)": "meq_Latn",
902
+ "Meru (Latin)": "mer_Latn",
903
+ "Mato (Latin)": "met_Latn",
904
+ "Motu (Latin)": "meu_Latn",
905
+ "Mano (Latin)": "mev_Latn",
906
+ "Morisyen (Latin)": "mfe_Latn",
907
+ "Mefele (Latin)": "mfh_Latn",
908
+ "Mefele (Latin)": "mfi_Latn",
909
+ "Mogofin (Latin)": "mfk_Latn",
910
+ "Cross River Mbembe (Latin)": "mfm_Latn",
911
+ "Mefele (Latin)": "mfn_Latn",
912
+ "Mbe (Latin)": "mfo_Latn",
913
+ "Marghi South (Latin)": "mfq_Latn",
914
+ "Marghi (Latin)": "mfv_Latn",
915
+ "Pahi (Latin)": "mfy_Latn",
916
+ "Melo (Latin)": "mfz_Latn",
917
+ "Maguindanaon (Latin)": "mgd_Latn",
918
+ "Mpade (Latin)": "mge_Latn",
919
+ "Monguor (Latin)": "mgg_Latn",
920
+ "Makhuwa-Meetto (Latin)": "mgh_Latn",
921
+ "Laua (Latin)": "mgi_Latn",
922
+ "Meta' (Latin)": "mgo_Latn",
923
+ "Ma'di (Latin)": "mhi_Latn",
924
+ "Mouk-Aria (Latin)": "mhk_Latn",
925
+ "Mari (Russia) (Cyrillic)": "mhr_Cyrl",
926
+ "Mundat (Latin)": "mhu_Latn",
927
+ "Maru (Latin)": "mhx_Latn",
928
+ "Ma'di (Latin)": "mhy_Latn",
929
+ "Atatláhuca Mixtec (Latin)": "mib_Latn",
930
+ "Mi'kmaq (Latin)": "mie_Latn",
931
+ "Mofu-Gudur (Latin)": "mif_Latn",
932
+ "San Miguel El Grande Mixtec (Latin)": "mig_Latn",
933
+ "Chayuco Mixtec (Latin)": "mih_Latn",
934
+ "Peñoles Mixtec (Latin)": "mil_Latn",
935
+ "Alacatlatzala Mixtec (Latin)": "mim_Latn",
936
+ "Minangkabau (Latin)": "min_Latn",
937
+ "Pinotepa Nacional Mixtec (Latin)": "mio_Latn",
938
+ "Apasco-Apoala Mixtec (Latin)": "mip_Latn",
939
+ "Mískito (Latin)": "miq_Latn",
940
+ "Mískito (Latin)": "mit_Latn",
941
+ "Southern Puebla Mixtec (Latin)": "miu_Latn",
942
+ "Akoye (Latin)": "miy_Latn",
943
+ "Coatzospan Mixtec (Latin)": "miz_Latn",
944
+ "Mali (Devanagari)": "mjl_Deva",
945
+ "Malavedan (Malayalam)": "mjv_Mlym",
946
+ "Macedonian (Cyrillic)": "mkd_Cyrl",
947
+ "Mokole (Benin) (Latin)": "mkf_Latn",
948
+ "Dhatki (Arabic)": "mki_Arab",
949
+ "Mokole (Benin) (Latin)": "mkl_Latn",
950
+ "Mokole (Benin) (Latin)": "mkn_Latn",
951
+ "Malagasy (Latin)": "mlg_Latn",
952
+ "Maltese (Latin)": "mlq_Latn",
953
+ "Maltese (Latin)": "mlt_Latn",
954
+ "Mamanwa (Latin)": "mmc_Latn",
955
+ "Michoacán Mazahua (Latin)": "mmg_Latn",
956
+ "Maonan (Latin)": "mnb_Latn",
957
+ "Montenegrin (Latin)": "mne_Latn",
958
+ "Mundani (Latin)": "mnf_Latn",
959
+ "Manipuri (Bengali)": "mni_Beng",
960
+ "Maninka (Latin)": "mnk_Latn",
961
+ "Mon (Myanmar)": "mnw_Mymr",
962
+ "Manikion (Latin)": "mnx_Latn",
963
+ "Mwan (Latin)": "moa_Latn",
964
+ "Mogholi (Latin)": "mog_Latn",
965
+ "Mongolian (Cyrillic)": "mon_Cyrl",
966
+ "Mopán Maya (Latin)": "mop_Latn",
967
+ "Mor (New Guinea) (Latin)": "mor_Latn",
968
+ "Mossi (Latin)": "mos_Latn",
969
+ "Tucunaca (Latin)": "mox_Latn",
970
+ "Mukulu (Latin)": "moz_Latn",
971
+ "Mpompon (Latin)": "mpg_Latn",
972
+ "Yosondúa Mixtec (Latin)": "mpm_Latn",
973
+ "Mapidian (Latin)": "mpp_Latn",
974
+ "Mixtec (Latin)": "mpx_Latn",
975
+ "Malas (Latin)": "mqb_Latn",
976
+ "Mangole (Latin)": "mqf_Latn",
977
+ "Minokok (Latin)": "mqj_Latn",
978
+ "Mumuye (Latin)": "mqn_Latn",
979
+ "Manggarai (Latin)": "mqy_Latn",
980
+ "Maori (Latin)": "mri_Latn",
981
+ "Western Mari (Cyrillic)": "mrj_Cyrl",
982
+ "Western Magar (Devanagari)": "mrr_Deva",
983
+ "Maranao (Latin)": "mrt_Latn",
984
+ "Maru (Latin)": "mrw_Latn",
985
+ "Masaba (Latin)": "msh_Latn",
986
+ "Sabah Malay (Latin)": "msi_Latn",
987
+ "Mswahili (Latin)": "msw_Latn",
988
+ "Malay (macrolanguage) (Latin)": "msy_Latn",
989
+ "Mator-Taygi-Karagas (Latin)": "mtd_Latn",
990
+ "Binukidnon (Latin)": "mtj_Latn",
991
+ "Yosondúa Mixtec (Latin)": "mto_Latn",
992
+ "Totontepec Mixe (Devanagari)": "mtr_Deva",
993
+ "Tututepec Mixtec (Latin)": "mtu_Latn",
994
+ "Tututepec Mixtec (Latin)": "mtx_Latn",
995
+ "Mundang (Latin)": "mua_Latn",
996
+ "Mubi (Latin)": "mug_Latn",
997
+ "Mündü (Latin)": "muh_Latn",
998
+ "Musi (Latin)": "mui_Latn",
999
+ "Majhwar (Devanagari)": "mup_Deva",
1000
+ "Murle (Latin)": "mur_Latn",
1001
+ "Muthuvan (Malayalam)": "muv_Mlym",
1002
+ "Muyang (Latin)": "muy_Latn",
1003
+ "Marwari (Arabic)": "mve_Arab",
1004
+ "Marwari (Arabic)": "mvp_Latn",
1005
+ "Marwari (Arabic)": "mvy_Arab",
1006
+ "Mwanga (Tanzania) (Latin)": "mwq_Latn",
1007
+ "Mwera (Tanzania) (Latin)": "mwv_Latn",
1008
+ "Metlatónoc Mixtec (Latin)": "mxb_Latn",
1009
+ "Juxtlahuaca Mixtec (Latin)": "mxq_Latn",
1010
+ "Silacayoapan Mixtec (Latin)": "mxs_Latn",
1011
+ "Tezoatlán Mixtec (Latin)": "mxt_Latn",
1012
+ "Metlatónoc Mixtec (Latin)": "mxu_Latn",
1013
+ "Northwestern Ojibwa (Latin)": "mxv_Latn",
1014
+ "Metlatónoc Mixtec (Latin)": "mxy_Latn",
1015
+ "Burmese (Myanmar)": "mya_Mymr",
1016
+ "Mbay (Latin)": "myb_Latn",
1017
+ "Myene (Latin)": "myk_Latn",
1018
+ "Erzya (Cyrillic)": "myv_Cyrl",
1019
+ "Masa (Chad) (Latin)": "myx_Latn",
1020
+ "Macuna (Latin)": "myy_Latn",
1021
+ "Santa María Zacatepec Mixtec (Latin)": "mza_Latn",
1022
+ "Berber languages (Latin)": "mzi_Latn",
1023
+ "Mazatlán Mixe (Latin)": "mzj_Latn",
1024
+ "Mazatlán Mixe (Latin)": "mzk_Latn",
1025
+ "Mazatlán Mixe (Latin)": "mzl_Latn",
1026
+ "Mumuye (Latin)": "mzm_Latn",
1027
+ "Manado Malay (Latin)": "mzw_Latn",
1028
+ "Nimanbur (Latin)": "nab_Latn",
1029
+ "Naga languages (Latin)": "nag_Latn",
1030
+ "Nalik (Latin)": "nal_Latn",
1031
+ "Min Nan Chinese (Latin)": "nan_Latn",
1032
+ "Neapolitan (Latin)": "nap_Latn",
1033
+ "Coatepec Nahuatl (Latin)": "nas_Latn",
1034
+ "Nawuri (Latin)": "naw_Latn",
1035
+ "Nyemba (Latin)": "nbh_Latn",
1036
+ "Chang Naga (Latin)": "nca_Latn",
1037
+ "Notsi (Latin)": "ncf_Latn",
1038
+ "Central Huasteca Nahuatl (Latin)": "nch_Latn",
1039
+ "Central Puebla Nahuatl (Latin)": "ncj_Latn",
1040
+ "Michoacán Nahuatl (Latin)": "ncl_Latn",
1041
+ "N eko (Latin)": "nco_Latn",
1042
+ "Nahuatl languages (Latin)": "ncu_Latn",
1043
+ "Morelos Nahuatl (Latin)": "ncx_Latn",
1044
+ "Ndogo (Latin)": "ndi_Latn",
1045
+ "Ndjuká (Latin)": "ndj_Latn",
1046
+ "Ndonga (Latin)": "ndo_Latn",
1047
+ "Ndo (Latin)": "ndp_Latn",
1048
+ "Ndut (Latin)": "ndv_Latn",
1049
+ "Lutos (Latin)": "ndy_Latn",
1050
+ "Ndogo (Latin)": "ndz_Latn",
1051
+ "Toura (Côte d'Ivoire) (Latin)": "neb_Latn",
1052
+ "Nepali (Devanagari)": "nep_Deva",
1053
+ "Newari (Devanagari)": "new_Deva",
1054
+ "Ngbaka'ma'bo (Latin)": "nfa_Latn",
1055
+ "Nefamese (Latin)": "nfr_Latn",
1056
+ "Ngad'a (Latin)": "nga_Latn",
1057
+ "Ngemba (Latin)": "ngi_Latn",
1058
+ "Lomwe (Latin)": "ngl_Latn",
1059
+ "Ngulu (Latin)": "ngp_Latn",
1060
+ "Guerrero Nahuatl (Latin)": "ngu_Latn",
1061
+ "Eastern Huasteca Nahuatl (Latin)": "nhe_Latn",
1062
+ "Ngiyambaa (Latin)": "nhg_Latn",
1063
+ "Zacatlán-Ahuacatlán-Tepetzintla Nahuatl (Latin)": "nhi_Latn",
1064
+ "Nahari (Latin)": "nhn_Latn",
1065
+ "Tetelcingo Nahuatl (Latin)": "nhq_Latn",
1066
+ "Orizaba Nahuatl (Latin)": "nhu_Latn",
1067
+ "Western Huasteca Nahuatl (Latin)": "nhw_Latn",
1068
+ "Tabasco Nahuatl (Latin)": "nhx_Latn",
1069
+ "Ometepec Nahuatl (Latin)": "nhy_Latn",
1070
+ "Nias (Latin)": "nia_Latn",
1071
+ "Ngaju (Latin)": "nij_Latn",
1072
+ "Nimi (Latin)": "nim_Latn",
1073
+ "Ninzo (Latin)": "nin_Latn",
1074
+ "Nganasan (Latin)": "nja_Latn",
1075
+ "Nkonya (Latin)": "nko_Latn",
1076
+ "Ngombale (Latin)": "nla_Latn",
1077
+ "Ná-Meo (Latin)": "nlc_Latn",
1078
+ "Dutch (Latin)": "nld_Latn",
1079
+ "Gela (Latin)": "nlg_Latn",
1080
+ "Ninia Yali (Latin)": "nlk_Latn",
1081
+ "Orizaba Nahuatl (Latin)": "nlv_Latn",
1082
+ "Nyamwezi (Latin)": "nmg_Latn",
1083
+ "Nyamwezi (Latin)": "nmz_Latn",
1084
+ "Norwegian Nynorsk (Latin)": "nnb_Latn",
1085
+ "Ngiemboon (Latin)": "nnh_Latn",
1086
+ "Ngen (Latin)": "nnq_Latn",
1087
+ "Nuni (Latin)": "nnw_Latn",
1088
+ "Nocamán (Latin)": "noa_Latn",
1089
+ "Norwegian Bokmål (Latin)": "nob_Latn",
1090
+ "Northern Thai (Thai)": "nod_Thai",
1091
+ "Nimadi (Devanagari)": "noe_Deva",
1092
+ "Nogai (Cyrillic)": "nog_Cyrl",
1093
+ "Nomatsiguenga (Latin)": "not_Latn",
1094
+ "Nupoid languages (Latin)": "npl_Latn",
1095
+ "Napu (Latin)": "npy_Latn",
1096
+ "Northern Sotho (Latin)": "nso_Latn",
1097
+ "Nisenan (Latin)": "nst_Latn",
1098
+ "Nisu (Latin)": "nsu_Latn",
1099
+ "Naga languages (Latin)": "ntm_Latn",
1100
+ "Ntrubo (Latin)": "ntr_Latn",
1101
+ "Nobsalan (Latin)": "nuj_Latn",
1102
+ "Nung (Viet Nam) (Latin)": "nup_Latn",
1103
+ "Nuer (Latin)": "nus_Latn",
1104
+ "Nuu-chah-nulth (Latin)": "nuz_Latn",
1105
+ "Nyabwa (Latin)": "nwb_Latn",
1106
+ "Naxi (Latin)": "nxq_Latn",
1107
+ "Nyanja (Latin)": "nya_Latn",
1108
+ "Nyanga-li (Latin)": "nyf_Latn",
1109
+ "Nyankole (Latin)": "nyn_Latn",
1110
+ "Nyoro (Latin)": "nyo_Latn",
1111
+ "Nyulnyul (Latin)": "nyu_Latn",
1112
+ "Nyulnyul (Latin)": "nyy_Latn",
1113
+ "Nzima (Latin)": "nzi_Latn",
1114
+ "Obo Manobo (Latin)": "obo_Latn",
1115
+ "Occitan (post 1500) (Latin)": "oci_Latn",
1116
+ "Ormuri (Arabic)": "odk_Arab",
1117
+ "Odual (Latin)": "odu_Latn",
1118
+ "Ogoniland (Latin)": "ogo_Latn",
1119
+ "Ojibwa (Canadian Aboriginal Syllabics)": "ojb_Cans",
1120
+ "Ojibwa (Latin)": "ojb_Latn",
1121
+ "Oku (Latin)": "oku_Latn",
1122
+ "Mochi (Latin)": "old_Latn",
1123
+ "Omejes (Latin)": "omw_Latn",
1124
+ "Obo Manobo (Latin)": "onb_Latn",
1125
+ "Tohono O'odham (Latin)": "ood_Latn",
1126
+ "Oroqen (Latin)": "orc_Latn",
1127
+ "Oromo (Latin)": "orm_Latn",
1128
+ "Ormuri (Arabic)": "oru_Arab",
1129
+ "Oriya (Oriya)": "ory_Orya",
1130
+ "Ossetian (Cyrillic)": "oss_Cyrl",
1131
+ "Otomi (Latin)": "ote_Latn",
1132
+ "Otomi (Latin)": "otq_Latn",
1133
+ "Old Turkish (Latin)": "ozm_Latn",
1134
+ "Páez (Latin)": "pab_Latn",
1135
+ "Pareci (Latin)": "pad_Latn",
1136
+ "Pangasinan (Latin)": "pag_Latn",
1137
+ "Pampanga (Latin)": "pam_Latn",
1138
+ "Panjabi (Gurmukhi)": "pan_Guru",
1139
+ "Northern Paiute (Latin)": "pao_Latn",
1140
+ "Papiamento (Latin)": "pap_Latn",
1141
+ "Palauan (Latin)": "pau_Latn",
1142
+ "Pangwa (Latin)": "pbb_Latn",
1143
+ "Patamona (Latin)": "pbc_Latn",
1144
+ "Mezontla Popoloca (Latin)": "pbi_Latn",
1145
+ "Parkwa (Latin)": "pbs_Latn",
1146
+ "Southern Pashto (Arabic)": "pbt_Arab",
1147
+ "Northern Pashto (Arabic)": "pbu_Arab",
1148
+ "Ruching Palaung (Thai)": "pce_Thai",
1149
+ "Nigerian Pidgin (Latin)": "pcm_Latn",
1150
+ "Pardhan (Latin)": "pex_Latn",
1151
+ "Eastern Pomo (Latin)": "pez_Latn",
1152
+ "Pahi (Arabic)": "phl_Arab",
1153
+ "Phuan (Arabic)": "phr_Arab",
1154
+ "Pima Bajo (Latin)": "pib_Latn",
1155
+ "Yinjtjiparnti (Latin)": "pil_Latn",
1156
+ "Piapoco (Latin)": "pip_Latn",
1157
+ "Piratapuyo (Latin)": "pir_Latn",
1158
+ "Pijin (Latin)": "pis_Latn",
1159
+ "Pitta Pitta (Latin)": "piy_Latn",
1160
+ "Pijao (Latin)": "pjt_Latn",
1161
+ "Pokomo (Latin)": "pkb_Latn",
1162
+ "Pökoot (Latin)": "pko_Latn",
1163
+ "Shwe Palaung (Arabic)": "plk_Arab",
1164
+ "Central Pame (Latin)": "pls_Latn",
1165
+ "Malagasy, Plateau (Latin)": "plt_Latn",
1166
+ "Polonombauk (Latin)": "plw_Latn",
1167
+ "Piemontese (Latin)": "pmf_Latn",
1168
+ "Piemontese (Latin)": "pmq_Latn",
1169
+ "Piemontese (Latin)": "pms_Latn",
1170
+ "Pamona (Latin)": "pmy_Latn",
1171
+ "Western Panjabi (Arabic)": "pnb_Arab",
1172
+ "Penesak (Latin)": "pne_Latn",
1173
+ "Pinyin (Latin)": "pny_Latn",
1174
+ "Ponares (Latin)": "poc_Latn",
1175
+ "Poqomam (Latin)": "poe_Latn",
1176
+ "Poqomchi' (Latin)": "poh_Latn",
1177
+ "Pokangá (Latin)": "poi_Latn",
1178
+ "Polish (Latin)": "pol_Latn",
1179
+ "Portuguese (Latin)": "por_Latn",
1180
+ "Pémono (Latin)": "pov_Latn",
1181
+ "Puelche (Latin)": "pow_Latn",
1182
+ "Puelche (Latin)": "poy_Latn",
1183
+ "Paipai (Latin)": "ppk_Latn",
1184
+ "San Luís Temalacayuca Popoloca (Latin)": "pps_Latn",
1185
+ "Pa'o (Latin)": "prf_Latn",
1186
+ "Parauk (Latin)": "prk_Latn",
1187
+ "Parsi-Dari (Latin)": "prq_Latn",
1188
+ "Phai (Thai)": "prt_Thai",
1189
+ "Pai Tavytera (Latin)": "pse_Latn",
1190
+ "Kaulong (Latin)": "pss_Latn",
1191
+ "Central Pashto (Arabic)": "pst_Arab",
1192
+ "Patuá (Latin)": "ptu_Latn",
1193
+ "Punan Merap (Latin)": "pua_Latn",
1194
+ "Punan Merap (Latin)": "pui_Latn",
1195
+ "Pushto (Arabic)": "pus_Arab",
1196
+ "Pangwali (Latin)": "pwg_Latn",
1197
+ "Paiwan (Latin)": "pwn_Latn",
1198
+ "Pwo Western Karen (Thai)": "pww_Thai",
1199
+ "Quetzaltepec Mixe (Latin)": "pxm_Latn",
1200
+ "Bikol (Latin)": "qub_Latn",
1201
+ "K'iche' (Latin)": "quc_Latn",
1202
+ "Lambayeque Quechua (Latin)": "quf_Latn",
1203
+ "Chimborazo Highland Quichua (Latin)": "qug_Latn",
1204
+ "South Bolivian Quechua (Latin)": "quh_Latn",
1205
+ "North Bolivian Quechua (Latin)": "qul_Latn",
1206
+ "Sipacapense (Latin)": "qum_Latn",
1207
+ "Panao Huánuco Quechua (Latin)": "qup_Latn",
1208
+ "Yanahuanca Pasco Quechua (Latin)": "qur_Latn",
1209
+ "Southern Pastaza Quechua (Latin)": "qus_Latn",
1210
+ "Quechua (Latin)": "quv_Latn",
1211
+ "Quechua (Latin)": "quw_Latn",
1212
+ "Quechua (Latin)": "qux_Latn",
1213
+ "Ayacucho Quechua (Latin)": "quy_Latn",
1214
+ "Cusco Quechua (Latin)": "quz_Latn",
1215
+ "Ambo-Pasco Quechua (Latin)": "qva_Latn",
1216
+ "Cajamarca Quechua (Latin)": "qvc_Latn",
1217
+ "Eastern Apurímac Quechua (Latin)": "qve_Latn",
1218
+ "Huallaga Huánuco Quechua (Latin)": "qvh_Latn",
1219
+ "Imbabura Highland Quichua (Latin)": "qvi_Latn",
1220
+ "Loja Highland Quichua (Latin)": "qvj_Latn",
1221
+ "Cajatambo North Lima Quechua (Latin)": "qvl_Latn",
1222
+ "Margos-Yarowilca-Lauricocha Quechua (Latin)": "qvm_Latn",
1223
+ "North Junín Quechua (Latin)": "qvn_Latn",
1224
+ "Napo Lowland Quechua (Latin)": "qvo_Latn",
1225
+ "San Martín Quechua (Latin)": "qvs_Latn",
1226
+ "Huaylla Wanca Quechua (Latin)": "qvw_Latn",
1227
+ "Yauyos Quechua (Latin)": "qvz_Latn",
1228
+ "Corongo Ancash Quechua (Latin)": "qwa_Latn",
1229
+ "Huaylas Ancash Quechua (Latin)": "qwh_Latn",
1230
+ "Sihuas Ancash Quechua (Latin)": "qws_Latn",
1231
+ "Chiquián Ancash Quechua (Latin)": "qxa_Latn",
1232
+ "Southern Conchucos Ancash Quechua (Latin)": "qxh_Latn",
1233
+ "Northern Conchucos Ancash Quechua (Latin)": "qxl_Latn",
1234
+ "Puno Quechua (Latin)": "qxn_Latn",
1235
+ "Southern Pastaza Quechua (Latin)": "qxo_Latn",
1236
+ "Puno Quechua (Latin)": "qxp_Latn",
1237
+ "Pacaraos Quechua (Latin)": "qxr_Latn",
1238
+ "Santa Ana de Tusi Pasco Quechua (Latin)": "qxt_Latn",
1239
+ "Arequipa-La Unión Quechua (Latin)": "qxu_Latn",
1240
+ "Jauja Wanca Quechua (Latin)": "qxw_Latn",
1241
+ "Rāga (Latin)": "rag_Latn",
1242
+ "Rahambuu (Bengali)": "rah_Beng",
1243
+ "Ramoaaina (Latin)": "rai_Latn",
1244
+ "Rapa Nui (Latin)": "rap_Latn",
1245
+ "Rawang (Devanagari)": "rav_Deva",
1246
+ "Rawang (Latin)": "raw_Latn",
1247
+ "Rejang (Latin)": "rej_Latn",
1248
+ "Rendille (Latin)": "rel_Latn",
1249
+ "Raguile (Latin)": "rgu_Latn",
1250
+ "Rohingya (Latin)": "rhg_Latn",
1251
+ "Tarifit (Arabic)": "rif_Arab",
1252
+ "Tarifit (Latin)": "rif_Latn",
1253
+ "Riang (India) (Latin)": "rim_Latn",
1254
+ "Riang (India) (Devanagari)": "rjs_Deva",
1255
+ "Rangpuri (Bengali)": "rkt_Beng",
1256
+ "Carpathian Romani (Cyrillic)": "rmc_Cyrl",
1257
+ "Carpathian Romani (Latin)": "rmc_Latn",
1258
+ "Traveller Norwegian (Latin)": "rmo_Latn",
1259
+ "Romany (Cyrillic)": "rmy_Cyrl",
1260
+ "Romany (Latin)": "rmy_Latn",
1261
+ "Roon (Latin)": "rng_Latn",
1262
+ "Roon (Latin)": "rnl_Latn",
1263
+ "Tae' (Latin)": "rob_Latn",
1264
+ "Rombo (Latin)": "rof_Latn",
1265
+ "Romansh (Latin, surs1244)": "roh_Latn_surs1244",
1266
+ "Romblomanon (Latin)": "rol_Latn",
1267
+ "Romanian (Latin)": "ron_Latn",
1268
+ "Rongga (Latin)": "roo_Latn",
1269
+ "Kriol (Latin)": "rop_Latn",
1270
+ "Rotokas (Latin)": "rro_Latn",
1271
+ "Rathawi (Latin)": "rth_Latn",
1272
+ "Rusyn (Latin)": "rub_Latn",
1273
+ "Ruuli (Latin)": "ruc_Latn",
1274
+ "Rufiji (Latin)": "ruf_Latn",
1275
+ "Ruga (Latin)": "rug_Latn",
1276
+ "Rundi (Latin)": "run_Latn",
1277
+ "Russian (Cyrillic)": "rus_Cyrl",
1278
+ "Mbwela (Latin)": "rwm_Latn",
1279
+ "Marwari (India) (Devanagari)": "rwr_Deva",
1280
+ "Saba (Latin)": "sab_Latn",
1281
+ "Sango (Latin)": "sag_Latn",
1282
+ "Yakut (Cyrillic)": "sah_Cyrl",
1283
+ "Sahu (Latin)": "saj_Latn",
1284
+ "Samburu (Latin)": "saq_Latn",
1285
+ "Sasak (Latin)": "sas_Latn",
1286
+ "Sause (Latin)": "sau_Latn",
1287
+ "Sayula Popoluca (Latin)": "say_Latn",
1288
+ "Ngambay (Latin)": "sba_Latn",
1289
+ "Simbo (Latin)": "sbd_Latn",
1290
+ "Sagala (Latin)": "sbl_Latn",
1291
+ "Sindhi Bhil (Arabic)": "sbn_Arab",
1292
+ "Sangu (Tanzania) (Latin)": "sbp_Latn",
1293
+ "Sangu (Gabon) (Latin)": "sch_Latn",
1294
+ "Sadri (Devanagari)": "sck_Deva",
1295
+ "Shina (Arabic)": "scl_Arab",
1296
+ "Sicilian (Latin)": "scn_Latn",
1297
+ "Scots (Latin)": "sco_Latn",
1298
+ "Sandawe (Latin)": "sda_Latn",
1299
+ "Sardo-logudorese (Latin)": "sdo_Latn",
1300
+ "Semai (Latin)": "sea_Latn",
1301
+ "Sena (Latin)": "seh_Latn",
1302
+ "Sena (Latin)": "sei_Latn",
1303
+ "Serrano (Latin)": "ses_Latn",
1304
+ "Serrano (Latin)": "sey_Latn",
1305
+ "Sangu (Gabon) (Latin)": "sgb_Latn",
1306
+ "Surgujia (Devanagari)": "sgj_Deva",
1307
+ "Suri (Ethiopic)": "sgw_Ethi",
1308
+ "Tachelhit (Latin)": "shi_Latn",
1309
+ "Sheko (Latin)": "shk_Latn",
1310
+ "Shan (Myanmar)": "shn_Mymr",
1311
+ "Shanga (Latin)": "sho_Latn",
1312
+ "Sala (Latin)": "shp_Latn",
1313
+ "Sidamo (Latin)": "sid_Latn",
1314
+ "Serrano (Latin)": "sig_Latn",
1315
+ "Tumulung Sisaala (Latin)": "sil_Latn",
1316
+ "Sinhala (Sinhala)": "sin_Sinh",
1317
+ "Sikkimese (Tibetan)": "sip_Tibt",
1318
+ "Siwa (Latin)": "siw_Latn",
1319
+ "Soli (Latin)": "sja_Latn",
1320
+ "Simaa (Latin)": "sjm_Latn",
1321
+ "Surjapuri (Devanagari)": "sjp_Deva",
1322
+ "Siar-Lak (Latin)": "sjr_Latn",
1323
+ "Seke (Vanuatu) (Latin)": "skg_Latn",
1324
+ "Saraiki (Arabic)": "skr_Arab",
1325
+ "Sáliba (Latin)": "sld_Latn",
1326
+ "Slovak (Latin)": "slk_Latn",
1327
+ "Selaru (Latin)": "slu_Latn",
1328
+ "Slovenian (Latin)": "slv_Latn",
1329
+ "Sama (Latin)": "sml_Latn",
1330
+ "Samoan (Latin)": "smo_Latn",
1331
+ "Shona (Latin)": "sna_Latn",
1332
+ "Sanga (Nigeria) (Latin)": "snc_Latn",
1333
+ "Sindhi (Arabic)": "snd_Arab",
1334
+ "Bau Bidayuh (Latin)": "sne_Latn",
1335
+ "Soninke (Latin)": "snk_Latn",
1336
+ "Siona (Latin)": "snn_Latn",
1337
+ "Siane (Latin)": "snp_Latn",
1338
+ "Sauk (Latin)": "snv_Latn",
1339
+ "Sauk (Latin)": "snw_Latn",
1340
+ "Solos (Latin)": "sol_Latn",
1341
+ "Somali (Latin)": "som_Latn",
1342
+ "Songe (Latin)": "soy_Latn",
1343
+ "Spanish (Latin)": "spa_Latn",
1344
+ "Sian (Latin)": "spp_Latn",
1345
+ "Saponi (Latin)": "sps_Latn",
1346
+ "Sabaot (Latin)": "spy_Latn",
1347
+ "Sardinian (Latin)": "src_Latn",
1348
+ "Sardinian (Latin)": "srd_Latn",
1349
+ "Sera (Latin)": "sri_Latn",
1350
+ "Saramaccan (Latin)": "srm_Latn",
1351
+ "Sranan Tongo (Latin)": "srn_Latn",
1352
+ "Sarsuti (Latin)": "sro_Latn",
1353
+ "Serbian (Cyrillic)": "srp_Cyrl",
1354
+ "Serer (Latin)": "srr_Latn",
1355
+ "Seraiki (Devanagari)": "srx_Deva",
1356
+ "Siri (Arabic)": "ssi_Arab",
1357
+ "Seta (Latin)": "ste_Latn",
1358
+ "Sateré-Mawé (Latin)": "stn_Latn",
1359
+ "Stieng (Latin)": "stp_Latn",
1360
+ "Sua (Latin)": "sua_Latn",
1361
+ "Suku (Latin)": "suc_Latn",
1362
+ "Sukuma (Latin)": "suk_Latn",
1363
+ "Sundanese (Latin)": "sun_Latn",
1364
+ "Suri (Latin)": "sur_Latn",
1365
+ "Susu (Latin)": "sus_Latn",
1366
+ "Susu (Latin)": "suv_Latn",
1367
+ "Sunwar (Devanagari)": "suz_Deva",
1368
+ "Svan (Georgian)": "sva_Geor",
1369
+ "Swedish (Latin)": "swe_Latn",
1370
+ "Swahili (macrolanguage) (Latin)": "swh_Latn",
1371
+ "Seraiki (Devanagari)": "swv_Deva",
1372
+ "Sumbwa (Latin)": "sxb_Latn",
1373
+ "Sicanian (Latin)": "sxn_Latn",
1374
+ "Sighu (Latin)": "sya_Latn",
1375
+ "Sylheti (Latin)": "syl_Latn",
1376
+ "Saurashtra (Latin)": "sza_Latn",
1377
+ "Saurashtra (Latin)": "szy_Latn",
1378
+ "Tuma-Irumu (Latin)": "tac_Latn",
1379
+ "Tajio (Devanagari)": "taj_Deva",
1380
+ "Tamil (Tamil)": "tam_Taml",
1381
+ "Tana (Latin)": "tan_Latn",
1382
+ "Tangale (Latin)": "tao_Latn",
1383
+ "Taabwa (Latin)": "tap_Latn",
1384
+ "Tarahumara (Latin)": "taq_Latn",
1385
+ "Central Tarahumara (Latin)": "tar_Latn",
1386
+ "Tatar (Cyrillic)": "tat_Cyrl",
1387
+ "Tatuyo (Latin)": "tav_Latn",
1388
+ "Tay (Latin)": "tay_Latn",
1389
+ "Taliabu (Latin)": "tbc_Latn",
1390
+ "Kbo (Latin)": "tbf_Latn",
1391
+ "Tairora (Latin)": "tbg_Latn",
1392
+ "Tboli (Latin)": "tbk_Latn",
1393
+ "Tboli (Latin)": "tbl_Latn",
1394
+ "Tagbu (Latin)": "tby_Latn",
1395
+ "Ditammari (Latin)": "tbz_Latn",
1396
+ "Ticuna (Latin)": "tca_Latn",
1397
+ "Datooga (Latin)": "tcc_Latn",
1398
+ "Malagasy, Tsimihety (Latin)": "tcf_Latn",
1399
+ "Tulu (Malayalam)": "tcy_Mlym",
1400
+ "Are'are (Latin)": "tcz_Latn",
1401
+ "Tidong (Latin)": "tdj_Latn",
1402
+ "Tandaganon (Latin)": "tdn_Latn",
1403
+ "Tandroy-Mahafaly Malagasy (Latin)": "tdx_Latn",
1404
+ "Tepo Krumen (Latin)": "ted_Latn",
1405
+ "Teressa (Latin)": "tee_Latn",
1406
+ "Telugu (Telugu)": "tel_Telu",
1407
+ "Timne (Latin)": "tem_Latn",
1408
+ "Teso (Latin)": "teo_Latn",
1409
+ "Teso (Latin)": "ter_Latn",
1410
+ "Tewa (USA) (Latin)": "tew_Latn",
1411
+ "Tennet (Latin)": "tex_Latn",
1412
+ "Terik (Latin)": "tfr_Latn",
1413
+ "Ternate (Latin)": "tgc_Latn",
1414
+ "Togoyo (Latin)": "tgj_Latn",
1415
+ "Tajik (Cyrillic)": "tgk_Cyrl",
1416
+ "Tagalog (Latin)": "tgl_Latn",
1417
+ "Togoyo (Latin)": "tgo_Latn",
1418
+ "Togoyo (Latin)": "tgp_Latn",
1419
+ "Thai (Thai)": "tha_Thai",
1420
+ "Tharu (Devanagari)": "the_Deva",
1421
+ "Tho (Latin)": "thk_Latn",
1422
+ "Tharu (Devanagari)": "thl_Deva",
1423
+ "Tharu (Devanagari)": "thq_Deva",
1424
+ "Tharu (Devanagari)": "thr_Deva",
1425
+ "Thangmi (Tifinagh)": "thv_Tfng",
1426
+ "Tigre (Ethiopic)": "tig_Ethi",
1427
+ "Timugon Murut (Latin)": "tih_Latn",
1428
+ "Tii (Latin)": "tik_Latn",
1429
+ "Tillamook (Latin)": "tio_Latn",
1430
+ "Tigrinya (Ethiopic)": "tir_Ethi",
1431
+ "Masaka (Latin)": "tkg_Latn",
1432
+ "Tukumanféd (Latin)": "tkr_Latn",
1433
+ "Takpa (Devanagari)": "tkt_Deva",
1434
+ "Tobo-Kube (Latin)": "tlb_Latn",
1435
+ "Tlingit (Latin)": "tli_Latn",
1436
+ "Talysh (Latin)": "tlj_Latn",
1437
+ "Taloki (Latin)": "tlp_Latn",
1438
+ "Talysh (Latin)": "tly_Latn",
1439
+ "Tumak (Latin)": "tmc_Latn",
1440
+ "Toba-Maskoy (Latin)": "tmf_Latn",
1441
+ "Tasmate (Latin)": "tna_Latn",
1442
+ "Tonga (Nyasa) (Latin)": "tng_Latn",
1443
+ "Tenis (Latin)": "tnk_Latn",
1444
+ "Tonsawang (Latin)": "tnn_Latn",
1445
+ "Tontemboan (Latin)": "tnp_Latn",
1446
+ "Ménik (Latin)": "tnr_Latn",
1447
+ "Tenino (Latin)": "tnt_Latn",
1448
+ "Toba (Latin)": "tob_Latn",
1449
+ "Coyutla Totonac (Latin)": "toc_Latn",
1450
+ "Toma (Latin)": "toh_Latn",
1451
+ "Toki Pona (Latin)": "tok_Latn",
1452
+ "Tomini (Latin)": "tom_Latn",
1453
+ "Xicotepec De Juárez Totonac (Latin)": "top_Latn",
1454
+ "Tukumanféd (Latin)": "tos_Latn",
1455
+ "Tok Pisin (Latin)": "tpi_Latn",
1456
+ "Tukumanféd (Latin)": "tpl_Latn",
1457
+ "Tampulma (Latin)": "tpm_Latn",
1458
+ "Tukumanféd (Latin)": "tpp_Latn",
1459
+ "Tukumanféd (Latin)": "tpt_Latn",
1460
+ "Tukumanféd (Latin)": "tpz_Latn",
1461
+ "Tukumanféd (Latin)": "tqp_Latn",
1462
+ "Trio (Latin)": "trc_Latn",
1463
+ "Turi (Latin)": "tri_Latn",
1464
+ "Torona (Latin)": "trn_Latn",
1465
+ "Trumai (Latin)": "trp_Latn",
1466
+ "Tregami (Latin)": "trq_Latn",
1467
+ "Tirahi (Latin)": "trs_Latn",
1468
+ "Trukhmen (Latin)": "trv_Latn",
1469
+ "Torwali (Arabic)": "trw_Arab",
1470
+ "Tswana (Latin)": "tsn_Latn",
1471
+ "Tsonga (Latin)": "tso_Latn",
1472
+ "Tsuvan (Latin)": "tsz_Latn",
1473
+ "Tswa (Latin)": "ttc_Latn",
1474
+ "Tutelo (Latin)": "tte_Latn",
1475
+ "Tooro (Latin)": "ttj_Latn",
1476
+ "Tawallammat Tamajaq (Tifinagh)": "ttq_Tfng",
1477
+ "Tutoro (Latin)": "ttr_Latn",
1478
+ "Wotu (Latin)": "ttu_Latn",
1479
+ "Tübatulabal (Latin)": "tue_Latn",
1480
+ "Tübatulabal (Latin)": "tuf_Latn",
1481
+ "Tugutil (Latin)": "tui_Latn",
1482
+ "Turkmen (Arabic)": "tuk_Arab",
1483
+ "Turkmen (Latin)": "tuk_Latn",
1484
+ "Tula (Latin)": "tul_Latn",
1485
+ "Tumbuka (Latin)": "tuo_Latn",
1486
+ "Tedaga (Latin)": "tuq_Latn",
1487
+ "Turkish (Latin)": "tur_Latn",
1488
+ "Tuxináwa (Latin)": "tuv_Latn",
1489
+ "Tuxináwa (Latin)": "tuy_Latn",
1490
+ "Tungus languages (Latin)": "tvo_Latn",
1491
+ "Tungus languages (Latin)": "tvu_Latn",
1492
+ "Tungus languages (Latin)": "tvw_Latn",
1493
+ "Tawbuid (Latin)": "twb_Latn",
1494
+ "Twents (Latin)": "twe_Latn",
1495
+ "Tungus languages (Latin)": "twu_Latn",
1496
+ "Tewe (Latin)": "txa_Latn",
1497
+ "Tombonuo (Latin)": "txq_Latn",
1498
+ "Tartessian (Latin)": "txs_Latn",
1499
+ "Kayapó (Latin)": "txu_Latn",
1500
+ "Tanosy Malagasy (Latin)": "txy_Latn",
1501
+ "Tauya (Latin)": "tye_Latn",
1502
+ "Tzeltal (Latin)": "tzh_Latn",
1503
+ "Tz'utujil (Latin)": "tzj_Latn",
1504
+ "Tzotzil (Latin)": "tzo_Latn",
1505
+ "Ubi (Latin)": "ubl_Latn",
1506
+ "Ubang (Latin)": "ubu_Latn",
1507
+ "Ujir (Latin)": "udl_Latn",
1508
+ "Udmurt (Cyrillic)": "udm_Cyrl",
1509
+ "Uduk (Latin)": "udu_Latn",
1510
+ "Uighur (Arabic)": "uig_Arab",
1511
+ "Uighur (Cyrillic)": "uig_Cyrl",
1512
+ "Ukuriguma (Oriya)": "uki_Orya",
1513
+ "Ukrainian (Cyrillic)": "ukr_Cyrl",
1514
+ "Ukuriguma (Latin)": "ukv_Latn",
1515
+ "Umbundu (Latin)": "umb_Latn",
1516
+ "Uripiv-Wala-Rano-Atchin (Latin)": "upv_Latn",
1517
+ "Ura (Vanuatu) (Latin)": "ura_Latn",
1518
+ "Urubú-Kaapor (Latin)": "urb_Latn",
1519
+ "Urdu (Arabic)": "urd_Arab",
1520
+ "Urdu (Devanagari)": "urd_Deva",
1521
+ "Urdu (Latin)": "urd_Latn",
1522
+ "Urhobo (Latin)": "urh_Latn",
1523
+ "Urak Lawoi' (Thai)": "urk_Thai",
1524
+ "Urat (Latin)": "urt_Latn",
1525
+ "Uru (Latin)": "ury_Latn",
1526
+ "Ushojo (Arabic)": "ush_Arab",
1527
+ "Uspanteco (Latin)": "usp_Latn",
1528
+ "Uzbek (Cyrillic)": "uzb_Cyrl",
1529
+ "Uzbek (Latin)": "uzb_Latn",
1530
+ "Northern Uzbek (Latin)": "uzn_Latn",
1531
+ "Vagla (Latin)": "vag_Latn",
1532
+ "Varhadi-Nagpuri (Devanagari)": "vah_Deva",
1533
+ "Vehes (Latin)": "vai_Latn",
1534
+ "Varli (Latin)": "var_Latn",
1535
+ "Veluws (Latin)": "ver_Latn",
1536
+ "Vinde (Latin)": "vid_Latn",
1537
+ "Vietnamese (Latin)": "vie_Latn",
1538
+ "Vili (Latin)": "vif_Latn",
1539
+ "Viemo (Latin)": "vmc_Latn",
1540
+ "Juxtlahuaca Mixtec (Latin)": "vmj_Latn",
1541
+ "Mitlatongo Mixtec (Latin)": "vmm_Latn",
1542
+ "Soyaltepec Mazatec (Latin)": "vmp_Latn",
1543
+ "Makhuwa (Latin)": "vmw_Latn",
1544
+ "Soyaltepec Mazatec (Latin)": "vmy_Latn",
1545
+ "Soyaltepec Mazatec (Latin)": "vmz_Latn",
1546
+ "Võro (Latin)": "vro_Latn",
1547
+ "Vunjo (Latin)": "vun_Latn",
1548
+ "Vute (Latin)": "vut_Latn",
1549
+ "Wolaytta (Ethiopic)": "wal_Ethi",
1550
+ "Wolaytta (Latin)": "wal_Latn",
1551
+ "Wapishana (Latin)": "wap_Latn",
1552
+ "Waray (Philippines) (Latin)": "war_Latn",
1553
+ "Walla Walla (Latin)": "waw_Latn",
1554
+ "Wayana (Latin)": "way_Latn",
1555
+ "Warao (Latin)": "wba_Latn",
1556
+ "Wakhi (Latin)": "wbl_Latn",
1557
+ "Wagdi (Devanagari)": "wbr_Deva",
1558
+ "Waci Gbe (Latin)": "wci_Latn",
1559
+ "Wè Western (Latin)": "weo_Latn",
1560
+ "Wewaw (Latin)": "wes_Latn",
1561
+ "Wajan (Latin)": "wja_Latn",
1562
+ "Warji (Latin)": "wji_Latn",
1563
+ "Walloon (Latin)": "wlo_Latn",
1564
+ "Wolio (Latin)": "wlx_Latn",
1565
+ "Womo (Latin)": "wmw_Latn",
1566
+ "Wobé (Latin)": "wob_Latn",
1567
+ "Wolof (Latin)": "wof_Latn",
1568
+ "Wolof (Latin)": "wol_Latn",
1569
+ "Wagdi (Telugu)": "wsg_Telu",
1570
+ "Wassa (Latin)": "wwa_Latn",
1571
+ "Kalmyk (Cyrillic)": "xal_Cyrl",
1572
+ "Kayan Mahakam (Latin)": "xdy_Latn",
1573
+ "Xerénte (Latin)": "xed_Latn",
1574
+ "Xerénte (Latin)": "xer_Latn",
1575
+ "Khetrani (Arabic)": "xhe_Arab",
1576
+ "Xhosa (Latin)": "xho_Latn",
1577
+ "Kalkoti (Arabic)": "xka_Arab",
1578
+ "Kalkoti (Latin)": "xkl_Latn",
1579
+ "Mingrelian (Georgian)": "xmf_Geor",
1580
+ "Malay (macrolanguage), Malaccan (Latin)": "xmm_Latn",
1581
+ "Mean (Latin)": "xmv_Latn",
1582
+ "Kenyan Sign Language (Latin)": "xnj_Latn",
1583
+ "Kanjar (Devanagari)": "xnr_Deva",
1584
+ "Xhosa (Latin)": "xog_Latn",
1585
+ "Komo (Sudan) (Latin)": "xon_Latn",
1586
+ "Kpelle (Latin)": "xpe_Latn",
1587
+ "Karahawyana (Latin)": "xrb_Latn",
1588
+ "Samberigi (Latin)": "xsb_Latn",
1589
+ "Samberigi (Latin)": "xsm_Latn",
1590
+ "Sherpa (Devanagari)": "xsr_Deva",
1591
+ "Sukur (Latin)": "xsu_Latn",
1592
+ "Alcozauca Mixtec (Latin)": "xta_Latn",
1593
+ "Diuxi-Tilantongo Mixtec (Latin)": "xtd_Latn",
1594
+ "Ketengban (Latin)": "xte_Latn",
1595
+ "Sino-Tibetan languages (Latin)": "xti_Latn",
1596
+ "Tidaá Mixtec (Latin)": "xtm_Latn",
1597
+ "Diuxi-Tilantongo Mixtec (Latin)": "xtn_Latn",
1598
+ "Cuyamecalco Mixtec (Latin)": "xtu_Latn",
1599
+ "Alcozauca Mixtec (Tamil)": "xua_Taml",
1600
+ "Kuo (Latin)": "xuo_Latn",
1601
+ "Yaminahua (Latin)": "yaa_Latn",
1602
+ "Yagua (Latin)": "yad_Latn",
1603
+ "Yalunka (Latin)": "yal_Latn",
1604
+ "Yamba (Latin)": "yam_Latn",
1605
+ "Yao (Latin)": "yao_Latn",
1606
+ "Yagua (Latin)": "yaq_Latn",
1607
+ "Yagua (Latin)": "yas_Latn",
1608
+ "Yagua (Latin)": "yat_Latn",
1609
+ "Yavanawa (Latin)": "yav_Latn",
1610
+ "Yei (Latin)": "yay_Latn",
1611
+ "Yazgulyam (Latin)": "yaz_Latn",
1612
+ "Yala (Latin)": "yba_Latn",
1613
+ "Yemba (Latin)": "ybb_Latn",
1614
+ "Yucatec Maya Sign Language (Latin)": "ycl_Latn",
1615
+ "Yucuna (Latin)": "ycn_Latn",
1616
+ "Yiddish (Hebrew)": "ydd_Hebr",
1617
+ "Yidgha (Arabic)": "ydg_Arab",
1618
+ "Yennu (Malayalam)": "yea_Mlym",
1619
+ "Yenisei Say (Latin)": "yer_Latn",
1620
+ "Yeskwa (Latin)": "yes_Latn",
1621
+ "Yaka (Congo) (Latin)": "yka_Latn",
1622
+ "Yalo (Latin)": "yli_Latn",
1623
+ "Yoruba (Latin)": "yor_Latn",
1624
+ "Yarí (Latin)": "yre_Latn",
1625
+ "Yucateco (Latin)": "yua_Latn",
1626
+ "Yue Chinese (Han)": "yue_Hans",
1627
+ "Yue Chinese (Han)": "yue_Hant",
1628
+ "Yuracare (Latin)": "yuz_Latn",
1629
+ "Yawa (Latin)": "yva_Latn",
1630
+ "Zapotec (Latin)": "zaa_Latn",
1631
+ "Zapotec (Latin)": "zab_Latn",
1632
+ "Ocotlán Zapotec (Latin)": "zac_Latn",
1633
+ "Cajonos Zapotec (Latin)": "zad_Latn",
1634
+ "Yareni Zapotec (Latin)": "zae_Latn",
1635
+ "Isthmus Zapotec (Latin)": "zai_Latn",
1636
+ "Miahuatlán Zapotec (Latin)": "zam_Latn",
1637
+ "Ozolotepec Zapotec (Latin)": "zao_Latn",
1638
+ "Aloápam Zapotec (Latin)": "zaq_Latn",
1639
+ "Rincón Zapotec (Latin)": "zar_Latn",
1640
+ "Santo Domingo Albarradas Zapotec (Latin)": "zas_Latn",
1641
+ "Yatzachi Zapotec (Latin)": "zav_Latn",
1642
+ "Zay (Latin)": "zaw_Latn",
1643
+ "Choapan Zapotec (Latin)": "zca_Latn",
1644
+ "Zhigulevsk (Latin)": "zga_Latn",
1645
+ "Zimza (Latin)": "zim_Latn",
1646
+ "Zinza (Latin)": "ziw_Latn",
1647
+ "Zialo (Latin)": "zmz_Latn",
1648
+ "Zande (macrolanguage) (Latin)": "zne_Latn",
1649
+ "Zoque (Latin)": "zoc_Latn",
1650
+ "Zoque (Latin)": "zoh_Latn",
1651
+ "Zoque (Latin)": "zor_Latn",
1652
+ "Zoque (Latin)": "zos_Latn",
1653
+ "Coatecas Altas Zapotec (Latin)": "zpc_Latn",
1654
+ "Guevea De Humboldt Zapotec (Latin)": "zpg_Latn",
1655
+ "Santa María Quiegolani Zapotec (Latin)": "zpi_Latn",
1656
+ "Lachixío Zapotec (Latin)": "zpl_Latn",
1657
+ "Mixtepec Zapotec (Latin)": "zpm_Latn",
1658
+ "Choapan Zapotec (Latin)": "zpo_Latn",
1659
+ "El Alto Zapotec (Latin)": "zpt_Latn",
1660
+ "San Vicente Coatlán Zapotec (Latin)": "zpv_Latn",
1661
+ "Chichicapan Zapotec (Latin)": "zpy_Latn",
1662
+ "Mazaltepec Zapotec (Latin)": "zpz_Latn",
1663
+ "Standard Malay (Latin)": "zsm_Latn",
1664
+ "Tlacolulita Zapotec (Latin)": "ztg_Latn",
1665
+ "Tataltepec Zapotec (Latin)": "ztn_Latn",
1666
+ "Tilquiapan Zapotec (Latin)": "ztp_Latn",
1667
+ "Quiavicuzas Zapotec (Latin)": "ztq_Latn",
1668
+ "Samo (Latin)": "zts_Latn",
1669
+ "Samo (Latin)": "ztu_Latn",
1670
+ "Yalálag Zapotec (Latin)": "zty_Latn",
1671
+ "Zulu (Latin)": "zul_Latn",
1672
+ "Yongbei Zhuang (Latin)": "zyb_Latn",
1673
+ "Yongbei Zhuang (Latin)": "zyp_Latn",
1674
+ "Zhuang (Latin)": "zza_Latn"
1675
+ }
server/media_transcription_processor.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Media Transcription Processor
3
+
4
+ Pipeline-focused transcription processor that maintains state through processing stages
5
+ while exposing intermediate results for flexibility and ensuring proper resource cleanup.
6
+ """
7
+
8
+ import base64
9
+ import logging
10
+ import os
11
+ from typing import Dict, List, Optional
12
+
13
+ import numpy as np
14
+ import torch
15
+ from audio_transcription import transcribe_full_audio_with_chunking
16
+ from convert_media_to_wav import convert_media_to_wav_from_bytes
17
+ from inference.audio_reading_tools import wav_to_bytes
18
+ from transcription_status import transcription_status
19
+
20
+
21
class MediaTranscriptionProcessor:
    """
    Pipeline-focused transcription processor.

    Maintains state through the processing stages (media conversion ->
    chunked transcription -> result assembly) while exposing intermediate
    results for flexibility and ensuring temporary resources are cleaned up.
    Usable as a context manager so cleanup always runs.
    """

    # Maximum duration (in seconds) before a running transcription is
    # considered stuck and force-finished by get_server_status().
    # float("inf") disables the timeout entirely (useful for long meetings);
    # set e.g. 120 or 3600 to re-enable it.
    MAX_TRANSCRIPTION_DURATION = float("inf")

    def __init__(self, media_bytes: bytes, filename: str, language_with_script: Optional[str] = None):
        """Initialize processor with raw media data and metadata.

        Args:
            media_bytes: Raw bytes of the uploaded media file.
            filename: Original filename (used for logging and status only).
            language_with_script: Optional language code such as "eng_Latn",
                passed through to the transcription pipeline.
        """
        # Core input data
        self.media_bytes = media_bytes
        self.original_filename = filename
        self.language_with_script = language_with_script

        # Processing state - lazily populated by the pipeline stages.
        self._temp_wav_path: Optional[str] = None
        self._audio_tensor: Optional[torch.Tensor] = None
        self._audio_numpy: Optional[np.ndarray] = None
        self._sample_rate: int = 16000  # pipeline standardizes audio to 16 kHz
        self._duration: Optional[float] = None
        self._chunks: Optional[List] = None
        self._transcription_results: Optional[Dict] = None
        self._error: Optional[str] = None

        # Resource tracking for cleanup
        self._temp_files: List[str] = []
        self._cleanup_performed = False

        # Transcription status management
        self._status_initialized = False

    def start_transcription(self):
        """Initialize transcription status tracking (idempotent)."""
        if not self._status_initialized:
            transcription_status.start_transcription("transcribe", self.original_filename)
            self._status_initialized = True

    def update_progress(self, progress: float):
        """Update transcription progress (0.0 - 1.0) in the shared status tracker."""
        transcription_status.update_progress(progress)

    @staticmethod
    def is_server_busy() -> bool:
        """
        Check if the server is currently busy with another transcription.

        Delegates to get_server_status(), which includes timeout handling:
        a transcription running longer than MAX_TRANSCRIPTION_DURATION is
        force-finished before the busy flag is reported.
        """
        status = MediaTranscriptionProcessor.get_server_status()
        return status.get("is_busy", False)

    @staticmethod
    def get_server_status() -> dict:
        """
        Get current server transcription status with timeout handling.

        If a transcription has been running longer than
        MAX_TRANSCRIPTION_DURATION, it is force-finished to prevent the
        server from being stuck in a busy state indefinitely; the returned
        status then carries "force_finished" and "reason" keys.
        """
        status = transcription_status.get_status()

        # Check if the current transcription has exceeded the allowed duration.
        if (status.get("is_busy", False) and
                status.get("duration_seconds", 0) > MediaTranscriptionProcessor.MAX_TRANSCRIPTION_DURATION):

            logger = logging.getLogger(__name__)
            logger.warning(
                f"Force-finishing stuck transcription after {status.get('duration_seconds', 0):.1f}s "
                f"(max: {MediaTranscriptionProcessor.MAX_TRANSCRIPTION_DURATION}s). "
                f"Operation: {status.get('current_operation')}, "
                f"File: {status.get('current_filename')}"
            )

            # Force finish the transcription so the server becomes available again.
            transcription_status.finish_transcription()

            # Get updated status and annotate why it was reset.
            status = transcription_status.get_status()
            status["force_finished"] = True
            status["reason"] = f"Transcription exceeded maximum duration of {MediaTranscriptionProcessor.MAX_TRANSCRIPTION_DURATION}s"

        return status

    def convert_media(self) -> 'MediaTranscriptionProcessor':
        """
        Stage 1: Convert media bytes to a standardized WAV file + audio tensor.

        Idempotent: returns immediately if conversion already happened.

        Returns:
            Self for method chaining.

        Raises:
            RuntimeError: With a user-friendly message when conversion fails.
        """
        if self._temp_wav_path is not None:
            # Already converted
            return self

        logger = logging.getLogger(__name__)
        logger.info(f"Converting media file: {self.original_filename}")

        # Update progress if status is initialized
        if self._status_initialized:
            self.update_progress(0.1)

        try:
            # Convert media bytes to WAV and tensor
            temp_wav_path, audio_tensor = convert_media_to_wav_from_bytes(
                self.media_bytes, self.original_filename
            )

            # Store results and track the temp file for later cleanup
            self._temp_wav_path = temp_wav_path
            self._audio_tensor = audio_tensor
            self._temp_files.append(temp_wav_path)

            # Calculate duration from tensor (assumes 1-D tensor of samples
            # at self._sample_rate — TODO confirm against converter output)
            if audio_tensor is not None:
                self._duration = len(audio_tensor) / self._sample_rate

            # BUGFIX: guard the f-string — _duration is None when the
            # converter returned no tensor, and None:.2f raises TypeError.
            if self._duration is not None:
                logger.info(f"Media conversion completed: {self.original_filename} -> {self._duration:.2f}s")
            else:
                logger.info(f"Media conversion completed: {self.original_filename} (duration unknown)")

            # Update progress if status is initialized
            if self._status_initialized:
                self.update_progress(0.2)

        except Exception as e:
            logger.error(f"Media conversion failed for {self.original_filename}: {str(e)}")

            # Provide user-friendly error message based on the error type
            if "ffmpeg returned error code" in str(e).lower():
                error_msg = (
                    f"Audio/video conversion failed for '{self.original_filename}'. "
                    f"The file may have an unsupported audio codec or be corrupted. "
                    f"Please try converting the file to a standard format (MP3, WAV, MP4) before uploading. "
                    f"For best results, use files with common codecs: "
                    f"Audio - AAC, MP3, PCM, FLAC; Video - H.264/AAC (MP4), standard codecs. "
                    f"Avoid proprietary, DRM-protected, or very old codec variants."
                )
            else:
                error_msg = f"Failed to process media file '{self.original_filename}'"

            error_msg += f"\nTechnical Details: {str(e)}"

            # Store the error for later retrieval
            self._error = error_msg
            raise RuntimeError(error_msg)

        return self

    def get_wav_path(self) -> str:
        """Get the temporary WAV file path (converts media if needed)."""
        if self._temp_wav_path is None:
            self.convert_media()
        return self._temp_wav_path

    def get_audio_tensor(self) -> torch.Tensor:
        """Get the standardized audio tensor (converts media if needed)."""
        if self._audio_tensor is None:
            self.convert_media()
        return self._audio_tensor

    def get_audio_numpy(self) -> np.ndarray:
        """Get audio as a numpy array (converted from the tensor on demand)."""
        if self._audio_numpy is None:
            tensor = self.get_audio_tensor()
            if tensor is not None:
                # Convert to numpy; .cpu() first when the tensor lives on a device.
                if hasattr(tensor, 'cpu'):
                    self._audio_numpy = tensor.cpu().numpy()
                else:
                    self._audio_numpy = tensor.numpy()
            else:
                self._audio_numpy = np.array([])
        return self._audio_numpy

    @property
    def duration(self) -> float:
        """Audio duration in seconds (0.0 when it could not be determined)."""
        if self._duration is None:
            self.convert_media()
        return self._duration or 0.0

    @property
    def sample_rate(self) -> int:
        """Audio sample rate in Hz (fixed at 16000 by the pipeline)."""
        return self._sample_rate

    def transcribe_full_pipeline(self) -> 'MediaTranscriptionProcessor':
        """
        Stage 2: Run the complete transcription pipeline with chunking.

        Idempotent: returns immediately if results already exist.

        Returns:
            Self for method chaining.
        """
        if self._transcription_results is not None:
            # Already transcribed
            return self

        logger = logging.getLogger(__name__)

        # Ensure media is converted (side effect populates the tensor too)
        wav_path = self.get_wav_path()

        logger.info(f"Starting transcription pipeline for: {self.original_filename}")

        # Use the preprocessed audio tensor instead of re-reading the WAV file
        audio_tensor = self.get_audio_tensor()

        # Run the full transcription with chunking using the tensor
        self._transcription_results = transcribe_full_audio_with_chunking(
            audio_tensor=audio_tensor,
            sample_rate=self._sample_rate,
            language_with_script=self.language_with_script,
        )

        logger.info(f"Transcription completed: {self._transcription_results.get('num_chunks', 0)} chunks")

        # Update progress if status is initialized
        if self._status_initialized:
            self.update_progress(0.9)

        return self

    def get_results(self, include_preprocessed_audio: bool = False) -> Dict:
        """
        Get final transcription results (runs the pipeline if needed).

        Args:
            include_preprocessed_audio: Whether to include base64-encoded
                preprocessed WAV data under the "preprocessed_audio" key.

        Returns:
            Complete transcription results dictionary, optionally with
            preprocessed audio attached.
        """
        if self._transcription_results is None:
            self.transcribe_full_pipeline()

        results = self._transcription_results or {}

        # Add preprocessed audio data if requested
        if include_preprocessed_audio and self._audio_tensor is not None:
            try:
                # Convert the preprocessed tensor to WAV bytes
                audio_tensor_cpu = self._audio_tensor.cpu() if self._audio_tensor.is_cuda else self._audio_tensor
                wav_bytes = wav_to_bytes(audio_tensor_cpu, sample_rate=self._sample_rate, format="wav")

                # Encode as base64.  NOTE(review): assumes wav_to_bytes
                # returns a buffer exposing .tobytes() (e.g. a numpy array),
                # not plain bytes — confirm against its implementation.
                audio_data_b64 = base64.b64encode(wav_bytes.tobytes()).decode('utf-8')

                results["preprocessed_audio"] = {
                    "data": audio_data_b64,
                    "format": "wav",
                    "sample_rate": self._sample_rate,
                    "duration": self.duration,
                    "size_bytes": len(wav_bytes)
                }

                logging.getLogger(__name__).info(f"Added preprocessed audio data: {len(wav_bytes)} bytes")

            except Exception as e:
                # Best-effort: failure to attach audio must not lose the transcript.
                logging.getLogger(__name__).warning(f"Failed to include preprocessed audio data: {e}")

        return results

    def cleanup(self):
        """Clean up all temporary files and resources (idempotent)."""
        if self._cleanup_performed:
            return

        logger = logging.getLogger(__name__)

        # Clean up temporary files
        for temp_file in self._temp_files:
            try:
                if os.path.exists(temp_file):
                    os.unlink(temp_file)
                    logger.debug(f"Cleaned up temp file: {temp_file}")
            except Exception as e:
                logger.warning(f"Failed to clean up temp file {temp_file}: {e}")

        # Finish transcription status - always call to ensure we don't get stuck.
        # It's better to be safe than risk leaving the server in a busy state.
        transcription_status.finish_transcription()
        self._status_initialized = False

        # Clear references to help garbage collection
        self._audio_tensor = None
        self._audio_numpy = None
        self._transcription_results = None
        self._chunks = None
        self._temp_files.clear()

        self._cleanup_performed = True
        logger.debug(f"Cleanup completed for: {self.original_filename}")

    def __enter__(self) -> 'MediaTranscriptionProcessor':
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit - ensures cleanup."""
        self.cleanup()

    def __del__(self):
        """Destructor - final cleanup attempt.

        BUGFIX: uses getattr with a safe default so a partially constructed
        instance (e.g. __init__ interrupted before _cleanup_performed was
        set) does not raise AttributeError during garbage collection.
        """
        if not getattr(self, "_cleanup_performed", True):
            self.cleanup()
server/requirements.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ flask==3.0.0
2
+ flask-cors==4.0.0
3
+ gunicorn==21.2.0
4
+
5
+ # Audio processing
6
+ torchaudio<=2.8.0
7
+ torchcodec
8
+ librosa==0.10.1
9
+ soundfile==0.12.1
10
+ audioread>=3.0.0
11
+ pydub>=0.25.1
12
+
13
+ # VAD and audio chunking
14
+ silero-vad>=4.0.0
15
+ onnxruntime>=1.12.0
16
+
17
+ # Text processing
18
+ uroman
19
+
20
+ # Data structures and utilities
21
+ # dataclasses  # NOTE: part of the stdlib since Python 3.7; the PyPI backport is only for 3.6 and can shadow/break the built-in module on newer versions
22
+ pandas
23
+ xxhash
24
+ requests==2.31.0
server/subtitle.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
def convert_time_to_srt_format(seconds):
    """Render a duration in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    whole = int(seconds)
    ms = round((seconds - whole) * 1000)
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)

    # round() can yield a full second of milliseconds; ripple the carry up
    # through seconds, minutes and hours.
    if ms == 1000:
        ms, s = 0, s + 1
    if s == 60:
        s, m = 0, m + 1
    if m == 60:
        m, h = 0, h + 1

    return "{:02}:{:02}:{:02},{:03}".format(h, m, s, ms)
19
+
20
def word_level_srt(words_timestamp, srt_path="word_level_subtitle.srt", shorts=False):
    """Write an SRT file to *srt_path* with one word per subtitle entry."""
    strip_punct = re.compile(r'[.,!?;:"\–—_~^+*|]')
    with open(srt_path, 'w', encoding='utf-8') as out:
        index = 1
        for entry in words_timestamp:
            begin = convert_time_to_srt_format(entry['start'])
            finish = convert_time_to_srt_format(entry['end'])
            text = strip_punct.sub('', entry['word'])
            # A standalone "i" is always capitalised.
            if text.strip().lower() == 'i':
                text = "I"
            # Hyphens are preserved only in "shorts" mode.
            if not shorts:
                text = text.replace("-", "")
            out.write("{}\n{} --> {}\n{}\n\n".format(index, begin, finish, text))
            index += 1
31
+
32
+
33
+
34
def split_line_by_char_limit(text, max_chars_per_line=38):
    """Greedily wrap *text* into lines no longer than *max_chars_per_line*.

    A single word longer than the limit still occupies its own line.
    """
    lines = []
    buffer = ""
    for token in text.split():
        if not buffer:
            buffer = token
        elif len(buffer) + 1 + len(token) <= max_chars_per_line:
            buffer = buffer + " " + token
        else:
            lines.append(buffer)
            buffer = token
    if buffer:
        lines.append(buffer)
    return lines
50
+
51
def merge_punctuation_glitches(subtitles):
    """Fix punctuation artifacts that straddle subtitle boundaries.

    Leading punctuation on an entry is moved onto the previous entry,
    stray quote/colon characters are stripped, and entries reduced to
    pure punctuation are merged (time-wise) into their predecessor.
    Entries are modified in place; the filtered list is returned.
    """
    if not subtitles:
        return []

    leading_punct = re.compile(r'^([,.:;!?]+)(\s*)(.+)')
    pure_punct = re.compile(r'[.,!?]+')
    result = [subtitles[0]]

    for entry in subtitles[1:]:
        previous = result[-1]
        prev_text = previous["text"].rstrip()
        text = entry["text"].lstrip()

        # Glue leading punctuation onto the previous entry, unless it
        # already ends with one of those characters.
        hit = leading_punct.match(text)
        if hit:
            punct, _, remainder = hit.groups()
            if not prev_text.endswith(tuple(punct)):
                previous["text"] = prev_text + punct
                text = remainder.strip()

        # Strip quotes and stray colons/semicolons from the current entry.
        for ch in ('"', '“', '”', ';', ':'):
            text = text.replace(ch, '')
        text = text.strip()

        # An entry that is now empty or pure punctuation just extends the
        # previous entry's end time instead of appearing on its own.
        if not text or pure_punct.fullmatch(text):
            previous["end"] = entry["end"]
            continue

        entry["text"] = text
        previous["text"] = previous["text"].replace('"', '').replace('“', '').replace('”', '')
        result.append(entry)

    return result
85
+
86
import json
def write_sentence_srt(
    word_level_timestamps, output_file="subtitles_professional.srt", max_lines=2,
    max_duration_s=7.0, max_chars_per_line=38, hard_pause_threshold=0.5,
    merge_pause_threshold=0.4
):
    """Creates professional-grade SRT files and a corresponding timestamp.json file.

    Groups word-level timestamps into subtitle entries in three phases:
    drafting (timing/length rules), orphan merging, and punctuation cleanup.
    Writes the SRT to ``output_file`` and a sibling ``.json`` file mapping each
    subtitle index to its text, times, and per-word timings.

    Args:
        word_level_timestamps: list of dicts with 'word', 'start', 'end' keys
            (start/end in seconds).
        output_file: path of the SRT file to write.
        max_lines: maximum rendered lines per subtitle entry.
        max_duration_s: maximum on-screen duration of a single entry.
        max_chars_per_line: wrap width used by split_line_by_char_limit.
        hard_pause_threshold: gap (seconds) that forces a new entry.
        merge_pause_threshold: max gap under which a one-word entry is merged
            back into the previous entry.

    Returns:
        Path of the generated JSON file, or None if the input is empty.
    """
    if not word_level_timestamps:
        return

    # Phase 1: Generate draft subtitles based on timing and length rules
    draft_subtitles = []
    i = 0
    while i < len(word_level_timestamps):
        start_time = word_level_timestamps[i]["start"]

        # We'll now store the full word objects, not just the text,
        # so per-word timings survive into the JSON output.
        current_word_objects = []

        j = i
        while j < len(word_level_timestamps):
            entry = word_level_timestamps[j]

            # Create potential text from the word objects
            potential_words = [w["word"] for w in current_word_objects] + [entry["word"]]
            potential_text = " ".join(potential_words)

            # Stop growing the entry if it would exceed the line or duration budget.
            if len(split_line_by_char_limit(potential_text, max_chars_per_line)) > max_lines: break
            if (entry["end"] - start_time) > max_duration_s and current_word_objects: break

            if j > i:
                prev_entry = word_level_timestamps[j-1]
                pause = entry["start"] - prev_entry["end"]
                # A long silence or sentence-final punctuation ends the entry.
                if pause >= hard_pause_threshold: break
                if prev_entry["word"].endswith(('.','!','?')): break

            # Append the full word object
            current_word_objects.append(entry)
            j += 1

        # Guarantee forward progress: an over-budget single word still becomes
        # its own entry rather than looping forever.
        if not current_word_objects:
            current_word_objects.append(word_level_timestamps[i])
            j = i + 1

        text = " ".join([w["word"] for w in current_word_objects])
        end_time = word_level_timestamps[j - 1]["end"]

        # Include the list of word objects in our draft subtitle
        draft_subtitles.append({
            "start": start_time,
            "end": end_time,
            "text": text,
            "words": current_word_objects
        })
        i = j

    # Phase 2: Post-process to merge single-word "orphan" subtitles
    if not draft_subtitles: return
    final_subtitles = [draft_subtitles[0]]
    for k in range(1, len(draft_subtitles)):
        prev_sub = final_subtitles[-1]
        current_sub = draft_subtitles[k]
        is_orphan = len(current_sub["text"].split()) == 1
        pause_from_prev = current_sub["start"] - prev_sub["end"]

        # Merge an orphan into the previous entry only when the gap is short
        # and the merged text still fits within the line budget.
        if is_orphan and pause_from_prev < merge_pause_threshold:
            merged_text = prev_sub["text"] + " " + current_sub["text"]
            if len(split_line_by_char_limit(merged_text, max_chars_per_line)) <= max_lines:
                prev_sub["text"] = merged_text
                prev_sub["end"] = current_sub["end"]

                # Merge the word-level data as well
                prev_sub["words"].extend(current_sub["words"])
                continue

        final_subtitles.append(current_sub)

    final_subtitles = merge_punctuation_glitches(final_subtitles)

    # This dictionary will hold the data for our JSON file
    timestamps_data = {}

    # Phase 3: Write the final SRT file (and prepare JSON data)
    with open(output_file, "w", encoding="utf-8") as f:
        for idx, sub in enumerate(final_subtitles, start=1):
            # --- SRT Writing (Unchanged) ---
            text = sub["text"].replace(" ,", ",").replace(" .", ".")
            formatted_lines = split_line_by_char_limit(text, max_chars_per_line)
            start_time_str = convert_time_to_srt_format(sub['start'])
            end_time_str = convert_time_to_srt_format(sub['end'])

            f.write(f"{idx}\n")
            f.write(f"{start_time_str} --> {end_time_str}\n")
            f.write("\n".join(formatted_lines) + "\n\n")

            # Create the list of word dictionaries for the current subtitle
            word_data = []
            for word_obj in sub["words"]:
                word_data.append({
                    "word": word_obj["word"],
                    "start": convert_time_to_srt_format(word_obj["start"]),
                    "end": convert_time_to_srt_format(word_obj["end"])
                })

            # Add the complete entry to our main dictionary
            timestamps_data[str(idx)] = {
                "text": "\n".join(formatted_lines),
                "start": start_time_str,
                "end": end_time_str,
                "words": word_data
            }

    # Write the collected data to the JSON file
    json_output_file = output_file.replace(".srt",".json")
    with open(json_output_file, "w", encoding="utf-8") as f_json:
        json.dump(timestamps_data, f_json, indent=4, ensure_ascii=False)

    # print(f"Successfully generated SRT file: {output_file}")
    # print(f"Successfully generated JSON file: {json_output_file}")
    return json_output_file
206
def make_subtitle(word_level_timestamps, file_path):
    """Generate word-level, sentence-level and shorts-style SRT files.

    All files are written under ./subtitles/ using the media file's basename.

    Args:
        word_level_timestamps: list of dicts with 'word', 'start', 'end' keys.
        file_path: source media path; only its basename is used for naming.

    Returns:
        Tuple of (sentence_srt_path, word_srt_path, shorts_srt_path).
    """
    os.makedirs("./subtitles/", exist_ok=True)
    base = os.path.splitext(os.path.basename(file_path))[0]

    word_path = f"./subtitles/{base}_subtitle_words.srt"
    sentence_path = f"./subtitles/{base}_subtitle_sentences.srt"
    shorts_path = f"./subtitles/{base}_subtitle_shorts.srt"

    # One entry per word (karaoke-style captions).
    word_level_srt(word_level_timestamps, srt_path=word_path, shorts=False)

    # Standard two-line sentence captions.
    write_sentence_srt(
        word_level_timestamps,
        output_file=sentence_path,
        max_lines=2,
        max_duration_s=7.0,
        max_chars_per_line=38,
        hard_pause_threshold=0.5,
        merge_pause_threshold=0.4,
    )

    # Compact single-line captions for vertical/short-form video.
    write_sentence_srt(
        word_level_timestamps,
        output_file=shorts_path,
        max_lines=1,
        max_duration_s=2.0,
        max_chars_per_line=17,
    )

    return sentence_path, word_path, shorts_path
server/transcription_status.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import threading
3
+ from datetime import datetime
4
+ from typing import Dict
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
class TranscriptionStatus:
    """Thread-safe tracker for the server's single in-flight transcription."""

    def __init__(self):
        self.is_busy = False            # True while a transcription is running
        self.current_operation = None   # operation type label of the current run
        self.current_filename = None    # file being processed, when known
        self.started_at = None          # datetime when the current run began
        self.progress = 0.0             # completion fraction in [0.0, 1.0]
        self.lock = threading.Lock()    # guards all mutable state above
        self.total_completed = 0        # count of finished transcriptions

    def start_transcription(self, operation_type: str, filename: str = None):
        """Mark transcription as started"""
        with self.lock:
            self.is_busy = True
            self.current_operation = operation_type
            self.current_filename = filename
            self.started_at = datetime.now()
            self.progress = 0.0
            logger.info(f"Started {operation_type} transcription for {filename or 'unknown file'}")

    def update_progress(self, progress: float):
        """Update transcription progress (0.0 to 1.0)"""
        with self.lock:
            # Clamp out-of-range values instead of rejecting them.
            self.progress = min(1.0, max(0.0, progress))

    def finish_transcription(self):
        """Mark transcription as finished"""
        with self.lock:
            self.is_busy = False
            self.current_operation = None
            self.current_filename = None
            self.started_at = None
            self.progress = 0.0
            self.total_completed += 1
            logger.info("Transcription finished")

    def get_status(self) -> Dict:
        """Get current status for API response"""
        with self.lock:
            status = {"is_busy": self.is_busy, "total_completed": self.total_completed}
            if self.is_busy:
                elapsed = 0
                if self.started_at:
                    elapsed = (datetime.now() - self.started_at).total_seconds()
                status["current_operation"] = self.current_operation
                status["current_filename"] = self.current_filename
                status["progress"] = self.progress
                status["duration_seconds"] = round(elapsed, 1)
            return status


# Global status instance
transcription_status = TranscriptionStatus()
server/transcriptions_blueprint.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import tempfile
5
+
6
+ import torch
7
+ from audio_transcription import perform_forced_alignment
8
+ from media_transcription_processor import MediaTranscriptionProcessor
9
+ from transcription_status import transcription_status
10
+ from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs
11
+
12
+ from env_vars import API_LOG_LEVEL, MODEL_NAME
13
+ from flask import Blueprint, jsonify, request, send_file
14
+ from video_utils import check_ffmpeg_available, combine_video_with_subtitles
15
+
16
# Flask blueprint grouping all transcription-related HTTP endpoints.
transcriptions_blueprint = Blueprint(
    "transcriptions_blueprint",
    __name__,
)

# Module logger; level is driven by the API_LOG_LEVEL environment setting.
logger = logging.getLogger(__name__)
logger.level = API_LOG_LEVEL
# Keep the AWS SDK loggers at the same level so they don't flood the output.
logging.getLogger("boto3").setLevel(API_LOG_LEVEL)
logging.getLogger("botocore").setLevel(API_LOG_LEVEL)

# NOTE(review): not referenced in this module's visible code — presumably a
# short-form duration cutoff used elsewhere; verify before removing.
MAX_SHORTFORM_DURATION = 10  # seconds
27
+
28
+
29
@transcriptions_blueprint.route("/health")
def health():
    """Comprehensive health check endpoint.

    Reports service liveness plus runtime capabilities: compute device,
    CUDA and FFmpeg availability, the current transcription status, and —
    when CUDA is present — GPU identity and memory usage. Flask serializes
    the returned dict to JSON.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    cuda_available = torch.cuda.is_available()
    ffmpeg_available = check_ffmpeg_available()

    # Get transcription status
    transcription_info = MediaTranscriptionProcessor.get_server_status()

    # Get GPU details if CUDA is available
    gpu_info = {}
    if cuda_available:
        gpu_info = {
            "gpu_count": torch.cuda.device_count(),
            "current_device": torch.cuda.current_device(),
            "gpu_name": (
                torch.cuda.get_device_name(0)
                if torch.cuda.device_count() > 0
                else "Unknown"
            ),
        }

        # Add GPU memory information; failures here are non-fatal — the
        # health check still answers, just without memory figures.
        try:
            current_device = torch.cuda.current_device()
            memory_allocated = torch.cuda.memory_allocated(current_device)
            memory_reserved = torch.cuda.memory_reserved(current_device)
            memory_total = torch.cuda.get_device_properties(current_device).total_memory

            # All values are reported in MiB, rounded to one decimal place.
            gpu_info.update(
                {
                    "gpu_memory_allocated_mb": round(memory_allocated / 1024 / 1024, 1),
                    "gpu_memory_reserved_mb": round(memory_reserved / 1024 / 1024, 1),
                    "gpu_memory_total_mb": round(memory_total / 1024 / 1024, 1),
                    "gpu_memory_free_mb": round(
                        (memory_total - memory_reserved) / 1024 / 1024, 1
                    ),
                }
            )
        except Exception as e:
            logger.warning(f"Could not get GPU memory info: {e}")

    return {
        "status": "healthy",
        "message": "MMS Transcription API is running",
        "version": "1.0.0",
        "service": "mms-transcription",
        "device": str(device),
        "cuda_available": cuda_available,
        "ffmpeg_available": ffmpeg_available,
        "transcription_status": transcription_info,
        **gpu_info,
    }
83
+
84
+
85
@transcriptions_blueprint.route("/supported-languages")
def get_supported_languages():
    """Return the list of language codes the ASR model can transcribe.

    Responds with {"supported_languages": [...]} or a 500 JSON error.
    """
    try:
        payload = {"supported_languages": supported_langs}
        return jsonify(payload)
    except Exception as e:
        logger.error(f"Error getting supported languages: {str(e)}")
        error_payload = {
            "error": "Could not retrieve supported languages",
            "message": str(e),
        }
        return jsonify(error_payload), 500
98
+
99
+
100
@transcriptions_blueprint.route("/status")
def get_transcription_status():
    """Return the server's current transcription status as JSON."""
    current = MediaTranscriptionProcessor.get_server_status()
    return jsonify(current)
104
+
105
+
106
@transcriptions_blueprint.route("/transcribe", methods=["POST"])
def transcribe_audio():
    """Transcribe media using the MMS model with intelligent chunking for all audio/video files.

    Expects multipart form data with a required "media" file. Optional fields:
    "language" (language-with-script code; auto-detected when absent) and
    "include_preprocessed" ("true" to echo back the preprocessed audio).
    Returns transcription JSON, 400 on missing input, 503 when busy, 500 on error.
    """
    try:
        # Check if server is busy — only one transcription runs at a time.
        if MediaTranscriptionProcessor.is_server_busy():
            status = MediaTranscriptionProcessor.get_server_status()
            return (
                jsonify(
                    {
                        "error": "Server is currently processing another transcription",
                        "status": "busy",
                        "current_operation": status.get("current_operation"),
                    }
                ),
                503,
            )

        # Check if media file is provided
        if "media" not in request.files:
            return jsonify({"error": "No media file provided"}), 400

        media_file = request.files["media"]
        if media_file.filename == "":
            return jsonify({"error": "No file selected"}), 400

        # Get optional language parameter
        language_with_script = request.form.get("language", None)

        if language_with_script:
            logger.info(f"Language specified: {language_with_script}")
        else:
            logger.info("No language specified, using auto-detection")

        # Get optional include_preprocessed parameter (from form data or query string)
        include_preprocessed = (
            request.form.get("include_preprocessed", "false").lower() == "true" or
            request.args.get("include_preprocessed", "false").lower() == "true"
        )
        if include_preprocessed:
            logger.info("Preprocessed audio will be included in response")

        # Mark as busy and start transcription
        # This will be handled by the processor

        # Read file bytes once; the processor works from the in-memory copy.
        media_bytes = media_file.read()

        try:
            # Use the MediaTranscriptionProcessor with context manager for automatic cleanup
            with MediaTranscriptionProcessor(media_bytes, media_file.filename, language_with_script) as processor:
                # Start transcription status tracking
                processor.start_transcription()

                # Stage 1: Convert media (this also calculates duration and updates progress)
                processor.convert_media()
                logger.info(f"Media conversion completed for: {media_file.filename}")

                # Stage 2: Run full transcription pipeline (this also updates progress)
                processor.transcribe_full_pipeline()

                # Get final results with optional preprocessed audio
                results = processor.get_results(include_preprocessed_audio=include_preprocessed)

                logger.info(f"Transcription completed: {results.get('num_chunks', 0)} chunks")

                # Format response — defaults keep the schema stable even when
                # the processor omits a key.
                response = {
                    "transcription": results.get("transcription", ""),
                    "aligned_segments": results.get("aligned_segments", []),
                    "chunks": results.get("chunks", []),
                    "total_duration": results.get("total_duration", 0.0),
                    "num_chunks": results.get("num_chunks", 0),
                    "num_segments": results.get("num_segments", 0),
                    "model": MODEL_NAME,
                    "device": str(torch.device("cuda:0" if torch.cuda.is_available() else "cpu")),
                    "status": results.get("status", "success"),
                }

                # Add preprocessed audio if it was included in results
                if "preprocessed_audio" in results:
                    response["preprocessed_audio"] = results["preprocessed_audio"]

                # A partial failure inside the pipeline surfaces as an "error"
                # key in results; report it as a 500 with the full payload.
                if "error" in results:
                    response["error"] = results["error"]
                    logger.error(f"Transcription response with error: {response}")
                    return jsonify(response), 500

                # Print out the complete response for debugging
                logger.info("=== TRANSCRIBE RESPONSE ===")
                # logger.info(f"Full response: {json.dumps(response, indent=2)}")
                logger.info("=== END TRANSCRIBE RESPONSE ===")

                return jsonify(response)
            # Context manager automatically handles cleanup and status finalization here

        except Exception as e:
            logger.error(f"Media conversion/transcription error: {str(e)}")
            return jsonify({"error": f"Media processing failed: {str(e)}"}), 500

    except Exception as e:
        logger.error(f"Transcription error: {str(e)}")
        return jsonify({"error": f"Transcription failed: {str(e)}"}), 500
209
+
210
+
211
@transcriptions_blueprint.route("/combine-video-subtitles", methods=["POST"])
def combine_video_subtitles():
    """Combine video with subtitles using FFmpeg.

    Expects multipart form data:
        video         -- the video file (required)
        subtitles     -- subtitle text content (required)
        format        -- subtitle format, "srt" (default) or "webvtt"
        output_format -- container, "mp4" (default) or "mkv"
        language      -- subtitle track language code (default "eng")

    Returns the muxed video as a file download, 400 on missing inputs,
    503 when the server is busy, 500 on processing failure.
    """
    try:
        # Check if server is busy
        if MediaTranscriptionProcessor.is_server_busy():
            status = MediaTranscriptionProcessor.get_server_status()
            return (
                jsonify(
                    {
                        "error": "Server is currently processing another request",
                        "status": "busy",
                        "current_operation": status.get("current_operation"),
                    }
                ),
                503,
            )

        # Check required fields
        if "video" not in request.files:
            return jsonify({"error": "No video file provided"}), 400

        if "subtitles" not in request.form:
            return jsonify({"error": "No subtitles provided"}), 400

        video_file = request.files["video"]
        subtitles = request.form["subtitles"]

        if video_file.filename == "":
            return jsonify({"error": "No video file selected"}), 400

        # Get optional parameters
        subtitle_format = request.form.get("format", "srt")  # srt or webvtt
        output_format = request.form.get("output_format", "mp4")  # mp4 or mkv
        language = request.form.get("language", "eng")

        # Mark as busy and start processing
        transcription_status.start_transcription("combine_video", video_file.filename)

        try:
            transcription_status.update_progress(0.1)

            # Persist the upload so FFmpeg can read it from disk.
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(video_file.filename)[1]) as temp_video:
                video_file.save(temp_video.name)
                temp_video_path = temp_video.name

            transcription_status.update_progress(0.3)

            try:
                # Combine video with subtitles using video_utils function
                output_path = combine_video_with_subtitles(
                    temp_video_path, subtitles, subtitle_format, output_format, language
                )

                transcription_status.update_progress(0.9)

                logger.info(f"Video combination completed: {output_path}")

                # Return the combined video file
                return send_file(
                    output_path,
                    as_attachment=True,
                    download_name=f"{video_file.filename.rsplit('.', 1)[0]}_with_subtitles.{output_format}",
                    mimetype=f"video/{output_format}",
                )

            finally:
                # Clean up temporary video file
                try:
                    os.unlink(temp_video_path)
                except OSError:
                    pass

        finally:
            # BUGFIX: finish exactly once. Previously the outer except handler
            # called finish_transcription() again after this finally had already
            # run, double-incrementing total_completed on failure — and a
            # validation error raised before start_transcription() would call
            # finish without a matching start.
            transcription_status.finish_transcription()

    except Exception as e:
        logger.error(f"Video combination error: {str(e)}")
        return jsonify({"error": f"Video combination failed: {str(e)}"}), 500
server/video_utils.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import subprocess
5
+ import tempfile
6
+ from pathlib import Path
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
def combine_video_with_subtitles(
    video_file_path: str,
    subtitle_content: str,
    subtitle_format: str = "srt",
    output_format: str = "mp4",
    language: str = "eng",
) -> str:
    """
    Combine video file with subtitle content using FFmpeg.

    The subtitle text is written to a temporary file, muxed into the video as
    a soft subtitle track (video/audio streams are copied, not re-encoded),
    and the temporary file is removed afterwards.

    Args:
        video_file_path: Path to the input video file
        subtitle_content: String content of the subtitles (SRT or WebVTT)
        subtitle_format: Format of subtitles ("srt" or "webvtt")
        output_format: Output container format ("mp4" or "mkv")
        language: Language code for subtitle track

    Returns:
        Path to the output video file with embedded subtitles
        (written next to the input as "<stem>_with_subtitles.<ext>").

    Raises:
        RuntimeError: if FFmpeg fails or is not installed.
    """

    # Create temporary files
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=f".{subtitle_format}", delete=False
    ) as sub_file:
        sub_file.write(subtitle_content)
        subtitle_file_path = sub_file.name

    # Generate output filename
    input_path = Path(video_file_path)
    output_path = (
        input_path.parent / f"{input_path.stem}_with_subtitles.{output_format}"
    )

    try:
        if output_format.lower() == "mkv":
            # MKV has better subtitle support: it can carry the subtitle
            # stream in its native codec without conversion.
            if subtitle_format.lower() == "webvtt":
                codec = "webvtt"
            else:
                codec = "srt"

            cmd = [
                "ffmpeg",
                "-y",  # -y to overwrite output file
                "-i",
                video_file_path,
                "-i",
                subtitle_file_path,
                "-c:v",
                "copy",  # Copy video stream
                "-c:a",
                "copy",  # Copy audio stream
                "-c:s",
                codec,  # Subtitle codec
                "-metadata:s:s:0",
                f"language={language}",
                str(output_path),
            ]
        else:
            # MP4 format: subtitles must be converted to mov_text, and the
            # streams are mapped explicitly from each input.
            cmd = [
                "ffmpeg",
                "-y",
                "-i",
                video_file_path,
                "-i",
                subtitle_file_path,
                "-c:v",
                "copy",  # Copy video stream
                "-c:a",
                "copy",  # Copy audio stream
                "-c:s:0",
                "mov_text",  # MP4 subtitle format
                "-map",
                "0:v",  # Map video from first input
                "-map",
                "0:a",  # Map audio from first input
                "-map",
                "1:s",  # Map subtitles from second input
                "-metadata:s:s:0",
                f"language={language}",
                "-disposition:s:0",
                "default",  # Make subtitles default
                str(output_path),
            ]

        # Execute FFmpeg command; check=True raises on non-zero exit.
        logger.info(f"Executing FFmpeg command: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)

        # Log FFmpeg output for debugging
        if result.stdout:
            logger.debug(f"FFmpeg stdout: {result.stdout}")
        if result.stderr:
            logger.debug(f"FFmpeg stderr: {result.stderr}")

        logger.info(f"FFmpeg completed successfully, output file: {output_path}")

        return str(output_path)

    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"FFmpeg failed: {e.stderr}")
    except FileNotFoundError:
        raise RuntimeError("FFmpeg not found. Please install FFmpeg.")
    finally:
        # Clean up temporary subtitle file
        try:
            os.unlink(subtitle_file_path)
        except OSError:
            pass
123
+
124
def check_ffmpeg_available() -> bool:
    """Return True when an `ffmpeg` binary can be executed on this system."""
    try:
        subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False
    return True
131
+
132
+
133
def extract_audio_from_video(video_file_path: str, output_audio_path: str = None) -> str:
    """
    Extract audio from video file using FFmpeg.

    The audio is written as 16 kHz mono 16-bit PCM WAV — the format expected
    by the speech-recognition pipeline.

    Args:
        video_file_path: Path to the input video file
        output_audio_path: Path for output audio file (optional; defaults to
            the video path with a .wav extension)

    Returns:
        Path to the extracted audio file

    Raises:
        RuntimeError: if FFmpeg is missing or the extraction fails.
    """
    if not check_ffmpeg_available():
        raise RuntimeError("FFmpeg not found. Please install FFmpeg.")

    # Generate output filename if not provided
    if output_audio_path is None:
        input_path = Path(video_file_path)
        output_audio_path = str(input_path.with_suffix('.wav'))

    try:
        # FFmpeg command to extract audio
        # -vn: disable video stream
        # -acodec pcm_s16le: use 16-bit PCM encoding
        # -ar 16000: set sample rate to 16kHz (optimal for speech recognition)
        # -ac 1: mono audio (single channel)
        cmd = [
            "ffmpeg",
            "-i", video_file_path,
            "-vn",  # No video
            "-acodec", "pcm_s16le",  # 16-bit PCM
            "-ar", "16000",  # 16kHz sample rate
            "-ac", "1",  # Mono
            "-y",  # Overwrite output file if it exists
            output_audio_path
        ]

        # check=True raises CalledProcessError on non-zero exit; the result
        # object itself was unused, so it is no longer captured.
        subprocess.run(cmd, capture_output=True, text=True, check=True)
        logger.info(f"Audio extracted successfully to: {output_audio_path}")
        return output_audio_path

    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"FFmpeg audio extraction failed: {e.stderr}")
    except FileNotFoundError:
        raise RuntimeError("FFmpeg not found. Please install FFmpeg.")
177
+
178
+
179
def get_video_info(video_file_path: str) -> dict:
    """Return ffprobe's format/stream metadata for a video, or {} on any failure."""
    probe_cmd = [
        "ffprobe",
        "-v",
        "quiet",
        "-print_format",
        "json",
        "-show_format",
        "-show_streams",
        video_file_path,
    ]
    try:
        completed = subprocess.run(probe_cmd, capture_output=True, text=True, check=True)
        return json.loads(completed.stdout)
    except (subprocess.CalledProcessError, FileNotFoundError, json.JSONDecodeError):
        # Missing ffprobe, a bad file, or unparseable output all degrade to {}.
        return {}