| import gradio as gr |
| import pandas as pd |
|
|
| |
| from data_loaders import ( |
| load_language_list, load_language_taxonomy, load_common_voice_data, |
| load_app_content, get_common_voice_stats |
| ) |
| from commercial_services import ( |
| fetch_azure_asr_languages, fetch_azure_tts_languages, |
| fetch_google_stt_languages, fetch_google_tts_languages, |
| fetch_aws_transcribe_languages, fetch_aws_polly_languages, |
| get_azure_locales_for_language, get_google_locales_for_language, |
| get_aws_locales_for_language, |
| check_elevenlabs_multilingual_v2_support, check_elevenlabs_turbo_v3_support |
| ) |
| from huggingface_search import ( |
| search_huggingface_models, search_huggingface_datasets, deduplicate_models |
| ) |
| from language_metadata import get_language_metadata_html, get_default_metadata_html |
|
|
| |
# --- Configuration ----------------------------------------------------------

# Local CSV with the ISO 639 language-code table (code <-> name mapping).
LANGUAGE_CODES_FILE = "language-codes-full.csv"
# Markdown file supplying the app title, description and "about" text.
APP_CONTENT_FILE = "app_content.md"
# Remote taxonomy file mapping languages to resource-level classes.
LANGUAGE_TAXONOMY_URL = "https://microsoft.github.io/linguisticdiversity/assets/lang2tax.txt"
# Snapshot of per-locale Mozilla Common Voice statistics.
COMMON_VOICE_DATA_FILE = "cv-corpus-24.0-2025-12-05.json"
# Human-readable Common Voice release label shown in the results card.
COMMON_VOICE_VERSION = "24.0 (2025-12-05)"


# --- Module-level state: populated once at startup (see bottom of file) -----

# Maps a language code to its info dict; search_language_resources() reads
# lang_info['name'] and lang_info['iso_639_1'] from each entry.
LANGUAGES = {}

# Language -> taxonomy classification; passed to get_language_metadata_html().
LANGUAGE_TAXONOMY = {}

# Common Voice per-locale stats; passed to get_common_voice_stats().
COMMON_VOICE_DATA = {}

# Display names for the 0-5 language-resource taxonomy levels
# (0 = least resourced ... 5 = most resourced, per the linguisticdiversity
# taxonomy referenced by LANGUAGE_TAXONOMY_URL).
# NOTE(review): not referenced elsewhere in this file — presumably consumed
# by the language_metadata module; confirm before removing.
TAXONOMY_LEVELS = {
    0: "The Left-Behinds",
    1: "The Scraping-Bys",
    2: "The Hopefuls",
    3: "The Rising Stars",
    4: "The Underdogs",
    5: "The Winners"
}

# Fallback app content; replaced by load_app_content() at startup.
APP_CONTENT = {
    "title": "Speech Resource Finder",
    "description": "Search for speech resources",
    "full_content": ""
}
|
|
| def search_language_resources(language_code, deduplicate=False): |
| """ |
| Search for ASR/TTS resources for a given language |
| Returns results organized by service type |
| deduplicate: if True, remove duplicate models (same base name) and keep only the one with most downloads |
| """ |
| all_logs = [] |
|
|
| if not language_code: |
| return None, None, None, None, 0, 0, None, None, 0, 0, "" |
|
|
| lang_info = LANGUAGES.get(language_code) |
| if not lang_info: |
| return None, None, None, None, 0, 0, None, None, 0, 0, "" |
|
|
| language_name = lang_info['name'] |
| iso_639_1 = lang_info['iso_639_1'] |
| iso_639_2 = language_code |
|
|
| all_logs.append(f"=== Searching for {language_name} ({language_code}) ===") |
| all_logs.append(f"Language codes: ISO 639-1={iso_639_1}, ISO 639-2={iso_639_2}") |
|
|
| |
| all_logs.append("\n[Common Voice Dataset]") |
| cv_stats = get_common_voice_stats(iso_639_2, iso_639_1, COMMON_VOICE_DATA) |
| if cv_stats: |
| all_logs.append(f" ✅ Available in Common Voice (locale: {cv_stats['locale']})") |
| all_logs.append(f" Valid hours: {cv_stats['valid_hrs']:.1f}h, Total hours: {cv_stats['total_hrs']:.1f}h") |
| all_logs.append(f" Gender balance: {cv_stats['male_pct']:.1f}% male, {cv_stats['female_pct']:.1f}% female") |
| else: |
| all_logs.append(f" ❌ Not available in Common Voice") |
|
|
| |
| all_logs.append("\n[Azure Speech Services]") |
| azure_asr = fetch_azure_asr_languages() |
| azure_tts = fetch_azure_tts_languages() |
| all_logs.append(f" Fetched {len(azure_asr)} ASR languages and {len(azure_tts)} TTS languages from Azure") |
|
|
| |
| azure_locales = get_azure_locales_for_language(iso_639_1) |
| all_logs.append(f" Matching Azure locales: {azure_locales}") |
|
|
| |
| azure_asr_locales = [loc for loc in azure_locales if loc in azure_asr] |
| azure_asr_available = len(azure_asr_locales) > 0 |
| all_logs.append(f" Azure ASR: {'✅ Supported' if azure_asr_available else '❌ Not supported'} ({len(azure_asr_locales)} locales)") |
|
|
| |
| azure_tts_locales = [loc for loc in azure_locales if loc in azure_tts] |
| azure_tts_available = len(azure_tts_locales) > 0 |
| azure_total_voices = sum(azure_tts[loc]['voice_count'] for loc in azure_tts_locales) |
| all_logs.append(f" Azure TTS: {'✅ Supported' if azure_tts_available else '❌ Not supported'} ({len(azure_tts_locales)} locales, {azure_total_voices} voices)") |
|
|
| |
| all_logs.append("\n[Google Cloud Speech]") |
| google_stt = fetch_google_stt_languages() |
| google_tts = fetch_google_tts_languages() |
| all_logs.append(f" Fetched {len(google_stt)} STT languages and {len(google_tts)} TTS languages from Google Cloud") |
|
|
| |
| google_locales = get_google_locales_for_language(iso_639_1) |
| all_logs.append(f" Matching Google Cloud locales: {google_locales}") |
|
|
| |
| google_stt_locales = [loc for loc in google_locales if loc in google_stt] |
| google_stt_available = len(google_stt_locales) > 0 |
| all_logs.append(f" Google STT: {'✅ Supported' if google_stt_available else '❌ Not supported'} ({len(google_stt_locales)} locales)") |
|
|
| |
| google_tts_locales = [loc for loc in google_locales if loc in google_tts] |
| google_tts_available = len(google_tts_locales) > 0 |
| google_total_voices = sum(google_tts[loc]['voice_count'] for loc in google_tts_locales) |
| all_logs.append(f" Google TTS: {'✅ Supported' if google_tts_available else '❌ Not supported'} ({len(google_tts_locales)} locales, {google_total_voices} voices)") |
|
|
| |
| all_logs.append("\n[AWS (Transcribe + Polly)]") |
| aws_transcribe = fetch_aws_transcribe_languages() |
| aws_polly = fetch_aws_polly_languages() |
| all_logs.append(f" Fetched {len(aws_transcribe)} Transcribe languages and {len(aws_polly)} Polly languages from AWS") |
|
|
| |
| aws_locales = get_aws_locales_for_language(iso_639_1) |
| all_logs.append(f" Matching AWS locales: {aws_locales}") |
|
|
| |
| aws_transcribe_locales = [loc for loc in aws_locales if loc in aws_transcribe] |
| aws_transcribe_available = len(aws_transcribe_locales) > 0 |
| all_logs.append(f" AWS Transcribe: {'✅ Supported' if aws_transcribe_available else '❌ Not supported'} ({len(aws_transcribe_locales)} locales)") |
|
|
| |
| aws_polly_locales = [loc for loc in aws_locales if loc in aws_polly] |
| aws_polly_available = len(aws_polly_locales) > 0 |
| aws_total_voices = sum(aws_polly[loc]['voice_count'] for loc in aws_polly_locales) |
| all_logs.append(f" AWS Polly: {'✅ Supported' if aws_polly_available else '❌ Not supported'} ({len(aws_polly_locales)} locales, {aws_total_voices} voices)") |
|
|
| |
| commercial_rows = [] |
|
|
| |
| if azure_asr_available: |
| azure_asr_text = f"✅ {len(azure_asr_locales)} locale(s)" |
| else: |
| azure_asr_text = "❌ N/A" |
|
|
| if azure_tts_available: |
| azure_tts_text = f"✅ {len(azure_tts_locales)} locale(s), {azure_total_voices} voice(s)" |
| else: |
| azure_tts_text = "❌ N/A" |
|
|
| commercial_rows.append({ |
| "Service": "Azure Speech", |
| "ASR": azure_asr_text, |
| "TTS": azure_tts_text, |
| }) |
|
|
| |
| if google_stt_available: |
| google_stt_text = f"✅ {len(google_stt_locales)} locale(s)" |
| else: |
| google_stt_text = "❌ N/A" |
|
|
| if google_tts_available: |
| google_tts_text = f"✅ {len(google_tts_locales)} locale(s), {google_total_voices} voice(s)" |
| else: |
| google_tts_text = "❌ N/A" |
|
|
| commercial_rows.append({ |
| "Service": "Google Cloud Speech", |
| "ASR": google_stt_text, |
| "TTS": google_tts_text, |
| }) |
|
|
| |
| if aws_transcribe_available: |
| aws_transcribe_text = f"✅ {len(aws_transcribe_locales)} locale(s)" |
| else: |
| aws_transcribe_text = "❌ N/A" |
|
|
| if aws_polly_available: |
| aws_polly_text = f"✅ {len(aws_polly_locales)} locale(s), {aws_total_voices} voice(s)" |
| else: |
| aws_polly_text = "❌ N/A" |
|
|
| commercial_rows.append({ |
| "Service": "AWS (Transcribe + Polly)", |
| "ASR": aws_transcribe_text, |
| "TTS": aws_polly_text, |
| }) |
|
|
| |
| all_logs.append("\n[ElevenLabs]") |
| elevenlabs_v2_supported = check_elevenlabs_multilingual_v2_support(iso_639_1) |
| all_logs.append(f" Multilingual v2: {'✅ Supported' if elevenlabs_v2_supported else '❌ Not supported'}") |
|
|
| if elevenlabs_v2_supported: |
| elevenlabs_v2_tts_text = "✅ Supported" |
| else: |
| elevenlabs_v2_tts_text = "❌ N/A" |
|
|
| commercial_rows.append({ |
| "Service": "ElevenLabs Multilingual v2", |
| "ASR": "N/A", |
| "TTS": elevenlabs_v2_tts_text, |
| }) |
|
|
| |
| elevenlabs_v3_supported = check_elevenlabs_turbo_v3_support(iso_639_2) |
| all_logs.append(f" Turbo v3: {'✅ Supported' if elevenlabs_v3_supported else '❌ Not supported'}") |
|
|
| if elevenlabs_v3_supported: |
| elevenlabs_v3_tts_text = "✅ Supported" |
| else: |
| elevenlabs_v3_tts_text = "❌ N/A" |
|
|
| commercial_rows.append({ |
| "Service": "ElevenLabs Turbo v3", |
| "ASR": "N/A", |
| "TTS": elevenlabs_v3_tts_text, |
| }) |
|
|
| commercial_df = pd.DataFrame(commercial_rows) |
|
|
| |
| all_logs.append("\n[HuggingFace Models]") |
|
|
| asr_models, asr_model_logs = search_huggingface_models(iso_639_1, iso_639_2, 'automatic-speech-recognition', max_results=100, max_pages=5) |
| all_logs.extend([f" [ASR] {log}" for log in asr_model_logs]) |
|
|
| tts_models, tts_model_logs = search_huggingface_models(iso_639_1, iso_639_2, 'text-to-speech', max_results=100, max_pages=5) |
| all_logs.extend([f" [TTS] {log}" for log in tts_model_logs]) |
|
|
| |
| if deduplicate: |
| all_logs.append(f"\n[Deduplication]") |
| asr_before = len(asr_models) |
| asr_models = deduplicate_models(asr_models) |
| all_logs.append(f" ASR models: {asr_before} → {len(asr_models)} (removed {asr_before - len(asr_models)} duplicates)") |
|
|
| tts_before = len(tts_models) |
| tts_models = deduplicate_models(tts_models) |
| all_logs.append(f" TTS models: {tts_before} → {len(tts_models)} (removed {tts_before - len(tts_models)} duplicates)") |
| else: |
| |
| for model in asr_models: |
| model['duplicates'] = 1 |
| for model in tts_models: |
| model['duplicates'] = 1 |
|
|
| |
| asr_models_data = [] |
| for model in asr_models: |
| asr_models_data.append({ |
| "Model Name": f"[{model['name']}]({model['url']})", |
| "Downloads": model['downloads'], |
| "Likes": model['likes'], |
| "Size": model.get('size', ''), |
| "Duplicates": model.get('duplicates', 1) |
| }) |
|
|
| if asr_models_data: |
| asr_models_df = pd.DataFrame(asr_models_data) |
| else: |
| |
| asr_models_df = pd.DataFrame(columns=["Model Name", "Downloads", "Likes", "Size", "Duplicates"]) |
|
|
| |
| tts_models_data = [] |
| for model in tts_models: |
| tts_models_data.append({ |
| "Model Name": f"[{model['name']}]({model['url']})", |
| "Downloads": model['downloads'], |
| "Likes": model['likes'], |
| "Size": model.get('size', ''), |
| "Duplicates": model.get('duplicates', 1) |
| }) |
|
|
| if tts_models_data: |
| tts_models_df = pd.DataFrame(tts_models_data) |
| else: |
| |
| tts_models_df = pd.DataFrame(columns=["Model Name", "Downloads", "Likes", "Size", "Duplicates"]) |
|
|
| |
| all_logs.append("\n[HuggingFace Datasets]") |
| asr_datasets, asr_dataset_logs = search_huggingface_datasets(iso_639_1, iso_639_2, 'automatic-speech-recognition', max_results=100, max_pages=5) |
| all_logs.extend([f" [ASR] {log}" for log in asr_dataset_logs]) |
|
|
| tts_datasets, tts_dataset_logs = search_huggingface_datasets(iso_639_1, iso_639_2, 'text-to-speech', max_results=100, max_pages=5) |
| all_logs.extend([f" [TTS] {log}" for log in tts_dataset_logs]) |
|
|
| |
| asr_datasets_data = [] |
| for dataset in asr_datasets: |
| asr_datasets_data.append({ |
| "Dataset Name": f"[{dataset['name']}]({dataset['url']})", |
| "Downloads": dataset['downloads'], |
| "Likes": dataset['likes'], |
| "Size": dataset.get('size', '') |
| }) |
|
|
| if asr_datasets_data: |
| asr_datasets_df = pd.DataFrame(asr_datasets_data) |
| else: |
| |
| asr_datasets_df = pd.DataFrame(columns=["Dataset Name", "Downloads", "Likes", "Size"]) |
|
|
| |
| tts_datasets_data = [] |
| for dataset in tts_datasets: |
| tts_datasets_data.append({ |
| "Dataset Name": f"[{dataset['name']}]({dataset['url']})", |
| "Downloads": dataset['downloads'], |
| "Likes": dataset['likes'], |
| "Size": dataset.get('size', '') |
| }) |
|
|
| if tts_datasets_data: |
| tts_datasets_df = pd.DataFrame(tts_datasets_data) |
| else: |
| |
| tts_datasets_df = pd.DataFrame(columns=["Dataset Name", "Downloads", "Likes", "Size"]) |
|
|
| |
| log_text = "\n".join(all_logs) |
|
|
| |
| return cv_stats, commercial_df, asr_models_df, tts_models_df, len(asr_models), len(tts_models), asr_datasets_df, tts_datasets_df, len(asr_datasets), len(tts_datasets), log_text |
|
|
| |
# --- Startup: load all data sources into the module-level globals -----------
print("Initializing Speech Resource Finder...")
APP_CONTENT = load_app_content(APP_CONTENT_FILE)
LANGUAGES = load_language_list(LANGUAGE_CODES_FILE)
LANGUAGE_TAXONOMY = load_language_taxonomy(LANGUAGE_TAXONOMY_URL)  # fetched over HTTP
COMMON_VOICE_DATA = load_common_voice_data(COMMON_VOICE_DATA_FILE)

# Dropdown entries are "code: Name", sorted by display name; on_search()
# later recovers the code by splitting on the first ":".
language_choices = [f"{code}: {info['name']}" for code, info in sorted(LANGUAGES.items(), key=lambda x: x[1]['name'])]
print(f"Created dropdown with {len(language_choices)} language options")
|
|
# --- Gradio UI definition ----------------------------------------------------
with gr.Blocks(title=APP_CONTENT["title"]) as demo:
    gr.Markdown(f"# 🌐 {APP_CONTENT['title']}")
    gr.Markdown(APP_CONTENT["description"])

    # Top row: language selector (left) and taxonomy/metadata card (right).
    with gr.Row(equal_height=True):
        with gr.Column(scale=70):
            language_dropdown = gr.Dropdown(
                choices=language_choices,
                label="Select Language",
                info="Type to search for a language",
                allow_custom_value=False,
                filterable=True,
            )
        with gr.Column(scale=30):
            # Placeholder card; replaced by get_language_metadata_html() output.
            language_metadata = gr.HTML(
                """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
<p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
</div>""",
                elem_id="language-metadata"
            )

    # Second row: commercial-services table (left), Common Voice card (right).
    with gr.Row():
        with gr.Column(scale=70):
            gr.Markdown("## Commercial Services")
            commercial_table = gr.Dataframe(
                headers=["Service", "ASR", "TTS"],
                interactive=False,
                wrap=True,
            )

        with gr.Column(scale=30):
            gr.Markdown("## Common Voice")
            cv_info = gr.HTML(
                """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa;'>
<p style='margin: 0; color: #666; font-size: 13px;'>Select a language</p>
</div>""",
                elem_id="cv-info"
            )

    gr.Markdown("## HuggingFace Models")

    with gr.Row():
        deduplicate_checkbox = gr.Checkbox(
            label="Deduplicate models",
            value=True,
            info="Keep only the model with most downloads for each base name"
        )

    # Model result tables; "markdown" datatype renders the [name](url) links.
    with gr.Tabs():
        with gr.Tab(label="ASR Models") as asr_tab:
            asr_count_label = gr.Markdown("*Loading...*")
            asr_models_table = gr.Dataframe(
                headers=["Model Name", "Downloads", "Likes", "Size", "Duplicates"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str", "number"],
            )

        with gr.Tab(label="TTS Models") as tts_tab:
            tts_count_label = gr.Markdown("*Loading...*")
            tts_models_table = gr.Dataframe(
                headers=["Model Name", "Downloads", "Likes", "Size", "Duplicates"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str", "number"],
            )

    gr.Markdown("## HuggingFace Datasets")

    # Dataset result tables (no Duplicates column — dedup applies to models only).
    with gr.Tabs():
        with gr.Tab(label="ASR Datasets") as asr_datasets_tab:
            asr_datasets_count_label = gr.Markdown("*Loading...*")
            asr_datasets_table = gr.Dataframe(
                headers=["Dataset Name", "Downloads", "Likes", "Size"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str"],
            )

        with gr.Tab(label="TTS Datasets") as tts_datasets_tab:
            tts_datasets_count_label = gr.Markdown("*Loading...*")
            tts_datasets_table = gr.Dataframe(
                headers=["Dataset Name", "Downloads", "Likes", "Size"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str"],
            )

    # Collapsible raw search log produced by search_language_resources().
    with gr.Accordion("Logs", open=False):
        log_textbox = gr.Textbox(
            show_label=False,
            lines=15,
            max_lines=30,
            interactive=False,
            placeholder="Logs will appear here...",
            autoscroll=True,
        )

    with gr.Accordion("About this tool", open=False):
        gr.Markdown(APP_CONTENT["full_content"])

    def on_search(language_selection, deduplicate):
        """Event handler for both the dropdown and the dedup checkbox.

        Returns exactly 12 values, positionally matching the ``outputs=``
        lists of the two ``.change()`` wirings below: metadata HTML,
        Common Voice HTML, commercial table, then (label, table) pairs for
        ASR models, TTS models, ASR datasets, TTS datasets, and the log text.
        """
        if not language_selection:
            # Nothing selected: reset every output to its placeholder state.
            cv_default_html = """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa;'>
<p style='margin: 0; color: #666; font-size: 13px;'>Select a language</p>
</div>"""
            return get_default_metadata_html(), cv_default_html, None, "", None, "", None, "", None, "", None, ""

        # Dropdown values are "code: Name" — take the code before the first ":".
        language_code = language_selection.split(":")[0].strip()

        language_name = LANGUAGES.get(language_code, {}).get("name", "")
        iso_639_1 = LANGUAGES.get(language_code, {}).get("iso_639_1", "")

        # Taxonomy / resource-classification card.
        metadata_html = get_language_metadata_html(language_code, language_name, iso_639_1, LANGUAGE_TAXONOMY)

        cv_stats, commercial_df, asr_models_df, tts_models_df, asr_models_count, tts_models_count, asr_datasets_df, tts_datasets_df, asr_datasets_count, tts_datasets_count, logs = search_language_resources(language_code, deduplicate=deduplicate)

        # Common Voice card: green stats panel when available, grey fallback otherwise.
        if cv_stats:
            cv_info_html = f"""<div style='padding: 15px; border: 2px solid #4caf50; border-radius: 4px; background-color: #ffffff;'>
<div style='margin-bottom: 12px;'>
<span style='font-size: 18px;'>✅</span>
<span style='font-weight: bold; color: #2e7d32; font-size: 14px; margin-left: 4px;'>Available</span>
</div>
<table style='width: 100%; border-collapse: collapse; font-size: 13px;'>
<tr>
<td style='padding: 3px 8px 3px 0; color: #666; width: 45%;'>Locale</td>
<td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['locale']}</td>
</tr>
<tr>
<td style='padding: 3px 8px 3px 0; color: #666;'>Valid Hours</td>
<td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['valid_hrs']:.1f}h</td>
</tr>
<tr>
<td style='padding: 3px 8px 3px 0; color: #666;'>Total Hours</td>
<td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['total_hrs']:.1f}h</td>
</tr>
<tr>
<td style='padding: 3px 8px 3px 0; color: #666;'>Contributors</td>
<td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['users_formatted']}</td>
</tr>
<tr>
<td style='padding: 3px 8px 3px 0; color: #666;'>Gender</td>
<td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['male_pct']:.0f}% M / {cv_stats['female_pct']:.0f}% F</td>
</tr>
<tr>
<td style='padding: 3px 8px 3px 0; color: #666;'>Version</td>
<td style='padding: 3px 0; color: #000; font-weight: 500;'>{COMMON_VOICE_VERSION}</td>
</tr>
</table>
</div>"""
        else:
            cv_info_html = """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa;'>
<div style='margin-bottom: 8px;'>
<span style='font-size: 18px;'>❌</span>
<span style='font-weight: bold; color: #666; font-size: 14px; margin-left: 4px;'>Not Available</span>
</div>
<p style='margin: 0; color: #999; font-size: 12px;'>Not in Common Voice dataset</p>
</div>"""

        # Count labels shown above each result table.
        asr_models_label = f"**Found {asr_models_count} ASR model(s)**"
        tts_models_label = f"**Found {tts_models_count} TTS model(s)**"
        asr_datasets_label = f"**Found {asr_datasets_count} ASR dataset(s)**"
        tts_datasets_label = f"**Found {tts_datasets_count} TTS dataset(s)**"

        return metadata_html, cv_info_html, commercial_df, asr_models_label, asr_models_df, tts_models_label, tts_models_df, asr_datasets_label, asr_datasets_df, tts_datasets_label, tts_datasets_df, logs

    # Both the language dropdown and the dedup checkbox trigger the same
    # handler with the same 12-component output list (order matters).
    language_dropdown.change(
        fn=on_search,
        inputs=[language_dropdown, deduplicate_checkbox],
        outputs=[language_metadata, cv_info, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
    )

    deduplicate_checkbox.change(
        fn=on_search,
        inputs=[language_dropdown, deduplicate_checkbox],
        outputs=[language_metadata, cv_info, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
    )
|
|
if __name__ == "__main__":
    # 0.0.0.0 binds all interfaces (typical for containerized deployment);
    # 7860 is Gradio's conventional default port.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)