Hugging Face Space — build currently failing. A likely cause: `ModelFilter` has been removed from recent `huggingface_hub` releases, so the `from huggingface_hub import ModelFilter` import no longer resolves at build time.
from collections import Counter
from itertools import chain

import gradio as gr
import pandas as pd
from huggingface_hub import ModelFilter
from huggingface_hub import list_datasets
from huggingface_hub import list_models
from toolz import concat
from toolz import frequencies
from tqdm.auto import tqdm
# -- Grab datasets with the `token-classification` task --------------------
# Fetch every Hub dataset tagged for token classification, then keep only
# the ones that declare at least one `language:<code>` tag.
# (The bare expressions `datasets[0]` / `with_language[0]` from the original
# notebook were no-ops in a script and have been removed.)
datasets = list(list_datasets(filter="task_categories:token-classification"))

with_language = [
    dataset
    for dataset in datasets
    if "language" in [t.split(":")[0] for t in dataset.tags]
]
def get_languages(dataset):
    """Return the language codes declared by *dataset*'s `language:<code>` tags.

    Tags that are not exactly two `:`-separated parts (e.g. a bare
    `language` tag) are ignored.
    """
    split_tags = (tag.split(":") for tag in dataset.tags)
    return [parts[1] for parts in split_tags if parts[0] == "language" and len(parts) == 2]
# Count how many token-classification datasets exist per language code,
# ordered most-datasets first. `Counter.most_common()` already yields
# (key, count) pairs sorted by descending count, so the stdlib replaces
# the toolz frequencies/concat + manual sort combination.
freqs = dict(
    Counter(chain.from_iterable(get_languages(dataset) for dataset in with_language)).most_common()
)
# For each language (most token-classification datasets first), look up how
# many token-classification models already exist on the Hub. Languages that
# have datasets but no models are candidates for new model contributions.
#
# NOTE(review): `ModelFilter` was deprecated and then removed from recent
# `huggingface_hub` releases (a likely cause of this Space's build error);
# `list_models` takes `language=` / `task=` directly as keyword arguments.
no_model = []
for lang in tqdm(freqs):
    token_clf_models = list(
        list_models(language=lang, task="token-classification")
    )
    # Only languages with zero token-classification models are candidates;
    # skip the two extra (paginated, slow) API calls for everything else.
    if not token_clf_models:
        all_models_for_lang = list(list_models(language=lang))
        all_datasets_for_lang = list(list_datasets(filter=f"language:{lang}"))
        no_model.append(
            {
                "language": lang,
                "datasets_for_token_classification": freqs[lang],
                "datasets": len(all_datasets_for_lang),
                "token_classification_models": len(token_clf_models),
                "all_models": len(all_models_for_lang),
            }
        )
# Rank candidate languages: most token-classification datasets first, then
# most datasets overall, then fewest existing models (token-classification
# and overall) — i.e. highest demand, lowest existing supply, at the top.
candidate_ranking = [
    "datasets_for_token_classification",
    "datasets",
    "token_classification_models",
    "all_models",
]
df = pd.DataFrame(no_model).sort_values(
    by=candidate_ranking,
    ascending=[False, False, True, True],
)
def report_summary(frame=None):
    """Build a Markdown report for the top 30 candidate languages.

    Args:
        frame: optional DataFrame with columns ``language``,
            ``datasets_for_token_classification``, ``datasets``,
            ``token_classification_models`` and ``all_models``; defaults
            to the module-level ``df`` so existing callers are unchanged.

    Returns:
        A Markdown string with one summary section per language.
    """
    frame = df if frame is None else frame
    parts = []
    for row in frame.head(30).itertuples():
        # Named-attribute access is clearer and safer than positional
        # row[1]..row[5] indexing, which silently breaks on column reorder.
        language = row.language
        parts.append(f"# Summary for language: {language}\n")
        parts.append(
            f"This language has {row.datasets_for_token_classification} token classification datasets, "
            f"{row.datasets} datasets overall, {row.token_classification_models} token classification models, "
            f"and {row.all_models} models overall.\n"
        )
        parts.append(
            f"- [Datasets for token classification task for {language}](https://huggingface.co/datasets?task_categories=task_categories:token-classification&language=language:{language})\n"
        )
        # BUG FIX: the models listing is filtered by `pipeline_tag` and a
        # plain language code — the dataset-style
        # `task_categories=...&language=language:...` params the original
        # used do not filter https://huggingface.co/models at all.
        parts.append(
            f"- [Token classification models for {language}](https://huggingface.co/models?pipeline_tag=token-classification&language={language})\n"
        )
        parts.append(
            f"- [All models for {language}](https://huggingface.co/models?language={language}&sort=trending)\n"
        )
        parts.append("<br>\n")
    # Single join instead of repeated string concatenation in the loop.
    return "".join(parts)
# Assemble the Gradio UI: the full candidate table, followed by a Markdown
# report of the 30 strongest candidates.
with gr.Blocks() as demo:
    gr.DataFrame(df)
    gr.Markdown("# Top 30 candidates")
    gr.Markdown(
        # Typo fix in the user-facing text: "Candiates" -> "Candidates".
        "Candidates generated by sorting by most token classification datasets, then least token classification models, then least models overall"
    )
    gr.Markdown(report_summary())

demo.launch()