| | import json |
| | import os |
| | import shutil |
| |
|
| | import requests |
| |
|
# URL template for a release's statistics JSON; the placeholder is filled with
# the "release" field of a VERSIONS entry.
RELEASE_STATS_URL = "https://commonvoice.mozilla.org/dist/releases/{}.json"

# One entry per release processed by main():
#   semver  - stored under the "version" key of the downloaded stats,
#   name    - output directory name (also the name of the copied script),
#   release - identifier substituted into RELEASE_STATS_URL.
VERSIONS = [
    {"semver": semver, "name": name, "release": release}
    for semver, name, release in (
        ("1.0.0", "common_voice_1_0", "cv-corpus-1"),
        ("2.0.0", "common_voice_2_0", "cv-corpus-2"),
        ("3.0.0", "common_voice_3_0", "cv-corpus-3"),
        ("4.0.0", "common_voice_4_0", "cv-corpus-4-2019-12-10"),
        ("5.0.0", "common_voice_5_0", "cv-corpus-5-2020-06-22"),
        ("5.1.0", "common_voice_5_1", "cv-corpus-5.1-2020-06-22"),
        ("6.0.0", "common_voice_6_0", "cv-corpus-6.0-2020-12-11"),
        ("6.1.0", "common_voice_6_1", "cv-corpus-6.1-2020-12-11"),
        ("7.0.0", "common_voice_7_0", "cv-corpus-7.0-2021-07-21"),
        ("8.0.0", "common_voice_8_0", "cv-corpus-8.0-2022-01-19"),
        ("9.0.0", "common_voice_9_0", "cv-corpus-9.0-2022-04-27"),
        ("10.0.0", "common_voice_10_0", "cv-corpus-10.0-2022-07-04"),
    )
]
| |
|
| |
|
# Ascending (exclusive upper bound, label) pairs used by num_to_size().
# The labels appear to follow the Hugging Face ``size_categories`` tag
# vocabulary (assumed from their spelling; confirm against the README template).
_SIZE_CATEGORIES = (
    (1_000, "n<1K"),
    (10_000, "1K<n<10K"),
    (100_000, "10K<n<100K"),
    (1_000_000, "100K<n<1M"),
    (10_000_000, "1M<n<10M"),
    (100_000_000, "10M<n<100M"),
    (1_000_000_000, "100M<n<1B"),
    (10_000_000_000, "1B<n<10B"),
    (100_000_000_000, "10B<n<100B"),
    (1_000_000_000_000, "100B<n<1T"),
)


def num_to_size(num: int) -> str:
    """Return the size-bucket label for a count.

    Each bucket includes its lower bound and excludes its upper bound, so
    ``1000`` maps to ``"1K<n<10K"`` and ``999`` maps to ``"n<1K"``.

    Args:
        num: The count to classify (e.g. a number of clips).

    Returns:
        The label of the first bucket whose upper bound exceeds ``num``, or
        ``"n>1T"`` when ``num`` is at least one trillion. A label is always
        returned; the function never falls through to ``None``.
    """
    for upper_bound, label in _SIZE_CATEGORIES:
        if num < upper_bound:
            return label
    # Larger than every bounded bucket.
    return "n>1T"
| |
|
| |
|
def get_language_names() -> dict:
    """Load the locale-code to language-name mapping from ``languages.ftl``.

    The file is read from the current working directory as UTF-8 and is
    expected to hold one ``code = name`` entry per line. Blank lines and
    lines starting with ``#`` are skipped. Only the first ``" = "`` on a line
    separates code from name, so a name may itself contain ``" = "``.

    Returns:
        A dict mapping each locale code to its language name.

    Raises:
        ValueError: If a non-blank, non-comment line has no ``" = "``
            separator.
        OSError: If ``languages.ftl`` cannot be opened.
    """
    languages = {}
    # Explicit encoding: language names may contain non-ASCII characters and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open("languages.ftl", encoding="utf-8") as fin:
        for line_no, raw_line in enumerate(fin, start=1):
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            lang_code, separator, lang_name = line.partition(" = ")
            if not separator:
                raise ValueError(
                    f"languages.ftl line {line_no}: expected 'code = name', got {line!r}"
                )
            languages[lang_code] = lang_name

    return languages
| |
|
| |
|
# Seconds to wait for the stats server before giving up; without a timeout
# requests.get() can block indefinitely.
_REQUEST_TIMEOUT = 30


def _fetch_release_stats(version):
    """Download and parse the statistics JSON for one ``VERSIONS`` entry."""
    stats_url = RELEASE_STATS_URL.format(version["release"])
    response = requests.get(stats_url, timeout=_REQUEST_TIMEOUT)
    # Fail on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    release_stats = json.loads(response.text)
    release_stats["version"] = version["semver"]
    return release_stats


def _render_readme(template, release_stats, language_names):
    """Return ``template`` with every ``{{...}}`` placeholder substituted."""
    locales = sorted(release_stats["locales"].keys())
    languages = [f"- {loc}" for loc in locales]
    sizes = [
        f" {loc}:\n - {num_to_size(release_stats['locales'][loc]['clips'])}"
        for loc in locales
    ]
    languages_human = sorted([language_names[loc] for loc in locales])

    # Applied in this fixed order, one placeholder at a time.
    substitutions = [
        ("{{NAME}}", release_stats["name"]),
        ("{{LANGUAGES}}", "\n".join(languages)),
        ("{{SIZES}}", "\n".join(sizes)),
        ("{{LANGUAGES_HUMAN}}", ", ".join(languages_human)),
        ("{{TOTAL_HRS}}", str(release_stats["totalHrs"])),
        ("{{VAL_HRS}}", str(release_stats["totalValidHrs"])),
        ("{{NUM_LANGS}}", str(len(locales))),
    ]
    readme = template
    for placeholder, value in substitutions:
        readme = readme.replace(placeholder, value)
    return readme


def main():
    """Generate one dataset directory per entry in ``VERSIONS``.

    For each version this downloads the release statistics and writes, into a
    directory named after the version:

    * ``release_stats.py`` - the statistics as a ``STATS = ...`` assignment,
    * ``README.md`` - ``README.template.md`` with its placeholders filled in,
    * ``languages.py`` - the language-name mapping as ``LANGUAGES = ...``,
    * ``<name>.py`` - a copy of ``dataset_script.py``.

    ``languages.ftl``, ``README.template.md`` and ``dataset_script.py`` are
    read from the current working directory.

    Raises:
        requests.RequestException: If a download fails, times out or returns
            an HTTP error status.
        KeyError: If the statistics lack an expected field or list a locale
            that is missing from ``languages.ftl``.
    """
    language_names = get_language_names()

    # The template is the same for every version, so read it once up front.
    with open("README.template.md", "r", encoding="utf-8") as fin:
        readme_template = fin.read()

    for version in VERSIONS:
        release_stats = _fetch_release_stats(version)

        dataset_path = version["name"]
        os.makedirs(dataset_path, exist_ok=True)
        with open(f"{dataset_path}/release_stats.py", "w", encoding="utf-8") as fout:
            fout.write("STATS = " + str(release_stats))

        readme = _render_readme(readme_template, release_stats, language_names)

        with open(f"{dataset_path}/README.md", "w", encoding="utf-8") as fout:
            fout.write(readme)
        with open(f"{dataset_path}/languages.py", "w", encoding="utf-8") as fout:
            fout.write("LANGUAGES = " + str(language_names))

        shutil.copy("dataset_script.py", f"{dataset_path}/{dataset_path}.py")
| |
|
| |
|
# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
| |
|