|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from datetime import date |
|
|
import os |
|
|
import sys |
|
|
from montreal_forced_aligner.utils import get_mfa_version |
|
|
|
|
|
|
|
|
|
|
|
# -- Project information ------------------------------------------------------
project = "mfa model"
author = "Montreal Corpus Tools"
# The end year is taken from the build date.
copyright = f"2018-{date.today().year}, Montreal Corpus Tools"

# Full version string exactly as returned by get_mfa_version().
release = get_mfa_version()
# Short form: only the first two dot-separated components of that string.
version = ".".join(get_mfa_version().split(".")[:2])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Put the directory that holds this conf.py at the front of the import path.
# NOTE(review): presumably so the "ipa_charts" extension listed below can be
# imported from next to this file -- confirm.
sys.path.insert(
    0,
    os.path.dirname(os.path.abspath(__file__)),
)

# Sphinx extensions to load.
extensions = [
    "sphinx_needs",
    "sphinx_design",
    "sphinx.ext.viewcode",
    "sphinx.ext.extlinks",
    "myst_parser",
    "sphinx.ext.autosectionlabel",
    "ipa_charts",
]

# MyST (Markdown) parser settings.
myst_enable_extensions = ["colon_fence"]
myst_heading_anchors = 3

panels_add_bootstrap_css = False
autosectionlabel_prefix_document = True
needs_include_needs = True
|
|
|
|
|
# Need types for sphinx-needs. Every entry supplies the directive name, the
# display title, the ID prefix and a colour; all of them use the "node" style.
needs_types = [
    {
        "directive": directive,
        "title": title,
        "prefix": prefix,
        "color": color,
        "style": "node",
    }
    for directive, title, prefix, color in (
        ("acoustic", "Acoustic model", "AM_", "#BFD8D2"),
        ("corpus", "Corpus", "", "#FEDCD2"),
        ("g2p", "G2P model", "G2P_", "#FEDCD2"),
        ("language_model", "Language model", "LM_", "#DF744A"),
        ("ivector", "Ivector Extractor", "IE_", "#DCB239"),
        ("tokenizer", "Tokenizer", "T_", "#DCB239"),
        ("dictionary", "Dictionary", "D_", "#DCB239"),
    )
]
|
|
|
|
|
# Folder holding the sphinx-needs templates.
needs_template_folder = "_templates/needs_templates"

# Two layouts: "not_mfa" shows content only, "mfa" adds a right-hand side
# column containing the stacked MFA logo.
needs_layouts = {
    "not_mfa": {"grid": "content"},
    "mfa": {
        "grid": "content_side_right",
        "layout": {
            "side": [
                '<<image("https://montreal-forced-aligner.readthedocs.io/en/latest/_static/logo_stacked_light.svg")>>',
            ],
        },
    },
}
|
|
|
|
|
# How links between needs are rendered.
needs_show_link_title = True
needs_show_link_type = True
needs_role_need_template = "{title}"

# Extra per-need options beyond the sphinx-needs built-ins.
needs_extra_options = [
    "name",
    "language",
    "dialect",
    "architecture",
    "phoneset",
    "license",
]

# Need tables: DataTables styling and the columns shown.
needs_table_style = "datatables"
needs_table_columns = "ID;name;language;dialect;phoneset;tags"
|
|
# Tag definitions (name plus description) for sphinx-needs.
needs_tags = [
    {"name": tag_name, "description": tag_description}
    for tag_name, tag_description in (
        ("MFA", "Maintained by Montreal Forced Aligner"),
        ("PROSODYLAB", "Resources developed by Prosodylab"),
        ("PINYIN", "Pinyin phone set"),
        ("CV", "Maintained by VoxCommunis"),
        ("IPA", "Based on the International Phonetic Alphabet"),
        ("Common Voice", "Corpora in Mozilla's Common Voice collection"),
        ("Google", "Corpora collected and distributed by Google"),
        ("Microsoft", "Corpora collected and distributed by Microsoft"),
        ("GlobalPhone", "Corpora in GlobalPhone collection"),
        ("MagicData", "Corpora in MagicData collection"),
        ("ICE", "Corpora in ICE collection"),
        ("Non-native", "Corpora with non-native speakers"),
        ("VoxPopuli", "Corpora in Vox Populi collection"),
        ("MediaSpeech", "Corpora in MediaSpeech collection"),
        ("Multilingual Librispeech", "Corpora in Multilingual Librispeech collection"),
        ("M-AILABS", "Corpora in M-AILABS's collections"),
        ("Multilingual TEDx", "Corpora in the Multilingual TEDx collection"),
    )
]

# Languages that each get a "<name> language" tag.
current_languages = [
    "Abkhaz", "Armenian", "Arabic", "Bashkir", "Basque", "Belarusian",
    "Bulgarian", "Chuvash", "Croatian", "Czech", "Dutch", "English",
    "French", "Georgian", "German", "Greek", "Guarani", "Hausa", "Hindi",
    "Hungarian", "Indonesian", "Italian", "Japanese", "Kazakh", "Korean",
    "Kurmanji", "Kyrgyz", "Maltese", "Mandarin", "Polish", "Portuguese",
    "Punjabi", "Romanian", "Russian", "Sorbian", "Spanish", "Swahili",
    "Swedish", "Tamil", "Tatar", "Thai", "Turkish", "Ukrainian", "Urdu",
    "Uyghur", "Uzbek", "Vietnamese",
]

needs_tags.extend(
    {"name": language_name, "description": f"{language_name} language"}
    for language_name in current_languages
)

# Catch-all tag, appended after the per-language tags.
needs_tags.append({"name": "Multilingual", "description": "Multiple languages"})
|
|
|
|
|
# Need IDs are mandatory and limited to letters, digits, spaces and ". ( ) : _".
needs_id_required = True
needs_id_regex = r"[A-Za-z0-9 .():_]+"
needs_role_need_max_title_length = 0

# Template directories, relative to this file.
templates_path = ["_templates"]
|
|
|
|
|
|
|
|
# Named external references: key -> (display title, URL).
xref_links = {
    "wikipedia": ("Wikipedia", "https://en.wikipedia.org/wiki/Main_Page"),
    "phoible": ("Phoible", "https://phoible.org/"),
    "xpf": ("XPF", "https://cohenpr-xpf.github.io/XPF/"),
    "nagisa": ("Nagisa", "https://github.com/taishi-i/nagisa"),
    "konlpy": ("KoNLPy", "https://konlpy.org/en/latest/"),
    "spacy_pkuseg": ("spacy-pkuseg", "https://github.com/explosion/spacy-pkuseg/"),
    "num2chinese": (
        "num2chinese.py",
        "https://gist.github.com/gumblex/0d65cad2ba607fd14de7",
    ),
    "hanziconv": ("hanziconv", "https://github.com/berniey/hanziconv"),
    "num2words": ("num2words", "https://github.com/savoirfairelinux/num2words"),
    "thai_word_segmentation": (
        "thai-word-segmentation",
        "https://github.com/sertiscorp/thai-word-segmentation",
    ),
    "mecab_ko": ("Mecab-KO", "https://bitbucket.org/eunjeon/mecab-ko/src/master/"),
    "whisperx": ("WhisperX", "https://github.com/m-bain/whisperX"),
    "nemo": (
        "NeMo Forced Aligner",
        "https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/tools/nemo_forced_aligner.html",
    ),
    "wav2vec2": (
        "Wav2Vec2",
        "https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html",
    ),
}
|
|
|
|
|
# Extra link type between needs: ``:built_with:`` on a need points at the
# needs it was built with. Both directions are labelled "Built with".
needs_extra_links = [
    {
        "option": "built_with",
        "incoming": "Built with",
        "outgoing": "Built with",
        # The key must be exactly "allow_dead_links"; it was previously
        # spelled with a trailing space, so the setting was never recognised.
        "allow_dead_links": True,
    },
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# No source files are excluded from the build.
exclude_patterns = []

# -- Options for HTML output --------------------------------------------------
html_theme = "pydata_sphinx_theme"

# The logo is loaded from the main MFA documentation site; the favicon is a
# local file.
html_logo = "https://montreal-forced-aligner.readthedocs.io/en/latest/_static/logo.svg"
html_favicon = "_static/favicon.ico"
|
|
|
|
|
|
|
|
# Options passed to the HTML theme.
html_theme_options = {
    # Link back to the main MFA documentation.
    "external_links": [
        {"url": "https://montreal-forced-aligner.readthedocs.io/", "name": "MFA docs"},
    ],
    # Icon link to this project's GitHub repository.
    "icon_links": [
        {
            "name": "GitHub",
            "url": "https://github.com/MontrealCorpusTools/mfa-models",
            "icon": "fab fa-github",
        },
    ],
    "logo": {
        "text": "Montreal Forced Aligner",
        "image_dark": "https://montreal-forced-aligner.readthedocs.io/en/latest/_static/logo_dark.svg",
    },
    "analytics": {"google_analytics_id": "G-31RXW9TT1Z"},
    # Navigation / table-of-contents depth settings.
    "show_nav_level": 1,
    "navigation_depth": 4,
    "show_toc_level": 2,
    "collapse_navigation": False,
}
|
|
# Repository coordinates exposed to the HTML templates.
html_context = dict(
    github_user="MontrealCorpusTools",
    github_repo="mfa-models",
    github_version="main",
    doc_path="docs/source",
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Directory with custom static files.
html_static_path = ["_static"]

# Stylesheets: two remote ones (Font Awesome, the shared MFA stylesheet)
# followed by two local ones.
html_css_files = [
    "https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/fontawesome.min.css",
    "https://montreal-forced-aligner.readthedocs.io/en/latest/_static/css/mfa.css",
    "css/style.css",
    "css/datatables.css",
]

html_js_files = ["main.js"]

# The same sidebar components are used on every page.
html_sidebars = {
    "**": [
        "search-field.html",
        "sidebar-nav-bs.html",
        "sidebar-ethical-ads.html",
    ],
}
|
|
# reStructuredText prepended to every source file. It declares four custom
# inline roles, each mapped to one or more CSS classes. The ``:class:`` option
# must sit directly under its ``.. role::`` line and be indented, otherwise it
# is not parsed as an option of that role.
rst_prolog = """
.. role:: manner
   :class: manner

.. role:: submanner
   :class: submanner

.. role:: lexical_set
   :class: lexical-set

.. role:: ipa_inline
   :class: ipa-inline ipa-highlight
"""
|
|
from sphinx_needs.api.configuration import add_dynamic_function |
|
|
|
|
|
|
|
|
# License name -> URL of the license text or description.
license_links = {
    "CC-0": "https://creativecommons.org/publicdomain/zero/1.0/",
    "CC BY 4.0": "https://creativecommons.org/licenses/by/4.0/",
    "CC BY-NC-SA 4.0": "https://creativecommons.org/licenses/by-nc-sa/4.0/",
    "CC BY-SA 4.0": "https://creativecommons.org/licenses/by-sa/4.0/",
    "CC BY-NC-ND 4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/",
    "CC BY-NC 2.0": "https://creativecommons.org/licenses/by-nc/2.0/",
    "Microsoft Research Data License": (
        "https://msropendata-web-api.azurewebsites.net/licenses/"
        "2f933be3-284d-500b-7ea3-2aa2fd0f1bb2/view"
    ),
    "Apache 2.0": "https://www.apache.org/licenses/LICENSE-2.0",
    "MIT": "https://opensource.org/licenses/MIT",
    "Public domain in the USA": "https://creativecommons.org/share-your-work/public-domain/cc0/",
    "M-AILABS License": "https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/",
    "ELRA": "https://www.elra.info/en/services-around-lrs/distribution/licensing/",
}
|
|
|
|
|
# Register one tag per known license. The description is the license name,
# with " License" appended unless the name already ends in " License" or
# contains "Public".
for license_name in license_links:
    keeps_plain_name = license_name.endswith(" License") or "Public" in license_name
    license_description = license_name if keeps_plain_name else license_name + " License"
    needs_tags.append({"name": license_name, "description": license_description})
|
|
|
|
|
# Phone set name -> URL describing that phone set.
phone_set_links = dict(
    Epitran="https://github.com/dmort27/epitran",
    XPF="https://github.com/CohenPr-XPF/XPF",
    ARPA="https://en.wikipedia.org/wiki/ARPABET",
    MFA="https://mfa-models.readthedocs.io/en/refactor/mfa_phone_set.html",
)
|
|
|
|
|
# Register one "<name> phone set" tag per known phone set.
# NOTE(review): needs_tags already has an entry named "MFA" from its initial
# literal, so this adds a second tag with that name -- confirm that is intended.
for phone_set_name in phone_set_links:
    needs_tags.append(
        {
            "name": phone_set_name,
            "description": f"{phone_set_name} phone set",
        }
    )
|
|
|
|
|
|
|
|
def name_link(app, need, needs, *args, **kwargs):
    """Return the string form of the need's ``target_node`` entry.

    Registered with sphinx-needs as a dynamic function in ``setup``.

    Args:
        app: The Sphinx application (unused).
        need: Mapping for the current need; must contain ``"target_node"``.
        needs: All needs (unused).
        *args: Extra positional arguments; accepted and ignored.
        **kwargs: Extra keyword arguments; accepted and ignored.

    Returns:
        ``str(need["target_node"])``.

    Raises:
        KeyError: If ``need`` has no ``"target_node"`` entry.
    """
    # The leftover debug print of the node was dropped: it wrote to stdout on
    # every call and did not affect the returned value.
    return str(need["target_node"])
|
|
|
|
|
def language_link(app, need, needs, *args, **kwargs):
    """Return the string form of the need's ``target_node`` entry.

    ``app``, ``needs`` and any extra positional or keyword arguments are
    accepted but not used. Raises ``KeyError`` if ``need`` has no
    ``"target_node"`` entry.
    """
    return str(need["target_node"])
|
|
|
|
|
def license_link(app, need, needs, license):
    """Format a license name as a ``[name](url)`` link string.

    The ``[title](link)`` shape is what the ``external_link`` pattern in
    ``needs_string_links`` matches.

    Args:
        app: The Sphinx application (unused).
        need: The current need (unused).
        needs: All needs (unused).
        license: License name, looked up in ``license_links``.

    Returns:
        ``"[<license>](<url>)"`` when the license has a known URL, otherwise
        the license name unchanged.
    """
    url = license_links.get(license)
    if url is None:
        # Same fallback as phone_set_link: an unmapped name is shown as plain
        # text rather than raising KeyError.
        return license
    return f"[{license}]({url})"
|
|
|
|
|
def phone_set_link(app, need, needs, phone_set):
    """Format a phone set name as a ``[name](url)`` link string.

    The ``[title](link)`` shape is what the ``external_link`` pattern in
    ``needs_string_links`` matches.

    Args:
        app: The Sphinx application (unused).
        need: The current need (unused).
        needs: All needs (unused).
        phone_set: Phone set name, looked up in ``phone_set_links``.

    Returns:
        ``"[<phone_set>](<url>)"`` when the phone set has a known URL,
        otherwise the phone set name unchanged.
    """
    # The two leftover debug prints were dropped. One of them read
    # need['language'] and so raised KeyError for a need without that key,
    # even though the result never depended on it.
    url = phone_set_links.get(phone_set)
    if url is None:
        return phone_set
    return f"[{phone_set}]({url})"
|
|
|
|
|
# Rules that turn plain option values into hyperlinks.
needs_string_links = {
    # A "phoneset" or "license" value of the form "[title](url)" becomes a
    # link to <url> labelled <title>.
    "external_link": {
        "regex": r"^\[(?P<title>.+)\]\((?P<link>.+)\)$",
        "link_url": "{{link}}",
        "link_name": "{{title}}",
        "options": ["phoneset", "license"],
    },
    # NOTE(review): this rule targets a "github" option, which is not listed
    # in needs_extra_options, and links to the sphinxcontrib-needs issue
    # tracker -- confirm it is still wanted.
    "github_link": {
        "regex": r"^(?P<value>\w+)$",
        "link_url": "https://github.com/useblocks/sphinxcontrib-needs/issues/{{value}}",
        "link_name": "GitHub #{{value}}",
        "options": ["github"],
    },
}
|
|
def setup(app):
    """Register this file's dynamic functions with sphinx-needs.

    Registration order: ``name_link``, ``language_link``, ``license_link``,
    ``phone_set_link``.

    Args:
        app: The Sphinx application object passed to ``add_dynamic_function``.
    """
    for dynamic_function in (name_link, language_link, license_link, phone_set_link):
        add_dynamic_function(app, dynamic_function)
|
|
|