Spaces:

cdleong
/

langcode-search

Sleeping

App Files Files Community

cdleong committed on Nov 17, 2021

Commit

741cd0d

1 Parent(s): 6998de9

Add support for Matlab/Python

Browse files

Files changed (1) hide show

app.py +123 -105

app.py CHANGED Viewed

@@ -5,55 +5,20 @@ import urllib
 import requests
 # FEATURE: get wikipedia codes, e.g. from https://en.wikipedia.org/wiki/List_of_Wikipedias or https://meta.wikimedia.org/wiki/List_of_Wikipedias, some of which are nonstandard. Then output f"{code}.wikipedia.org"
-# FEATURE: add programming languages easter egg
 # TODO: fix 'knh', it has an empty ISO section. Turns out some languages only have 639-3
 # TODO: add in some nice things from https://docs.streamlit.io/library/cheatsheet like error codes and status messages.
 # TODO: add in vachan search even if lang not found
-st.write("# Language code/tag search")
-st.write("Fed up with language tag confusion? Here's your one-stop shop!")
-st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English` (or even some in other languages, try '法语' or 'français'), and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47 code according to [official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)")
-st.write(f"**Feedback:** Provide feedback at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")
-# https://huggingface.co/blog/streamlit-spaces
-# https://github.com/psf/requests-html
-# https://docs.streamlit.io/library/api-reference/write-magic/st.write
-example_valid_lookups = ["zh-CN", "Chinese", "zh-Latn-pinyin", "en-Latn-US", "en", "English", "fr-CA", "French (Canada)", "français", "法语"]
-langtext = st.text_input("Language Code/Tag Lookup using langcodes", "Swahili", help=f"Try language codes or language names! Examples: {example_valid_lookups}").strip()
-# TODO: st.code() for these "lookup in progress" outputs.
-st.write("* Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")
-if langcodes.tag_is_valid(langtext):
-  st.write(f"* ...True! '{langtext}' parses meaningfully as a language tag according to IANA.")
-else:
-  st.write(f"* ...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description.")
-try:
-  lang = langcodes.Language.get(langtext)
-#  st.write(f"{lang} is the BCP-47 tag.")
-  if "unknown" in lang.display_name().lower():
-    st.write(f"* Attempting to lookup the code directly gives us '{lang.display_name()}', attempting to search for it as a natural language string.")
-    lang = None
-except langcodes.LanguageTagError as e:
-  st.write(f"* Could not lookup code directly, attempting to search for it as a natural language string.")
-  lang = None
-if lang is None:
-  try:
-    found = langcodes.find(langtext)
-    lang = found
-    st.write(f"* Natural language search found the following BCP-47 tag: {lang}")
-  except LookupError as e:
-    st.write("## Result: failure!")
-    st.write(f"Unable to look up language code. But all hope is not lost...")
-    st.write(f"* You can also try https://r12a.github.io/app-subtags/")
-    st.write(f"* Or possibly https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)}")
-    lang = None
 def pull_obsolete_codes(iso_code):
   session = HTMLSession()
@@ -93,73 +58,126 @@ def try_searching_vachan_engine(langtext):
     results_list = vachan_r.json()
   return results_list
-#st.write(f"langcodes found the following tag: {type(found)}") # a Language object
-if lang is not None:
-  display = lang.display_name()
-  b_variant = lang.to_alpha3(variant='B')
-  t_variant = lang.to_alpha3(variant='T')
-  broader_tags = lang.broader_tags()
-  results_from_vachan = try_searching_vachan_engine(langtext)
-  standardized_tag = langcodes.standardize_tag(lang)
-  languoid_id = try_retrieving_glottolog_id(langtext)
-  st.write(f"## Results: probably use '{standardized_tag}'")
-  # TODO: make a results dictionary so it's easy to copy-paste?
-  st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")
-  st.write(f"Breakdown of tag components:")
-  st.write(lang.describe())
-  st.write(f"Display name for {lang}: {lang.display_name()}")
-  st.write(f"Autonym for {lang}: {lang.autonym()}")
-  st.write(f"**Correct, standardized, BCP-47 tag for {langtext}, according to the langcodes library:** `{standardized_tag}`")
-  st.write("## Further Information:")
-  st.write(f"Broader tags for this language, if any:")
-  st.write(broader_tags)
-  st.write(f"### Language Subtag Search Tool")
-  st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}, which will likely have links to Ethnologue, Wikipedia, and Character usage. You can also try searching for '{langtext}' there!")
-  st.write(f"### Glottolog")
-  if languoid_id:
-    st.write(f"**Glottolog Languoid ID:** Searching for '{langtext}' on Glottolog returns the following 'languoid ID': [{languoid_id}](https://glottolog.org/resource/languoid/id/{languoid_id})")
-  st.write(f"https://glottolog.org/glottolog?search={t_variant} may be also of interest, with links to various resources including WALS, Wikidata, Odin, and OLAC. ")
-  if t_variant != b_variant:
-    st.write(f"If that doesn't work, try https://glottolog.org/glottolog?search={b_variant}, or put in a [custom search query](https://glottolog.org/glottolog)")
-  st.write(f"https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)} may pull up something as well.")
-  st.write("### Older / Related Codes")
-  st.write(f"ISO 639-3 'alpha3' code, 'terminology' or 'T' variant (deprecated): {t_variant}")
-  st.write(f"ISO 639-3 'alpha3' code, 'bibliographic' or 'B' variant (deprecated): {b_variant}")
-  # ethnologue prefers T for german (deu), and T for French
-  st.write(f"If it exists, the ISO 639 Code Tables entry for the T variant would be at https://iso639-3.sil.org/code/{t_variant}. That is also the code variant that typically has a working link to Ethnologue.")
-  if t_variant != b_variant:
-    st.write(f"If it exists, the ISO 639 Code Tables entry for the B variant would be at https://iso639-3.sil.org/code/{b_variant}")
-  st.write("#### Codes scraped from iso639-3.sil.org")
-  #TODO: Cleanup this bit
-  t_obsolete_codes = pull_obsolete_codes(t_variant)
-  b_obsolete_codes = pull_obsolete_codes(b_variant)
-  if t_obsolete_codes:
-    st.write(f"Obsolete codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{t_variant}:")
-    st.write(t_obsolete_codes)
-  elif b_obsolete_codes:
-    st.write(f"Obsolete codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{b_variant}:")
-    st.write(b_obsolete_codes)
-  if results_from_vachan:
-    st.write("### Other potential matches, from [Vachan Engine](https://github.com/Bridgeconn/vachan-api/tree/version-2) (experimental)")
-    st.write(results_from_vachan)

 import requests
 # FEATURE: get wikipedia codes, e.g. from https://en.wikipedia.org/wiki/List_of_Wikipedias or https://meta.wikimedia.org/wiki/List_of_Wikipedias, some of which are nonstandard. Then output f"{code}.wikipedia.org"
 # TODO: fix 'knh', it has an empty ISO section. Turns out some languages only have 639-3
 # TODO: add in some nice things from https://docs.streamlit.io/library/cheatsheet like error codes and status messages.
 # TODO: add in vachan search even if lang not found
+# TODO: results from glottolog even if none from others
+things_to_test = [  # sample lookup inputs for manual checking; each entry's comment notes what makes it a tricky case
+  "knh", # deprecated code on ISO
+  "khn", # only has 639-3 on ISO
+  "xxx", # no such code on ISO or glottolog
+  "Chinese", # Vachan struggles.
+]
+def get_bcp47_from_langcode(langtext):  # TODO: unimplemented stub; the body is empty, so it returns None for any langtext
+  pass  # placeholder until the lookup logic is written
 def pull_obsolete_codes(iso_code):
   session = HTMLSession()
     results_list = vachan_r.json()
   return results_list
+def main():
+  st.write("# Language code/tag search")
+  st.write("Fed up with language tag confusion? Here's your one-stop shop!")
+  st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English` (or even some in other languages, try '法语' or 'français'), and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47 code according to [official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)")
+  st.write(f"**Feedback:** Provide feedback at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")
+  # https://huggingface.co/blog/streamlit-spaces
+  # https://github.com/psf/requests-html
+  # https://docs.streamlit.io/library/api-reference/write-magic/st.write
+  example_valid_lookups = ["zh-CN", "Chinese", "zh-Latn-pinyin", "en-Latn-US", "en", "English", "fr-CA", "French (Canada)", "français", "法语"]
+  langtext = st.text_input("Language Code/Tag Lookup using langcodes", "Swahili", help=f"Try language codes or language names! Examples: {example_valid_lookups}").strip()
+  if langtext.lower() == "matlab":
+    st.error("Matlab is not a real language! ¯\\_(ツ)_/¯")
+    return
+  if langtext.lower() == "python":
+    st.success("[Python is the best language!(https://www.python.org/)")
+    return
+  # TODO: st.code() for these "lookup in progress" outputs.
+  st.write("* Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")
+  if langcodes.tag_is_valid(langtext):
+    st.write(f"* ...True! '{langtext}' parses meaningfully as a language tag according to IANA.")
+  else:
+    st.write(f"* ...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description.")
+  try:
+    lang = langcodes.Language.get(langtext)
+  #  st.write(f"{lang} is the BCP-47 tag.")
+    if "unknown" in lang.display_name().lower():
+      st.write(f"* Attempting to lookup the code directly gives us '{lang.display_name()}', attempting to search for it as a natural language string.")
+      lang = None
+  except langcodes.LanguageTagError as e:
+    st.write(f"* Could not lookup code directly, attempting to search for it as a natural language string.")
+    lang = None
+  if lang is None:
+    try:
+      found = langcodes.find(langtext)
+      lang = found
+      st.write(f"* Natural language search found the following BCP-47 tag: {lang}")
+    except LookupError as e:
+      st.write("## Result: failure!")
+      st.write(f"Unable to look up language code. But all hope is not lost...")
+      st.write(f"* You can also try https://r12a.github.io/app-subtags/")
+      st.write(f"* Or possibly https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)}")
+      lang = None
+  #st.write(f"langcodes found the following tag: {type(found)}") # a Language object
+  if lang is not None:
+    display = lang.display_name()
+    b_variant = lang.to_alpha3(variant='B')
+    t_variant = lang.to_alpha3(variant='T')
+    broader_tags = lang.broader_tags()
+    results_from_vachan = try_searching_vachan_engine(langtext)
+    standardized_tag = langcodes.standardize_tag(lang)
+    languoid_id = try_retrieving_glottolog_id(langtext)
+    st.write(f"## Results: probably use '{standardized_tag}'")
+    # TODO: make a results dictionary so it's easy to copy-paste?
+    st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")
+    st.write(f"Breakdown of tag components:")
+    st.write(lang.describe())
+    st.write(f"Display name for {lang}: {lang.display_name()}")
+    st.write(f"Autonym for {lang}: {lang.autonym()}")
+    st.write(f"**Correct, standardized, BCP-47 tag for {langtext}, according to the langcodes library:** `{standardized_tag}`")
+    st.write("## Further Information:")
+    st.write(f"Broader tags for this language, if any:")
+    st.write(broader_tags)
+    st.write(f"### Language Subtag Search Tool")
+    st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}, which will likely have links to Ethnologue, Wikipedia, and Character usage. You can also try searching for '{langtext}' there!")
+    st.write(f"### Glottolog")
+    if languoid_id:
+      st.write(f"**Glottolog Languoid ID:** Searching for '{langtext}' on Glottolog returns the following 'languoid ID': [{languoid_id}](https://glottolog.org/resource/languoid/id/{languoid_id})")
+    st.write(f"https://glottolog.org/glottolog?search={t_variant} may be also of interest, with links to various resources including WALS, Wikidata, Odin, and OLAC. ")
+    if t_variant != b_variant:
+      st.write(f"If that doesn't work, try https://glottolog.org/glottolog?search={b_variant}, or put in a [custom search query](https://glottolog.org/glottolog)")
+    st.write(f"https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)} may pull up something as well.")
+    st.write("### Older / Related Codes")
+    st.write(f"ISO 639-3 'alpha3' code, 'terminology' or 'T' variant (deprecated): {t_variant}")
+    st.write(f"ISO 639-3 'alpha3' code, 'bibliographic' or 'B' variant (deprecated): {b_variant}")
+    # ethnologue prefers T for german (deu), and T for French
+    st.write(f"If it exists, the ISO 639 Code Tables entry for the T variant would be at https://iso639-3.sil.org/code/{t_variant}. That is also the code variant that typically has a working link to Ethnologue.")
+    if t_variant != b_variant:
+      st.write(f"If it exists, the ISO 639 Code Tables entry for the B variant would be at https://iso639-3.sil.org/code/{b_variant}")
+    st.write("#### Codes scraped from iso639-3.sil.org")
+    #TODO: Cleanup this bit
+    t_obsolete_codes = pull_obsolete_codes(t_variant)
+    b_obsolete_codes = pull_obsolete_codes(b_variant)
+    if t_obsolete_codes:
+      st.write(f"Obsolete codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{t_variant}:")
+      st.write(t_obsolete_codes)
+    elif b_obsolete_codes:
+      st.write(f"Obsolete codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{b_variant}:")
+      st.write(b_obsolete_codes)
+    if results_from_vachan:
+      st.write("### Other potential matches, from [Vachan Engine](https://github.com/Bridgeconn/vachan-api/tree/version-2) (experimental)")
+      st.write(results_from_vachan)
+if __name__ == "__main__":  # entry-point guard: call main() only when run as a script, not on import
+  main()