Add glotlid
Browse files
app.py
CHANGED
|
@@ -11,7 +11,7 @@ all_langs = {iso[0]: (iso[1], iso[2], iso[3]) for iso in non_empty_isos} # {'Rom
|
|
| 11 |
iso1toall = {iso[1]: (iso[0], iso[2], iso[3]) for iso in non_empty_isos} # {'ro': ('Romanian', 'rum', 'ron')}
|
| 12 |
DEFAULTS = None
|
| 13 |
|
| 14 |
-
libraries = ["langdetect", "langid", "lingua-py", "pycld2", "fastlangdetect", "fasttext", "openlid"]
|
| 15 |
|
| 16 |
class Detect():
|
| 17 |
def __init__(self, text: str) -> None:
|
|
@@ -46,6 +46,16 @@ class Detect():
|
|
| 46 |
# available_languages = cld2.LANGUAGES
|
| 47 |
isReliable, textBytesFound, details = cld2.detect(self.text, returnVectors=False, bestEffort=True)
|
| 48 |
return [details[0][1], round(details[0][2], 2)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
def fasttext(self) -> list[str, float]:
|
| 50 |
import fasttext
|
| 51 |
from huggingface_hub import hf_hub_download
|
|
@@ -66,6 +76,9 @@ class Detect():
|
|
| 66 |
long_langname = reversed_nllb_langs[language[0].replace('__label__', '')]
|
| 67 |
lang_code = all_langs[long_langname][0]
|
| 68 |
return [lang_code, round(number=probabilities[0] * 100, ndigits=2)]
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
def detect_language(input_text: str, used_libraries: list[str]) -> tuple[str, str]:
|
| 71 |
"""
|
|
@@ -101,6 +114,8 @@ def detect_language(input_text: str, used_libraries: list[str]) -> tuple[str, st
|
|
| 101 |
detections.append(['fasttext'] + detectinstance.fasttext())
|
| 102 |
if 'openlid' in used_libraries:
|
| 103 |
detections.append(['openlid'] + detectinstance.openlid())
|
|
|
|
|
|
|
| 104 |
unique_languages = list(set([x[1] for x in detections]))
|
| 105 |
print(unique_languages, detections)
|
| 106 |
return detections
|
|
|
|
| 11 |
iso1toall = {iso[1]: (iso[0], iso[2], iso[3]) for iso in non_empty_isos} # {'ro': ('Romanian', 'rum', 'ron')}
|
| 12 |
DEFAULTS = None
|
| 13 |
|
| 14 |
+
libraries = ["langdetect", "langid", "lingua-py", "pycld2", "fastlangdetect", "fasttext", "openlid", "glotlid"]
|
| 15 |
|
| 16 |
class Detect():
|
| 17 |
def __init__(self, text: str) -> None:
|
|
|
|
| 46 |
# available_languages = cld2.LANGUAGES
|
| 47 |
isReliable, textBytesFound, details = cld2.detect(self.text, returnVectors=False, bestEffort=True)
|
| 48 |
return [details[0][1], round(details[0][2], 2)]
|
| 49 |
+
def parse_fastext(self, repo_id, k=3):
    """Detect the language of ``self.text`` with a fastText model from the Hugging Face Hub.

    Args:
        repo_id: Hub repository that hosts a fastText ``model.bin`` file.
        k: Number of top predictions requested from the model. Only the
            first (most probable) prediction is used for the result.

    Returns:
        ``[lang_code, confidence]`` where ``lang_code`` is the first element
        of the ``all_langs`` entry for the predicted language and
        ``confidence`` is the top probability expressed as a percentage,
        rounded to two decimals.

    Raises:
        KeyError: If the predicted label has no entry in
            ``languagecodes.nllb_language_codes`` (reversed) or in ``all_langs``.
            NOTE(review): models covering more languages than the NLLB table
            can presumably hit this; confirm the desired fallback.
    """
    import fasttext
    from huggingface_hub import hf_hub_download

    model_path = hf_hub_download(repo_id=repo_id, filename="model.bin")
    model = fasttext.load_model(model_path)

    # fastText's predict() handles one line at a time and raises ValueError
    # when the input contains a newline, so collapse line breaks to spaces.
    single_line_text = self.text.replace("\n", " ")
    labels, probabilities = model.predict(single_line_text, k=k)

    # Reverse the NLLB table so the model label can be used as the lookup key.
    # Distinct loop names avoid visually shadowing the ``k`` parameter.
    label_to_langname = {
        code: name for name, code in languagecodes.nllb_language_codes.items()
    }
    long_langname = label_to_langname[labels[0].replace('__label__', '')]
    lang_code = all_langs[long_langname][0]
    return [lang_code, round(number=probabilities[0] * 100, ndigits=2)]
|
| 59 |
def fasttext(self) -> list[str, float]:
|
| 60 |
import fasttext
|
| 61 |
from huggingface_hub import hf_hub_download
|
|
|
|
| 76 |
long_langname = reversed_nllb_langs[language[0].replace('__label__', '')]
|
| 77 |
lang_code = all_langs[long_langname][0]
|
| 78 |
return [lang_code, round(number=probabilities[0] * 100, ndigits=2)]
|
| 79 |
+
def glotlid(self) -> list[str, float]:
    """Detect the language of the instance text with the GlotLID fastText model.

    Returns:
        The ``[lang_code, confidence]`` list produced by ``parse_fastext``
        for the ``cis-lmu/glotlid`` Hub repository.
    """
    repo_id = "cis-lmu/glotlid"
    # parse_fastext is called as a bound method, so ``self`` is supplied
    # implicitly. Passing it again would shift it into ``repo_id`` and push
    # the repository name into ``k``.
    return self.parse_fastext(repo_id)
|
| 82 |
|
| 83 |
def detect_language(input_text: str, used_libraries: list[str]) -> tuple[str, str]:
|
| 84 |
"""
|
|
|
|
| 114 |
detections.append(['fasttext'] + detectinstance.fasttext())
|
| 115 |
if 'openlid' in used_libraries:
|
| 116 |
detections.append(['openlid'] + detectinstance.openlid())
|
| 117 |
+
if 'glotlid' in used_libraries:
|
| 118 |
+
detections.append(['glotlid'] + detectinstance.glotlid())
|
| 119 |
unique_languages = list(set([x[1] for x in detections]))
|
| 120 |
print(unique_languages, detections)
|
| 121 |
return detections
|