LanguageDetection

Sleeping

App Files Files Community

TiberiuCristianLeon committed on Jan 13

Commit

230b561

verified ·

1 Parent(s): 3737317

Add glotlid

Browse files

Files changed (1) hide show

app.py +16 -1

app.py CHANGED Viewed

@@ -11,7 +11,7 @@ all_langs = {iso[0]: (iso[1], iso[2], iso[3]) for iso in non_empty_isos} # {'Rom
 iso1toall = {iso[1]: (iso[0], iso[2], iso[3]) for iso in non_empty_isos} # {'ro': ('Romanian', 'rum', 'ron')}
 DEFAULTS = None
-libraries = ["langdetect", "langid", "lingua-py", "pycld2", "fastlangdetect", "fasttext", "openlid"]
 class Detect():
     def __init__(self, text: str) -> None:
@@ -46,6 +46,16 @@ class Detect():
         # available_languages = cld2.LANGUAGES
         isReliable, textBytesFound, details = cld2.detect(self.text, returnVectors=False, bestEffort=True)
         return [details[0][1], round(details[0][2], 2)]
     def fasttext(self) -> list[str, float]:
         import fasttext
         from huggingface_hub import hf_hub_download
@@ -66,6 +76,9 @@ class Detect():
         long_langname = reversed_nllb_langs[language[0].replace('__label__', '')]
         lang_code = all_langs[long_langname][0]
         return [lang_code, round(number=probabilities[0] * 100, ndigits=2)]
 def detect_language(input_text: str, used_libraries: list[str]) -> tuple[str, str]:
     """
@@ -101,6 +114,8 @@ def detect_language(input_text: str, used_libraries: list[str]) -> tuple[str, st
         detections.append(['fasttext'] + detectinstance.fasttext())
     if 'openlid' in used_libraries:
         detections.append(['openlid'] + detectinstance.openlid())
     unique_languages = list(set([x[1] for x in detections]))
     print(unique_languages, detections)
     return detections

 iso1toall = {iso[1]: (iso[0], iso[2], iso[3]) for iso in non_empty_isos} # {'ro': ('Romanian', 'rum', 'ron')}
 DEFAULTS = None
+libraries = ["langdetect", "langid", "lingua-py", "pycld2", "fastlangdetect", "fasttext", "openlid", "glotlid"]
 class Detect():
     def __init__(self, text: str) -> None:
         # available_languages = cld2.LANGUAGES
         isReliable, textBytesFound, details = cld2.detect(self.text, returnVectors=False, bestEffort=True)
         return [details[0][1], round(details[0][2], 2)]
+    def parse_fastext(self, repo_id: str, k: int = 3) -> list:
+        """Detect the language of ``self.text`` with a fastText model from the Hugging Face Hub.
+
+        Downloads ``model.bin`` from ``repo_id`` with ``hf_hub_download``, loads it
+        with ``fasttext.load_model`` and converts the top-ranked label into a
+        language code through ``languagecodes.nllb_language_codes`` and ``all_langs``.
+
+        Args:
+            repo_id: Hub repository that hosts a fastText ``model.bin`` file.
+            k: Number of predictions requested from the model; only the first
+                (highest-ranked) one is used for the result.
+
+        Returns:
+            ``[lang_code, confidence]`` where ``lang_code`` is the first element of
+            the matching ``all_langs`` entry and ``confidence`` is the top
+            probability expressed as a percentage rounded to two decimals.
+
+        Raises:
+            KeyError: If the predicted label is missing from
+                ``languagecodes.nllb_language_codes`` or its language name is
+                missing from ``all_langs``.
+        """
+        import fasttext
+        from huggingface_hub import hf_hub_download
+        model_path = hf_hub_download(repo_id=repo_id, filename="model.bin")
+        model = fasttext.load_model(model_path)
+        # fastText's predict() raises ValueError when the input contains "\n"
+        # (it processes one line at a time), so flatten multi-line text first.
+        single_line_text = self.text.replace("\n", " ")
+        language, probabilities = model.predict(single_line_text, k=k)
+        # Invert {language name: label code} into {label code: language name}.
+        # Distinct loop names avoid shadowing the ``k`` parameter.
+        reversed_nllb_langs = {code: name for name, code in languagecodes.nllb_language_codes.items()}
+        # NOTE(review): a model whose label set is larger than the NLLB table
+        # would raise KeyError here - confirm coverage for each repo_id used.
+        long_langname = reversed_nllb_langs[language[0].replace('__label__', '')]
+        lang_code = all_langs[long_langname][0]
+        return [lang_code, round(number=probabilities[0] * 100, ndigits=2)]
     def fasttext(self) -> list[str, float]:
         import fasttext
         from huggingface_hub import hf_hub_download
         long_langname = reversed_nllb_langs[language[0].replace('__label__', '')]
         lang_code = all_langs[long_langname][0]
         return [lang_code, round(number=probabilities[0] * 100, ndigits=2)]
+    def glotlid(self) -> list[str, float]:
+        """Detect the language of ``self.text`` with the ``cis-lmu/glotlid`` Hub model.
+
+        Delegates to ``parse_fastext`` and returns its result unchanged.
+        """
+        repo_id = "cis-lmu/glotlid"
+        # parse_fastext is called as a bound method, so ``self`` is supplied
+        # implicitly; passing it again would shift repo_id into the ``k`` slot.
+        return self.parse_fastext(repo_id)
 def detect_language(input_text: str, used_libraries: list[str]) -> tuple[str, str]:
     """
         detections.append(['fasttext'] + detectinstance.fasttext())
     if 'openlid' in used_libraries:
         detections.append(['openlid'] + detectinstance.openlid())
+    if 'glotlid' in used_libraries:
+        detections.append(['glotlid'] + detectinstance.glotlid())
     unique_languages = list(set([x[1] for x in detections]))
     print(unique_languages, detections)
     return detections