TiberiuCristianLeon committed on
Commit
230b561
·
verified ·
1 Parent(s): 3737317

Add glotlid

Browse files
Files changed (1) hide show
  1. app.py +16 -1
app.py CHANGED
@@ -11,7 +11,7 @@ all_langs = {iso[0]: (iso[1], iso[2], iso[3]) for iso in non_empty_isos} # {'Rom
11
  iso1toall = {iso[1]: (iso[0], iso[2], iso[3]) for iso in non_empty_isos} # {'ro': ('Romanian', 'rum', 'ron')}
12
  DEFAULTS = None
13
 
14
- libraries = ["langdetect", "langid", "lingua-py", "pycld2", "fastlangdetect", "fasttext", "openlid"]
15
 
16
  class Detect():
17
  def __init__(self, text: str) -> None:
@@ -46,6 +46,16 @@ class Detect():
46
  # available_languages = cld2.LANGUAGES
47
  isReliable, textBytesFound, details = cld2.detect(self.text, returnVectors=False, bestEffort=True)
48
  return [details[0][1], round(details[0][2], 2)]
 
 
 
 
 
 
 
 
 
 
49
  def fasttext(self) -> list[str, float]:
50
  import fasttext
51
  from huggingface_hub import hf_hub_download
@@ -66,6 +76,9 @@ class Detect():
66
  long_langname = reversed_nllb_langs[language[0].replace('__label__', '')]
67
  lang_code = all_langs[long_langname][0]
68
  return [lang_code, round(number=probabilities[0] * 100, ndigits=2)]
 
 
 
69
 
70
  def detect_language(input_text: str, used_libraries: list[str]) -> tuple[str, str]:
71
  """
@@ -101,6 +114,8 @@ def detect_language(input_text: str, used_libraries: list[str]) -> tuple[str, st
101
  detections.append(['fasttext'] + detectinstance.fasttext())
102
  if 'openlid' in used_libraries:
103
  detections.append(['openlid'] + detectinstance.openlid())
 
 
104
  unique_languages = list(set([x[1] for x in detections]))
105
  print(unique_languages, detections)
106
  return detections
 
11
  iso1toall = {iso[1]: (iso[0], iso[2], iso[3]) for iso in non_empty_isos} # {'ro': ('Romanian', 'rum', 'ron')}
12
  DEFAULTS = None
13
 
14
+ libraries = ["langdetect", "langid", "lingua-py", "pycld2", "fastlangdetect", "fasttext", "openlid", "glotlid"]
15
 
16
  class Detect():
17
  def __init__(self, text: str) -> None:
 
46
  # available_languages = cld2.LANGUAGES
47
  isReliable, textBytesFound, details = cld2.detect(self.text, returnVectors=False, bestEffort=True)
48
  return [details[0][1], round(details[0][2], 2)]
49
def parse_fastext(self, repo_id, k=3):
    """Run a fastText language-ID model from the Hugging Face Hub on ``self.text``.

    Downloads ``model.bin`` from ``repo_id``, loads it with fastText, asks for
    the top ``k`` predictions and converts the best one to the first code
    stored for that language in ``all_langs``.

    Args:
        repo_id: Hugging Face Hub repository that hosts ``model.bin``.
        k: Number of predictions requested from the model. Only the first
            (best) prediction is used for the result.

    Returns:
        ``[lang_code, confidence]`` where ``confidence`` is the top
        probability scaled to a percentage and rounded to two decimals.
    """
    import fasttext
    from huggingface_hub import hf_hub_download

    model_path = hf_hub_download(repo_id=repo_id, filename="model.bin")
    model = fasttext.load_model(model_path)

    # fastText's predict() handles one line at a time and raises ValueError
    # when the input contains a newline, so flatten multi-line text first.
    single_line_text = self.text.replace("\n", " ")
    labels, probabilities = model.predict(single_line_text, k=k)

    # Invert the NLLB table so a model label can be mapped back to its key.
    # Loop variables are deliberately not called ``k`` to avoid confusion
    # with the ``k`` parameter above.
    reversed_nllb_langs = {
        code: name for name, code in languagecodes.nllb_language_codes.items()
    }
    # NOTE(review): a label missing from the NLLB table raises KeyError here;
    # confirm that every label the chosen model can emit is covered.
    long_langname = reversed_nllb_langs[labels[0].replace('__label__', '')]
    lang_code = all_langs[long_langname][0]
    return [lang_code, round(number=probabilities[0] * 100, ndigits=2)]
59
  def fasttext(self) -> list[str, float]:
60
  import fasttext
61
  from huggingface_hub import hf_hub_download
 
76
  long_langname = reversed_nllb_langs[language[0].replace('__label__', '')]
77
  lang_code = all_langs[long_langname][0]
78
  return [lang_code, round(number=probabilities[0] * 100, ndigits=2)]
79
def glotlid(self) -> list[str, float]:
    """Detect the language of the instance text with the GlotLID model.

    Delegates to ``parse_fastext`` using the ``cis-lmu/glotlid`` Hugging Face
    repository id.

    Returns:
        The ``[lang_code, confidence]`` list produced by ``parse_fastext``.
    """
    repo_id = "cis-lmu/glotlid"
    # ``self`` is already bound by the attribute access; passing it again
    # would shift the instance into ``repo_id`` and the repository id into
    # ``k``.
    return self.parse_fastext(repo_id)
82
 
83
  def detect_language(input_text: str, used_libraries: list[str]) -> tuple[str, str]:
84
  """
 
114
  detections.append(['fasttext'] + detectinstance.fasttext())
115
  if 'openlid' in used_libraries:
116
  detections.append(['openlid'] + detectinstance.openlid())
117
+ if 'glotlid' in used_libraries:
118
+ detections.append(['glotlid'] + detectinstance.glotlid())
119
  unique_languages = list(set([x[1] for x in detections]))
120
  print(unique_languages, detections)
121
  return detections