lang_identification / README.md
mamyrbek's picture
Update README.md
53b7799 verified
import fasttext
import issai_lang_id as lang_id

# Load the three fastText classifiers once, at import time. The file names are
# relative paths. `model` is the classifier queried first by get_lang();
# `model_10langs` and `model_9langs` are the fallback classifiers it consults
# when the first answer looks inconsistent with the script of the text.
model, model_10langs, model_9langs = (
    fasttext.load_model(path)
    for path in ("11langs.bin", "10langs.bin", "9langs.bin")
)
# Debug aid: print(model.labels)

def _top_label(classifier, sentence):
    """Return ``(label, probability)`` for the best prediction of *classifier*.

    The ``__label__`` prefix is removed from the returned label.
    """
    # k=2 is kept from the original call; only the first entry is used.
    labels, probabilities = classifier.predict(sentence, k=2)
    return labels[0].replace('__label__', ''), probabilities[0]


def get_lang(text: str) -> str:
    """Identify the language of *text* and return its label.

    The text is stripped, lower-cased and flattened to a single line, then
    classified with the module-level ``model``. The top label and its
    probability are passed through ``lang_id.get`` for post-processing.

    Two script-based sanity checks follow, in this order:

    * If the result is ``'sim'`` but ``lang_id.contains_chinese`` reports no
      Chinese characters, the top label of ``model_10langs`` is used instead.
    * If the result (possibly replaced by the step above) is ``'arb'`` and
      ``lang_id.not_contains_arabic`` is true, the top label of
      ``model_9langs`` is used instead.

    Args:
        text: The sentence to classify. Interior newlines are treated as
            spaces.

    Returns:
        The language label with the ``__label__`` prefix removed.
    """
    # fastText's predict() raises ValueError when the input contains '\n'
    # ("predict processes one line at a time"); strip() only removes newlines
    # at the ends, so interior ones are replaced with spaces here.
    s = text.strip().lower().replace('\n', ' ')

    top, prob = _top_label(model, s)
    yp = lang_id.get(s, top, prob)

    # NOTE(review): in both fallbacks below the original also called
    # lang_id.get(s, <fallback label>, <fallback probability>) and then
    # immediately overwrote its result with the raw fallback label. The
    # discarded call is dropped on the assumption that lang_id.get has no
    # side effects -- confirm. The returned value is unchanged.
    if yp == 'sim' and not lang_id.contains_chinese(s):
        yp, _ = _top_label(model_10langs, s)
    if yp == 'arb' and lang_id.not_contains_arabic(s):
        yp, _ = _top_label(model_9langs, s)
    return yp
# text = "这是一个测试句子。"
# print(get_lang(text))
# text = "This is a test sentence."
# print(get_lang(text))
# text = "هذا جملة اختبار."
# print(get_lang(text))