Spaces:

Zhe-Zhang
/

Language_Classifier

Sleeping

App Files Files Community

Zhe-Zhang committed on Sep 30, 2025

Commit

32d1bb5

verified ·

1 Parent(s): a51206f

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -55

app.py CHANGED Viewed

@@ -1,21 +1,25 @@
 import torch
 import torch.nn as nn
-import joblib
 import hashlib
 from collections import Counter
 import gradio as gr
-# ========== utils ==========
 def ngrams(sentence, n=1, lc=True):
-    if lc:
-        sentence = sentence.lower()
-    return [sentence[i:i+n] for i in range(len(sentence) - n + 1)]
 def all_ngrams(sentence, max_ngram=3, lc=True):
-    result = []
     for i in range(1, max_ngram + 1):
-        result.append(ngrams(sentence, n=i, lc=lc))
-    return result
 MAX_CHARS = 521
 MAX_BIGRAMS = 1031
@@ -24,70 +28,75 @@ MAXES = [MAX_CHARS, MAX_BIGRAMS, MAX_TRIGRAMS]
 def reproducible_hash(string):
     h = hashlib.md5(string.encode("utf-8"), usedforsecurity=False)
-    return int.from_bytes(h.digest()[0:8], "big", signed=True)
 def hash_ngrams(ngrams, modulos):
-    out = []
     for ngram_list, modulo in zip(ngrams, modulos):
         codes = [(reproducible_hash(x) % modulo) for x in ngram_list]
-        out.append(codes)
-    return out
 def calc_rel_freq(codes):
     cnt = Counter(codes)
-    total = sum(cnt.values()) or 1
-    return {k: v / total for k, v in cnt.items()}
-MAX_SHIFT = [0]
-for i in range(1, len(MAXES)):
-    MAX_SHIFT.append(sum(MAXES[:i]))
-def shift_keys(dicts, shift_list):
-    new = {}
-    for i, d in enumerate(dicts):
-        for k, v in d.items():
-            new[k + shift_list[i]] = v
-    return new
-def build_freq_dict(sentence):
     hngrams = hash_ngrams(all_ngrams(sentence), MAXES)
-    freqs = list(map(calc_rel_freq, hngrams))
-    return shift_keys(freqs, MAX_SHIFT)
-# ========== load artifacts ==========
 vectorizer = joblib.load("nld_vectorizer.joblib")
 idx2lang = joblib.load("nld_lang_codes.joblib")
-input_dim = len(vectorizer.feature_names_)   # 确保和训练时一致
-num_classes = len(idx2lang)
 model = nn.Sequential(
     nn.Linear(input_dim, 50),
     nn.ReLU(),
-    nn.Linear(50, num_classes)
 )
-state_dict = torch.load("nld.pth", map_location="cpu")
-model.load_state_dict(state_dict)
 model.eval()
-# ========== prediction ==========
-def detect_lang(text: str):
-    feat_dict = build_freq_dict(text)
-    X = vectorizer.transform([feat_dict])
-    X_tensor = torch.from_numpy(X.toarray().astype("float32"))
-    with torch.no_grad():
-        logits = model(X_tensor)
-        pred_idx = torch.argmax(logits, dim=1).item()
-    return idx2lang[pred_idx]
-# ========== Gradio UI ==========
-with gr.Blocks(title="Language Detector") as demo:
-    gr.Markdown("## Language Detector")
     with gr.Row():
-        text_in = gr.Textbox(label="Input text", placeholder="Type something...")
-        text_out = gr.Textbox(label="Predicted language", interactive=False)
-    btn = gr.Button("Detect")
-    btn.click(fn=detect_lang, inputs=text_in, outputs=text_out)
-demo.launch()

+import numpy as np
 import torch
 import torch.nn as nn
 import hashlib
+import joblib
 from collections import Counter
 import gradio as gr
+# --- utils (from the notebook) ---
 def ngrams(sentence, n=1, lc=True):
+    """Return every contiguous character n-gram of *sentence*.
+
+    Args:
+        sentence: Text to slice.
+        n: Length, in characters, of each n-gram.
+        lc: When true (the default), lowercase the text before slicing.
+
+    Returns:
+        The n-grams in order of appearance; an empty list when the text
+        is shorter than *n*.
+    """
+    # Honour the flag: lowercasing unconditionally made ``lc=False`` a no-op.
+    if lc:
+        sentence = sentence.lower()
+    return [sentence[i:i + n] for i in range(len(sentence) - n + 1)]
 def all_ngrams(sentence, max_ngram=3, lc=True):
+    """Collect the n-gram lists of *sentence* for orders 1 through *max_ngram*.
+
+    Returns:
+        A list with one entry per order; entry ``k`` is the list produced by
+        ``ngrams(sentence, n=k + 1, lc=lc)``.
+    """
+    return [
+        ngrams(sentence, n=order, lc=lc)
+        for order in range(1, max_ngram + 1)
+    ]
 MAX_CHARS = 521  # modulo applied to hashed 1-grams (first entry of MAXES)
 MAX_BIGRAMS = 1031  # modulo applied to hashed 2-grams (second entry of MAXES)
 def reproducible_hash(string):
+    """Map *string* to a signed 64-bit integer derived from its MD5 digest.
+
+    The text is encoded as UTF-8, hashed with MD5 (flagged as not used for
+    security), and the first eight digest bytes are read as a big-endian
+    signed integer, so the same text always yields the same value.
+    """
+    digest = hashlib.md5(string.encode("utf-8"), usedforsecurity=False).digest()
+    return int.from_bytes(digest[:8], byteorder="big", signed=True)
 def hash_ngrams(ngrams, modulos):
+    """Hash each n-gram list into integer codes bounded by its own modulo.
+
+    Args:
+        ngrams: Sequence of n-gram lists, one list per n-gram order.
+        modulos: Modulo for each list, paired positionally with *ngrams*;
+            pairing stops at the shorter of the two sequences.
+
+    Returns:
+        One list of codes per paired n-gram list, each code being
+        ``reproducible_hash(ngram) % modulo``.
+    """
+    return [
+        [reproducible_hash(gram) % modulo for gram in ngram_list]
+        for ngram_list, modulo in zip(ngrams, modulos)
+    ]
 def calc_rel_freq(codes):
+    """Return the relative frequency of every distinct value in *codes*.
+
+    Returns:
+        A ``Counter`` mapping each code to its share of the total number of
+        codes; empty when *codes* is empty.
+    """
+    counts = Counter(codes)
+    total = sum(counts.values())
+    # An empty input yields an empty mapping, so no division takes place.
+    return Counter({code: count / total for code, count in counts.items()})
+# Offset added to the codes of each n-gram order: the sum of the modulos of
+# all earlier orders, so the first offset is 0.
+MAX_SHIFT = [0] * len(MAXES)
+for i in range(1, len(MAXES)):
+    MAX_SHIFT[i] = MAX_SHIFT[i - 1] + MAXES[i - 1]
+def shift_keys(dicts, MAX_SHIFT):
+    """Merge several code -> value mappings into one, offsetting their keys.
+
+    Args:
+        dicts: Iterable of mappings with integer keys.
+        MAX_SHIFT: Offsets indexed by position; every key of the i-th
+            mapping is increased by ``MAX_SHIFT[i]``.
+
+    Returns:
+        A single dict holding all shifted keys. If two shifted keys collide,
+        the value from the later mapping wins.
+    """
+    return {
+        code + MAX_SHIFT[position]: value
+        for position, mapping in enumerate(dicts)
+        for code, value in mapping.items()
+    }
+def build_freq_dict(sentence, MAXES=MAXES, MAX_SHIFT=MAX_SHIFT):
+    """Turn *sentence* into one mapping of hashed n-gram code -> relative frequency.
+
+    The n-grams of every order are hashed with the matching modulo from
+    *MAXES*, converted to relative frequencies per order, and merged into a
+    single dict with each order's keys offset by the matching *MAX_SHIFT*.
+    """
+    codes_per_order = hash_ngrams(all_ngrams(sentence), MAXES)
+    # Lazily computed, exactly one frequency table per n-gram order.
+    freqs_per_order = (calc_rel_freq(codes) for codes in codes_per_order)
+    return shift_keys(freqs_per_order, MAX_SHIFT)
+# --- load models ---
+# NOTE(review): joblib.load unpickles its input, so these artifact files must
+# come from a trusted source.
+# NOTE(review): `clf` is not referenced by any other code visible in this
+# diff - confirm it is still needed before keeping this load.
+clf = joblib.load("nld.joblib")
+# `vectorizer` turns feature dicts into the model's input matrix and
+# `idx2lang` maps a predicted class index to its language label.
 vectorizer = joblib.load("nld_vectorizer.joblib")
 idx2lang = joblib.load("nld_lang_codes.joblib")
+# Layer sizes are derived from the loaded artifacts; presumably they match
+# the shapes stored in "nld.pth" - load_state_dict will fail otherwise.
+input_dim = len(vectorizer.vocabulary_)
+nbr_classes = len(idx2lang)
+# One hidden layer of 50 units with ReLU, followed by one output per class.
 model = nn.Sequential(
     nn.Linear(input_dim, 50),
     nn.ReLU(),
+    nn.Linear(50, nbr_classes)
 )
+# Weights are mapped onto the CPU regardless of the device they were saved from.
+model.load_state_dict(torch.load("nld.pth", map_location="cpu"))
+# Switch the network to evaluation mode before serving predictions.
 model.eval()
+# --- prediction function ---
+def detect_lang(src_sentence):
+    """Predict the language of a single piece of text.
+
+    Args:
+        src_sentence: The text to classify.
+
+    Returns:
+        The entry of ``idx2lang`` for the highest-scoring class, or ``None``
+        when that class index has no entry.
+    """
+    features = vectorizer.transform([build_freq_dict(src_sentence)])
+    # The vectorizer may return a sparse matrix; the model needs a dense one.
+    if hasattr(features, "toarray"):
+        features = features.toarray()
+    # Inference only: skip autograd bookkeeping for the forward pass.
+    with torch.no_grad():
+        logits = model(torch.Tensor(features))
+    predicted_index = torch.argmax(logits, dim=-1).tolist()[0]
+    return idx2lang.get(predicted_index)
+# --- Gradio UI ---
+# Build the page: a heading, one row holding an input column and an output
+# column, and a button that triggers the prediction.
+with gr.Blocks(title="language detector") as demo:
+    gr.Markdown("# language detector")
     with gr.Row():
+        with gr.Column():
+            # Text typed by the user; passed to detect_lang on click.
+            src_sentence = gr.Textbox(
+                label="Text", placeholder="Write your text...")
+        with gr.Column():
+            # Receives the value returned by detect_lang.
+            tgt_sentence = gr.Textbox(
+                label="Language", placeholder="Language will show here...")
+    btn = gr.Button("Guess the language!")
+    # Wire the button: input textbox -> detect_lang -> output textbox.
+    btn.click(fn=detect_lang, inputs=[src_sentence], outputs=[tgt_sentence])
+# Launch the app as soon as this module is executed.
+demo.launch()