Spaces:

Di12
/

KC_Classifier

Sleeping

App Files Files Community

Di12 committed on Jun 11, 2025

Commit

c7dc535

verified ·

1 Parent(s): 8d15863

Update app.py

Browse files

Files changed (1) hide show

app.py +4 -12

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-# app.py
 import json
 import re
 import unicodedata
@@ -6,7 +5,6 @@ from bs4 import BeautifulSoup
 import numpy as np
 import gradio as gr
-# —— 1. Preprocess (như trước) —— #
 def clean_html(raw_html: str) -> str:
     soup = BeautifulSoup(raw_html, "html.parser")
     for img in soup.find_all("img"): img.decompose()
@@ -27,13 +25,11 @@ def normalize_text(text: str) -> str:
 def preprocess(content_html: str) -> str:
     return normalize_text(clean_html(content_html))
-# —— 2. Load JSON & build transformer + NB classifier —— #
 with open("vectorizer.json", encoding="utf-8") as f:
     vect_data = json.load(f)
 vocab = vect_data["vocabulary"]
-# nếu có idf: idf = np.array(vect_data["idf"])
-# Chúng ta sẽ implement CountVectorizer-like transform:
 def transform_count(docs):
     """
     docs: list of preprocessed strings
@@ -47,10 +43,7 @@ def transform_count(docs):
             idx = vocab.get(token)
             if idx is not None:
                 X[i, idx] += 1.0
-    return X
-# Nếu bạn dùng TfidfVectorizer,
-# bạn sẽ tính tf-idf dựa trên vect_data["idf"] → bỏ qua trong ví dụ này.
 with open("nbc_model.json", encoding="utf-8") as f:
     clf_data = json.load(f)
@@ -69,7 +62,6 @@ def predict_nb_count(docs):
     idx = np.argmax(log_post, axis=1)
     return classes[idx]
-# —— 3. Gradio interface —— #
 def predict_kc(content_html: str):
     if not content_html:
         return "Chưa nhập content."
@@ -83,8 +75,8 @@ interface = gr.Interface(
     fn = predict_kc,
     inputs  = gr.Textbox(lines=5, placeholder="Dán HTML Content…"),
     outputs = gr.Label(label="KC dự đoán"),
-    title    = "NBC KC Predictor (no-pickle)",
-    description="Dự đoán nhãn KC dựa trên Naive Bayes đã export JSON."
 )
 if __name__ == "__main__":

 import json
 import re
 import unicodedata
 import numpy as np
 import gradio as gr
 def clean_html(raw_html: str) -> str:
     soup = BeautifulSoup(raw_html, "html.parser")
     for img in soup.find_all("img"): img.decompose()
 def preprocess(content_html: str) -> str:
     return normalize_text(clean_html(content_html))
 with open("vectorizer.json", encoding="utf-8") as f:
     vect_data = json.load(f)
 vocab = vect_data["vocabulary"]
+# Implement CountVectorizer-like transform:
 def transform_count(docs):
     """
     docs: list of preprocessed strings
             idx = vocab.get(token)
             if idx is not None:
                 X[i, idx] += 1.0
+    return
 with open("nbc_model.json", encoding="utf-8") as f:
     clf_data = json.load(f)
     idx = np.argmax(log_post, axis=1)
     return classes[idx]
 def predict_kc(content_html: str):
     if not content_html:
         return "Chưa nhập content."
     fn = predict_kc,
     inputs  = gr.Textbox(lines=5, placeholder="Dán HTML Content…"),
     outputs = gr.Label(label="KC dự đoán"),
+    title    = "KC Predictor",
+    description="Dự đoán nhãn KC dựa trên Naive Bayes."
 )
 if __name__ == "__main__":