Di12 committed on
Commit
2041649
·
1 Parent(s): bddd7a2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -37
app.py CHANGED
@@ -1,20 +1,19 @@
 
 
1
  import re
2
  import unicodedata
3
  from bs4 import BeautifulSoup
4
- import joblib
5
  import gradio as gr
6
 
 
7
  def clean_html(raw_html: str) -> str:
8
- """Loại bỏ <img>, <math>, giữ text thuần."""
9
  soup = BeautifulSoup(raw_html, "html.parser")
10
- for img in soup.find_all("img"):
11
- img.decompose()
12
- for math_tag in soup.find_all("math"):
13
- math_tag.decompose()
14
  return soup.get_text(separator=" ", strip=True)
15
 
16
  def normalize_text(text: str) -> str:
17
- """Lowercase, giữ unicode letters & digits, thay ký tự khác thành space."""
18
  text = text.lower()
19
  chars = []
20
  for ch in text:
@@ -23,42 +22,70 @@ def normalize_text(text: str) -> str:
23
  chars.append(ch)
24
  else:
25
  chars.append(" ")
26
- text = "".join(chars)
27
- # xóa khoảng trắng thừa
28
- return re.sub(r"\s+", " ", text).strip()
29
 
30
- def preprocess(content_html: str) -> str
31
- text = clean_html(content_html)
32
- text = normalize_text(text)
33
- return text
34
 
35
- vect = joblib.load("vectorizer.joblib")
36
- clf = joblib.load("nbc_model.joblib")
 
 
 
37
 
38
- def predict_kc(content_html: str) -> str:
39
- if not content_html or not isinstance(content_html, str):
40
- return "Không có input hợp lệ."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  text = preprocess(content_html)
42
  if not text:
43
- return "Nội dung không chứa tự để dự đoán."
44
- Xv = vect.transform([text])
45
- pred = clf.predict(Xv)[0]
46
- return pred
47
 
48
- demo = gr.Interface(
49
- fn=predict_kc,
50
- inputs=gr.Textbox(
51
- lines=6,
52
- placeholder="Dán HTML Content (có thể kèm <p>, <img>, <math>) vào đây"
53
- ),
54
- outputs=gr.Label(num_top_classes=1, label="Mã KC dự đoán"),
55
- title="Naive Bayes KC Predictor",
56
- description="""
57
- Nhập nội dung câu hỏi (HTML) và nhấn Submit để nhận về
58
- mã kiến thức (KC) do mô hình Naive Bayes dự đoán.
59
- """,
60
- allow_flagging="never",
61
  )
62
 
63
  if __name__ == "__main__":
64
- demo.launch()
 
1
+ # app.py
2
+ import json
3
  import re
4
  import unicodedata
5
  from bs4 import BeautifulSoup
6
+ import numpy as np
7
  import gradio as gr
8
 
9
+ # —— 1. Preprocess (như trước) —— #
10
  def clean_html(raw_html: str) -> str:
 
11
  soup = BeautifulSoup(raw_html, "html.parser")
12
+ for img in soup.find_all("img"): img.decompose()
13
+ for math in soup.find_all("math"): math.decompose()
 
 
14
  return soup.get_text(separator=" ", strip=True)
15
 
16
  def normalize_text(text: str) -> str:
 
17
  text = text.lower()
18
  chars = []
19
  for ch in text:
 
22
  chars.append(ch)
23
  else:
24
  chars.append(" ")
25
+ return re.sub(r"\s+", " ", "".join(chars)).strip()
 
 
26
 
27
def preprocess(content_html: str) -> str:
    """Turn raw HTML content into the normalized text used for prediction.

    Runs ``clean_html`` on the input, then ``normalize_text`` on its result.
    """
    plain_text = clean_html(content_html)
    return normalize_text(plain_text)
 
 
29
 
30
+ # —— 2. Load JSON & build transformer + NB classifier —— #
31
# Read the exported vectorizer description from JSON when the module is imported.
with open("vectorizer.json", encoding="utf-8") as f:
    vect_data = json.load(f)
# Token -> column lookup: transform_count calls vocab.get(token) and uses the
# result as a column index of the count matrix.
vocab = vect_data["vocabulary"]
# If idf weights are available: idf = np.array(vect_data["idf"])
35
 
36
+ # Chúng ta sẽ implement CountVectorizer-like transform:
37
def transform_count(docs, vocabulary=None):
    """Build a dense bag-of-words count matrix for ``docs``.

    Args:
        docs: list of preprocessed strings; each one is split on whitespace.
        vocabulary: optional mapping ``token -> column index``. When omitted,
            the module-level ``vocab`` loaded from ``vectorizer.json`` is used.

    Returns:
        2D float32 numpy array of shape (n_docs, n_features), where
        n_features is the number of vocabulary entries and cell (i, j) counts
        how often the token mapped to column j occurs in ``docs[i]``.

    Raises:
        ValueError: if ``docs`` is a single string instead of a list of
            strings.
    """
    if isinstance(docs, str):
        # Iterating a bare string would silently yield one row per character.
        raise ValueError(
            "docs must be a list of strings, not a single string"
        )
    if vocabulary is None:
        vocabulary = vocab

    n_docs = len(docs)
    n_feats = len(vocabulary)
    X = np.zeros((n_docs, n_feats), dtype=np.float32)
    for i, doc in enumerate(docs):
        for token in doc.split():
            idx = vocabulary.get(token)
            # Tokens that are not in the vocabulary are ignored.
            if idx is not None:
                X[i, idx] += 1.0
    return X
51
+
52
+ # Nếu bạn dùng TfidfVectorizer,
53
+ # bạn sẽ tính tf-idf dựa trên vect_data["idf"] → bỏ qua trong ví dụ này.
54
+
55
# Read the exported Naive Bayes parameters from JSON when the module is imported.
with open("nbc_model.json", encoding="utf-8") as f:
    clf_data = json.load(f)
# Label lookup: predict_nb_count indexes this array with the argmax of the
# per-class scores.
classes = np.array(clf_data["classes"])
# Added to every row of the score matrix in predict_nb_count.
class_log_prior = np.array(clf_data["class_log_prior"])
# Transposed and multiplied with the count matrix in predict_nb_count;
# presumably shaped (n_classes, n_features) - confirm against the export script.
feature_log_prob = np.array(clf_data["feature_log_prob"])
60
+
61
def predict_nb_count(docs):
    """Predict one label per document with the count-based Naive Bayes model.

    docs: list of preprocessed strings
    return: numpy array with the selected entry of ``classes`` per document
    """
    counts = transform_count(docs)  # shape (n_docs, n_feats)
    # Per-class score = class log prior + counts weighted by the
    # per-feature log probabilities.
    likelihood = np.dot(counts, feature_log_prob.T)
    scores = class_log_prior + likelihood
    best = scores.argmax(axis=1)
    return classes[best]
71
+
72
+ # —— 3. Gradio interface —— #
73
def predict_kc(content_html: str):
    """Gradio callback: predict the KC label for one HTML snippet.

    Args:
        content_html: raw HTML content taken from the input textbox.

    Returns:
        The predicted label as a plain ``str``, or a Vietnamese message when
        the input is empty or contains no text after preprocessing.
    """
    if not content_html:
        return "Chưa nhập content."
    text = preprocess(content_html)
    if not text:
        return "Nội dung rỗng sau khi xử lý."
    label = predict_nb_count([text])[0]
    # The prediction comes out of a numpy array, so it is a numpy scalar;
    # hand the output component a built-in str instead.
    return str(label)
 
81
 
82
# Single-textbox UI: the pasted HTML is passed to predict_kc and the value it
# returns is shown in a Label component.
interface = gr.Interface(
    fn = predict_kc,
    inputs = gr.Textbox(lines=5, placeholder="Dán HTML Content…"),
    outputs = gr.Label(label="KC dự đoán"),
    title = "NBC KC Predictor (no-pickle)",
    description="Dự đoán nhãn KC dựa trên Naive Bayes đã export JSON."
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()