Di12 committed on
Commit
c7dc535
·
verified ·
1 Parent(s): 8d15863

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -12
app.py CHANGED
@@ -1,4 +1,3 @@
1
- # app.py
2
  import json
3
  import re
4
  import unicodedata
@@ -6,7 +5,6 @@ from bs4 import BeautifulSoup
6
  import numpy as np
7
  import gradio as gr
8
 
9
- # —— 1. Preprocess (như trước) —— #
10
  def clean_html(raw_html: str) -> str:
11
  soup = BeautifulSoup(raw_html, "html.parser")
12
  for img in soup.find_all("img"): img.decompose()
@@ -27,13 +25,11 @@ def normalize_text(text: str) -> str:
27
  def preprocess(content_html: str) -> str:
28
  return normalize_text(clean_html(content_html))
29
 
30
- # —— 2. Load JSON & build transformer + NB classifier —— #
31
  with open("vectorizer.json", encoding="utf-8") as f:
32
  vect_data = json.load(f)
33
  vocab = vect_data["vocabulary"]
34
- # nếu có idf: idf = np.array(vect_data["idf"])
35
 
36
- # Chúng ta sẽ implement CountVectorizer-like transform:
37
  def transform_count(docs):
38
  """
39
  docs: list of preprocessed strings
@@ -47,10 +43,7 @@ def transform_count(docs):
47
  idx = vocab.get(token)
48
  if idx is not None:
49
  X[i, idx] += 1.0
50
- return X
51
-
52
- # Nếu bạn dùng TfidfVectorizer,
53
- # bạn sẽ tính tf-idf dựa trên vect_data["idf"] → bỏ qua trong ví dụ này.
54
 
55
  with open("nbc_model.json", encoding="utf-8") as f:
56
  clf_data = json.load(f)
@@ -69,7 +62,6 @@ def predict_nb_count(docs):
69
  idx = np.argmax(log_post, axis=1)
70
  return classes[idx]
71
 
72
- # —— 3. Gradio interface —— #
73
  def predict_kc(content_html: str):
74
  if not content_html:
75
  return "Chưa nhập content."
@@ -83,8 +75,8 @@ interface = gr.Interface(
83
  fn = predict_kc,
84
  inputs = gr.Textbox(lines=5, placeholder="Dán HTML Content…"),
85
  outputs = gr.Label(label="KC dự đoán"),
86
- title = "NBC KC Predictor (no-pickle)",
87
- description="Dự đoán nhãn KC dựa trên Naive Bayes đã export JSON."
88
  )
89
 
90
  if __name__ == "__main__":
 
 
1
  import json
2
  import re
3
  import unicodedata
 
5
  import numpy as np
6
  import gradio as gr
7
 
 
8
  def clean_html(raw_html: str) -> str:
9
  soup = BeautifulSoup(raw_html, "html.parser")
10
  for img in soup.find_all("img"): img.decompose()
 
25
  def preprocess(content_html: str) -> str:
26
  return normalize_text(clean_html(content_html))
27
 
 
28
  with open("vectorizer.json", encoding="utf-8") as f:
29
  vect_data = json.load(f)
30
  vocab = vect_data["vocabulary"]
 
31
 
32
+ # Implement CountVectorizer-like transform:
33
  def transform_count(docs):
34
  """
35
  docs: list of preprocessed strings
 
43
  idx = vocab.get(token)
44
  if idx is not None:
45
  X[i, idx] += 1.0
46
+ return
 
 
 
47
 
48
  with open("nbc_model.json", encoding="utf-8") as f:
49
  clf_data = json.load(f)
 
62
  idx = np.argmax(log_post, axis=1)
63
  return classes[idx]
64
 
 
65
  def predict_kc(content_html: str):
66
  if not content_html:
67
  return "Chưa nhập content."
 
75
  fn = predict_kc,
76
  inputs = gr.Textbox(lines=5, placeholder="Dán HTML Content…"),
77
  outputs = gr.Label(label="KC dự đoán"),
78
+ title = "KC Predictor",
79
+ description="Dự đoán nhãn KC dựa trên Naive Bayes."
80
  )
81
 
82
  if __name__ == "__main__":