Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
# app.py
|
| 2 |
import json
|
| 3 |
import re
|
| 4 |
import unicodedata
|
|
@@ -6,7 +5,6 @@ from bs4 import BeautifulSoup
|
|
| 6 |
import numpy as np
|
| 7 |
import gradio as gr
|
| 8 |
|
| 9 |
-
# —— 1. Preprocess (như trước) —— #
|
| 10 |
def clean_html(raw_html: str) -> str:
|
| 11 |
soup = BeautifulSoup(raw_html, "html.parser")
|
| 12 |
for img in soup.find_all("img"): img.decompose()
|
|
@@ -27,13 +25,11 @@ def normalize_text(text: str) -> str:
|
|
| 27 |
def preprocess(content_html: str) -> str:
|
| 28 |
return normalize_text(clean_html(content_html))
|
| 29 |
|
| 30 |
-
# —— 2. Load JSON & build transformer + NB classifier —— #
|
| 31 |
with open("vectorizer.json", encoding="utf-8") as f:
|
| 32 |
vect_data = json.load(f)
|
| 33 |
vocab = vect_data["vocabulary"]
|
| 34 |
-
# nếu có idf: idf = np.array(vect_data["idf"])
|
| 35 |
|
| 36 |
-
#
|
| 37 |
def transform_count(docs):
|
| 38 |
"""
|
| 39 |
docs: list of preprocessed strings
|
|
@@ -47,10 +43,7 @@ def transform_count(docs):
|
|
| 47 |
idx = vocab.get(token)
|
| 48 |
if idx is not None:
|
| 49 |
X[i, idx] += 1.0
|
| 50 |
-
return
|
| 51 |
-
|
| 52 |
-
# Nếu bạn dùng TfidfVectorizer,
|
| 53 |
-
# bạn sẽ tính tf-idf dựa trên vect_data["idf"] → bỏ qua trong ví dụ này.
|
| 54 |
|
| 55 |
with open("nbc_model.json", encoding="utf-8") as f:
|
| 56 |
clf_data = json.load(f)
|
|
@@ -69,7 +62,6 @@ def predict_nb_count(docs):
|
|
| 69 |
idx = np.argmax(log_post, axis=1)
|
| 70 |
return classes[idx]
|
| 71 |
|
| 72 |
-
# —— 3. Gradio interface —— #
|
| 73 |
def predict_kc(content_html: str):
|
| 74 |
if not content_html:
|
| 75 |
return "Chưa nhập content."
|
|
@@ -83,8 +75,8 @@ interface = gr.Interface(
|
|
| 83 |
fn = predict_kc,
|
| 84 |
inputs = gr.Textbox(lines=5, placeholder="Dán HTML Content…"),
|
| 85 |
outputs = gr.Label(label="KC dự đoán"),
|
| 86 |
-
title = "
|
| 87 |
-
description="Dự đoán nhãn KC dựa trên Naive Bayes
|
| 88 |
)
|
| 89 |
|
| 90 |
if __name__ == "__main__":
|
|
|
|
|
|
|
| 1 |
import json
|
| 2 |
import re
|
| 3 |
import unicodedata
|
|
|
|
| 5 |
import numpy as np
|
| 6 |
import gradio as gr
|
| 7 |
|
|
|
|
| 8 |
def clean_html(raw_html: str) -> str:
|
| 9 |
soup = BeautifulSoup(raw_html, "html.parser")
|
| 10 |
for img in soup.find_all("img"): img.decompose()
|
|
|
|
| 25 |
def preprocess(content_html: str) -> str:
|
| 26 |
return normalize_text(clean_html(content_html))
|
| 27 |
|
|
|
|
| 28 |
with open("vectorizer.json", encoding="utf-8") as f:
|
| 29 |
vect_data = json.load(f)
|
| 30 |
vocab = vect_data["vocabulary"]
|
|
|
|
| 31 |
|
| 32 |
+
# Implement CountVectorizer-like transform:
|
| 33 |
def transform_count(docs):
|
| 34 |
"""
|
| 35 |
docs: list of preprocessed strings
|
|
|
|
| 43 |
idx = vocab.get(token)
|
| 44 |
if idx is not None:
|
| 45 |
X[i, idx] += 1.0
|
| 46 |
+
return
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
with open("nbc_model.json", encoding="utf-8") as f:
|
| 49 |
clf_data = json.load(f)
|
|
|
|
| 62 |
idx = np.argmax(log_post, axis=1)
|
| 63 |
return classes[idx]
|
| 64 |
|
|
|
|
| 65 |
def predict_kc(content_html: str):
|
| 66 |
if not content_html:
|
| 67 |
return "Chưa nhập content."
|
|
|
|
| 75 |
fn = predict_kc,
|
| 76 |
inputs = gr.Textbox(lines=5, placeholder="Dán HTML Content…"),
|
| 77 |
outputs = gr.Label(label="KC dự đoán"),
|
| 78 |
+
title = "KC Predictor",
|
| 79 |
+
description="Dự đoán nhãn KC dựa trên Naive Bayes."
|
| 80 |
)
|
| 81 |
|
| 82 |
if __name__ == "__main__":
|