File size: 2,868 Bytes
2041649
ca66ee2
 
 
2041649
ca66ee2
 
 
 
2041649
 
ca66ee2
 
 
 
 
 
 
 
 
 
 
2041649
ca66ee2
2041649
 
ca66ee2
2041649
 
 
ca66ee2
c7dc535
2041649
 
 
 
 
 
 
 
 
 
 
 
 
0fc0fc6
2041649
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96b3639
ca66ee2
 
2041649
 
 
ca66ee2
c1c6e2b
 
 
 
7ae4dbc
c1c6e2b
 
 
 
 
 
 
 
 
 
 
 
 
ca66ee2
c1c6e2b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import json
import re
import unicodedata
from bs4 import BeautifulSoup
import numpy as np
import gradio as gr

def clean_html(raw_html: str) -> str:
    """Strip an HTML fragment down to its visible text.

    ``<img>`` and ``<math>`` elements are removed entirely before text
    extraction so that image residue and formula markup never reach the
    classifier.
    """
    soup = BeautifulSoup(raw_html, "html.parser")
    for tag in soup.find_all(["img", "math"]):
        tag.decompose()
    return soup.get_text(separator=" ", strip=True)

def normalize_text(text: str) -> str:
    """Lowercase *text*, keep only letters, digits and whitespace, and
    collapse every run of whitespace to a single space.

    Any character that is not a Unicode letter (category ``L*``), a digit,
    or whitespace is replaced by a space before the collapse.
    """
    lowered = text.lower()
    kept = "".join(
        ch
        if unicodedata.category(ch).startswith("L") or ch.isdigit() or ch.isspace()
        else " "
        for ch in lowered
    )
    return re.sub(r"\s+", " ", kept).strip()

def preprocess(content_html: str) -> str:
    """Full preprocessing pipeline: strip HTML, then normalize the text."""
    cleaned = clean_html(content_html)
    return normalize_text(cleaned)

# Load the exported vectorizer state at import time.
# "vocabulary" maps token -> column index in the count matrix
# (presumably exported from a fitted sklearn CountVectorizer — confirm).
with open("vectorizer.json", encoding="utf-8") as f:
    vect_data = json.load(f)
vocab = vect_data["vocabulary"]

# Implement CountVectorizer-like transform:
def transform_count(docs):
    """Build a document-term count matrix against the loaded vocabulary.

    Args:
        docs: list of preprocessed strings (whitespace-tokenizable).

    Returns:
        numpy float32 array of shape (len(docs), len(vocab)); entry
        [i, j] counts occurrences of vocabulary token j in document i.
        Tokens missing from the vocabulary are ignored.
    """
    matrix = np.zeros((len(docs), len(vocab)), dtype=np.float32)
    lookup = vocab.get
    for row, text in enumerate(docs):
        for word in text.split():
            col = lookup(word)
            if col is not None:
                matrix[row, col] += 1.0
    return matrix

# Load the exported Naive Bayes parameters at import time.
# Field names match sklearn's MultinomialNB attributes — presumably
# exported from a fitted model; confirm against the export script.
with open("nbc_model.json", encoding="utf-8") as f:
    clf_data = json.load(f)
classes = np.array(clf_data["classes"])  # label for each class index
class_log_prior   = np.array(clf_data["class_log_prior"])  # log P(class)
feature_log_prob  = np.array(clf_data["feature_log_prob"])  # log P(token | class)

def predict_nb_count(docs):
    """Predict a label for each document with the exported Naive Bayes model.

    Args:
        docs: list of preprocessed strings.

    Returns:
        numpy array of predicted class labels, one per document.
    """
    counts = transform_count(docs)  # shape (n_docs, n_feats)
    # Joint log-posterior per class: log prior + sum of per-token log probs.
    joint = counts.dot(feature_log_prob.T) + class_log_prior
    best = np.argmax(joint, axis=1)
    return classes[best]

def predict_kc(content_html: str):
    """Gradio handler: predict the KC label for an HTML question body.

    Returns a Vietnamese message when the input is empty (or empty after
    preprocessing); otherwise the predicted label.
    """
    if not content_html:
        return "Chưa nhập nội dung câu hỏi."
    normalized = preprocess(content_html)
    if not normalized:
        return "Nội dung rỗng sau khi xử lý."
    return predict_nb_count([normalized])[0]

# Enlarge input/output text for readability in the demo UI.
css = """
textarea { font-size: 18px !important; }
.gradio-container .output-text { font-size: 18px !important; }
"""

# Single-textbox Gradio demo: paste raw question HTML, get the predicted KC.
# NOTE(review): allow_flagging is deprecated in newer Gradio releases
# (replaced by flagging_mode) — confirm against the installed version.
interface = gr.Interface(
    fn = predict_kc,
    inputs  = gr.Textbox(lines=6,
        placeholder="Dán nội dung câu hỏi dạng HTML",
        label="Nội dung câu hỏi ",
        elem_id="input-box"),
    outputs = gr.Textbox(label="KC dự đoán",
        elem_id="output-box"),
    title    = "Demo dự đoán KC",
    description="Dự đoán nhãn KC từ câu hỏi dựa trên Naive Bayes.",
    css=css, 
    allow_flagging="never"
)

if __name__ == "__main__":
    interface.launch()