File size: 2,484 Bytes
c78c2c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""
PhilVerify — Bag of Words + Logistic Regression Classifier (Layer 1)

CountVectorizer (BoW) with LogisticRegression. Identical to TFIDFClassifier except
for the vectorizer — this isolates the BoW vs TF-IDF comparison in eval.py.
Supports optional WordNet lemmatization.
"""
import logging

from ml.naive_bayes_classifier import _lemmatize_tokens
from ml.tfidf_classifier import Layer1Result

logger = logging.getLogger(__name__)


class BoWClassifier:
    """
    Bag-of-words Layer 1 classifier: CountVectorizer features fed into a
    LogisticRegression model. Mirrors TFIDFClassifier except for the
    vectorizer, isolating the BoW vs TF-IDF comparison in eval.py.

    Args:
        train_samples: list[Sample] from ml.dataset; defaults to the full
                       100-sample dataset when None.
        lemmatize:     run WordNet lemmatization on tokens before vectorizing.
    """

    _LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}

    def __init__(self, train_samples=None, lemmatize: bool = False):
        # sklearn is imported lazily so merely importing this module stays cheap.
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.linear_model import LogisticRegression

        self._lemmatize = lemmatize

        if train_samples is None:
            from ml.dataset import get_dataset
            train_samples = get_dataset()

        corpus = [self._preprocess(sample.text) for sample in train_samples]
        y = [sample.label for sample in train_samples]

        # Unigrams + bigrams with a capped vocabulary, same settings as the
        # TF-IDF variant so the only experimental difference is the weighting.
        self._vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=1000)
        features = self._vectorizer.fit_transform(corpus)

        self._clf = LogisticRegression(max_iter=500, C=1.0, random_state=42)
        self._clf.fit(features, y)
        logger.info(
            "BoWClassifier trained on %d samples (lemmatize=%s)",
            len(corpus), lemmatize,
        )

    def _preprocess(self, text: str) -> str:
        """Lowercase *text*; optionally lemmatize its whitespace-split tokens."""
        lowered = text.lower()
        if not self._lemmatize:
            return lowered
        return " ".join(_lemmatize_tokens(lowered.split()))

    def predict(self, text: str) -> Layer1Result:
        """Classify *text*, returning verdict, confidence %, and top BoW features."""
        vec = self._vectorizer.transform([self._preprocess(text)])
        label = int(self._clf.predict(vec)[0])
        probabilities = self._clf.predict_proba(vec)[0]
        confidence = round(float(max(probabilities)) * 100, 1)

        # Surface up to five highest-count n-grams actually present in the text.
        names = self._vectorizer.get_feature_names_out()
        counts = vec.toarray()[0]
        ranked = counts.argsort()[-1:-6:-1]
        triggered = [names[i] for i in ranked if counts[i] > 0]

        return Layer1Result(
            verdict=self._LABELS[label],
            confidence=confidence,
            triggered_features=triggered,
        )