Spaces:

trantuan1701
/

ml_exercise

Sleeping

App Files Files Community

trantuan1701 committed on Sep 24, 2025

Commit

58e2d3b

1 Parent(s): 9d9f0fa

damn

Browse files

Files changed (6) hide show

__pycache__/feature_extract.cpython-313.pyc +0 -0
__pycache__/inference_demo.cpython-313.pyc +0 -0
app.py +35 -26
demo_models.pkl +2 -2
inference_demo.py +48 -30
training_model.py +54 -80

__pycache__/feature_extract.cpython-313.pyc CHANGED Viewed

Binary files a/__pycache__/feature_extract.cpython-313.pyc and b/__pycache__/feature_extract.cpython-313.pyc differ

__pycache__/inference_demo.cpython-313.pyc CHANGED Viewed

Binary files a/__pycache__/inference_demo.cpython-313.pyc and b/__pycache__/inference_demo.cpython-313.pyc differ

app.py CHANGED Viewed

@@ -3,8 +3,10 @@ from llm_classification import get_answer
 from inference_demo import (
     predict_randomforest_2f, predict_xgboost_2f, predict_lightgbm_2f,
     predict_svm_2f, predict_decisiontree_2f, predict_naivebayes_2f,
     predict_randomforest_6f, predict_xgboost_6f, predict_lightgbm_6f,
     predict_svm_6f, predict_decisiontree_6f, predict_naivebayes_6f,
 )
 PREDICT_FUNCS = {
@@ -14,12 +16,15 @@ PREDICT_FUNCS = {
     ("SVM", "2-feature"): predict_svm_2f,
     ("Decision Tree", "2-feature"): predict_decisiontree_2f,
     ("Naive Bayes", "2-feature"): predict_naivebayes_2f,
     ("Random Forest", "6-feature"): predict_randomforest_6f,
     ("XGBoost", "6-feature"): predict_xgboost_6f,
     ("LightGBM", "6-feature"): predict_lightgbm_6f,
     ("SVM", "6-feature"): predict_svm_6f,
     ("Decision Tree", "6-feature"): predict_decisiontree_6f,
     ("Naive Bayes", "6-feature"): predict_naivebayes_6f,
 }
 CLASSIFIERS = [
@@ -30,6 +35,7 @@ CLASSIFIERS = [
     "📈 SVM",
     "🌲 Decision Tree",
     "📊 Naive Bayes",
     "🤝 Ensemble"
 ]
 FEATURE_VERSIONS = ["2-feature", "6-feature"]
@@ -63,56 +69,59 @@ def explain_features(version: str) -> str:
 def infer(clf: str, version: str, text: str):
     if not text.strip():
         return {"⚠️ Please enter a sentence": 1.0}, ""
     if clf == "🔮 Gemini":
         y = get_answer(text)
-        if y == 1:
-            label = {"Positive 😀": 1.0}
-        else:
-            label = {"Negative 😞": 1.0}
-        return label, ""
     if clf == "🤝 Ensemble":
-        model_names = ["Random Forest", "XGBoost", "LightGBM", "SVM", "Decision Tree", "Naive Bayes"]
-        votes_detail = []
-        votes = []
         for m in model_names:
             func = PREDICT_FUNCS.get((m, version))
             if func:
                 y = func(text)
                 votes.append(y)
                 votes_detail.append(f"- **{m}**: {'Positive 😀' if y == 1 else 'Negative 😞'}")
-        if len(votes) == 0:
             return {"No models available": 1.0}, ""
-        positive_votes = sum(votes)
-        negative_votes = len(votes) - positive_votes
-        total = len(votes)
-        positive_pct = 100 * positive_votes / total
-        negative_pct = 100 * negative_votes / total
-        if positive_votes > negative_votes:
             label = {"Positive 😀": 1.0}
             final = "### Final Ensemble Result: **Positive 😀**"
-        elif negative_votes > positive_votes:
             label = {"Negative 😞": 1.0}
             final = "### Final Ensemble Result: **Negative 😞**"
         else:
             label = {"Tie 🤔": 1.0}
             final = "### Final Ensemble Result: **Tie 🤔**"
-        detail_text = "\n".join(votes_detail)
         detail_md = (
             f"{final}\n\n"
-            f"**Votes:** {positive_votes} positive ({positive_pct:.1f}%) | "
-            f"{negative_votes} negative ({negative_pct:.1f}%) out of {total} models.\n\n"
-            f"**Individual model decisions:**\n{detail_text}"
         )
         return label, detail_md
-    func = PREDICT_FUNCS.get((clf.replace("🌳 ","").replace("⚡ ","").replace("💡 ","").replace("📈 ","").replace("🌲 ","").replace("📊 ",""), version))
     if func is None:
         return {"Model not found": 1.0}, ""
     y = func(text)
-    if y == 1:
-        label = {"Positive 😀": 1.0}
-    else:
-        label = {"Negative 😞": 1.0}
-    return label, ""
 with gr.Blocks(
     title="Sentiment Classifier Demo",

 from inference_demo import (
     predict_randomforest_2f, predict_xgboost_2f, predict_lightgbm_2f,
     predict_svm_2f, predict_decisiontree_2f, predict_naivebayes_2f,
+    predict_logisticregression_2f,
     predict_randomforest_6f, predict_xgboost_6f, predict_lightgbm_6f,
     predict_svm_6f, predict_decisiontree_6f, predict_naivebayes_6f,
+    predict_logisticregression_6f,
 )
 PREDICT_FUNCS = {
     ("SVM", "2-feature"): predict_svm_2f,
     ("Decision Tree", "2-feature"): predict_decisiontree_2f,
     ("Naive Bayes", "2-feature"): predict_naivebayes_2f,
+    ("Logistic Regression", "2-feature"): predict_logisticregression_2f,
     ("Random Forest", "6-feature"): predict_randomforest_6f,
     ("XGBoost", "6-feature"): predict_xgboost_6f,
     ("LightGBM", "6-feature"): predict_lightgbm_6f,
     ("SVM", "6-feature"): predict_svm_6f,
     ("Decision Tree", "6-feature"): predict_decisiontree_6f,
     ("Naive Bayes", "6-feature"): predict_naivebayes_6f,
+    ("Logistic Regression", "6-feature"): predict_logisticregression_6f,
 }
 CLASSIFIERS = [
     "📈 SVM",
     "🌲 Decision Tree",
     "📊 Naive Bayes",
+    "🧮 Logistic Regression",
     "🤝 Ensemble"
 ]
 FEATURE_VERSIONS = ["2-feature", "6-feature"]
 def infer(clf: str, version: str, text: str):
     if not text.strip():
         return {"⚠️ Please enter a sentence": 1.0}, ""
     if clf == "🔮 Gemini":
         y = get_answer(text)
+        return ({"Positive 😀": 1.0} if y == 1 else {"Negative 😞": 1.0}), ""
     if clf == "🤝 Ensemble":
+        model_names = ["Random Forest", "XGBoost", "LightGBM", "SVM", "Decision Tree", "Naive Bayes", "Logistic Regression"]
+        votes_detail, votes = [], []
         for m in model_names:
             func = PREDICT_FUNCS.get((m, version))
             if func:
                 y = func(text)
                 votes.append(y)
                 votes_detail.append(f"- **{m}**: {'Positive 😀' if y == 1 else 'Negative 😞'}")
+        if not votes:
             return {"No models available": 1.0}, ""
+        pos, total = sum(votes), len(votes)
+        neg = total - pos
+        pos_pct = 100 * pos / total
+        neg_pct = 100 * neg / total
+        if pos > neg:
             label = {"Positive 😀": 1.0}
             final = "### Final Ensemble Result: **Positive 😀**"
+        elif neg > pos:
             label = {"Negative 😞": 1.0}
             final = "### Final Ensemble Result: **Negative 😞**"
         else:
             label = {"Tie 🤔": 1.0}
             final = "### Final Ensemble Result: **Tie 🤔**"
         detail_md = (
             f"{final}\n\n"
+            f"**Votes:** {pos} positive ({pos_pct:.1f}%) | {neg} negative ({neg_pct:.1f}%) out of {total} models.\n\n"
+            f"**Individual model decisions:**\n" + "\n".join(votes_detail)
         )
         return label, detail_md
+    base_name = (
+        clf.replace("🌳 ","")
+           .replace("⚡ ","")
+           .replace("💡 ","")
+           .replace("📈 ","")
+           .replace("🌲 ","")
+           .replace("📊 ","")
+           .replace("🧮 ","")
+    )
+    func = PREDICT_FUNCS.get((base_name, version))
     if func is None:
         return {"Model not found": 1.0}, ""
     y = func(text)
+    return ({"Positive 😀": 1.0} if y == 1 else {"Negative 😞": 1.0}), ""
 with gr.Blocks(
     title="Sentiment Classifier Demo",

demo_models.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cf5cd9e9f927d6467888e9d249a99a086812f0c0a228a0b57407c2fe9eeb323d
-size 4826559

 version https://git-lfs.github.com/spec/v1
+oid sha256:bf799bce43df9e171189f86bee098ae0c9b4bb56be43a485f1146a319e78bc5a
+size 4827607

inference_demo.py CHANGED Viewed

@@ -1,46 +1,64 @@
 import pickle
 import numpy as np
-from feature_extract import extract_features_2, extract_features_6
-# ---- Load models + freqs ----
 with open("demo_models.pkl", "rb") as f:
     data = pickle.load(f)
 freqs = data["freqs"]
-models_2f = data["2f"]
-models_6f = data["6f"]
-# ---- Helper functions ----
 def _predict_2f(sentence: str, model_name: str) -> int:
-    """Trích 2-feature và predict 0/1."""
-    x = extract_features_2(sentence, freqs)
-    return int(models_2f[model_name].predict(x)[0])
 def _predict_6f(sentence: str, model_name: str) -> int:
-    """Trích 6-feature và predict 0/1."""
-    x = extract_features_6(sentence, freqs)
-    return int(models_6f[model_name].predict(x)[0])
 # 2-feature
-def predict_randomforest_2f(sentence): return _predict_2f(sentence, "Random Forest")
-def predict_xgboost_2f(sentence):      return _predict_2f(sentence, "XGBoost")
-def predict_lightgbm_2f(sentence):     return _predict_2f(sentence, "LightGBM")
-def predict_svm_2f(sentence):          return _predict_2f(sentence, "SVM")
-def predict_decisiontree_2f(sentence): return _predict_2f(sentence, "Decision Tree")
-def predict_naivebayes_2f(sentence):   return _predict_2f(sentence, "Naive Bayes")
 # 6-feature
-def predict_randomforest_6f(sentence): return _predict_6f(sentence, "Random Forest")
-def predict_xgboost_6f(sentence):      return _predict_6f(sentence, "XGBoost")
-def predict_lightgbm_6f(sentence):     return _predict_6f(sentence, "LightGBM")
-def predict_svm_6f(sentence):          return _predict_6f(sentence, "SVM")
-def predict_decisiontree_6f(sentence): return _predict_6f(sentence, "Decision Tree")
-def predict_naivebayes_6f(sentence):   return _predict_6f(sentence, "Naive Bayes")
-# ---- Test nhanh ----
 if __name__ == "__main__":
-    test_sentence = "I love this new phone!"
-    print("RandomForest 2f:", predict_randomforest_2f(test_sentence))
-    print("RandomForest 6f:", predict_randomforest_6f(test_sentence))
-    print("SVM 2f:", predict_svm_2f(test_sentence))
-    print("SVM 6f:", predict_svm_6f(test_sentence))

 import pickle
 import numpy as np
+from feature_extract import extract_features_2, extract_features_6
 with open("demo_models.pkl", "rb") as f:
     data = pickle.load(f)
 freqs = data["freqs"]
+models_2f = data.get("2f", {})
+models_6f = data.get("6f", {})
+def _predict_with_model(sentence: str, model) -> int:
+    """Classify ``sentence`` with ``model`` and return the predicted label as an int.
+
+    The feature extractor is chosen from the model's ``n_features_in_``
+    attribute: exactly 6 selects ``extract_features_6``; any other value,
+    or a missing attribute, selects ``extract_features_2``.
+    """
+    wants_six = getattr(model, "n_features_in_", None) == 6
+    extract = extract_features_6 if wants_six else extract_features_2
+    features = extract(sentence, freqs)
+    return int(model.predict(features)[0])
+def _smart_pick_model(model_name: str, prefer: str = "2f"):
+    """Look up ``model_name``, trying the preferred feature set first.
+
+    Args:
+        model_name: Key into the loaded model dictionaries.
+        prefer: ``"6f"`` searches ``models_6f`` before ``models_2f``; any
+            other value searches ``models_2f`` first.
+
+    Returns:
+        The first stored model found under ``model_name``.
+
+    Raises:
+        KeyError: If neither dictionary has an entry for ``model_name``.
+    """
+    search_order = (models_6f, models_2f) if prefer == "6f" else (models_2f, models_6f)
+    for registry in search_order:
+        # Compare against None instead of relying on truthiness: an object
+        # that defines __len__ or __bool__ may evaluate as false (or raise)
+        # even though it is present in the dictionary.
+        model = registry.get(model_name)
+        if model is not None:
+            return model
+    raise KeyError(f"Model '{model_name}' not found in saved models.")
+# --- 2-feature API (handled automatically if the stored model is actually a 6-feature model) ---
 def _predict_2f(sentence: str, model_name: str) -> int:
+    model = _smart_pick_model(model_name, prefer="2f")
+    return _predict_with_model(sentence, model)
+# --- 6-feature API (handled automatically if the stored model is actually a 2-feature model) ---
 def _predict_6f(sentence: str, model_name: str) -> int:
+    model = _smart_pick_model(model_name, prefer="6f")
+    return _predict_with_model(sentence, model)
 # 2-feature
+def predict_randomforest_2f(sentence):
+    """Predict with the "Random Forest" model through the 2-feature API."""
+    return _predict_2f(sentence, "Random Forest")
+def predict_xgboost_2f(sentence):
+    """Predict with the "XGBoost" model through the 2-feature API."""
+    return _predict_2f(sentence, "XGBoost")
+def predict_lightgbm_2f(sentence):
+    """Predict with the "LightGBM" model through the 2-feature API."""
+    return _predict_2f(sentence, "LightGBM")
+def predict_svm_2f(sentence):
+    """Predict with the "SVM" model through the 2-feature API."""
+    return _predict_2f(sentence, "SVM")
+def predict_decisiontree_2f(sentence):
+    """Predict with the "Decision Tree" model through the 2-feature API."""
+    return _predict_2f(sentence, "Decision Tree")
+def predict_naivebayes_2f(sentence):
+    """Predict with the "Naive Bayes" model through the 2-feature API."""
+    return _predict_2f(sentence, "Naive Bayes")
+def predict_logisticregression_2f(sentence):
+    """Predict with the "Logistic Regression" model through the 2-feature API."""
+    return _predict_2f(sentence, "Logistic Regression")
 # 6-feature
+def predict_randomforest_6f(sentence):
+    """Predict with the "Random Forest" model through the 6-feature API."""
+    return _predict_6f(sentence, "Random Forest")
+def predict_xgboost_6f(sentence):
+    """Predict with the "XGBoost" model through the 6-feature API."""
+    return _predict_6f(sentence, "XGBoost")
+def predict_lightgbm_6f(sentence):
+    """Predict with the "LightGBM" model through the 6-feature API."""
+    return _predict_6f(sentence, "LightGBM")
+def predict_svm_6f(sentence):
+    """Predict with the "SVM" model through the 6-feature API."""
+    return _predict_6f(sentence, "SVM")
+def predict_decisiontree_6f(sentence):
+    """Predict with the "Decision Tree" model through the 6-feature API."""
+    return _predict_6f(sentence, "Decision Tree")
+def predict_naivebayes_6f(sentence):
+    """Predict with the "Naive Bayes" model through the 6-feature API."""
+    return _predict_6f(sentence, "Naive Bayes")
+def predict_logisticregression_6f(sentence):
+    """Predict with the "Logistic Regression" model through the 6-feature API."""
+    return _predict_6f(sentence, "Logistic Regression")
 if __name__ == "__main__":
+    s = "I love this new phone!"
+    print("RF 2f:", predict_randomforest_2f(s))
+    print("RF 6f:", predict_randomforest_6f(s))
+    print("SVM 2f:", predict_svm_2f(s))
+    print("SVM 6f:", predict_svm_6f(s))
+    print("LogReg 2f:", predict_logisticregression_2f(s))
+    print("LogReg 6f:", predict_logisticregression_6f(s))

training_model.py CHANGED Viewed

@@ -1,25 +1,21 @@
 # file: train_demo_models.py
 from __future__ import annotations
 import pickle
 import numpy as np
 from typing import Dict, Tuple, List
 import nltk
 from nltk.corpus import twitter_samples, stopwords
 from sklearn.ensemble import RandomForestClassifier
 from xgboost import XGBClassifier
 from lightgbm import LGBMClassifier
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.naive_bayes import GaussianNB
 from sklearn.metrics import accuracy_score, log_loss
-from feature_extract import build_freqs, extract_features_2, extract_features_6
-# -------------------- NLTK setup --------------------
 def _ensure_nltk():
     try:
         twitter_samples.fileids()
@@ -30,7 +26,6 @@ def _ensure_nltk():
     except LookupError:
         nltk.download("stopwords", quiet=True)
-# -------------------- Data prep --------------------
 def load_twitter_data() -> Tuple[List[str], np.ndarray]:
     pos = twitter_samples.strings("positive_tweets.json")
     neg = twitter_samples.strings("negative_tweets.json")
@@ -38,97 +33,76 @@ def load_twitter_data() -> Tuple[List[str], np.ndarray]:
     y = np.array([1] * len(pos) + [0] * len(neg))
     return tweets, y
-def vectorize(tweets: List[str],
-              freqs: Dict[Tuple[str, float], float],
-              mode: str = "2f") -> np.ndarray:
-    """mode: '2f' -> extract_features_2, '6f' -> extract_features_6"""
     feat_fn = extract_features_2 if mode == "2f" else extract_features_6
     rows = [feat_fn(t, freqs) for t in tweets]
     return np.vstack(rows) if rows else np.zeros((0, 2 if mode == "2f" else 6))
-# -------------------- Models --------------------
-def make_models() -> Dict[str, object]:
-    return {
-        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
-        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
-        "LightGBM": LGBMClassifier(random_state=42),
-        "SVM": SVC(kernel="linear", probability=True, random_state=42),
-        "Decision Tree": DecisionTreeClassifier(random_state=42),
-        "Naive Bayes": GaussianNB(),
-    }
-# -------------------- Train --------------------
-def train_models(X: np.ndarray, y: np.ndarray) -> Dict[str, object]:
-    models = make_models()
-    trained = {}
-    print("Đang train các mô hình:")
-    for name, clf in models.items():
-        clf.fit(X, y.ravel())
-        trained[name] = clf
-        # --- ghi log sau train ---
-        y_pred = clf.predict(X)
-        acc = accuracy_score(y, y_pred)
-        # log_loss cần probability
-        try:
-            y_proba = clf.predict_proba(X)
-            loss = log_loss(y, y_proba)
-        except Exception:
-            loss = None
-        if loss is not None:
-            print(f"[{name}] Accuracy: {acc:.4f} | LogLoss: {loss:.4f}")
-        else:
-            print(f"[{name}] Accuracy: {acc:.4f} | (không có predict_proba để tính log_loss)")
-    print("=" * 60)
     return trained
-def train_all_versions(save_path: str = "demo_models.pkl"):
-    """
-    Train và lưu mô hình + freqs ra file pickle.
-    Trả về:
-      {
-        'freqs': freqs,
-        '2f': {model_name: trained_model, ...},
-        '6f': {model_name: trained_model, ...}
-      }
-    """
     _ensure_nltk()
     tweets, y = load_twitter_data()
-    freqs = build_freqs(tweets, y.reshape(-1, 1))
-    # trích features
     X2 = vectorize(tweets, freqs, mode="2f")
     X6 = vectorize(tweets, freqs, mode="6f")
-    print("\n===== Train với 2-feature =====")
-    models_2f = train_models(X2, y)
-    print("\n===== Train với 6-feature =====")
-    models_6f = train_models(X6, y)
-    data_to_save = {
-        "freqs": freqs,
-        "2f": models_2f,
-        "6f": models_6f,
-    }
-    # lưu file pickle
     with open(save_path, "wb") as f:
         pickle.dump(data_to_save, f)
-    print(f"\nĐã train và lưu mô hình + freqs vào file: {save_path}")
     return data_to_save
-# -------------------- Load --------------------
 def load_demo_models(save_path: str = "demo_models.pkl"):
-    """Load lại mô hình + freqs từ file pickle."""
     with open(save_path, "rb") as f:
         data = pickle.load(f)
     return data
-# -------------------- CLI --------------------
 if __name__ == "__main__":
-    models = train_all_versions()  # train & save
     print("Các mô hình 2f:", list(models["2f"].keys()))
     print("Các mô hình 6f:", list(models["6f"].keys()))

 # file: train_demo_models.py
 from __future__ import annotations
+import os
 import pickle
 import numpy as np
 from typing import Dict, Tuple, List
 import nltk
 from nltk.corpus import twitter_samples, stopwords
 from sklearn.ensemble import RandomForestClassifier
 from xgboost import XGBClassifier
 from lightgbm import LGBMClassifier
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.naive_bayes import GaussianNB
+from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import accuracy_score, log_loss
+from feature_extract import build_freqs, extract_features_2, extract_features_6
 def _ensure_nltk():
     try:
         twitter_samples.fileids()
     except LookupError:
         nltk.download("stopwords", quiet=True)
 def load_twitter_data() -> Tuple[List[str], np.ndarray]:
     pos = twitter_samples.strings("positive_tweets.json")
     neg = twitter_samples.strings("negative_tweets.json")
     y = np.array([1] * len(pos) + [0] * len(neg))
     return tweets, y
+def vectorize(tweets: List[str], freqs: Dict[Tuple[str, float], float], mode: str = "2f") -> np.ndarray:
     feat_fn = extract_features_2 if mode == "2f" else extract_features_6
     rows = [feat_fn(t, freqs) for t in tweets]
     return np.vstack(rows) if rows else np.zeros((0, 2 if mode == "2f" else 6))
+# Unfitted estimator templates keyed by display name. make_models() hands out
+# clones, so the templates themselves are never fitted.
+ALL_MODEL_SPECS: Dict[str, object] = {
+    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
+    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
+    "LightGBM": LGBMClassifier(random_state=42),
+    "SVM": SVC(kernel="linear", probability=True, random_state=42),
+    "Decision Tree": DecisionTreeClassifier(random_state=42),
+    "Naive Bayes": GaussianNB(),
+    "Logistic Regression": LogisticRegression(solver="liblinear", random_state=42),
+}
+def make_models(include: List[str] | None = None) -> Dict[str, object]:
+    """Return fresh, unfitted estimators built from ``ALL_MODEL_SPECS``.
+
+    Each call returns independent clones. Handing out the shared template
+    instances would make a model trained on the 2-feature matrix and the
+    "same" model trained on the 6-feature matrix one single object, with
+    the second fit silently overwriting the first.
+
+    Args:
+        include: Names to build, in order. ``None`` builds every entry of
+            ``ALL_MODEL_SPECS``.
+
+    Returns:
+        Mapping of model name to a new estimator with the template's
+        parameters.
+
+    Raises:
+        KeyError: If ``include`` names a model missing from
+            ``ALL_MODEL_SPECS``.
+    """
+    # Function-scope import keeps this fix self-contained within the block.
+    from sklearn.base import clone
+    names = list(ALL_MODEL_SPECS) if include is None else include
+    return {name: clone(ALL_MODEL_SPECS[name]) for name in names}
+def _fit_and_log(name: str, clf, X: np.ndarray, y: np.ndarray):
+    """Fit ``clf`` on ``(X, y)``, print its training-set metrics, and return it.
+
+    Accuracy is always printed. Log-loss is printed as well when
+    ``predict_proba`` and ``log_loss`` succeed; any exception raised while
+    computing it is swallowed and a "(no predict_proba)" note is printed
+    instead.
+    """
+    clf.fit(X, y.ravel())
+    acc = accuracy_score(y, clf.predict(X))
+    try:
+        loss = log_loss(y, clf.predict_proba(X))
+        report = f"[{name}] Accuracy: {acc:.4f} | LogLoss: {loss:.4f}"
+    except Exception:
+        # Best-effort metric: training continues whatever went wrong here.
+        report = f"[{name}] Accuracy: {acc:.4f} | (no predict_proba)"
+    print(report)
+    return clf
+def train_models(X: np.ndarray, y: np.ndarray, include: List[str] | None = None) -> Dict[str, object]:
+    specs = make_models(include)
+    trained: Dict[str, object] = {}
+    for name, clf in specs.items():
+        trained[name] = _fit_and_log(name, clf, X, y)
     return trained
+def ensure_logreg_only(save_path: str = "demo_models.pkl"):
     _ensure_nltk()
     tweets, y = load_twitter_data()
+    if os.path.exists(save_path):
+        with open(save_path, "rb") as f:
+            data = pickle.load(f)
+        freqs = data.get("freqs")
+        models_2f: Dict[str, object] = data.get("2f", {})
+        models_6f: Dict[str, object] = data.get("6f", {})
+    else:
+        freqs = build_freqs(tweets, y.reshape(-1, 1))
+        models_2f, models_6f = {}, {}
     X2 = vectorize(tweets, freqs, mode="2f")
     X6 = vectorize(tweets, freqs, mode="6f")
+    if "Logistic Regression" not in models_2f:
+        new_models_2f = train_models(X2, y, include=["Logistic Regression"])
+        models_2f.update(new_models_2f)
+    if "Logistic Regression" not in models_6f:
+        new_models_6f = train_models(X6, y, include=["Logistic Regression"])
+        models_6f.update(new_models_6f)
+    data_to_save = {"freqs": freqs, "2f": models_2f, "6f": models_6f}
     with open(save_path, "wb") as f:
         pickle.dump(data_to_save, f)
     return data_to_save
 def load_demo_models(save_path: str = "demo_models.pkl"):
     with open(save_path, "rb") as f:
         data = pickle.load(f)
     return data
 if __name__ == "__main__":
+    models = ensure_logreg_only()
     print("Các mô hình 2f:", list(models["2f"].keys()))
     print("Các mô hình 6f:", list(models["6f"].keys()))