Spaces:

trantuan1701
/

ml_exercise

Sleeping

App Files Files Community

trantuan1701 committed on Sep 24, 2025

Commit

9d454eb

1 Parent(s): ba48a36

bla

Browse files

Files changed (14) hide show

__pycache__/feature_extract.cpython-313.pyc +0 -0
__pycache__/inference_demo.cpython-313.pyc +0 -0
__pycache__/llm.cpython-310.pyc +0 -0
__pycache__/llm.cpython-312.pyc +0 -0
__pycache__/llm.cpython-313.pyc +0 -0
__pycache__/llm_classification.cpython-313.pyc +0 -0
app.py +130 -13
demo_models.pkl +3 -0
exercise8.ipynb +319 -0
feature_extract.py +132 -0
inference_demo.py +46 -0
preds.csv +2001 -0
requirements.txt +5 -0
training_model.py +134 -0

__pycache__/feature_extract.cpython-313.pyc ADDED Viewed

Binary file (5.83 kB). View file

__pycache__/inference_demo.cpython-313.pyc ADDED Viewed

Binary file (3.38 kB). View file

__pycache__/llm.cpython-310.pyc ADDED Viewed

Binary file (535 Bytes). View file

__pycache__/llm.cpython-312.pyc ADDED Viewed

Binary file (661 Bytes). View file

__pycache__/llm.cpython-313.pyc ADDED Viewed

Binary file (651 Bytes). View file

__pycache__/llm_classification.cpython-313.pyc ADDED Viewed

Binary file (1.33 kB). View file

app.py CHANGED Viewed

@@ -1,21 +1,138 @@
 import gradio as gr
-from llm_classification import get_answer
-CLASSIFIERS = ["gemini"]
-def infer(clf: str, text: str) -> str:
     if not text.strip():
-        return "negative"
-    y = get_answer(text)
-    return "positive" if y == 1 else "negative"
-with gr.Blocks(title="Sentiment Classifier") as demo:
-    gr.Markdown("## Sentiment Classifier")
-    clf = gr.Dropdown(choices=CLASSIFIERS, value="gemini", label="Classifier")
-    txt = gr.Textbox(label="Input sentence", placeholder="Type a sentence…")
     btn = gr.Button("Classify")
-    out = gr.Label(label="Result")
-    btn.click(infer, inputs=[clf, txt], outputs=out)
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
+from llm_classification import get_answer
+from inference_demo import (
+    predict_randomforest_2f, predict_xgboost_2f, predict_lightgbm_2f,
+    predict_svm_2f, predict_decisiontree_2f, predict_naivebayes_2f,
+    predict_randomforest_6f, predict_xgboost_6f, predict_lightgbm_6f,
+    predict_svm_6f, predict_decisiontree_6f, predict_naivebayes_6f,
+)
+# Dispatch table: (model name, feature version) -> prediction function
+# imported from inference_demo.
+PREDICT_FUNCS = {
+    ("Random Forest", "2-feature"): predict_randomforest_2f,
+    ("XGBoost", "2-feature"): predict_xgboost_2f,
+    ("LightGBM", "2-feature"): predict_lightgbm_2f,
+    ("SVM", "2-feature"): predict_svm_2f,
+    ("Decision Tree", "2-feature"): predict_decisiontree_2f,
+    ("Naive Bayes", "2-feature"): predict_naivebayes_2f,
+    ("Random Forest", "6-feature"): predict_randomforest_6f,
+    ("XGBoost", "6-feature"): predict_xgboost_6f,
+    ("LightGBM", "6-feature"): predict_lightgbm_6f,
+    ("SVM", "6-feature"): predict_svm_6f,
+    ("Decision Tree", "6-feature"): predict_decisiontree_6f,
+    ("Naive Bayes", "6-feature"): predict_naivebayes_6f,
+}
+# Entries offered in the classifier dropdown. Apart from Gemini and Ensemble,
+# the text after the emoji prefix is a model name used in PREDICT_FUNCS keys.
+CLASSIFIERS = [
+    "🔮 Gemini",
+    "🌳 Random Forest",
+    "⚡ XGBoost",
+    "💡 LightGBM",
+    "📈 SVM",
+    "🌲 Decision Tree",
+    "📊 Naive Bayes",
+    "🤝 Ensemble"
+]
+# Feature-set names; these are the second element of each PREDICT_FUNCS key.
+FEATURE_VERSIONS = ["2-feature", "6-feature"]
+FEATURE_EXPLANATIONS = {
+    "2-feature": (
+        "### Supported Language\n"
+        "Only **English** sentences are supported.\n\n"
+        "### 2-feature version\n"
+        "This version uses only 2 frequency-based features:\n"
+        "  * x1 = Total frequency of words in the Positive class\n"
+        "  * x2 = Total frequency of words in the Negative class"
+    ),
+    "6-feature": (
+        "### Supported Language\n"
+        "Only **English** sentences are supported.\n\n"
+        "### 6-feature version\n"
+        "This version uses 6 features:\n"
+        "  * x1 = Total frequency of words in the Positive class\n"
+        "  * x2 = Total frequency of words in the Negative class\n"
+        "  * x3 = 1 if the word 'no' appears, else 0\n"
+        "  * x4 = Count of 1st and 2nd person pronouns\n"
+        "  * x5 = 1 if the tweet contains '!' else 0\n"
+        "  * x6 = log(word count)"
+    ),
+}
+def explain_features(version: str) -> str:
+    return FEATURE_EXPLANATIONS[version]
+def infer(clf: str, version: str, text: str):
     if not text.strip():
+        return {"⚠️ Please enter a sentence": 1.0}, ""
+    if clf == "🔮 Gemini":
+        y = get_answer(text)
+        if y == 1:
+            label = {"Positive 😀": 1.0}
+        else:
+            label = {"Negative 😞": 1.0}
+        return label, ""
+    if clf == "🤝 Ensemble":
+        model_names = ["Random Forest", "XGBoost", "LightGBM", "SVM", "Decision Tree", "Naive Bayes"]
+        votes_detail = []
+        votes = []
+        for m in model_names:
+            func = PREDICT_FUNCS.get((m, version))
+            if func:
+                y = func(text)
+                votes.append(y)
+                votes_detail.append(f"- **{m}**: {'Positive 😀' if y == 1 else 'Negative 😞'}")
+        if len(votes) == 0:
+            return {"No models available": 1.0}, ""
+        positive_votes = sum(votes)
+        negative_votes = len(votes) - positive_votes
+        total = len(votes)
+        positive_pct = 100 * positive_votes / total
+        negative_pct = 100 * negative_votes / total
+        if positive_votes > negative_votes:
+            label = {"Positive 😀": 1.0}
+            final = "### Final Ensemble Result: **Positive 😀**"
+        elif negative_votes > positive_votes:
+            label = {"Negative 😞": 1.0}
+            final = "### Final Ensemble Result: **Negative 😞**"
+        else:
+            label = {"Tie 🤔": 1.0}
+            final = "### Final Ensemble Result: **Tie 🤔**"
+        detail_text = "\n".join(votes_detail)
+        detail_md = (
+            f"{final}\n\n"
+            f"**Votes:** {positive_votes} positive ({positive_pct:.1f}%) | "
+            f"{negative_votes} negative ({negative_pct:.1f}%) out of {total} models.\n\n"
+            f"**Individual model decisions:**\n{detail_text}"
+        )
+        return label, detail_md
+    func = PREDICT_FUNCS.get((clf.replace("🌳 ","").replace("⚡ ","").replace("💡 ","").replace("📈 ","").replace("🌲 ","").replace("📊 ",""), version))
+    if func is None:
+        return {"Model not found": 1.0}, ""
+    y = func(text)
+    if y == 1:
+        label = {"Positive 😀": 1.0}
+    else:
+        label = {"Negative 😞": 1.0}
+    return label, ""
+# Build the Gradio UI: classifier/version selectors, text input and outputs.
+with gr.Blocks(
+    title="Sentiment Classifier Demo",
+    css=".big-markdown {font-size: 1.2rem; min-height: 300px; overflow:auto;}"
+) as demo:
+    gr.Markdown("## Sentiment Classifier Demo")
+    with gr.Row():
+        clf = gr.Dropdown(choices=CLASSIFIERS, value="🔮 Gemini", label="Classifier (or Ensemble)")
+        version = gr.Dropdown(choices=FEATURE_VERSIONS, value="2-feature", label="Feature Version (not used for gemini)")
+    txt = gr.Textbox(label="Input sentence (English only)", placeholder="Type a sentence…")
     btn = gr.Button("Classify")
+    out_label = gr.Label(label="Main Result")
+    out_detail = gr.Markdown(elem_classes="big-markdown")
+    explanation_box = gr.Markdown(FEATURE_EXPLANATIONS["2-feature"])
+    # Refresh the explanation text whenever the feature version changes.
+    version.change(fn=explain_features, inputs=version, outputs=explanation_box)
+    # On click, infer returns (label dict, detail markdown) for the two outputs.
+    btn.click(fn=infer, inputs=[clf, version, txt], outputs=[out_label, out_detail])
+    gr.Markdown(
+        "**Note:** This demo supports **English** sentences only. "
+        "Choose '🤝 Ensemble' to see the combined decision from all classifiers, "
+        "or choose '🔮 Gemini' to use the Gemini LLM-based classifier."
+    )
 if __name__ == "__main__":
     demo.launch()

demo_models.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cf5cd9e9f927d6467888e9d249a99a086812f0c0a228a0b57407c2fe9eeb323d
+size 4826559

exercise8.ipynb ADDED Viewed

	@@ -0,0 +1,319 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "0f914398",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "!pip install nltk\n",
+    "!pip install numpy\n",
+    "!pip install pandas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "d473cee2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import nltk #Natural Language Toolkit\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from nltk.corpus import twitter_samples\n",
+    "from langchain.prompts import PromptTemplate\n",
+    "from langchain_core.messages import SystemMessage, HumanMessage\n",
+    "\n",
+    "from llm import llm\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "2f9d43cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_positive_tweets = twitter_samples.strings('positive_tweets.json')\n",
+    "all_negative_tweets = twitter_samples.strings('negative_tweets.json')\n",
+    "\n",
+    "test_pos = all_positive_tweets[4000:]\n",
+    "test_neg = all_negative_tweets[4000:]\n",
+    "\n",
+    "test_x = test_pos + test_neg\n",
+    "\n",
+    "# Create the numpy array of positive labels and negative labels.\n",
+    "test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "ed135bd0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "import numpy as np  # make sure numpy is imported\n",
+    "\n",
+    "# --- PROMPTS ---\n",
+    "system_prompt = (\n",
+    "    \"You are a strict sentiment classifier.\\n\"\n",
+    "    \"Given a batch of up to 20 sentences, output EXACTLY one line per input, \"\n",
+    "    \"in the same order. Each line must be a single character: 1 for positive, 0 for negative. \"\n",
+    "    \"NO extra text, NO numbering, NO spaces, NO blank lines.\"\n",
+    ")\n",
+    "\n",
+    "user_prompt = PromptTemplate(\n",
+    "    input_variables=[\"items\"],\n",
+    "    template=(\n",
+    "        \"Classify the sentiment of EACH sentence listed between <INPUT> and </INPUT>.\\n\"\n",
+    "        \"Rules:\\n\"\n",
+    "        \"- Output exactly ONE line per sentence, in the SAME ORDER.\\n\"\n",
+    "        \"- Each line must be EXACTLY '1' (positive) or '0' (negative).\\n\"\n",
+    "        \"- Do NOT print anything else. Do NOT repeat the inputs.\\n\\n\"\n",
+    "        \"<INPUT>\\n{items}\\n</INPUT>\"\n",
+    "    ),\n",
+    ")\n",
+    "\n",
+    "def _format_items(sentences):\n",
+    "    return \"\\n\".join(f\"<s>{s}</s>\" for s in sentences)\n",
+    "\n",
+    "# --- PARSER (robust) ---\n",
+    "def _parse_binary_lines(text: str, expected_n: int) -> np.ndarray:\n",
+    "    \"\"\"\n",
+    "    Accepts:\n",
+    "      - expected_n lines, each being '0' or '1'\n",
+    "      - a single line of exactly expected_n '0'/'1' characters\n",
+    "      - Fallback: collect every '0'/'1' character in the text if there are exactly expected_n of them\n",
+    "    \"\"\"\n",
+    "    s = (text or \"\").strip()\n",
+    "    if not s:\n",
+    "        raise ValueError(\"Empty model output\")\n",
+    "\n",
+    "    lines = [ln.strip() for ln in s.splitlines() if ln.strip() != \"\"]\n",
+    "\n",
+    "    # Case A: exactly expected_n lines, each one 0/1\n",
+    "    if len(lines) == expected_n and all(re.fullmatch(r\"[01]\", ln) for ln in lines):\n",
+    "        return np.array([int(ln) for ln in lines], dtype=np.int8)\n",
+    "\n",
+    "    # Case B: a single line made of exactly expected_n 0/1 characters\n",
+    "    if len(lines) == 1 and re.fullmatch(r\"[01]+\", lines[0]) and len(lines[0]) == expected_n:\n",
+    "        return np.array([int(ch) for ch in lines[0]], dtype=np.int8)\n",
+    "\n",
+    "    # Case C: fallback - take every 0/1 character in the whole text\n",
+    "    bits = re.findall(r\"[01]\", s)\n",
+    "    if len(bits) == expected_n:\n",
+    "        return np.array([int(b) for b in bits], dtype=np.int8)\n",
+    "\n",
+    "    # Failure: raise an error with a short preview\n",
+    "    preview = s[:200].replace(\"\\n\", \"\\\\n\")\n",
+    "    raise ValueError(f\"Expected {expected_n} labels, got {len(lines)} lines / {len(bits)} bits. Raw='{preview}...'\")\n",
+    "\n",
+    "# --- INFERENCE ---\n",
+    "def classify_20(llm, sentences, existing: np.ndarray | None = None) -> np.ndarray:\n",
+    "    n = len(sentences)\n",
+    "    if n == 0 or n > 20:\n",
+    "        raise ValueError(f\"Batch size must be 1..20, got {n}\")\n",
+    "\n",
+    "    messages = [\n",
+    "        SystemMessage(content=system_prompt),\n",
+    "        HumanMessage(content=user_prompt.format(items=_format_items(sentences))),\n",
+    "    ]\n",
+    "\n",
+    "    resp = llm.invoke(messages)\n",
+    "\n",
+    "    # Do NOT use str(resp): metadata can easily leak into the text.\n",
+    "    raw_text = getattr(resp, \"content\", None)\n",
+    "    if raw_text is None or not str(raw_text).strip():\n",
+    "        # Hint: you can log resp to debug when the model is blocked (block_reason, safety, etc.)\n",
+    "        raise RuntimeError(f\"LLM returned empty content. Full response repr: {repr(resp)}\")\n",
+    "\n",
+    "    raw_text = raw_text.strip()\n",
+    "    preds = _parse_binary_lines(raw_text, expected_n=n)\n",
+    "    return preds if existing is None else np.concatenate([existing, preds])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "c06e66ff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def classify_20(llm, sentences, existing: np.ndarray | None = None) -> np.ndarray:\n",
+    "    n = len(sentences)\n",
+    "    if n == 0 or n > 20:\n",
+    "        raise ValueError(f\"Batch size must be 1..20, got {n}\")\n",
+    "\n",
+    "    messages = [\n",
+    "        SystemMessage(content=system_prompt),\n",
+    "        HumanMessage(content=user_prompt.format(items=_format_items(sentences))),\n",
+    "    ]\n",
+    "\n",
+    "    resp = llm.invoke(messages)\n",
+    "    raw_text = getattr(resp, \"content\", None)\n",
+    "\n",
+    "    if raw_text is None or not str(raw_text).strip():\n",
+    "        # If the LLM returns nothing → fill with all 0s\n",
+    "        print(f\"[warn] LLM output empty for batch size {n}, filling 0s\")\n",
+    "        preds = np.zeros(n, dtype=np.int8)\n",
+    "    else:\n",
+    "        raw_text = raw_text.strip()\n",
+    "        try:\n",
+    "            preds = _parse_binary_lines(raw_text, expected_n=n)\n",
+    "        except Exception as e:\n",
+    "            # If parsing fails → fill with all 0s\n",
+    "            print(f\"[warn] Parse fail for batch size {n}, filling 0s: {e}\")\n",
+    "            preds = np.zeros(n, dtype=np.int8)\n",
+    "\n",
+    "    return preds if existing is None else np.concatenate([existing, preds])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "495cb1f2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[init] total=2000 done=1500 remain=500\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Gemini produced an empty response. Continuing with empty message\n",
+      "Feedback: block_reason: PROHIBITED_CONTENT\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[warn] LLM output empty for batch size 20, filling 0s\n",
+      "[ok] 1500:1520 +20\n",
+      "[ok] 1520:1540 +20\n",
+      "[ok] 1540:1560 +20\n",
+      "[ok] 1560:1580 +20\n",
+      "[ok] 1580:1600 +20\n",
+      "[ok] 1600:1620 +20\n",
+      "[ok] 1620:1640 +20\n",
+      "[ok] 1640:1660 +20\n",
+      "[ok] 1660:1680 +20\n",
+      "[ok] 1680:1700 +20\n",
+      "[ok] 1700:1720 +20\n",
+      "[ok] 1720:1740 +20\n",
+      "[ok] 1740:1760 +20\n",
+      "[ok] 1760:1780 +20\n",
+      "[ok] 1780:1800 +20\n",
+      "[ok] 1800:1820 +20\n",
+      "[ok] 1820:1840 +20\n",
+      "[ok] 1840:1860 +20\n",
+      "[ok] 1860:1880 +20\n",
+      "[ok] 1880:1900 +20\n",
+      "[ok] 1900:1920 +20\n",
+      "[ok] 1920:1940 +20\n",
+      "[ok] 1940:1960 +20\n",
+      "[ok] 1960:1980 +20\n",
+      "[ok] 1980:2000 +20\n",
+      "[final] collected=2000/2000\n",
+      "Accuracy: 0.9470\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os, csv, time\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "\n",
+    "BATCH_SIZE = 20\n",
+    "SLEEP_SECS = 20\n",
+    "PRED_CSV = \"preds.csv\"\n",
+    "\n",
+    "y_true = test_y.ravel().astype(int)\n",
+    "TOTAL = len(test_x)\n",
+    "\n",
+    "# resume\n",
+    "start_idx = 0\n",
+    "if os.path.exists(PRED_CSV):\n",
+    "    with open(PRED_CSV, \"r\", newline=\"\", encoding=\"utf-8\") as f:\n",
+    "        r = csv.reader(f); rows = list(r)\n",
+    "        if rows and rows[0] and rows[0][0] == \"idx\": rows = rows[1:]\n",
+    "        start_idx = len(rows)\n",
+    "else:\n",
+    "    with open(PRED_CSV, \"w\", newline=\"\", encoding=\"utf-8\") as f:\n",
+    "        csv.writer(f).writerow([\"idx\", \"pred\"])\n",
+    "\n",
+    "print(f\"[init] total={TOTAL} done={start_idx} remain={TOTAL-start_idx}\")\n",
+    "\n",
+    "for i in range(start_idx, TOTAL, BATCH_SIZE):\n",
+    "    batch = test_x[i : i + BATCH_SIZE]\n",
+    "    try:\n",
+    "        preds = classify_20(llm, batch)\n",
+    "    except Exception as e:\n",
+    "        print(f\"[err] {i}:{i+len(batch)} {type(e).__name__}: {e}\")\n",
+    "        break\n",
+    "    with open(PRED_CSV, \"a\", newline=\"\", encoding=\"utf-8\") as f:\n",
+    "        w = csv.writer(f)\n",
+    "        for off, p in enumerate(preds):\n",
+    "            w.writerow([i + off, int(p)])\n",
+    "    print(f\"[ok] {i}:{i+len(batch)} +{len(preds)}\")\n",
+    "    if i + BATCH_SIZE < TOTAL:\n",
+    "        time.sleep(SLEEP_SECS)\n",
+    "\n",
+    "# eval if complete\n",
+    "idxs, vals = [], []\n",
+    "with open(PRED_CSV, \"r\", newline=\"\", encoding=\"utf-8\") as f:\n",
+    "    r = csv.reader(f); next(r, None)\n",
+    "    for row in r:\n",
+    "        idxs.append(int(row[0])); vals.append(int(row[1]))\n",
+    "order = np.argsort(np.array(idxs))\n",
+    "y_pred = np.array(vals, dtype=int)[order]\n",
+    "\n",
+    "print(f\"[final] collected={len(y_pred)}/{TOTAL}\")\n",
+    "if len(y_pred) == TOTAL:\n",
+    "    print(f\"Accuracy: {accuracy_score(y_true, y_pred):.4f}\")\n",
+    "else:\n",
+    "    print(f\"[note] missing={TOTAL-len(y_pred)}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "435f575c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

feature_extract.py ADDED Viewed

	@@ -0,0 +1,132 @@

+# file: feature_extract.py
+import re
+import string
+import numpy as np
+from nltk.stem import PorterStemmer
+from nltk.tokenize import TweetTokenizer
+from nltk.corpus import stopwords
+# --- constants & tools ---
+# Lowercase pronoun vocabulary counted by extract_features_6; it holds
+# first-, second- and third-person forms.
+pronouns = {
+    "i","me","my","mine","myself",
+    "we","us","our","ours","ourselves",
+    "you","your","yours","yourself","yourselves",
+    "he","him","his","himself",
+    "she","her","hers","herself",
+    "it","its","itself",
+    "they","them","their","theirs","themselves",
+}
+# Shared tokenizer; per the NLTK options used here it lowercases text,
+# strips @handles and shortens long repeated-character runs.
+_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
+_stemmer = PorterStemmer()
+# English stopwords as a set; read once, when this module is imported.
+_stopwords_en = set(stopwords.words("english"))
+def process_tweet(tweet):
+    """Làm sạch + tokenize + remove stopwords/punctuation + stem. Trả về list token."""
+    tweet = re.sub(r"\$\w*", "", tweet)                # bỏ tickers $GE
+    tweet = re.sub(r"^RT[\s]+", "", tweet)             # bỏ 'RT'
+    tweet = re.sub(r"https?://[^\s\n\r]+", "", tweet)  # bỏ URL
+    tweet = re.sub(r"#", "", tweet)                    # bỏ dấu '#', giữ từ
+    tokens = _tokenizer.tokenize(tweet)
+    clean = []
+    for w in tokens:
+        if (w not in _stopwords_en) and (w not in string.punctuation):
+            clean.append(_stemmer.stem(w))
+    return clean
+def extract_features_2(tweet, freqs):
+    """
+    x[0,0]: tổng tần suất từ (đã process) ở lớp 1.0
+    x[0,1]: tổng tần suất từ (đã process) ở lớp 0.0
+    """
+    words = process_tweet(tweet)
+    x = np.zeros((1, 2))
+    for w in words:
+        x[0, 0] += freqs.get((w, 1.0), 0)
+        x[0, 1] += freqs.get((w, 0.0), 0)
+    return x
+def extract_features_6(tweet, freqs):
+    """
+    x1: tổng freq từ theo lớp 1.0 (tokenizer raw-lower)
+    x2: tổng freq từ theo lớp 0.0
+    x3: 1 nếu có "no" trong tokens else 0
+    x4: đếm đại từ ngôi 1 & 2 (pronouns)
+    x5: 1 nếu có '!' trong raw tweet else 0
+    x6: log(số lượng token) (0 nếu rỗng)
+    """
+    words = _tokenizer.tokenize(tweet)
+    x = np.zeros((1, 6))
+    for w in words:
+        x[0, 0] += freqs.get((w, 1.0), 0)
+        x[0, 1] += freqs.get((w, 0.0), 0)
+    x[0, 2] = 1 if "no" in words else 0
+    x[0, 3] = sum(1 for w in words if w in pronouns)
+    x[0, 4] = 1 if "!" in tweet else 0
+    x[0, 5] = np.log(len(words)) if len(words) > 0 else 0
+    return x
+def build_freqs(tweets, ys):
+    """
+    Xây dựng tần suất (word, sentiment)
+    Input:
+        tweets: list các tweet
+        ys: m×1 array (numpy) với nhãn sentiment mỗi tweet (0 hoặc 1)
+    Output:
+        freqs: dict {(word, y): count}
+    """
+    yslist = np.squeeze(ys).tolist()
+    freqs = {}
+    for y, tweet in zip(yslist, tweets):
+        for word in process_tweet(tweet):
+            pair = (word, y)
+            freqs[pair] = freqs.get(pair, 0) + 1
+    return freqs
+# Quick module check: load twitter_samples, build the frequency table and
+# print both feature vectors for one sample tweet.
+if __name__ == "__main__":
+    """
+    Đoạn kiểm tra nhanh module:
+    - tải dữ liệu twitter_samples
+    - build freqs
+    - trích 2 loại feature cho 1 tweet mẫu
+    """
+    import nltk
+    from nltk.corpus import twitter_samples
+    # download if missing
+    try:
+        twitter_samples.fileids()
+    except LookupError:
+        nltk.download("twitter_samples")
+    # NOTE(review): _stopwords_en is built at import time above, so a missing
+    # stopwords corpus would presumably fail before this guard runs — confirm.
+    try:
+        stopwords.words("english")
+    except LookupError:
+        nltk.download("stopwords")
+    # load the positive/negative tweets
+    pos = twitter_samples.strings("positive_tweets.json")
+    neg = twitter_samples.strings("negative_tweets.json")
+    tweets = pos + neg
+    y = np.array([1] * len(pos) + [0] * len(neg)).reshape(-1, 1)
+    print(f"Tổng số tweet: {len(tweets)}")
+    # build freqs
+    freqs = build_freqs(tweets, y)
+    print(f"Số cặp (word, sentiment): {len(freqs)}")
+    # inspect one sample tweet
+    sample_tweet = tweets[0]
+    print("\nTweet mẫu:", sample_tweet)
+    print("Tokens (process_tweet):", process_tweet(sample_tweet))
+    x2 = extract_features_2(sample_tweet, freqs)
+    x6 = extract_features_6(sample_tweet, freqs)
+    print("\nFeatures 2 chiều:", x2)
+    print("Features 6 chiều:", x6)

inference_demo.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import pickle
+import numpy as np
+from feature_extract import extract_features_2, extract_features_6
+# ---- Load models + freqs ----
+# NOTE(security): pickle.load can execute arbitrary code embedded in the
+# file; only ship/load a demo_models.pkl that comes from a trusted source.
+# The relative path is resolved against the current working directory.
+with open("demo_models.pkl", "rb") as f:
+    data = pickle.load(f)
+# Frequency table passed to the feature extractors below.
+freqs = data["freqs"]
+# Fitted models indexed by model name, one mapping per feature version.
+models_2f = data["2f"]
+models_6f = data["6f"]
+# ---- Helper functions ----
+def _predict_2f(sentence: str, model_name: str) -> int:
+    """Trích 2-feature và predict 0/1."""
+    x = extract_features_2(sentence, freqs)
+    return int(models_2f[model_name].predict(x)[0])
+def _predict_6f(sentence: str, model_name: str) -> int:
+    """Trích 6-feature và predict 0/1."""
+    x = extract_features_6(sentence, freqs)
+    return int(models_6f[model_name].predict(x)[0])
+# 2-feature
+# Thin per-model wrappers: each one fixes the model name given to _predict_2f.
+def predict_randomforest_2f(sentence): return _predict_2f(sentence, "Random Forest")
+def predict_xgboost_2f(sentence):      return _predict_2f(sentence, "XGBoost")
+def predict_lightgbm_2f(sentence):     return _predict_2f(sentence, "LightGBM")
+def predict_svm_2f(sentence):          return _predict_2f(sentence, "SVM")
+def predict_decisiontree_2f(sentence): return _predict_2f(sentence, "Decision Tree")
+def predict_naivebayes_2f(sentence):   return _predict_2f(sentence, "Naive Bayes")
+# 6-feature
+# Same wrappers for the 6-feature models, delegating to _predict_6f.
+def predict_randomforest_6f(sentence): return _predict_6f(sentence, "Random Forest")
+def predict_xgboost_6f(sentence):      return _predict_6f(sentence, "XGBoost")
+def predict_lightgbm_6f(sentence):     return _predict_6f(sentence, "LightGBM")
+def predict_svm_6f(sentence):          return _predict_6f(sentence, "SVM")
+def predict_decisiontree_6f(sentence): return _predict_6f(sentence, "Decision Tree")
+def predict_naivebayes_6f(sentence):   return _predict_6f(sentence, "Naive Bayes")
+# ---- Quick test ----
+# Prints the Random Forest and SVM predictions for one sample sentence,
+# once per feature version.
+if __name__ == "__main__":
+    test_sentence = "I love this new phone!"
+    print("RandomForest 2f:", predict_randomforest_2f(test_sentence))
+    print("RandomForest 6f:", predict_randomforest_6f(test_sentence))
+    print("SVM 2f:", predict_svm_2f(test_sentence))
+    print("SVM 6f:", predict_svm_6f(test_sentence))

preds.csv ADDED Viewed

	@@ -0,0 +1,2001 @@

+idx,pred
+0,0
+1,1
+2,1
+3,1
+4,1
+5,1
+6,1
+7,1
+8,1
+9,1
+10,1
+11,0
+12,1
+13,1
+14,0
+15,1
+16,1
+17,1
+18,1
+19,1
+20,1
+21,0
+22,1
+23,1
+24,1
+25,1
+26,1
+27,1
+28,1
+29,1
+30,1
+31,0
+32,1
+33,1
+34,1
+35,1
+36,1
+37,0
+38,1
+39,1
+40,1
+41,1
+42,1
+43,1
+44,1
+45,1
+46,1
+47,1
+48,1
+49,1
+50,1
+51,1
+52,1
+53,1
+54,0
+55,0
+56,1
+57,1
+58,1
+59,1
+60,1
+61,1
+62,1
+63,1
+64,1
+65,1
+66,1
+67,1
+68,1
+69,1
+70,0
+71,1
+72,1
+73,1
+74,1
+75,0
+76,1
+77,1
+78,1
+79,1
+80,1
+81,1
+82,1
+83,1
+84,0
+85,0
+86,1
+87,1
+88,1
+89,1
+90,1
+91,1
+92,1
+93,1
+94,1
+95,1
+96,1
+97,1
+98,1
+99,1
+100,1
+101,1
+102,1
+103,1
+104,1
+105,1
+106,1
+107,1
+108,1
+109,1
+110,1
+111,1
+112,1
+113,1
+114,1
+115,1
+116,1
+117,1
+118,1
+119,1
+120,1
+121,1
+122,1
+123,1
+124,1
+125,1
+126,1
+127,1
+128,1
+129,1
+130,1
+131,1
+132,1
+133,1
+134,1
+135,1
+136,1
+137,1
+138,1
+139,1
+140,1
+141,1
+142,1
+143,1
+144,1
+145,1
+146,1
+147,1
+148,1
+149,1
+150,0
+151,1
+152,1
+153,0
+154,1
+155,1
+156,1
+157,1
+158,1
+159,0
+160,1
+161,1
+162,1
+163,1
+164,1
+165,1
+166,1
+167,0
+168,1
+169,1
+170,1
+171,1
+172,1
+173,1
+174,1
+175,1
+176,0
+177,1
+178,1
+179,1
+180,1
+181,1
+182,1
+183,1
+184,1
+185,1
+186,1
+187,1
+188,1
+189,1
+190,1
+191,1
+192,1
+193,1
+194,1
+195,1
+196,1
+197,1
+198,1
+199,1
+200,1
+201,1
+202,1
+203,0
+204,0
+205,1
+206,1
+207,1
+208,1
+209,1
+210,1
+211,1
+212,1
+213,0
+214,1
+215,1
+216,1
+217,1
+218,1
+219,1
+220,1
+221,1
+222,0
+223,1
+224,1
+225,1
+226,1
+227,1
+228,1
+229,0
+230,0
+231,1
+232,1
+233,1
+234,1
+235,1
+236,1
+237,1
+238,1
+239,1
+240,1
+241,1
+242,1
+243,1
+244,1
+245,1
+246,1
+247,1
+248,1
+249,0
+250,1
+251,1
+252,1
+253,1
+254,1
+255,1
+256,1
+257,1
+258,1
+259,1
+260,1
+261,1
+262,1
+263,1
+264,1
+265,1
+266,1
+267,1
+268,0
+269,1
+270,1
+271,1
+272,1
+273,1
+274,1
+275,1
+276,1
+277,1
+278,1
+279,1
+280,1
+281,1
+282,1
+283,1
+284,1
+285,1
+286,1
+287,1
+288,1
+289,1
+290,1
+291,1
+292,1
+293,1
+294,1
+295,1
+296,0
+297,1
+298,1
+299,1
+300,1
+301,1
+302,1
+303,1
+304,1
+305,1
+306,1
+307,1
+308,1
+309,1
+310,1
+311,1
+312,1
+313,1
+314,1
+315,1
+316,1
+317,1
+318,1
+319,1
+320,1
+321,1
+322,1
+323,1
+324,1
+325,1
+326,1
+327,1
+328,1
+329,0
+330,1
+331,1
+332,1
+333,1
+334,1
+335,1
+336,1
+337,1
+338,1
+339,1
+340,1
+341,1
+342,1
+343,1
+344,0
+345,1
+346,1
+347,1
+348,1
+349,1
+350,1
+351,1
+352,1
+353,1
+354,1
+355,1
+356,1
+357,1
+358,0
+359,0
+360,1
+361,1
+362,1
+363,1
+364,0
+365,1
+366,1
+367,0
+368,1
+369,1
+370,1
+371,1
+372,1
+373,1
+374,1
+375,1
+376,1
+377,1
+378,1
+379,1
+380,1
+381,1
+382,1
+383,1
+384,1
+385,1
+386,1
+387,1
+388,1
+389,0
+390,1
+391,1
+392,1
+393,1
+394,0
+395,1
+396,1
+397,0
+398,1
+399,1
+400,1
+401,1
+402,1
+403,1
+404,1
+405,1
+406,1
+407,0
+408,1
+409,1
+410,1
+411,1
+412,1
+413,1
+414,1
+415,1
+416,1
+417,1
+418,0
+419,0
+420,1
+421,0
+422,1
+423,1
+424,1
+425,0
+426,1
+427,1
+428,1
+429,0
+430,1
+431,1
+432,1
+433,1
+434,1
+435,1
+436,1
+437,1
+438,0
+439,1
+440,1
+441,1
+442,1
+443,1
+444,0
+445,0
+446,1
+447,1
+448,1
+449,1
+450,1
+451,1
+452,1
+453,1
+454,1
+455,1
+456,0
+457,1
+458,1
+459,1
+460,1
+461,0
+462,1
+463,1
+464,1
+465,0
+466,1
+467,1
+468,1
+469,1
+470,1
+471,1
+472,1
+473,1
+474,0
+475,1
+476,0
+477,1
+478,1
+479,1
+480,1
+481,1
+482,1
+483,1
+484,1
+485,1
+486,1
+487,0
+488,1
+489,1
+490,1
+491,1
+492,1
+493,1
+494,1
+495,1
+496,1
+497,1
+498,1
+499,1
+500,1
+501,1
+502,1
+503,0
+504,1
+505,1
+506,1
+507,1
+508,1
+509,1
+510,1
+511,0
+512,1
+513,1
+514,1
+515,1
+516,1
+517,1
+518,1
+519,0
+520,1
+521,1
+522,1
+523,1
+524,1
+525,1
+526,1
+527,1
+528,1
+529,1
+530,1
+531,1
+532,0
+533,1
+534,1
+535,1
+536,1
+537,1
+538,1
+539,1
+540,1
+541,1
+542,1
+543,1
+544,1
+545,1
+546,1
+547,0
+548,1
+549,1
+550,1
+551,1
+552,1
+553,0
+554,1
+555,0
+556,1
+557,1
+558,0
+559,0
+560,1
+561,1
+562,1
+563,1
+564,1
+565,1
+566,1
+567,1
+568,1
+569,1
+570,0
+571,1
+572,1
+573,1
+574,1
+575,1
+576,1
+577,0
+578,1
+579,1
+580,1
+581,1
+582,1
+583,1
+584,1
+585,1
+586,1
+587,0
+588,1
+589,1
+590,1
+591,1
+592,1
+593,1
+594,1
+595,1
+596,1
+597,1
+598,1
+599,1
+600,1
+601,1
+602,1
+603,1
+604,1
+605,1
+606,1
+607,1
+608,1
+609,1
+610,1
+611,1
+612,1
+613,1
+614,1
+615,1
+616,1
+617,1
+618,1
+619,1
+620,1
+621,1
+622,1
+623,1
+624,1
+625,1
+626,1
+627,1
+628,1
+629,0
+630,1
+631,1
+632,1
+633,1
+634,1
+635,1
+636,1
+637,1
+638,1
+639,1
+640,1
+641,1
+642,1
+643,1
+644,1
+645,1
+646,1
+647,1
+648,1
+649,1
+650,1
+651,1
+652,1
+653,1
+654,1
+655,1
+656,1
+657,1
+658,1
+659,1
+660,1
+661,1
+662,1
+663,1
+664,1
+665,1
+666,1
+667,1
+668,1
+669,1
+670,1
+671,1
+672,1
+673,1
+674,1
+675,0
+676,1
+677,1
+678,1
+679,1
+680,1
+681,1
+682,1
+683,1
+684,1
+685,1
+686,1
+687,1
+688,1
+689,1
+690,1
+691,1
+692,1
+693,1
+694,1
+695,1
+696,1
+697,0
+698,1
+699,1
+700,1
+701,1
+702,1
+703,1
+704,1
+705,1
+706,1
+707,1
+708,1
+709,1
+710,1
+711,1
+712,1
+713,0
+714,1
+715,1
+716,1
+717,1
+718,1
+719,1
+720,1
+721,1
+722,1
+723,1
+724,1
+725,1
+726,1
+727,1
+728,1
+729,1
+730,1
+731,1
+732,1
+733,1
+734,0
+735,1
+736,0
+737,1
+738,1
+739,1
+740,1
+741,1
+742,1
+743,1
+744,0
+745,1
+746,1
+747,1
+748,1
+749,1
+750,1
+751,1
+752,1
+753,1
+754,1
+755,1
+756,1
+757,1
+758,0
+759,1
+760,1
+761,0
+762,1
+763,1
+764,1
+765,1
+766,1
+767,1
+768,1
+769,1
+770,1
+771,1
+772,1
+773,1
+774,1
+775,1
+776,1
+777,1
+778,1
+779,1
+780,1
+781,1
+782,1
+783,1
+784,1
+785,1
+786,1
+787,0
+788,1
+789,1
+790,1
+791,1
+792,1
+793,1
+794,0
+795,1
+796,1
+797,1
+798,1
+799,0
+800,1
+801,1
+802,1
+803,1
+804,1
+805,1
+806,1
+807,1
+808,1
+809,0
+810,1
+811,1
+812,1
+813,1
+814,1
+815,1
+816,1
+817,1
+818,1
+819,1
+820,1
+821,1
+822,1
+823,1
+824,1
+825,1
+826,1
+827,1
+828,1
+829,1
+830,1
+831,1
+832,1
+833,1
+834,1
+835,1
+836,1
+837,1
+838,1
+839,1
+840,1
+841,1
+842,1
+843,1
+844,0
+845,0
+846,1
+847,1
+848,1
+849,1
+850,1
+851,1
+852,1
+853,1
+854,1
+855,1
+856,1
+857,1
+858,1
+859,1
+860,1
+861,1
+862,1
+863,0
+864,1
+865,1
+866,1
+867,1
+868,1
+869,0
+870,1
+871,1
+872,1
+873,1
+874,1
+875,1
+876,1
+877,1
+878,1
+879,1
+880,1
+881,1
+882,1
+883,1
+884,1
+885,1
+886,1
+887,1
+888,1
+889,1
+890,1
+891,1
+892,1
+893,1
+894,1
+895,1
+896,1
+897,1
+898,1
+899,1
+900,1
+901,1
+902,1
+903,1
+904,1
+905,1
+906,1
+907,1
+908,1
+909,1
+910,1
+911,1
+912,1
+913,1
+914,1
+915,1
+916,1
+917,1
+918,1
+919,1
+920,1
+921,0
+922,1
+923,0
+924,1
+925,1
+926,1
+927,1
+928,1
+929,1
+930,1
+931,1
+932,1
+933,1
+934,1
+935,1
+936,1
+937,0
+938,1
+939,1
+940,1
+941,1
+942,1
+943,0
+944,0
+945,1
+946,1
+947,1
+948,1
+949,1
+950,1
+951,0
+952,1
+953,1
+954,1
+955,1
+956,1
+957,1
+958,1
+959,1
+960,1
+961,1
+962,1
+963,1
+964,1
+965,1
+966,1
+967,1
+968,1
+969,0
+970,0
+971,1
+972,1
+973,1
+974,1
+975,1
+976,1
+977,1
+978,1
+979,1
+980,1
+981,1
+982,1
+983,1
+984,1
+985,1
+986,0
+987,1
+988,1
+989,1
+990,1
+991,1
+992,1
+993,1
+994,0
+995,1
+996,1
+997,1
+998,1
+999,0
+1000,0
+1001,0
+1002,0
+1003,0
+1004,0
+1005,0
+1006,0
+1007,0
+1008,0
+1009,0
+1010,0
+1011,0
+1012,0
+1013,0
+1014,0
+1015,0
+1016,0
+1017,0
+1018,0
+1019,0
+1020,0
+1021,0
+1022,0
+1023,1
+1024,0
+1025,0
+1026,0
+1027,0
+1028,0
+1029,0
+1030,0
+1031,0
+1032,0
+1033,0
+1034,0
+1035,1
+1036,0
+1037,0
+1038,0
+1039,0
+1040,0
+1041,0
+1042,0
+1043,0
+1044,0
+1045,0
+1046,0
+1047,0
+1048,0
+1049,0
+1050,0
+1051,0
+1052,0
+1053,0
+1054,0
+1055,0
+1056,0
+1057,0
+1058,0
+1059,0
+1060,0
+1061,0
+1062,0
+1063,0
+1064,0
+1065,0
+1066,0
+1067,0
+1068,0
+1069,0
+1070,0
+1071,0
+1072,0
+1073,0
+1074,0
+1075,0
+1076,0
+1077,0
+1078,0
+1079,0
+1080,0
+1081,0
+1082,0
+1083,0
+1084,0
+1085,0
+1086,0
+1087,0
+1088,0
+1089,0
+1090,0
+1091,0
+1092,0
+1093,0
+1094,0
+1095,0
+1096,0
+1097,0
+1098,0
+1099,0
+1100,0
+1101,0
+1102,0
+1103,0
+1104,0
+1105,0
+1106,0
+1107,0
+1108,0
+1109,0
+1110,0
+1111,0
+1112,0
+1113,0
+1114,0
+1115,0
+1116,0
+1117,0
+1118,0
+1119,0
+1120,0
+1121,0
+1122,0
+1123,0
+1124,0
+1125,0
+1126,0
+1127,0
+1128,0
+1129,0
+1130,0
+1131,0
+1132,0
+1133,0
+1134,0
+1135,0
+1136,0
+1137,0
+1138,0
+1139,0
+1140,0
+1141,0
+1142,0
+1143,0
+1144,0
+1145,0
+1146,0
+1147,0
+1148,0
+1149,0
+1150,0
+1151,0
+1152,0
+1153,0
+1154,0
+1155,0
+1156,0
+1157,0
+1158,0
+1159,0
+1160,0
+1161,1
+1162,0
+1163,0
+1164,0
+1165,0
+1166,0
+1167,0
+1168,0
+1169,0
+1170,0
+1171,0
+1172,0
+1173,0
+1174,0
+1175,0
+1176,1
+1177,0
+1178,0
+1179,0
+1180,0
+1181,0
+1182,0
+1183,0
+1184,0
+1185,0
+1186,0
+1187,0
+1188,0
+1189,0
+1190,0
+1191,0
+1192,0
+1193,0
+1194,0
+1195,0
+1196,0
+1197,0
+1198,0
+1199,0
+1200,0
+1201,0
+1202,0
+1203,0
+1204,0
+1205,0
+1206,0
+1207,0
+1208,0
+1209,0
+1210,0
+1211,0
+1212,0
+1213,0
+1214,0
+1215,0
+1216,0
+1217,0
+1218,0
+1219,0
+1220,0
+1221,0
+1222,0
+1223,0
+1224,0
+1225,0
+1226,0
+1227,0
+1228,0
+1229,0
+1230,0
+1231,0
+1232,0
+1233,0
+1234,0
+1235,0
+1236,0
+1237,0
+1238,0
+1239,0
+1240,0
+1241,0
+1242,0
+1243,0
+1244,0
+1245,0
+1246,0
+1247,0
+1248,0
+1249,0
+1250,0
+1251,0
+1252,0
+1253,1
+1254,0
+1255,0
+1256,0
+1257,0
+1258,0
+1259,0
+1260,0
+1261,0
+1262,0
+1263,0
+1264,0
+1265,0
+1266,0
+1267,0
+1268,0
+1269,0
+1270,0
+1271,0
+1272,0
+1273,0
+1274,0
+1275,0
+1276,0
+1277,0
+1278,0
+1279,0
+1280,0
+1281,0
+1282,0
+1283,0
+1284,0
+1285,0
+1286,0
+1287,0
+1288,0
+1289,0
+1290,0
+1291,0
+1292,0
+1293,0
+1294,0
+1295,0
+1296,0
+1297,0
+1298,0
+1299,0
+1300,0
+1301,0
+1302,0
+1303,0
+1304,0
+1305,0
+1306,0
+1307,0
+1308,0
+1309,0
+1310,0
+1311,0
+1312,0
+1313,0
+1314,0
+1315,0
+1316,0
+1317,0
+1318,0
+1319,0
+1320,0
+1321,0
+1322,0
+1323,0
+1324,0
+1325,0
+1326,0
+1327,0
+1328,0
+1329,0
+1330,0
+1331,0
+1332,0
+1333,0
+1334,0
+1335,0
+1336,0
+1337,0
+1338,0
+1339,0
+1340,0
+1341,0
+1342,0
+1343,0
+1344,0
+1345,0
+1346,0
+1347,0
+1348,0
+1349,0
+1350,0
+1351,0
+1352,0
+1353,0
+1354,0
+1355,0
+1356,0
+1357,0
+1358,0
+1359,0
+1360,0
+1361,0
+1362,0
+1363,0
+1364,0
+1365,0
+1366,0
+1367,0
+1368,0
+1369,0
+1370,0
+1371,0
+1372,0
+1373,0
+1374,0
+1375,0
+1376,0
+1377,0
+1378,0
+1379,0
+1380,0
+1381,0
+1382,0
+1383,0
+1384,0
+1385,0
+1386,0
+1387,0
+1388,0
+1389,0
+1390,0
+1391,0
+1392,0
+1393,0
+1394,0
+1395,0
+1396,0
+1397,0
+1398,0
+1399,0
+1400,0
+1401,0
+1402,0
+1403,0
+1404,0
+1405,0
+1406,0
+1407,0
+1408,0
+1409,0
+1410,0
+1411,0
+1412,0
+1413,0
+1414,0
+1415,0
+1416,0
+1417,0
+1418,0
+1419,0
+1420,0
+1421,0
+1422,0
+1423,0
+1424,0
+1425,0
+1426,0
+1427,0
+1428,0
+1429,0
+1430,0
+1431,0
+1432,0
+1433,0
+1434,0
+1435,0
+1436,0
+1437,0
+1438,0
+1439,0
+1440,0
+1441,0
+1442,0
+1443,0
+1444,0
+1445,0
+1446,0
+1447,0
+1448,0
+1449,0
+1450,0
+1451,0
+1452,0
+1453,0
+1454,0
+1455,0
+1456,0
+1457,0
+1458,0
+1459,0
+1460,0
+1461,0
+1462,0
+1463,0
+1464,0
+1465,0
+1466,0
+1467,0
+1468,0
+1469,0
+1470,0
+1471,0
+1472,0
+1473,0
+1474,0
+1475,0
+1476,0
+1477,0
+1478,0
+1479,0
+1480,0
+1481,0
+1482,0
+1483,0
+1484,0
+1485,0
+1486,0
+1487,0
+1488,0
+1489,0
+1490,0
+1491,0
+1492,0
+1493,0
+1494,0
+1495,0
+1496,0
+1497,0
+1498,0
+1499,0
+1500,0
+1501,0
+1502,0
+1503,0
+1504,0
+1505,0
+1506,0
+1507,0
+1508,0
+1509,0
+1510,0
+1511,0
+1512,0
+1513,0
+1514,0
+1515,0
+1516,0
+1517,0
+1518,0
+1519,0
+1520,0
+1521,0
+1522,0
+1523,0
+1524,0
+1525,0
+1526,0
+1527,1
+1528,0
+1529,0
+1530,0
+1531,0
+1532,0
+1533,0
+1534,0
+1535,0
+1536,0
+1537,1
+1538,0
+1539,0
+1540,0
+1541,0
+1542,0
+1543,0
+1544,0
+1545,0
+1546,0
+1547,0
+1548,0
+1549,0
+1550,0
+1551,0
+1552,0
+1553,0
+1554,0
+1555,0
+1556,0
+1557,0
+1558,0
+1559,0
+1560,0
+1561,0
+1562,0
+1563,0
+1564,0
+1565,0
+1566,0
+1567,0
+1568,0
+1569,0
+1570,0
+1571,0
+1572,0
+1573,0
+1574,0
+1575,0
+1576,0
+1577,0
+1578,0
+1579,0
+1580,1
+1581,0
+1582,0
+1583,0
+1584,0
+1585,0
+1586,0
+1587,0
+1588,0
+1589,0
+1590,0
+1591,0
+1592,0
+1593,0
+1594,0
+1595,0
+1596,0
+1597,0
+1598,0
+1599,0
+1600,0
+1601,0
+1602,0
+1603,0
+1604,0
+1605,0
+1606,0
+1607,0
+1608,0
+1609,0
+1610,0
+1611,0
+1612,0
+1613,0
+1614,0
+1615,0
+1616,0
+1617,0
+1618,0
+1619,0
+1620,0
+1621,0
+1622,0
+1623,0
+1624,0
+1625,0
+1626,0
+1627,0
+1628,0
+1629,0
+1630,0
+1631,0
+1632,0
+1633,0
+1634,0
+1635,0
+1636,1
+1637,0
+1638,0
+1639,0
+1640,0
+1641,0
+1642,0
+1643,0
+1644,0
+1645,0
+1646,0
+1647,0
+1648,0
+1649,0
+1650,0
+1651,0
+1652,0
+1653,0
+1654,0
+1655,0
+1656,0
+1657,0
+1658,0
+1659,0
+1660,0
+1661,0
+1662,0
+1663,0
+1664,0
+1665,0
+1666,0
+1667,0
+1668,0
+1669,0
+1670,0
+1671,0
+1672,0
+1673,0
+1674,0
+1675,0
+1676,0
+1677,0
+1678,0
+1679,0
+1680,0
+1681,0
+1682,0
+1683,0
+1684,0
+1685,0
+1686,0
+1687,0
+1688,0
+1689,0
+1690,0
+1691,0
+1692,0
+1693,0
+1694,0
+1695,0
+1696,0
+1697,0
+1698,0
+1699,0
+1700,0
+1701,0
+1702,0
+1703,0
+1704,0
+1705,0
+1706,0
+1707,0
+1708,0
+1709,0
+1710,0
+1711,0
+1712,0
+1713,0
+1714,0
+1715,0
+1716,0
+1717,0
+1718,0
+1719,0
+1720,0
+1721,0
+1722,0
+1723,0
+1724,0
+1725,0
+1726,0
+1727,0
+1728,0
+1729,0
+1730,0
+1731,0
+1732,0
+1733,0
+1734,0
+1735,0
+1736,0
+1737,0
+1738,0
+1739,0
+1740,0
+1741,0
+1742,0
+1743,0
+1744,0
+1745,0
+1746,1
+1747,0
+1748,0
+1749,0
+1750,0
+1751,0
+1752,0
+1753,0
+1754,0
+1755,0
+1756,0
+1757,0
+1758,1
+1759,0
+1760,0
+1761,0
+1762,0
+1763,0
+1764,0
+1765,0
+1766,0
+1767,0
+1768,0
+1769,0
+1770,0
+1771,0
+1772,0
+1773,0
+1774,0
+1775,0
+1776,0
+1777,0
+1778,0
+1779,0
+1780,0
+1781,0
+1782,0
+1783,0
+1784,0
+1785,0
+1786,0
+1787,0
+1788,0
+1789,0
+1790,0
+1791,0
+1792,0
+1793,0
+1794,0
+1795,0
+1796,0
+1797,0
+1798,0
+1799,0
+1800,0
+1801,0
+1802,0
+1803,0
+1804,0
+1805,0
+1806,0
+1807,0
+1808,0
+1809,0
+1810,0
+1811,0
+1812,0
+1813,0
+1814,0
+1815,0
+1816,0
+1817,1
+1818,0
+1819,0
+1820,0
+1821,0
+1822,0
+1823,0
+1824,0
+1825,0
+1826,0
+1827,0
+1828,0
+1829,0
+1830,0
+1831,0
+1832,0
+1833,0
+1834,0
+1835,0
+1836,0
+1837,0
+1838,0
+1839,0
+1840,0
+1841,0
+1842,0
+1843,0
+1844,0
+1845,0
+1846,0
+1847,0
+1848,0
+1849,0
+1850,0
+1851,0
+1852,0
+1853,1
+1854,0
+1855,0
+1856,0
+1857,0
+1858,0
+1859,0
+1860,0
+1861,0
+1862,0
+1863,0
+1864,0
+1865,0
+1866,0
+1867,0
+1868,0
+1869,0
+1870,0
+1871,0
+1872,0
+1873,0
+1874,0
+1875,0
+1876,0
+1877,0
+1878,0
+1879,0
+1880,0
+1881,0
+1882,0
+1883,0
+1884,0
+1885,0
+1886,0
+1887,0
+1888,0
+1889,0
+1890,0
+1891,0
+1892,0
+1893,0
+1894,0
+1895,0
+1896,0
+1897,0
+1898,0
+1899,0
+1900,0
+1901,0
+1902,0
+1903,0
+1904,0
+1905,0
+1906,0
+1907,0
+1908,0
+1909,0
+1910,0
+1911,0
+1912,0
+1913,0
+1914,0
+1915,0
+1916,0
+1917,0
+1918,0
+1919,0
+1920,0
+1921,0
+1922,0
+1923,0
+1924,0
+1925,0
+1926,0
+1927,0
+1928,0
+1929,0
+1930,0
+1931,0
+1932,0
+1933,0
+1934,1
+1935,0
+1936,0
+1937,0
+1938,0
+1939,0
+1940,0
+1941,0
+1942,0
+1943,1
+1944,1
+1945,0
+1946,0
+1947,0
+1948,0
+1949,0
+1950,0
+1951,0
+1952,0
+1953,0
+1954,0
+1955,0
+1956,0
+1957,0
+1958,0
+1959,0
+1960,0
+1961,0
+1962,0
+1963,0
+1964,0
+1965,0
+1966,0
+1967,0
+1968,0
+1969,0
+1970,0
+1971,0
+1972,0
+1973,0
+1974,0
+1975,0
+1976,0
+1977,0
+1978,0
+1979,0
+1980,0
+1981,0
+1982,0
+1983,0
+1984,0
+1985,0
+1986,0
+1987,0
+1988,0
+1989,0
+1990,0
+1991,0
+1992,0
+1993,0
+1994,0
+1995,0
+1996,0
+1997,0
+1998,0
+1999,0

requirements.txt CHANGED Viewed

@@ -3,3 +3,8 @@ python-dotenv>=1.0.0
 google-generativeai>=0.8.0
 langchain>=0.2.5
 langchain-google-genai>=0.0.12

 google-generativeai>=0.8.0
 langchain>=0.2.5
 langchain-google-genai>=0.0.12
+numpy
+nltk
+scikit-learn
+xgboost
+lightgbm

training_model.py ADDED Viewed

	@@ -0,0 +1,134 @@

+# file: training_model.py
+from __future__ import annotations
+import pickle
+import numpy as np
+from typing import Dict, Tuple, List
+import nltk
+from nltk.corpus import twitter_samples, stopwords
+from sklearn.ensemble import RandomForestClassifier
+from xgboost import XGBClassifier
+from lightgbm import LGBMClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.metrics import accuracy_score, log_loss
+from feature_extract import build_freqs, extract_features_2, extract_features_6
+# -------------------- NLTK setup --------------------
+def _ensure_nltk():
+    try:
+        twitter_samples.fileids()
+    except LookupError:
+        nltk.download("twitter_samples", quiet=True)
+    try:
+        stopwords.words("english")
+    except LookupError:
+        nltk.download("stopwords", quiet=True)
+# -------------------- Data prep --------------------
+def load_twitter_data() -> Tuple[List[str], np.ndarray]:
+    pos = twitter_samples.strings("positive_tweets.json")
+    neg = twitter_samples.strings("negative_tweets.json")
+    tweets = pos + neg
+    y = np.array([1] * len(pos) + [0] * len(neg))
+    return tweets, y
+def vectorize(tweets: List[str],
+              freqs: Dict[Tuple[str, float], float],
+              mode: str = "2f") -> np.ndarray:
+    """mode: '2f' -> extract_features_2, '6f' -> extract_features_6"""
+    feat_fn = extract_features_2 if mode == "2f" else extract_features_6
+    rows = [feat_fn(t, freqs) for t in tweets]
+    return np.vstack(rows) if rows else np.zeros((0, 2 if mode == "2f" else 6))
+# -------------------- Models --------------------
+def make_models() -> Dict[str, object]:
+    return {
+        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
+        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
+        "LightGBM": LGBMClassifier(random_state=42),
+        "SVM": SVC(kernel="linear", probability=True, random_state=42),
+        "Decision Tree": DecisionTreeClassifier(random_state=42),
+        "Naive Bayes": GaussianNB(),
+    }
+# -------------------- Train --------------------
+def train_models(X: np.ndarray, y: np.ndarray) -> Dict[str, object]:
+    models = make_models()
+    trained = {}
+    print("Đang train các mô hình:")
+    for name, clf in models.items():
+        clf.fit(X, y.ravel())
+        trained[name] = clf
+        # --- ghi log sau train ---
+        y_pred = clf.predict(X)
+        acc = accuracy_score(y, y_pred)
+        # log_loss cần probability
+        try:
+            y_proba = clf.predict_proba(X)
+            loss = log_loss(y, y_proba)
+        except Exception:
+            loss = None
+        if loss is not None:
+            print(f"[{name}] Accuracy: {acc:.4f} | LogLoss: {loss:.4f}")
+        else:
+            print(f"[{name}] Accuracy: {acc:.4f} | (không có predict_proba để tính log_loss)")
+    print("=" * 60)
+    return trained
+def train_all_versions(save_path: str = "demo_models.pkl"):
+    """
+    Train và lưu mô hình + freqs ra file pickle.
+    Trả về:
+      {
+        'freqs': freqs,
+        '2f': {model_name: trained_model, ...},
+        '6f': {model_name: trained_model, ...}
+      }
+    """
+    _ensure_nltk()
+    tweets, y = load_twitter_data()
+    freqs = build_freqs(tweets, y.reshape(-1, 1))
+    # trích features
+    X2 = vectorize(tweets, freqs, mode="2f")
+    X6 = vectorize(tweets, freqs, mode="6f")
+    print("\n===== Train với 2-feature =====")
+    models_2f = train_models(X2, y)
+    print("\n===== Train với 6-feature =====")
+    models_6f = train_models(X6, y)
+    data_to_save = {
+        "freqs": freqs,
+        "2f": models_2f,
+        "6f": models_6f,
+    }
+    # lưu file pickle
+    with open(save_path, "wb") as f:
+        pickle.dump(data_to_save, f)
+    print(f"\nĐã train và lưu mô hình + freqs vào file: {save_path}")
+    return data_to_save
+# -------------------- Load --------------------
+def load_demo_models(save_path: str = "demo_models.pkl"):
+    """Load lại mô hình + freqs từ file pickle."""
+    with open(save_path, "rb") as f:
+        data = pickle.load(f)
+    return data
+# -------------------- CLI --------------------
+if __name__ == "__main__":
+    models = train_all_versions()  # train & save
+    print("Các mô hình 2f:", list(models["2f"].keys()))
+    print("Các mô hình 6f:", list(models["6f"].keys()))