trantuan1701 committed on
Commit
9d454eb
·
1 Parent(s): ba48a36
__pycache__/feature_extract.cpython-313.pyc ADDED
Binary file (5.83 kB). View file
 
__pycache__/inference_demo.cpython-313.pyc ADDED
Binary file (3.38 kB). View file
 
__pycache__/llm.cpython-310.pyc ADDED
Binary file (535 Bytes). View file
 
__pycache__/llm.cpython-312.pyc ADDED
Binary file (661 Bytes). View file
 
__pycache__/llm.cpython-313.pyc ADDED
Binary file (651 Bytes). View file
 
__pycache__/llm_classification.cpython-313.pyc ADDED
Binary file (1.33 kB). View file
 
app.py CHANGED
@@ -1,21 +1,138 @@
1
  import gradio as gr
2
- from llm_classification import get_answer
 
 
 
 
 
 
3
 
4
- CLASSIFIERS = ["gemini"]
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- def infer(clf: str, text: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  if not text.strip():
8
- return "negative"
9
- y = get_answer(text)
10
- return "positive" if y == 1 else "negative"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- with gr.Blocks(title="Sentiment Classifier") as demo:
13
- gr.Markdown("## Sentiment Classifier")
14
- clf = gr.Dropdown(choices=CLASSIFIERS, value="gemini", label="Classifier")
15
- txt = gr.Textbox(label="Input sentence", placeholder="Type a sentence…")
 
 
 
 
 
16
  btn = gr.Button("Classify")
17
- out = gr.Label(label="Result")
18
- btn.click(infer, inputs=[clf, txt], outputs=out)
19
-
 
 
 
 
 
 
 
20
  if __name__ == "__main__":
21
  demo.launch()
 
1
  import gradio as gr
2
+ from llm_classification import get_answer
3
+ from inference_demo import (
4
+ predict_randomforest_2f, predict_xgboost_2f, predict_lightgbm_2f,
5
+ predict_svm_2f, predict_decisiontree_2f, predict_naivebayes_2f,
6
+ predict_randomforest_6f, predict_xgboost_6f, predict_lightgbm_6f,
7
+ predict_svm_6f, predict_decisiontree_6f, predict_naivebayes_6f,
8
+ )
9
 
10
# Lookup table: (model display name, feature version) -> prediction function.
# Entries are inserted 2-feature first, then 6-feature, each in the same
# model order, so iteration order matches a hand-written literal.
PREDICT_FUNCS = {
    (model, feature_version): predictor
    for feature_version, predictors in (
        (
            "2-feature",
            {
                "Random Forest": predict_randomforest_2f,
                "XGBoost": predict_xgboost_2f,
                "LightGBM": predict_lightgbm_2f,
                "SVM": predict_svm_2f,
                "Decision Tree": predict_decisiontree_2f,
                "Naive Bayes": predict_naivebayes_2f,
            },
        ),
        (
            "6-feature",
            {
                "Random Forest": predict_randomforest_6f,
                "XGBoost": predict_xgboost_6f,
                "LightGBM": predict_lightgbm_6f,
                "SVM": predict_svm_6f,
                "Decision Tree": predict_decisiontree_6f,
                "Naive Bayes": predict_naivebayes_6f,
            },
        ),
    )
    for model, predictor in predictors.items()
}
24
 
25
# Dropdown entries: an emoji icon, one space, then the classifier's name.
CLASSIFIERS = [
    "🔮 Gemini",
    "🌳 Random Forest",
    "⚡ XGBoost",
    "💡 LightGBM",
    "📈 SVM",
    "🌲 Decision Tree",
    "📊 Naive Bayes",
    "🤝 Ensemble",
]
FEATURE_VERSIONS = ["2-feature", "6-feature"]

# Text fragments shared by both feature-version explanations.
_LANGUAGE_NOTE = (
    "### Supported Language\n"
    "Only **English** sentences are supported.\n\n"
)
_FREQUENCY_FEATURES = (
    " * x1 = Total frequency of words in the Positive class\n"
    " * x2 = Total frequency of words in the Negative class"
)

# Markdown shown in the explanation panel, keyed by feature version.
FEATURE_EXPLANATIONS = {
    "2-feature": "".join(
        [
            _LANGUAGE_NOTE,
            "### 2-feature version\n",
            "This version uses only 2 frequency-based features:\n",
            _FREQUENCY_FEATURES,
        ]
    ),
    "6-feature": "".join(
        [
            _LANGUAGE_NOTE,
            "### 6-feature version\n",
            "This version uses 6 features:\n",
            _FREQUENCY_FEATURES,
            "\n",
            " * x3 = 1 if the word 'no' appears, else 0\n",
            " * x4 = Count of 1st and 2nd person pronouns\n",
            " * x5 = 1 if the tweet contains '!' else 0\n",
            " * x6 = log(word count)",
        ]
    ),
}
59
+
60
def explain_features(version: str) -> str:
    """Return the Markdown explanation for the selected feature version.

    Args:
        version: A key of ``FEATURE_EXPLANATIONS``.

    Raises:
        KeyError: If ``version`` is not a key of ``FEATURE_EXPLANATIONS``.
    """
    explanation = FEATURE_EXPLANATIONS[version]
    return explanation
62
+
63
# Model names polled by the ensemble, in the order their votes are listed.
_ENSEMBLE_MODELS = (
    "Random Forest",
    "XGBoost",
    "LightGBM",
    "SVM",
    "Decision Tree",
    "Naive Bayes",
)


def _sentiment_text(y) -> str:
    """Map a prediction to its display string (1 is positive, anything else negative)."""
    return "Positive 😀" if y == 1 else "Negative 😞"


def _strip_icon(display_name: str) -> str:
    """Drop the leading emoji/space decoration from a dropdown entry ('📈 SVM' -> 'SVM')."""
    for pos, ch in enumerate(display_name):
        if ch.isalnum():
            return display_name[pos:]
    return ""


def _infer_ensemble(version: str, text: str):
    """Run every available model for *version* and summarise the majority vote."""
    votes = []
    detail_lines = []
    for model in _ENSEMBLE_MODELS:
        func = PREDICT_FUNCS.get((model, version))
        if func is None:
            continue
        y = func(text)
        votes.append(y)
        detail_lines.append(f"- **{model}**: {_sentiment_text(y)}")

    if not votes:
        return {"No models available": 1.0}, ""

    total = len(votes)
    # Count with the same `== 1` rule used for the per-model lines so the
    # totals can never disagree with the listed decisions.
    positive_votes = sum(1 for y in votes if y == 1)
    negative_votes = total - positive_votes

    if positive_votes > negative_votes:
        verdict = "Positive 😀"
    elif negative_votes > positive_votes:
        verdict = "Negative 😞"
    else:
        verdict = "Tie 🤔"

    detail_text = "\n".join(detail_lines)
    detail_md = (
        f"### Final Ensemble Result: **{verdict}**\n\n"
        f"**Votes:** {positive_votes} positive ({100 * positive_votes / total:.1f}%) | "
        f"{negative_votes} negative ({100 * negative_votes / total:.1f}%) out of {total} models.\n\n"
        f"**Individual model decisions:**\n{detail_text}"
    )
    return {verdict: 1.0}, detail_md


def infer(clf: str, version: str, text: str):
    """Classify *text* with the selected classifier.

    Args:
        clf: Dropdown entry: "🔮 Gemini", "🤝 Ensemble", or an icon-prefixed
            (or bare) model name that is looked up in ``PREDICT_FUNCS``.
        version: Feature version used as the second half of the
            ``PREDICT_FUNCS`` key; ignored for Gemini.
        text: Sentence to classify. ``None``, empty, or whitespace-only
            input yields a prompt to enter a sentence.

    Returns:
        A ``(label, detail)`` tuple. ``label`` is a one-entry dict mapping
        the result string to ``1.0``; ``detail`` is a Markdown vote
        breakdown for the ensemble and ``""`` otherwise.
    """
    # Guard against None as well as blank strings so an empty/cleared input
    # produces the friendly prompt instead of an AttributeError.
    if not text or not text.strip():
        return {"⚠️ Please enter a sentence": 1.0}, ""

    if clf == "🔮 Gemini":
        return {_sentiment_text(get_answer(text)): 1.0}, ""

    if clf == "🤝 Ensemble":
        return _infer_ensemble(version, text)

    func = PREDICT_FUNCS.get((_strip_icon(clf), version))
    if func is None:
        return {"Model not found": 1.0}, ""
    return {_sentiment_text(func(text)): 1.0}, ""
116
 
117
# Gradio UI definition: a heading, classifier and feature-version dropdowns,
# a text input, a "Classify" button, a label plus a Markdown area for the
# result, and a Markdown panel explaining the selected feature version.
+ with gr.Blocks(
118
+ title="Sentiment Classifier Demo",
119
+ css=".big-markdown {font-size: 1.2rem; min-height: 300px; overflow:auto;}"
120
+ ) as demo:
121
+ gr.Markdown("## Sentiment Classifier Demo")
122
+ with gr.Row():
123
+ clf = gr.Dropdown(choices=CLASSIFIERS, value="🔮 Gemini", label="Classifier (or Ensemble)")
124
+ version = gr.Dropdown(choices=FEATURE_VERSIONS, value="2-feature", label="Feature Version (not used for gemini)")
125
+ txt = gr.Textbox(label="Input sentence (English only)", placeholder="Type a sentence…")
126
  btn = gr.Button("Classify")
127
+ out_label = gr.Label(label="Main Result")
128
+ out_detail = gr.Markdown(elem_classes="big-markdown")
129
# The explanation panel starts on the text for "2-feature", the same value
# the version dropdown is initialised with.
+ explanation_box = gr.Markdown(FEATURE_EXPLANATIONS["2-feature"])
130
# Changing the feature version re-renders the explanation panel.
+ version.change(fn=explain_features, inputs=version, outputs=explanation_box)
131
# infer returns a (label, detail) pair, routed to out_label and out_detail.
+ btn.click(fn=infer, inputs=[clf, version, txt], outputs=[out_label, out_detail])
132
+ gr.Markdown(
133
+ "**Note:** This demo supports **English** sentences only. "
134
+ "Choose '🤝 Ensemble' to see the combined decision from all classifiers, "
135
+ "or choose '🔮 Gemini' to use the Gemini LLM-based classifier."
136
+ )
137
  if __name__ == "__main__":
138
  demo.launch()
demo_models.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf5cd9e9f927d6467888e9d249a99a086812f0c0a228a0b57407c2fe9eeb323d
3
+ size 4826559
exercise8.ipynb ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "0f914398",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "%%capture\n",
11
+ "!pip install nltk\n",
12
+ "!pip install numpy\n",
13
+ "!pip install pandas"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 2,
19
+ "id": "d473cee2",
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "import nltk #Natural Language Toolkit\n",
24
+ "import numpy as np\n",
25
+ "import pandas as pd\n",
26
+ "from nltk.corpus import twitter_samples\n",
27
+ "from langchain.prompts import PromptTemplate\n",
28
+ "from langchain_core.messages import SystemMessage, HumanMessage\n",
29
+ "\n",
30
+ "from llm import llm\n",
31
+ "\n"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 3,
37
+ "id": "2f9d43cc",
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "all_positive_tweets = twitter_samples.strings('positive_tweets.json')\n",
42
+ "all_negative_tweets = twitter_samples.strings('negative_tweets.json')\n",
43
+ "\n",
44
+ "test_pos = all_positive_tweets[4000:]\n",
45
+ "test_neg = all_negative_tweets[4000:]\n",
46
+ "\n",
47
+ "test_x = test_pos + test_neg\n",
48
+ "\n",
49
+ "# Create the numpy array of positive labels and negative labels.\n",
50
+ "test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 12,
56
+ "id": "ed135bd0",
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "import re\n",
61
+ "import numpy as np # đảm bảo đã import\n",
62
+ "\n",
63
+ "# --- PROMPTS ---\n",
64
+ "system_prompt = (\n",
65
+ " \"You are a strict sentiment classifier.\\n\"\n",
66
+ " \"Given a batch of up to 20 sentences, output EXACTLY one line per input, \"\n",
67
+ " \"in the same order. Each line must be a single character: 1 for positive, 0 for negative. \"\n",
68
+ " \"NO extra text, NO numbering, NO spaces, NO blank lines.\"\n",
69
+ ")\n",
70
+ "\n",
71
+ "user_prompt = PromptTemplate(\n",
72
+ " input_variables=[\"items\"],\n",
73
+ " template=(\n",
74
+ " \"Classify the sentiment of EACH sentence listed between <INPUT> and </INPUT>.\\n\"\n",
75
+ " \"Rules:\\n\"\n",
76
+ " \"- Output exactly ONE line per sentence, in the SAME ORDER.\\n\"\n",
77
+ " \"- Each line must be EXACTLY '1' (positive) or '0' (negative).\\n\"\n",
78
+ " \"- Do NOT print anything else. Do NOT repeat the inputs.\\n\\n\"\n",
79
+ " \"<INPUT>\\n{items}\\n</INPUT>\"\n",
80
+ " ),\n",
81
+ ")\n",
82
+ "\n",
83
+ "def _format_items(sentences):\n",
84
+ " return \"\\n\".join(f\"<s>{s}</s>\" for s in sentences)\n",
85
+ "\n",
86
+ "# --- PARSER (robust) ---\n",
87
+ "def _parse_binary_lines(text: str, expected_n: int) -> np.ndarray:\n",
88
+ " \"\"\"\n",
89
+ " Chấp nhận:\n",
90
+ " - expected_n dòng, mỗi dòng là '0' hoặc '1'\n",
91
+ " - 1 dòng duy nhất dài đúng expected_n ký tự '0'/'1'\n",
92
+ " - Cứu hộ: gom toàn bộ ký tự '0'/'1' trong text nếu đúng expected_n\n",
93
+ " \"\"\"\n",
94
+ " s = (text or \"\").strip()\n",
95
+ " if not s:\n",
96
+ " raise ValueError(\"Empty model output\")\n",
97
+ "\n",
98
+ " lines = [ln.strip() for ln in s.splitlines() if ln.strip() != \"\"]\n",
99
+ "\n",
100
+ " # Case A: Đúng expected_n dòng, mỗi dòng là 0/1\n",
101
+ " if len(lines) == expected_n and all(re.fullmatch(r\"[01]\", ln) for ln in lines):\n",
102
+ " return np.array([int(ln) for ln in lines], dtype=np.int8)\n",
103
+ "\n",
104
+ " # Case B: 1 dòng duy nhất gồm đúng expected_n ký tự 0/1\n",
105
+ " if len(lines) == 1 and re.fullmatch(r\"[01]+\", lines[0]) and len(lines[0]) == expected_n:\n",
106
+ " return np.array([int(ch) for ch in lines[0]], dtype=np.int8)\n",
107
+ "\n",
108
+ " # Case C: Cứu hộ - lấy mọi ký tự 0/1 trong toàn bộ text\n",
109
+ " bits = re.findall(r\"[01]\", s)\n",
110
+ " if len(bits) == expected_n:\n",
111
+ " return np.array([int(b) for b in bits], dtype=np.int8)\n",
112
+ "\n",
113
+ " # Thất bại: báo lỗi kèm preview ngắn gọn\n",
114
+ " preview = s[:200].replace(\"\\n\", \"\\\\n\")\n",
115
+ " raise ValueError(f\"Expected {expected_n} labels, got {len(lines)} lines / {len(bits)} bits. Raw='{preview}...'\")\n",
116
+ "\n",
117
+ "# --- INFERENCE ---\n",
118
+ "def classify_20(llm, sentences, existing: np.ndarray | None = None) -> np.ndarray:\n",
119
+ " n = len(sentences)\n",
120
+ " if n == 0 or n > 20:\n",
121
+ " raise ValueError(f\"Batch size must be 1..20, got {n}\")\n",
122
+ "\n",
123
+ " messages = [\n",
124
+ " SystemMessage(content=system_prompt),\n",
125
+ " HumanMessage(content=user_prompt.format(items=_format_items(sentences))),\n",
126
+ " ]\n",
127
+ "\n",
128
+ " resp = llm.invoke(messages)\n",
129
+ "\n",
130
+ " # KHÔNG dùng str(resp): dễ lẫn metadata vào.\n",
131
+ " raw_text = getattr(resp, \"content\", None)\n",
132
+ " if raw_text is None or not str(raw_text).strip():\n",
133
+ " # Gợi ý: bạn có thể log resp để debug khi model bị chặn (block_reason, safety, v.v.)\n",
134
+ " raise RuntimeError(f\"LLM returned empty content. Full response repr: {repr(resp)}\")\n",
135
+ "\n",
136
+ " raw_text = raw_text.strip()\n",
137
+ " preds = _parse_binary_lines(raw_text, expected_n=n)\n",
138
+ " return preds if existing is None else np.concatenate([existing, preds])\n"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": 18,
144
+ "id": "c06e66ff",
145
+ "metadata": {},
146
+ "outputs": [],
147
+ "source": [
148
+ "def classify_20(llm, sentences, existing: np.ndarray | None = None) -> np.ndarray:\n",
149
+ " n = len(sentences)\n",
150
+ " if n == 0 or n > 20:\n",
151
+ " raise ValueError(f\"Batch size must be 1..20, got {n}\")\n",
152
+ "\n",
153
+ " messages = [\n",
154
+ " SystemMessage(content=system_prompt),\n",
155
+ " HumanMessage(content=user_prompt.format(items=_format_items(sentences))),\n",
156
+ " ]\n",
157
+ "\n",
158
+ " resp = llm.invoke(messages)\n",
159
+ " raw_text = getattr(resp, \"content\", None)\n",
160
+ "\n",
161
+ " if raw_text is None or not str(raw_text).strip():\n",
162
+ " # Nếu LLM không trả ra gì → điền 0 hết\n",
163
+ " print(f\"[warn] LLM output empty for batch size {n}, filling 0s\")\n",
164
+ " preds = np.zeros(n, dtype=np.int8)\n",
165
+ " else:\n",
166
+ " raw_text = raw_text.strip()\n",
167
+ " try:\n",
168
+ " preds = _parse_binary_lines(raw_text, expected_n=n)\n",
169
+ " except Exception as e:\n",
170
+ " # Nếu parse fail → điền 0 hết\n",
171
+ " print(f\"[warn] Parse fail for batch size {n}, filling 0s: {e}\")\n",
172
+ " preds = np.zeros(n, dtype=np.int8)\n",
173
+ "\n",
174
+ " return preds if existing is None else np.concatenate([existing, preds])\n"
175
+ ]
176
+ },
177
+ {
178
+ "cell_type": "code",
179
+ "execution_count": 19,
180
+ "id": "495cb1f2",
181
+ "metadata": {},
182
+ "outputs": [
183
+ {
184
+ "name": "stdout",
185
+ "output_type": "stream",
186
+ "text": [
187
+ "[init] total=2000 done=1500 remain=500\n"
188
+ ]
189
+ },
190
+ {
191
+ "name": "stderr",
192
+ "output_type": "stream",
193
+ "text": [
194
+ "Gemini produced an empty response. Continuing with empty message\n",
195
+ "Feedback: block_reason: PROHIBITED_CONTENT\n",
196
+ "\n"
197
+ ]
198
+ },
199
+ {
200
+ "name": "stdout",
201
+ "output_type": "stream",
202
+ "text": [
203
+ "[warn] LLM output empty for batch size 20, filling 0s\n",
204
+ "[ok] 1500:1520 +20\n",
205
+ "[ok] 1520:1540 +20\n",
206
+ "[ok] 1540:1560 +20\n",
207
+ "[ok] 1560:1580 +20\n",
208
+ "[ok] 1580:1600 +20\n",
209
+ "[ok] 1600:1620 +20\n",
210
+ "[ok] 1620:1640 +20\n",
211
+ "[ok] 1640:1660 +20\n",
212
+ "[ok] 1660:1680 +20\n",
213
+ "[ok] 1680:1700 +20\n",
214
+ "[ok] 1700:1720 +20\n",
215
+ "[ok] 1720:1740 +20\n",
216
+ "[ok] 1740:1760 +20\n",
217
+ "[ok] 1760:1780 +20\n",
218
+ "[ok] 1780:1800 +20\n",
219
+ "[ok] 1800:1820 +20\n",
220
+ "[ok] 1820:1840 +20\n",
221
+ "[ok] 1840:1860 +20\n",
222
+ "[ok] 1860:1880 +20\n",
223
+ "[ok] 1880:1900 +20\n",
224
+ "[ok] 1900:1920 +20\n",
225
+ "[ok] 1920:1940 +20\n",
226
+ "[ok] 1940:1960 +20\n",
227
+ "[ok] 1960:1980 +20\n",
228
+ "[ok] 1980:2000 +20\n",
229
+ "[final] collected=2000/2000\n",
230
+ "Accuracy: 0.9470\n"
231
+ ]
232
+ }
233
+ ],
234
+ "source": [
235
+ "import os, csv, time\n",
236
+ "from sklearn.metrics import accuracy_score\n",
237
+ "\n",
238
+ "BATCH_SIZE = 20\n",
239
+ "SLEEP_SECS = 20\n",
240
+ "PRED_CSV = \"preds.csv\"\n",
241
+ "\n",
242
+ "y_true = test_y.ravel().astype(int)\n",
243
+ "TOTAL = len(test_x)\n",
244
+ "\n",
245
+ "# resume\n",
246
+ "start_idx = 0\n",
247
+ "if os.path.exists(PRED_CSV):\n",
248
+ " with open(PRED_CSV, \"r\", newline=\"\", encoding=\"utf-8\") as f:\n",
249
+ " r = csv.reader(f); rows = list(r)\n",
250
+ " if rows and rows[0] and rows[0][0] == \"idx\": rows = rows[1:]\n",
251
+ " start_idx = len(rows)\n",
252
+ "else:\n",
253
+ " with open(PRED_CSV, \"w\", newline=\"\", encoding=\"utf-8\") as f:\n",
254
+ " csv.writer(f).writerow([\"idx\", \"pred\"])\n",
255
+ "\n",
256
+ "print(f\"[init] total={TOTAL} done={start_idx} remain={TOTAL-start_idx}\")\n",
257
+ "\n",
258
+ "for i in range(start_idx, TOTAL, BATCH_SIZE):\n",
259
+ " batch = test_x[i : i + BATCH_SIZE]\n",
260
+ " try:\n",
261
+ " preds = classify_20(llm, batch)\n",
262
+ " except Exception as e:\n",
263
+ " print(f\"[err] {i}:{i+len(batch)} {type(e).__name__}: {e}\")\n",
264
+ " break\n",
265
+ " with open(PRED_CSV, \"a\", newline=\"\", encoding=\"utf-8\") as f:\n",
266
+ " w = csv.writer(f)\n",
267
+ " for off, p in enumerate(preds):\n",
268
+ " w.writerow([i + off, int(p)])\n",
269
+ " print(f\"[ok] {i}:{i+len(batch)} +{len(preds)}\")\n",
270
+ " if i + BATCH_SIZE < TOTAL:\n",
271
+ " time.sleep(SLEEP_SECS)\n",
272
+ "\n",
273
+ "# eval if complete\n",
274
+ "idxs, vals = [], []\n",
275
+ "with open(PRED_CSV, \"r\", newline=\"\", encoding=\"utf-8\") as f:\n",
276
+ " r = csv.reader(f); next(r, None)\n",
277
+ " for row in r:\n",
278
+ " idxs.append(int(row[0])); vals.append(int(row[1]))\n",
279
+ "order = np.argsort(np.array(idxs))\n",
280
+ "y_pred = np.array(vals, dtype=int)[order]\n",
281
+ "\n",
282
+ "print(f\"[final] collected={len(y_pred)}/{TOTAL}\")\n",
283
+ "if len(y_pred) == TOTAL:\n",
284
+ " print(f\"Accuracy: {accuracy_score(y_true, y_pred):.4f}\")\n",
285
+ "else:\n",
286
+ " print(f\"[note] missing={TOTAL-len(y_pred)}\")\n"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "code",
291
+ "execution_count": null,
292
+ "id": "435f575c",
293
+ "metadata": {},
294
+ "outputs": [],
295
+ "source": []
296
+ }
297
+ ],
298
+ "metadata": {
299
+ "kernelspec": {
300
+ "display_name": "base",
301
+ "language": "python",
302
+ "name": "python3"
303
+ },
304
+ "language_info": {
305
+ "codemirror_mode": {
306
+ "name": "ipython",
307
+ "version": 3
308
+ },
309
+ "file_extension": ".py",
310
+ "mimetype": "text/x-python",
311
+ "name": "python",
312
+ "nbconvert_exporter": "python",
313
+ "pygments_lexer": "ipython3",
314
+ "version": "3.13.5"
315
+ }
316
+ },
317
+ "nbformat": 4,
318
+ "nbformat_minor": 5
319
+ }
feature_extract.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # file: feature_extract.py
2
+ import re
3
+ import string
4
+ import numpy as np
5
+ from nltk.stem import PorterStemmer
6
+ from nltk.tokenize import TweetTokenizer
7
+ from nltk.corpus import stopwords
8
+
9
+ # --- constants & tools ---
10
# Pronoun vocabulary counted by the 6-feature extractor (feature x4).
# NOTE(review): x4 is described elsewhere as "1st and 2nd person pronouns",
# but this set also contains third-person forms (he/she/it/they ...).
# Presumably the saved models were trained with this exact set, so confirm
# before changing it.
pronouns = set(
    "i me my mine myself "
    "we us our ours ourselves "
    "you your yours yourself yourselves "
    "he him his himself "
    "she her hers herself "
    "it its itself "
    "they them their theirs themselves".split()
)
19
+
20
# Shared NLP helpers, built once when the module is imported.
_tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)
_stemmer = PorterStemmer()
# Materialised as a set so stopword membership tests are O(1).
# NOTE(review): this reads the NLTK "stopwords" corpus at import time;
# presumably it must already be downloaded - confirm for fresh environments.
_stopwords_en = set(stopwords.words("english"))
23
+
24
def process_tweet(tweet):
    """Clean a raw tweet and return its list of stemmed tokens.

    Removes ``$``-prefixed tickers, a leading ``RT`` marker, http(s) URLs and
    ``#`` characters (the hashtag word itself is kept), tokenizes the result
    with the module-level tweet tokenizer, drops English stopwords and
    punctuation tokens, and stems whatever remains.
    """
    cleaned = tweet
    # Order matters: the leading "RT" is only matched after tickers are gone.
    for pattern in (
        r"\$\w*",                # stock tickers such as $GE
        r"^RT[\s]+",             # leading retweet marker
        r"https?://[^\s\n\r]+",  # hyperlinks
        r"#",                    # the hash sign only; the tag text stays
    ):
        cleaned = re.sub(pattern, "", cleaned)

    # NOTE: `token not in string.punctuation` is a substring test against the
    # punctuation string, not a per-character check; kept as-is so features
    # stay compatible with previously built frequency tables.
    return [
        _stemmer.stem(token)
        for token in _tokenizer.tokenize(cleaned)
        if token not in _stopwords_en and token not in string.punctuation
    ]
37
+
38
def extract_features_2(tweet, freqs):
    """Return the 1x2 frequency feature row for *tweet*.

    Column 0 is the summed ``freqs[(token, 1.0)]`` and column 1 the summed
    ``freqs[(token, 0.0)]`` over the tokens produced by ``process_tweet``;
    pairs missing from *freqs* contribute 0.
    """
    tokens = process_tweet(tweet)
    features = np.zeros((1, 2))
    features[0, 0] = sum(freqs.get((token, 1.0), 0) for token in tokens)
    features[0, 1] = sum(freqs.get((token, 0.0), 0) for token in tokens)
    return features
49
+
50
def extract_features_6(tweet, freqs):
    """Return the 1x6 feature row for *tweet*.

    Tokens come straight from the module-level tweet tokenizer; unlike
    ``extract_features_2`` there is no stopword removal or stemming.

    Columns:
        0: summed ``freqs[(token, 1.0)]`` (missing pairs count as 0)
        1: summed ``freqs[(token, 0.0)]`` (missing pairs count as 0)
        2: 1 if the token ``"no"`` is present, else 0
        3: number of tokens found in the module-level ``pronouns`` set
           (which lists third-person forms as well as 1st/2nd person)
        4: 1 if the raw tweet contains ``"!"``, else 0
        5: natural log of the token count, or 0 when there are no tokens
    """
    tokens = _tokenizer.tokenize(tweet)
    row = np.zeros((1, 6))

    # NOTE(review): build_freqs keys its table by process_tweet() tokens
    # (stemmed), while these lookups use raw tokens - confirm this mismatch
    # is intended before relying on columns 0 and 1.
    row[0, 0] = sum(freqs.get((token, 1.0), 0) for token in tokens)
    row[0, 1] = sum(freqs.get((token, 0.0), 0) for token in tokens)

    row[0, 2] = int("no" in tokens)
    row[0, 3] = len([token for token in tokens if token in pronouns])
    row[0, 4] = int("!" in tweet)
    if tokens:
        # Column 5 keeps its zero initialisation when the tweet has no tokens.
        row[0, 5] = np.log(len(tokens))

    return row
72
+
73
def build_freqs(tweets, ys):
    """Count how often each processed word occurs under each sentiment label.

    Args:
        tweets: List of raw tweet strings.
        ys: Array-like of sentiment labels (0 or 1), one per tweet; any
            shape is accepted (``m x 1``, ``(m,)``, ...) and flattened.

    Returns:
        dict mapping ``(word, label)`` to the number of occurrences of
        ``word`` (as produced by ``process_tweet``) in tweets carrying
        ``label``.
    """
    # np.ravel always yields a 1-D sequence. np.squeeze would collapse a
    # single-label input to a scalar, which zip() cannot iterate.
    labels = np.ravel(ys).tolist()
    freqs = {}
    for label, tweet in zip(labels, tweets):
        for word in process_tweet(tweet):
            pair = (word, label)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs
89
+
90
+
91
if __name__ == "__main__":
    # Quick smoke test for this module:
    #   - make sure the NLTK corpora are available
    #   - build the frequency table from twitter_samples
    #   - extract both feature variants for one sample tweet
    import nltk
    from nltk.corpus import twitter_samples

    # Download each corpus only if probing it raises LookupError. The probes
    # are wrapped in lambdas so the corpus access happens inside the try.
    for probe, resource in (
        (lambda: twitter_samples.fileids(), "twitter_samples"),
        (lambda: stopwords.words("english"), "stopwords"),
    ):
        try:
            probe()
        except LookupError:
            nltk.download(resource)

    # Positive tweets first, then negative, with matching 1/0 labels (m x 1).
    pos = twitter_samples.strings("positive_tweets.json")
    neg = twitter_samples.strings("negative_tweets.json")
    tweets = pos + neg
    y = np.array([1] * len(pos) + [0] * len(neg)).reshape(-1, 1)

    print(f"Tổng số tweet: {len(tweets)}")

    # Build the (word, sentiment) frequency table.
    freqs = build_freqs(tweets, y)
    print(f"Số cặp (word, sentiment): {len(freqs)}")

    # Inspect a single sample tweet.
    sample_tweet = tweets[0]
    print("\nTweet mẫu:", sample_tweet)
    print("Tokens (process_tweet):", process_tweet(sample_tweet))

    # Compute both feature rows before printing either of them.
    x2 = extract_features_2(sample_tweet, freqs)
    x6 = extract_features_6(sample_tweet, freqs)

    print("\nFeatures 2 chiều:", x2)
    print("Features 6 chiều:", x6)
inference_demo.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import numpy as np
3
+ from feature_extract import extract_features_2, extract_features_6
4
+
5
+ # ---- Load models + freqs ----
6
from pathlib import Path

# Resolve the bundle next to this module rather than against the process
# working directory, so importing the module from another directory works.
_MODELS_PATH = Path(__file__).resolve().parent / "demo_models.pkl"

# NOTE: pickle.load can execute arbitrary code from the file. That is only
# acceptable because demo_models.pkl ships with this project; never point
# this at an untrusted file.
with open(_MODELS_PATH, "rb") as f:
    data = pickle.load(f)

# The bundle is read with these three keys: the (word, label) frequency
# table and one dict of fitted models per feature version.
freqs = data["freqs"]
models_2f = data["2f"]
models_6f = data["6f"]
12
+
13
+ # ---- Helper functions ----
14
def _predict_2f(sentence: str, model_name: str) -> int:
    """Extract the 2-feature row for *sentence* and return the model's prediction as an int.

    ``model_name`` must be a key of ``models_2f``.
    """
    features = extract_features_2(sentence, freqs)
    model = models_2f[model_name]
    prediction = model.predict(features)
    return int(prediction[0])
18
+
19
def _predict_6f(sentence: str, model_name: str) -> int:
    """Extract the 6-feature row for *sentence* and return the model's prediction as an int.

    ``model_name`` must be a key of ``models_6f``.
    """
    features = extract_features_6(sentence, freqs)
    model = models_6f[model_name]
    prediction = model.predict(features)
    return int(prediction[0])
23
+
24
+ # 2-feature
25
def predict_randomforest_2f(sentence):
    """Predict with the 2-feature "Random Forest" model."""
    return _predict_2f(sentence, "Random Forest")


def predict_xgboost_2f(sentence):
    """Predict with the 2-feature "XGBoost" model."""
    return _predict_2f(sentence, "XGBoost")


def predict_lightgbm_2f(sentence):
    """Predict with the 2-feature "LightGBM" model."""
    return _predict_2f(sentence, "LightGBM")


def predict_svm_2f(sentence):
    """Predict with the 2-feature "SVM" model."""
    return _predict_2f(sentence, "SVM")


def predict_decisiontree_2f(sentence):
    """Predict with the 2-feature "Decision Tree" model."""
    return _predict_2f(sentence, "Decision Tree")


def predict_naivebayes_2f(sentence):
    """Predict with the 2-feature "Naive Bayes" model."""
    return _predict_2f(sentence, "Naive Bayes")
31
+
32
+ # 6-feature
33
def predict_randomforest_6f(sentence):
    """Predict with the 6-feature "Random Forest" model."""
    return _predict_6f(sentence, "Random Forest")


def predict_xgboost_6f(sentence):
    """Predict with the 6-feature "XGBoost" model."""
    return _predict_6f(sentence, "XGBoost")


def predict_lightgbm_6f(sentence):
    """Predict with the 6-feature "LightGBM" model."""
    return _predict_6f(sentence, "LightGBM")


def predict_svm_6f(sentence):
    """Predict with the 6-feature "SVM" model."""
    return _predict_6f(sentence, "SVM")


def predict_decisiontree_6f(sentence):
    """Predict with the 6-feature "Decision Tree" model."""
    return _predict_6f(sentence, "Decision Tree")


def predict_naivebayes_6f(sentence):
    """Predict with the 6-feature "Naive Bayes" model."""
    return _predict_6f(sentence, "Naive Bayes")
39
+
40
+ # ---- Test nhanh ----
41
if __name__ == "__main__":
    # Quick manual check: run one sentence through a few of the models.
    test_sentence = "I love this new phone!"
    for tag, predictor in (
        ("RandomForest 2f:", predict_randomforest_2f),
        ("RandomForest 6f:", predict_randomforest_6f),
        ("SVM 2f:", predict_svm_2f),
        ("SVM 6f:", predict_svm_6f),
    ):
        print(tag, predictor(test_sentence))
preds.csv ADDED
@@ -0,0 +1,2001 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ idx,pred
2
+ 0,0
3
+ 1,1
4
+ 2,1
5
+ 3,1
6
+ 4,1
7
+ 5,1
8
+ 6,1
9
+ 7,1
10
+ 8,1
11
+ 9,1
12
+ 10,1
13
+ 11,0
14
+ 12,1
15
+ 13,1
16
+ 14,0
17
+ 15,1
18
+ 16,1
19
+ 17,1
20
+ 18,1
21
+ 19,1
22
+ 20,1
23
+ 21,0
24
+ 22,1
25
+ 23,1
26
+ 24,1
27
+ 25,1
28
+ 26,1
29
+ 27,1
30
+ 28,1
31
+ 29,1
32
+ 30,1
33
+ 31,0
34
+ 32,1
35
+ 33,1
36
+ 34,1
37
+ 35,1
38
+ 36,1
39
+ 37,0
40
+ 38,1
41
+ 39,1
42
+ 40,1
43
+ 41,1
44
+ 42,1
45
+ 43,1
46
+ 44,1
47
+ 45,1
48
+ 46,1
49
+ 47,1
50
+ 48,1
51
+ 49,1
52
+ 50,1
53
+ 51,1
54
+ 52,1
55
+ 53,1
56
+ 54,0
57
+ 55,0
58
+ 56,1
59
+ 57,1
60
+ 58,1
61
+ 59,1
62
+ 60,1
63
+ 61,1
64
+ 62,1
65
+ 63,1
66
+ 64,1
67
+ 65,1
68
+ 66,1
69
+ 67,1
70
+ 68,1
71
+ 69,1
72
+ 70,0
73
+ 71,1
74
+ 72,1
75
+ 73,1
76
+ 74,1
77
+ 75,0
78
+ 76,1
79
+ 77,1
80
+ 78,1
81
+ 79,1
82
+ 80,1
83
+ 81,1
84
+ 82,1
85
+ 83,1
86
+ 84,0
87
+ 85,0
88
+ 86,1
89
+ 87,1
90
+ 88,1
91
+ 89,1
92
+ 90,1
93
+ 91,1
94
+ 92,1
95
+ 93,1
96
+ 94,1
97
+ 95,1
98
+ 96,1
99
+ 97,1
100
+ 98,1
101
+ 99,1
102
+ 100,1
103
+ 101,1
104
+ 102,1
105
+ 103,1
106
+ 104,1
107
+ 105,1
108
+ 106,1
109
+ 107,1
110
+ 108,1
111
+ 109,1
112
+ 110,1
113
+ 111,1
114
+ 112,1
115
+ 113,1
116
+ 114,1
117
+ 115,1
118
+ 116,1
119
+ 117,1
120
+ 118,1
121
+ 119,1
122
+ 120,1
123
+ 121,1
124
+ 122,1
125
+ 123,1
126
+ 124,1
127
+ 125,1
128
+ 126,1
129
+ 127,1
130
+ 128,1
131
+ 129,1
132
+ 130,1
133
+ 131,1
134
+ 132,1
135
+ 133,1
136
+ 134,1
137
+ 135,1
138
+ 136,1
139
+ 137,1
140
+ 138,1
141
+ 139,1
142
+ 140,1
143
+ 141,1
144
+ 142,1
145
+ 143,1
146
+ 144,1
147
+ 145,1
148
+ 146,1
149
+ 147,1
150
+ 148,1
151
+ 149,1
152
+ 150,0
153
+ 151,1
154
+ 152,1
155
+ 153,0
156
+ 154,1
157
+ 155,1
158
+ 156,1
159
+ 157,1
160
+ 158,1
161
+ 159,0
162
+ 160,1
163
+ 161,1
164
+ 162,1
165
+ 163,1
166
+ 164,1
167
+ 165,1
168
+ 166,1
169
+ 167,0
170
+ 168,1
171
+ 169,1
172
+ 170,1
173
+ 171,1
174
+ 172,1
175
+ 173,1
176
+ 174,1
177
+ 175,1
178
+ 176,0
179
+ 177,1
180
+ 178,1
181
+ 179,1
182
+ 180,1
183
+ 181,1
184
+ 182,1
185
+ 183,1
186
+ 184,1
187
+ 185,1
188
+ 186,1
189
+ 187,1
190
+ 188,1
191
+ 189,1
192
+ 190,1
193
+ 191,1
194
+ 192,1
195
+ 193,1
196
+ 194,1
197
+ 195,1
198
+ 196,1
199
+ 197,1
200
+ 198,1
201
+ 199,1
202
+ 200,1
203
+ 201,1
204
+ 202,1
205
+ 203,0
206
+ 204,0
207
+ 205,1
208
+ 206,1
209
+ 207,1
210
+ 208,1
211
+ 209,1
212
+ 210,1
213
+ 211,1
214
+ 212,1
215
+ 213,0
216
+ 214,1
217
+ 215,1
218
+ 216,1
219
+ 217,1
220
+ 218,1
221
+ 219,1
222
+ 220,1
223
+ 221,1
224
+ 222,0
225
+ 223,1
226
+ 224,1
227
+ 225,1
228
+ 226,1
229
+ 227,1
230
+ 228,1
231
+ 229,0
232
+ 230,0
233
+ 231,1
234
+ 232,1
235
+ 233,1
236
+ 234,1
237
+ 235,1
238
+ 236,1
239
+ 237,1
240
+ 238,1
241
+ 239,1
242
+ 240,1
243
+ 241,1
244
+ 242,1
245
+ 243,1
246
+ 244,1
247
+ 245,1
248
+ 246,1
249
+ 247,1
250
+ 248,1
251
+ 249,0
252
+ 250,1
253
+ 251,1
254
+ 252,1
255
+ 253,1
256
+ 254,1
257
+ 255,1
258
+ 256,1
259
+ 257,1
260
+ 258,1
261
+ 259,1
262
+ 260,1
263
+ 261,1
264
+ 262,1
265
+ 263,1
266
+ 264,1
267
+ 265,1
268
+ 266,1
269
+ 267,1
270
+ 268,0
271
+ 269,1
272
+ 270,1
273
+ 271,1
274
+ 272,1
275
+ 273,1
276
+ 274,1
277
+ 275,1
278
+ 276,1
279
+ 277,1
280
+ 278,1
281
+ 279,1
282
+ 280,1
283
+ 281,1
284
+ 282,1
285
+ 283,1
286
+ 284,1
287
+ 285,1
288
+ 286,1
289
+ 287,1
290
+ 288,1
291
+ 289,1
292
+ 290,1
293
+ 291,1
294
+ 292,1
295
+ 293,1
296
+ 294,1
297
+ 295,1
298
+ 296,0
299
+ 297,1
300
+ 298,1
301
+ 299,1
302
+ 300,1
303
+ 301,1
304
+ 302,1
305
+ 303,1
306
+ 304,1
307
+ 305,1
308
+ 306,1
309
+ 307,1
310
+ 308,1
311
+ 309,1
312
+ 310,1
313
+ 311,1
314
+ 312,1
315
+ 313,1
316
+ 314,1
317
+ 315,1
318
+ 316,1
319
+ 317,1
320
+ 318,1
321
+ 319,1
322
+ 320,1
323
+ 321,1
324
+ 322,1
325
+ 323,1
326
+ 324,1
327
+ 325,1
328
+ 326,1
329
+ 327,1
330
+ 328,1
331
+ 329,0
332
+ 330,1
333
+ 331,1
334
+ 332,1
335
+ 333,1
336
+ 334,1
337
+ 335,1
338
+ 336,1
339
+ 337,1
340
+ 338,1
341
+ 339,1
342
+ 340,1
343
+ 341,1
344
+ 342,1
345
+ 343,1
346
+ 344,0
347
+ 345,1
348
+ 346,1
349
+ 347,1
350
+ 348,1
351
+ 349,1
352
+ 350,1
353
+ 351,1
354
+ 352,1
355
+ 353,1
356
+ 354,1
357
+ 355,1
358
+ 356,1
359
+ 357,1
360
+ 358,0
361
+ 359,0
362
+ 360,1
363
+ 361,1
364
+ 362,1
365
+ 363,1
366
+ 364,0
367
+ 365,1
368
+ 366,1
369
+ 367,0
370
+ 368,1
371
+ 369,1
372
+ 370,1
373
+ 371,1
374
+ 372,1
375
+ 373,1
376
+ 374,1
377
+ 375,1
378
+ 376,1
379
+ 377,1
380
+ 378,1
381
+ 379,1
382
+ 380,1
383
+ 381,1
384
+ 382,1
385
+ 383,1
386
+ 384,1
387
+ 385,1
388
+ 386,1
389
+ 387,1
390
+ 388,1
391
+ 389,0
392
+ 390,1
393
+ 391,1
394
+ 392,1
395
+ 393,1
396
+ 394,0
397
+ 395,1
398
+ 396,1
399
+ 397,0
400
+ 398,1
401
+ 399,1
402
+ 400,1
403
+ 401,1
404
+ 402,1
405
+ 403,1
406
+ 404,1
407
+ 405,1
408
+ 406,1
409
+ 407,0
410
+ 408,1
411
+ 409,1
412
+ 410,1
413
+ 411,1
414
+ 412,1
415
+ 413,1
416
+ 414,1
417
+ 415,1
418
+ 416,1
419
+ 417,1
420
+ 418,0
421
+ 419,0
422
+ 420,1
423
+ 421,0
424
+ 422,1
425
+ 423,1
426
+ 424,1
427
+ 425,0
428
+ 426,1
429
+ 427,1
430
+ 428,1
431
+ 429,0
432
+ 430,1
433
+ 431,1
434
+ 432,1
435
+ 433,1
436
+ 434,1
437
+ 435,1
438
+ 436,1
439
+ 437,1
440
+ 438,0
441
+ 439,1
442
+ 440,1
443
+ 441,1
444
+ 442,1
445
+ 443,1
446
+ 444,0
447
+ 445,0
448
+ 446,1
449
+ 447,1
450
+ 448,1
451
+ 449,1
452
+ 450,1
453
+ 451,1
454
+ 452,1
455
+ 453,1
456
+ 454,1
457
+ 455,1
458
+ 456,0
459
+ 457,1
460
+ 458,1
461
+ 459,1
462
+ 460,1
463
+ 461,0
464
+ 462,1
465
+ 463,1
466
+ 464,1
467
+ 465,0
468
+ 466,1
469
+ 467,1
470
+ 468,1
471
+ 469,1
472
+ 470,1
473
+ 471,1
474
+ 472,1
475
+ 473,1
476
+ 474,0
477
+ 475,1
478
+ 476,0
479
+ 477,1
480
+ 478,1
481
+ 479,1
482
+ 480,1
483
+ 481,1
484
+ 482,1
485
+ 483,1
486
+ 484,1
487
+ 485,1
488
+ 486,1
489
+ 487,0
490
+ 488,1
491
+ 489,1
492
+ 490,1
493
+ 491,1
494
+ 492,1
495
+ 493,1
496
+ 494,1
497
+ 495,1
498
+ 496,1
499
+ 497,1
500
+ 498,1
501
+ 499,1
502
+ 500,1
503
+ 501,1
504
+ 502,1
505
+ 503,0
506
+ 504,1
507
+ 505,1
508
+ 506,1
509
+ 507,1
510
+ 508,1
511
+ 509,1
512
+ 510,1
513
+ 511,0
514
+ 512,1
515
+ 513,1
516
+ 514,1
517
+ 515,1
518
+ 516,1
519
+ 517,1
520
+ 518,1
521
+ 519,0
522
+ 520,1
523
+ 521,1
524
+ 522,1
525
+ 523,1
526
+ 524,1
527
+ 525,1
528
+ 526,1
529
+ 527,1
530
+ 528,1
531
+ 529,1
532
+ 530,1
533
+ 531,1
534
+ 532,0
535
+ 533,1
536
+ 534,1
537
+ 535,1
538
+ 536,1
539
+ 537,1
540
+ 538,1
541
+ 539,1
542
+ 540,1
543
+ 541,1
544
+ 542,1
545
+ 543,1
546
+ 544,1
547
+ 545,1
548
+ 546,1
549
+ 547,0
550
+ 548,1
551
+ 549,1
552
+ 550,1
553
+ 551,1
554
+ 552,1
555
+ 553,0
556
+ 554,1
557
+ 555,0
558
+ 556,1
559
+ 557,1
560
+ 558,0
561
+ 559,0
562
+ 560,1
563
+ 561,1
564
+ 562,1
565
+ 563,1
566
+ 564,1
567
+ 565,1
568
+ 566,1
569
+ 567,1
570
+ 568,1
571
+ 569,1
572
+ 570,0
573
+ 571,1
574
+ 572,1
575
+ 573,1
576
+ 574,1
577
+ 575,1
578
+ 576,1
579
+ 577,0
580
+ 578,1
581
+ 579,1
582
+ 580,1
583
+ 581,1
584
+ 582,1
585
+ 583,1
586
+ 584,1
587
+ 585,1
588
+ 586,1
589
+ 587,0
590
+ 588,1
591
+ 589,1
592
+ 590,1
593
+ 591,1
594
+ 592,1
595
+ 593,1
596
+ 594,1
597
+ 595,1
598
+ 596,1
599
+ 597,1
600
+ 598,1
601
+ 599,1
602
+ 600,1
603
+ 601,1
604
+ 602,1
605
+ 603,1
606
+ 604,1
607
+ 605,1
608
+ 606,1
609
+ 607,1
610
+ 608,1
611
+ 609,1
612
+ 610,1
613
+ 611,1
614
+ 612,1
615
+ 613,1
616
+ 614,1
617
+ 615,1
618
+ 616,1
619
+ 617,1
620
+ 618,1
621
+ 619,1
622
+ 620,1
623
+ 621,1
624
+ 622,1
625
+ 623,1
626
+ 624,1
627
+ 625,1
628
+ 626,1
629
+ 627,1
630
+ 628,1
631
+ 629,0
632
+ 630,1
633
+ 631,1
634
+ 632,1
635
+ 633,1
636
+ 634,1
637
+ 635,1
638
+ 636,1
639
+ 637,1
640
+ 638,1
641
+ 639,1
642
+ 640,1
643
+ 641,1
644
+ 642,1
645
+ 643,1
646
+ 644,1
647
+ 645,1
648
+ 646,1
649
+ 647,1
650
+ 648,1
651
+ 649,1
652
+ 650,1
653
+ 651,1
654
+ 652,1
655
+ 653,1
656
+ 654,1
657
+ 655,1
658
+ 656,1
659
+ 657,1
660
+ 658,1
661
+ 659,1
662
+ 660,1
663
+ 661,1
664
+ 662,1
665
+ 663,1
666
+ 664,1
667
+ 665,1
668
+ 666,1
669
+ 667,1
670
+ 668,1
671
+ 669,1
672
+ 670,1
673
+ 671,1
674
+ 672,1
675
+ 673,1
676
+ 674,1
677
+ 675,0
678
+ 676,1
679
+ 677,1
680
+ 678,1
681
+ 679,1
682
+ 680,1
683
+ 681,1
684
+ 682,1
685
+ 683,1
686
+ 684,1
687
+ 685,1
688
+ 686,1
689
+ 687,1
690
+ 688,1
691
+ 689,1
692
+ 690,1
693
+ 691,1
694
+ 692,1
695
+ 693,1
696
+ 694,1
697
+ 695,1
698
+ 696,1
699
+ 697,0
700
+ 698,1
701
+ 699,1
702
+ 700,1
703
+ 701,1
704
+ 702,1
705
+ 703,1
706
+ 704,1
707
+ 705,1
708
+ 706,1
709
+ 707,1
710
+ 708,1
711
+ 709,1
712
+ 710,1
713
+ 711,1
714
+ 712,1
715
+ 713,0
716
+ 714,1
717
+ 715,1
718
+ 716,1
719
+ 717,1
720
+ 718,1
721
+ 719,1
722
+ 720,1
723
+ 721,1
724
+ 722,1
725
+ 723,1
726
+ 724,1
727
+ 725,1
728
+ 726,1
729
+ 727,1
730
+ 728,1
731
+ 729,1
732
+ 730,1
733
+ 731,1
734
+ 732,1
735
+ 733,1
736
+ 734,0
737
+ 735,1
738
+ 736,0
739
+ 737,1
740
+ 738,1
741
+ 739,1
742
+ 740,1
743
+ 741,1
744
+ 742,1
745
+ 743,1
746
+ 744,0
747
+ 745,1
748
+ 746,1
749
+ 747,1
750
+ 748,1
751
+ 749,1
752
+ 750,1
753
+ 751,1
754
+ 752,1
755
+ 753,1
756
+ 754,1
757
+ 755,1
758
+ 756,1
759
+ 757,1
760
+ 758,0
761
+ 759,1
762
+ 760,1
763
+ 761,0
764
+ 762,1
765
+ 763,1
766
+ 764,1
767
+ 765,1
768
+ 766,1
769
+ 767,1
770
+ 768,1
771
+ 769,1
772
+ 770,1
773
+ 771,1
774
+ 772,1
775
+ 773,1
776
+ 774,1
777
+ 775,1
778
+ 776,1
779
+ 777,1
780
+ 778,1
781
+ 779,1
782
+ 780,1
783
+ 781,1
784
+ 782,1
785
+ 783,1
786
+ 784,1
787
+ 785,1
788
+ 786,1
789
+ 787,0
790
+ 788,1
791
+ 789,1
792
+ 790,1
793
+ 791,1
794
+ 792,1
795
+ 793,1
796
+ 794,0
797
+ 795,1
798
+ 796,1
799
+ 797,1
800
+ 798,1
801
+ 799,0
802
+ 800,1
803
+ 801,1
804
+ 802,1
805
+ 803,1
806
+ 804,1
807
+ 805,1
808
+ 806,1
809
+ 807,1
810
+ 808,1
811
+ 809,0
812
+ 810,1
813
+ 811,1
814
+ 812,1
815
+ 813,1
816
+ 814,1
817
+ 815,1
818
+ 816,1
819
+ 817,1
820
+ 818,1
821
+ 819,1
822
+ 820,1
823
+ 821,1
824
+ 822,1
825
+ 823,1
826
+ 824,1
827
+ 825,1
828
+ 826,1
829
+ 827,1
830
+ 828,1
831
+ 829,1
832
+ 830,1
833
+ 831,1
834
+ 832,1
835
+ 833,1
836
+ 834,1
837
+ 835,1
838
+ 836,1
839
+ 837,1
840
+ 838,1
841
+ 839,1
842
+ 840,1
843
+ 841,1
844
+ 842,1
845
+ 843,1
846
+ 844,0
847
+ 845,0
848
+ 846,1
849
+ 847,1
850
+ 848,1
851
+ 849,1
852
+ 850,1
853
+ 851,1
854
+ 852,1
855
+ 853,1
856
+ 854,1
857
+ 855,1
858
+ 856,1
859
+ 857,1
860
+ 858,1
861
+ 859,1
862
+ 860,1
863
+ 861,1
864
+ 862,1
865
+ 863,0
866
+ 864,1
867
+ 865,1
868
+ 866,1
869
+ 867,1
870
+ 868,1
871
+ 869,0
872
+ 870,1
873
+ 871,1
874
+ 872,1
875
+ 873,1
876
+ 874,1
877
+ 875,1
878
+ 876,1
879
+ 877,1
880
+ 878,1
881
+ 879,1
882
+ 880,1
883
+ 881,1
884
+ 882,1
885
+ 883,1
886
+ 884,1
887
+ 885,1
888
+ 886,1
889
+ 887,1
890
+ 888,1
891
+ 889,1
892
+ 890,1
893
+ 891,1
894
+ 892,1
895
+ 893,1
896
+ 894,1
897
+ 895,1
898
+ 896,1
899
+ 897,1
900
+ 898,1
901
+ 899,1
902
+ 900,1
903
+ 901,1
904
+ 902,1
905
+ 903,1
906
+ 904,1
907
+ 905,1
908
+ 906,1
909
+ 907,1
910
+ 908,1
911
+ 909,1
912
+ 910,1
913
+ 911,1
914
+ 912,1
915
+ 913,1
916
+ 914,1
917
+ 915,1
918
+ 916,1
919
+ 917,1
920
+ 918,1
921
+ 919,1
922
+ 920,1
923
+ 921,0
924
+ 922,1
925
+ 923,0
926
+ 924,1
927
+ 925,1
928
+ 926,1
929
+ 927,1
930
+ 928,1
931
+ 929,1
932
+ 930,1
933
+ 931,1
934
+ 932,1
935
+ 933,1
936
+ 934,1
937
+ 935,1
938
+ 936,1
939
+ 937,0
940
+ 938,1
941
+ 939,1
942
+ 940,1
943
+ 941,1
944
+ 942,1
945
+ 943,0
946
+ 944,0
947
+ 945,1
948
+ 946,1
949
+ 947,1
950
+ 948,1
951
+ 949,1
952
+ 950,1
953
+ 951,0
954
+ 952,1
955
+ 953,1
956
+ 954,1
957
+ 955,1
958
+ 956,1
959
+ 957,1
960
+ 958,1
961
+ 959,1
962
+ 960,1
963
+ 961,1
964
+ 962,1
965
+ 963,1
966
+ 964,1
967
+ 965,1
968
+ 966,1
969
+ 967,1
970
+ 968,1
971
+ 969,0
972
+ 970,0
973
+ 971,1
974
+ 972,1
975
+ 973,1
976
+ 974,1
977
+ 975,1
978
+ 976,1
979
+ 977,1
980
+ 978,1
981
+ 979,1
982
+ 980,1
983
+ 981,1
984
+ 982,1
985
+ 983,1
986
+ 984,1
987
+ 985,1
988
+ 986,0
989
+ 987,1
990
+ 988,1
991
+ 989,1
992
+ 990,1
993
+ 991,1
994
+ 992,1
995
+ 993,1
996
+ 994,0
997
+ 995,1
998
+ 996,1
999
+ 997,1
1000
+ 998,1
1001
+ 999,0
1002
+ 1000,0
1003
+ 1001,0
1004
+ 1002,0
1005
+ 1003,0
1006
+ 1004,0
1007
+ 1005,0
1008
+ 1006,0
1009
+ 1007,0
1010
+ 1008,0
1011
+ 1009,0
1012
+ 1010,0
1013
+ 1011,0
1014
+ 1012,0
1015
+ 1013,0
1016
+ 1014,0
1017
+ 1015,0
1018
+ 1016,0
1019
+ 1017,0
1020
+ 1018,0
1021
+ 1019,0
1022
+ 1020,0
1023
+ 1021,0
1024
+ 1022,0
1025
+ 1023,1
1026
+ 1024,0
1027
+ 1025,0
1028
+ 1026,0
1029
+ 1027,0
1030
+ 1028,0
1031
+ 1029,0
1032
+ 1030,0
1033
+ 1031,0
1034
+ 1032,0
1035
+ 1033,0
1036
+ 1034,0
1037
+ 1035,1
1038
+ 1036,0
1039
+ 1037,0
1040
+ 1038,0
1041
+ 1039,0
1042
+ 1040,0
1043
+ 1041,0
1044
+ 1042,0
1045
+ 1043,0
1046
+ 1044,0
1047
+ 1045,0
1048
+ 1046,0
1049
+ 1047,0
1050
+ 1048,0
1051
+ 1049,0
1052
+ 1050,0
1053
+ 1051,0
1054
+ 1052,0
1055
+ 1053,0
1056
+ 1054,0
1057
+ 1055,0
1058
+ 1056,0
1059
+ 1057,0
1060
+ 1058,0
1061
+ 1059,0
1062
+ 1060,0
1063
+ 1061,0
1064
+ 1062,0
1065
+ 1063,0
1066
+ 1064,0
1067
+ 1065,0
1068
+ 1066,0
1069
+ 1067,0
1070
+ 1068,0
1071
+ 1069,0
1072
+ 1070,0
1073
+ 1071,0
1074
+ 1072,0
1075
+ 1073,0
1076
+ 1074,0
1077
+ 1075,0
1078
+ 1076,0
1079
+ 1077,0
1080
+ 1078,0
1081
+ 1079,0
1082
+ 1080,0
1083
+ 1081,0
1084
+ 1082,0
1085
+ 1083,0
1086
+ 1084,0
1087
+ 1085,0
1088
+ 1086,0
1089
+ 1087,0
1090
+ 1088,0
1091
+ 1089,0
1092
+ 1090,0
1093
+ 1091,0
1094
+ 1092,0
1095
+ 1093,0
1096
+ 1094,0
1097
+ 1095,0
1098
+ 1096,0
1099
+ 1097,0
1100
+ 1098,0
1101
+ 1099,0
1102
+ 1100,0
1103
+ 1101,0
1104
+ 1102,0
1105
+ 1103,0
1106
+ 1104,0
1107
+ 1105,0
1108
+ 1106,0
1109
+ 1107,0
1110
+ 1108,0
1111
+ 1109,0
1112
+ 1110,0
1113
+ 1111,0
1114
+ 1112,0
1115
+ 1113,0
1116
+ 1114,0
1117
+ 1115,0
1118
+ 1116,0
1119
+ 1117,0
1120
+ 1118,0
1121
+ 1119,0
1122
+ 1120,0
1123
+ 1121,0
1124
+ 1122,0
1125
+ 1123,0
1126
+ 1124,0
1127
+ 1125,0
1128
+ 1126,0
1129
+ 1127,0
1130
+ 1128,0
1131
+ 1129,0
1132
+ 1130,0
1133
+ 1131,0
1134
+ 1132,0
1135
+ 1133,0
1136
+ 1134,0
1137
+ 1135,0
1138
+ 1136,0
1139
+ 1137,0
1140
+ 1138,0
1141
+ 1139,0
1142
+ 1140,0
1143
+ 1141,0
1144
+ 1142,0
1145
+ 1143,0
1146
+ 1144,0
1147
+ 1145,0
1148
+ 1146,0
1149
+ 1147,0
1150
+ 1148,0
1151
+ 1149,0
1152
+ 1150,0
1153
+ 1151,0
1154
+ 1152,0
1155
+ 1153,0
1156
+ 1154,0
1157
+ 1155,0
1158
+ 1156,0
1159
+ 1157,0
1160
+ 1158,0
1161
+ 1159,0
1162
+ 1160,0
1163
+ 1161,1
1164
+ 1162,0
1165
+ 1163,0
1166
+ 1164,0
1167
+ 1165,0
1168
+ 1166,0
1169
+ 1167,0
1170
+ 1168,0
1171
+ 1169,0
1172
+ 1170,0
1173
+ 1171,0
1174
+ 1172,0
1175
+ 1173,0
1176
+ 1174,0
1177
+ 1175,0
1178
+ 1176,1
1179
+ 1177,0
1180
+ 1178,0
1181
+ 1179,0
1182
+ 1180,0
1183
+ 1181,0
1184
+ 1182,0
1185
+ 1183,0
1186
+ 1184,0
1187
+ 1185,0
1188
+ 1186,0
1189
+ 1187,0
1190
+ 1188,0
1191
+ 1189,0
1192
+ 1190,0
1193
+ 1191,0
1194
+ 1192,0
1195
+ 1193,0
1196
+ 1194,0
1197
+ 1195,0
1198
+ 1196,0
1199
+ 1197,0
1200
+ 1198,0
1201
+ 1199,0
1202
+ 1200,0
1203
+ 1201,0
1204
+ 1202,0
1205
+ 1203,0
1206
+ 1204,0
1207
+ 1205,0
1208
+ 1206,0
1209
+ 1207,0
1210
+ 1208,0
1211
+ 1209,0
1212
+ 1210,0
1213
+ 1211,0
1214
+ 1212,0
1215
+ 1213,0
1216
+ 1214,0
1217
+ 1215,0
1218
+ 1216,0
1219
+ 1217,0
1220
+ 1218,0
1221
+ 1219,0
1222
+ 1220,0
1223
+ 1221,0
1224
+ 1222,0
1225
+ 1223,0
1226
+ 1224,0
1227
+ 1225,0
1228
+ 1226,0
1229
+ 1227,0
1230
+ 1228,0
1231
+ 1229,0
1232
+ 1230,0
1233
+ 1231,0
1234
+ 1232,0
1235
+ 1233,0
1236
+ 1234,0
1237
+ 1235,0
1238
+ 1236,0
1239
+ 1237,0
1240
+ 1238,0
1241
+ 1239,0
1242
+ 1240,0
1243
+ 1241,0
1244
+ 1242,0
1245
+ 1243,0
1246
+ 1244,0
1247
+ 1245,0
1248
+ 1246,0
1249
+ 1247,0
1250
+ 1248,0
1251
+ 1249,0
1252
+ 1250,0
1253
+ 1251,0
1254
+ 1252,0
1255
+ 1253,1
1256
+ 1254,0
1257
+ 1255,0
1258
+ 1256,0
1259
+ 1257,0
1260
+ 1258,0
1261
+ 1259,0
1262
+ 1260,0
1263
+ 1261,0
1264
+ 1262,0
1265
+ 1263,0
1266
+ 1264,0
1267
+ 1265,0
1268
+ 1266,0
1269
+ 1267,0
1270
+ 1268,0
1271
+ 1269,0
1272
+ 1270,0
1273
+ 1271,0
1274
+ 1272,0
1275
+ 1273,0
1276
+ 1274,0
1277
+ 1275,0
1278
+ 1276,0
1279
+ 1277,0
1280
+ 1278,0
1281
+ 1279,0
1282
+ 1280,0
1283
+ 1281,0
1284
+ 1282,0
1285
+ 1283,0
1286
+ 1284,0
1287
+ 1285,0
1288
+ 1286,0
1289
+ 1287,0
1290
+ 1288,0
1291
+ 1289,0
1292
+ 1290,0
1293
+ 1291,0
1294
+ 1292,0
1295
+ 1293,0
1296
+ 1294,0
1297
+ 1295,0
1298
+ 1296,0
1299
+ 1297,0
1300
+ 1298,0
1301
+ 1299,0
1302
+ 1300,0
1303
+ 1301,0
1304
+ 1302,0
1305
+ 1303,0
1306
+ 1304,0
1307
+ 1305,0
1308
+ 1306,0
1309
+ 1307,0
1310
+ 1308,0
1311
+ 1309,0
1312
+ 1310,0
1313
+ 1311,0
1314
+ 1312,0
1315
+ 1313,0
1316
+ 1314,0
1317
+ 1315,0
1318
+ 1316,0
1319
+ 1317,0
1320
+ 1318,0
1321
+ 1319,0
1322
+ 1320,0
1323
+ 1321,0
1324
+ 1322,0
1325
+ 1323,0
1326
+ 1324,0
1327
+ 1325,0
1328
+ 1326,0
1329
+ 1327,0
1330
+ 1328,0
1331
+ 1329,0
1332
+ 1330,0
1333
+ 1331,0
1334
+ 1332,0
1335
+ 1333,0
1336
+ 1334,0
1337
+ 1335,0
1338
+ 1336,0
1339
+ 1337,0
1340
+ 1338,0
1341
+ 1339,0
1342
+ 1340,0
1343
+ 1341,0
1344
+ 1342,0
1345
+ 1343,0
1346
+ 1344,0
1347
+ 1345,0
1348
+ 1346,0
1349
+ 1347,0
1350
+ 1348,0
1351
+ 1349,0
1352
+ 1350,0
1353
+ 1351,0
1354
+ 1352,0
1355
+ 1353,0
1356
+ 1354,0
1357
+ 1355,0
1358
+ 1356,0
1359
+ 1357,0
1360
+ 1358,0
1361
+ 1359,0
1362
+ 1360,0
1363
+ 1361,0
1364
+ 1362,0
1365
+ 1363,0
1366
+ 1364,0
1367
+ 1365,0
1368
+ 1366,0
1369
+ 1367,0
1370
+ 1368,0
1371
+ 1369,0
1372
+ 1370,0
1373
+ 1371,0
1374
+ 1372,0
1375
+ 1373,0
1376
+ 1374,0
1377
+ 1375,0
1378
+ 1376,0
1379
+ 1377,0
1380
+ 1378,0
1381
+ 1379,0
1382
+ 1380,0
1383
+ 1381,0
1384
+ 1382,0
1385
+ 1383,0
1386
+ 1384,0
1387
+ 1385,0
1388
+ 1386,0
1389
+ 1387,0
1390
+ 1388,0
1391
+ 1389,0
1392
+ 1390,0
1393
+ 1391,0
1394
+ 1392,0
1395
+ 1393,0
1396
+ 1394,0
1397
+ 1395,0
1398
+ 1396,0
1399
+ 1397,0
1400
+ 1398,0
1401
+ 1399,0
1402
+ 1400,0
1403
+ 1401,0
1404
+ 1402,0
1405
+ 1403,0
1406
+ 1404,0
1407
+ 1405,0
1408
+ 1406,0
1409
+ 1407,0
1410
+ 1408,0
1411
+ 1409,0
1412
+ 1410,0
1413
+ 1411,0
1414
+ 1412,0
1415
+ 1413,0
1416
+ 1414,0
1417
+ 1415,0
1418
+ 1416,0
1419
+ 1417,0
1420
+ 1418,0
1421
+ 1419,0
1422
+ 1420,0
1423
+ 1421,0
1424
+ 1422,0
1425
+ 1423,0
1426
+ 1424,0
1427
+ 1425,0
1428
+ 1426,0
1429
+ 1427,0
1430
+ 1428,0
1431
+ 1429,0
1432
+ 1430,0
1433
+ 1431,0
1434
+ 1432,0
1435
+ 1433,0
1436
+ 1434,0
1437
+ 1435,0
1438
+ 1436,0
1439
+ 1437,0
1440
+ 1438,0
1441
+ 1439,0
1442
+ 1440,0
1443
+ 1441,0
1444
+ 1442,0
1445
+ 1443,0
1446
+ 1444,0
1447
+ 1445,0
1448
+ 1446,0
1449
+ 1447,0
1450
+ 1448,0
1451
+ 1449,0
1452
+ 1450,0
1453
+ 1451,0
1454
+ 1452,0
1455
+ 1453,0
1456
+ 1454,0
1457
+ 1455,0
1458
+ 1456,0
1459
+ 1457,0
1460
+ 1458,0
1461
+ 1459,0
1462
+ 1460,0
1463
+ 1461,0
1464
+ 1462,0
1465
+ 1463,0
1466
+ 1464,0
1467
+ 1465,0
1468
+ 1466,0
1469
+ 1467,0
1470
+ 1468,0
1471
+ 1469,0
1472
+ 1470,0
1473
+ 1471,0
1474
+ 1472,0
1475
+ 1473,0
1476
+ 1474,0
1477
+ 1475,0
1478
+ 1476,0
1479
+ 1477,0
1480
+ 1478,0
1481
+ 1479,0
1482
+ 1480,0
1483
+ 1481,0
1484
+ 1482,0
1485
+ 1483,0
1486
+ 1484,0
1487
+ 1485,0
1488
+ 1486,0
1489
+ 1487,0
1490
+ 1488,0
1491
+ 1489,0
1492
+ 1490,0
1493
+ 1491,0
1494
+ 1492,0
1495
+ 1493,0
1496
+ 1494,0
1497
+ 1495,0
1498
+ 1496,0
1499
+ 1497,0
1500
+ 1498,0
1501
+ 1499,0
1502
+ 1500,0
1503
+ 1501,0
1504
+ 1502,0
1505
+ 1503,0
1506
+ 1504,0
1507
+ 1505,0
1508
+ 1506,0
1509
+ 1507,0
1510
+ 1508,0
1511
+ 1509,0
1512
+ 1510,0
1513
+ 1511,0
1514
+ 1512,0
1515
+ 1513,0
1516
+ 1514,0
1517
+ 1515,0
1518
+ 1516,0
1519
+ 1517,0
1520
+ 1518,0
1521
+ 1519,0
1522
+ 1520,0
1523
+ 1521,0
1524
+ 1522,0
1525
+ 1523,0
1526
+ 1524,0
1527
+ 1525,0
1528
+ 1526,0
1529
+ 1527,1
1530
+ 1528,0
1531
+ 1529,0
1532
+ 1530,0
1533
+ 1531,0
1534
+ 1532,0
1535
+ 1533,0
1536
+ 1534,0
1537
+ 1535,0
1538
+ 1536,0
1539
+ 1537,1
1540
+ 1538,0
1541
+ 1539,0
1542
+ 1540,0
1543
+ 1541,0
1544
+ 1542,0
1545
+ 1543,0
1546
+ 1544,0
1547
+ 1545,0
1548
+ 1546,0
1549
+ 1547,0
1550
+ 1548,0
1551
+ 1549,0
1552
+ 1550,0
1553
+ 1551,0
1554
+ 1552,0
1555
+ 1553,0
1556
+ 1554,0
1557
+ 1555,0
1558
+ 1556,0
1559
+ 1557,0
1560
+ 1558,0
1561
+ 1559,0
1562
+ 1560,0
1563
+ 1561,0
1564
+ 1562,0
1565
+ 1563,0
1566
+ 1564,0
1567
+ 1565,0
1568
+ 1566,0
1569
+ 1567,0
1570
+ 1568,0
1571
+ 1569,0
1572
+ 1570,0
1573
+ 1571,0
1574
+ 1572,0
1575
+ 1573,0
1576
+ 1574,0
1577
+ 1575,0
1578
+ 1576,0
1579
+ 1577,0
1580
+ 1578,0
1581
+ 1579,0
1582
+ 1580,1
1583
+ 1581,0
1584
+ 1582,0
1585
+ 1583,0
1586
+ 1584,0
1587
+ 1585,0
1588
+ 1586,0
1589
+ 1587,0
1590
+ 1588,0
1591
+ 1589,0
1592
+ 1590,0
1593
+ 1591,0
1594
+ 1592,0
1595
+ 1593,0
1596
+ 1594,0
1597
+ 1595,0
1598
+ 1596,0
1599
+ 1597,0
1600
+ 1598,0
1601
+ 1599,0
1602
+ 1600,0
1603
+ 1601,0
1604
+ 1602,0
1605
+ 1603,0
1606
+ 1604,0
1607
+ 1605,0
1608
+ 1606,0
1609
+ 1607,0
1610
+ 1608,0
1611
+ 1609,0
1612
+ 1610,0
1613
+ 1611,0
1614
+ 1612,0
1615
+ 1613,0
1616
+ 1614,0
1617
+ 1615,0
1618
+ 1616,0
1619
+ 1617,0
1620
+ 1618,0
1621
+ 1619,0
1622
+ 1620,0
1623
+ 1621,0
1624
+ 1622,0
1625
+ 1623,0
1626
+ 1624,0
1627
+ 1625,0
1628
+ 1626,0
1629
+ 1627,0
1630
+ 1628,0
1631
+ 1629,0
1632
+ 1630,0
1633
+ 1631,0
1634
+ 1632,0
1635
+ 1633,0
1636
+ 1634,0
1637
+ 1635,0
1638
+ 1636,1
1639
+ 1637,0
1640
+ 1638,0
1641
+ 1639,0
1642
+ 1640,0
1643
+ 1641,0
1644
+ 1642,0
1645
+ 1643,0
1646
+ 1644,0
1647
+ 1645,0
1648
+ 1646,0
1649
+ 1647,0
1650
+ 1648,0
1651
+ 1649,0
1652
+ 1650,0
1653
+ 1651,0
1654
+ 1652,0
1655
+ 1653,0
1656
+ 1654,0
1657
+ 1655,0
1658
+ 1656,0
1659
+ 1657,0
1660
+ 1658,0
1661
+ 1659,0
1662
+ 1660,0
1663
+ 1661,0
1664
+ 1662,0
1665
+ 1663,0
1666
+ 1664,0
1667
+ 1665,0
1668
+ 1666,0
1669
+ 1667,0
1670
+ 1668,0
1671
+ 1669,0
1672
+ 1670,0
1673
+ 1671,0
1674
+ 1672,0
1675
+ 1673,0
1676
+ 1674,0
1677
+ 1675,0
1678
+ 1676,0
1679
+ 1677,0
1680
+ 1678,0
1681
+ 1679,0
1682
+ 1680,0
1683
+ 1681,0
1684
+ 1682,0
1685
+ 1683,0
1686
+ 1684,0
1687
+ 1685,0
1688
+ 1686,0
1689
+ 1687,0
1690
+ 1688,0
1691
+ 1689,0
1692
+ 1690,0
1693
+ 1691,0
1694
+ 1692,0
1695
+ 1693,0
1696
+ 1694,0
1697
+ 1695,0
1698
+ 1696,0
1699
+ 1697,0
1700
+ 1698,0
1701
+ 1699,0
1702
+ 1700,0
1703
+ 1701,0
1704
+ 1702,0
1705
+ 1703,0
1706
+ 1704,0
1707
+ 1705,0
1708
+ 1706,0
1709
+ 1707,0
1710
+ 1708,0
1711
+ 1709,0
1712
+ 1710,0
1713
+ 1711,0
1714
+ 1712,0
1715
+ 1713,0
1716
+ 1714,0
1717
+ 1715,0
1718
+ 1716,0
1719
+ 1717,0
1720
+ 1718,0
1721
+ 1719,0
1722
+ 1720,0
1723
+ 1721,0
1724
+ 1722,0
1725
+ 1723,0
1726
+ 1724,0
1727
+ 1725,0
1728
+ 1726,0
1729
+ 1727,0
1730
+ 1728,0
1731
+ 1729,0
1732
+ 1730,0
1733
+ 1731,0
1734
+ 1732,0
1735
+ 1733,0
1736
+ 1734,0
1737
+ 1735,0
1738
+ 1736,0
1739
+ 1737,0
1740
+ 1738,0
1741
+ 1739,0
1742
+ 1740,0
1743
+ 1741,0
1744
+ 1742,0
1745
+ 1743,0
1746
+ 1744,0
1747
+ 1745,0
1748
+ 1746,1
1749
+ 1747,0
1750
+ 1748,0
1751
+ 1749,0
1752
+ 1750,0
1753
+ 1751,0
1754
+ 1752,0
1755
+ 1753,0
1756
+ 1754,0
1757
+ 1755,0
1758
+ 1756,0
1759
+ 1757,0
1760
+ 1758,1
1761
+ 1759,0
1762
+ 1760,0
1763
+ 1761,0
1764
+ 1762,0
1765
+ 1763,0
1766
+ 1764,0
1767
+ 1765,0
1768
+ 1766,0
1769
+ 1767,0
1770
+ 1768,0
1771
+ 1769,0
1772
+ 1770,0
1773
+ 1771,0
1774
+ 1772,0
1775
+ 1773,0
1776
+ 1774,0
1777
+ 1775,0
1778
+ 1776,0
1779
+ 1777,0
1780
+ 1778,0
1781
+ 1779,0
1782
+ 1780,0
1783
+ 1781,0
1784
+ 1782,0
1785
+ 1783,0
1786
+ 1784,0
1787
+ 1785,0
1788
+ 1786,0
1789
+ 1787,0
1790
+ 1788,0
1791
+ 1789,0
1792
+ 1790,0
1793
+ 1791,0
1794
+ 1792,0
1795
+ 1793,0
1796
+ 1794,0
1797
+ 1795,0
1798
+ 1796,0
1799
+ 1797,0
1800
+ 1798,0
1801
+ 1799,0
1802
+ 1800,0
1803
+ 1801,0
1804
+ 1802,0
1805
+ 1803,0
1806
+ 1804,0
1807
+ 1805,0
1808
+ 1806,0
1809
+ 1807,0
1810
+ 1808,0
1811
+ 1809,0
1812
+ 1810,0
1813
+ 1811,0
1814
+ 1812,0
1815
+ 1813,0
1816
+ 1814,0
1817
+ 1815,0
1818
+ 1816,0
1819
+ 1817,1
1820
+ 1818,0
1821
+ 1819,0
1822
+ 1820,0
1823
+ 1821,0
1824
+ 1822,0
1825
+ 1823,0
1826
+ 1824,0
1827
+ 1825,0
1828
+ 1826,0
1829
+ 1827,0
1830
+ 1828,0
1831
+ 1829,0
1832
+ 1830,0
1833
+ 1831,0
1834
+ 1832,0
1835
+ 1833,0
1836
+ 1834,0
1837
+ 1835,0
1838
+ 1836,0
1839
+ 1837,0
1840
+ 1838,0
1841
+ 1839,0
1842
+ 1840,0
1843
+ 1841,0
1844
+ 1842,0
1845
+ 1843,0
1846
+ 1844,0
1847
+ 1845,0
1848
+ 1846,0
1849
+ 1847,0
1850
+ 1848,0
1851
+ 1849,0
1852
+ 1850,0
1853
+ 1851,0
1854
+ 1852,0
1855
+ 1853,1
1856
+ 1854,0
1857
+ 1855,0
1858
+ 1856,0
1859
+ 1857,0
1860
+ 1858,0
1861
+ 1859,0
1862
+ 1860,0
1863
+ 1861,0
1864
+ 1862,0
1865
+ 1863,0
1866
+ 1864,0
1867
+ 1865,0
1868
+ 1866,0
1869
+ 1867,0
1870
+ 1868,0
1871
+ 1869,0
1872
+ 1870,0
1873
+ 1871,0
1874
+ 1872,0
1875
+ 1873,0
1876
+ 1874,0
1877
+ 1875,0
1878
+ 1876,0
1879
+ 1877,0
1880
+ 1878,0
1881
+ 1879,0
1882
+ 1880,0
1883
+ 1881,0
1884
+ 1882,0
1885
+ 1883,0
1886
+ 1884,0
1887
+ 1885,0
1888
+ 1886,0
1889
+ 1887,0
1890
+ 1888,0
1891
+ 1889,0
1892
+ 1890,0
1893
+ 1891,0
1894
+ 1892,0
1895
+ 1893,0
1896
+ 1894,0
1897
+ 1895,0
1898
+ 1896,0
1899
+ 1897,0
1900
+ 1898,0
1901
+ 1899,0
1902
+ 1900,0
1903
+ 1901,0
1904
+ 1902,0
1905
+ 1903,0
1906
+ 1904,0
1907
+ 1905,0
1908
+ 1906,0
1909
+ 1907,0
1910
+ 1908,0
1911
+ 1909,0
1912
+ 1910,0
1913
+ 1911,0
1914
+ 1912,0
1915
+ 1913,0
1916
+ 1914,0
1917
+ 1915,0
1918
+ 1916,0
1919
+ 1917,0
1920
+ 1918,0
1921
+ 1919,0
1922
+ 1920,0
1923
+ 1921,0
1924
+ 1922,0
1925
+ 1923,0
1926
+ 1924,0
1927
+ 1925,0
1928
+ 1926,0
1929
+ 1927,0
1930
+ 1928,0
1931
+ 1929,0
1932
+ 1930,0
1933
+ 1931,0
1934
+ 1932,0
1935
+ 1933,0
1936
+ 1934,1
1937
+ 1935,0
1938
+ 1936,0
1939
+ 1937,0
1940
+ 1938,0
1941
+ 1939,0
1942
+ 1940,0
1943
+ 1941,0
1944
+ 1942,0
1945
+ 1943,1
1946
+ 1944,1
1947
+ 1945,0
1948
+ 1946,0
1949
+ 1947,0
1950
+ 1948,0
1951
+ 1949,0
1952
+ 1950,0
1953
+ 1951,0
1954
+ 1952,0
1955
+ 1953,0
1956
+ 1954,0
1957
+ 1955,0
1958
+ 1956,0
1959
+ 1957,0
1960
+ 1958,0
1961
+ 1959,0
1962
+ 1960,0
1963
+ 1961,0
1964
+ 1962,0
1965
+ 1963,0
1966
+ 1964,0
1967
+ 1965,0
1968
+ 1966,0
1969
+ 1967,0
1970
+ 1968,0
1971
+ 1969,0
1972
+ 1970,0
1973
+ 1971,0
1974
+ 1972,0
1975
+ 1973,0
1976
+ 1974,0
1977
+ 1975,0
1978
+ 1976,0
1979
+ 1977,0
1980
+ 1978,0
1981
+ 1979,0
1982
+ 1980,0
1983
+ 1981,0
1984
+ 1982,0
1985
+ 1983,0
1986
+ 1984,0
1987
+ 1985,0
1988
+ 1986,0
1989
+ 1987,0
1990
+ 1988,0
1991
+ 1989,0
1992
+ 1990,0
1993
+ 1991,0
1994
+ 1992,0
1995
+ 1993,0
1996
+ 1994,0
1997
+ 1995,0
1998
+ 1996,0
1999
+ 1997,0
2000
+ 1998,0
2001
+ 1999,0
requirements.txt CHANGED
@@ -3,3 +3,8 @@ python-dotenv>=1.0.0
3
  google-generativeai>=0.8.0
4
  langchain>=0.2.5
5
  langchain-google-genai>=0.0.12
 
 
 
 
 
 
3
  google-generativeai>=0.8.0
4
  langchain>=0.2.5
5
  langchain-google-genai>=0.0.12
6
+ numpy
7
+ nltk
8
+ scikit-learn
9
+ xgboost
10
+ lightgbm
training_model.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # file: training_model.py
2
+ from __future__ import annotations
3
+
4
+ import pickle
5
+ import numpy as np
6
+ from typing import Dict, Tuple, List
7
+
8
+ import nltk
9
+ from nltk.corpus import twitter_samples, stopwords
10
+
11
+ from sklearn.ensemble import RandomForestClassifier
12
+ from xgboost import XGBClassifier
13
+ from lightgbm import LGBMClassifier
14
+ from sklearn.svm import SVC
15
+ from sklearn.tree import DecisionTreeClassifier
16
+ from sklearn.naive_bayes import GaussianNB
17
+
18
+ from sklearn.metrics import accuracy_score, log_loss
19
+
20
+ from feature_extract import build_freqs, extract_features_2, extract_features_6
21
+
22
+ # -------------------- NLTK setup --------------------
23
+ def _ensure_nltk():
24
+ try:
25
+ twitter_samples.fileids()
26
+ except LookupError:
27
+ nltk.download("twitter_samples", quiet=True)
28
+ try:
29
+ stopwords.words("english")
30
+ except LookupError:
31
+ nltk.download("stopwords", quiet=True)
32
+
33
+ # -------------------- Data prep --------------------
34
+ def load_twitter_data() -> Tuple[List[str], np.ndarray]:
35
+ pos = twitter_samples.strings("positive_tweets.json")
36
+ neg = twitter_samples.strings("negative_tweets.json")
37
+ tweets = pos + neg
38
+ y = np.array([1] * len(pos) + [0] * len(neg))
39
+ return tweets, y
40
+
41
+ def vectorize(tweets: List[str],
42
+ freqs: Dict[Tuple[str, float], float],
43
+ mode: str = "2f") -> np.ndarray:
44
+ """mode: '2f' -> extract_features_2, '6f' -> extract_features_6"""
45
+ feat_fn = extract_features_2 if mode == "2f" else extract_features_6
46
+ rows = [feat_fn(t, freqs) for t in tweets]
47
+ return np.vstack(rows) if rows else np.zeros((0, 2 if mode == "2f" else 6))
48
+
49
+ # -------------------- Models --------------------
50
+ def make_models() -> Dict[str, object]:
51
+ return {
52
+ "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
53
+ "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
54
+ "LightGBM": LGBMClassifier(random_state=42),
55
+ "SVM": SVC(kernel="linear", probability=True, random_state=42),
56
+ "Decision Tree": DecisionTreeClassifier(random_state=42),
57
+ "Naive Bayes": GaussianNB(),
58
+ }
59
+
60
+ # -------------------- Train --------------------
61
+ def train_models(X: np.ndarray, y: np.ndarray) -> Dict[str, object]:
62
+ models = make_models()
63
+ trained = {}
64
+ print("Đang train các mô hình:")
65
+ for name, clf in models.items():
66
+ clf.fit(X, y.ravel())
67
+ trained[name] = clf
68
+
69
+ # --- ghi log sau train ---
70
+ y_pred = clf.predict(X)
71
+ acc = accuracy_score(y, y_pred)
72
+ # log_loss cần probability
73
+ try:
74
+ y_proba = clf.predict_proba(X)
75
+ loss = log_loss(y, y_proba)
76
+ except Exception:
77
+ loss = None
78
+
79
+ if loss is not None:
80
+ print(f"[{name}] Accuracy: {acc:.4f} | LogLoss: {loss:.4f}")
81
+ else:
82
+ print(f"[{name}] Accuracy: {acc:.4f} | (không có predict_proba để tính log_loss)")
83
+ print("=" * 60)
84
+ return trained
85
+
86
+ def train_all_versions(save_path: str = "demo_models.pkl"):
87
+ """
88
+ Train và lưu mô hình + freqs ra file pickle.
89
+ Trả về:
90
+ {
91
+ 'freqs': freqs,
92
+ '2f': {model_name: trained_model, ...},
93
+ '6f': {model_name: trained_model, ...}
94
+ }
95
+ """
96
+ _ensure_nltk()
97
+ tweets, y = load_twitter_data()
98
+ freqs = build_freqs(tweets, y.reshape(-1, 1))
99
+
100
+ # trích features
101
+ X2 = vectorize(tweets, freqs, mode="2f")
102
+ X6 = vectorize(tweets, freqs, mode="6f")
103
+
104
+ print("\n===== Train với 2-feature =====")
105
+ models_2f = train_models(X2, y)
106
+
107
+ print("\n===== Train với 6-feature =====")
108
+ models_6f = train_models(X6, y)
109
+
110
+ data_to_save = {
111
+ "freqs": freqs,
112
+ "2f": models_2f,
113
+ "6f": models_6f,
114
+ }
115
+
116
+ # lưu file pickle
117
+ with open(save_path, "wb") as f:
118
+ pickle.dump(data_to_save, f)
119
+
120
+ print(f"\nĐã train và lưu mô hình + freqs vào file: {save_path}")
121
+ return data_to_save
122
+
123
+ # -------------------- Load --------------------
124
+ def load_demo_models(save_path: str = "demo_models.pkl"):
125
+ """Load lại mô hình + freqs từ file pickle."""
126
+ with open(save_path, "rb") as f:
127
+ data = pickle.load(f)
128
+ return data
129
+
130
+ # -------------------- CLI --------------------
131
+ if __name__ == "__main__":
132
+ models = train_all_versions() # train & save
133
+ print("Các mô hình 2f:", list(models["2f"].keys()))
134
+ print("Các mô hình 6f:", list(models["6f"].keys()))