{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "0f914398",
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"# Use %pip (not !pip) so the packages are installed into the active kernel's\n",
"# environment rather than whichever Python is first on the shell PATH.\n",
"%pip install nltk\n",
"%pip install numpy\n",
"%pip install pandas"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d473cee2",
"metadata": {},
"outputs": [],
"source": [
"import nltk #Natural Language Toolkit\n",
"import numpy as np\n",
"import pandas as pd\n",
"from nltk.corpus import twitter_samples\n",
"from langchain.prompts import PromptTemplate\n",
"from langchain_core.messages import SystemMessage, HumanMessage\n",
"\n",
"from llm import llm\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "2f9d43cc",
"metadata": {},
"outputs": [],
"source": [
"# Fetch the corpus on first run; without this, twitter_samples.strings() raises\n",
"# LookupError on a fresh environment even though the nltk package is installed.\n",
"nltk.download('twitter_samples', quiet=True)\n",
"\n",
"all_positive_tweets = twitter_samples.strings('positive_tweets.json')\n",
"all_negative_tweets = twitter_samples.strings('negative_tweets.json')\n",
"\n",
"# Hold out the tail of each class (indices 4000:) as the test split.\n",
"test_pos = all_positive_tweets[4000:]\n",
"test_neg = all_negative_tweets[4000:]\n",
"\n",
"test_x = test_pos + test_neg\n",
"\n",
"# Labels: 1 for positive, 0 for negative, stacked in the same order as test_x.\n",
"test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "ed135bd0",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import numpy as np  # ensure numpy is available even if this cell runs first\n",
"\n",
"# --- PROMPTS ---\n",
"system_prompt = (\n",
"    \"You are a strict sentiment classifier.\\n\"\n",
"    \"Given a batch of up to 20 sentences, output EXACTLY one line per input, \"\n",
"    \"in the same order. Each line must be a single character: 1 for positive, 0 for negative. \"\n",
"    \"NO extra text, NO numbering, NO spaces, NO blank lines.\"\n",
")\n",
"\n",
"# The original template read 'listed between and .' -- the delimiter tag names were\n",
"# lost, so the instruction referenced nothing. Restore explicit [ITEMS]...[/ITEMS]\n",
"# markers and wrap the payload in them so the model knows where the batch starts/ends.\n",
"user_prompt = PromptTemplate(\n",
"    input_variables=[\"items\"],\n",
"    template=(\n",
"        \"Classify the sentiment of EACH sentence listed between [ITEMS] and [/ITEMS].\\n\"\n",
"        \"Rules:\\n\"\n",
"        \"- Output exactly ONE line per sentence, in the SAME ORDER.\\n\"\n",
"        \"- Each line must be EXACTLY '1' (positive) or '0' (negative).\\n\"\n",
"        \"- Do NOT print anything else. Do NOT repeat the inputs.\\n\\n\"\n",
"        \"[ITEMS]\\n{items}\\n[/ITEMS]\"\n",
"    ),\n",
")\n",
"\n",
"def _format_items(sentences):\n",
"    \"\"\"Join the batch into one newline-separated block for the prompt body.\"\"\"\n",
"    return \"\\n\".join(str(s) for s in sentences)\n",
"\n",
"# --- PARSER (robust) ---\n",
"def _parse_binary_lines(text: str, expected_n: int) -> np.ndarray:\n",
" \"\"\"\n",
" Chấp nhận:\n",
" - expected_n dòng, mỗi dòng là '0' hoặc '1'\n",
" - 1 dòng duy nhất dài đúng expected_n ký tự '0'/'1'\n",
" - Cứu hộ: gom toàn bộ ký tự '0'/'1' trong text nếu đúng expected_n\n",
" \"\"\"\n",
" s = (text or \"\").strip()\n",
" if not s:\n",
" raise ValueError(\"Empty model output\")\n",
"\n",
" lines = [ln.strip() for ln in s.splitlines() if ln.strip() != \"\"]\n",
"\n",
" # Case A: Đúng expected_n dòng, mỗi dòng là 0/1\n",
" if len(lines) == expected_n and all(re.fullmatch(r\"[01]\", ln) for ln in lines):\n",
" return np.array([int(ln) for ln in lines], dtype=np.int8)\n",
"\n",
" # Case B: 1 dòng duy nhất gồm đúng expected_n ký tự 0/1\n",
" if len(lines) == 1 and re.fullmatch(r\"[01]+\", lines[0]) and len(lines[0]) == expected_n:\n",
" return np.array([int(ch) for ch in lines[0]], dtype=np.int8)\n",
"\n",
" # Case C: Cứu hộ - lấy mọi ký tự 0/1 trong toàn bộ text\n",
" bits = re.findall(r\"[01]\", s)\n",
" if len(bits) == expected_n:\n",
" return np.array([int(b) for b in bits], dtype=np.int8)\n",
"\n",
" # Thất bại: báo lỗi kèm preview ngắn gọn\n",
" preview = s[:200].replace(\"\\n\", \"\\\\n\")\n",
" raise ValueError(f\"Expected {expected_n} labels, got {len(lines)} lines / {len(bits)} bits. Raw='{preview}...'\")\n",
"\n",
"# --- INFERENCE ---\n",
"def classify_20(llm, sentences, existing: np.ndarray | None = None) -> np.ndarray:\n",
" n = len(sentences)\n",
" if n == 0 or n > 20:\n",
" raise ValueError(f\"Batch size must be 1..20, got {n}\")\n",
"\n",
" messages = [\n",
" SystemMessage(content=system_prompt),\n",
" HumanMessage(content=user_prompt.format(items=_format_items(sentences))),\n",
" ]\n",
"\n",
" resp = llm.invoke(messages)\n",
"\n",
" # KHÔNG dùng str(resp): dễ lẫn metadata vào.\n",
" raw_text = getattr(resp, \"content\", None)\n",
" if raw_text is None or not str(raw_text).strip():\n",
" # Gợi ý: bạn có thể log resp để debug khi model bị chặn (block_reason, safety, v.v.)\n",
" raise RuntimeError(f\"LLM returned empty content. Full response repr: {repr(resp)}\")\n",
"\n",
" raw_text = raw_text.strip()\n",
" preds = _parse_binary_lines(raw_text, expected_n=n)\n",
" return preds if existing is None else np.concatenate([existing, preds])\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "c06e66ff",
"metadata": {},
"outputs": [],
"source": [
"def classify_20(llm, sentences, existing: np.ndarray | None = None) -> np.ndarray:\n",
" n = len(sentences)\n",
" if n == 0 or n > 20:\n",
" raise ValueError(f\"Batch size must be 1..20, got {n}\")\n",
"\n",
" messages = [\n",
" SystemMessage(content=system_prompt),\n",
" HumanMessage(content=user_prompt.format(items=_format_items(sentences))),\n",
" ]\n",
"\n",
" resp = llm.invoke(messages)\n",
" raw_text = getattr(resp, \"content\", None)\n",
"\n",
" if raw_text is None or not str(raw_text).strip():\n",
" # Nếu LLM không trả ra gì → điền 0 hết\n",
" print(f\"[warn] LLM output empty for batch size {n}, filling 0s\")\n",
" preds = np.zeros(n, dtype=np.int8)\n",
" else:\n",
" raw_text = raw_text.strip()\n",
" try:\n",
" preds = _parse_binary_lines(raw_text, expected_n=n)\n",
" except Exception as e:\n",
" # Nếu parse fail → điền 0 hết\n",
" print(f\"[warn] Parse fail for batch size {n}, filling 0s: {e}\")\n",
" preds = np.zeros(n, dtype=np.int8)\n",
"\n",
" return preds if existing is None else np.concatenate([existing, preds])\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "495cb1f2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[init] total=2000 done=1500 remain=500\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Gemini produced an empty response. Continuing with empty message\n",
"Feedback: block_reason: PROHIBITED_CONTENT\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[warn] LLM output empty for batch size 20, filling 0s\n",
"[ok] 1500:1520 +20\n",
"[ok] 1520:1540 +20\n",
"[ok] 1540:1560 +20\n",
"[ok] 1560:1580 +20\n",
"[ok] 1580:1600 +20\n",
"[ok] 1600:1620 +20\n",
"[ok] 1620:1640 +20\n",
"[ok] 1640:1660 +20\n",
"[ok] 1660:1680 +20\n",
"[ok] 1680:1700 +20\n",
"[ok] 1700:1720 +20\n",
"[ok] 1720:1740 +20\n",
"[ok] 1740:1760 +20\n",
"[ok] 1760:1780 +20\n",
"[ok] 1780:1800 +20\n",
"[ok] 1800:1820 +20\n",
"[ok] 1820:1840 +20\n",
"[ok] 1840:1860 +20\n",
"[ok] 1860:1880 +20\n",
"[ok] 1880:1900 +20\n",
"[ok] 1900:1920 +20\n",
"[ok] 1920:1940 +20\n",
"[ok] 1940:1960 +20\n",
"[ok] 1960:1980 +20\n",
"[ok] 1980:2000 +20\n",
"[final] collected=2000/2000\n",
"Accuracy: 0.9470\n"
]
}
],
"source": [
"import os, csv, time\n",
"from sklearn.metrics import accuracy_score\n",
"\n",
"BATCH_SIZE = 20\n",
"SLEEP_SECS = 20\n",
"PRED_CSV = \"preds.csv\"\n",
"\n",
"y_true = test_y.ravel().astype(int)\n",
"TOTAL = len(test_x)\n",
"\n",
"# resume\n",
"start_idx = 0\n",
"if os.path.exists(PRED_CSV):\n",
" with open(PRED_CSV, \"r\", newline=\"\", encoding=\"utf-8\") as f:\n",
" r = csv.reader(f); rows = list(r)\n",
" if rows and rows[0] and rows[0][0] == \"idx\": rows = rows[1:]\n",
" start_idx = len(rows)\n",
"else:\n",
" with open(PRED_CSV, \"w\", newline=\"\", encoding=\"utf-8\") as f:\n",
" csv.writer(f).writerow([\"idx\", \"pred\"])\n",
"\n",
"print(f\"[init] total={TOTAL} done={start_idx} remain={TOTAL-start_idx}\")\n",
"\n",
"for i in range(start_idx, TOTAL, BATCH_SIZE):\n",
" batch = test_x[i : i + BATCH_SIZE]\n",
" try:\n",
" preds = classify_20(llm, batch)\n",
" except Exception as e:\n",
" print(f\"[err] {i}:{i+len(batch)} {type(e).__name__}: {e}\")\n",
" break\n",
" with open(PRED_CSV, \"a\", newline=\"\", encoding=\"utf-8\") as f:\n",
" w = csv.writer(f)\n",
" for off, p in enumerate(preds):\n",
" w.writerow([i + off, int(p)])\n",
" print(f\"[ok] {i}:{i+len(batch)} +{len(preds)}\")\n",
" if i + BATCH_SIZE < TOTAL:\n",
" time.sleep(SLEEP_SECS)\n",
"\n",
"# eval if complete\n",
"idxs, vals = [], []\n",
"with open(PRED_CSV, \"r\", newline=\"\", encoding=\"utf-8\") as f:\n",
" r = csv.reader(f); next(r, None)\n",
" for row in r:\n",
" idxs.append(int(row[0])); vals.append(int(row[1]))\n",
"order = np.argsort(np.array(idxs))\n",
"y_pred = np.array(vals, dtype=int)[order]\n",
"\n",
"print(f\"[final] collected={len(y_pred)}/{TOTAL}\")\n",
"if len(y_pred) == TOTAL:\n",
" print(f\"Accuracy: {accuracy_score(y_true, y_pred):.4f}\")\n",
"else:\n",
" print(f\"[note] missing={TOTAL-len(y_pred)}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "435f575c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}