{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "nb-overview",
   "metadata": {},
   "source": [
    "# Zero-shot tweet sentiment classification with an LLM\n",
    "\n",
    "Labels the held-out part of NLTK's `twitter_samples` corpus (positive and negative tweets from index 4000 onward) with a chat LLM, 20 tweets per request, and reports accuracy against the gold labels.\n",
    "\n",
    "**Recorded result:** accuracy 0.9470 on 2000/2000 tweets.\n",
    "\n",
    "**Caveats**\n",
    "\n",
    "- In the recorded run the request for batch `1500:1520` came back empty (`block_reason: PROHIBITED_CONTENT`) and the batch was filled with `0`s. `test_x` lists the positive tweets first, so, assuming the usual 5000/5000 corpus split, that batch consists of negative tweets and the fallback labels are scored as correct. The reported accuracy is therefore slightly optimistic.\n",
    "- The saved outputs of the *Run* and *Evaluate* cells come from that earlier, resumed run (`done=1500`) and predate the cleanup of this notebook; execution counts are cleared. Re-run to refresh them.\n",
    "- `llm` is imported from the local module `llm`; the recorded stderr indicates a Gemini backend."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-setup",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f914398",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "# scikit-learn and langchain are imported below, so they are installed here too.\n",
    "%pip install nltk numpy pandas scikit-learn langchain langchain-core"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d473cee2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import os\n",
    "import re\n",
    "import time\n",
    "\n",
    "import nltk  # Natural Language Toolkit\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from langchain.prompts import PromptTemplate\n",
    "from langchain_core.messages import HumanMessage, SystemMessage\n",
    "from nltk.corpus import twitter_samples\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "from llm import llm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "nb-config",
   "metadata": {},
   "outputs": [],
   "source": [
    "BATCH_SIZE = 20  # sentences per LLM call; classify_20 rejects batches larger than 20\n",
    "SLEEP_SECS = 20  # seconds to wait between consecutive LLM calls\n",
    "PRED_CSV = \"preds.csv\"  # append-only prediction log, also used to resume an interrupted run"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load-data",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f9d43cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the corpus on first use so the notebook also runs in a fresh environment.\n",
    "try:\n",
    "    nltk.data.find(\"corpora/twitter_samples\")\n",
    "except LookupError:\n",
    "    nltk.download(\"twitter_samples\", quiet=True)\n",
    "\n",
    "all_positive_tweets = twitter_samples.strings(\"positive_tweets.json\")\n",
    "all_negative_tweets = twitter_samples.strings(\"negative_tweets.json\")\n",
    "\n",
    "# Everything from index 4000 onward is the evaluation split.\n",
    "test_pos = all_positive_tweets[4000:]\n",
    "test_neg = all_negative_tweets[4000:]\n",
    "\n",
    "# Positives first, then negatives; test_y follows the same order.\n",
    "test_x = test_pos + test_neg\n",
    "\n",
    "# Column vector of gold labels: 1 for each positive tweet, 0 for each negative tweet.\n",
    "test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-prompts",
   "metadata": {},
   "source": [
    "## Prompts and reply parsing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed135bd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- PROMPTS ---\n",
    "system_prompt = (\n",
    "    \"You are a strict sentiment classifier.\\n\"\n",
    "    \"Given a batch of up to 20 sentences, output EXACTLY one line per input, \"\n",
    "    \"in the same order. Each line must be a single character: 1 for positive, 0 for negative. \"\n",
    "    \"NO extra text, NO numbering, NO spaces, NO blank lines.\"\n",
    ")\n",
    "\n",
    "user_prompt = PromptTemplate(\n",
    "    input_variables=[\"items\"],\n",
    "    template=(\n",
    "        \"Classify the sentiment of EACH sentence listed between [ITEMS] and [/ITEMS].\\n\"\n",
    "        \"Every line between the markers is exactly one sentence.\\n\"\n",
    "        \"Rules:\\n\"\n",
    "        \"- Output exactly ONE line per sentence, in the SAME ORDER.\\n\"\n",
    "        \"- Each line must be EXACTLY '1' (positive) or '0' (negative).\\n\"\n",
    "        \"- Do NOT print anything else. Do NOT repeat the inputs.\\n\\n\"\n",
    "        \"[ITEMS]\\n{items}\\n[/ITEMS]\"\n",
    "    ),\n",
    ")\n",
    "\n",
    "\n",
    "def _format_items(sentences):\n",
    "    \"\"\"Render a batch as one sentence per line.\n",
    "\n",
    "    Whitespace runs inside a sentence, including embedded newlines, are\n",
    "    collapsed to single spaces so that the number of lines in the prompt\n",
    "    equals the number of sentences.\n",
    "    \"\"\"\n",
    "    return \"\\n\".join(\" \".join(str(s).split()) for s in sentences)\n",
    "\n",
    "\n",
    "# --- PARSER (robust) ---\n",
    "# A '0' or '1' that is not part of a longer word or number (so not the '1' in '10').\n",
    "_LABEL_TOKEN = re.compile(r\"\\b[01]\\b\")\n",
    "\n",
    "\n",
    "def _parse_binary_lines(text: str, expected_n: int) -> np.ndarray:\n",
    "    \"\"\"Parse a model reply into an int8 array of ``expected_n`` 0/1 labels.\n",
    "\n",
    "    Accepted shapes, tried in this order:\n",
    "      A. ``expected_n`` non-blank lines, each exactly '0' or '1'.\n",
    "      B. a single line of exactly ``expected_n`` characters, all '0'/'1'.\n",
    "      C. ``expected_n`` non-blank lines that each contain a standalone 0/1\n",
    "         token; the last such token on the line is the label. This tolerates\n",
    "         numbered or bulleted replies such as ``1. 0`` or ``- 1``.\n",
    "      D. last resort: every '0'/'1' character in the whole text, accepted\n",
    "         only when there are exactly ``expected_n`` of them.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the text is empty or matches none of the shapes above.\n",
    "    \"\"\"\n",
    "    s = (text or \"\").strip()\n",
    "    if not s:\n",
    "        raise ValueError(\"Empty model output\")\n",
    "\n",
    "    lines = [ln.strip() for ln in s.splitlines() if ln.strip() != \"\"]\n",
    "\n",
    "    # Case A: exactly expected_n lines, each a bare 0/1.\n",
    "    if len(lines) == expected_n and all(re.fullmatch(r\"[01]\", ln) for ln in lines):\n",
    "        return np.array([int(ln) for ln in lines], dtype=np.int8)\n",
    "\n",
    "    # Case B: one line made of exactly expected_n 0/1 characters.\n",
    "    if len(lines) == 1 and re.fullmatch(r\"[01]+\", lines[0]) and len(lines[0]) == expected_n:\n",
    "        return np.array([int(ch) for ch in lines[0]], dtype=np.int8)\n",
    "\n",
    "    # Case C: right number of lines with decoration around the label.\n",
    "    if len(lines) == expected_n:\n",
    "        tokens_per_line = [_LABEL_TOKEN.findall(ln) for ln in lines]\n",
    "        if all(tokens_per_line):\n",
    "            return np.array([int(toks[-1]) for toks in tokens_per_line], dtype=np.int8)\n",
    "\n",
    "    # Case D: rescue - take every 0/1 character found anywhere in the text.\n",
    "    bits = re.findall(r\"[01]\", s)\n",
    "    if len(bits) == expected_n:\n",
    "        return np.array([int(b) for b in bits], dtype=np.int8)\n",
    "\n",
    "    # Failure: report the counts and a short single-line preview of the reply.\n",
    "    preview = s[:200].replace(\"\\n\", \"\\\\n\")\n",
    "    raise ValueError(f\"Expected {expected_n} labels, got {len(lines)} lines / {len(bits)} bits. Raw='{preview}...'\")\n",
    "\n",
    "\n",
    "def _response_text(resp) -> str:\n",
    "    \"\"\"Return the stripped text of an LLM reply, or '' when there is none.\n",
    "\n",
    "    Reads ``resp.content`` rather than ``str(resp)`` so that message metadata\n",
    "    never leaks into the text. ``content`` may be a string, ``None``, or a\n",
    "    list of parts (plain strings or dicts carrying a ``text`` key); any other\n",
    "    value is stringified.\n",
    "    \"\"\"\n",
    "    content = getattr(resp, \"content\", None)\n",
    "    if content is None:\n",
    "        return \"\"\n",
    "    if isinstance(content, str):\n",
    "        return content.strip()\n",
    "    if isinstance(content, (list, tuple)):\n",
    "        parts = []\n",
    "        for part in content:\n",
    "            if isinstance(part, str):\n",
    "                parts.append(part)\n",
    "            elif isinstance(part, dict) and isinstance(part.get(\"text\"), str):\n",
    "                parts.append(part[\"text\"])\n",
    "        return \"\".join(parts).strip()\n",
    "    return str(content).strip()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-inference",
   "metadata": {},
   "source": [
    "## Inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c06e66ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "def classify_20(llm, sentences, existing: np.ndarray | None = None, *, strict: bool = False) -> np.ndarray:\n",
    "    \"\"\"Classify a batch of 1..20 sentences with a single LLM call.\n",
    "\n",
    "    Args:\n",
    "        llm: chat model object exposing ``invoke(messages)``.\n",
    "        sentences: the 1..20 input strings.\n",
    "        existing: optional earlier predictions; the new labels are appended to it.\n",
    "        strict: when True, an empty or unparseable reply raises. When False\n",
    "            (default), a warning is printed and the whole batch is labelled 0.\n",
    "            Note that this fallback biases any metric computed on the result.\n",
    "\n",
    "    Returns:\n",
    "        int8 array of 0/1 labels for this batch, or ``existing`` followed by\n",
    "        those labels when ``existing`` is given.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the batch size is outside 1..20, or if ``strict`` and\n",
    "            the reply cannot be parsed.\n",
    "        RuntimeError: if ``strict`` and the reply is empty.\n",
    "    \"\"\"\n",
    "    n = len(sentences)\n",
    "    if n == 0 or n > 20:\n",
    "        raise ValueError(f\"Batch size must be 1..20, got {n}\")\n",
    "\n",
    "    messages = [\n",
    "        SystemMessage(content=system_prompt),\n",
    "        HumanMessage(content=user_prompt.format(items=_format_items(sentences))),\n",
    "    ]\n",
    "\n",
    "    resp = llm.invoke(messages)\n",
    "    raw_text = _response_text(resp)\n",
    "\n",
    "    if not raw_text:\n",
    "        # An empty reply was observed when the provider blocked the request.\n",
    "        if strict:\n",
    "            raise RuntimeError(f\"LLM returned empty content. Full response repr: {repr(resp)}\")\n",
    "        print(f\"[warn] LLM output empty for batch size {n}, filling 0s\")\n",
    "        preds = np.zeros(n, dtype=np.int8)\n",
    "    else:\n",
    "        try:\n",
    "            preds = _parse_binary_lines(raw_text, expected_n=n)\n",
    "        except ValueError as e:\n",
    "            if strict:\n",
    "                raise\n",
    "            print(f\"[warn] Parse fail for batch size {n}, filling 0s: {e}\")\n",
    "            preds = np.zeros(n, dtype=np.int8)\n",
    "\n",
    "    return preds if existing is None else np.concatenate([existing, preds])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-run",
   "metadata": {},
   "source": [
    "## Run\n",
    "\n",
    "Predictions are appended to `PRED_CSV` after every batch, so an interrupted run resumes from the first index that has no row yet."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "nb-pred-io",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _read_pred_rows(path):\n",
    "    \"\"\"Return the ``(idx, pred)`` pairs stored in the prediction CSV.\n",
    "\n",
    "    A missing file yields an empty list. Blank rows are ignored, and the\n",
    "    first row is dropped only when it really is the ``idx,pred`` header, so a\n",
    "    file without a header does not lose its first prediction.\n",
    "    \"\"\"\n",
    "    if not os.path.exists(path):\n",
    "        return []\n",
    "    with open(path, \"r\", newline=\"\", encoding=\"utf-8\") as f:\n",
    "        rows = [row for row in csv.reader(f) if row]\n",
    "    if rows and rows[0][0] == \"idx\":\n",
    "        rows = rows[1:]\n",
    "    return [(int(row[0]), int(row[1])) for row in rows]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "495cb1f2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[init] total=2000 done=1500 remain=500\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Gemini produced an empty response. Continuing with empty message\n",
      "Feedback: block_reason: PROHIBITED_CONTENT\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[warn] LLM output empty for batch size 20, filling 0s\n",
      "[ok] 1500:1520 +20\n",
      "[ok] 1520:1540 +20\n",
      "[ok] 1540:1560 +20\n",
      "[ok] 1560:1580 +20\n",
      "[ok] 1580:1600 +20\n",
      "[ok] 1600:1620 +20\n",
      "[ok] 1620:1640 +20\n",
      "[ok] 1640:1660 +20\n",
      "[ok] 1660:1680 +20\n",
      "[ok] 1680:1700 +20\n",
      "[ok] 1700:1720 +20\n",
      "[ok] 1720:1740 +20\n",
      "[ok] 1740:1760 +20\n",
      "[ok] 1760:1780 +20\n",
      "[ok] 1780:1800 +20\n",
      "[ok] 1800:1820 +20\n",
      "[ok] 1820:1840 +20\n",
      "[ok] 1840:1860 +20\n",
      "[ok] 1860:1880 +20\n",
      "[ok] 1880:1900 +20\n",
      "[ok] 1900:1920 +20\n",
      "[ok] 1920:1940 +20\n",
      "[ok] 1940:1960 +20\n",
      "[ok] 1960:1980 +20\n",
      "[ok] 1980:2000 +20\n"
     ]
    }
   ],
   "source": [
    "y_true = test_y.ravel().astype(int)\n",
    "TOTAL = len(test_x)\n",
    "\n",
    "# Resume: every row already in the CSV counts as a finished prediction.\n",
    "start_idx = len(_read_pred_rows(PRED_CSV))\n",
    "if start_idx == 0:\n",
    "    # Start from a file that is guaranteed to begin with the header, even\n",
    "    # when an earlier run left an empty file behind.\n",
    "    with open(PRED_CSV, \"w\", newline=\"\", encoding=\"utf-8\") as f:\n",
    "        csv.writer(f).writerow([\"idx\", \"pred\"])\n",
    "\n",
    "print(f\"[init] total={TOTAL} done={start_idx} remain={TOTAL-start_idx}\")\n",
    "\n",
    "for i in range(start_idx, TOTAL, BATCH_SIZE):\n",
    "    batch = test_x[i : i + BATCH_SIZE]\n",
    "    try:\n",
    "        preds = classify_20(llm, batch)\n",
    "    except Exception as e:\n",
    "        # Stop at the first failed call; rows written so far let the next run resume here.\n",
    "        print(f\"[err] {i}:{i+len(batch)} {type(e).__name__}: {e}\")\n",
    "        break\n",
    "    with open(PRED_CSV, \"a\", newline=\"\", encoding=\"utf-8\") as f:\n",
    "        csv.writer(f).writerows([i + off, int(p)] for off, p in enumerate(preds))\n",
    "    print(f\"[ok] {i}:{i+len(batch)} +{len(preds)}\")\n",
    "    if i + BATCH_SIZE < TOTAL:\n",
    "        time.sleep(SLEEP_SECS)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-evaluate",
   "metadata": {},
   "source": [
    "## Evaluate\n",
    "\n",
    "Accuracy is reported only when the CSV holds exactly one prediction for every index `0..TOTAL-1`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "nb-evaluate",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[final] collected=2000/2000\n",
      "Accuracy: 0.9470\n"
     ]
    }
   ],
   "source": [
    "pred_rows = _read_pred_rows(PRED_CSV)\n",
    "idxs = np.array([idx for idx, _ in pred_rows], dtype=int)\n",
    "vals = np.array([pred for _, pred in pred_rows], dtype=int)\n",
    "\n",
    "# Put predictions back into dataset order before comparing with y_true.\n",
    "y_pred = vals[np.argsort(idxs, kind=\"stable\")]\n",
    "\n",
    "print(f\"[final] collected={len(y_pred)}/{TOTAL}\")\n",
    "if np.array_equal(np.sort(idxs), np.arange(TOTAL)):\n",
    "    print(f\"Accuracy: {accuracy_score(y_true, y_pred):.4f}\")\n",
    "elif len(y_pred) == TOTAL:\n",
    "    # Right number of rows but duplicated or out-of-range indices.\n",
    "    print(f\"[note] indices in {PRED_CSV} do not cover 0..{TOTAL-1} exactly once; delete the file and re-run\")\n",
    "else:\n",
    "    print(f\"[note] missing={TOTAL-len(y_pred)}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}