{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0f914398",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "# Use the %pip magic instead of !pip so the packages are installed into the\n",
    "# environment of the kernel that runs this notebook.\n",
    "%pip install nltk\n",
    "%pip install numpy\n",
    "%pip install pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d473cee2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk #Natural Language Toolkit\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from nltk.corpus import twitter_samples\n",
    "from langchain.prompts import PromptTemplate\n",
    "from langchain_core.messages import SystemMessage, HumanMessage\n",
    "\n",
    "from llm import llm\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "2f9d43cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the twitter_samples corpus on first use; without local NLTK data\n",
    "# the corpus reader below cannot load the tweet files.\n",
    "try:\n",
    "    nltk.data.find('corpora/twitter_samples')\n",
    "except LookupError:\n",
    "    nltk.download('twitter_samples')\n",
    "\n",
    "all_positive_tweets = twitter_samples.strings('positive_tweets.json')\n",
    "all_negative_tweets = twitter_samples.strings('negative_tweets.json')\n",
    "\n",
    "# Tweets from this index onwards (per class) form the evaluation set.\n",
    "TEST_START = 4000\n",
    "\n",
    "test_pos = all_positive_tweets[TEST_START:]\n",
    "test_neg = all_negative_tweets[TEST_START:]\n",
    "\n",
    "# Positive tweets first, then negative tweets; test_y below follows the same order.\n",
    "test_x = test_pos + test_neg\n",
    "\n",
    "# Column vector of labels aligned with test_x: 1 for each positive tweet, 0 for each negative tweet.\n",
    "test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "ed135bd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import numpy as np  # make sure numpy is imported\n",
    "\n",
    "# --- PROMPTS ---\n",
    "# System message: pins the reply format to one '0'/'1' character per input line.\n",
    "system_prompt = \"\\n\".join([\n",
    "    \"You are a strict sentiment classifier.\",\n",
    "    \" \".join([\n",
    "        \"Given a batch of up to 20 sentences, output EXACTLY one line per input,\",\n",
    "        \"in the same order. Each line must be a single character: 1 for positive, 0 for negative.\",\n",
    "        \"NO extra text, NO numbering, NO spaces, NO blank lines.\",\n",
    "    ]),\n",
    "])\n",
    "\n",
    "# User message template: the {items} placeholder is filled with the batch of\n",
    "# sentences to classify and sits between the <INPUT> markers.\n",
    "user_prompt = PromptTemplate(\n",
    "    template=\"\\n\".join([\n",
    "        \"Classify the sentiment of EACH sentence listed between <INPUT> and </INPUT>.\",\n",
    "        \"Rules:\",\n",
    "        \"- Output exactly ONE line per sentence, in the SAME ORDER.\",\n",
    "        \"- Each line must be EXACTLY '1' (positive) or '0' (negative).\",\n",
    "        \"- Do NOT print anything else. Do NOT repeat the inputs.\",\n",
    "        \"\",\n",
    "        \"<INPUT>\",\n",
    "        \"{items}\",\n",
    "        \"</INPUT>\",\n",
    "    ]),\n",
    "    input_variables=[\"items\"],\n",
    ")\n",
    "\n",
    "def _format_items(sentences):\n",
    "    \"\"\"Wrap every sentence in <s>...</s> tags and join the results with newlines.\"\"\"\n",
    "    tagged = []\n",
    "    for sentence in sentences:\n",
    "        tagged.append(f\"<s>{sentence}</s>\")\n",
    "    return \"\\n\".join(tagged)\n",
    "\n",
    "# --- PARSER (robust) ---\n",
    "def _parse_binary_lines(text: str, expected_n: int) -> np.ndarray:\n",
    "    \"\"\"\n",
    "    Convert raw model output into an int8 array of expected_n binary labels.\n",
    "\n",
    "    Accepted layouts, tried in this order:\n",
    "      - exactly expected_n non-blank lines, each being '0' or '1'\n",
    "      - one single non-blank line made of exactly expected_n '0'/'1' characters\n",
    "      - rescue: all '0'/'1' characters found anywhere in the text, provided\n",
    "        there are exactly expected_n of them\n",
    "\n",
    "    Raises ValueError if the text is empty/blank or none of the layouts fits.\n",
    "    \"\"\"\n",
    "    cleaned = (text or \"\").strip()\n",
    "    if not cleaned:\n",
    "        raise ValueError(\"Empty model output\")\n",
    "\n",
    "    def _to_labels(symbols) -> np.ndarray:\n",
    "        # Each symbol is the string '0' or '1'.\n",
    "        return np.array([int(sym) for sym in symbols], dtype=np.int8)\n",
    "\n",
    "    # Non-blank lines with surrounding whitespace removed.\n",
    "    rows = []\n",
    "    for candidate in cleaned.splitlines():\n",
    "        candidate = candidate.strip()\n",
    "        if candidate != \"\":\n",
    "            rows.append(candidate)\n",
    "\n",
    "    # Layout 1: one label per line.\n",
    "    one_per_line = len(rows) == expected_n and all(row in (\"0\", \"1\") for row in rows)\n",
    "    if one_per_line:\n",
    "        return _to_labels(rows)\n",
    "\n",
    "    # Layout 2: all labels packed into a single line.\n",
    "    if len(rows) == 1:\n",
    "        packed = rows[0]\n",
    "        if len(packed) == expected_n and re.fullmatch(r\"[01]+\", packed):\n",
    "            return _to_labels(packed)\n",
    "\n",
    "    # Layout 3 (rescue): pick every 0/1 character out of the whole text.\n",
    "    digits = re.findall(r\"[01]\", cleaned)\n",
    "    if len(digits) == expected_n:\n",
    "        return _to_labels(digits)\n",
    "\n",
    "    # Nothing matched: fail with a short single-line preview of the output.\n",
    "    preview = cleaned[:200].replace(\"\\n\", \"\\\\n\")\n",
    "    raise ValueError(f\"Expected {expected_n} labels, got {len(rows)} lines / {len(digits)} bits. Raw='{preview}...'\")\n",
    "\n",
    "# --- INFERENCE ---\n",
    "def classify_20(llm, sentences, existing: np.ndarray | None = None) -> np.ndarray:\n",
    "    \"\"\"\n",
    "    Classify one batch of 1..20 sentences with a single LLM call (strict variant).\n",
    "\n",
    "    The system and user prompts are sent through llm.invoke and the reply's\n",
    "    `content` attribute is parsed into 0/1 labels by _parse_binary_lines.\n",
    "\n",
    "    Returns the parsed labels, or `existing` with the labels appended when\n",
    "    `existing` is given.\n",
    "\n",
    "    Raises ValueError for a batch size outside 1..20 (or a reply the parser\n",
    "    rejects) and RuntimeError when the reply carries no usable content.\n",
    "\n",
    "    Note: a later cell of this notebook defines classify_20 again; when the\n",
    "    cells run in order, that later definition replaces this one.\n",
    "    \"\"\"\n",
    "    batch_len = len(sentences)\n",
    "    if not 1 <= batch_len <= 20:\n",
    "        raise ValueError(f\"Batch size must be 1..20, got {batch_len}\")\n",
    "\n",
    "    rendered_input = user_prompt.format(items=_format_items(sentences))\n",
    "    reply = llm.invoke([\n",
    "        SystemMessage(content=system_prompt),\n",
    "        HumanMessage(content=rendered_input),\n",
    "    ])\n",
    "\n",
    "    # Read .content rather than str(reply): the latter can mix metadata into the text.\n",
    "    answer = getattr(reply, \"content\", None)\n",
    "    if answer is None or str(answer).strip() == \"\":\n",
    "        # Tip: the repr in the message helps when debugging blocked replies (block_reason, safety, ...).\n",
    "        raise RuntimeError(f\"LLM returned empty content. Full response repr: {repr(reply)}\")\n",
    "\n",
    "    labels = _parse_binary_lines(answer.strip(), expected_n=batch_len)\n",
    "    if existing is None:\n",
    "        return labels\n",
    "    return np.concatenate([existing, labels])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "c06e66ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _response_text(resp) -> str:\n",
    "    \"\"\"Return the text of an LLM response's `content` as one stripped string ('' when absent).\"\"\"\n",
    "    content = getattr(resp, \"content\", None)\n",
    "    if content is None:\n",
    "        return \"\"\n",
    "    if isinstance(content, str):\n",
    "        return content.strip()\n",
    "    if isinstance(content, (list, tuple)):\n",
    "        # Message content can also be a list of parts: plain strings or dicts\n",
    "        # that carry their text under a \"text\" key. Other parts are ignored.\n",
    "        pieces = []\n",
    "        for part in content:\n",
    "            if isinstance(part, str):\n",
    "                pieces.append(part)\n",
    "            elif isinstance(part, dict) and isinstance(part.get(\"text\"), str):\n",
    "                pieces.append(part[\"text\"])\n",
    "        return \"\".join(pieces).strip()\n",
    "    return str(content).strip()\n",
    "\n",
    "\n",
    "def classify_20(llm, sentences, existing: np.ndarray | None = None, fallback_label: int = 0) -> np.ndarray:\n",
    "    \"\"\"\n",
    "    Classify one batch of 1..20 sentences with a single LLM call (lenient variant).\n",
    "\n",
    "    Unlike a strict classifier this never fails on a bad reply: when the model\n",
    "    returns no text, or text that _parse_binary_lines rejects, a warning is\n",
    "    printed and the whole batch is labelled with `fallback_label`.\n",
    "\n",
    "    Parameters:\n",
    "        llm: object whose invoke(messages) returns a response with a `content` attribute.\n",
    "        sentences: the batch to classify (1 to 20 items).\n",
    "        existing: optional array of earlier predictions to append to.\n",
    "        fallback_label: label used for every item of a failed batch; must fit\n",
    "            into int8. The default 0 keeps the previous behaviour.\n",
    "\n",
    "    Returns:\n",
    "        int8 array of labels, or `existing` with the labels appended.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the batch size is outside 1..20.\n",
    "\n",
    "    NOTE(review): filling with 0 marks failed batches as negative, which counts\n",
    "    as correct for negative tweets and therefore biases accuracy. Pass a\n",
    "    sentinel such as -1 and exclude it from the metric if that matters.\n",
    "    \"\"\"\n",
    "    n = len(sentences)\n",
    "    if n == 0 or n > 20:\n",
    "        raise ValueError(f\"Batch size must be 1..20, got {n}\")\n",
    "\n",
    "    messages = [\n",
    "        SystemMessage(content=system_prompt),\n",
    "        HumanMessage(content=user_prompt.format(items=_format_items(sentences))),\n",
    "    ]\n",
    "\n",
    "    resp = llm.invoke(messages)\n",
    "    raw_text = _response_text(resp)\n",
    "\n",
    "    preds = None\n",
    "    if not raw_text:\n",
    "        # The model returned nothing usable (e.g. the reply was blocked).\n",
    "        print(f\"[warn] LLM output empty for batch size {n}, filling {fallback_label}s\")\n",
    "    else:\n",
    "        try:\n",
    "            preds = _parse_binary_lines(raw_text, expected_n=n)\n",
    "        except ValueError as e:\n",
    "            # ValueError is what the parser raises for malformed output; any\n",
    "            # other exception is a real bug and is allowed to propagate.\n",
    "            print(f\"[warn] Parse fail for batch size {n}, filling {fallback_label}s: {e}\")\n",
    "\n",
    "    if preds is None:\n",
    "        preds = np.zeros(n, dtype=np.int8)\n",
    "        preds[:] = fallback_label\n",
    "\n",
    "    return preds if existing is None else np.concatenate([existing, preds])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "495cb1f2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[init] total=2000 done=1500 remain=500\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Gemini produced an empty response. Continuing with empty message\n",
      "Feedback: block_reason: PROHIBITED_CONTENT\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[warn] LLM output empty for batch size 20, filling 0s\n",
      "[ok] 1500:1520 +20\n",
      "[ok] 1520:1540 +20\n",
      "[ok] 1540:1560 +20\n",
      "[ok] 1560:1580 +20\n",
      "[ok] 1580:1600 +20\n",
      "[ok] 1600:1620 +20\n",
      "[ok] 1620:1640 +20\n",
      "[ok] 1640:1660 +20\n",
      "[ok] 1660:1680 +20\n",
      "[ok] 1680:1700 +20\n",
      "[ok] 1700:1720 +20\n",
      "[ok] 1720:1740 +20\n",
      "[ok] 1740:1760 +20\n",
      "[ok] 1760:1780 +20\n",
      "[ok] 1780:1800 +20\n",
      "[ok] 1800:1820 +20\n",
      "[ok] 1820:1840 +20\n",
      "[ok] 1840:1860 +20\n",
      "[ok] 1860:1880 +20\n",
      "[ok] 1880:1900 +20\n",
      "[ok] 1900:1920 +20\n",
      "[ok] 1920:1940 +20\n",
      "[ok] 1940:1960 +20\n",
      "[ok] 1960:1980 +20\n",
      "[ok] 1980:2000 +20\n",
      "[final] collected=2000/2000\n",
      "Accuracy: 0.9470\n"
     ]
    }
   ],
   "source": [
    "import os, csv, time\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "BATCH_SIZE = 20          # sentences sent to the LLM per call\n",
    "SLEEP_SECS = 20          # pause between consecutive LLM calls\n",
    "PRED_CSV = \"preds.csv\"   # checkpoint file: header row, then one idx,pred row per tweet\n",
    "\n",
    "y_true = test_y.ravel().astype(int)\n",
    "TOTAL = len(test_x)\n",
    "\n",
    "# --- resume ---\n",
    "# The number of prediction rows already on disk is the index to resume from.\n",
    "# Blank lines are dropped so they are not mistaken for finished predictions.\n",
    "rows = []\n",
    "if os.path.exists(PRED_CSV):\n",
    "    with open(PRED_CSV, \"r\", newline=\"\", encoding=\"utf-8\") as f:\n",
    "        rows = [row for row in csv.reader(f) if row]\n",
    "if not rows:\n",
    "    # Missing or empty file: (re)create it with the header row.\n",
    "    with open(PRED_CSV, \"w\", newline=\"\", encoding=\"utf-8\") as f:\n",
    "        csv.writer(f).writerow([\"idx\", \"pred\"])\n",
    "elif rows[0][0] == \"idx\":\n",
    "    rows = rows[1:]\n",
    "start_idx = len(rows)\n",
    "\n",
    "print(f\"[init] total={TOTAL} done={start_idx} remain={TOTAL-start_idx}\")\n",
    "\n",
    "# --- classify the remaining tweets batch by batch ---\n",
    "for i in range(start_idx, TOTAL, BATCH_SIZE):\n",
    "    batch = test_x[i : i + BATCH_SIZE]\n",
    "    try:\n",
    "        preds = classify_20(llm, batch)\n",
    "    except Exception as e:\n",
    "        # Stop at the first failing batch; the rows already written let a\n",
    "        # later run of this cell resume from this index.\n",
    "        print(f\"[err] {i}:{i+len(batch)} {type(e).__name__}: {e}\")\n",
    "        break\n",
    "    # Append after every batch so progress survives an interruption.\n",
    "    with open(PRED_CSV, \"a\", newline=\"\", encoding=\"utf-8\") as f:\n",
    "        w = csv.writer(f)\n",
    "        for off, p in enumerate(preds):\n",
    "            w.writerow([i + off, int(p)])\n",
    "    print(f\"[ok] {i}:{i+len(batch)} +{len(preds)}\")\n",
    "    if i + BATCH_SIZE < TOTAL:\n",
    "        time.sleep(SLEEP_SECS)\n",
    "\n",
    "# --- evaluate if complete ---\n",
    "idxs, vals = [], []\n",
    "with open(PRED_CSV, \"r\", newline=\"\", encoding=\"utf-8\") as f:\n",
    "    for row in csv.reader(f):\n",
    "        # Skip blank lines and the header instead of blindly dropping the\n",
    "        # first row, which would lose a prediction in a header-less file.\n",
    "        if not row or row[0] == \"idx\":\n",
    "            continue\n",
    "        idxs.append(int(row[0]))\n",
    "        vals.append(int(row[1]))\n",
    "# Put the predictions in tweet-index order before comparing with y_true.\n",
    "order = np.argsort(np.array(idxs))\n",
    "y_pred = np.array(vals, dtype=int)[order]\n",
    "\n",
    "print(f\"[final] collected={len(y_pred)}/{TOTAL}\")\n",
    "if len(y_pred) == TOTAL:\n",
    "    print(f\"Accuracy: {accuracy_score(y_true, y_pred):.4f}\")\n",
    "else:\n",
    "    print(f\"[note] missing={TOTAL-len(y_pred)}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "435f575c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}