{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "UrYUSkACCo62" }, "source": [ "#CÀI ĐẶT THƯ VIỆN" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "collapsed": true, "id": "gZVgq40W9F-N", "outputId": "8a61c824-b37f-4d50-d06b-11da018a1191" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: rouge-score in /usr/local/lib/python3.12/dist-packages (0.1.2)\n", "Requirement already satisfied: sacremoses in /usr/local/lib/python3.12/dist-packages (0.1.1)\n", "Requirement already satisfied: transformers[sentencepiece] in /usr/local/lib/python3.12/dist-packages (5.7.0)\n", "Requirement already satisfied: huggingface-hub<2.0,>=1.5.0 in /usr/local/lib/python3.12/dist-packages (from transformers[sentencepiece]) (1.11.0)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.12/dist-packages (from transformers[sentencepiece]) (2.0.2)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from transformers[sentencepiece]) (26.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.12/dist-packages (from transformers[sentencepiece]) (6.0.3)\n", "Requirement already satisfied: regex>=2025.10.22 in /usr/local/lib/python3.12/dist-packages (from transformers[sentencepiece]) (2025.11.3)\n", "Requirement already satisfied: tokenizers<=0.23.0,>=0.22.0 in /usr/local/lib/python3.12/dist-packages (from transformers[sentencepiece]) (0.22.2)\n", "Requirement already satisfied: typer in /usr/local/lib/python3.12/dist-packages (from transformers[sentencepiece]) (0.24.2)\n", "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.12/dist-packages (from transformers[sentencepiece]) (0.7.0)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.12/dist-packages (from transformers[sentencepiece]) (4.67.3)\n", "Requirement already satisfied: sentencepiece!=0.1.92,>=0.1.91 in /usr/local/lib/python3.12/dist-packages (from transformers[sentencepiece]) (0.2.1)\n", "Requirement already satisfied: protobuf in /usr/local/lib/python3.12/dist-packages (from transformers[sentencepiece]) (5.29.6)\n", "Requirement already satisfied: absl-py in /usr/local/lib/python3.12/dist-packages (from rouge-score) (1.4.0)\n", "Requirement already satisfied: nltk in /usr/local/lib/python3.12/dist-packages (from rouge-score) (3.9.1)\n", "Requirement already satisfied: six>=1.14.0 in /usr/local/lib/python3.12/dist-packages (from rouge-score) (1.17.0)\n", "Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from sacremoses) (8.3.3)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from sacremoses) (1.5.3)\n", "Requirement already satisfied: filelock>=3.10.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=1.5.0->transformers[sentencepiece]) (3.29.0)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=1.5.0->transformers[sentencepiece]) (2025.3.0)\n", "Requirement already satisfied: hf-xet<2.0.0,>=1.4.3 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=1.5.0->transformers[sentencepiece]) (1.4.3)\n", "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=1.5.0->transformers[sentencepiece]) (0.28.1)\n", "Requirement already satisfied: typing-extensions>=4.1.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<2.0,>=1.5.0->transformers[sentencepiece]) (4.15.0)\n", "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.12/dist-packages (from typer->transformers[sentencepiece]) (1.5.4)\n", "Requirement already satisfied: rich>=12.3.0 in /usr/local/lib/python3.12/dist-packages (from typer->transformers[sentencepiece]) (13.9.4)\n", "Requirement already satisfied: annotated-doc>=0.0.2 in /usr/local/lib/python3.12/dist-packages (from typer->transformers[sentencepiece]) (0.0.4)\n", "Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface-hub<2.0,>=1.5.0->transformers[sentencepiece]) (4.13.0)\n", "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface-hub<2.0,>=1.5.0->transformers[sentencepiece]) (2026.4.22)\n", "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface-hub<2.0,>=1.5.0->transformers[sentencepiece]) (1.0.9)\n", "Requirement already satisfied: idna in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface-hub<2.0,>=1.5.0->transformers[sentencepiece]) (3.13)\n", "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->huggingface-hub<2.0,>=1.5.0->transformers[sentencepiece]) (0.16.0)\n", "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.12/dist-packages (from rich>=12.3.0->typer->transformers[sentencepiece]) (4.0.0)\n", "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.12/dist-packages (from rich>=12.3.0->typer->transformers[sentencepiece]) (2.20.0)\n", "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.12/dist-packages (from markdown-it-py>=2.2.0->rich>=12.3.0->typer->transformers[sentencepiece]) (0.1.2)\n" ] } ], "source": [ "!pip install transformers[sentencepiece] rouge-score sacremoses" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "fN9HeTk_9aR3" }, "outputs": [], "source": [ "import json\n", "import torch\n", "import os\n", "import re\n", "import unicodedata\n", "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n", "from rouge_score import rouge_scorer\n", "from tqdm import tqdm\n", "from datasets import load_dataset" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GD72LMn0FC74", "outputId": "a586718a-c523-4859-8687-9afb1bf9314f" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "using device: cuda\n" ] } ], "source": [ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "print('using device:', device)" ] }, { "cell_type": "markdown", "metadata": { "id": "vFoKipPwCltg" }, "source": [ "#CÁC HÀM TIỆN ÍCH (UTILS)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "A-2w4fF9_6_P" }, "outputs": [], "source": [ "def clean_text(text: str) -> str:\n", " \"\"\"\n", " Làm sạch văn bản tiếng Việt:\n", " - Bỏ HTML tags\n", " - Chuẩn hóa Unicode (NFC)\n", " - Thay '_' -> ' '\n", " - Xóa newline\n", " - Chuẩn hóa khoảng trắng\n", " - Chuẩn hóa dấu câu (có khoảng trắng sau dấu)\n", " \"\"\"\n", "\n", " if not isinstance(text, str):\n", " return \"\"\n", "\n", " # Bỏ HTML tags\n", " text = re.sub(r'<[^>]+>', '', text)\n", "\n", " # Chuẩn hóa Unicode (NFC)\n", " text = unicodedata.normalize('NFC', text)\n", "\n", " # thay underscore thành space\n", " text = text.replace('_', ' ')\n", "\n", " # Xóa newline\n", " text = text.replace('\\n', ' ')\n", "\n", " #chuẩn hóa dấu câu\n", " text = re.sub(r'([.,!?;:])', r' \\1 ', text)\n", "\n", " #chuẩn hóa khoảng trắng\n", " text = re.sub(r'\\s+', ' ', text)\n", "\n", " return text.strip()\n", "\n", "def truncate_text(text: str, max_words: int = 512) -> str:\n", " \"\"\"\n", " Cắt văn bản thành một đoạn có tối đa `max_words` từ.\n", " \"\"\"\n", " if not isinstance(text, str):\n", " return \"\"\n", "\n", " words = text.split()\n", " if len(words) <= max_words:\n", " return text\n", " else:\n", " return ' '.join(words[:max_words]) + \"...\"\n", "\n", "\n", "def compute_length(text: str) -> int:\n", " \"\"\"\n", " Tính số lượng từ trong văn bản.\n", " \"\"\"\n", " if not isinstance(text, str):\n", " return 0\n", "\n", " words = text.split()\n", " return len(words)\n", "\n", "\n", "def compute_compression_ratio(article: str, summary: str) -> float:\n", " \"\"\"\n", " Tính tỷ lệ nén giữa văn bản gốc và văn bản đã được làm sạch.\n", " \"\"\"\n", " article_len = compute_length(article)\n", " summary_len = compute_length(summary)\n", "\n", " if article_len == 0:\n", " return 0.0\n", "\n", " return summary_len / article_len\n", "\n", "\n", "def is_abnormal_sample(article: str, summary: str) -> bool:\n", " \"\"\"\n", " Detect sample bất thường:\n", " - article quá ngắn\n", " - ratio quá cao\n", " \"\"\"\n", " article_len = compute_length(article)\n", " ratio = compute_compression_ratio(article, summary)\n", "\n", " if article_len < 50 or ratio > 0.8:\n", " return True\n", " return False\n" ] }, { "cell_type": "markdown", "metadata": { "id": "7UUsD5GjCrIl" }, "source": [ "# TẠO DỮ LIỆU KIỂM THỬ (TEST SET)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "WyvzumGMCNb4" }, "outputs": [], "source": [ "def is_valid_sample(article: str, summary: str) -> bool:\n", " \"\"\"\n", " Filter sample:\n", " - article đủ dài\n", " - ratio không quá cao (loại sample lỗi)\n", " \"\"\"\n", " if not article or not summary:\n", " return False\n", "\n", " article_len = len(article.split())\n", " ratio = compute_compression_ratio(article, summary)\n", "\n", " # rule\n", " if article_len < 20:\n", " return False\n", "\n", " if ratio > 0.3:\n", " return False\n", "\n", " return True\n", "\n", "def run_create_test_set(num_samples=300, seed=42):\n", " print(\"--- Loading dataset Vietnews... ---\")\n", " dataset = load_dataset(\"harouzie/vietnews\", split='test')\n", " raw_test = dataset.shuffle(seed=seed)\n", "\n", " test_data = []\n", " print(\"--- Creating test set... ---\")\n", " for sample in raw_test:\n", " article = clean_text(sample[\"article\"])\n", " summary = clean_text(sample[\"abstract\"])\n", "\n", " if not is_valid_sample(article, summary):\n", " continue\n", "\n", " test_data.append({\"article\": article, \"abstract\": summary})\n", " if len(test_data) >= num_samples:\n", " break\n", "\n", " print(f\"Collected {len(test_data)} samples\")\n", " return test_data\n" ] }, { "cell_type": "markdown", "metadata": { "id": "1urfHff7CzvP" }, "source": [ "#HÀM ĐÁNH GIÁ MODEL (EVALUATE)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "esX-8ESRC0oM" }, "outputs": [], "source": [ "def resolve_model_path_colab(model_path: str) -> str:\n", " # Nếu là Repo ID trên Hugging Face (có dấu /) thì trả về\n", " if \"/\" in model_path and not os.path.exists(model_path):\n", " return model_path\n", "\n", " if not os.path.exists(model_path):\n", " raise ValueError(f\"Path không tồn tại: {model_path}\")\n", "\n", " files = os.listdir(model_path)\n", " if \"config.json\" in files:\n", " return model_path\n", " checkpoints = [f for f in files if f.startswith(\"checkpoint\")]\n", " if checkpoints:\n", " checkpoints.sort(key=lambda x: int(x.split('-')[-1]) if '-' in x else x)\n", " return os.path.join(model_path, checkpoints[-1])\n", " return model_path\n", "\n", "def evaluate_model_colab(model_path: str, test_data: list, output_path: str = \"rouge_results.json\"):\n", " # Giải quyết đường dẫn\n", " model_path = resolve_model_path_colab(model_path)\n", " print(f\"Loading model from: {model_path}\")\n", "\n", " # Load model & tokenizer\n", " # Colab có thể tải từ Hugging Face\n", " tokenizer = AutoTokenizer.from_pretrained(model_path)\n", " model = AutoModelForSeq2SeqLM.from_pretrained(model_path)\n", "\n", " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", " model.to(device)\n", " model.eval()\n", " print(f\"Device: {device}\")\n", "\n", " # ROUGE setup\n", " scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)\n", " results = {'rouge1': [], 'rouge2': [], 'rougeL': [], 'details': []}\n", "\n", " print(\"Evaluating...\")\n", " for item in tqdm(test_data):\n", " article = clean_text(item['article'])\n", " reference = clean_text(item['abstract'])\n", "\n", " inputs = tokenizer(article, max_length=512, truncation=True, return_tensors=\"pt\").to(device)\n", "\n", " with torch.no_grad():\n", " summary_ids = model.generate(\n", " inputs[\"input_ids\"],\n", " max_length=150,\n", " min_length=30,\n", " num_beams=4,\n", " no_repeat_ngram_size=3,\n", " early_stopping=True,\n", " )\n", "\n", " generated = tokenizer.decode(summary_ids[0], skip_special_tokens=True)\n", " generated = clean_text(generated)\n", "\n", " # Tính ROUGE\n", " scores = scorer.score(reference, generated)\n", " r1, r2, rl = scores['rouge1'].fmeasure, scores['rouge2'].fmeasure, scores['rougeL'].fmeasure\n", "\n", " results['rouge1'].append(r1)\n", " results['rouge2'].append(r2)\n", " results['rougeL'].append(rl)\n", " results['details'].append({\n", " \"reference\": reference,\n", " \"generated\": generated,\n", " \"rouge1\": round(r1, 4), \"rouge2\": round(r2, 4), \"rougeL\": round(rl, 4)\n", " })\n", "\n", " # Tính trung bình\n", " avg_r1 = sum(results['rouge1']) / len(results['rouge1'])\n", " avg_r2 = sum(results['rouge2']) / len(results['rouge2'])\n", " avg_rl = sum(results['rougeL']) / len(results['rougeL'])\n", "\n", " print(\"\\n\" + \"=\"*30)\n", " print(\"ROUGE RESULTS\")\n", " print(f\"ROUGE-1: {avg_r1:.4f}\")\n", " print(f\"ROUGE-2: {avg_r2:.4f}\")\n", " print(f\"ROUGE-L: {avg_rl:.4f}\")\n", " print(\"=\"*30)\n", "\n", " # Lưu kết quả\n", " with open(output_path, 'w', encoding='utf-8') as f:\n", " json.dump({\n", " \"model\": model_path,\n", " \"average\": {\"rouge1\": round(avg_r1, 4), \"rouge2\": round(avg_r2, 4), \"rougeL\": round(avg_rl, 4)},\n", " \"details\": results['details']\n", " }, f, ensure_ascii=False, indent=2)\n", " print(f\"Saved results to {output_path}\")" ] }, { "cell_type": "markdown", "metadata": { "id": "m5hlgELELX8M" }, "source": [ "#tóm tắt lỗi" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "OnjDfjCZLZqe" }, "outputs": [], "source": [ "import json\n", "import os\n", "\n", "def create_error_summary(input_path=\"rouge_results.json\", output_path=\"error_analysis.txt\", top_n=10):\n", " \"\"\"\n", " Phân tích kết quả ROUGE để tìm ra các mẫu model làm tệ nhất và tốt nhất.\n", " \"\"\"\n", " if not os.path.exists(input_path):\n", " print(f\"Không tìm thấy file {input_path}. Hãy chạy evaluate.py trước!\")\n", " return\n", "\n", " with open(input_path, \"r\", encoding=\"utf-8\") as f:\n", " data = json.load(f)\n", "\n", " details = data.get(\"details\", [])\n", " if not details:\n", " print(\"File kết quả không có dữ liệu chi tiết (details).\")\n", " return\n", "\n", " # 1. Sắp xếp danh sách theo điểm ROUGE-L tăng dần (tệ nhất lên đầu)\n", " worst_samples = sorted(details, key=lambda x: x[\"rougeL\"])\n", "\n", " # 2. Lấy danh sách tốt nhất (để đối chiếu)\n", " best_samples = sorted(details, key=lambda x: x[\"rougeL\"], reverse=True)\n", "\n", " with open(output_path, \"w\", encoding=\"utf-8\") as f:\n", " f.write(\"==================================================\\n\")\n", " f.write(\" BÁO CÁO PHÂN TÍCH LỖI TÓM TẮT \\n\")\n", " f.write(\"==================================================\\n\\n\")\n", "\n", " f.write(f\"Model: {data.get('model', 'Unknown')}\\n\")\n", " f.write(f\"Điểm ROUGE trung bình:\\n\")\n", " avg = data.get(\"average\", {})\n", " f.write(f\" - ROUGE-1: {avg.get('rouge1'):.4f}\\n\")\n", " f.write(f\" - ROUGE-2: {avg.get('rouge2'):.4f}\\n\")\n", " f.write(f\" - ROUGE-L: {avg.get('rougeL'):.4f}\\n\\n\")\n", "\n", " # PHẦN 1: TOP CÁC CÂU TỆ NHẤT\n", " f.write(f\"--- TOP {top_n} MẪU CÓ ĐIỂM THẤP NHẤT (CẦN PHÂN TÍCH LỖI) ---\\n\")\n", " for i, sample in enumerate(worst_samples[:top_n]):\n", " f.write(f\"\\n[Mẫu lỗi số {i+1}]\\n\")\n", " f.write(f\" > ROUGE-L: {sample['rougeL']:.4f} | ROUGE-1: {sample['rouge1']:.4f}\\n\")\n", " f.write(f\" > Tham chiếu (Gold): {sample['reference']}\\n\")\n", " f.write(f\" > Model viết: {sample['generated']}\\n\")\n", " f.write(\"-\" * 30 + \"\\n\")\n", "\n", " f.write(\"\\n\" + \"=\"*50 + \"\\n\\n\")\n", "\n", " # PHẦN 2: TOP CÁC CÂU TỐT NHẤT\n", " f.write(f\"--- TOP {top_n} MẪU CÓ ĐIỂM CAO NHẤT (MODEL LÀM TỐT) ---\\n\")\n", " for i, sample in enumerate(best_samples[:top_n]):\n", " f.write(f\"\\n[Mẫu tốt số {i+1}]\\n\")\n", " f.write(f\" > ROUGE-L: {sample['rougeL']:.4f} | ROUGE-1: {sample['rouge1']:.4f}\\n\")\n", " f.write(f\" > Tham chiếu (Gold): {sample['reference']}\\n\")\n", " f.write(f\" > Model viết: {sample['generated']}\\n\")\n", " f.write(\"-\" * 30 + \"\\n\")\n", "\n", " print(f\"Đã tạo báo cáo phân tích lỗi tại: {output_path}\")\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 584, "referenced_widgets": [ "73fd47b411a24145a609bab0cf37eaa9", "2c46fcfc647546f2aa0a499518c7f894", "51f3b92a16c14098899b88605e5ae506", "d065f191724f4f07b24698eaa70328f6", "68e125c3d4b74a6db0dc706041cea907", "0705273e51cf4be982faab30bbec3820", "fe96abf3ed3240f39d7a7ad69ebeb38e", "ff2134fadfeb473d97309dfe7d9f2fff", "25227fefc594436ca8d042dedfaeb4ba", "cf9fee299b7c402dab73371707cbc96b", "25e25a2b5db14e65b518c1d9a7934311", "4932c1b71001436d989bff89034d1489", "700e4bfdd19a4da49746b64349825465", "ddeb494331f244f5b7ba3b216ccee9d5", "de7b4e36949140a383cde04f803f66d1", "704c1c9ebc68429e86718a12f6f31e47", "1288103230e143b78db2b655bbaa32a3", "e4d8f471e4f14750987fd1f01326980c", "18a62b050d9541428faead83567beb30", "10861235615d4d76b7baaa40cffbbdef", "cda61cbf54884c3aa98127049dad33bf", "1c8d0998a1cc4d70af29c394c2949506", "15f431ee684f448f897b20b1f4f4a3b1", "a50de1c0ccc443f5a2a9fe9a791b02f6", "afb39d22095b428da144710b43785445", "3ee0b4fe1e5f4226bf25f0d8f301198f", "b1dd37ad15bb4de6982db3c967021ae6", "1fad9d81479d4dc4ae7a0573bd965b93", "bab7bc50bb5e45caad41bcb7f4da2b98", "d967b31fab6b402db5fa1ee4cd382044", "aaccf77482b24a1ba9883d9601ebda81", "b9e7c10eb925420fae0440946a62dcb3", "16b56cb978bf4951abd8c093c65f9c60" ] }, "id": "b6v8VIZVEBsm", "outputId": "9c3944ed-5bd8-4e80-fd26-509e56e3e29b" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--- Loading dataset Vietnews... ---\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:93: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is recommended but still optional to access public models or datasets.\n", " warnings.warn(\n", "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n", "WARNING:huggingface_hub.utils._http:Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "--- Creating test set... ---\n", "Collected 300 samples\n", "Loading model from: OrdinaryAI/visum-qlora-5epochs\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "WARNING:torchao:Skipping import of cpp extensions due to incompatible torch version. Please upgrade to torch >= 2.11.0 (found 2.10.0+cu128).\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "Loading weights: 0%| | 0/517 [00:00