Spaces:

rotemso23
/

dialogue-summarizer

Sleeping

rotemso23 Claude Sonnet 4.6 committed on Apr 16

Commit

cd3c9f3

1 Parent(s): 1954903

Add Phase 4: ROUGE evaluation script and Colab notebook

Compares fine-tuned LoRA adapter vs zero-shot baseline on DialogSum test split.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show

notebooks/evaluate_colab.ipynb +201 -0
src/evaluate.py +279 -0

notebooks/evaluate_colab.ipynb ADDED Viewed

	@@ -0,0 +1,201 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a1000001",
+   "metadata": {},
+   "source": [
+    "# Dialogue Summarizer — Evaluation on Colab T4\n",
+    "\n",
+    "Runs ROUGE evaluation on the DialogSum test split (819 examples).\n",
+    "Compares the fine-tuned LoRA adapter (`rotemso23/dialogsum-phi3-lora`) against the zero-shot baseline.\n",
+    "\n",
+    "**Before running:**\n",
+    "1. Set Runtime → Change runtime type → **T4 GPU**\n",
+    "2. Add your HuggingFace token in the Colab Secrets tab (key icon, name: `HF_TOKEN`)\n",
+    "\n",
+    "**Expected runtime:** ~30–60 minutes (two inference passes over 819 examples)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a1000002",
+   "metadata": {},
+   "source": [
+    "## 1. Verify GPU"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1000003",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "\n",
+    "# Fail fast with an actionable message when the runtime has no GPU attached.\n",
+    "assert torch.cuda.is_available(), \"No GPU found! Set Runtime → Change runtime type → T4 GPU\"\n",
+    "gpu_index = 0\n",
+    "print(f\"GPU: {torch.cuda.get_device_name(gpu_index)}\")\n",
+    "gpu_props = torch.cuda.get_device_properties(gpu_index)\n",
+    "print(f\"VRAM: {gpu_props.total_memory / 1e9:.1f} GB\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a1000004",
+   "metadata": {},
+   "source": [
+    "## 2. Install dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1000005",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Colab already has torch, so we skip it to avoid version conflicts.\n",
+    "# %pip (rather than !pip) installs into the environment the running kernel uses.\n",
+    "%pip install -q \\\n",
+    "    \"datasets>=2.0.0\" \\\n",
+    "    \"transformers>=4.40.0\" \\\n",
+    "    \"peft>=0.19.0\" \\\n",
+    "    \"bitsandbytes>=0.43.0\" \\\n",
+    "    \"accelerate>=0.30.0\" \\\n",
+    "    \"rouge-score==0.1.2\" \\\n",
+    "    \"python-dotenv==1.0.1\"\n",
+    "print(\"Dependencies installed.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a1000006",
+   "metadata": {},
+   "source": [
+    "## 3. Clone the repo"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1000007",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "REPO_URL = \"https://github.com/rotemso23/dialogue-summarizer.git\"\n",
+    "REPO_DIR = \"dialogue-summarizer\"\n",
+    "\n",
+    "# After the chdir below, REPO_DIR no longer exists relative to the cwd, so a plain\n",
+    "# re-run of this cell would clone a nested copy. Detect that case and just pull.\n",
+    "if os.path.basename(os.getcwd()) == REPO_DIR:\n",
+    "    !git pull\n",
+    "else:\n",
+    "    if os.path.exists(REPO_DIR):\n",
+    "        !git -C {REPO_DIR} pull\n",
+    "    else:\n",
+    "        !git clone {REPO_URL}\n",
+    "    os.chdir(REPO_DIR)\n",
+    "\n",
+    "print(f\"Working directory: {os.getcwd()}\")\n",
+    "!ls"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a1000008",
+   "metadata": {},
+   "source": [
+    "## 4. Set HuggingFace token\n",
+    "\n",
+    "Your token needs **read** permissions (write not required for evaluation).  \n",
+    "Get one at: https://huggingface.co/settings/tokens"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1000009",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from getpass import getpass\n",
+    "\n",
+    "from google.colab import userdata\n",
+    "\n",
+    "# Option A: read from Colab Secrets (Secrets tab on the left sidebar → add HF_TOKEN)\n",
+    "try:\n",
+    "    hf_token = userdata.get(\"HF_TOKEN\")\n",
+    "    print(\"HF_TOKEN loaded from Colab Secrets.\")\n",
+    "except Exception as err:\n",
+    "    # Option B: prompt for the token so it is never stored in the notebook itself.\n",
+    "    # (A placeholder value here would only fail later, as an auth error mid-evaluation.)\n",
+    "    print(f\"Could not read HF_TOKEN from Colab Secrets ({type(err).__name__}).\")\n",
+    "    hf_token = getpass(\"Paste your HuggingFace token: \")\n",
+    "\n",
+    "hf_token = (hf_token or \"\").strip()\n",
+    "if not hf_token:\n",
+    "    raise ValueError(\"HF_TOKEN is empty — add it to Colab Secrets or paste it when prompted.\")\n",
+    "os.environ[\"HF_TOKEN\"] = hf_token\n",
+    "\n",
+    "# Write to .env so evaluate.py can find it via python-dotenv\n",
+    "with open(\".env\", \"w\") as f:\n",
+    "    f.write(f\"HF_TOKEN={hf_token}\\n\")\n",
+    "print(\"Token written to .env\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a1000010",
+   "metadata": {},
+   "source": [
+    "## 5. Run evaluation\n",
+    "\n",
+    "This runs two full inference passes over the 819-example test split:\n",
+    "1. Fine-tuned model (`rotemso23/dialogsum-phi3-lora`)\n",
+    "2. Zero-shot baseline (same base model, no adapter)\n",
+    "\n",
+    "Results are saved to `evaluation_results.json`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1000011",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run as a module from the repo root so the `src.*` imports resolve without a hard-coded PYTHONPATH.\n",
+    "!python -m src.evaluate"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a1000012",
+   "metadata": {},
+   "source": [
+    "## 6. View results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1000013",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "# Load the scores written by the evaluation run.\n",
+    "with open(\"evaluation_results.json\") as results_file:\n",
+    "    results = json.load(results_file)\n",
+    "\n",
+    "# Baseline vs. fine-tuned comparison, one row per ROUGE variant.\n",
+    "print(f\"{'Metric':<12} {'Baseline':>10} {'Fine-tuned':>12} {'Delta':>10}\")\n",
+    "print(\"-\" * 52)\n",
+    "for metric in (\"rouge1\", \"rouge2\", \"rougeL\"):\n",
+    "    base_val, ft_val = results[\"baseline\"][metric], results[\"fine_tuned\"][metric]\n",
+    "    print(f\"{metric:<12} {base_val:>10.4f} {ft_val:>12.4f} {ft_val - base_val:>+10.4f}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

src/evaluate.py ADDED Viewed

	@@ -0,0 +1,279 @@

+"""
+src/evaluate.py — ROUGE evaluation: fine-tuned vs. zero-shot baseline on DialogSum test split.
+Loads the fine-tuned LoRA adapter from HuggingFace Hub and the base model (no adapter),
+runs greedy inference on the 819-example test split, computes ROUGE-1/2/L, and saves
+results to evaluation_results.json.
+Run on Colab T4:
+    python src/evaluate.py
+"""
+from __future__ import annotations
+import json
+from typing import Any
+import torch
+from datasets import load_dataset
+from peft import PeftModel
+from rouge_score import rouge_scorer
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from src.data import DATASET_NAME, INSTRUCTION
+from src.model import HUB_REPO, MODEL_ID
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+BATCH_SIZE = 4  # default number of prompts per generate() call in run_inference
+MAX_NEW_TOKENS = 128  # passed to model.generate as max_new_tokens
+NUM_QUALITATIVE = 5  # default number of examples shown by print_qualitative_examples
+OUTPUT_FILE = "evaluation_results.json"  # JSON file main() writes the ROUGE scores to
+# ---------------------------------------------------------------------------
+# Prompt formatting (inference only — user turn, no assistant content)
+# ---------------------------------------------------------------------------
+def format_inference_prompt(dialogue: str, tokenizer: Any) -> str:
+    """
+    Format a dialogue into an inference prompt (user turn only).
+    Uses add_generation_prompt=True so the model continues with the assistant turn.
+    This is the inference-time counterpart of tokenize_and_mask's prompt_text.
+    Args:
+        dialogue: Raw conversation string from the dataset.
+        tokenizer: Phi-3 tokenizer with apply_chat_template support.
+    Returns:
+        Prompt string ending with the assistant generation trigger token.
+    """
+    messages = [
+        {"role": "user", "content": f"{INSTRUCTION}\n\nConversation:\n{dialogue}"}
+    ]
+    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+# ---------------------------------------------------------------------------
+# Model loading helpers
+# ---------------------------------------------------------------------------
+def _load_tokenizer(model_id: str = MODEL_ID) -> Any:
+    """Load tokenizer with left-padding (required for batched generation)."""
+    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    tokenizer.padding_side = "left"
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    return tokenizer
+def _load_base_model(model_id: str = MODEL_ID) -> Any:
+    """Load Phi-3-mini in 4-bit quantization without any LoRA adapter."""
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.float16,
+        bnb_4bit_use_double_quant=True,
+    )
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        quantization_config=bnb_config,
+        device_map="auto",
+        trust_remote_code=False,
+        dtype=torch.float16,
+    )
+    model.eval()
+    return model
+# ---------------------------------------------------------------------------
+# Inference
+# ---------------------------------------------------------------------------
+def run_inference(
+    model: Any,
+    tokenizer: Any,
+    dialogues: list[str],
+    batch_size: int = BATCH_SIZE,
+) -> list[str]:
+    """
+    Run batched greedy inference on a list of dialogues.
+    Formats each dialogue into an inference prompt, tokenizes in batches with
+    left-padding, generates with max_new_tokens=128 and do_sample=False, then
+    strips the prompt prefix from each output to return only the generated summary.
+    Args:
+        model: Loaded causal LM (base model or PeftModel).
+        tokenizer: Matching tokenizer with padding_side='left'.
+        dialogues: List of raw dialogue strings.
+        batch_size: Number of examples per forward pass.
+    Returns:
+        List of generated summary strings, one per dialogue.
+    """
+    prompts = [format_inference_prompt(d, tokenizer) for d in dialogues]
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    all_summaries: list[str] = []
+    for i in tqdm(range(0, len(prompts), batch_size), desc="Inferring"):
+        batch_prompts = prompts[i : i + batch_size]
+        inputs = tokenizer(
+            batch_prompts,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=1024,
+        )
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        input_len = inputs["input_ids"].shape[1]
+        with torch.inference_mode():
+            output_ids = model.generate(
+                **inputs,
+                max_new_tokens=MAX_NEW_TOKENS,
+                do_sample=False,
+                pad_token_id=tokenizer.pad_token_id,
+            )
+        for out in output_ids:
+            generated_ids = out[input_len:]
+            summary = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
+            all_summaries.append(summary)
+    return all_summaries
+# ---------------------------------------------------------------------------
+# ROUGE scoring
+# ---------------------------------------------------------------------------
+def compute_rouge(predictions: list[str], references: list[str]) -> dict[str, float]:
+    """
+    Compute average ROUGE-1, ROUGE-2, and ROUGE-L F-scores.
+    Args:
+        predictions: Generated summaries (one per test example).
+        references: Ground-truth summaries from the dataset.
+    Returns:
+        Dict with keys 'rouge1', 'rouge2', 'rougeL' — mean F-scores in [0, 1].
+    """
+    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
+    totals: dict[str, float] = {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}
+    for pred, ref in zip(predictions, references):
+        scores = scorer.score(ref, pred)
+        totals["rouge1"] += scores["rouge1"].fmeasure
+        totals["rouge2"] += scores["rouge2"].fmeasure
+        totals["rougeL"] += scores["rougeL"].fmeasure
+    n = len(predictions)
+    return {k: v / n for k, v in totals.items()}
+# ---------------------------------------------------------------------------
+# Qualitative display
+# ---------------------------------------------------------------------------
+def print_qualitative_examples(
+    dialogues: list[str],
+    references: list[str],
+    finetuned_preds: list[str],
+    baseline_preds: list[str],
+    n: int = NUM_QUALITATIVE,
+) -> None:
+    """Print n side-by-side examples: dialogue, reference, fine-tuned, baseline."""
+    print("\n" + "=" * 80)
+    print(f"QUALITATIVE EXAMPLES (n={n})")
+    print("=" * 80)
+    for i in range(n):
+        print(f"\n--- Example {i + 1} ---")
+        print(f"[Dialogue]\n{dialogues[i]}\n")
+        print(f"[Reference]\n{references[i]}\n")
+        print(f"[Fine-tuned]\n{finetuned_preds[i]}\n")
+        print(f"[Baseline]\n{baseline_preds[i]}\n")
+        print("-" * 60)
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main() -> None:
+    from dotenv import load_dotenv
+    load_dotenv()
+    print("Loading DialogSum test split...")
+    test_data = load_dataset(DATASET_NAME, split="test")
+    dialogues: list[str] = test_data["dialogue"]
+    references: list[str] = test_data["summary"]
+    print(f"Test examples: {len(dialogues)}")
+    tokenizer = _load_tokenizer()
+    # --- Fine-tuned model ---
+    print(f"\nLoading fine-tuned model from Hub: {HUB_REPO}")
+    base_model = _load_base_model()
+    finetuned_model = PeftModel.from_pretrained(base_model, HUB_REPO)
+    finetuned_model.eval()
+    print("Running fine-tuned inference...")
+    finetuned_preds = run_inference(finetuned_model, tokenizer, dialogues)
+    finetuned_rouge = compute_rouge(finetuned_preds, references)
+    print("\nFine-tuned ROUGE scores:")
+    for k, v in finetuned_rouge.items():
+        print(f"  {k}: {v:.4f}")
+    # Free GPU memory before loading the baseline
+    del finetuned_model
+    del base_model
+    torch.cuda.empty_cache()
+    # --- Baseline model (no adapter) ---
+    print(f"\nLoading baseline model (no adapter): {MODEL_ID}")
+    baseline_model = _load_base_model()
+    print("Running baseline inference...")
+    baseline_preds = run_inference(baseline_model, tokenizer, dialogues)
+    baseline_rouge = compute_rouge(baseline_preds, references)
+    print("\nBaseline ROUGE scores:")
+    for k, v in baseline_rouge.items():
+        print(f"  {k}: {v:.4f}")
+    del baseline_model
+    torch.cuda.empty_cache()
+    # --- Results table ---
+    print("\n" + "=" * 52)
+    print(f"{'Metric':<12} {'Baseline':>10} {'Fine-tuned':>12} {'Delta':>10}")
+    print("-" * 52)
+    for k in ["rouge1", "rouge2", "rougeL"]:
+        base_val = baseline_rouge[k]
+        ft_val = finetuned_rouge[k]
+        delta = ft_val - base_val
+        print(f"{k:<12} {base_val:>10.4f} {ft_val:>12.4f} {delta:>+10.4f}")
+    print("=" * 52)
+    # --- Save results ---
+    results = {
+        "fine_tuned": finetuned_rouge,
+        "baseline": baseline_rouge,
+    }
+    with open(OUTPUT_FILE, "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"\nSaved results to {OUTPUT_FILE}")
+    # --- Qualitative examples ---
+    print_qualitative_examples(dialogues, references, finetuned_preds, baseline_preds)
+if __name__ == "__main__":
+    main()