Upload 3 files
Browse files- eurovoc.py +691 -0
- inference_test.ipynb +386 -0
- train_lora_included.ipynb +687 -0
eurovoc.py
ADDED
|
@@ -0,0 +1,691 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from torch.utils.data import Dataset, DataLoader, IterableDataset
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pytorch_lightning as pl
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
import torch
|
| 7 |
+
from transformers import BertTokenizerFast as BertTokenizer, get_linear_schedule_with_warmup, AutoTokenizer, AutoModel
|
| 8 |
+
import json
|
| 9 |
+
import random
|
| 10 |
+
from collections import Counter
|
| 11 |
+
from tqdm.auto import tqdm
|
| 12 |
+
import gzip
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
import os
|
| 15 |
+
from datasets import load_dataset
|
| 16 |
+
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def save_split_config(train_files, val_files, config_path, metadata=None):
    """
    Persist a train/val split to a JSON file so it can be reused later.

    Args:
        train_files: List of training file paths
        val_files: List of validation file paths
        config_path: Destination path for the configuration JSON
        metadata: Optional dict with additional info (train_ratio, seed, etc.)
    """
    payload = {
        'train_files': train_files,
        'val_files': val_files,
        'num_train_files': len(train_files),
        'num_val_files': len(val_files),
        'metadata': {} if metadata is None else metadata,
    }

    # Make sure the destination directory exists before writing.
    parent_dir = os.path.dirname(config_path) or '.'
    os.makedirs(parent_dir, exist_ok=True)

    with open(config_path, 'w') as f:
        json.dump(payload, f, indent=2)

    print(f"✓ Split configuration saved to {config_path}")
|
| 44 |
+
|
| 45 |
+
def load_split_config(config_path):
    """
    Read a previously saved train/val split configuration.

    Args:
        config_path: Path to the configuration JSON

    Returns:
        Tuple of (train_files, val_files, metadata)
    """
    with open(config_path, 'r') as f:
        cfg = json.load(f)

    print(f"✓ Loaded split configuration from {config_path}")
    print(f" Train files: {cfg['num_train_files']}")
    print(f" Val files: {cfg['num_val_files']}")

    # Older configs may lack the metadata key; default to an empty dict.
    return cfg['train_files'], cfg['val_files'], cfg.get('metadata', {})
|
| 63 |
+
|
| 64 |
+
def get_file_label_stats(jsonl_files):
    """
    Collect the label distribution of every file.

    Every record is counted so that statistics for rare labels are exact.

    Args:
        jsonl_files: List of paths to JSONL files (plain or gzip-compressed)

    Returns:
        Dict mapping each file path to
        {'label_counts': Counter, 'total_records': int}
    """
    file_labels = {}

    print(f"Analyzing {len(jsonl_files)} files...")
    for file_path in tqdm(jsonl_files):
        label_counts = Counter()
        total_records = 0

        # gzip.open in text mode behaves like open, so one call site suffices.
        opener = gzip.open if file_path.endswith('.gz') else open
        with opener(file_path, 'rt', encoding='utf-8') as f:
            for line in f:
                # Skip malformed lines instead of aborting the whole scan.
                # Only JSON parse errors are swallowed: the previous version
                # caught Exception, which also hid real I/O problems.
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                label_counts.update(record.get('eurovoc_ids', []))
                total_records += 1

        file_labels[file_path] = {
            'label_counts': label_counts,
            'total_records': total_records,
        }

    return file_labels
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def smart_split_files(all_jsonl_files, train_ratio=0.92,
                      rare_threshold=0.005, seed=42, verbose=True,
                      save_config_path=None):
    """
    Split files ensuring rare labels appear in training set.

    Files are ranked by how many globally-rare labels they contain and the
    top `train_ratio` fraction is assigned to training, so documents with
    rare labels preferentially land in the training split.

    Args:
        all_jsonl_files: List of all JSONL file paths
        train_ratio: Fraction of files for training (default 0.92)
        rare_threshold: Labels appearing in < this fraction are considered rare
        rare_threshold: Labels whose count is below this fraction of all
            label occurrences are considered rare
        seed: Random seed for reproducibility
        verbose: Print statistics
        save_config_path: If provided, save the split configuration to this path

    Returns:
        Tuple of (train_files, val_files)
    """
    # NOTE(review): the seed is set here, but no random call follows — the
    # split below is fully deterministic given identical rare-label counts.
    # Kept for interface compatibility; confirm whether randomness was intended.
    random.seed(seed)

    if verbose:
        print("Analyzing label distribution across files...")

    file_stats = get_file_label_stats(all_jsonl_files)

    # Calculate which labels are rare globally (aggregate per-file counters).
    global_label_counts = Counter()
    for stats in file_stats.values():
        global_label_counts.update(stats['label_counts'])

    # Identify rare labels: fewer occurrences than rare_threshold of the
    # total number of label assignments across all files.
    total_labels = sum(global_label_counts.values())
    rare_count_threshold = total_labels * rare_threshold
    rare_labels = {label for label, count in global_label_counts.items()
                   if count < rare_count_threshold}

    if verbose:
        print(f"Found {len(rare_labels)} rare labels out of {len(global_label_counts)} total")

    # Score files by number of distinct rare labels they contain.
    file_rare_counts = {}
    for file_path, stats in file_stats.items():
        file_labels_set = set(stats['label_counts'].keys())
        rare_in_file = file_labels_set & rare_labels
        file_rare_counts[file_path] = len(rare_in_file)

    # Sort files by rare label count (descending). Ties keep dict insertion
    # order, i.e. the order of all_jsonl_files.
    sorted_files = sorted(file_rare_counts.items(), key=lambda x: x[1], reverse=True)

    # Calculate split point
    split_idx = int(len(all_jsonl_files) * train_ratio)

    # Assign files: rare-label-rich files go to training.
    train_files = [f for f, _ in sorted_files[:split_idx]]
    val_files = [f for f, _ in sorted_files[split_idx:]]

    # Calculate stats: how many files in each split carry at least one rare label.
    train_rare_count = sum(1 for f in train_files if file_rare_counts[f] > 0)
    val_rare_count = sum(1 for f in val_files if file_rare_counts[f] > 0)

    if verbose:
        print(f"Train files: {len(train_files)} ({train_rare_count} with rare labels)")
        print(f"Val files: {len(val_files)} ({val_rare_count} with rare labels)")

        # Check label coverage: labels present in only one split.
        train_labels = set()
        val_labels = set()
        for f in train_files:
            train_labels.update(file_stats[f]['label_counts'].keys())
        for f in val_files:
            val_labels.update(file_stats[f]['label_counts'].keys())

        labels_only_in_train = train_labels - val_labels
        labels_only_in_val = val_labels - train_labels

        print(f"Labels only in train: {len(labels_only_in_train)}")
        print(f"Labels only in val: {len(labels_only_in_val)}")
        # Labels never seen during training cannot be learned — warn loudly.
        if len(labels_only_in_val) > 0:
            print(f"⚠️ WARNING: {len(labels_only_in_val)} labels appear only in validation!")

    # Save configuration if path provided, so the split can be reloaded later.
    if save_config_path:
        metadata = {
            'train_ratio': train_ratio,
            'rare_threshold': rare_threshold,
            'seed': seed,
            'total_files': len(all_jsonl_files),
            'num_rare_labels': len(rare_labels),
            'num_total_labels': len(global_label_counts),
            'train_rare_count': train_rare_count,
            'val_rare_count': val_rare_count
        }
        save_split_config(train_files, val_files, save_config_path, metadata)

    return train_files, val_files
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
class EurovocDataset(Dataset):
    """Map-style dataset: tokenizes one document per item on access."""

    def __init__(
        self,
        text: np.array,
        labels: np.array,
        tokenizer: BertTokenizer,
        max_token_len: int = 128
    ):
        self.tokenizer = tokenizer
        self.text = text
        self.labels = labels
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index: int):
        # Each row of `text` is assumed to hold the document string at
        # position 0 (e.g. a 2-D array with one column) — confirm with callers.
        sample_text = self.text[index][0]
        sample_labels = self.labels[index]

        encoded = self.tokenizer.encode_plus(
            sample_text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Flatten drops the batch dimension the tokenizer adds for a single text.
        return dict(
            text=sample_text,
            input_ids=encoded["input_ids"].flatten(),
            attention_mask=encoded["attention_mask"].flatten(),
            labels=torch.FloatTensor(sample_labels)
        )
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
class StreamingEurovocDataset(IterableDataset):
    """
    Streaming dataset that avoids loading everything into memory:
    records are read from disk and processed one at a time.
    """

    def __init__(self, jsonl_files, mlb, tokenizer, max_token_len=512, split='train'):
        self.jsonl_files = jsonl_files
        self.mlb = mlb
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        self.split = split

    def __iter__(self):
        # HF datasets streams JSONL lazily; 'train' here is just the default
        # split name for ad-hoc json loading, unrelated to self.split.
        stream = load_dataset(
            'json',
            data_files=self.jsonl_files,
            streaming=True,
            split='train'
        )

        for record in stream:
            text = record.get('text')
            eurovoc_ids = record.get('eurovoc_ids', [])

            # Drop records that have no text or no labels.
            if not text or not eurovoc_ids:
                continue

            # Binarize the EuroVoc concept ids with the fitted binarizer.
            labels = self.mlb.transform([eurovoc_ids])[0]

            encoded = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=self.max_token_len,
                return_token_type_ids=False,
                padding="max_length",
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )

            yield {
                'input_ids': encoded["input_ids"].flatten(),
                'attention_mask': encoded["attention_mask"].flatten(),
                'labels': torch.FloatTensor(labels)
            }
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
class EuroVocLongTextDataset(Dataset):
    """
    Dataset for documents longer than the model's context window.

    Each document is split into chunks of at most ``max_token_len`` words;
    every chunk becomes its own sample carrying the full document's labels.
    All chunks are tokenized eagerly in ``__init__``.
    """

    @staticmethod
    def _split_words(text, max_length):
        """Yield consecutive chunks of *text*, ``max_length`` words each."""
        words = text.split()
        for i in range(0, len(words), max_length):
            # Re-join so the tokenizer receives a plain string, not a word list.
            yield " ".join(words[i:i + max_length])

    def __init__(
        self,
        text: np.array,
        labels: np.array,
        tokenizer: BertTokenizer,
        max_token_len: int = 128
    ):
        self.tokenizer = tokenizer
        self.text = text
        self.labels = labels
        self.max_token_len = max_token_len

        # Bug fixes vs. the previous version:
        #  * the splitter was declared without `self` yet called as a bound
        #    method and without a chunk size, so it crashed on first use;
        #  * it yielded lists of words, which encode_plus cannot handle;
        #  * the batch encoding was indexed as encoding[i][key] instead of
        #    encoding[key][i].
        self.chunks_and_labels = [
            (chunk, label)
            for doc, label in zip(self.text, self.labels)
            for chunk in self._split_words(doc, self.max_token_len)
        ]

        self.encoding = self.tokenizer.batch_encode_plus(
            [chunk for chunk, _ in self.chunks_and_labels],
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

    def __len__(self):
        return len(self.chunks_and_labels)

    def __getitem__(self, index: int):
        chunk, labels = self.chunks_and_labels[index]

        return dict(
            text=chunk,
            input_ids=self.encoding["input_ids"][index].flatten(),
            attention_mask=self.encoding["attention_mask"][index].flatten(),
            labels=torch.FloatTensor(labels)
        )
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
class EurovocDataModule(pl.LightningDataModule):
    """Lightning data module wrapping map-style EurovocDataset splits."""

    def __init__(self, bert_model_name, x_tr, y_tr, x_test, y_test, batch_size=8, max_token_len=512):
        super().__init__()
        self.batch_size = batch_size
        self.x_tr = x_tr
        self.y_tr = y_tr
        self.x_test = x_test
        self.y_test = y_test
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
        self.max_token_len = max_token_len

    def setup(self, stage=None):
        # The test split doubles as the validation split (see val_dataloader).
        self.train_dataset = EurovocDataset(
            self.x_tr, self.y_tr, self.tokenizer, self.max_token_len
        )
        self.test_dataset = EurovocDataset(
            self.x_test, self.y_test, self.tokenizer, self.max_token_len
        )

    def train_dataloader(self):
        # Only the training loader shuffles.
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=2
        )

    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=2
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=2
        )
|
| 388 |
+
|
| 389 |
+
class StreamingEurovocDataModule(pl.LightningDataModule):
    """
    DataModule built on streaming datasets.

    Supports both random and smart (stratified) file splitting and can
    load a pre-computed split from a JSON config file.
    """

    def __init__(self, bert_model_name, all_jsonl_files, mlb,
                 batch_size=64, max_token_len=512,
                 train_ratio=0.92, rare_threshold=0.005,
                 split_strategy='smart',
                 split_config_path="../eurovoc_data/train_val_split_config.json",
                 save_split_config_path="../eurovoc_data/train_val_split_config.json"):
        """
        Args:
            bert_model_name: Name of the BERT model to use
            all_jsonl_files: List of all JSONL file paths (ignored if split_config_path exists)
            mlb: Fitted MultiLabelBinarizer
            batch_size: Batch size for dataloaders
            max_token_len: Maximum token length for tokenization
            train_ratio: Fraction of files for training
            rare_threshold: Threshold for rare label identification
            split_strategy: 'random' or 'smart'
            split_config_path: Path to existing split config JSON (loaded if present)
            save_split_config_path: Path to save a newly created split config JSON
        """
        super().__init__()
        self.batch_size = batch_size
        self.mlb = mlb
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
        self.max_token_len = max_token_len

        # Option 1: reuse an existing split so runs are reproducible.
        if split_config_path and os.path.exists(split_config_path):
            print(f"Loading split from existing config: {split_config_path}")
            self.train_files, self.val_files, metadata = load_split_config(split_config_path)
            if metadata:
                print(f"Split metadata: {metadata}")

        # Option 2: create a new split with the requested strategy.
        elif split_strategy == 'smart':
            print("Using smart split strategy (ensuring rare label coverage)...")
            self.train_files, self.val_files = smart_split_files(
                all_jsonl_files,
                train_ratio=train_ratio,
                rare_threshold=rare_threshold,
                save_config_path=save_split_config_path
            )
        elif split_strategy == 'random':
            print("Using random split strategy...")
            # Fix: shuffle a copy instead of random.shuffle(all_jsonl_files),
            # which mutated the caller's list in place.
            shuffled = random.sample(all_jsonl_files, len(all_jsonl_files))

            split_idx = int(len(shuffled) * train_ratio)
            self.train_files = shuffled[:split_idx]
            self.val_files = shuffled[split_idx:]

            print(f"Train files: {len(self.train_files)}")
            print(f"Val files: {len(self.val_files)}")

            # Save config if requested so the split can be reloaded later.
            if save_split_config_path:
                metadata = {
                    'train_ratio': train_ratio,
                    'split_strategy': 'random',
                    'total_files': len(all_jsonl_files)
                }
                save_split_config(self.train_files, self.val_files,
                                  save_split_config_path, metadata)
        else:
            raise ValueError(f"Unknown split_strategy: {split_strategy}. Use 'random' or 'smart'")

    def setup(self, stage=None):
        # Both splits share tokenizer and binarizer; only the file lists differ.
        self.train_dataset = StreamingEurovocDataset(
            self.train_files,
            self.mlb,
            self.tokenizer,
            self.max_token_len
        )

        self.val_dataset = StreamingEurovocDataset(
            self.val_files,
            self.mlb,
            self.tokenizer,
            self.max_token_len
        )

    def train_dataloader(self):
        # No shuffle: IterableDataset order is defined by the stream itself.
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            num_workers=4,
            pin_memory=True
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            num_workers=4,
            pin_memory=True
        )
|
| 490 |
+
|
| 491 |
+
|
| 492 |
+
class EurovocTagger(pl.LightningModule):
    """BERT multi-label classifier; outputs sigmoid probabilities, trained with BCELoss."""

    def __init__(self, bert_model_name, n_classes, lr=2e-5, eps=1e-8):
        super().__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(p=0.2)
        self.classifier1 = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.criterion = nn.BCELoss()
        self.lr = lr
        self.eps = eps

    def forward(self, input_ids, attention_mask, labels=None):
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        logits = self.classifier1(self.dropout(encoded.pooler_output))
        probs = torch.sigmoid(logits)
        # Loss defaults to 0 when no labels are given (pure inference).
        loss = 0
        if labels is not None:
            loss = self.criterion(probs, labels)
        return loss, probs

    def _shared_step(self, batch):
        """Run one forward pass on a batch dict; returns (loss, outputs)."""
        return self(batch["input_ids"], batch["attention_mask"], batch["labels"])

    def training_step(self, batch, batch_idx):
        loss, outputs = self._shared_step(batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": batch["labels"]}

    def validation_step(self, batch, batch_idx):
        loss, _ = self._shared_step(batch)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        loss, _ = self._shared_step(batch)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr, eps=self.eps)
|
| 540 |
+
|
| 541 |
+
|
| 542 |
+
class EurovocTaggerBCELogit(pl.LightningModule):
    """BERT multi-label classifier; outputs raw logits, trained with BCEWithLogitsLoss."""

    def __init__(self, bert_model_name, n_classes, lr=2e-5, eps=1e-8):
        super().__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(p=0.2)
        self.classifier1 = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.criterion = nn.BCEWithLogitsLoss()
        self.lr = lr
        self.eps = eps

    def forward(self, input_ids, attention_mask, labels=None):
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        logits = self.classifier1(self.dropout(encoded.pooler_output))
        loss = 0
        if labels is not None:
            # The sigmoid lives inside BCEWithLogitsLoss; logits go in raw.
            loss = self.criterion(logits, labels)
        return loss, logits

    def _run_batch(self, batch):
        """Forward one batch dict; returns (loss, logits)."""
        return self(batch["input_ids"], batch["attention_mask"], batch["labels"])

    def training_step(self, batch, batch_idx):
        loss, logits = self._run_batch(batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": logits, "labels": batch["labels"]}

    def validation_step(self, batch, batch_idx):
        loss, _ = self._run_batch(batch)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        loss, _ = self._run_batch(batch)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr, eps=self.eps)
|
| 588 |
+
|
| 589 |
+
|
| 590 |
+
class EurovocTaggerLoRA(pl.LightningModule):
    """
    BERT multi-label classifier fine-tuned with LoRA adapters.

    Only the LoRA matrices (injected into attention query/value projections)
    and the two-layer classification head are trained; the base BERT weights
    stay frozen. Trained with BCEWithLogitsLoss on raw logits.
    """

    def __init__(self, bert_model_name, n_classes, n_intermediate=256, lr=2e-5, eps=1e-8, lora_r=8, lora_alpha=16, lora_dropout=0.1):
        """
        Args:
            bert_model_name: HF model id/path for the base encoder
            n_classes: Number of output labels
            n_intermediate: Width of the bottleneck layer in the head
            lr: AdamW learning rate
            eps: AdamW epsilon
            lora_r: LoRA rank (smaller = fewer trainable params)
            lora_alpha: LoRA scaling factor
            lora_dropout: Dropout applied inside the LoRA layers
        """
        super().__init__()

        # Load base BERT model
        self.bert = AutoModel.from_pretrained(bert_model_name)

        # Configure LoRA
        # Target modules: query and value projection layers in attention
        lora_config = LoraConfig(
            r=lora_r,  # Rank of the low-rank matrices (smaller = fewer params)
            lora_alpha=lora_alpha,  # Scaling factor
            target_modules=["query", "value"],  # Which layers to apply LoRA to
            lora_dropout=lora_dropout,
            bias="none",
            task_type=TaskType.FEATURE_EXTRACTION  # For getting embeddings
        )

        # Apply LoRA to BERT (wraps the model in a PeftModel)
        self.bert = get_peft_model(self.bert, lora_config)

        # Print trainable parameters info
        self.bert.print_trainable_parameters()

        # Hierarchical classification head for many labels:
        # instead of hidden_size -> n_classes directly (e.g. 768 -> 6800,
        # ~5.2M params), use a bottleneck hidden_size -> n_intermediate ->
        # n_classes (e.g. 768 -> 256 -> 6800, ~1.9M params).
        hidden_size = self.bert.config.hidden_size

        self.dropout1 = nn.Dropout(p=0.2)

        # Layer 1: compress to intermediate representation
        self.classifier1 = nn.Linear(hidden_size, n_intermediate)
        self.relu = nn.ReLU()

        self.dropout2 = nn.Dropout(p=0.2)

        # Layer 2: expand to all labels
        self.classifier2 = nn.Linear(n_intermediate, n_classes)

        self.criterion = nn.BCEWithLogitsLoss()
        self.lr = lr
        self.eps = eps

    def forward(self, input_ids, attention_mask, labels=None):
        """Return (loss, logits); loss is 0 when labels is None (inference)."""
        # Forward pass through LoRA-enhanced BERT
        output = self.bert(input_ids, attention_mask=attention_mask)

        # Pooled output (CLS token representation): (batch, hidden_size)
        output = self.dropout1(output.pooler_output)

        # Hierarchical classifier: (batch, n_intermediate)
        output = self.classifier1(output)
        output = self.relu(output)

        output = self.dropout2(output)
        # (batch, n_classes) raw logits — BCEWithLogitsLoss applies the sigmoid
        output = self.classifier2(output)

        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        # AdamW over all parameters; frozen base weights have requires_grad
        # False after get_peft_model, so only LoRA + head actually update.
        return torch.optim.AdamW(self.parameters(), lr=self.lr, eps=self.eps)

    def save_lora_adapter(self, path):
        """Save only the LoRA adapter weights"""
        self.bert.save_pretrained(path)

    def load_lora_adapter(self, path):
        """Load LoRA adapter weights"""
        self.bert = PeftModel.from_pretrained(self.bert, path)
|
inference_test.ipynb
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "11ab9cd5-a6e4-416a-b44f-201e8bf8ee84",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"## Test inference"
|
| 9 |
+
]
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"cell_type": "code",
|
| 13 |
+
"execution_count": 5,
|
| 14 |
+
"id": "40523be3-6ec7-4cac-aa90-6b5177c0f07d",
|
| 15 |
+
"metadata": {
|
| 16 |
+
"tags": []
|
| 17 |
+
},
|
| 18 |
+
"outputs": [],
|
| 19 |
+
"source": [
|
| 20 |
+
"from pdfminer.high_level import extract_text"
|
| 21 |
+
]
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"cell_type": "code",
|
| 25 |
+
"execution_count": 26,
|
| 26 |
+
"id": "c0e5cc3f-5a9d-4b0f-8f7c-d46c0f79b5df",
|
| 27 |
+
"metadata": {
|
| 28 |
+
"tags": []
|
| 29 |
+
},
|
| 30 |
+
"outputs": [
|
| 31 |
+
{
|
| 32 |
+
"name": "stderr",
|
| 33 |
+
"output_type": "stream",
|
| 34 |
+
"text": [
|
| 35 |
+
"Cannot set gray non-stroke color because /'P3954' is an invalid float value\n"
|
| 36 |
+
]
|
| 37 |
+
}
|
| 38 |
+
],
|
| 39 |
+
"source": [
|
| 40 |
+
"text = extract_text(\"./example_docs_for_inference/publication_climate.pdf\")"
|
| 41 |
+
]
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"cell_type": "code",
|
| 45 |
+
"execution_count": 27,
|
| 46 |
+
"id": "120528e3-26b9-40ce-ac8c-3c30c3092d28",
|
| 47 |
+
"metadata": {
|
| 48 |
+
"tags": []
|
| 49 |
+
},
|
| 50 |
+
"outputs": [
|
| 51 |
+
{
|
| 52 |
+
"name": "stdout",
|
| 53 |
+
"output_type": "stream",
|
| 54 |
+
"text": [
|
| 55 |
+
"ISSN 1831-9424 \n",
|
| 56 |
+
"\n",
|
| 57 |
+
"How to plan mitigation, adaptatio\n"
|
| 58 |
+
]
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"source": [
|
| 62 |
+
"print(text[0:50])"
|
| 63 |
+
]
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"cell_type": "code",
|
| 67 |
+
"execution_count": 9,
|
| 68 |
+
"id": "d191928f-381e-4da3-8342-1300909b52c5",
|
| 69 |
+
"metadata": {
|
| 70 |
+
"tags": []
|
| 71 |
+
},
|
| 72 |
+
"outputs": [
|
| 73 |
+
{
|
| 74 |
+
"name": "stderr",
|
| 75 |
+
"output_type": "stream",
|
| 76 |
+
"text": [
|
| 77 |
+
"/home/mbarhdadi/projects/training/eurovoc_training_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 78 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"name": "stdout",
|
| 83 |
+
"output_type": "stream",
|
| 84 |
+
"text": [
|
| 85 |
+
"Model loaded. Ready to predict 6958 eurovoc labels.\n"
|
| 86 |
+
]
|
| 87 |
+
}
|
| 88 |
+
],
|
| 89 |
+
"source": [
|
| 90 |
+
"import pickle\n",
|
| 91 |
+
"from transformers import AutoTokenizer, AutoModel\n",
|
| 92 |
+
"from eurovoc import EurovocTagger\n",
|
| 93 |
+
"\n",
|
| 94 |
+
"# Load MLBinarizer\n",
|
| 95 |
+
"with open('./models_finetuned/latest/mlb.pickle', 'rb') as f:\n",
|
| 96 |
+
" mlb = pickle.load(f)\n",
|
| 97 |
+
"\n",
|
| 98 |
+
"# Load tokenizer\n",
|
| 99 |
+
"BERT_MODEL_NAME = \"nlpaueb/legal-bert-base-uncased\"\n",
|
| 100 |
+
"tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)\n",
|
| 101 |
+
"\n",
|
| 102 |
+
"# Load trained model\n",
|
| 103 |
+
"checkpoint_path = \"./models_finetuned/latest/EurovocTaggerFP32-epoch=04-val_loss=0.00.ckpt\" \n",
|
| 104 |
+
"model = EurovocTagger.load_from_checkpoint(\n",
|
| 105 |
+
" checkpoint_path,\n",
|
| 106 |
+
" bert_model_name=BERT_MODEL_NAME,\n",
|
| 107 |
+
" n_classes=len(mlb.classes_)\n",
|
| 108 |
+
")\n",
|
| 109 |
+
"\n",
|
| 110 |
+
"\n",
|
| 111 |
+
"print(f\"Model loaded. Ready to predict {len(mlb.classes_)} eurovoc labels.\")"
|
| 112 |
+
]
|
| 113 |
+
},
|
| 114 |
+
{
|
| 115 |
+
"cell_type": "code",
|
| 116 |
+
"execution_count": 15,
|
| 117 |
+
"id": "7a1fd7e6-e14d-4c24-97ae-abcd5a30ab71",
|
| 118 |
+
"metadata": {
|
| 119 |
+
"tags": []
|
| 120 |
+
},
|
| 121 |
+
"outputs": [],
|
| 122 |
+
"source": [
|
| 123 |
+
"def get_eurovoc_id_to_term_mapping():\n",
|
| 124 |
+
" \"\"\"\n",
|
| 125 |
+
" Create a mapping from eurovoc IDs to their human-readable terms.\n",
|
| 126 |
+
" \n",
|
| 127 |
+
" Returns:\n",
|
| 128 |
+
" Dict mapping eurovoc_id -> term_name\n",
|
| 129 |
+
" \"\"\"\n",
|
| 130 |
+
" import requests\n",
|
| 131 |
+
" import xmltodict\n",
|
| 132 |
+
" \n",
|
| 133 |
+
" eurovoc_id_to_term = {}\n",
|
| 134 |
+
" \n",
|
| 135 |
+
" response = requests.get(\n",
|
| 136 |
+
" 'http://publications.europa.eu/resource/dataset/eurovoc',\n",
|
| 137 |
+
" headers={\n",
|
| 138 |
+
" 'Accept': 'application/xml',\n",
|
| 139 |
+
" 'Accept-Language': 'en',\n",
|
| 140 |
+
" 'User-Agent': 'Mozilla/5.0'\n",
|
| 141 |
+
" }\n",
|
| 142 |
+
" )\n",
|
| 143 |
+
" \n",
|
| 144 |
+
" data = xmltodict.parse(response.content)\n",
|
| 145 |
+
" \n",
|
| 146 |
+
" for term in data['xs:schema']['xs:simpleType']['xs:restriction']['xs:enumeration']:\n",
|
| 147 |
+
" try:\n",
|
| 148 |
+
" name = term['xs:annotation']['xs:documentation'].split('/')[0].strip()\n",
|
| 149 |
+
" eurovoc_id = term['@value'].split(':')[1]\n",
|
| 150 |
+
" \n",
|
| 151 |
+
" # Map ID -> term \n",
|
| 152 |
+
" eurovoc_id_to_term[eurovoc_id] = {\n",
|
| 153 |
+
" 'original': name,\n",
|
| 154 |
+
" 'lowercase': name.lower()\n",
|
| 155 |
+
" }\n",
|
| 156 |
+
" except (KeyError, IndexError) as e:\n",
|
| 157 |
+
" print(f\"⚠️ Could not parse term: {term}\")\n",
|
| 158 |
+
" \n",
|
| 159 |
+
" print(f\"✓ Loaded {len(eurovoc_id_to_term)} eurovoc terms\")\n",
|
| 160 |
+
" return eurovoc_id_to_term"
|
| 161 |
+
]
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"cell_type": "code",
|
| 165 |
+
"execution_count": 23,
|
| 166 |
+
"id": "d2b703ea-ca41-4353-8776-1a226f02c56b",
|
| 167 |
+
"metadata": {
|
| 168 |
+
"tags": []
|
| 169 |
+
},
|
| 170 |
+
"outputs": [
|
| 171 |
+
{
|
| 172 |
+
"name": "stdout",
|
| 173 |
+
"output_type": "stream",
|
| 174 |
+
"text": [
|
| 175 |
+
"Loading Eurovoc terms...\n",
|
| 176 |
+
"✓ Loaded 7488 eurovoc terms\n"
|
| 177 |
+
]
|
| 178 |
+
}
|
| 179 |
+
],
|
| 180 |
+
"source": [
|
| 181 |
+
"print(\"Loading Eurovoc terms...\")\n",
|
| 182 |
+
"eurovoc_id_to_term = get_eurovoc_id_to_term_mapping()\n"
|
| 183 |
+
]
|
| 184 |
+
},
|
| 185 |
+
{
|
| 186 |
+
"cell_type": "code",
|
| 187 |
+
"execution_count": 24,
|
| 188 |
+
"id": "7a5fed81-64e8-4454-a56b-73eb50676b75",
|
| 189 |
+
"metadata": {
|
| 190 |
+
"tags": []
|
| 191 |
+
},
|
| 192 |
+
"outputs": [],
|
| 193 |
+
"source": [
|
| 194 |
+
"import torch\n",
|
| 195 |
+
"import numpy as np\n",
|
| 196 |
+
"from transformers import AutoTokenizer\n",
|
| 197 |
+
"\n",
|
| 198 |
+
"def predict_eurovoc_labels(text, model, mlb, tokenizer, \n",
|
| 199 |
+
" eurovoc_id_to_term=None,\n",
|
| 200 |
+
" max_token_len=512, \n",
|
| 201 |
+
" threshold=0.5, \n",
|
| 202 |
+
" top_k=10,\n",
|
| 203 |
+
" device='cuda'):\n",
|
| 204 |
+
" model.eval()\n",
|
| 205 |
+
" model.to(device)\n",
|
| 206 |
+
" \n",
|
| 207 |
+
" # Tokenize\n",
|
| 208 |
+
" encoding = tokenizer.encode_plus(\n",
|
| 209 |
+
" text,\n",
|
| 210 |
+
" add_special_tokens=True,\n",
|
| 211 |
+
" max_length=max_token_len,\n",
|
| 212 |
+
" return_token_type_ids=False,\n",
|
| 213 |
+
" padding=\"max_length\",\n",
|
| 214 |
+
" truncation=True,\n",
|
| 215 |
+
" return_attention_mask=True,\n",
|
| 216 |
+
" return_tensors='pt',\n",
|
| 217 |
+
" )\n",
|
| 218 |
+
" \n",
|
| 219 |
+
" input_ids = encoding[\"input_ids\"].to(device)\n",
|
| 220 |
+
" attention_mask = encoding[\"attention_mask\"].to(device)\n",
|
| 221 |
+
" \n",
|
| 222 |
+
" # Predict\n",
|
| 223 |
+
" with torch.no_grad():\n",
|
| 224 |
+
" _, outputs = model(input_ids, attention_mask)\n",
|
| 225 |
+
" \n",
|
| 226 |
+
"\n",
|
| 227 |
+
" probabilities = outputs\n",
|
| 228 |
+
" \n",
|
| 229 |
+
" probabilities = probabilities.cpu().numpy()[0]\n",
|
| 230 |
+
" \n",
|
| 231 |
+
" # Helper function to enrich labels with terms\n",
|
| 232 |
+
" def enrich_labels(label_ids, probs):\n",
|
| 233 |
+
" \"\"\"Add human-readable terms to eurovoc IDs\"\"\"\n",
|
| 234 |
+
" enriched = []\n",
|
| 235 |
+
" for label_id, prob in zip(label_ids, probs):\n",
|
| 236 |
+
" entry = {\n",
|
| 237 |
+
" 'eurovoc_id': label_id,\n",
|
| 238 |
+
" 'probability': float(prob)\n",
|
| 239 |
+
" }\n",
|
| 240 |
+
" \n",
|
| 241 |
+
" # Add term if mapping available\n",
|
| 242 |
+
" if eurovoc_id_to_term and label_id in eurovoc_id_to_term:\n",
|
| 243 |
+
" entry['term'] = eurovoc_id_to_term[label_id]['original']\n",
|
| 244 |
+
" entry['term_lower'] = eurovoc_id_to_term[label_id]['lowercase']\n",
|
| 245 |
+
" else:\n",
|
| 246 |
+
" entry['term'] = None\n",
|
| 247 |
+
" entry['term_lower'] = None\n",
|
| 248 |
+
" \n",
|
| 249 |
+
" enriched.append(entry)\n",
|
| 250 |
+
" \n",
|
| 251 |
+
" return enriched\n",
|
| 252 |
+
" \n",
|
| 253 |
+
" # Get predictions above threshold\n",
|
| 254 |
+
" predicted_indices = np.where(probabilities >= threshold)[0]\n",
|
| 255 |
+
" predicted_labels = mlb.classes_[predicted_indices]\n",
|
| 256 |
+
" predicted_probs = probabilities[predicted_indices]\n",
|
| 257 |
+
" \n",
|
| 258 |
+
" # Get top-k predictions\n",
|
| 259 |
+
" top_k_indices = np.argsort(probabilities)[-top_k:][::-1]\n",
|
| 260 |
+
" top_k_labels = mlb.classes_[top_k_indices]\n",
|
| 261 |
+
" top_k_probs = probabilities[top_k_indices]\n",
|
| 262 |
+
" \n",
|
| 263 |
+
" return {\n",
|
| 264 |
+
" 'above_threshold': {\n",
|
| 265 |
+
" 'predictions': enrich_labels(predicted_labels, predicted_probs),\n",
|
| 266 |
+
" 'count': len(predicted_labels)\n",
|
| 267 |
+
" },\n",
|
| 268 |
+
" 'top_k': {\n",
|
| 269 |
+
" 'predictions': enrich_labels(top_k_labels, top_k_probs)\n",
|
| 270 |
+
" }\n",
|
| 271 |
+
" }"
|
| 272 |
+
]
|
| 273 |
+
},
|
| 274 |
+
{
|
| 275 |
+
"cell_type": "code",
|
| 276 |
+
"execution_count": 28,
|
| 277 |
+
"id": "030b99aa-edc7-472a-8c7f-636a47a9cdce",
|
| 278 |
+
"metadata": {
|
| 279 |
+
"tags": []
|
| 280 |
+
},
|
| 281 |
+
"outputs": [
|
| 282 |
+
{
|
| 283 |
+
"name": "stdout",
|
| 284 |
+
"output_type": "stream",
|
| 285 |
+
"text": [
|
| 286 |
+
"Document length: 696483 characters\n",
|
| 287 |
+
"Truncated to: 2048 tokens (~2048 chars)\n",
|
| 288 |
+
"\n",
|
| 289 |
+
"Running inference...\n",
|
| 290 |
+
"\n",
|
| 291 |
+
"================================================================================\n",
|
| 292 |
+
"TOP 15 PREDICTED EUROVOC LABELS (with terms)\n",
|
| 293 |
+
"================================================================================\n",
|
| 294 |
+
"642 | energy saving | 0.8567\n",
|
| 295 |
+
"6700 | energy efficiency | 0.7060\n",
|
| 296 |
+
"2281 | poverty | 0.4645\n",
|
| 297 |
+
"5311 | user guide | 0.4198\n",
|
| 298 |
+
"2498 | energy policy | 0.3545\n",
|
| 299 |
+
"5482 | climate change | 0.1736\n",
|
| 300 |
+
"754 | renewable energy | 0.1338\n",
|
| 301 |
+
"6400 | reduction of gas emissions | 0.1321\n",
|
| 302 |
+
"2517 | social policy | 0.1260\n",
|
| 303 |
+
"475 | energy distribution | 0.1253\n",
|
| 304 |
+
"5188 | information technology | 0.1087\n",
|
| 305 |
+
"2715 | energy production | 0.1087\n",
|
| 306 |
+
"2451 | EU policy | 0.0812\n",
|
| 307 |
+
"4139 | serial publication | 0.0808\n",
|
| 308 |
+
"83 | living conditions | 0.0793\n",
|
| 309 |
+
"\n",
|
| 310 |
+
"5 labels above threshold (0.3)\n",
|
| 311 |
+
"\n",
|
| 312 |
+
"================================================================================\n",
|
| 313 |
+
"PREDICTIONS ABOVE THRESHOLD (with readable terms)\n",
|
| 314 |
+
"================================================================================\n",
|
| 315 |
+
"2281 | poverty | 0.4645\n",
|
| 316 |
+
"2498 | energy policy | 0.3545\n",
|
| 317 |
+
"5311 | user guide | 0.4198\n",
|
| 318 |
+
"642 | energy saving | 0.8567\n",
|
| 319 |
+
"6700 | energy efficiency | 0.7060\n"
|
| 320 |
+
]
|
| 321 |
+
}
|
| 322 |
+
],
|
| 323 |
+
"source": [
|
| 324 |
+
"print(f\"Document length: {len(text)} characters\")\n",
|
| 325 |
+
"print(f\"Truncated to: 512 tokens (~2048 chars)\\n\") \n",
|
| 326 |
+
"\n",
|
| 327 |
+
"print(\"Running inference...\\n\")\n",
|
| 328 |
+
"results = predict_eurovoc_labels(\n",
|
| 329 |
+
" text=text,\n",
|
| 330 |
+
" model=model,\n",
|
| 331 |
+
" mlb=mlb,\n",
|
| 332 |
+
" tokenizer=tokenizer,\n",
|
| 333 |
+
" eurovoc_id_to_term=eurovoc_id_to_term, # ← Pass the mapping\n",
|
| 334 |
+
" threshold=0.3,\n",
|
| 335 |
+
" top_k=15\n",
|
| 336 |
+
")\n",
|
| 337 |
+
"print(\"=\" * 80)\n",
|
| 338 |
+
"print(\"TOP 15 PREDICTED EUROVOC LABELS\")\n",
|
| 339 |
+
"print(\"=\" * 80)\n",
|
| 340 |
+
"\n",
|
| 341 |
+
"for pred in results['top_k']['predictions']:\n",
|
| 342 |
+
" term = pred['term'] if pred['term'] else \"(term not found)\"\n",
|
| 343 |
+
" print(f\"{pred['eurovoc_id']:15s} | {term:45s} | {pred['probability']:.4f}\")\n",
|
| 344 |
+
"\n",
|
| 345 |
+
"print(f\"\\n{results['above_threshold']['count']} labels above threshold (0.3)\")\n",
|
| 346 |
+
"\n",
|
| 347 |
+
"print(\"\\n\" + \"=\" * 80)\n",
|
| 348 |
+
"print(\"PREDICTIONS ABOVE THRESHOLD\")\n",
|
| 349 |
+
"print(\"=\" * 80)\n",
|
| 350 |
+
"\n",
|
| 351 |
+
"for pred in results['above_threshold']['predictions']:\n",
|
| 352 |
+
" if pred['term']: # Only show if term was found\n",
|
| 353 |
+
" print(f\"{pred['eurovoc_id']:15s} | {pred['term']:45s} | {pred['probability']:.4f}\")"
|
| 354 |
+
]
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"cell_type": "code",
|
| 358 |
+
"execution_count": null,
|
| 359 |
+
"id": "27ebc73c-5832-4702-bc1e-dd026ebeed02",
|
| 360 |
+
"metadata": {},
|
| 361 |
+
"outputs": [],
|
| 362 |
+
"source": []
|
| 363 |
+
}
|
| 364 |
+
],
|
| 365 |
+
"metadata": {
|
| 366 |
+
"kernelspec": {
|
| 367 |
+
"display_name": "eurovoc_training_env",
|
| 368 |
+
"language": "python",
|
| 369 |
+
"name": "eurovoc_training_env"
|
| 370 |
+
},
|
| 371 |
+
"language_info": {
|
| 372 |
+
"codemirror_mode": {
|
| 373 |
+
"name": "ipython",
|
| 374 |
+
"version": 3
|
| 375 |
+
},
|
| 376 |
+
"file_extension": ".py",
|
| 377 |
+
"mimetype": "text/x-python",
|
| 378 |
+
"name": "python",
|
| 379 |
+
"nbconvert_exporter": "python",
|
| 380 |
+
"pygments_lexer": "ipython3",
|
| 381 |
+
"version": "3.10.12"
|
| 382 |
+
}
|
| 383 |
+
},
|
| 384 |
+
"nbformat": 4,
|
| 385 |
+
"nbformat_minor": 5
|
| 386 |
+
}
|
train_lora_included.ipynb
ADDED
|
@@ -0,0 +1,687 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "3dc740a0-1865-40da-a163-b858f29d1313",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# 🇪🇺 🏷️ Eurovoc Model Training Notebook"
|
| 9 |
+
]
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"cell_type": "code",
|
| 13 |
+
"execution_count": null,
|
| 14 |
+
"id": "64a1dc4a-5bf5-46d9-9356-3958802837ac",
|
| 15 |
+
"metadata": {},
|
| 16 |
+
"outputs": [],
|
| 17 |
+
"source": [
|
| 18 |
+
"import pickle \n",
|
| 19 |
+
"import pandas as pd\n",
|
| 20 |
+
"from transformers import AutoTokenizer, AutoModel\n",
|
| 21 |
+
"\n",
|
| 22 |
+
"from datasets import load_dataset\n",
|
| 23 |
+
"\n",
|
| 24 |
+
"from sklearn.preprocessing import MultiLabelBinarizer\n",
|
| 25 |
+
"import torch\n",
|
| 26 |
+
"\n",
|
| 27 |
+
"import pytorch_lightning as pl\n",
|
| 28 |
+
"from pytorch_lightning.callbacks import ModelCheckpoint"
|
| 29 |
+
]
|
| 30 |
+
},
|
| 31 |
+
{
|
| 32 |
+
"cell_type": "code",
|
| 33 |
+
"execution_count": null,
|
| 34 |
+
"id": "caa5dc4b-2fe3-43da-846d-a866c2224280",
|
| 35 |
+
"metadata": {
|
| 36 |
+
"tags": []
|
| 37 |
+
},
|
| 38 |
+
"outputs": [],
|
| 39 |
+
"source": [
|
| 40 |
+
"fixed_dir = fix_all_files(all_jsonl_files)\n",
|
| 41 |
+
"logger.info(f\"Done! Use files from: {fixed_dir}\")"
|
| 42 |
+
]
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"cell_type": "markdown",
|
| 46 |
+
"id": "6d63a920-52aa-4c73-bd2d-575e888d3d55",
|
| 47 |
+
"metadata": {
|
| 48 |
+
"tags": []
|
| 49 |
+
},
|
| 50 |
+
"source": [
|
| 51 |
+
"### Create the MultiLabel Binarizer and save it in a file for prediction "
|
| 52 |
+
]
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"cell_type": "code",
|
| 56 |
+
"execution_count": null,
|
| 57 |
+
"id": "921fd5cd-67e7-4962-8e5e-15e055dd63b6",
|
| 58 |
+
"metadata": {},
|
| 59 |
+
"outputs": [],
|
| 60 |
+
"source": [
|
| 61 |
+
"from tqdm import tqdm\n",
|
| 62 |
+
"\n",
|
| 63 |
+
"\n",
|
| 64 |
+
"import os\n",
|
| 65 |
+
"from datetime import datetime\n",
|
| 66 |
+
"\n",
|
| 67 |
+
"FIXED_DIR = \"../eurovoc_data/files_fixed\"\n",
|
| 68 |
+
"\n",
|
| 69 |
+
"def list_all_json_files(directory=FIXED_DIR):\n",
|
| 70 |
+
" # List all items in the directory\n",
|
| 71 |
+
" all_items = os.listdir(directory)\n",
|
| 72 |
+
"\n",
|
| 73 |
+
" def extract_date_key(filename):\n",
|
| 74 |
+
" \"\"\"\n",
|
| 75 |
+
" Extracts a datetime object from filenames containing YYYY-MM.\n",
|
| 76 |
+
" Handles .jsonl and .jsonl.gz.\n",
|
| 77 |
+
" \"\"\"\n",
|
| 78 |
+
" base = filename.split('.')[0] \n",
|
| 79 |
+
" yyyy, mm = base.split('-') \n",
|
| 80 |
+
" return datetime(int(yyyy), int(mm), 1)\n",
|
| 81 |
+
"\n",
|
| 82 |
+
"\n",
|
| 83 |
+
" jsonl_files = [\n",
|
| 84 |
+
" f for f in all_items\n",
|
| 85 |
+
" if f.endswith(\".jsonl\") or f.endswith(\".jsonl.gz\")\n",
|
| 86 |
+
" ]\n",
|
| 87 |
+
"\n",
|
| 88 |
+
" # Sort newest to oldest\n",
|
| 89 |
+
" jsonl_files_sorted = sorted(\n",
|
| 90 |
+
" jsonl_files,\n",
|
| 91 |
+
" key=extract_date_key,\n",
|
| 92 |
+
" reverse=True\n",
|
| 93 |
+
" )\n",
|
| 94 |
+
" return [os.path.join(directory, f) for f in jsonl_files_sorted]\n",
|
| 95 |
+
"\n",
|
| 96 |
+
"all_jsonl_files = list_all_json_files(FIXED_DIR)\n",
|
| 97 |
+
"\n",
|
| 98 |
+
" \n",
|
| 99 |
+
"print(f\"Found {len(all_jsonl_files)} files to load (including compressed).\")\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"\n",
|
| 102 |
+
"def build_mlb_from_streaming(all_jsonl_files, output_path='../eurovoc_data/mlb.pickle'):\n",
|
| 103 |
+
" \"\"\"\n",
|
| 104 |
+
" Build MLBinarizer by scanning all files once to collect unique concepts.\n",
|
| 105 |
+
" This is more memory efficient than loading everything.\n",
|
| 106 |
+
" \"\"\"\n",
|
| 107 |
+
" print(\"Scanning files to collect all unique eurovoc concepts...\")\n",
|
| 108 |
+
" all_concepts = set()\n",
|
| 109 |
+
" \n",
|
| 110 |
+
" dataset = load_dataset(\n",
|
| 111 |
+
" 'json',\n",
|
| 112 |
+
" data_files=all_jsonl_files,\n",
|
| 113 |
+
" streaming=True,\n",
|
| 114 |
+
" split='train'\n",
|
| 115 |
+
" )\n",
|
| 116 |
+
" \n",
|
| 117 |
+
" for record in tqdm(dataset, desc=\"Collecting eurovoc IDS\"):\n",
|
| 118 |
+
" concepts = record.get('eurovoc_ids', [])\n",
|
| 119 |
+
" if concepts:\n",
|
| 120 |
+
" all_concepts.update(concepts)\n",
|
| 121 |
+
" \n",
|
| 122 |
+
" print(f\"Found {len(all_concepts)} unique eurovoc IDS\")\n",
|
| 123 |
+
" \n",
|
| 124 |
+
" # Create and fit MLBinarizer\n",
|
| 125 |
+
" mlb = MultiLabelBinarizer()\n",
|
| 126 |
+
" mlb.fit([sorted(list(all_concepts))])\n",
|
| 127 |
+
" \n",
|
| 128 |
+
" # Save it\n",
|
| 129 |
+
" with open(output_path, 'wb') as f:\n",
|
| 130 |
+
" pickle.dump(mlb, f)\n",
|
| 131 |
+
" \n",
|
| 132 |
+
" print(f\"Saved MLBinarizer to {output_path}\")\n",
|
| 133 |
+
" return mlb\n",
|
| 134 |
+
"\n"
|
| 135 |
+
]
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"cell_type": "code",
|
| 139 |
+
"execution_count": null,
|
| 140 |
+
"id": "66e1d48e-83a7-4a38-a081-b72ba679e960",
|
| 141 |
+
"metadata": {
|
| 142 |
+
"tags": []
|
| 143 |
+
},
|
| 144 |
+
"outputs": [],
|
| 145 |
+
"source": [
|
| 146 |
+
"build_mlb_from_streaming(all_jsonl_files)"
|
| 147 |
+
]
|
| 148 |
+
},
|
| 149 |
+
{
|
| 150 |
+
"cell_type": "markdown",
|
| 151 |
+
"id": "b2fd1bda-ee0e-40f2-85a6-87322a9db725",
|
| 152 |
+
"metadata": {
|
| 153 |
+
"tags": []
|
| 154 |
+
},
|
| 155 |
+
"source": [
|
| 156 |
+
"---\n",
|
| 157 |
+
"## 2. Load cleaned data and Split data using iterative train test \n",
|
| 158 |
+
"\n",
|
| 159 |
+
"## THIS ASSUMES ALL DATA IS IN 'TRAIN' OF DATASET, IF NOT ALSO LOAD IT HERE\n"
|
| 160 |
+
]
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"cell_type": "code",
|
| 164 |
+
"execution_count": null,
|
| 165 |
+
"id": "aaba16cf-a9b6-4c22-944a-2d31b8b5812d",
|
| 166 |
+
"metadata": {
|
| 167 |
+
"tags": []
|
| 168 |
+
},
|
| 169 |
+
"outputs": [],
|
| 170 |
+
"source": [
|
| 171 |
+
"import pickle\n",
|
| 172 |
+
"\n",
|
| 173 |
+
"mlb = pickle.load(open('../eurovoc_data/mlb.pickle', 'rb'))\n",
|
| 174 |
+
"\n",
|
| 175 |
+
"print(f\"Loaded MLBinarizer with {len(mlb.classes_)} classes\")\n",
|
| 176 |
+
" # Show first 10\n",
|
| 177 |
+
"print(f\"Classes: {mlb.classes_[:10]}...\") "
|
| 178 |
+
]
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"cell_type": "code",
|
| 182 |
+
"execution_count": null,
|
| 183 |
+
"id": "7f10ac21-5731-4937-8340-829d531c6116",
|
| 184 |
+
"metadata": {},
|
| 185 |
+
"outputs": [],
|
| 186 |
+
"source": [
|
| 187 |
+
"%load_ext autoreload\n",
|
| 188 |
+
"%autoreload 2\n",
|
| 189 |
+
"\n",
|
| 190 |
+
"import os\n",
|
| 191 |
+
"from datetime import datetime\n",
|
| 192 |
+
"\n",
|
| 193 |
+
"FIXED_DIR = \"../eurovoc_data/files_fixed\"\n",
|
| 194 |
+
"\n",
|
| 195 |
+
"def list_all_json_files(directory=FIXED_DIR):\n",
|
| 196 |
+
" # List all items in the directory\n",
|
| 197 |
+
" all_items = os.listdir(directory)\n",
|
| 198 |
+
"\n",
|
| 199 |
+
" def extract_date_key(filename):\n",
|
| 200 |
+
" \"\"\"\n",
|
| 201 |
+
" Extracts a datetime object from filenames containing YYYY-MM.\n",
|
| 202 |
+
" Handles .jsonl and .jsonl.gz.\n",
|
| 203 |
+
" \"\"\"\n",
|
| 204 |
+
" base = filename.split('.')[0] \n",
|
| 205 |
+
" yyyy, mm = base.split('-') \n",
|
| 206 |
+
" return datetime(int(yyyy), int(mm), 1)\n",
|
| 207 |
+
"\n",
|
| 208 |
+
"\n",
|
| 209 |
+
" jsonl_files = [\n",
|
| 210 |
+
" f for f in all_items\n",
|
| 211 |
+
" if f.endswith(\".jsonl\") or f.endswith(\".jsonl.gz\")\n",
|
| 212 |
+
" ]\n",
|
| 213 |
+
"\n",
|
| 214 |
+
" # Sort newest to oldest\n",
|
| 215 |
+
" jsonl_files_sorted = sorted(\n",
|
| 216 |
+
" jsonl_files,\n",
|
| 217 |
+
" key=extract_date_key,\n",
|
| 218 |
+
" reverse=True\n",
|
| 219 |
+
" )\n",
|
| 220 |
+
" return [os.path.join(directory, f) for f in jsonl_files_sorted]\n",
|
| 221 |
+
"\n",
|
| 222 |
+
"all_jsonl_files = list_all_json_files(FIXED_DIR)\n",
|
| 223 |
+
"\n",
|
| 224 |
+
" \n",
|
| 225 |
+
"print(f\"Found {len(all_jsonl_files)} files to load (including compressed).\")\n"
|
| 226 |
+
]
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"cell_type": "code",
|
| 230 |
+
"execution_count": null,
|
| 231 |
+
"id": "25ecca51-7901-448b-9d89-4ed0663b2bae",
|
| 232 |
+
"metadata": {
|
| 233 |
+
"tags": []
|
| 234 |
+
},
|
| 235 |
+
"outputs": [],
|
| 236 |
+
"source": [
|
| 237 |
+
"import gc\n",
|
| 238 |
+
"gc.collect()"
|
| 239 |
+
]
|
| 240 |
+
},
|
| 241 |
+
{
|
| 242 |
+
"cell_type": "markdown",
|
| 243 |
+
"id": "1ff0c6b0-abcb-4424-be97-5c7bd8fb9af7",
|
| 244 |
+
"metadata": {},
|
| 245 |
+
"source": [
|
| 246 |
+
"## 2.1 Model definition"
|
| 247 |
+
]
|
| 248 |
+
},
|
| 249 |
+
{
|
| 250 |
+
"cell_type": "markdown",
|
| 251 |
+
"id": "aaa9dc1b-1086-47d2-9b3b-20d954bda644",
|
| 252 |
+
"metadata": {},
|
| 253 |
+
"source": [
|
| 254 |
+
"---\n",
|
| 255 |
+
"## 3. Model definition and training (NORMAL)"
|
| 256 |
+
]
|
| 257 |
+
},
|
| 258 |
+
{
|
| 259 |
+
"cell_type": "code",
|
| 260 |
+
"execution_count": null,
|
| 261 |
+
"id": "5f15e504-9431-4913-9016-4b0c6344a127",
|
| 262 |
+
"metadata": {
|
| 263 |
+
"tags": []
|
| 264 |
+
},
|
| 265 |
+
"outputs": [],
|
| 266 |
+
"source": [
|
| 267 |
+
"import wandb\n",
|
| 268 |
+
"\n",
|
| 269 |
+
"wandb.login() "
|
| 270 |
+
]
|
| 271 |
+
},
|
| 272 |
+
{
|
| 273 |
+
"cell_type": "code",
|
| 274 |
+
"execution_count": null,
|
| 275 |
+
"id": "2f780f9d-730c-4540-ad9a-e0b60c87f147",
|
| 276 |
+
"metadata": {
|
| 277 |
+
"tags": []
|
| 278 |
+
},
|
| 279 |
+
"outputs": [],
|
| 280 |
+
"source": [
|
| 281 |
+
"%load_ext autoreload\n",
|
| 282 |
+
"%autoreload 2\n",
|
| 283 |
+
"\n",
|
| 284 |
+
"from eurovoc import StreamingEurovocDataModule\n",
|
| 285 |
+
"from eurovoc import EurovocTagger\n",
|
| 286 |
+
"from pytorch_lightning.callbacks import ModelCheckpoint\n",
|
| 287 |
+
"import pytorch_lightning as pl\n",
|
| 288 |
+
"import torch\n",
|
| 289 |
+
"from pytorch_lightning.callbacks import EarlyStopping\n",
|
| 290 |
+
"import gc\n",
|
| 291 |
+
"\n",
|
| 292 |
+
"class MemoryMonitorCallback(pl.Callback):\n",
|
| 293 |
+
" def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):\n",
|
| 294 |
+
" # Log memory every 100 batches\n",
|
| 295 |
+
" if batch_idx % 100 == 0:\n",
|
| 296 |
+
" if torch.cuda.is_available():\n",
|
| 297 |
+
" for i in range(torch.cuda.device_count()):\n",
|
| 298 |
+
" allocated = torch.cuda.memory_allocated(i) / 1e9\n",
|
| 299 |
+
" reserved = torch.cuda.memory_reserved(i) / 1e9\n",
|
| 300 |
+
" trainer.logger.experiment.log({\n",
|
| 301 |
+
" f\"memory/gpu_{i}_allocated_gb\": allocated,\n",
|
| 302 |
+
" f\"memory/gpu_{i}_reserved_gb\": reserved,\n",
|
| 303 |
+
" \"batch_idx\": batch_idx\n",
|
| 304 |
+
" })\n",
|
| 305 |
+
" \n",
|
| 306 |
+
" def on_train_epoch_end(self, trainer, pl_module):\n",
|
| 307 |
+
" # Force cleanup at end of each epoch\n",
|
| 308 |
+
" gc.collect()\n",
|
| 309 |
+
" torch.cuda.empty_cache()\n",
|
| 310 |
+
" \n",
|
| 311 |
+
" def on_validation_epoch_end(self, trainer, pl_module):\n",
|
| 312 |
+
" # Force cleanup after validation\n",
|
| 313 |
+
" gc.collect()\n",
|
| 314 |
+
" torch.cuda.empty_cache()\n",
|
| 315 |
+
" \n",
|
| 316 |
+
" \n",
|
| 317 |
+
"early_stop = EarlyStopping(\n",
|
| 318 |
+
" monitor='val_loss',\n",
|
| 319 |
+
" patience=4,\n",
|
| 320 |
+
" mode='min'\n",
|
| 321 |
+
")\n",
|
| 322 |
+
"\n",
|
| 323 |
+
"memory_monitor = MemoryMonitorCallback()\n",
|
| 324 |
+
"\n",
|
| 325 |
+
"checkpoint_callback = ModelCheckpoint(\n",
|
| 326 |
+
" monitor='val_loss',\n",
|
| 327 |
+
" filename='EurovocTaggerFP32-{epoch:02d}-{val_loss:.2f}',\n",
|
| 328 |
+
" mode='min',\n",
|
| 329 |
+
")\n"
|
| 330 |
+
]
|
| 331 |
+
},
|
| 332 |
+
{
|
| 333 |
+
"cell_type": "code",
|
| 334 |
+
"execution_count": null,
|
| 335 |
+
"id": "a069d202-2e61-4148-baeb-20fbd9b7bf7b",
|
| 336 |
+
"metadata": {
|
| 337 |
+
"tags": []
|
| 338 |
+
},
|
| 339 |
+
"outputs": [],
|
| 340 |
+
"source": [
|
| 341 |
+
"from pytorch_lightning.loggers import WandbLogger\n",
|
| 342 |
+
"wandb_logger = WandbLogger(\n",
|
| 343 |
+
" project=\"EUROVOC\",\n",
|
| 344 |
+
" name=\"EUROVOC-FP32\",\n",
|
| 345 |
+
" log_model=True, \n",
|
| 346 |
+
" save_dir=\"../logs\"\n",
|
| 347 |
+
")\n",
|
| 348 |
+
"\n",
|
| 349 |
+
"FIXED_DIR = \"../eurovoc_data/files_fixed\"\n",
|
| 350 |
+
"\n",
|
| 351 |
+
"BATCH_SIZE=58\n",
|
| 352 |
+
"\n",
|
| 353 |
+
"BERT_MODEL_NAME = \"nlpaueb/legal-bert-base-uncased\"\n",
|
| 354 |
+
"all_jsonl_files = list_all_json_files(FIXED_DIR)\n",
|
| 355 |
+
"\n",
|
| 356 |
+
"dataloader = StreamingEurovocDataModule(BERT_MODEL_NAME, all_jsonl_files, mlb, batch_size=BATCH_SIZE)\n",
|
| 357 |
+
"dataloader.setup()\n",
|
| 358 |
+
"\n",
|
| 359 |
+
"N_EPOCHS = 30\n",
|
| 360 |
+
"LR = 5e-05\n",
|
| 361 |
+
"\n",
|
| 362 |
+
"model = EurovocTagger(BERT_MODEL_NAME, len(mlb.classes_), lr=LR)\n",
|
| 363 |
+
"\n",
|
| 364 |
+
"\n",
|
| 365 |
+
"wandb_logger.experiment.config.update({\n",
|
| 366 |
+
" \"bert_model\": BERT_MODEL_NAME,\n",
|
| 367 |
+
" \"batch_size\": BATCH_SIZE,\n",
|
| 368 |
+
" \"learning_rate\": LR,\n",
|
| 369 |
+
" \"max_epochs\": N_EPOCHS,\n",
|
| 370 |
+
" \"num_workers\": 3,\n",
|
| 371 |
+
" \"num_gpus\": 4,\n",
|
| 372 |
+
" \"precision\": \"32\",\n",
|
| 373 |
+
" \"num_classes\": len(mlb.classes_)\n",
|
| 374 |
+
"})\n",
|
| 375 |
+
"\n",
|
| 376 |
+
"\n",
|
| 377 |
+
"\n",
|
| 378 |
+
"if torch.cuda.is_available():\n",
|
| 379 |
+
" torch.backends.cuda.matmul.allow_tf32 = True\n",
|
| 380 |
+
" torch.backends.cudnn.allow_tf32 = True\n",
|
| 381 |
+
"\n",
|
| 382 |
+
"torch.set_float32_matmul_precision('medium')\n",
|
| 383 |
+
"\n",
|
| 384 |
+
"\n",
|
| 385 |
+
"trainer = pl.Trainer(max_epochs=N_EPOCHS ,\n",
|
| 386 |
+
" accelerator=\"gpu\",\n",
|
| 387 |
+
" devices=4, \n",
|
| 388 |
+
" callbacks=[checkpoint_callback, early_stop, memory_monitor],\n",
|
| 389 |
+
" strategy=\"ddp_notebook\",\n",
|
| 390 |
+
" logger=wandb_logger,\n",
|
| 391 |
+
" log_every_n_steps=50,\n",
|
| 392 |
+
" )\n",
|
| 393 |
+
"\n",
|
| 394 |
+
"trainer.fit(model, dataloader)"
|
| 395 |
+
]
|
| 396 |
+
},
|
| 397 |
+
{
|
| 398 |
+
"cell_type": "markdown",
|
| 399 |
+
"id": "29d9203e-c02b-4a76-a57c-d2e0246722c7",
|
| 400 |
+
"metadata": {},
|
| 401 |
+
"source": [
|
| 402 |
+
"## Finetuning in BF16"
|
| 403 |
+
]
|
| 404 |
+
},
|
| 405 |
+
{
|
| 406 |
+
"cell_type": "code",
|
| 407 |
+
"execution_count": null,
|
| 408 |
+
"id": "86734efb-0bcd-442f-976e-ea0bbdb393d6",
|
| 409 |
+
"metadata": {
|
| 410 |
+
"tags": []
|
| 411 |
+
},
|
| 412 |
+
"outputs": [],
|
| 413 |
+
"source": [
|
| 414 |
+
"import wandb\n",
|
| 415 |
+
"\n",
|
| 416 |
+
"wandb.login() "
|
| 417 |
+
]
|
| 418 |
+
},
|
| 419 |
+
{
|
| 420 |
+
"cell_type": "code",
|
| 421 |
+
"execution_count": null,
|
| 422 |
+
"id": "4b609efc-e3b7-4924-96c3-59a236f52ec6",
|
| 423 |
+
"metadata": {
|
| 424 |
+
"tags": []
|
| 425 |
+
},
|
| 426 |
+
"outputs": [],
|
| 427 |
+
"source": [
|
| 428 |
+
"from pytorch_lightning.loggers import WandbLogger\n",
|
| 429 |
+
"wandb_logger = WandbLogger(\n",
|
| 430 |
+
" project=\"EUROVOC\",\n",
|
| 431 |
+
" name=\"EUROVOC-BF16\",\n",
|
| 432 |
+
" log_model=True, \n",
|
| 433 |
+
" save_dir=\"../logs\"\n",
|
| 434 |
+
")\n"
|
| 435 |
+
]
|
| 436 |
+
},
|
| 437 |
+
{
|
| 438 |
+
"cell_type": "code",
|
| 439 |
+
"execution_count": null,
|
| 440 |
+
"id": "7cab5811-0ab9-48d5-8ec4-8d835bb0d3df",
|
| 441 |
+
"metadata": {},
|
| 442 |
+
"outputs": [],
|
| 443 |
+
"source": [
|
| 444 |
+
"#%%capture output\n",
|
| 445 |
+
"%load_ext autoreload\n",
|
| 446 |
+
"%autoreload 2\n",
|
| 447 |
+
"\n",
|
| 448 |
+
"from eurovoc import StreamingEurovocDataModule\n",
|
| 449 |
+
"from eurovoc import EurovocTaggerBCELogit, EurovocTagger\n",
|
| 450 |
+
"from pytorch_lightning.callbacks import ModelCheckpoint\n",
|
| 451 |
+
"import pytorch_lightning as pl\n",
|
| 452 |
+
"import torch\n",
|
| 453 |
+
"from pytorch_lightning.callbacks import EarlyStopping\n",
|
| 454 |
+
"import gc\n",
|
| 455 |
+
"\n",
|
| 456 |
+
"class MemoryMonitorCallback(pl.Callback):\n",
|
| 457 |
+
" def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):\n",
|
| 458 |
+
" # Log memory every 100 batches\n",
|
| 459 |
+
" if batch_idx % 100 == 0:\n",
|
| 460 |
+
" if torch.cuda.is_available():\n",
|
| 461 |
+
" for i in range(torch.cuda.device_count()):\n",
|
| 462 |
+
" allocated = torch.cuda.memory_allocated(i) / 1e9\n",
|
| 463 |
+
" reserved = torch.cuda.memory_reserved(i) / 1e9\n",
|
| 464 |
+
" trainer.logger.experiment.log({\n",
|
| 465 |
+
" f\"memory/gpu_{i}_allocated_gb\": allocated,\n",
|
| 466 |
+
" f\"memory/gpu_{i}_reserved_gb\": reserved\n",
|
| 467 |
+
" })\n",
|
| 468 |
+
" \n",
|
| 469 |
+
" def on_train_epoch_end(self, trainer, pl_module):\n",
|
| 470 |
+
" # Force cleanup at end of each epoch\n",
|
| 471 |
+
" gc.collect()\n",
|
| 472 |
+
" torch.cuda.empty_cache()\n",
|
| 473 |
+
" \n",
|
| 474 |
+
" def on_validation_epoch_end(self, trainer, pl_module):\n",
|
| 475 |
+
" # Force cleanup after validation\n",
|
| 476 |
+
" gc.collect()\n",
|
| 477 |
+
" torch.cuda.empty_cache()\n",
|
| 478 |
+
"\n",
|
| 479 |
+
" \n",
|
| 480 |
+
" \n",
|
| 481 |
+
"\n",
|
| 482 |
+
"early_stop = EarlyStopping(\n",
|
| 483 |
+
" monitor='val_loss',\n",
|
| 484 |
+
" patience=4,\n",
|
| 485 |
+
" mode='min'\n",
|
| 486 |
+
")\n",
|
| 487 |
+
"\n",
|
| 488 |
+
"memory_monitor = MemoryMonitorCallback()\n",
|
| 489 |
+
"\n",
|
| 490 |
+
"checkpoint_callback = ModelCheckpoint(\n",
|
| 491 |
+
" monitor='val_loss',\n",
|
| 492 |
+
" filename='EurovocTaggerA-{epoch:02d}-{val_loss:.2f}',\n",
|
| 493 |
+
" mode='min',\n",
|
| 494 |
+
")\n",
|
| 495 |
+
"\n",
|
| 496 |
+
"\n",
|
| 497 |
+
"if torch.cuda.is_available():\n",
|
| 498 |
+
" torch.backends.cuda.matmul.allow_tf32 = True\n",
|
| 499 |
+
" torch.backends.cudnn.allow_tf32 = True\n",
|
| 500 |
+
"\n",
|
| 501 |
+
"torch.set_float32_matmul_precision('medium')\n",
|
| 502 |
+
"\n",
|
| 503 |
+
"\n",
|
| 504 |
+
"FIXED_DIR = \"../eurovoc_data/files_fixed\"\n",
|
| 505 |
+
"\n",
|
| 506 |
+
"BATCH_SIZE=74\n",
|
| 507 |
+
"\n",
|
| 508 |
+
"BERT_MODEL_NAME = \"nlpaueb/legal-bert-base-uncased\"\n",
|
| 509 |
+
"all_jsonl_files = list_all_json_files(FIXED_DIR)\n",
|
| 510 |
+
"\n",
|
| 511 |
+
"dataloader = StreamingEurovocDataModule(BERT_MODEL_NAME, all_jsonl_files, mlb, batch_size=BATCH_SIZE)\n",
|
| 512 |
+
"dataloader.setup()\n",
|
| 513 |
+
"\n",
|
| 514 |
+
"\n",
|
| 515 |
+
"\n",
|
| 516 |
+
"N_EPOCHS = 30\n",
|
| 517 |
+
"LR = 5e-05\n",
|
| 518 |
+
"\n",
|
| 519 |
+
"BERT_MODEL_NAME = \"nlpaueb/legal-bert-base-uncased\"\n",
|
| 520 |
+
"\n",
|
| 521 |
+
"\n",
|
| 522 |
+
"model = EurovocTaggerBCELogit(BERT_MODEL_NAME, len(mlb.classes_), lr=LR)\n",
|
| 523 |
+
"\n",
|
| 524 |
+
"\n",
|
| 525 |
+
"\n",
|
| 526 |
+
"wandb_logger.experiment.config.update({\n",
|
| 527 |
+
" \"bert_model\": BERT_MODEL_NAME,\n",
|
| 528 |
+
" \"batch_size\": BATCH_SIZE,\n",
|
| 529 |
+
" \"learning_rate\": LR,\n",
|
| 530 |
+
" \"max_epochs\": N_EPOCHS,\n",
|
| 531 |
+
" \"num_workers\": 3,\n",
|
| 532 |
+
" \"num_gpus\": 4,\n",
|
| 533 |
+
" \"precision\": \"16\",\n",
|
| 534 |
+
" \"num_classes\": len(mlb.classes_)\n",
|
| 535 |
+
"})\n",
|
| 536 |
+
"\n",
|
| 537 |
+
"trainer = pl.Trainer(max_epochs=N_EPOCHS ,\n",
|
| 538 |
+
" accelerator=\"gpu\",\n",
|
| 539 |
+
" devices=4, \n",
|
| 540 |
+
" callbacks=[checkpoint_callback, early_stop, memory_monitor],\n",
|
| 541 |
+
" strategy=\"ddp_notebook\",\n",
|
| 542 |
+
" accumulate_grad_batches=1,\n",
|
| 543 |
+
" precision=16,\n",
|
| 544 |
+
" logger=wandb_logger,\n",
|
| 545 |
+
" log_every_n_steps=50,\n",
|
| 546 |
+
" )\n"
|
| 547 |
+
]
|
| 548 |
+
},
|
| 549 |
+
{
|
| 550 |
+
"cell_type": "code",
|
| 551 |
+
"execution_count": null,
|
| 552 |
+
"id": "6af63c61-5ecd-4207-8aaa-2a0dbd008df2",
|
| 553 |
+
"metadata": {
|
| 554 |
+
"tags": []
|
| 555 |
+
},
|
| 556 |
+
"outputs": [],
|
| 557 |
+
"source": [
|
| 558 |
+
"\n",
|
| 559 |
+
"trainer.fit(model, dataloader)"
|
| 560 |
+
]
|
| 561 |
+
},
|
| 562 |
+
{
|
| 563 |
+
"cell_type": "markdown",
|
| 564 |
+
"id": "2e2f69c2-9d89-4468-8198-b15da16e9403",
|
| 565 |
+
"metadata": {},
|
| 566 |
+
"source": [
|
| 567 |
+
"## 4. MODEL definition and training (LORA) (STILL USES OLD EUROVOC TAGGER)"
|
| 568 |
+
]
|
| 569 |
+
},
|
| 570 |
+
{
|
| 571 |
+
"cell_type": "code",
|
| 572 |
+
"execution_count": null,
|
| 573 |
+
"id": "c28014b2-5ccb-45d9-8025-d05d31d77a08",
|
| 574 |
+
"metadata": {
|
| 575 |
+
"tags": []
|
| 576 |
+
},
|
| 577 |
+
"outputs": [],
|
| 578 |
+
"source": [
|
| 579 |
+
"from eurovoc import StreamingEurovocDataModule\n",
|
| 580 |
+
"from eurovoc import EurovocTaggerLoRA\n",
|
| 581 |
+
"from pytorch_lightning.callbacks import ModelCheckpoint\n",
|
| 582 |
+
"import pytorch_lightning as pl\n",
|
| 583 |
+
"import torch\n",
|
| 584 |
+
"\n",
|
| 585 |
+
"\n",
|
| 586 |
+
"torch.set_float32_matmul_precision('medium')\n",
|
| 587 |
+
"\n",
|
| 588 |
+
"FIXED_DIR = \"../eurovoc_data/files_fixed\"\n",
|
| 589 |
+
"\n",
|
| 590 |
+
"BATCH_SIZE=94\n",
|
| 591 |
+
"\n",
|
| 592 |
+
"BERT_MODEL_NAME = \"nlpaueb/legal-bert-base-uncased\"\n",
|
| 593 |
+
"\n",
|
| 594 |
+
"\n",
|
| 595 |
+
"all_jsonl_files = list_all_json_files(FIXED_DIR)\n",
|
| 596 |
+
"\n",
|
| 597 |
+
"dataloader = StreamingEurovocDataModule(BERT_MODEL_NAME, all_jsonl_files, mlb, batch_size=BATCH_SIZE)\n",
|
| 598 |
+
"dataloader.setup()\n",
|
| 599 |
+
"\n",
|
| 600 |
+
"\n",
|
| 601 |
+
"N_EPOCHS = 30\n",
|
| 602 |
+
"LR = 5e-05\n",
|
| 603 |
+
"\n",
|
| 604 |
+
"# LoRA hyperparameters\n",
|
| 605 |
+
"# Rank of LoRA matrices\n",
|
| 606 |
+
"LORA_R = 16 \n",
|
| 607 |
+
"# Scaling factor (usually 2 * r)\n",
|
| 608 |
+
"LORA_ALPHA = 32 \n",
|
| 609 |
+
"LORA_DROPOUT = 0.1\n",
|
| 610 |
+
"\n",
|
| 611 |
+
"# Hierarchical classifier parameter (for 6800 labels)\n",
|
| 612 |
+
"# Bottleneck size: 768 → 256 → 6800\n",
|
| 613 |
+
"N_INTERMEDIATE = 256 \n",
|
| 614 |
+
"\n",
|
| 615 |
+
"\n",
|
| 616 |
+
"# Create LoRA model with hierarchical classifier\n",
|
| 617 |
+
"model = EurovocTaggerLoRA(\n",
|
| 618 |
+
" BERT_MODEL_NAME, \n",
|
| 619 |
+
" # 6800+ labels\n",
|
| 620 |
+
" len(mlb.classes_),\n",
|
| 621 |
+
" # Bottleneck size\n",
|
| 622 |
+
" n_intermediate=N_INTERMEDIATE, \n",
|
| 623 |
+
" lr=LR,\n",
|
| 624 |
+
" lora_r=LORA_R,\n",
|
| 625 |
+
" lora_alpha=LORA_ALPHA,\n",
|
| 626 |
+
" lora_dropout=LORA_DROPOUT\n",
|
| 627 |
+
")\n",
|
| 628 |
+
"\n",
|
| 629 |
+
"checkpoint_callback = ModelCheckpoint(\n",
|
| 630 |
+
" monitor='val_loss',\n",
|
| 631 |
+
" filename='EurovocTaggerLoRA-6800-{epoch:02d}-{val_loss:.2f}',\n",
|
| 632 |
+
" mode='min',\n",
|
| 633 |
+
")\n",
|
| 634 |
+
"\n",
|
| 635 |
+
"trainer = pl.Trainer(\n",
|
| 636 |
+
" max_epochs=N_EPOCHS, \n",
|
| 637 |
+
" accelerator=\"gpu\", \n",
|
| 638 |
+
" devices=4, \n",
|
| 639 |
+
" callbacks=[checkpoint_callback],\n",
|
| 640 |
+
" strategy=\"ddp_notebook\",\n",
|
| 641 |
+
" precision=16\n",
|
| 642 |
+
")\n",
|
| 643 |
+
"\n",
|
| 644 |
+
"print(f\"Starting LoRA training with {len(mlb.classes_)} labels...\")\n",
|
| 645 |
+
"print(f\"Classifier architecture: 768 → {N_INTERMEDIATE} → {len(mlb.classes_)}\")\n",
|
| 646 |
+
"trainer.fit(model, dataloader)\n",
|
| 647 |
+
"\n",
|
| 648 |
+
"\n",
|
| 649 |
+
"\n"
|
| 650 |
+
]
|
| 651 |
+
},
|
| 652 |
+
{
|
| 653 |
+
"cell_type": "code",
|
| 654 |
+
"execution_count": null,
|
| 655 |
+
"id": "694ad3d7-794d-4a5f-a7be-8b47e872418d",
|
| 656 |
+
"metadata": {},
|
| 657 |
+
"outputs": [],
|
| 658 |
+
"source": [
|
| 659 |
+
"# Save only the LoRA adapter \n",
|
| 660 |
+
"model.save_lora_adapter('./eurovoc_lora_adapter')\n",
|
| 661 |
+
"\n",
|
| 662 |
+
"print(\"LoRA adapter saved to ./eurovoc_lora_adapter\")\n"
|
| 663 |
+
]
|
| 664 |
+
}
|
| 665 |
+
],
|
| 666 |
+
"metadata": {
|
| 667 |
+
"kernelspec": {
|
| 668 |
+
"display_name": "eurovoc_training_env",
|
| 669 |
+
"language": "python",
|
| 670 |
+
"name": "eurovoc_training_env"
|
| 671 |
+
},
|
| 672 |
+
"language_info": {
|
| 673 |
+
"codemirror_mode": {
|
| 674 |
+
"name": "ipython",
|
| 675 |
+
"version": 3
|
| 676 |
+
},
|
| 677 |
+
"file_extension": ".py",
|
| 678 |
+
"mimetype": "text/x-python",
|
| 679 |
+
"name": "python",
|
| 680 |
+
"nbconvert_exporter": "python",
|
| 681 |
+
"pygments_lexer": "ipython3",
|
| 682 |
+
"version": "3.10.12"
|
| 683 |
+
}
|
| 684 |
+
},
|
| 685 |
+
"nbformat": 4,
|
| 686 |
+
"nbformat_minor": 5
|
| 687 |
+
}
|