rain1024 and Claude Opus 4.5 committed
Commit b39f0e3 · 1 Parent(s): 3ca3932

Add PhoBERT-based dependency parser for Trankit reproduction

- bamboo1/models/: PhoBERT + Biaffine parser with MST decoding
- bamboo1/ud_corpus.py: UD Vietnamese VTB dataset loader
- scripts/train_phobert.py: Training with FP16, gradient accumulation
- scripts/run_phobert_runpod.sh: RunPod automation for cloud training
- scripts/runpod_setup.py: launch-fast command for H100 (<5 min training)

Target: Reproduce Trankit benchmark (70.96% UAS / 64.76% LAS on UD-VTB)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

bamboo1/models/__init__.py ADDED
@@ -0,0 +1,10 @@
+ """
+ Bamboo-1 Model implementations.
+
+ This module contains the transformer-based dependency parser using PhoBERT.
+ """
+
+ from bamboo1.models.transformer_parser import PhoBERTDependencyParser
+ from bamboo1.models.mst import mst_decode
+
+ __all__ = ["PhoBERTDependencyParser", "mst_decode"]
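
The package surface after this commit is just these two names; an illustrative import check (note that it pulls in torch, transformers, and numpy transitively) is:

    from bamboo1.models import PhoBERTDependencyParser, mst_decode
    print(PhoBERTDependencyParser, mst_decode)
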
bamboo1/models/mst.py ADDED
@@ -0,0 +1,244 @@
1
+ """
2
+ Minimum Spanning Tree (MST) decoding for dependency parsing.
3
+
4
+ Implements the Chu-Liu/Edmonds algorithm for finding the maximum spanning
5
+ arborescence, which ensures valid dependency tree structures.
6
+
7
+ Reference:
8
+ - Edmonds, J. (1967). Optimum branchings.
9
+ - Chu, Y.J. & Liu, T.H. (1965). On the shortest arborescence of a directed graph.
10
+ """
11
+
12
+ import numpy as np
13
+ from typing import List, Tuple, Optional
14
+
15
+
16
+ def mst_decode(scores: np.ndarray, length: Optional[int] = None) -> np.ndarray:
17
+ """
18
+ Decode the maximum spanning arborescence using Chu-Liu/Edmonds algorithm.
19
+
20
+ Args:
21
+ scores: Arc scores matrix of shape (seq_len, seq_len) where scores[i, j]
22
+ is the score for token i having token j as its head.
23
+ Index 0 is the root node.
24
+ length: Actual sequence length (excluding padding). If None, uses full matrix.
25
+
26
+ Returns:
27
+ heads: Array of shape (seq_len,) containing head indices for each token.
28
+ heads[0] is always 0 (root has no head).
29
+ """
30
+ if length is None:
31
+ length = scores.shape[0]
32
+
33
+ # Work on the actual tokens (excluding padding)
34
+ scores = scores[:length, :length].copy()
35
+
36
+ # Token 0 is root - root cannot have a head other than itself
37
+ scores[0, :] = float('-inf')
38
+ scores[0, 0] = 0
39
+
40
+ # No self-loops (except for root)
41
+ np.fill_diagonal(scores[1:, 1:], float('-inf'))
42
+
43
+ heads = _chu_liu_edmonds(scores)
44
+
45
+ return heads
46
+
47
+
48
+ def _chu_liu_edmonds(scores: np.ndarray) -> np.ndarray:
49
+ """
50
+ Chu-Liu/Edmonds algorithm for maximum spanning arborescence.
51
+
52
+ Args:
53
+ scores: Arc scores matrix of shape (n, n)
54
+
55
+ Returns:
56
+ heads: Array of head indices
57
+ """
58
+ n = scores.shape[0]
59
+
60
+ # Step 1: For each node (except root), select the maximum incoming arc
61
+ heads = np.argmax(scores, axis=1)
62
+ heads[0] = 0 # Root points to itself
63
+
64
+ # Step 2: Check for cycles
65
+ cycle = _find_cycle(heads)
66
+
67
+ if cycle is None:
68
+ # No cycle - we have a valid tree
69
+ return heads
70
+
71
+ # Step 3: Contract the cycle and recurse
72
+ cycle_set = set(cycle)
73
+ cycle_head = cycle[0] # Representative node for the contracted cycle
74
+
75
+ # Create mapping from old indices to new indices
76
+ # Cycle nodes (except representative) are removed
77
+ old_to_new = {}
78
+ new_to_old = {}
79
+ new_idx = 0
80
+ for i in range(n):
81
+ if i not in cycle_set or i == cycle_head:
82
+ old_to_new[i] = new_idx
83
+ new_to_old[new_idx] = i
84
+ new_idx += 1
85
+
86
+ # Number of nodes in contracted graph
87
+ n_contracted = new_idx
88
+
89
+ # Build contracted graph
90
+ contracted_scores = np.full((n_contracted, n_contracted), float('-inf'))
91
+
92
+ for i in range(n):
93
+ if i in cycle_set and i != cycle_head:
94
+ continue
95
+ new_i = old_to_new[i]
96
+
97
+ for j in range(n):
98
+ if j in cycle_set and j != cycle_head:
99
+ continue
100
+ new_j = old_to_new[j]
101
+
102
+ if new_i == new_j:
103
+ continue
104
+
105
+ if i == cycle_head:
106
+ # Incoming edges to cycle: find best way to enter cycle
107
+ if j not in cycle_set:
108
+ # Edge from outside to cycle
109
+ best_score = float('-inf')
110
+ for c in cycle:
111
+ # Score of edge j->c minus score of edge heads[c]->c
112
+ # (because we're replacing that edge)
113
+ score = scores[c, j] - scores[c, heads[c]]
114
+ if score > best_score:
115
+ best_score = score
116
+ contracted_scores[new_i, new_j] = best_score
117
+ else:
118
+ contracted_scores[new_i, new_j] = float('-inf')
119
+ elif j == cycle_head:
120
+ # Outgoing edges from cycle
121
+ if i not in cycle_set:
122
+ best_score = float('-inf')
123
+ for c in cycle:
124
+ if scores[i, c] > best_score:
125
+ best_score = scores[i, c]
126
+ contracted_scores[new_i, new_j] = best_score
127
+ else:
128
+ # Edge not involving cycle
129
+ contracted_scores[new_i, new_j] = scores[i, j]
130
+
131
+ # Recurse on contracted graph
132
+ contracted_heads = _chu_liu_edmonds(contracted_scores)
133
+
134
+ # Step 4: Expand the solution
135
+ final_heads = np.zeros(n, dtype=np.int64)
136
+
137
+ # First, set heads for non-cycle nodes
138
+ for new_i in range(n_contracted):
139
+ old_i = new_to_old[new_i]
140
+ if old_i != cycle_head:
141
+ new_head = contracted_heads[new_i]
142
+ old_head = new_to_old[new_head]
143
+
144
+ # If head is cycle representative, find which cycle node is actual head
145
+ if old_head == cycle_head:
146
+ best_score = float('-inf')
147
+ best_c = cycle_head
148
+ for c in cycle:
149
+ if scores[old_i, c] > best_score:
150
+ best_score = scores[old_i, c]
151
+ best_c = c
152
+ final_heads[old_i] = best_c
153
+ else:
154
+ final_heads[old_i] = old_head
155
+
156
+ # Find which node in cycle is entered from outside
157
+ new_cycle_head = contracted_heads[old_to_new[cycle_head]]
158
+ if new_cycle_head != old_to_new[cycle_head]: # Cycle has incoming edge from outside
159
+ outside_head = new_to_old[new_cycle_head]
160
+
161
+ # Find which cycle node is entered
162
+ best_score = float('-inf')
163
+ entered_node = cycle_head
164
+ for c in cycle:
165
+ score = scores[c, outside_head] - scores[c, heads[c]]
166
+ if score > best_score:
167
+ best_score = score
168
+ entered_node = c
169
+
170
+ # Set heads within cycle, breaking at entered node
171
+ for c in cycle:
172
+ if c == entered_node:
173
+ final_heads[c] = outside_head
174
+ else:
175
+ final_heads[c] = heads[c]
176
+ else:
177
+ # Cycle contains root (shouldn't happen in valid dependency parsing)
178
+ for c in cycle:
179
+ final_heads[c] = heads[c]
180
+
181
+ final_heads[0] = 0 # Root
182
+
183
+ return final_heads
184
+
185
+
186
+ def _find_cycle(heads: np.ndarray) -> Optional[List[int]]:
187
+ """
188
+ Find a cycle in the given head assignments.
189
+
190
+ Args:
191
+ heads: Array of head indices
192
+
193
+ Returns:
194
+ List of node indices forming a cycle, or None if no cycle exists
195
+ """
196
+ n = len(heads)
197
+ visited = np.zeros(n, dtype=np.int32)
198
+
199
+ for start in range(1, n): # Skip root
200
+ if visited[start] == 2: # Already processed
201
+ continue
202
+
203
+ path = []
204
+ node = start
205
+
206
+ while visited[node] == 0:
207
+ visited[node] = 1 # Mark as in current path
208
+ path.append(node)
209
+ node = heads[node]
210
+
211
+ if node == 0: # Reached root
212
+ break
213
+
214
+ if visited[node] == 1:
215
+ # Found cycle - extract it
216
+ cycle_start = path.index(node)
217
+ cycle = path[cycle_start:]
218
+ return cycle
219
+
220
+ # Mark all nodes in path as fully processed
221
+ for p in path:
222
+ visited[p] = 2
223
+
224
+ return None
225
+
226
+
227
+ def batch_mst_decode(scores: np.ndarray, lengths: np.ndarray) -> np.ndarray:
228
+ """
229
+ Batch version of MST decoding.
230
+
231
+ Args:
232
+ scores: Arc scores of shape (batch, seq_len, seq_len)
233
+ lengths: Sequence lengths of shape (batch,)
234
+
235
+ Returns:
236
+ heads: Head indices of shape (batch, seq_len)
237
+ """
238
+ batch_size, seq_len, _ = scores.shape
239
+ heads = np.zeros((batch_size, seq_len), dtype=np.int64)
240
+
241
+ for i in range(batch_size):
242
+ heads[i, :lengths[i]] = mst_decode(scores[i], lengths[i])
243
+
244
+ return heads
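
For orientation, a minimal usage sketch of the decoder added above (not part of the commit; the score values are invented purely for illustration). Here scores[i, j] is the score for token i taking token j as its head, with index 0 reserved for the root:

    import numpy as np
    from bamboo1.models.mst import mst_decode

    # Root plus three tokens; each row lists the head scores for that token.
    scores = np.array([
        [0.0, -1.0, -1.0, -1.0],   # row 0 is the root and is overwritten inside mst_decode
        [9.0,  0.0,  2.0,  1.0],   # token 1 prefers the root as its head
        [1.0,  8.0,  0.0,  2.0],   # token 2 prefers token 1
        [1.0,  7.0,  3.0,  0.0],   # token 3 prefers token 1
    ])
    heads = mst_decode(scores)
    print(heads)  # expected [0 0 1 1]: a valid arborescence rooted at token 0

batch_mst_decode applies the same routine sentence by sentence, using the lengths array to skip padded positions.
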
bamboo1/models/transformer_parser.py ADDED
@@ -0,0 +1,507 @@
1
+ """
2
+ Transformer-based Dependency Parser using PhoBERT.
3
+
4
+ This module implements a Biaffine dependency parser with PhoBERT as the encoder,
5
+ following the Trankit approach but using Vietnamese-specific PhoBERT.
6
+
7
+ Architecture:
8
+ Input → PhoBERT → Word-level pooling → MLP projections → Biaffine attention → MST decoding
9
+
10
+ Reference:
11
+ - Dozat & Manning (2017): Deep Biaffine Attention for Neural Dependency Parsing
12
+ - Nguyen & Nguyen (2020): PhoBERT: Pre-trained language models for Vietnamese
13
+ """
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.nn.functional as F
18
+ from typing import List, Tuple, Optional, Dict, Any
19
+ import numpy as np
20
+
21
+ from bamboo1.models.mst import mst_decode, batch_mst_decode
22
+
23
+
24
+ class MLP(nn.Module):
25
+ """Multi-layer perceptron for biaffine scoring."""
26
+
27
+ def __init__(self, input_dim: int, hidden_dim: int, dropout: float = 0.33):
28
+ super().__init__()
29
+ self.linear = nn.Linear(input_dim, hidden_dim)
30
+ self.activation = nn.LeakyReLU(0.1)
31
+ self.dropout = nn.Dropout(dropout)
32
+
33
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
34
+ return self.dropout(self.activation(self.linear(x)))
35
+
36
+
37
+ class Biaffine(nn.Module):
38
+ """Biaffine attention layer for dependency scoring."""
39
+
40
+ def __init__(
41
+ self,
42
+ input_dim: int,
43
+ output_dim: int = 1,
44
+ bias_x: bool = True,
45
+ bias_y: bool = True
46
+ ):
47
+ super().__init__()
48
+ self.input_dim = input_dim
49
+ self.output_dim = output_dim
50
+ self.bias_x = bias_x
51
+ self.bias_y = bias_y
52
+
53
+ self.weight = nn.Parameter(
54
+ torch.zeros(output_dim, input_dim + bias_x, input_dim + bias_y)
55
+ )
56
+ nn.init.xavier_uniform_(self.weight)
57
+
58
+ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
59
+ """
60
+ Args:
61
+ x: (batch, seq_len, input_dim) - dependent representations
62
+ y: (batch, seq_len, input_dim) - head representations
63
+
64
+ Returns:
65
+ scores: (batch, seq_len, seq_len, output_dim) or (batch, seq_len, seq_len) if output_dim=1
66
+ """
67
+ if self.bias_x:
68
+ x = torch.cat([x, torch.ones_like(x[..., :1])], dim=-1)
69
+ if self.bias_y:
70
+ y = torch.cat([y, torch.ones_like(y[..., :1])], dim=-1)
71
+
72
+ # (batch, seq_len, output_dim, input_dim+1)
73
+ x = torch.einsum('bxi,oij->bxoj', x, self.weight)
74
+ # (batch, seq_len, seq_len, output_dim)
75
+ scores = torch.einsum('bxoj,byj->bxyo', x, y)
76
+
77
+ if self.output_dim == 1:
78
+ scores = scores.squeeze(-1)
79
+
80
+ return scores
81
+
82
+
83
+ class PhoBERTDependencyParser(nn.Module):
84
+ """
85
+ PhoBERT-based Biaffine Dependency Parser.
86
+
87
+ Uses PhoBERT as encoder with first-subword pooling for word alignment,
88
+ followed by biaffine attention for arc and relation prediction.
89
+ """
90
+
91
+ def __init__(
92
+ self,
93
+ encoder_name: str = "vinai/phobert-base",
94
+ n_rels: int = 50,
95
+ arc_hidden: int = 500,
96
+ rel_hidden: int = 100,
97
+ dropout: float = 0.33,
98
+ use_mst: bool = True,
99
+ ):
100
+ """
101
+ Args:
102
+ encoder_name: HuggingFace model name for PhoBERT
103
+ n_rels: Number of dependency relations
104
+ arc_hidden: Hidden dimension for arc MLPs
105
+ rel_hidden: Hidden dimension for relation MLPs
106
+ dropout: Dropout rate
107
+ use_mst: Use MST decoding (True) or greedy decoding (False)
108
+ """
109
+ super().__init__()
110
+
111
+ from transformers import AutoModel, AutoTokenizer
112
+
113
+ self.encoder_name = encoder_name
114
+ self.n_rels = n_rels
115
+ self.use_mst = use_mst
116
+
117
+ # Load PhoBERT encoder
118
+ self.encoder = AutoModel.from_pretrained(encoder_name)
119
+ self.tokenizer = AutoTokenizer.from_pretrained(encoder_name)
120
+ self.hidden_size = self.encoder.config.hidden_size # 768 for phobert-base
121
+
122
+ # Dropout
123
+ self.dropout = nn.Dropout(dropout)
124
+
125
+ # MLP projections
126
+ self.mlp_arc_dep = MLP(self.hidden_size, arc_hidden, dropout)
127
+ self.mlp_arc_head = MLP(self.hidden_size, arc_hidden, dropout)
128
+ self.mlp_rel_dep = MLP(self.hidden_size, rel_hidden, dropout)
129
+ self.mlp_rel_head = MLP(self.hidden_size, rel_hidden, dropout)
130
+
131
+ # Biaffine attention
132
+ self.arc_attn = Biaffine(arc_hidden, 1, bias_x=True, bias_y=False)
133
+ self.rel_attn = Biaffine(rel_hidden, n_rels, bias_x=True, bias_y=True)
134
+
135
+ def _get_word_embeddings(
136
+ self,
137
+ input_ids: torch.Tensor,
138
+ attention_mask: torch.Tensor,
139
+ word_starts: torch.Tensor,
140
+ word_mask: torch.Tensor,
141
+ ) -> torch.Tensor:
142
+ """
143
+ Get word-level embeddings from subword encoder output.
144
+
145
+ Uses first-subword pooling strategy: each word is represented by
146
+ the embedding of its first subword token.
147
+
148
+ Args:
149
+ input_ids: (batch, subword_seq_len) - Subword token IDs
150
+ attention_mask: (batch, subword_seq_len) - Attention mask for subwords
151
+ word_starts: (batch, word_seq_len) - Indices of first subword for each word
152
+ word_mask: (batch, word_seq_len) - Mask for actual words
153
+
154
+ Returns:
155
+ word_embeddings: (batch, word_seq_len, hidden_size)
156
+ """
157
+ # Get encoder output
158
+ encoder_output = self.encoder(
159
+ input_ids=input_ids,
160
+ attention_mask=attention_mask,
161
+ return_dict=True
162
+ )
163
+ hidden_states = encoder_output.last_hidden_state # (batch, subword_seq_len, hidden)
164
+
165
+ # Apply dropout
166
+ hidden_states = self.dropout(hidden_states)
167
+
168
+ # Extract word embeddings using first-subword indices
169
+ batch_size, word_seq_len = word_starts.shape
170
+
171
+ # Gather word embeddings
172
+ # word_starts: (batch, word_seq_len) -> (batch, word_seq_len, hidden)
173
+ word_embeddings = torch.gather(
174
+ hidden_states,
175
+ dim=1,
176
+ index=word_starts.unsqueeze(-1).expand(-1, -1, hidden_states.size(-1))
177
+ )
178
+
179
+ return word_embeddings
180
+
181
+ def forward(
182
+ self,
183
+ input_ids: torch.Tensor,
184
+ attention_mask: torch.Tensor,
185
+ word_starts: torch.Tensor,
186
+ word_mask: torch.Tensor,
187
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
188
+ """
189
+ Forward pass computing arc and relation scores.
190
+
191
+ Args:
192
+ input_ids: (batch, subword_seq_len) - Subword token IDs
193
+ attention_mask: (batch, subword_seq_len) - Attention mask for subwords
194
+ word_starts: (batch, word_seq_len) - Indices of first subword for each word
195
+ word_mask: (batch, word_seq_len) - Mask for actual words
196
+
197
+ Returns:
198
+ arc_scores: (batch, word_seq_len, word_seq_len) - Arc scores
199
+ rel_scores: (batch, word_seq_len, word_seq_len, n_rels) - Relation scores
200
+ """
201
+ # Get word-level embeddings
202
+ word_embeddings = self._get_word_embeddings(
203
+ input_ids, attention_mask, word_starts, word_mask
204
+ )
205
+
206
+ # MLP projections
207
+ arc_dep = self.mlp_arc_dep(word_embeddings)
208
+ arc_head = self.mlp_arc_head(word_embeddings)
209
+ rel_dep = self.mlp_rel_dep(word_embeddings)
210
+ rel_head = self.mlp_rel_head(word_embeddings)
211
+
212
+ # Biaffine attention
213
+ arc_scores = self.arc_attn(arc_dep, arc_head) # (batch, seq, seq)
214
+ rel_scores = self.rel_attn(rel_dep, rel_head) # (batch, seq, seq, n_rels)
215
+
216
+ return arc_scores, rel_scores
217
+
218
+ def loss(
219
+ self,
220
+ arc_scores: torch.Tensor,
221
+ rel_scores: torch.Tensor,
222
+ heads: torch.Tensor,
223
+ rels: torch.Tensor,
224
+ mask: torch.Tensor,
225
+ ) -> torch.Tensor:
226
+ """
227
+ Compute cross-entropy loss for arcs and relations.
228
+
229
+ Args:
230
+ arc_scores: (batch, seq_len, seq_len) - Arc scores
231
+ rel_scores: (batch, seq_len, seq_len, n_rels) - Relation scores
232
+ heads: (batch, seq_len) - Gold head indices
233
+ rels: (batch, seq_len) - Gold relation indices
234
+ mask: (batch, seq_len) - Token mask (1 for real tokens, 0 for padding)
235
+
236
+ Returns:
237
+ Total loss (arc_loss + rel_loss)
238
+ """
239
+ batch_size, seq_len = mask.shape
240
+
241
+ # Mask invalid positions
242
+ arc_scores_masked = arc_scores.clone()
243
+ arc_scores_masked = arc_scores_masked.masked_fill(~mask.unsqueeze(2), float('-inf'))
244
+
245
+ # Arc loss: cross-entropy over possible heads
246
+ arc_loss = F.cross_entropy(
247
+ arc_scores_masked[mask].view(-1, seq_len),
248
+ heads[mask],
249
+ reduction='mean'
250
+ )
251
+
252
+ # Relation loss: cross-entropy conditioned on gold heads
253
+ batch_indices = torch.arange(batch_size, device=rel_scores.device).unsqueeze(1)
254
+ seq_indices = torch.arange(seq_len, device=rel_scores.device)
255
+ rel_scores_gold = rel_scores[batch_indices, seq_indices, heads]
256
+
257
+ rel_loss = F.cross_entropy(
258
+ rel_scores_gold[mask],
259
+ rels[mask],
260
+ reduction='mean'
261
+ )
262
+
263
+ return arc_loss + rel_loss
264
+
265
+ def decode(
266
+ self,
267
+ arc_scores: torch.Tensor,
268
+ rel_scores: torch.Tensor,
269
+ mask: torch.Tensor,
270
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
271
+ """
272
+ Decode predictions using MST or greedy decoding.
273
+
274
+ Args:
275
+ arc_scores: (batch, seq_len, seq_len) - Arc scores
276
+ rel_scores: (batch, seq_len, seq_len, n_rels) - Relation scores
277
+ mask: (batch, seq_len) - Token mask
278
+
279
+ Returns:
280
+ arc_preds: (batch, seq_len) - Predicted head indices
281
+ rel_preds: (batch, seq_len) - Predicted relation indices
282
+ """
283
+ batch_size, seq_len = mask.shape
284
+ device = arc_scores.device
285
+
286
+ if self.use_mst:
287
+ # MST decoding for valid tree structure
288
+ lengths = mask.sum(dim=1).cpu().numpy()
289
+ arc_scores_np = arc_scores.cpu().numpy()
290
+ arc_preds_np = batch_mst_decode(arc_scores_np, lengths)
291
+ arc_preds = torch.from_numpy(arc_preds_np).to(device)
292
+ else:
293
+ # Greedy decoding
294
+ arc_preds = arc_scores.argmax(dim=-1)
295
+
296
+ # Get relation predictions for predicted heads
297
+ batch_indices = torch.arange(batch_size, device=device).unsqueeze(1)
298
+ seq_indices = torch.arange(seq_len, device=device)
299
+ rel_scores_pred = rel_scores[batch_indices, seq_indices, arc_preds]
300
+ rel_preds = rel_scores_pred.argmax(dim=-1)
301
+
302
+ return arc_preds, rel_preds
303
+
304
+ def predict(
305
+ self,
306
+ words: List[str],
307
+ return_probs: bool = False,
308
+ ) -> List[Tuple[str, int, str]]:
309
+ """
310
+ Predict dependencies for a single sentence.
311
+
312
+ Args:
313
+ words: List of words (pre-tokenized)
314
+ return_probs: Whether to return probability scores
315
+
316
+ Returns:
317
+ List of (word, head, deprel) tuples
318
+ """
319
+ self.eval()
320
+ device = next(self.parameters()).device
321
+
322
+ # Tokenize with word boundary tracking
323
+ encoded = self.tokenize_with_alignment([words])
324
+
325
+ # Move to device
326
+ input_ids = encoded['input_ids'].to(device)
327
+ attention_mask = encoded['attention_mask'].to(device)
328
+ word_starts = encoded['word_starts'].to(device)
329
+ word_mask = encoded['word_mask'].to(device)
330
+
331
+ with torch.no_grad():
332
+ arc_scores, rel_scores = self.forward(
333
+ input_ids, attention_mask, word_starts, word_mask
334
+ )
335
+ arc_preds, rel_preds = self.decode(arc_scores, rel_scores, word_mask)
336
+
337
+ # Convert to list of tuples
338
+ arc_preds = arc_preds[0].cpu().tolist()
339
+ rel_preds = rel_preds[0].cpu().tolist()
340
+
341
+ results = []
342
+ for i, word in enumerate(words):
343
+ head = arc_preds[i]
344
+ rel_idx = rel_preds[i]
345
+ rel = self.idx2rel.get(rel_idx, "dep")
346
+ results.append((word, head, rel))
347
+
348
+ return results
349
+
350
+ def tokenize_with_alignment(
351
+ self,
352
+ sentences: List[List[str]],
353
+ max_length: int = 256,
354
+ ) -> Dict[str, torch.Tensor]:
355
+ """
356
+ Tokenize sentences and track word-subword alignment.
357
+
358
+ Args:
359
+ sentences: List of sentences, where each sentence is a list of words
360
+ max_length: Maximum subword sequence length
361
+
362
+ Returns:
363
+ Dictionary with input_ids, attention_mask, word_starts, word_mask
364
+ """
365
+ batch_input_ids = []
366
+ batch_attention_mask = []
367
+ batch_word_starts = []
368
+ batch_word_mask = []
369
+
370
+ for words in sentences:
371
+ # Tokenize each word separately to track boundaries
372
+ word_starts = []
373
+ subword_ids = [self.tokenizer.cls_token_id]
374
+
375
+ for word in words:
376
+ word_starts.append(len(subword_ids))
377
+ word_tokens = self.tokenizer.encode(word, add_special_tokens=False)
378
+ subword_ids.extend(word_tokens)
379
+
380
+ subword_ids.append(self.tokenizer.sep_token_id)
381
+
382
+ # Truncate if needed
383
+ if len(subword_ids) > max_length:
384
+ subword_ids = subword_ids[:max_length-1] + [self.tokenizer.sep_token_id]
385
+ # Truncate word_starts that go beyond
386
+ word_starts = [ws for ws in word_starts if ws < max_length - 1]
387
+
388
+ attention_mask = [1] * len(subword_ids)
389
+
390
+ batch_input_ids.append(subword_ids)
391
+ batch_attention_mask.append(attention_mask)
392
+ batch_word_starts.append(word_starts)
393
+ batch_word_mask.append([1] * len(word_starts))
394
+
395
+ # Pad sequences
396
+ max_subword_len = max(len(ids) for ids in batch_input_ids)
397
+ max_word_len = max(len(ws) for ws in batch_word_starts)
398
+
399
+ padded_input_ids = []
400
+ padded_attention_mask = []
401
+ padded_word_starts = []
402
+ padded_word_mask = []
403
+
404
+ for i in range(len(sentences)):
405
+ # Pad subwords
406
+ pad_len = max_subword_len - len(batch_input_ids[i])
407
+ padded_input_ids.append(
408
+ batch_input_ids[i] + [self.tokenizer.pad_token_id] * pad_len
409
+ )
410
+ padded_attention_mask.append(
411
+ batch_attention_mask[i] + [0] * pad_len
412
+ )
413
+
414
+ # Pad words
415
+ word_pad_len = max_word_len - len(batch_word_starts[i])
416
+ # Use 0 for padding word_starts (points to CLS token, but masked)
417
+ padded_word_starts.append(
418
+ batch_word_starts[i] + [0] * word_pad_len
419
+ )
420
+ padded_word_mask.append(
421
+ batch_word_mask[i] + [0] * word_pad_len
422
+ )
423
+
424
+ return {
425
+ 'input_ids': torch.tensor(padded_input_ids, dtype=torch.long),
426
+ 'attention_mask': torch.tensor(padded_attention_mask, dtype=torch.long),
427
+ 'word_starts': torch.tensor(padded_word_starts, dtype=torch.long),
428
+ 'word_mask': torch.tensor(padded_word_mask, dtype=torch.bool),
429
+ }
430
+
431
+ def save(self, path: str, vocab: Optional[Dict] = None):
432
+ """
433
+ Save model checkpoint.
434
+
435
+ Args:
436
+ path: Directory path to save the model
437
+ vocab: Vocabulary dict with rel2idx and idx2rel mappings
438
+ """
439
+ import os
440
+ os.makedirs(path, exist_ok=True)
441
+
442
+ # Save model state
443
+ checkpoint = {
444
+ 'model_state_dict': self.state_dict(),
445
+ 'config': {
446
+ 'encoder_name': self.encoder_name,
447
+ 'n_rels': self.n_rels,
448
+ 'arc_hidden': self.mlp_arc_dep.linear.out_features,
449
+ 'rel_hidden': self.mlp_rel_dep.linear.out_features,
450
+ 'dropout': self.dropout.p,
451
+ 'use_mst': self.use_mst,
452
+ },
453
+ }
454
+
455
+ if vocab is not None:
456
+ checkpoint['vocab'] = vocab
457
+
458
+ torch.save(checkpoint, os.path.join(path, 'model.pt'))
459
+
460
+ # Save tokenizer
461
+ self.tokenizer.save_pretrained(path)
462
+
463
+ @classmethod
464
+ def load(cls, path: str, device: str = 'cpu') -> 'PhoBERTDependencyParser':
465
+ """
466
+ Load model from checkpoint.
467
+
468
+ Args:
469
+ path: Directory path containing the saved model
470
+ device: Device to load the model to
471
+
472
+ Returns:
473
+ Loaded PhoBERTDependencyParser model
474
+ """
475
+ import os
476
+
477
+ checkpoint = torch.load(
478
+ os.path.join(path, 'model.pt'),
479
+ map_location=device,
480
+ weights_only=False
481
+ )
482
+
483
+ config = checkpoint['config']
484
+
485
+ # Create model
486
+ model = cls(
487
+ encoder_name=config['encoder_name'],
488
+ n_rels=config['n_rels'],
489
+ arc_hidden=config['arc_hidden'],
490
+ rel_hidden=config['rel_hidden'],
491
+ dropout=config['dropout'],
492
+ use_mst=config.get('use_mst', True),
493
+ )
494
+
495
+ # Load state dict
496
+ model.load_state_dict(checkpoint['model_state_dict'])
497
+
498
+ # Load vocabulary
499
+ if 'vocab' in checkpoint:
500
+ model.rel2idx = checkpoint['vocab'].get('rel2idx', {})
501
+ model.idx2rel = checkpoint['vocab'].get('idx2rel', {})
502
+ else:
503
+ model.rel2idx = {}
504
+ model.idx2rel = {}
505
+
506
+ model.to(device)
507
+ return model
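
The Biaffine docstring above pins down the expected shapes; a short self-contained shape check (illustrative only, needs just torch and the class defined in this file) is:

    import torch
    from bamboo1.models.transformer_parser import Biaffine

    batch, seq_len = 2, 7
    arc_attn = Biaffine(500, output_dim=1, bias_x=True, bias_y=False)   # as in the parser's arc scorer
    rel_attn = Biaffine(100, output_dim=50, bias_x=True, bias_y=True)   # as in the relation scorer

    x = torch.randn(batch, seq_len, 500)   # dependent representations
    y = torch.randn(batch, seq_len, 500)   # head representations
    print(arc_attn(x, y).shape)            # torch.Size([2, 7, 7])
    print(rel_attn(torch.randn(batch, seq_len, 100),
                   torch.randn(batch, seq_len, 100)).shape)  # torch.Size([2, 7, 7, 50])

Note that the PhoBERT weights are only downloaded when PhoBERTDependencyParser itself is instantiated, and predict() additionally needs idx2rel, which is only populated by load() from a saved checkpoint's vocabulary.
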
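
Both loss() and decode() rely on the same advanced-indexing pattern to select, for every token, the relation-score vector of the arc coming from its gold or predicted head. In isolation, with made-up shapes:

    import torch

    batch, seq_len, n_rels = 2, 5, 7
    rel_scores = torch.randn(batch, seq_len, seq_len, n_rels)
    heads = torch.randint(0, seq_len, (batch, seq_len))

    b_idx = torch.arange(batch).unsqueeze(1)   # (batch, 1), broadcasts over tokens
    s_idx = torch.arange(seq_len)              # (seq_len,)
    picked = rel_scores[b_idx, s_idx, heads]   # (batch, seq_len, n_rels)
    assert picked.shape == (batch, seq_len, n_rels)
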
bamboo1/ud_corpus.py ADDED
@@ -0,0 +1,229 @@
1
+ """
2
+ UD Vietnamese VTB Corpus loader for dependency parsing.
3
+
4
+ This module provides a corpus class that downloads the UD Vietnamese VTB dataset
5
+ from Universal Dependencies for comparison with Trankit benchmark results.
6
+
7
+ UD Vietnamese VTB:
8
+ - Treebank size: ~3,300 sentences
9
+ - Source: Vietnamese Language and Speech Processing (VLSP)
10
+ - Standard benchmark for Vietnamese dependency parsing
11
+ """
12
+
13
+ import os
14
+ import tarfile
15
+ import urllib.request
16
+ from pathlib import Path
17
+ from typing import Optional
18
+
19
+
20
+ class UDVietnameseVTB:
21
+ """
22
+ Corpus class for UD Vietnamese VTB dataset.
23
+
24
+ This class downloads the UD Vietnamese VTB treebank from Universal Dependencies
25
+ for fair comparison with Trankit's reported benchmark results.
26
+
27
+ Attributes:
28
+ train: Path to the training data file (CoNLL-U format)
29
+ dev: Path to the development/validation data file (CoNLL-U format)
30
+ test: Path to the test data file (CoNLL-U format)
31
+
32
+ Example:
33
+ >>> from bamboo1.ud_corpus import UDVietnameseVTB
34
+ >>> corpus = UDVietnameseVTB()
35
+ >>> print(corpus.train) # Path to train.conllu
36
+ """
37
+
38
+ name = "UD_Vietnamese-VTB"
39
+
40
+ # UD Vietnamese VTB release URL (v2.14)
41
+ UD_VERSION = "2.14"
42
+ UD_BASE_URL = "https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master"
43
+
44
+ FILE_NAMES = {
45
+ "train": "vi_vtb-ud-train.conllu",
46
+ "dev": "vi_vtb-ud-dev.conllu",
47
+ "test": "vi_vtb-ud-test.conllu",
48
+ }
49
+
50
+ def __init__(self, data_dir: Optional[str] = None, force_download: bool = False):
51
+ """
52
+ Initialize the UD Vietnamese VTB corpus.
53
+
54
+ Args:
55
+ data_dir: Directory to store the CoNLL-U files.
56
+ Defaults to ./data/UD_Vietnamese-VTB
57
+ force_download: If True, re-download even if files exist.
58
+ """
59
+ if data_dir is None:
60
+ data_dir = Path(__file__).parent.parent / "data" / "UD_Vietnamese-VTB"
61
+ self.data_dir = Path(data_dir)
62
+ self.data_dir.mkdir(parents=True, exist_ok=True)
63
+
64
+ self._train = self.data_dir / self.FILE_NAMES["train"]
65
+ self._dev = self.data_dir / self.FILE_NAMES["dev"]
66
+ self._test = self.data_dir / self.FILE_NAMES["test"]
67
+
68
+ if force_download or not self._files_exist():
69
+ self._download()
70
+
71
+ def _files_exist(self) -> bool:
72
+ """Check if all required files exist."""
73
+ return self._train.exists() and self._dev.exists() and self._test.exists()
74
+
75
+ def _download(self):
76
+ """Download UD Vietnamese VTB files from GitHub."""
77
+ print(f"Downloading UD Vietnamese VTB from Universal Dependencies...")
78
+
79
+ for split, filename in self.FILE_NAMES.items():
80
+ url = f"{self.UD_BASE_URL}/{filename}"
81
+ output_path = self.data_dir / filename
82
+
83
+ print(f" Downloading {filename}...")
84
+ try:
85
+ urllib.request.urlretrieve(url, output_path)
86
+ except Exception as e:
87
+ print(f" Warning: Failed to download {filename}: {e}")
88
+ print(f" Trying alternative method...")
89
+ self._download_alternative()
90
+ return
91
+
92
+ print(f"Dataset saved to {self.data_dir}")
93
+ self._print_statistics()
94
+
95
+ def _download_alternative(self):
96
+ """Alternative download method using HuggingFace datasets."""
97
+ try:
98
+ from datasets import load_dataset
99
+
100
+ print(" Using HuggingFace datasets library...")
101
+ dataset = load_dataset("universal_dependencies", "vi_vtb")
102
+
103
+ for split_name, output_path in [
104
+ ("train", self._train),
105
+ ("validation", self._dev),
106
+ ("test", self._test),
107
+ ]:
108
+ self._convert_hf_split(dataset[split_name], output_path)
109
+
110
+ print(f"Dataset saved to {self.data_dir}")
111
+ self._print_statistics()
112
+
113
+ except Exception as e:
114
+ raise RuntimeError(
115
+ f"Failed to download UD Vietnamese VTB. "
116
+ f"Please download manually from: "
117
+ f"https://github.com/UniversalDependencies/UD_Vietnamese-VTB\n"
118
+ f"Error: {e}"
119
+ )
120
+
121
+ def _convert_hf_split(self, split, output_path: Path):
122
+ """Convert a HuggingFace dataset split to CoNLL-U format."""
123
+ with open(output_path, "w", encoding="utf-8") as f:
124
+ for idx, item in enumerate(split):
125
+ sent_id = item.get("idx", idx)
126
+ text = item.get("text", "")
127
+
128
+ f.write(f"# sent_id = {sent_id}\n")
129
+ if text:
130
+ f.write(f"# text = {text}\n")
131
+
132
+ tokens = item["tokens"]
133
+ lemmas = item.get("lemmas", ["_"] * len(tokens))
134
+ upos = item["upos"]
135
+ xpos = item.get("xpos", ["_"] * len(tokens))
136
+ feats = item.get("feats", [None] * len(tokens))
137
+ heads = item["head"]
138
+ deprels = item["deprel"]
139
+ deps = item.get("deps", [None] * len(tokens))
140
+ misc = item.get("misc", [None] * len(tokens))
141
+
142
+ for i in range(len(tokens)):
143
+ token_id = i + 1
144
+ form = tokens[i]
145
+ lemma = lemmas[i] if lemmas[i] else "_"
146
+ upos_tag = upos[i] if upos[i] else "_"
147
+ xpos_tag = xpos[i] if xpos[i] else "_"
148
+ feat = feats[i] if feats[i] else "_"
149
+ head = int(heads[i]) if heads[i] is not None else 0
150
+ deprel = deprels[i] if deprels[i] else "_"
151
+ dep = deps[i] if deps[i] else "_"
152
+ misc_val = misc[i] if misc[i] else "_"
153
+
154
+ line = f"{token_id}\t{form}\t{lemma}\t{upos_tag}\t{xpos_tag}\t{feat}\t{head}\t{deprel}\t{dep}\t{misc_val}"
155
+ f.write(line + "\n")
156
+
157
+ f.write("\n")
158
+
159
+ def _print_statistics(self):
160
+ """Print dataset statistics."""
161
+ for name, path in [("Train", self._train), ("Dev", self._dev), ("Test", self._test)]:
162
+ n_sents, n_tokens = self._count_sentences_tokens(path)
163
+ print(f" {name}: {n_sents} sentences, {n_tokens} tokens")
164
+
165
+ def _count_sentences_tokens(self, path: Path) -> tuple:
166
+ """Count sentences and tokens in a CoNLL-U file."""
167
+ n_sents = 0
168
+ n_tokens = 0
169
+
170
+ with open(path, "r", encoding="utf-8") as f:
171
+ for line in f:
172
+ line = line.strip()
173
+ if not line:
174
+ n_sents += 1
175
+ elif not line.startswith("#"):
176
+ parts = line.split("\t")
177
+ if "-" not in parts[0] and "." not in parts[0]:
178
+ n_tokens += 1
179
+
180
+ return n_sents, n_tokens
181
+
182
+ @property
183
+ def train(self) -> str:
184
+ """Path to training data file."""
185
+ return str(self._train)
186
+
187
+ @property
188
+ def dev(self) -> str:
189
+ """Path to development/validation data file."""
190
+ return str(self._dev)
191
+
192
+ @property
193
+ def test(self) -> str:
194
+ """Path to test data file."""
195
+ return str(self._test)
196
+
197
+ def get_statistics(self) -> dict:
198
+ """Get dataset statistics."""
199
+ stats = {}
200
+
201
+ for split_name, path in [
202
+ ("train", self._train),
203
+ ("dev", self._dev),
204
+ ("test", self._test)
205
+ ]:
206
+ n_sents, n_tokens = self._count_sentences_tokens(path)
207
+ stats[f"{split_name}_sentences"] = n_sents
208
+ stats[f"{split_name}_tokens"] = n_tokens
209
+
210
+ # Collect all POS tags and relations
211
+ all_upos = set()
212
+ all_deprels = set()
213
+
214
+ for path in [self._train, self._dev, self._test]:
215
+ with open(path, "r", encoding="utf-8") as f:
216
+ for line in f:
217
+ line = line.strip()
218
+ if line and not line.startswith("#"):
219
+ parts = line.split("\t")
220
+ if len(parts) >= 8 and "-" not in parts[0] and "." not in parts[0]:
221
+ all_upos.add(parts[3])
222
+ all_deprels.add(parts[7])
223
+
224
+ stats["num_upos_tags"] = len(all_upos)
225
+ stats["num_deprels"] = len(all_deprels)
226
+ stats["upos_tags"] = sorted(all_upos)
227
+ stats["deprels"] = sorted(all_deprels)
228
+
229
+ return stats
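
A sketch of the intended corpus workflow (the first call needs network access: it fetches the vi_vtb-ud-*.conllu files from the UD GitHub repository, falling back to the HuggingFace datasets loader):

    from bamboo1.ud_corpus import UDVietnameseVTB

    corpus = UDVietnameseVTB()                    # downloads into ./data/UD_Vietnamese-VTB if missing
    print(corpus.train, corpus.dev, corpus.test)  # paths to the CoNLL-U splits

    stats = corpus.get_statistics()
    print(stats["train_sentences"], stats["num_deprels"])
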
pyproject.toml CHANGED
@@ -9,7 +9,9 @@ dependencies = [
  "datasets>=2.14.0",
  "click>=8.0.0",
  "underthesea>=9.2.0",
- "transformers>=5.0.0",
+ "transformers>=4.30.0",
+ "tqdm>=4.60.0",
+ "numpy>=1.24.0",
  ]

  [project.optional-dependencies]
@@ -20,6 +22,9 @@ dev = [
  cloud = [
  "runpod>=1.6.0",
  ]
+ adapters = [
+ "adapters>=0.1.0",
+ ]

  [build-system]
  requires = ["hatchling"]
scripts/evaluate.py CHANGED
@@ -11,10 +11,15 @@
11
  """
12
  Evaluation script for Bamboo-1 Vietnamese Dependency Parser.
13
 
 
 
 
 
14
  Usage:
15
  uv run scripts/evaluate.py --model models/bamboo-1
16
- uv run scripts/evaluate.py --model models/bamboo-1 --split test
17
- uv run scripts/evaluate.py --model models/bamboo-1 --detailed
 
18
  """
19
 
20
  import sys
@@ -27,6 +32,7 @@ import click
27
  sys.path.insert(0, str(Path(__file__).parent.parent))
28
 
29
  from bamboo1.corpus import UDD1Corpus
 
30
 
31
 
32
  def read_conll_sentences(filepath: str):
@@ -103,12 +109,71 @@ def calculate_attachment_scores(gold_sentences, pred_sentences):
103
  }
104
 
105
 
  @click.command()
107
  @click.option(
108
  "--model", "-m",
109
  required=True,
110
  help="Path to trained model directory",
111
  )
112
  @click.option(
113
  "--split",
114
  type=click.Choice(["dev", "test", "both"]),
@@ -125,21 +190,32 @@ def calculate_attachment_scores(gold_sentences, pred_sentences):
125
  "--output", "-o",
126
  help="Save predictions to file (CoNLL-U format)",
127
  )
128
- def evaluate(model, split, detailed, output):
129
- """Evaluate Bamboo-1 Vietnamese Dependency Parser on UDD-1 dataset."""
130
- from underthesea.models.dependency_parser import DependencyParser
131
 
 
 
 
132
  click.echo("=" * 60)
133
  click.echo("Bamboo-1: Vietnamese Dependency Parser Evaluation")
134
  click.echo("=" * 60)
135
 
136
  # Load model
137
- click.echo(f"\nLoading model from {model}...")
138
- parser = DependencyParser.load(model)
 
 
 
 
 
 
139
 
140
  # Load corpus
141
- click.echo("Loading UDD-1 corpus...")
142
- corpus = UDD1Corpus()
 
 
 
143
 
144
  splits_to_eval = []
145
  if split == "both":
@@ -164,12 +240,11 @@ def evaluate(model, split, detailed, output):
164
  pred_sentences = []
165
 
166
  for gold_sent in gold_sentences:
167
- # Reconstruct text from tokens
168
  tokens = [tok["form"] for tok in gold_sent]
169
- text = " ".join(tokens)
170
 
171
  # Parse
172
- result = parser.predict(text)
173
 
174
  # Convert result to same format as gold
175
  pred_sent = []
 
11
  """
12
  Evaluation script for Bamboo-1 Vietnamese Dependency Parser.
13
 
14
+ Supports both BiLSTM and PhoBERT-based models, and multiple datasets:
15
+ - UDD-1: Main Vietnamese dependency dataset (~18K sentences)
16
+ - UD Vietnamese VTB: Universal Dependencies benchmark (~3.3K sentences)
17
+
18
  Usage:
19
  uv run scripts/evaluate.py --model models/bamboo-1
20
+ uv run scripts/evaluate.py --model models/bamboo-1-phobert --model-type phobert
21
+ uv run scripts/evaluate.py --model models/bamboo-1-phobert --dataset ud-vtb
22
+ uv run scripts/evaluate.py --model models/bamboo-1 --split test --detailed
23
  """
24
 
25
  import sys
 
32
  sys.path.insert(0, str(Path(__file__).parent.parent))
33
 
34
  from bamboo1.corpus import UDD1Corpus
35
+ from bamboo1.ud_corpus import UDVietnameseVTB
36
 
37
 
38
  def read_conll_sentences(filepath: str):
 
109
  }
110
 
111
 
112
+ def load_phobert_model(model_path, device='cuda'):
113
+ """Load PhoBERT-based model."""
114
+ import torch
115
+ from bamboo1.models.transformer_parser import PhoBERTDependencyParser
116
+
117
+ if not torch.cuda.is_available():
118
+ device = 'cpu'
119
+
120
+ return PhoBERTDependencyParser.load(model_path, device=device)
121
+
122
+
123
+ def predict_phobert(parser, words):
124
+ """Make predictions using PhoBERT model."""
125
+ import torch
126
+
127
+ parser.eval()
128
+ device = next(parser.parameters()).device
129
+
130
+ # Tokenize
131
+ encoded = parser.tokenize_with_alignment([words])
132
+ input_ids = encoded['input_ids'].to(device)
133
+ attention_mask = encoded['attention_mask'].to(device)
134
+ word_starts = encoded['word_starts'].to(device)
135
+ word_mask = encoded['word_mask'].to(device)
136
+
137
+ with torch.no_grad():
138
+ arc_scores, rel_scores = parser.forward(
139
+ input_ids, attention_mask, word_starts, word_mask
140
+ )
141
+ arc_preds, rel_preds = parser.decode(arc_scores, rel_scores, word_mask)
142
+
143
+ # Convert to list
144
+ arc_preds = arc_preds[0].cpu().tolist()
145
+ rel_preds = rel_preds[0].cpu().tolist()
146
+
147
+ results = []
148
+ for i, word in enumerate(words):
149
+ head = arc_preds[i]
150
+ rel_idx = rel_preds[i]
151
+ rel = parser.idx2rel.get(rel_idx, "dep")
152
+ results.append((word, head, rel))
153
+
154
+ return results
155
+
156
+
157
  @click.command()
158
  @click.option(
159
  "--model", "-m",
160
  required=True,
161
  help="Path to trained model directory",
162
  )
163
+ @click.option(
164
+ "--model-type",
165
+ type=click.Choice(["bilstm", "phobert"]),
166
+ default="bilstm",
167
+ help="Model type: bilstm (underthesea) or phobert (transformer)",
168
+ show_default=True,
169
+ )
170
+ @click.option(
171
+ "--dataset",
172
+ type=click.Choice(["udd1", "ud-vtb"]),
173
+ default="udd1",
174
+ help="Dataset: udd1 (UDD-1) or ud-vtb (UD Vietnamese VTB)",
175
+ show_default=True,
176
+ )
177
  @click.option(
178
  "--split",
179
  type=click.Choice(["dev", "test", "both"]),
 
190
  "--output", "-o",
191
  help="Save predictions to file (CoNLL-U format)",
192
  )
193
+ def evaluate(model, model_type, dataset, split, detailed, output):
194
+ """Evaluate Bamboo-1 Vietnamese Dependency Parser.
 
195
 
196
+ Supports both BiLSTM (underthesea) and PhoBERT-based models,
197
+ and evaluation on UDD-1 or UD Vietnamese VTB datasets.
198
+ """
199
  click.echo("=" * 60)
200
  click.echo("Bamboo-1: Vietnamese Dependency Parser Evaluation")
201
  click.echo("=" * 60)
202
 
203
  # Load model
204
+ click.echo(f"\nLoading {model_type} model from {model}...")
205
+ if model_type == "phobert":
206
+ parser = load_phobert_model(model)
207
+ predict_fn = lambda words: predict_phobert(parser, words)
208
+ else:
209
+ from underthesea.models.dependency_parser import DependencyParser
210
+ parser = DependencyParser.load(model)
211
+ predict_fn = lambda words: parser.predict(" ".join(words))
212
 
213
  # Load corpus
214
+ click.echo(f"Loading {dataset.upper()} corpus...")
215
+ if dataset == "udd1":
216
+ corpus = UDD1Corpus()
217
+ else:
218
+ corpus = UDVietnameseVTB()
219
 
220
  splits_to_eval = []
221
  if split == "both":
 
240
  pred_sentences = []
241
 
242
  for gold_sent in gold_sentences:
243
+ # Get tokens
244
  tokens = [tok["form"] for tok in gold_sent]
 
245
 
246
  # Parse
247
+ result = predict_fn(tokens)
248
 
249
  # Convert result to same format as gold
250
  pred_sent = []
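
For reference, UAS is the fraction of tokens whose predicted head matches gold, and LAS additionally requires the dependency label to match. A minimal illustration of the metrics the script reports (not the script's own implementation, which lives in calculate_attachment_scores):

    def uas_las(gold, pred):
        # gold, pred: aligned lists of (head, deprel) per token
        total = len(gold)
        uas = sum(gh == ph for (gh, _), (ph, _) in zip(gold, pred)) / total
        las = sum(gh == ph and gr == pr for (gh, gr), (ph, pr) in zip(gold, pred)) / total
        return uas, las

    gold = [(2, "nsubj"), (0, "root"), (2, "obj")]
    pred = [(2, "nsubj"), (0, "root"), (2, "nmod")]
    print(uas_las(gold, pred))  # (1.0, 0.666...)
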
scripts/run_phobert_runpod.sh ADDED
@@ -0,0 +1,322 @@
1
+ #!/bin/bash
2
+ # Run PhoBERT dependency parser training on RunPod
3
+ #
4
+ # Usage:
5
+ # ./scripts/run_phobert_runpod.sh setup # Install uv, clone repo, sync deps
6
+ # ./scripts/run_phobert_runpod.sh train # Train PhoBERT on UDD-1
7
+ # ./scripts/run_phobert_runpod.sh train-vtb # Train PhoBERT on UD Vietnamese VTB
8
+ # ./scripts/run_phobert_runpod.sh train-large # Train with PhoBERT-large
9
+ # ./scripts/run_phobert_runpod.sh eval # Evaluate trained model
10
+ # ./scripts/run_phobert_runpod.sh download # Download trained model
11
+ # ./scripts/run_phobert_runpod.sh ssh # Interactive SSH session
12
+ # ./scripts/run_phobert_runpod.sh <command> # Run custom command
13
+ #
14
+ # Environment variables:
15
+ # RUNPOD_HOST Pod IP address
16
+ # RUNPOD_PORT Pod SSH port
17
+ # WANDB_API_KEY (optional) W&B API key for logging
18
+
19
+ set -e
20
+
21
+ # Pod connection details (update these after launching pod)
22
+ POD_HOST="${RUNPOD_HOST:-213.173.99.13}"
23
+ POD_PORT="${RUNPOD_PORT:-11375}"
24
+ SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR"
25
+
26
+ # Training defaults
27
+ MODEL_DIR="models/bamboo-1-phobert"
28
+ ENCODER="vinai/phobert-base"
29
+ EPOCHS=100
30
+ BATCH_SIZE="${BATCH_SIZE:-48}" # Auto: 48 for A5000, increase for larger GPUs
31
+ PATIENCE=10
32
+ FP16="--fp16" # Enable mixed precision for ~2x speedup
33
+
34
+ ssh_cmd() {
35
+ ssh $SSH_OPTS root@$POD_HOST -p $POD_PORT "$@"
36
+ }
37
+
38
+ scp_to_pod() {
39
+ scp $SSH_OPTS -P $POD_PORT "$1" root@$POD_HOST:"$2"
40
+ }
41
+
42
+ scp_from_pod() {
43
+ scp $SSH_OPTS -P $POD_PORT root@$POD_HOST:"$1" "$2"
44
+ }
45
+
46
+ terminate_pod() {
47
+ echo ""
48
+ echo "Terminating pod..."
49
+ cd "$(dirname "$0")/.." && uv run scripts/runpod_setup.py status 2>/dev/null | grep -oP '\(\K[a-z0-9]+(?=\))' | head -1 | xargs -I {} uv run scripts/runpod_setup.py terminate {} 2>/dev/null || echo "Could not auto-terminate. Run: uv run scripts/runpod_setup.py terminate <pod-id>"
50
+ }
51
+
52
+ # Build wandb flags if API key is set
53
+ get_wandb_flags() {
54
+ if [ -n "$WANDB_API_KEY" ]; then
55
+ echo "--wandb --wandb-project bamboo-1-phobert"
56
+ fi
57
+ }
58
+
59
+ # Build wandb env export for SSH commands
60
+ get_wandb_env() {
61
+ if [ -n "$WANDB_API_KEY" ]; then
62
+ echo "export WANDB_API_KEY=$WANDB_API_KEY && "
63
+ fi
64
+ }
65
+
66
+ case "${1:-help}" in
67
+ setup)
68
+ echo "Setting up environment on RunPod..."
69
+
70
+ # Install uv
71
+ ssh_cmd 'curl -LsSf https://astral.sh/uv/install.sh | sh'
72
+
73
+ # Clone repo
74
+ ssh_cmd 'source $HOME/.local/bin/env && git clone https://huggingface.co/undertheseanlp/bamboo-1 /workspace/bamboo-1 || true'
75
+
76
+ # Pull latest and sync dependencies
77
+ ssh_cmd 'source $HOME/.local/bin/env && cd /workspace/bamboo-1 && git pull && uv sync'
78
+
79
+ echo "Setup complete!"
80
+
81
+ if [ -n "$WANDB_API_KEY" ]; then
82
+ echo ""
83
+ echo "WANDB_API_KEY detected - it will be passed automatically during training."
84
+ fi
85
+ echo ""
86
+ echo "Next steps:"
87
+ echo " ./scripts/run_phobert_runpod.sh train # Train on UDD-1"
88
+ echo " ./scripts/run_phobert_runpod.sh train-vtb # Train on UD-VTB (Trankit benchmark)"
89
+ ;;
90
+
91
+ train)
92
+ echo "Training PhoBERT dependency parser on UDD-1..."
93
+ echo " Encoder: $ENCODER"
94
+ echo " Output: $MODEL_DIR"
95
+ echo " Epochs: $EPOCHS"
96
+
97
+ WANDB_FLAGS=$(get_wandb_flags)
98
+ WANDB_ENV=$(get_wandb_env)
99
+
100
+ ssh_cmd "${WANDB_ENV}source \$HOME/.local/bin/env && cd /workspace/bamboo-1 && \
101
+ uv run scripts/train_phobert.py \
102
+ --output $MODEL_DIR \
103
+ --encoder $ENCODER \
104
+ --dataset udd1 \
105
+ --epochs $EPOCHS \
106
+ --batch-size $BATCH_SIZE \
107
+ --patience $PATIENCE \
108
+ $FP16 \
109
+ $WANDB_FLAGS"
110
+
111
+ echo ""
112
+ echo "Training complete! Download model with:"
113
+ echo " ./scripts/run_phobert_runpod.sh download"
114
+ ;;
115
+
116
+ train-vtb)
117
+ echo "Training PhoBERT dependency parser on UD Vietnamese VTB..."
118
+ echo " (For comparison with Trankit benchmark)"
119
+ echo " Encoder: $ENCODER"
120
+ echo " Output: ${MODEL_DIR}-vtb"
121
+ echo " Epochs: $EPOCHS"
122
+
123
+ WANDB_FLAGS=$(get_wandb_flags)
124
+ WANDB_ENV=$(get_wandb_env)
125
+
126
+ ssh_cmd "${WANDB_ENV}source \$HOME/.local/bin/env && cd /workspace/bamboo-1 && \
127
+ uv run scripts/train_phobert.py \
128
+ --output ${MODEL_DIR}-vtb \
129
+ --encoder $ENCODER \
130
+ --dataset ud-vtb \
131
+ --epochs $EPOCHS \
132
+ --batch-size $BATCH_SIZE \
133
+ --patience $PATIENCE \
134
+ $FP16 \
135
+ $WANDB_FLAGS"
136
+
137
+ echo ""
138
+ echo "Training complete! Download model with:"
139
+ echo " ./scripts/run_phobert_runpod.sh download-vtb"
140
+ ;;
141
+
142
+ train-large)
143
+ echo "Training PhoBERT-large dependency parser on UDD-1..."
144
+ echo " Encoder: vinai/phobert-large"
145
+ echo " Output: ${MODEL_DIR}-large"
146
+ echo " Epochs: $EPOCHS"
147
+ echo " (Note: Requires GPU with >= 24GB VRAM)"
148
+
149
+ WANDB_FLAGS=$(get_wandb_flags)
150
+ WANDB_ENV=$(get_wandb_env)
151
+
152
+ ssh_cmd "${WANDB_ENV}source \$HOME/.local/bin/env && cd /workspace/bamboo-1 && \
153
+ uv run scripts/train_phobert.py \
154
+ --output ${MODEL_DIR}-large \
155
+ --encoder vinai/phobert-large \
156
+ --dataset udd1 \
157
+ --epochs $EPOCHS \
158
+ --batch-size 24 \
159
+ --patience $PATIENCE \
160
+ $FP16 \
161
+ $WANDB_FLAGS"
162
+
163
+ echo ""
164
+ echo "Training complete! Download model with:"
165
+ echo " ./scripts/run_phobert_runpod.sh download-large"
166
+ ;;
167
+
168
+ train-quick)
169
+ echo "Quick training run (100 samples) for testing..."
170
+
171
+ WANDB_FLAGS=$(get_wandb_flags)
172
+ WANDB_ENV=$(get_wandb_env)
173
+
174
+ ssh_cmd "${WANDB_ENV}source \$HOME/.local/bin/env && cd /workspace/bamboo-1 && \
175
+ uv run scripts/train_phobert.py \
176
+ --output ${MODEL_DIR}-test \
177
+ --encoder $ENCODER \
178
+ --dataset udd1 \
179
+ --epochs 5 \
180
+ --batch-size $BATCH_SIZE \
181
+ --sample 100 \
182
+ $FP16 \
183
+ $WANDB_FLAGS"
184
+ ;;
185
+
186
+ train-fast)
187
+ echo "FAST Trankit reproduction (<5 min) - H100 settings!"
188
+ echo " Dataset: UD Vietnamese VTB (Trankit benchmark)"
189
+ echo " Encoder: $ENCODER"
190
+ echo " Output: ${MODEL_DIR}-vtb"
191
+ echo " Settings: batch=256, epochs=30, patience=5"
192
+ echo ""
193
+ echo " Target: Trankit base 70.96% UAS / 64.76% LAS"
194
+
195
+ WANDB_FLAGS=$(get_wandb_flags)
196
+ WANDB_ENV=$(get_wandb_env)
197
+
198
+ ssh_cmd "${WANDB_ENV}source \$HOME/.local/bin/env && cd /workspace/bamboo-1 && \
199
+ uv run scripts/train_phobert.py \
200
+ --output ${MODEL_DIR}-vtb \
201
+ --encoder $ENCODER \
202
+ --dataset ud-vtb \
203
+ --epochs 30 \
204
+ --batch-size 256 \
205
+ --patience 5 \
206
+ --warmup-steps 50 \
207
+ --num-workers 8 \
208
+ $FP16 \
209
+ $WANDB_FLAGS"
210
+
211
+ echo ""
212
+ echo "Training complete! Download model with:"
213
+ echo " ./scripts/run_phobert_runpod.sh download-vtb"
214
+ ;;
215
+
216
+ eval)
217
+ echo "Evaluating PhoBERT model on UDD-1 test set..."
218
+
219
+ ssh_cmd "source \$HOME/.local/bin/env && cd /workspace/bamboo-1 && \
220
+ uv run scripts/evaluate.py \
221
+ --model $MODEL_DIR \
222
+ --model-type phobert \
223
+ --dataset udd1 \
224
+ --split test \
225
+ --detailed"
226
+ ;;
227
+
228
+ eval-vtb)
229
+ echo "Evaluating PhoBERT model on UD Vietnamese VTB test set..."
230
+ echo " (For comparison with Trankit: 70.96% UAS / 64.76% LAS)"
231
+
232
+ ssh_cmd "source \$HOME/.local/bin/env && cd /workspace/bamboo-1 && \
233
+ uv run scripts/evaluate.py \
234
+ --model ${MODEL_DIR}-vtb \
235
+ --model-type phobert \
236
+ --dataset ud-vtb \
237
+ --split test \
238
+ --detailed"
239
+ ;;
240
+
241
+ download)
242
+ echo "Downloading trained model from RunPod..."
243
+ mkdir -p models/bamboo-1-phobert
244
+ scp_from_pod "/workspace/bamboo-1/$MODEL_DIR/*" "models/bamboo-1-phobert/"
245
+ echo "Model downloaded to models/bamboo-1-phobert/"
246
+ ;;
247
+
248
+ download-vtb)
249
+ echo "Downloading VTB-trained model from RunPod..."
250
+ mkdir -p models/bamboo-1-phobert-vtb
251
+ scp_from_pod "/workspace/bamboo-1/${MODEL_DIR}-vtb/*" "models/bamboo-1-phobert-vtb/"
252
+ echo "Model downloaded to models/bamboo-1-phobert-vtb/"
253
+ ;;
254
+
255
+ download-large)
256
+ echo "Downloading PhoBERT-large model from RunPod..."
257
+ mkdir -p models/bamboo-1-phobert-large
258
+ scp_from_pod "/workspace/bamboo-1/${MODEL_DIR}-large/*" "models/bamboo-1-phobert-large/"
259
+ echo "Model downloaded to models/bamboo-1-phobert-large/"
260
+ ;;
261
+
262
+ logs)
263
+ echo "Tailing training logs..."
264
+ ssh_cmd "tail -f /workspace/bamboo-1/training.log 2>/dev/null || echo 'No log file found. Training may not have started yet.'"
265
+ ;;
266
+
267
+ gpu-status)
268
+ echo "GPU status on RunPod..."
269
+ ssh_cmd "nvidia-smi"
270
+ ;;
271
+
272
+ ssh)
273
+ echo "Connecting to RunPod..."
274
+ ssh $SSH_OPTS root@$POD_HOST -p $POD_PORT
275
+ ;;
276
+
277
+ help|--help|-h)
278
+ echo "Usage: $0 <command>"
279
+ echo ""
280
+ echo "PhoBERT Training Commands:"
281
+ echo " setup Install uv, clone repo, sync dependencies"
282
+ echo " train-fast FAST Trankit reproduction <5 min (H100, UD-VTB)"
283
+ echo " train Train PhoBERT on UDD-1 dataset (18K sentences)"
284
+ echo " train-vtb Train PhoBERT on UD Vietnamese VTB (Trankit benchmark)"
285
+ echo " train-large Train PhoBERT-large on UDD-1 (requires 24GB+ VRAM)"
286
+ echo " train-quick Quick test run with 100 samples"
287
+ echo ""
288
+ echo "Evaluation Commands:"
289
+ echo " eval Evaluate model on UDD-1 test set"
290
+ echo " eval-vtb Evaluate model on UD-VTB test set"
291
+ echo ""
292
+ echo "Utility Commands:"
293
+ echo " download Download trained model (UDD-1)"
294
+ echo " download-vtb Download trained model (UD-VTB)"
295
+ echo " download-large Download trained model (PhoBERT-large)"
296
+ echo " logs Tail training logs"
297
+ echo " gpu-status Show GPU utilization"
298
+ echo " ssh Interactive SSH session"
299
+ echo " <cmd> Run custom command on pod"
300
+ echo ""
301
+ echo "Environment variables:"
302
+ echo " RUNPOD_HOST Pod IP address (default: $POD_HOST)"
303
+ echo " RUNPOD_PORT Pod SSH port (default: $POD_PORT)"
304
+ echo " WANDB_API_KEY W&B API key for experiment tracking (optional)"
305
+ echo " BATCH_SIZE Override default batch size (default: 48)"
306
+ echo ""
307
+ echo "GPU Recommendations (for launch-phobert):"
308
+ echo " A4000 (16GB) - Budget, ~\$0.20/hr, batch_size=32"
309
+ echo " A5000 (24GB) - Recommended, ~\$0.30/hr, batch_size=48 (default)"
310
+ echo " A6000 (48GB) - Fast, ~\$0.50/hr, batch_size=64"
311
+ echo " A100 (80GB) - Fastest, ~\$1.50/hr, batch_size=128"
312
+ echo ""
313
+ echo "Trankit Benchmark Reference:"
314
+ echo " Trankit base: 70.96% UAS / 64.76% LAS (UD Vietnamese VTB)"
315
+ echo " Trankit large: 71.07% UAS / 65.37% LAS (UD Vietnamese VTB)"
316
+ ;;
317
+
318
+ *)
319
+ # Run custom command
320
+ ssh_cmd "source \$HOME/.local/bin/env && cd /workspace/bamboo-1 && $*"
321
+ ;;
322
+ esac
scripts/runpod_setup.py CHANGED
@@ -3,6 +3,7 @@
3
  # dependencies = [
4
  # "runpod>=1.6.0",
5
  # "requests>=2.28.0",
 
6
  # ]
7
  # ///
8
  """
@@ -29,9 +30,15 @@ Usage:
29
  """
30
 
31
  import os
 
 
32
  import click
33
  import runpod
34
  import requests
 
 
 
 
35
 
36
 
37
  @click.group()
@@ -147,7 +154,12 @@ def status():
147
 
148
  click.echo("Active pods:")
149
  for pod in pods:
150
- click.echo(f" - {pod['name']} ({pod['id']}): {pod.get('desiredStatus', 'UNKNOWN')}")
 
 
 
 
 
151
 
152
 
153
  @cli.command()
@@ -168,6 +180,139 @@ def terminate(pod_id):
168
  click.echo("Pod terminated.")
169
 
170
 
171
  # =============================================================================
172
  # Volume Management
173
  # =============================================================================
@@ -192,6 +337,117 @@ def _graphql_request(query: str, variables: dict = None) -> dict:
192
  return response.json()
193
 
194
 
195
  @cli.command("volume-list")
196
  def volume_list():
197
  """List all network volumes."""
 
3
  # dependencies = [
4
  # "runpod>=1.6.0",
5
  # "requests>=2.28.0",
6
+ # "python-dotenv>=1.0.0",
7
  # ]
8
  # ///
9
  """
 
30
  """
31
 
32
  import os
33
+ from pathlib import Path
34
+
35
  import click
36
  import runpod
37
  import requests
38
+ from dotenv import load_dotenv
39
+
40
+ # Load .env file from project root
41
+ load_dotenv(Path(__file__).parent.parent / ".env")
42
 
43
 
44
  @click.group()
 
154
 
155
  click.echo("Active pods:")
156
  for pod in pods:
157
+ click.echo(f"\n {pod['name']} ({pod['id']}): {pod.get('desiredStatus', 'UNKNOWN')}")
158
+ runtime = pod.get('runtime') or {}
159
+ ports = runtime.get('ports') or []
160
+ for p in ports:
161
+ if p.get('privatePort') == 22:
162
+ click.echo(f" SSH: ssh root@{p.get('ip')} -p {p.get('publicPort')}")
163
 
164
 
165
  @cli.command()
 
180
  click.echo("Pod terminated.")
181
 
182
 
183
+ GPU_RECOMMENDATIONS = {
184
+ "budget": "NVIDIA RTX A4000", # 16GB, $0.20/hr - Basic training
185
+ "balanced": "NVIDIA RTX A5000", # 24GB, $0.30/hr - Good balance (Recommended)
186
+ "fast": "NVIDIA RTX A6000", # 48GB, $0.50/hr - Larger batches, faster
187
+ "fastest": "NVIDIA A100 80GB PCIe", # 80GB, $1.50/hr - Best for production
188
+ }
189
+
190
+
191
+ @cli.command("launch-phobert")
192
+ @click.option("--gpu", default="NVIDIA RTX A5000",
193
+ help="GPU type: A4000 (budget), A5000 (balanced), A6000 (fast), A100 (fastest)")
194
+ @click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
195
+ @click.option("--disk", default=30, type=int, help="Disk size in GB (PhoBERT needs more space)")
196
+ @click.option("--name", default="bamboo-1-phobert", help="Pod name")
197
+ @click.option("--volume", default=None, help="Network volume ID to attach")
198
+ @click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
199
+ @click.option("--dataset", type=click.Choice(["udd1", "ud-vtb"]), default="udd1",
200
+ help="Dataset: udd1 or ud-vtb (Trankit benchmark)")
201
+ @click.option("--encoder", default="vinai/phobert-base",
202
+ help="Encoder: vinai/phobert-base or vinai/phobert-large")
203
+ @click.option("--epochs", default=100, type=int, help="Number of epochs")
204
+ @click.option("--sample", default=0, type=int, help="Sample N sentences (0=all)")
205
+ @click.option("--batch-size", default=0, type=int, help="Batch size (0=auto based on GPU)")
206
+ def launch_phobert(gpu, image, disk, name, volume, wandb_key, dataset, encoder, epochs, sample, batch_size):
207
+ """Launch a RunPod instance for PhoBERT training.
208
+
209
+ This launches a pod configured for training the PhoBERT-based dependency parser.
210
+ After the pod starts, SSH in and run the training command printed below.
211
+
212
+ GPU Recommendations:
213
+ A4000 (16GB) - Budget option, batch_size=32
214
+ A5000 (24GB) - Recommended balance, batch_size=48-64
215
+ A6000 (48GB) - Fast training, batch_size=64-96
216
+ A100 (80GB) - Fastest, batch_size=128+
217
+
218
+ Example:
219
+ uv run scripts/runpod_setup.py launch-phobert
220
+ uv run scripts/runpod_setup.py launch-phobert --gpu "NVIDIA RTX A6000" # Faster
221
+ uv run scripts/runpod_setup.py launch-phobert --dataset ud-vtb # Trankit benchmark
222
+ uv run scripts/runpod_setup.py launch-phobert --encoder vinai/phobert-large --gpu "NVIDIA RTX A6000"
223
+ """
224
+ # Auto-select batch size based on GPU if not specified
225
+ if batch_size == 0:
226
+ if "A100" in gpu or "H100" in gpu:
227
+ batch_size = 128
228
+ elif "A6000" in gpu:
229
+ batch_size = 64
230
+ elif "A5000" in gpu:
231
+ batch_size = 48
232
+ else: # A4000 or unknown
233
+ batch_size = 32
234
+
235
+ # Reduce batch size for large encoder
236
+ if "large" in encoder:
237
+ batch_size = batch_size // 2
238
+
239
+ click.echo("Launching RunPod instance for PhoBERT training...")
240
+ click.echo(f" GPU: {gpu}")
241
+ click.echo(f" Image: {image}")
242
+ click.echo(f" Disk: {disk}GB")
243
+ click.echo(f" Dataset: {dataset}")
244
+ click.echo(f" Encoder: {encoder}")
245
+ click.echo(f" Batch size: {batch_size}")
246
+
247
+ # Build training command with optimizations
248
+ train_cmd = f"uv run scripts/train_phobert.py --encoder {encoder} --dataset {dataset} --epochs {epochs} --batch-size {batch_size} --fp16"
249
+ if sample > 0:
250
+ train_cmd += f" --sample {sample}"
251
+ if wandb_key:
252
+ train_cmd += " --wandb --wandb-project bamboo-1-phobert"
253
+
254
+ # Output directory based on config
255
+ output_suffix = ""
256
+ if dataset == "ud-vtb":
257
+ output_suffix += "-vtb"
258
+ if "large" in encoder:
259
+ output_suffix += "-large"
260
+ train_cmd += f" --output models/bamboo-1-phobert{output_suffix}"
261
+
262
+ # Set environment variables
263
+ env_vars = {}
264
+ if wandb_key:
265
+ env_vars["WANDB_API_KEY"] = wandb_key
266
+
267
+ # Add SSH public key
268
+ ssh_key = get_ssh_public_key()
269
+ if ssh_key:
270
+ env_vars["PUBLIC_KEY"] = ssh_key
271
+ click.echo(" SSH key: configured")
272
+
273
+ if volume:
274
+ click.echo(f" Volume: {volume}")
275
+
276
+ pod = runpod.create_pod(
277
+ name=name,
278
+ image_name=image,
279
+ gpu_type_id=gpu,
280
+ volume_in_gb=disk,
281
+ env=env_vars if env_vars else None,
282
+ ports="22/tcp",
283
+ network_volume_id=volume,
284
+ )
285
+
286
+ click.echo("\nPod created!")
287
+ click.echo(f" ID: {pod['id']}")
288
+ click.echo(f" Status: {pod.get('desiredStatus', 'PENDING')}")
289
+ click.echo("\nMonitor at: https://runpod.io/console/pods")
290
+
291
+ # Generate setup and training commands
292
+ click.echo("\n" + "="*70)
293
+ click.echo("After SSH into the pod, run these commands:")
294
+ click.echo("="*70)
295
+
296
+ setup_cmd = """curl -LsSf https://astral.sh/uv/install.sh | sh && \\
297
+ source $HOME/.local/bin/env && \\
298
+ git clone https://huggingface.co/undertheseanlp/bamboo-1 /workspace/bamboo-1 && \\
299
+ cd /workspace/bamboo-1 && uv sync"""
300
+
301
+ click.echo("\n# 1. Setup (run once):")
302
+ click.echo(setup_cmd)
303
+
304
+ click.echo("\n# 2. Train:")
305
+ click.echo(f"cd /workspace/bamboo-1 && {train_cmd}")
306
+
307
+ click.echo("\n" + "="*70)
308
+
309
+ if dataset == "ud-vtb":
310
+ click.echo("\nTranskit benchmark reference:")
311
+ click.echo(" Trankit base: 70.96% UAS / 64.76% LAS")
312
+ click.echo(" Trankit large: 71.07% UAS / 65.37% LAS")
313
+ click.echo("")
314
+
315
+
316
  # =============================================================================
317
  # Volume Management
318
  # =============================================================================
 
337
  return response.json()
338
 
339
 
340
+ @cli.command("launch-fast")
341
+ @click.option("--gpu", default="NVIDIA H100 80GB HBM3", help="GPU type (H100 for fastest)")
342
+ @click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
343
+ @click.option("--disk", default=30, type=int, help="Disk size in GB")
344
+ @click.option("--name", default="bamboo-1-trankit", help="Pod name")
345
+ @click.option("--volume", default=None, help="Network volume ID to attach")
346
+ @click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
347
+ @click.option("--encoder", default="vinai/phobert-base", help="Encoder model")
348
+ def launch_fast(gpu, image, disk, name, volume, wandb_key, encoder):
349
+ """Launch pod for FAST Trankit reproduction (<5 minutes).
350
+
351
+ Trains on UD Vietnamese VTB to reproduce Trankit benchmark:
352
+ - Trankit base: 70.96% UAS / 64.76% LAS
353
+ - Trankit large: 71.07% UAS / 65.37% LAS
354
+
355
+ Uses H100 with aggressive settings for <5 min training.
356
+
357
+ Example:
358
+ uv run scripts/runpod_setup.py launch-fast
359
+ uv run scripts/runpod_setup.py launch-fast --encoder vinai/phobert-large
360
+ """
361
+ dataset = "ud-vtb" # Always use UD-VTB for Trankit reproduction
362
+
363
+ # Set batch size based on GPU
364
+ if "H100" in gpu:
365
+ batch_size = 256
366
+ epochs = 30
367
+ elif "A100" in gpu:
368
+ batch_size = 128
369
+ epochs = 40
370
+ else:
371
+ batch_size = 64
372
+ epochs = 50
373
+ click.echo("WARNING: For <5 min training, use H100!")
374
+
375
+ # Reduce batch for large model
376
+ if "large" in encoder:
377
+ batch_size = batch_size // 2
378
+
379
+ click.echo("Launching FAST Trankit reproduction (<5 minutes)...")
380
+ click.echo(f" GPU: {gpu}")
381
+ click.echo(f" Batch size: {batch_size}")
382
+ click.echo(f" Epochs: {epochs}")
383
+ click.echo(f" Dataset: {dataset} (UD Vietnamese VTB)")
384
+ click.echo(f" Encoder: {encoder}")
385
+ click.echo("")
386
+ click.echo(" Target: Trankit base 70.96% UAS / 64.76% LAS")
387
+
388
+ # Output name
389
+ output_name = "models/bamboo-1-phobert-vtb"
390
+ if "large" in encoder:
391
+ output_name += "-large"
392
+
393
+ # Build optimized training command
394
+ train_cmd = f"""uv run scripts/train_phobert.py \\
395
+ --encoder {encoder} \\
396
+ --dataset {dataset} \\
397
+ --output {output_name} \\
398
+ --epochs {epochs} \\
399
+ --batch-size {batch_size} \\
400
+ --patience 5 \\
401
+ --warmup-steps 50 \\
402
+ --num-workers 8 \\
403
+ --fp16"""
404
+
405
+ if wandb_key:
406
+ train_cmd += " --wandb --wandb-project bamboo-1-phobert"
407
+
408
+ # Set environment variables
409
+ env_vars = {}
410
+ if wandb_key:
411
+ env_vars["WANDB_API_KEY"] = wandb_key
412
+
413
+ ssh_key = get_ssh_public_key()
414
+ if ssh_key:
415
+ env_vars["PUBLIC_KEY"] = ssh_key
416
+ click.echo(" SSH key: configured")
417
+
418
+ if volume:
419
+ click.echo(f" Volume: {volume}")
420
+
421
+ pod = runpod.create_pod(
422
+ name=name,
423
+ image_name=image,
424
+ gpu_type_id=gpu,
425
+ volume_in_gb=disk,
426
+ env=env_vars if env_vars else None,
427
+ ports="22/tcp",
428
+ network_volume_id=volume,
429
+ )
430
+
431
+ click.echo(f"\nPod created!")
432
+ click.echo(f" ID: {pod['id']}")
433
+ click.echo(f" Status: {pod.get('desiredStatus', 'PENDING')}")
434
+ click.echo("\nMonitor at: https://runpod.io/console/pods")
435
+
436
+ # One-liner setup + train
437
+ click.echo("\n" + "="*70)
438
+ click.echo("SSH in and run this ONE command for <5 min training:")
439
+ click.echo("="*70)
440
+
441
+ one_liner = f"""curl -LsSf https://astral.sh/uv/install.sh | sh && \\
442
+ source $HOME/.local/bin/env && \\
443
+ git clone https://huggingface.co/undertheseanlp/bamboo-1 /workspace/bamboo-1 && \\
444
+ cd /workspace/bamboo-1 && uv sync && \\
445
+ {train_cmd}"""
446
+
447
+ click.echo(one_liner)
448
+ click.echo("="*70)
449
+
450
+
451
  @cli.command("volume-list")
452
  def volume_list():
453
  """List all network volumes."""
scripts/train_phobert.py ADDED
@@ -0,0 +1,603 @@
1
+ # /// script
2
+ # requires-python = ">=3.10"
3
+ # dependencies = [
4
+ # "torch>=2.0.0",
5
+ # "transformers>=4.30.0",
6
+ # "datasets>=2.14.0",
7
+ # "click>=8.0.0",
8
+ # "tqdm>=4.60.0",
9
+ # "wandb>=0.15.0",
10
+ # ]
11
+ # ///
12
+ """
13
+ Training script for PhoBERT-based Vietnamese Dependency Parser.
14
+
15
+ This script trains a transformer-based dependency parser using PhoBERT as the
16
+ encoder, following the Trankit approach for Vietnamese dependency parsing.
17
+
18
+ Architecture:
19
+ PhoBERT -> Word-level pooling -> Biaffine attention -> MST decoding
20
+
21
+ Usage:
22
+ uv run scripts/train_phobert.py
23
+ uv run scripts/train_phobert.py --output models/bamboo-1-phobert --epochs 100
24
+ uv run scripts/train_phobert.py --encoder vinai/phobert-large
25
+ uv run scripts/train_phobert.py --dataset ud-vtb # Use UD Vietnamese VTB
26
+ """
27
+
28
+ import sys
29
+ from pathlib import Path
30
+ from collections import Counter
31
+ from dataclasses import dataclass
32
+ from typing import List, Tuple, Optional, Dict
33
+
34
+ import torch
35
+ import torch.nn as nn
36
+ from torch.utils.data import Dataset, DataLoader
37
+ from torch.optim import AdamW
38
+ from tqdm import tqdm
39
+
40
+ import click
41
+
42
+ sys.path.insert(0, str(Path(__file__).parent.parent))
43
+ from bamboo1.corpus import UDD1Corpus
44
+ from bamboo1.ud_corpus import UDVietnameseVTB
45
+ from bamboo1.models.transformer_parser import PhoBERTDependencyParser
46
+ from scripts.cost_estimate import CostTracker, detect_hardware
47
+
48
+
49
+ # ============================================================================
50
+ # Data Processing
51
+ # ============================================================================
52
+
53
+ @dataclass
54
+ class Sentence:
55
+ """A dependency-parsed sentence."""
56
+ words: List[str]
57
+ heads: List[int]
58
+ rels: List[str]
59
+
60
+
61
+ def read_conllu(path: str) -> List[Sentence]:
62
+ """Read CoNLL-U file and return list of sentences."""
63
+ sentences = []
64
+ words, heads, rels = [], [], []
65
+
66
+ with open(path, 'r', encoding='utf-8') as f:
67
+ for line in f:
68
+ line = line.strip()
69
+ if not line:
70
+ if words:
71
+ sentences.append(Sentence(words, heads, rels))
72
+ words, heads, rels = [], [], []
73
+ elif line.startswith('#'):
74
+ continue
75
+ else:
76
+ parts = line.split('\t')
77
+ if '-' in parts[0] or '.' in parts[0]:
78
+ continue
79
+ words.append(parts[1])
80
+ heads.append(int(parts[6]))
81
+ rels.append(parts[7])
82
+
83
+ if words:
84
+ sentences.append(Sentence(words, heads, rels))
85
+
86
+ return sentences
87
+
88
+
89
+ class Vocabulary:
90
+ """Vocabulary for relations."""
91
+
92
+ def __init__(self):
93
+ self.rel2idx = {}
94
+ self.idx2rel = {}
95
+
96
+ def build(self, sentences: List[Sentence]):
97
+ """Build vocabulary from sentences."""
98
+ rel_counts = Counter()
99
+ for sent in sentences:
100
+ for rel in sent.rels:
101
+ rel_counts[rel] += 1
102
+
103
+ for rel in sorted(rel_counts.keys()):
104
+ if rel not in self.rel2idx:
105
+ idx = len(self.rel2idx)
106
+ self.rel2idx[rel] = idx
107
+ self.idx2rel[idx] = rel
108
+
109
+ @property
110
+ def n_rels(self) -> int:
111
+ return len(self.rel2idx)
112
+
113
+
114
+ class PhoBERTDependencyDataset(Dataset):
115
+ """Dataset for PhoBERT dependency parsing."""
116
+
117
+ def __init__(
118
+ self,
119
+ sentences: List[Sentence],
120
+ vocab: Vocabulary,
121
+ tokenizer,
122
+ max_length: int = 256,
123
+ ):
124
+ self.sentences = sentences
125
+ self.vocab = vocab
126
+ self.tokenizer = tokenizer
127
+ self.max_length = max_length
128
+
129
+ def __len__(self):
130
+ return len(self.sentences)
131
+
132
+ def __getitem__(self, idx):
133
+ sent = self.sentences[idx]
134
+
135
+ # Tokenize with word boundary tracking
136
+ word_starts = []
137
+ subword_ids = [self.tokenizer.cls_token_id]
138
+
139
+ for word in sent.words:
140
+ word_starts.append(len(subword_ids))
141
+ word_tokens = self.tokenizer.encode(word, add_special_tokens=False)
142
+ if not word_tokens:
143
+ word_tokens = [self.tokenizer.unk_token_id]
144
+ subword_ids.extend(word_tokens)
145
+
146
+ subword_ids.append(self.tokenizer.sep_token_id)
147
+
148
+ # Truncate if needed
149
+ if len(subword_ids) > self.max_length:
150
+ subword_ids = subword_ids[:self.max_length-1] + [self.tokenizer.sep_token_id]
151
+ # Keep words that fit
152
+ valid_words = sum(1 for ws in word_starts if ws < self.max_length - 1)
153
+ word_starts = word_starts[:valid_words]
154
+ heads = sent.heads[:valid_words]
155
+ rels = sent.rels[:valid_words]
156
+ else:
157
+ heads = sent.heads
158
+ rels = sent.rels
159
+
160
+ # Encode relations
161
+ rel_ids = [self.vocab.rel2idx.get(r, 0) for r in rels]
162
+
163
+ return {
164
+ 'input_ids': subword_ids,
165
+ 'word_starts': word_starts,
166
+ 'heads': heads,
167
+ 'rels': rel_ids,
168
+ }
169
+
170
+
171
+ def collate_fn(batch):
172
+ """Collate function for DataLoader."""
173
+ # Get max lengths
174
+ max_subword_len = max(len(item['input_ids']) for item in batch)
175
+ max_word_len = max(len(item['word_starts']) for item in batch)
176
+
177
+ batch_size = len(batch)
178
+
179
+ # Initialize tensors
180
+ input_ids = torch.zeros(batch_size, max_subword_len, dtype=torch.long)
181
+ attention_mask = torch.zeros(batch_size, max_subword_len, dtype=torch.long)
182
+ word_starts = torch.zeros(batch_size, max_word_len, dtype=torch.long)
183
+ word_mask = torch.zeros(batch_size, max_word_len, dtype=torch.bool)
184
+ heads = torch.zeros(batch_size, max_word_len, dtype=torch.long)
185
+ rels = torch.zeros(batch_size, max_word_len, dtype=torch.long)
186
+
187
+ for i, item in enumerate(batch):
188
+ # Subwords
189
+ seq_len = len(item['input_ids'])
190
+ input_ids[i, :seq_len] = torch.tensor(item['input_ids'])
191
+ attention_mask[i, :seq_len] = 1
192
+
193
+ # Words
194
+ word_len = len(item['word_starts'])
195
+ word_starts[i, :word_len] = torch.tensor(item['word_starts'])
196
+ word_mask[i, :word_len] = True
197
+ heads[i, :word_len] = torch.tensor(item['heads'])
198
+ rels[i, :word_len] = torch.tensor(item['rels'])
199
+
200
+ return {
201
+ 'input_ids': input_ids,
202
+ 'attention_mask': attention_mask,
203
+ 'word_starts': word_starts,
204
+ 'word_mask': word_mask,
205
+ 'heads': heads,
206
+ 'rels': rels,
207
+ }
208
+
209
+
210
+ # ============================================================================
211
+ # Training
212
+ # ============================================================================
213
+
214
+ def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
215
+ """Create scheduler with linear warmup and linear decay."""
216
+ def lr_lambda(current_step):
217
+ if current_step < num_warmup_steps:
218
+ return float(current_step) / float(max(1, num_warmup_steps))
219
+ return max(
220
+ 0.0,
221
+ float(num_training_steps - current_step) /
222
+ float(max(1, num_training_steps - num_warmup_steps))
223
+ )
224
+ return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
225
+
226
+
227
+ def evaluate(model, dataloader, device):
228
+ """Evaluate model and return UAS/LAS."""
229
+ model.eval()
230
+
231
+ total_arcs = 0
232
+ correct_arcs = 0
233
+ correct_rels = 0
234
+
235
+ with torch.no_grad():
236
+ for batch in dataloader:
237
+ input_ids = batch['input_ids'].to(device)
238
+ attention_mask = batch['attention_mask'].to(device)
239
+ word_starts = batch['word_starts'].to(device)
240
+ word_mask = batch['word_mask'].to(device)
241
+ heads = batch['heads'].to(device)
242
+ rels = batch['rels'].to(device)
243
+
244
+ arc_scores, rel_scores = model(
245
+ input_ids, attention_mask, word_starts, word_mask
246
+ )
247
+ arc_preds, rel_preds = model.decode(arc_scores, rel_scores, word_mask)
248
+
249
+ # Count correct
250
+ arc_correct = (arc_preds == heads) & word_mask
251
+ rel_correct = (rel_preds == rels) & word_mask & arc_correct
252
+
253
+ total_arcs += word_mask.sum().item()
254
+ correct_arcs += arc_correct.sum().item()
255
+ correct_rels += rel_correct.sum().item()
256
+
257
+ uas = correct_arcs / total_arcs * 100 if total_arcs > 0 else 0
258
+ las = correct_rels / total_arcs * 100 if total_arcs > 0 else 0
259
+
260
+ return uas, las
261
+
262
+
263
+ @click.command()
264
+ @click.option('--output', '-o', default='models/bamboo-1-phobert', help='Output directory')
265
+ @click.option('--encoder', default='vinai/phobert-base', help='PhoBERT encoder model')
266
+ @click.option('--dataset', type=click.Choice(['udd1', 'ud-vtb']), default='udd1',
267
+ help='Dataset to use: udd1 (UDD-1) or ud-vtb (UD Vietnamese VTB)')
268
+ @click.option('--epochs', default=100, type=int, help='Number of epochs')
269
+ @click.option('--batch-size', default=32, type=int, help='Batch size')
270
+ @click.option('--bert-lr', default=1e-5, type=float, help='Learning rate for BERT layers')
271
+ @click.option('--head-lr', default=1e-3, type=float, help='Learning rate for parser head')
272
+ @click.option('--warmup-steps', default=1000, type=int, help='Warmup steps')
273
+ @click.option('--weight-decay', default=0.01, type=float, help='Weight decay')
274
+ @click.option('--max-grad-norm', default=5.0, type=float, help='Max gradient norm for clipping')
275
+ @click.option('--arc-hidden', default=500, type=int, help='Arc MLP hidden size')
276
+ @click.option('--rel-hidden', default=100, type=int, help='Relation MLP hidden size')
277
+ @click.option('--dropout', default=0.33, type=float, help='Dropout rate')
278
+ @click.option('--patience', default=10, type=int, help='Early stopping patience')
279
+ @click.option('--use-mst/--no-mst', default=True, help='Use MST decoding')
280
+ @click.option('--force-download', is_flag=True, help='Force re-download dataset')
281
+ @click.option('--gpu-type', default='RTX_A4000', help='GPU type for cost estimation')
282
+ @click.option('--cost-interval', default=300, type=int, help='Cost report interval in seconds')
283
+ @click.option('--wandb', 'use_wandb', is_flag=True, help='Enable W&B logging')
284
+ @click.option('--wandb-project', default='bamboo-1-phobert', help='W&B project name')
285
+ @click.option('--max-time', default=0, type=int, help='Max training time in minutes (0=unlimited)')
286
+ @click.option('--sample', default=0, type=int, help='Sample N sentences from each split (0=all)')
287
+ @click.option('--freeze-bert', default=0, type=int, help='Freeze BERT for first N epochs')
288
+ @click.option('--fp16/--no-fp16', default=True, help='Use mixed precision training (FP16)')
289
+ @click.option('--num-workers', default=4, type=int, help='DataLoader workers')
290
+ @click.option('--grad-accum', default=1, type=int, help='Gradient accumulation steps')
291
+ def train(
292
+ output, encoder, dataset, epochs, batch_size, bert_lr, head_lr, warmup_steps,
293
+ weight_decay, max_grad_norm, arc_hidden, rel_hidden, dropout, patience,
294
+ use_mst, force_download, gpu_type, cost_interval, use_wandb, wandb_project,
295
+ max_time, sample, freeze_bert, fp16, num_workers, grad_accum
296
+ ):
297
+ """Train PhoBERT-based Vietnamese Dependency Parser."""
298
+
299
+ # Detect hardware
300
+ hardware = detect_hardware()
301
+ detected_gpu_type = hardware.get_gpu_type()
302
+
303
+ if gpu_type == "RTX_A4000":
304
+ gpu_type = detected_gpu_type
305
+
306
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
307
+ click.echo(f"Using device: {device}")
308
+ click.echo(f"Hardware: {hardware}")
309
+
310
+ # Mixed precision training
311
+ use_amp = fp16 and torch.cuda.is_available()
312
+ scaler = torch.cuda.amp.GradScaler() if use_amp else None
313
+ if use_amp:
314
+ click.echo(f"Mixed precision (FP16): enabled")
315
+
316
+ # Initialize wandb
317
+ if use_wandb:
318
+ import wandb
319
+ wandb.init(
320
+ project=wandb_project,
321
+ config={
322
+ "encoder": encoder,
323
+ "dataset": dataset,
324
+ "epochs": epochs,
325
+ "batch_size": batch_size,
326
+ "bert_lr": bert_lr,
327
+ "head_lr": head_lr,
328
+ "warmup_steps": warmup_steps,
329
+ "weight_decay": weight_decay,
330
+ "arc_hidden": arc_hidden,
331
+ "rel_hidden": rel_hidden,
332
+ "dropout": dropout,
333
+ "patience": patience,
334
+ "use_mst": use_mst,
335
+ "gpu_type": gpu_type,
336
+ "hardware": hardware.to_dict(),
337
+ }
338
+ )
339
+ click.echo(f"W&B logging enabled: {wandb.run.url}")
340
+
341
+ click.echo("=" * 60)
342
+ click.echo("Bamboo-1: PhoBERT Vietnamese Dependency Parser")
343
+ click.echo("=" * 60)
344
+
345
+ # Load corpus
346
+ click.echo(f"\nLoading {dataset.upper()} corpus...")
347
+ if dataset == 'udd1':
348
+ corpus = UDD1Corpus(force_download=force_download)
349
+ else:
350
+ corpus = UDVietnameseVTB(force_download=force_download)
351
+
352
+ train_sents = read_conllu(corpus.train)
353
+ dev_sents = read_conllu(corpus.dev)
354
+ test_sents = read_conllu(corpus.test)
355
+
356
+ if sample > 0:
357
+ train_sents = train_sents[:sample]
358
+ dev_sents = dev_sents[:min(sample // 2, len(dev_sents))]
359
+ test_sents = test_sents[:min(sample // 2, len(test_sents))]
360
+ click.echo(f" Sampling {sample} sentences...")
361
+
362
+ click.echo(f" Train: {len(train_sents)} sentences")
363
+ click.echo(f" Dev: {len(dev_sents)} sentences")
364
+ click.echo(f" Test: {len(test_sents)} sentences")
365
+
366
+ # Build vocabulary
367
+ click.echo("\nBuilding vocabulary...")
368
+ vocab = Vocabulary()
369
+ vocab.build(train_sents)
370
+ click.echo(f" Relations: {vocab.n_rels}")
371
+
372
+ # Create model
373
+ click.echo(f"\nInitializing model with {encoder}...")
374
+ model = PhoBERTDependencyParser(
375
+ encoder_name=encoder,
376
+ n_rels=vocab.n_rels,
377
+ arc_hidden=arc_hidden,
378
+ rel_hidden=rel_hidden,
379
+ dropout=dropout,
380
+ use_mst=use_mst,
381
+ ).to(device)
382
+
383
+ # Set relation mappings
384
+ model.rel2idx = vocab.rel2idx
385
+ model.idx2rel = vocab.idx2rel
386
+
387
+ n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
388
+ n_bert_params = sum(p.numel() for p in model.encoder.parameters() if p.requires_grad)
389
+ n_head_params = n_params - n_bert_params
390
+ click.echo(f" Total parameters: {n_params:,}")
391
+ click.echo(f" BERT parameters: {n_bert_params:,}")
392
+ click.echo(f" Head parameters: {n_head_params:,}")
393
+
394
+ # Create datasets
395
+ train_dataset = PhoBERTDependencyDataset(train_sents, vocab, model.tokenizer)
396
+ dev_dataset = PhoBERTDependencyDataset(dev_sents, vocab, model.tokenizer)
397
+ test_dataset = PhoBERTDependencyDataset(test_sents, vocab, model.tokenizer)
398
+
399
+ # DataLoader with optimizations
400
+ loader_kwargs = {
401
+ 'collate_fn': collate_fn,
402
+ 'num_workers': num_workers,
403
+ 'pin_memory': torch.cuda.is_available(),
404
+ 'persistent_workers': num_workers > 0,
405
+ }
406
+ train_loader = DataLoader(
407
+ train_dataset, batch_size=batch_size, shuffle=True, **loader_kwargs
408
+ )
409
+ dev_loader = DataLoader(
410
+ dev_dataset, batch_size=batch_size, **loader_kwargs
411
+ )
412
+ test_loader = DataLoader(
413
+ test_dataset, batch_size=batch_size, **loader_kwargs
414
+ )
415
+
416
+ # Effective batch size with gradient accumulation
417
+ effective_batch_size = batch_size * grad_accum
418
+ if grad_accum > 1:
419
+ click.echo(f" Effective batch size: {effective_batch_size} (batch={batch_size} x accum={grad_accum})")
420
+
421
+ # Optimizer with differential learning rates
422
+ no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
423
+ optimizer_grouped_parameters = [
424
+ # BERT parameters with weight decay
425
+ {
426
+ 'params': [p for n, p in model.encoder.named_parameters()
427
+ if not any(nd in n for nd in no_decay)],
428
+ 'lr': bert_lr,
429
+ 'weight_decay': weight_decay,
430
+ },
431
+ # BERT parameters without weight decay
432
+ {
433
+ 'params': [p for n, p in model.encoder.named_parameters()
434
+ if any(nd in n for nd in no_decay)],
435
+ 'lr': bert_lr,
436
+ 'weight_decay': 0.0,
437
+ },
438
+ # Parser head parameters
439
+ {
440
+ 'params': [p for n, p in model.named_parameters()
441
+ if not n.startswith('encoder.')],
442
+ 'lr': head_lr,
443
+ 'weight_decay': weight_decay,
444
+ },
445
+ ]
446
+ optimizer = AdamW(optimizer_grouped_parameters)
447
+
448
+ # Learning rate scheduler with warmup
449
+ total_steps = len(train_loader) * epochs
450
+ scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)
451
+
452
+ # Training
453
+ click.echo(f"\nTraining for {epochs} epochs...")
454
+ if freeze_bert > 0:
455
+ click.echo(f" Freezing BERT for first {freeze_bert} epochs")
456
+ if max_time > 0:
457
+ click.echo(f" Time limit: {max_time} minutes")
458
+
459
+ output_path = Path(output)
460
+ output_path.mkdir(parents=True, exist_ok=True)
461
+
462
+ # Cost tracking
463
+ cost_tracker = CostTracker(gpu_type=gpu_type)
464
+ cost_tracker.report_interval = cost_interval
465
+ cost_tracker.start()
466
+ click.echo(f"Cost tracking: {gpu_type} @ ${cost_tracker.hourly_rate}/hr")
467
+
468
+ best_las = -1
469
+ no_improve = 0
470
+ time_limit_seconds = max_time * 60 if max_time > 0 else float('inf')
471
+
472
+ for epoch in range(1, epochs + 1):
473
+ # Check time limit
474
+ if cost_tracker.elapsed_seconds() >= time_limit_seconds:
475
+ click.echo(f"\nTime limit reached ({max_time} minutes)")
476
+ break
477
+
478
+ # Freeze/unfreeze BERT
479
+ if epoch <= freeze_bert:
480
+ for param in model.encoder.parameters():
481
+ param.requires_grad = False
482
+ elif epoch == freeze_bert + 1:
483
+ click.echo(" Unfreezing BERT parameters...")
484
+ for param in model.encoder.parameters():
485
+ param.requires_grad = True
486
+
487
+ model.train()
488
+ total_loss = 0
489
+ optimizer.zero_grad()
490
+
491
+ pbar = tqdm(train_loader, desc=f"Epoch {epoch:3d}", leave=False)
492
+ for step, batch in enumerate(pbar):
493
+ input_ids = batch['input_ids'].to(device, non_blocking=True)
494
+ attention_mask = batch['attention_mask'].to(device, non_blocking=True)
495
+ word_starts = batch['word_starts'].to(device, non_blocking=True)
496
+ word_mask = batch['word_mask'].to(device, non_blocking=True)
497
+ heads = batch['heads'].to(device, non_blocking=True)
498
+ rels = batch['rels'].to(device, non_blocking=True)
499
+
500
+ # Mixed precision forward pass
501
+ with torch.cuda.amp.autocast(enabled=use_amp):
502
+ arc_scores, rel_scores = model(
503
+ input_ids, attention_mask, word_starts, word_mask
504
+ )
505
+ loss = model.loss(arc_scores, rel_scores, heads, rels, word_mask)
506
+ loss = loss / grad_accum # Scale for gradient accumulation
507
+
508
+ # Backward pass with gradient scaling
509
+ if use_amp:
510
+ scaler.scale(loss).backward()
511
+ else:
512
+ loss.backward()
513
+
514
+ # Optimizer step (every grad_accum steps)
515
+ if (step + 1) % grad_accum == 0 or (step + 1) == len(train_loader):
516
+ if use_amp:
517
+ scaler.unscale_(optimizer)
518
+ nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
519
+ scaler.step(optimizer)
520
+ scaler.update()
521
+ else:
522
+ nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
523
+ optimizer.step()
524
+ scheduler.step()
525
+ optimizer.zero_grad()
526
+
527
+ total_loss += loss.item() * grad_accum
528
+ pbar.set_postfix({'loss': f'{loss.item() * grad_accum:.4f}'})
529
+
530
+ # Evaluate
531
+ dev_uas, dev_las = evaluate(model, dev_loader, device)
532
+
533
+ # Cost update
534
+ progress = epoch / epochs
535
+ current_cost = cost_tracker.current_cost()
536
+ estimated_total_cost = cost_tracker.estimate_total_cost(progress)
537
+ elapsed_minutes = cost_tracker.elapsed_seconds() / 60
538
+
539
+ cost_status = cost_tracker.update(epoch, epochs)
540
+ if cost_status:
541
+ click.echo(f" [{cost_status}]")
542
+
543
+ avg_loss = total_loss / len(train_loader)
544
+ current_lr = scheduler.get_last_lr()[0]
545
+ click.echo(f"Epoch {epoch:3d} | Loss: {avg_loss:.4f} | "
546
+ f"Dev UAS: {dev_uas:.2f}% | Dev LAS: {dev_las:.2f}% | "
547
+ f"LR: {current_lr:.2e}")
548
+
549
+ # Log to wandb
550
+ if use_wandb:
551
+ wandb.log({
552
+ "epoch": epoch,
553
+ "train/loss": avg_loss,
554
+ "dev/uas": dev_uas,
555
+ "dev/las": dev_las,
556
+ "lr": current_lr,
557
+ "cost/current_usd": current_cost,
558
+ "cost/estimated_total_usd": estimated_total_cost,
559
+ "cost/elapsed_minutes": elapsed_minutes,
560
+ })
561
+
562
+ # Save best model
563
+ if dev_las >= best_las:
564
+ best_las = dev_las
565
+ no_improve = 0
566
+ model.save(
567
+ str(output_path),
568
+ vocab={'rel2idx': vocab.rel2idx, 'idx2rel': vocab.idx2rel}
569
+ )
570
+ click.echo(f" -> Saved best model (LAS: {best_las:.2f}%)")
571
+ else:
572
+ no_improve += 1
573
+ if no_improve >= patience:
574
+ click.echo(f"\nEarly stopping after {patience} epochs without improvement")
575
+ break
576
+
577
+ # Final evaluation
578
+ click.echo("\nLoading best model for final evaluation...")
579
+ model = PhoBERTDependencyParser.load(str(output_path), device=str(device))
580
+
581
+ test_uas, test_las = evaluate(model, test_loader, device)
582
+ click.echo(f"\nTest Results:")
583
+ click.echo(f" UAS: {test_uas:.2f}%")
584
+ click.echo(f" LAS: {test_las:.2f}%")
585
+
586
+ click.echo(f"\nModel saved to: {output_path}")
587
+
588
+ # Final cost summary
589
+ final_cost = cost_tracker.current_cost()
590
+ click.echo(f"\n{cost_tracker.summary(epoch, epochs)}")
591
+
592
+ # Log final metrics to wandb
593
+ if use_wandb:
594
+ wandb.log({
595
+ "test/uas": test_uas,
596
+ "test/las": test_las,
597
+ "cost/final_usd": final_cost,
598
+ })
599
+ wandb.finish()
600
+
601
+
602
+ if __name__ == '__main__':
603
+ train()
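Note: train_phobert.py passes word_starts (the subword index of each word's first piece) into the parser; the "Word-level pooling" step named in the architecture then collapses PhoBERT's subword states to one vector per word. That pooling lives in bamboo1/models/transformer_parser.py, which is not shown in this diff; a minimal sketch of first-subword pooling, assuming a (batch, subwords, hidden) encoder output:

import torch

def pool_first_subword(hidden_states: torch.Tensor, word_starts: torch.Tensor) -> torch.Tensor:
    """Gather the hidden state of each word's first subword.

    hidden_states: (batch, subword_len, hidden) encoder output
    word_starts:   (batch, word_len) index of each word's first subword
    returns:       (batch, word_len, hidden) word-level representations
    """
    hidden = hidden_states.size(-1)
    index = word_starts.unsqueeze(-1).expand(-1, -1, hidden)
    return hidden_states.gather(dim=1, index=index)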
uv.lock CHANGED
@@ -22,6 +22,19 @@ resolution-markers = [
22
  "python_full_version < '3.11' and sys_platform != 'linux'",
23
  ]
24
 
25
  [[package]]
26
  name = "aiodns"
27
  version = "4.0.0"
@@ -367,12 +380,18 @@ source = { editable = "." }
367
  dependencies = [
368
  { name = "click" },
369
  { name = "datasets" },
 
 
370
  { name = "torch" },
 
371
  { name = "transformers" },
372
  { name = "underthesea" },
373
  ]
374
 
375
  [package.optional-dependencies]
 
 
 
376
  cloud = [
377
  { name = "runpod" },
378
  ]
@@ -383,16 +402,19 @@ dev = [
383
 
384
  [package.metadata]
385
  requires-dist = [
 
386
  { name = "click", specifier = ">=8.0.0" },
387
  { name = "datasets", specifier = ">=2.14.0" },
 
388
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0.0" },
389
  { name = "runpod", marker = "extra == 'cloud'", specifier = ">=1.6.0" },
390
  { name = "torch", specifier = ">=2.0.0" },
391
- { name = "transformers", specifier = ">=5.0.0" },
 
392
  { name = "underthesea", specifier = ">=9.2.0" },
393
  { name = "wandb", marker = "extra == 'dev'", specifier = ">=0.15.0" },
394
  ]
395
- provides-extras = ["dev", "cloud"]
396
 
397
  [[package]]
398
  name = "bcrypt"
@@ -1384,23 +1406,21 @@ wheels = [
1384
 
1385
  [[package]]
1386
  name = "huggingface-hub"
1387
- version = "1.3.5"
1388
  source = { registry = "https://pypi.org/simple" }
1389
  dependencies = [
1390
  { name = "filelock" },
1391
  { name = "fsspec" },
1392
- { name = "hf-xet", marker = "platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" },
1393
- { name = "httpx" },
1394
  { name = "packaging" },
1395
  { name = "pyyaml" },
1396
- { name = "shellingham" },
1397
  { name = "tqdm" },
1398
- { name = "typer-slim" },
1399
  { name = "typing-extensions" },
1400
  ]
1401
- sdist = { url = "https://files.pythonhosted.org/packages/67/e9/2658cb9bc4c72a67b7f87650e827266139befaf499095883d30dabc4d49f/huggingface_hub-1.3.5.tar.gz", hash = "sha256:8045aca8ddab35d937138f3c386c6d43a275f53437c5c64cdc9aa8408653b4ed", size = 627456, upload-time = "2026-01-29T10:34:19.687Z" }
1402
  wheels = [
1403
- { url = "https://files.pythonhosted.org/packages/f9/84/a579b95c46fe8e319f89dc700c087596f665141575f4dcf136aaa97d856f/huggingface_hub-1.3.5-py3-none-any.whl", hash = "sha256:fe332d7f86a8af874768452295c22cd3f37730fb2463cf6cc3295e26036f8ef9", size = 536675, upload-time = "2026-01-29T10:34:17.713Z" },
1404
  ]
1405
 
1406
  [[package]]
@@ -3763,32 +3783,27 @@ wheels = [
3763
 
3764
  [[package]]
3765
  name = "tokenizers"
3766
- version = "0.22.2"
3767
  source = { registry = "https://pypi.org/simple" }
3768
  dependencies = [
3769
  { name = "huggingface-hub" },
3770
  ]
3771
- sdist = { url = "https://files.pythonhosted.org/packages/73/6f/f80cfef4a312e1fb34baf7d85c72d4411afde10978d4657f8cdd811d3ccc/tokenizers-0.22.2.tar.gz", hash = "sha256:473b83b915e547aa366d1eee11806deaf419e17be16310ac0a14077f1e28f917", size = 372115, upload-time = "2026-01-05T10:45:15.988Z" }
3772
- wheels = [
3773
- { url = "https://files.pythonhosted.org/packages/92/97/5dbfabf04c7e348e655e907ed27913e03db0923abb5dfdd120d7b25630e1/tokenizers-0.22.2-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:544dd704ae7238755d790de45ba8da072e9af3eea688f698b137915ae959281c", size = 3100275, upload-time = "2026-01-05T10:41:02.158Z" },
3774
- { url = "https://files.pythonhosted.org/packages/2e/47/174dca0502ef88b28f1c9e06b73ce33500eedfac7a7692108aec220464e7/tokenizers-0.22.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:1e418a55456beedca4621dbab65a318981467a2b188e982a23e117f115ce5001", size = 2981472, upload-time = "2026-01-05T10:41:00.276Z" },
3775
- { url = "https://files.pythonhosted.org/packages/d6/84/7990e799f1309a8b87af6b948f31edaa12a3ed22d11b352eaf4f4b2e5753/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2249487018adec45d6e3554c71d46eb39fa8ea67156c640f7513eb26f318cec7", size = 3290736, upload-time = "2026-01-05T10:40:32.165Z" },
3776
- { url = "https://files.pythonhosted.org/packages/78/59/09d0d9ba94dcd5f4f1368d4858d24546b4bdc0231c2354aa31d6199f0399/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:25b85325d0815e86e0bac263506dd114578953b7b53d7de09a6485e4a160a7dd", size = 3168835, upload-time = "2026-01-05T10:40:38.847Z" },
3777
- { url = "https://files.pythonhosted.org/packages/47/50/b3ebb4243e7160bda8d34b731e54dd8ab8b133e50775872e7a434e524c28/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bfb88f22a209ff7b40a576d5324bf8286b519d7358663db21d6246fb17eea2d5", size = 3521673, upload-time = "2026-01-05T10:40:56.614Z" },
3778
- { url = "https://files.pythonhosted.org/packages/e0/fa/89f4cb9e08df770b57adb96f8cbb7e22695a4cb6c2bd5f0c4f0ebcf33b66/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1c774b1276f71e1ef716e5486f21e76333464f47bece56bbd554485982a9e03e", size = 3724818, upload-time = "2026-01-05T10:40:44.507Z" },
3779
- { url = "https://files.pythonhosted.org/packages/64/04/ca2363f0bfbe3b3d36e95bf67e56a4c88c8e3362b658e616d1ac185d47f2/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df6c4265b289083bf710dff49bc51ef252f9d5be33a45ee2bed151114a56207b", size = 3379195, upload-time = "2026-01-05T10:40:51.139Z" },
3780
- { url = "https://files.pythonhosted.org/packages/2e/76/932be4b50ef6ccedf9d3c6639b056a967a86258c6d9200643f01269211ca/tokenizers-0.22.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:369cc9fc8cc10cb24143873a0d95438bb8ee257bb80c71989e3ee290e8d72c67", size = 3274982, upload-time = "2026-01-05T10:40:58.331Z" },
3781
- { url = "https://files.pythonhosted.org/packages/1d/28/5f9f5a4cc211b69e89420980e483831bcc29dade307955cc9dc858a40f01/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:29c30b83d8dcd061078b05ae0cb94d3c710555fbb44861139f9f83dcca3dc3e4", size = 9478245, upload-time = "2026-01-05T10:41:04.053Z" },
3782
- { url = "https://files.pythonhosted.org/packages/6c/fb/66e2da4704d6aadebf8cb39f1d6d1957df667ab24cff2326b77cda0dcb85/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:37ae80a28c1d3265bb1f22464c856bd23c02a05bb211e56d0c5301a435be6c1a", size = 9560069, upload-time = "2026-01-05T10:45:10.673Z" },
3783
- { url = "https://files.pythonhosted.org/packages/16/04/fed398b05caa87ce9b1a1bb5166645e38196081b225059a6edaff6440fac/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:791135ee325f2336f498590eb2f11dc5c295232f288e75c99a36c5dbce63088a", size = 9899263, upload-time = "2026-01-05T10:45:12.559Z" },
3784
- { url = "https://files.pythonhosted.org/packages/05/a1/d62dfe7376beaaf1394917e0f8e93ee5f67fea8fcf4107501db35996586b/tokenizers-0.22.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38337540fbbddff8e999d59970f3c6f35a82de10053206a7562f1ea02d046fa5", size = 10033429, upload-time = "2026-01-05T10:45:14.333Z" },
3785
- { url = "https://files.pythonhosted.org/packages/fd/18/a545c4ea42af3df6effd7d13d250ba77a0a86fb20393143bbb9a92e434d4/tokenizers-0.22.2-cp39-abi3-win32.whl", hash = "sha256:a6bf3f88c554a2b653af81f3204491c818ae2ac6fbc09e76ef4773351292bc92", size = 2502363, upload-time = "2026-01-05T10:45:20.593Z" },
3786
- { url = "https://files.pythonhosted.org/packages/65/71/0670843133a43d43070abeb1949abfdef12a86d490bea9cd9e18e37c5ff7/tokenizers-0.22.2-cp39-abi3-win_amd64.whl", hash = "sha256:c9ea31edff2968b44a88f97d784c2f16dc0729b8b143ed004699ebca91f05c48", size = 2747786, upload-time = "2026-01-05T10:45:18.411Z" },
3787
- { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" },
3788
- { url = "https://files.pythonhosted.org/packages/84/04/655b79dbcc9b3ac5f1479f18e931a344af67e5b7d3b251d2dcdcd7558592/tokenizers-0.22.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:753d47ebd4542742ef9261d9da92cd545b2cacbb48349a1225466745bb866ec4", size = 3282301, upload-time = "2026-01-05T10:40:34.858Z" },
3789
- { url = "https://files.pythonhosted.org/packages/46/cd/e4851401f3d8f6f45d8480262ab6a5c8cb9c4302a790a35aa14eeed6d2fd/tokenizers-0.22.2-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e10bf9113d209be7cd046d40fbabbaf3278ff6d18eb4da4c500443185dc1896c", size = 3161308, upload-time = "2026-01-05T10:40:40.737Z" },
3790
- { url = "https://files.pythonhosted.org/packages/6f/6e/55553992a89982cd12d4a66dddb5e02126c58677ea3931efcbe601d419db/tokenizers-0.22.2-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:64d94e84f6660764e64e7e0b22baa72f6cd942279fdbb21d46abd70d179f0195", size = 3718964, upload-time = "2026-01-05T10:40:46.56Z" },
3791
- { url = "https://files.pythonhosted.org/packages/59/8c/b1c87148aa15e099243ec9f0cf9d0e970cc2234c3257d558c25a2c5304e6/tokenizers-0.22.2-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f01a9c019878532f98927d2bacb79bbb404b43d3437455522a00a30718cdedb5", size = 3373542, upload-time = "2026-01-05T10:40:52.803Z" },
3792
  ]
3793
 
3794
  [[package]]
@@ -3942,7 +3957,7 @@ wheels = [
3942
 
3943
  [[package]]
3944
  name = "transformers"
3945
- version = "5.0.0"
3946
  source = { registry = "https://pypi.org/simple" }
3947
  dependencies = [
3948
  { name = "filelock" },
@@ -3952,14 +3967,14 @@ dependencies = [
3952
  { name = "packaging" },
3953
  { name = "pyyaml" },
3954
  { name = "regex" },
 
3955
  { name = "safetensors" },
3956
  { name = "tokenizers" },
3957
  { name = "tqdm" },
3958
- { name = "typer-slim" },
3959
  ]
3960
- sdist = { url = "https://files.pythonhosted.org/packages/bc/79/845941711811789c85fb7e2599cea425a14a07eda40f50896b9d3fda7492/transformers-5.0.0.tar.gz", hash = "sha256:5f5634efed6cf76ad068cc5834c7adbc32db78bbd6211fb70df2325a9c37dec8", size = 8424830, upload-time = "2026-01-26T10:46:46.813Z" }
3961
  wheels = [
3962
- { url = "https://files.pythonhosted.org/packages/52/f3/ac976fa8e305c9e49772527e09fbdc27cc6831b8a2f6b6063406626be5dd/transformers-5.0.0-py3-none-any.whl", hash = "sha256:587086f249ce64c817213cf36afdb318d087f790723e9b3d4500b97832afd52d", size = 10142091, upload-time = "2026-01-26T10:46:43.88Z" },
3963
  ]
3964
 
3965
  [[package]]
@@ -3991,19 +4006,6 @@ wheels = [
3991
  { url = "https://files.pythonhosted.org/packages/a0/1d/d9257dd49ff2ca23ea5f132edf1281a0c4f9de8a762b9ae399b670a59235/typer-0.21.1-py3-none-any.whl", hash = "sha256:7985e89081c636b88d172c2ee0cfe33c253160994d47bdfdc302defd7d1f1d01", size = 47381, upload-time = "2026-01-06T11:21:09.824Z" },
3992
  ]
3993
 
3994
- [[package]]
3995
- name = "typer-slim"
3996
- version = "0.21.1"
3997
- source = { registry = "https://pypi.org/simple" }
3998
- dependencies = [
3999
- { name = "click" },
4000
- { name = "typing-extensions" },
4001
- ]
4002
- sdist = { url = "https://files.pythonhosted.org/packages/17/d4/064570dec6358aa9049d4708e4a10407d74c99258f8b2136bb8702303f1a/typer_slim-0.21.1.tar.gz", hash = "sha256:73495dd08c2d0940d611c5a8c04e91c2a0a98600cbd4ee19192255a233b6dbfd", size = 110478, upload-time = "2026-01-06T11:21:11.176Z" }
4003
- wheels = [
4004
- { url = "https://files.pythonhosted.org/packages/c8/0a/4aca634faf693e33004796b6cee0ae2e1dba375a800c16ab8d3eff4bb800/typer_slim-0.21.1-py3-none-any.whl", hash = "sha256:6e6c31047f171ac93cc5a973c9e617dbc5ab2bddc4d0a3135dc161b4e2020e0d", size = 47444, upload-time = "2026-01-06T11:21:12.441Z" },
4005
- ]
4006
-
4007
  [[package]]
4008
  name = "typing-extensions"
4009
  version = "4.15.0"
 
22
  "python_full_version < '3.11' and sys_platform != 'linux'",
23
  ]
24
 
25
+ [[package]]
26
+ name = "adapters"
27
+ version = "1.2.0"
28
+ source = { registry = "https://pypi.org/simple" }
29
+ dependencies = [
30
+ { name = "packaging" },
31
+ { name = "transformers" },
32
+ ]
33
+ sdist = { url = "https://files.pythonhosted.org/packages/5f/c7/96580e5b7417b0838bd3e41a416939be63a549f22cfe0bcf8cdc62fd2ed8/adapters-1.2.0.tar.gz", hash = "sha256:40db5c5e0789603859980229f7acbae51168abf1999efdb65e5a7778e81a104e", size = 226695, upload-time = "2025-05-20T19:27:07.202Z" }
34
+ wheels = [
35
+ { url = "https://files.pythonhosted.org/packages/dc/e5/91cb0ea212558443b3d62e2b8d8537647549b9c6d34d613847a9fb2fcc58/adapters-1.2.0-py3-none-any.whl", hash = "sha256:fa55ddd9a99577ad0bacb16bebd0a26b6c5db2eae8730b5bfe4403eb917f2e22", size = 302180, upload-time = "2025-05-20T19:27:05.323Z" },
36
+ ]
37
+
38
  [[package]]
39
  name = "aiodns"
40
  version = "4.0.0"
 
380
  dependencies = [
381
  { name = "click" },
382
  { name = "datasets" },
383
+ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
384
+ { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
385
  { name = "torch" },
386
+ { name = "tqdm" },
387
  { name = "transformers" },
388
  { name = "underthesea" },
389
  ]
390
 
391
  [package.optional-dependencies]
392
+ adapters = [
393
+ { name = "adapters" },
394
+ ]
395
  cloud = [
396
  { name = "runpod" },
397
  ]
 
402
 
403
  [package.metadata]
404
  requires-dist = [
405
+ { name = "adapters", marker = "extra == 'adapters'", specifier = ">=0.1.0" },
406
  { name = "click", specifier = ">=8.0.0" },
407
  { name = "datasets", specifier = ">=2.14.0" },
408
+ { name = "numpy", specifier = ">=1.24.0" },
409
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0.0" },
410
  { name = "runpod", marker = "extra == 'cloud'", specifier = ">=1.6.0" },
411
  { name = "torch", specifier = ">=2.0.0" },
412
+ { name = "tqdm", specifier = ">=4.60.0" },
413
+ { name = "transformers", specifier = ">=4.30.0" },
414
  { name = "underthesea", specifier = ">=9.2.0" },
415
  { name = "wandb", marker = "extra == 'dev'", specifier = ">=0.15.0" },
416
  ]
417
+ provides-extras = ["dev", "cloud", "adapters"]
418
 
419
  [[package]]
420
  name = "bcrypt"
 
1406
 
1407
  [[package]]
1408
  name = "huggingface-hub"
1409
+ version = "0.36.0"
1410
  source = { registry = "https://pypi.org/simple" }
1411
  dependencies = [
1412
  { name = "filelock" },
1413
  { name = "fsspec" },
1414
+ { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" },
 
1415
  { name = "packaging" },
1416
  { name = "pyyaml" },
1417
+ { name = "requests" },
1418
  { name = "tqdm" },
 
1419
  { name = "typing-extensions" },
1420
  ]
1421
+ sdist = { url = "https://files.pythonhosted.org/packages/98/63/4910c5fa9128fdadf6a9c5ac138e8b1b6cee4ca44bf7915bbfbce4e355ee/huggingface_hub-0.36.0.tar.gz", hash = "sha256:47b3f0e2539c39bf5cde015d63b72ec49baff67b6931c3d97f3f84532e2b8d25", size = 463358, upload-time = "2025-10-23T12:12:01.413Z" }
1422
  wheels = [
1423
+ { url = "https://files.pythonhosted.org/packages/cb/bd/1a875e0d592d447cbc02805fd3fe0f497714d6a2583f59d14fa9ebad96eb/huggingface_hub-0.36.0-py3-none-any.whl", hash = "sha256:7bcc9ad17d5b3f07b57c78e79d527102d08313caa278a641993acddcb894548d", size = 566094, upload-time = "2025-10-23T12:11:59.557Z" },
1424
  ]
1425
 
1426
  [[package]]
 
3783
 
3784
  [[package]]
3785
  name = "tokenizers"
3786
+ version = "0.21.4"
3787
  source = { registry = "https://pypi.org/simple" }
3788
  dependencies = [
3789
  { name = "huggingface-hub" },
3790
  ]
3791
+ sdist = { url = "https://files.pythonhosted.org/packages/c2/2f/402986d0823f8d7ca139d969af2917fefaa9b947d1fb32f6168c509f2492/tokenizers-0.21.4.tar.gz", hash = "sha256:fa23f85fbc9a02ec5c6978da172cdcbac23498c3ca9f3645c5c68740ac007880", size = 351253, upload-time = "2025-07-28T15:48:54.325Z" }
3792
+ wheels = [
3793
+ { url = "https://files.pythonhosted.org/packages/98/c6/fdb6f72bf6454f52eb4a2510be7fb0f614e541a2554d6210e370d85efff4/tokenizers-0.21.4-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:2ccc10a7c3bcefe0f242867dc914fc1226ee44321eb618cfe3019b5df3400133", size = 2863987, upload-time = "2025-07-28T15:48:44.877Z" },
3794
+ { url = "https://files.pythonhosted.org/packages/8d/a6/28975479e35ddc751dc1ddc97b9b69bf7fcf074db31548aab37f8116674c/tokenizers-0.21.4-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:5e2f601a8e0cd5be5cc7506b20a79112370b9b3e9cb5f13f68ab11acd6ca7d60", size = 2732457, upload-time = "2025-07-28T15:48:43.265Z" },
3795
+ { url = "https://files.pythonhosted.org/packages/aa/8f/24f39d7b5c726b7b0be95dca04f344df278a3fe3a4deb15a975d194cbb32/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39b376f5a1aee67b4d29032ee85511bbd1b99007ec735f7f35c8a2eb104eade5", size = 3012624, upload-time = "2025-07-28T13:22:43.895Z" },
3796
+ { url = "https://files.pythonhosted.org/packages/58/47/26358925717687a58cb74d7a508de96649544fad5778f0cd9827398dc499/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2107ad649e2cda4488d41dfd031469e9da3fcbfd6183e74e4958fa729ffbf9c6", size = 2939681, upload-time = "2025-07-28T13:22:47.499Z" },
3797
+ { url = "https://files.pythonhosted.org/packages/99/6f/cc300fea5db2ab5ddc2c8aea5757a27b89c84469899710c3aeddc1d39801/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c73012da95afafdf235ba80047699df4384fdc481527448a078ffd00e45a7d9", size = 3247445, upload-time = "2025-07-28T15:48:39.711Z" },
3798
+ { url = "https://files.pythonhosted.org/packages/be/bf/98cb4b9c3c4afd8be89cfa6423704337dc20b73eb4180397a6e0d456c334/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f23186c40395fc390d27f519679a58023f368a0aad234af145e0f39ad1212732", size = 3428014, upload-time = "2025-07-28T13:22:49.569Z" },
3799
+ { url = "https://files.pythonhosted.org/packages/75/c7/96c1cc780e6ca7f01a57c13235dd05b7bc1c0f3588512ebe9d1331b5f5ae/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cc88bb34e23a54cc42713d6d98af5f1bf79c07653d24fe984d2d695ba2c922a2", size = 3193197, upload-time = "2025-07-28T13:22:51.471Z" },
3800
+ { url = "https://files.pythonhosted.org/packages/f2/90/273b6c7ec78af547694eddeea9e05de771278bd20476525ab930cecaf7d8/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51b7eabb104f46c1c50b486520555715457ae833d5aee9ff6ae853d1130506ff", size = 3115426, upload-time = "2025-07-28T15:48:41.439Z" },
3801
+ { url = "https://files.pythonhosted.org/packages/91/43/c640d5a07e95f1cf9d2c92501f20a25f179ac53a4f71e1489a3dcfcc67ee/tokenizers-0.21.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:714b05b2e1af1288bd1bc56ce496c4cebb64a20d158ee802887757791191e6e2", size = 9089127, upload-time = "2025-07-28T15:48:46.472Z" },
3802
+ { url = "https://files.pythonhosted.org/packages/44/a1/dd23edd6271d4dca788e5200a807b49ec3e6987815cd9d0a07ad9c96c7c2/tokenizers-0.21.4-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:1340ff877ceedfa937544b7d79f5b7becf33a4cfb58f89b3b49927004ef66f78", size = 9055243, upload-time = "2025-07-28T15:48:48.539Z" },
3803
+ { url = "https://files.pythonhosted.org/packages/21/2b/b410d6e9021c4b7ddb57248304dc817c4d4970b73b6ee343674914701197/tokenizers-0.21.4-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:3c1f4317576e465ac9ef0d165b247825a2a4078bcd01cba6b54b867bdf9fdd8b", size = 9298237, upload-time = "2025-07-28T15:48:50.443Z" },
3804
+ { url = "https://files.pythonhosted.org/packages/b7/0a/42348c995c67e2e6e5c89ffb9cfd68507cbaeb84ff39c49ee6e0a6dd0fd2/tokenizers-0.21.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:c212aa4e45ec0bb5274b16b6f31dd3f1c41944025c2358faaa5782c754e84c24", size = 9461980, upload-time = "2025-07-28T15:48:52.325Z" },
3805
+ { url = "https://files.pythonhosted.org/packages/3d/d3/dacccd834404cd71b5c334882f3ba40331ad2120e69ded32cf5fda9a7436/tokenizers-0.21.4-cp39-abi3-win32.whl", hash = "sha256:6c42a930bc5f4c47f4ea775c91de47d27910881902b0f20e4990ebe045a415d0", size = 2329871, upload-time = "2025-07-28T15:48:56.841Z" },
3806
+ { url = "https://files.pythonhosted.org/packages/41/f2/fd673d979185f5dcbac4be7d09461cbb99751554ffb6718d0013af8604cb/tokenizers-0.21.4-cp39-abi3-win_amd64.whl", hash = "sha256:475d807a5c3eb72c59ad9b5fcdb254f6e17f53dfcbb9903233b0dfa9c943b597", size = 2507568, upload-time = "2025-07-28T15:48:55.456Z" },
 
 
 
 
 
3807
  ]
3808
 
3809
  [[package]]
 
3957
 
3958
  [[package]]
3959
  name = "transformers"
3960
+ version = "4.51.3"
3961
  source = { registry = "https://pypi.org/simple" }
3962
  dependencies = [
3963
  { name = "filelock" },
 
3967
  { name = "packaging" },
3968
  { name = "pyyaml" },
3969
  { name = "regex" },
3970
+ { name = "requests" },
3971
  { name = "safetensors" },
3972
  { name = "tokenizers" },
3973
  { name = "tqdm" },
 
3974
  ]
3975
+ sdist = { url = "https://files.pythonhosted.org/packages/f1/11/7414d5bc07690002ce4d7553602107bf969af85144bbd02830f9fb471236/transformers-4.51.3.tar.gz", hash = "sha256:e292fcab3990c6defe6328f0f7d2004283ca81a7a07b2de9a46d67fd81ea1409", size = 8941266, upload-time = "2025-04-14T08:15:00.485Z" }
3976
  wheels = [
3977
+ { url = "https://files.pythonhosted.org/packages/a9/b6/5257d04ae327b44db31f15cce39e6020cc986333c715660b1315a9724d82/transformers-4.51.3-py3-none-any.whl", hash = "sha256:fd3279633ceb2b777013234bbf0b4f5c2d23c4626b05497691f00cfda55e8a83", size = 10383940, upload-time = "2025-04-14T08:13:43.023Z" },
3978
  ]
3979
 
3980
  [[package]]
 
4006
  { url = "https://files.pythonhosted.org/packages/a0/1d/d9257dd49ff2ca23ea5f132edf1281a0c4f9de8a762b9ae399b670a59235/typer-0.21.1-py3-none-any.whl", hash = "sha256:7985e89081c636b88d172c2ee0cfe33c253160994d47bdfdc302defd7d1f1d01", size = 47381, upload-time = "2026-01-06T11:21:09.824Z" },
4007
  ]
4008
 
4009
  [[package]]
4010
  name = "typing-extensions"
4011
  version = "4.15.0"