File size: 27,983 Bytes

9b82aa2

#!/usr/bin/env python3
"""
WURCS-BPE Tokenizer

A hybrid tokenizer that learns semantic subwords from WURCS while preserving
the ability to handle rare/novel glycan structures character-by-character.

Key features:
1. Pre-tokenization: Split WURCS into semantic units (residues, linkages, mods)
2. BPE: Learn subword merges from corpus
3. Character fallback: Handle novel structures
4. Tree embeddings: Preserve branch_depth and linkage_type per token

Usage:
    # Train BPE on corpus
    tokenizer = WURCSBPETokenizer.train_from_corpus(
        wurcs_strings, 
        num_merges=500,
        output_path="bpe_vocabulary.json"
    )
    
    # Tokenize
    result = tokenizer.tokenize(wurcs_string)
"""

import json
import pickle
import re
from collections import Counter, defaultdict, deque
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple


class WURCSPreTokenizer:
    """
    Pre-tokenize WURCS into semantic units before BPE.

    WURCS format: WURCS=2.0/count/[residues]/indices/linkages

    We split into:
    - Residues: every entry of the index section yields one 'residue' unit
      (the text before the first '_' of its definition) followed by one 'mod'
      unit per remaining '_'-separated part,
      e.g. [a2122h-1b_1-5_2*NCC/3=O] -> a2122h-1b, 1-5, 2*NCC/3=O
    - Linkages: a4-b1 -> one unit per '_'-separated linkage
    - Special markers: [START], [END], [BRANCH_OPEN], [RESIDUE_ERROR]

    Residue instances are identified by their 0-based position in the index
    section; position 0 corresponds to linkage letter 'a', 1 to 'b', and so on.
    LINKAGE_PATTERN only recognises single lowercase letters, so linkages that
    reference other residue labels are still emitted as units but do not
    contribute to depths, branch points or distances.
    """

    # One bracketed residue definition; the capture excludes the brackets.
    RESIDUE_PATTERN = re.compile(r'\[([^\]]+)\]')
    # <from residue><from position>-<to residue><to position>, '?' = unknown position.
    LINKAGE_PATTERN = re.compile(r'([a-z])(\d+|\?)-([a-z])(\d+|\?)')

    # (from position, to position) -> linkage type ID; both directions share an ID.
    LINKAGE_TYPES = {
        (1, 2): 0, (2, 1): 0,
        (1, 3): 1, (3, 1): 1,
        (1, 4): 2, (4, 1): 2,
        (1, 6): 3, (6, 1): 3,
        (2, 3): 4, (3, 2): 4,
        (2, 6): 5, (6, 2): 5,
        (3, 6): 6, (6, 3): 6,
    }
    # Numeric positions that are not listed in LINKAGE_TYPES.
    OTHER_LINKAGE_TYPE = 7
    # Unparseable linkage or non-numeric ('?') position.
    UNKNOWN_LINKAGE_TYPE = 8

    def __init__(self):
        self.special_tokens = {
            '[PAD]': 0,
            '[UNK]': 1,
            '[START]': 2,
            '[END]': 3,
            '[MASK]': 4,
            '[BRANCH_OPEN]': 5,
            '[BRANCH_CLOSE]': 6,
            '[LINK]': 7,
            '[MOD]': 8,
            '[RESIDUE_ERROR]': 9,
        }
        # (linkage section, number of residue instances) -> distance matrix.
        # NOTE: unbounded; entries live as long as this pre-tokenizer does.
        self._dist_cache: Dict[Tuple[str, int], List[List[int]]] = {}

    @staticmethod
    def _special_unit(text: str, branch_depth: int = 0) -> Dict:
        """Build the unit dict for a special marker token."""
        return {
            'text': text,
            'type': 'special',
            'residue_id': -1,
            'branch_depth': branch_depth,
            'linkage_type': 0,
        }

    @staticmethod
    def _lookup_residue(residue_list: List[str], res_idx: str) -> str:
        """Resolve a 1-based index string to its residue definition ("" if invalid)."""
        try:
            position = int(res_idx) - 1  # 1-indexed to 0-indexed
        except ValueError:
            return ""
        # Reject 0/negative indices explicitly so they cannot wrap around to
        # the end of the list.
        if 0 <= position < len(residue_list):
            return residue_list[position]
        return ""

    def pre_tokenize(self, wurcs: str) -> List[Dict]:
        """
        Pre-tokenize WURCS into semantic units.

        Args:
            wurcs: A WURCS string (expected to start with 'WURCS=').

        Returns:
            List of dicts, always starting with [START] and ending with [END],
            each with:
                - text: The unit text
                - type: 'special', 'residue', 'linkage' or 'mod'
                - residue_id: 0-based residue instance (-1 for special, -2 for linkage)
                - branch_depth: Depth of the residue below the root (0 if unknown)
                - linkage_type: Linkage type ID (0 for non-linkage units)
            When the string parses, units[0] additionally carries
            'distance_matrix': an N x N list of bond counts between residue
            instances (-1 if unreachable). The matrix is shared with an
            internal cache, so callers must not mutate it.
            Strings without the 'WURCS=' prefix or with fewer than four
            sections yield [START], [RESIDUE_ERROR], [END].
        """
        units = [self._special_unit('[START]')]

        if not wurcs.startswith('WURCS='):
            units.append(self._special_unit('[RESIDUE_ERROR]'))
            units.append(self._special_unit('[END]'))
            return units

        try:
            parts = self._split_wurcs_sections(wurcs)
            if len(parts) < 4:
                # Same error shape as the missing-prefix case above.
                units.append(self._special_unit('[RESIDUE_ERROR]'))
                units.append(self._special_unit('[END]'))
                return units

            # parts: 2.0 / 3,3,2 / [a2122h-1b_1-5][a2122h-1a_1-5][a1122h-1b_1-5] / 1-2-3-1 / a4-b1_b3-c1_c4-d1
            residue_defs = parts[2]  # [res1][res2]...
            indices = parts[3]       # 1-2-3-1
            # Everything after the index section is the (optional) linkage
            # section; re-join so a '/' inside a linkage does not truncate it.
            linkages = '/'.join(parts[4:])  # a4-b1_b3-c1

            residue_list = self.RESIDUE_PATTERN.findall(residue_defs)
            index_list = indices.split('-') if indices else []
            linkage_list = linkages.split('_') if linkages else []

            # residue_id counts residue *instances* (index entries), not unique
            # definitions, so the graph and matrix must be sized the same way.
            num_nodes = len(index_list)
            branch_points, residue_depths, _, adj = self._analyze_tree_structure(
                linkage_list, num_residues=num_nodes
            )

            # The all-pairs BFS is the most expensive step; cache it per structure.
            cache_key = (linkages, num_nodes)
            dist_matrix_raw = self._dist_cache.get(cache_key)
            if dist_matrix_raw is None:
                dist_matrix_raw = self._compute_distance_matrix(adj, num_nodes)
                self._dist_cache[cache_key] = dist_matrix_raw

            # Process each residue instance
            for idx, res_idx in enumerate(index_list):
                res_letter = chr(ord('a') + idx)
                depth = residue_depths.get(res_letter, 0)

                # One [BRANCH_OPEN] per extra child, emitted before the residue.
                for _ in range(branch_points.get(res_letter, 0)):
                    units.append(self._special_unit('[BRANCH_OPEN]', branch_depth=depth))

                # Split residue into base and modifications
                res_parts = self._lookup_residue(residue_list, res_idx).split('_')
                units.append({
                    'text': res_parts[0],
                    'type': 'residue',
                    'residue_id': idx,
                    'branch_depth': depth,
                    'linkage_type': 0,
                })
                for mod in res_parts[1:]:
                    units.append({
                        'text': mod,
                        'type': 'mod',
                        'residue_id': idx,
                        'branch_depth': depth,
                        'linkage_type': 0,
                    })

            # Attach the matrix to [START] so the tokenizer can pick it up.
            units[0]['distance_matrix'] = dist_matrix_raw

            # Add linkages
            for link in linkage_list:
                if not link:
                    continue
                units.append({
                    'text': link,
                    'type': 'linkage',
                    'residue_id': -2,
                    'branch_depth': 0,
                    'linkage_type': self._parse_linkage_type(link),
                })
        except Exception:
            # Deliberate best-effort fallback for truly broken WURCS: keep the
            # units parsed so far and still terminate the sequence with [END].
            pass

        units.append(self._special_unit('[END]'))
        return units

    def _split_wurcs_sections(self, wurcs: str) -> List[str]:
        """
        Split a WURCS string on '/' characters that are outside brackets.

        The 'WURCS=' prefix is removed first. Empty sections between two
        slashes are kept; an empty trailing section is dropped.
        """
        body = wurcs[6:] if wurcs.startswith('WURCS=') else wurcs

        sections = []
        bracket_depth = 0
        section_start = 0
        for pos, char in enumerate(body):
            if char == '[':
                bracket_depth += 1
            elif char == ']':
                bracket_depth -= 1
            elif char == '/' and bracket_depth == 0:
                sections.append(body[section_start:pos])
                section_start = pos + 1

        tail = body[section_start:]
        if tail:
            sections.append(tail)
        return sections

    def _analyze_tree_structure(self, linkages: List[str], num_residues: int) -> Tuple[Dict, Dict, Dict, Dict]:
        """
        Analyze linkages to compute branch points and residue depths.

        Args:
            linkages: Linkage strings such as 'a4-b1'; entries that do not
                match LINKAGE_PATTERN are ignored.
            num_residues: Number of residue instances; edges touching an index
                outside [0, num_residues) are left out of the adjacency list.

        Returns:
            (branch_points, depths, linkage_types, adj) where
                - branch_points: residue letter -> number of extra children
                  (only residues with more than one child are present)
                - depths: residue letter -> distance from the root, following
                  linkages from their first residue to their second
                - linkage_types: linkage string -> linkage type ID
                - adj: 0-based residue index -> neighbour indices (undirected)
        """
        branch_points = defaultdict(int)
        children = defaultdict(list)
        adj = defaultdict(list)
        all_residues = set()
        linkage_types = {}

        for link in linkages:
            match = self.LINKAGE_PATTERN.match(link)
            if not match:
                continue
            from_res, _, to_res, _ = match.groups()
            children[from_res].append(to_res)
            all_residues.update((from_res, to_res))
            linkage_types[link] = self._parse_linkage_type(link)

            u = ord(from_res) - ord('a')
            v = ord(to_res) - ord('a')
            if 0 <= u < num_residues and 0 <= v < num_residues:
                adj[u].append(v)
                adj[v].append(u)

        # Branch points: residues with more than one child.
        for res, kids in children.items():
            if len(kids) > 1:
                branch_points[res] = len(kids) - 1  # Number of extra branches

        # Root = alphabetically first residue that is nobody's child ('a' if none).
        child_set = {kid for kids in children.values() for kid in kids}
        roots = all_residues - child_set
        root = min(roots) if roots else 'a'

        # Breadth-first walk from the root along parent -> child links.
        depths = {root: 0}
        queue = deque([root])
        while queue:
            current = queue.popleft()
            for child in children.get(current, ()):
                if child not in depths:
                    depths[child] = depths[current] + 1
                    queue.append(child)

        return branch_points, depths, linkage_types, adj

    def _compute_distance_matrix(self, adj: Dict[int, List[int]], num_residues: int) -> List[List[int]]:
        """
        Compute shortest path distance (number of bonds) between all residue pairs using BFS.

        Args:
            adj: 0-based residue index -> neighbour indices. A plain dict is
                accepted; missing keys mean "no neighbours", and neighbours
                outside [0, num_residues) are ignored.
            num_residues: Size N of the matrix.

        Returns:
            N x N matrix with 0 on the diagonal and -1 for unreachable pairs
            ([] when num_residues is 0).
        """
        if num_residues == 0:
            return []

        dist_matrix = [[-1] * num_residues for _ in range(num_residues)]

        for source in range(num_residues):
            row = dist_matrix[source]
            row[source] = 0
            queue = deque([source])
            while queue:
                current = queue.popleft()
                for neighbor in adj.get(current, ()):
                    # row doubles as the visited set: -1 means "not reached yet".
                    if 0 <= neighbor < num_residues and row[neighbor] == -1:
                        row[neighbor] = row[current] + 1
                        queue.append(neighbor)

        return dist_matrix

    def _compute_distance_matrix_OLD(self, linkages: List[str], num_residues: int) -> List[List[int]]:
        """
        Compute shortest path distance (number of bonds) between all residue pairs.

        Legacy entry point that takes raw linkage strings instead of an
        adjacency list; it builds the undirected graph and delegates to
        _compute_distance_matrix.
        Returns a symmetric N x N matrix where N is num_residues.
        Values are integers (number of steps). 0 on diagonal. -1 if unreachable.
        """
        if num_residues == 0:
            return []

        adj = defaultdict(list)
        for link in linkages:
            match = self.LINKAGE_PATTERN.match(link)
            if not match:
                continue
            from_res_char, _, to_res_char, _ = match.groups()
            # Linkage letters map to 0-based indices (a=0, b=1, ...).
            u = ord(from_res_char) - ord('a')
            v = ord(to_res_char) - ord('a')
            if 0 <= u < num_residues and 0 <= v < num_residues:
                adj[u].append(v)
                adj[v].append(u)

        return self._compute_distance_matrix(adj, num_residues)

    def _parse_linkage_type(self, link: str) -> int:
        """
        Parse linkage string to get type ID.

        Returns the LINKAGE_TYPES entry for the (from, to) positions, 7 for a
        numeric position pair that is not listed, and 8 when the string does
        not match LINKAGE_PATTERN or a position is '?'.
        """
        match = self.LINKAGE_PATTERN.match(link)
        if not match:
            return self.UNKNOWN_LINKAGE_TYPE

        _, from_pos, _, to_pos = match.groups()
        try:
            positions = (int(from_pos), int(to_pos))
        except ValueError:
            return self.UNKNOWN_LINKAGE_TYPE
        return self.LINKAGE_TYPES.get(positions, self.OTHER_LINKAGE_TYPE)


class WURCSBPETokenizer:
    """
    BPE tokenizer for WURCS with tree-aware embeddings.

    Each WURCS string is first split into semantic units by
    WURCSPreTokenizer; BPE merges are then applied inside each non-special
    unit, and every resulting token inherits the residue_id, branch_depth and
    linkage_type of the unit it came from.
    """

    def __init__(self, vocab_path: Optional[str] = None):
        """
        Args:
            vocab_path: Optional JSON vocabulary written by save_vocab. When
                omitted, the vocabulary holds only the special tokens.
        """
        self.pre_tokenizer = WURCSPreTokenizer()

        # Special tokens (fixed)
        self.special_tokens = self.pre_tokenizer.special_tokens

        # BPE vocabulary
        self.token_to_id: Dict[str, int] = {}
        self.id_to_token: Dict[int, str] = {}
        self.merges: List[Tuple[str, str]] = []

        if vocab_path:
            self.load_vocab(vocab_path)
        else:
            # Initialize with special tokens only
            self.token_to_id = dict(self.special_tokens)
            self.id_to_token = {v: k for k, v in self.token_to_id.items()}

    @staticmethod
    def _merge_pair(symbols, pair: Tuple[str, str], merged: str) -> List[str]:
        """Replace each left-to-right, non-overlapping occurrence of pair with merged."""
        first, second = pair
        result = []
        i = 0
        last = len(symbols) - 1
        while i <= last:
            if i < last and symbols[i] == first and symbols[i + 1] == second:
                result.append(merged)
                i += 2
            else:
                result.append(symbols[i])
                i += 1
        return result

    @staticmethod
    def _select_merge_pair(pair_counts: Counter, min_frequency: int, max_token_length: Optional[int]):
        """
        Pick the next pair to merge.

        Returns (pair, count, length_limited). pair is None when nothing
        qualifies. length_limited is True when the most frequent pair was
        rejected for exceeding max_token_length, so the caller can tell
        "ran out because of the length cap" from "ran out of frequent pairs".
        """
        if not pair_counts:
            return None, 0, False

        top_pair, top_count = pair_counts.most_common(1)[0]
        if top_count < min_frequency:
            return None, 0, False
        if not max_token_length or len(top_pair[0]) + len(top_pair[1]) <= max_token_length:
            return top_pair, top_count, False

        # The top pair is too long: walk the rest in descending frequency.
        for pair, count in pair_counts.most_common():
            if count < min_frequency:
                break  # Everything after this is rarer still
            if len(pair[0]) + len(pair[1]) <= max_token_length:
                return pair, count, True
        return None, 0, True

    @classmethod
    def train_from_corpus(
        cls, 
        wurcs_strings: List[str], 
        num_merges: int = 500,
        output_path: Optional[str] = None,
        min_frequency: int = 2,
        max_token_length: Optional[int] = None,
    ) -> 'WURCSBPETokenizer':
        """
        Train BPE on a corpus of WURCS strings.

        Args:
            wurcs_strings: List of WURCS strings
            num_merges: Maximum number of BPE merge operations
            output_path: Optional path to save vocabulary
            min_frequency: Units seen fewer times than this are not used for
                merging, and pairs rarer than this are never merged
            max_token_length: Maximum length of a merged token (None = no limit)

        Returns:
            Trained tokenizer
        """
        tokenizer = cls()
        # Separate pre-tokenizer so its distance cache is discarded after training.
        pre_tok = WURCSPreTokenizer()

        print(f"Training BPE on {len(wurcs_strings)} WURCS strings...")

        # Steps 1-2: pre-tokenize and count the non-special unit texts.
        unit_counts = Counter(
            unit['text']
            for wurcs in wurcs_strings
            for unit in pre_tok.pre_tokenize(wurcs)
            if unit['type'] != 'special'
        )
        print(f"Found {len(unit_counts)} unique units")

        # Step 3: base vocabulary = special tokens + every character seen,
        # including characters of units below min_frequency.
        char_vocab = {char for unit in unit_counts for char in unit}
        vocab_id = len(tokenizer.special_tokens)
        for char in sorted(char_vocab):
            tokenizer.token_to_id[char] = vocab_id
            tokenizer.id_to_token[vocab_id] = char
            vocab_id += 1

        print(f"Initial vocab size: {vocab_id} (special + characters)")

        # Step 4: frequent units become tuples of single-character symbols.
        word_freqs = {
            tuple(unit): count
            for unit, count in unit_counts.items()
            if count >= min_frequency
        }

        # Step 5: BPE merging
        merges = []
        for merge_idx in range(num_merges):
            pair_counts = Counter()
            for word, freq in word_freqs.items():
                for pair in zip(word, word[1:]):
                    pair_counts[pair] += freq

            best_pair, best_count, length_limited = cls._select_merge_pair(
                pair_counts, min_frequency, max_token_length
            )
            if best_pair is None:
                if length_limited:
                    print(f"  Stopping early: No more pairs satisfy max_token_length={max_token_length}")
                break

            new_token = best_pair[0] + best_pair[1]
            merges.append(best_pair)

            # Add to vocab
            tokenizer.token_to_id[new_token] = vocab_id
            tokenizer.id_to_token[vocab_id] = new_token
            vocab_id += 1

            # Rewrite every word with the new merge applied.
            word_freqs = {
                tuple(cls._merge_pair(word, best_pair, new_token)): freq
                for word, freq in word_freqs.items()
            }

            if (merge_idx + 1) % 100 == 0:
                print(f"  Merge {merge_idx + 1}/{num_merges}: '{best_pair[0]}' + '{best_pair[1]}' -> '{new_token}' (count={best_count})")

        tokenizer.merges = merges
        print(f"Final vocab size: {len(tokenizer.token_to_id)}")

        # Save if requested
        if output_path:
            tokenizer.save_vocab(output_path)

        return tokenizer

    def apply_bpe(self, text: str) -> List[str]:
        """
        Apply BPE merges to a text string.

        A text that is already a vocabulary entry is returned as one token.
        Otherwise the text is split into characters and the learned merges are
        replayed in training order. Characters that were never seen in
        training stay as single-character tokens.
        """
        if text in self.token_to_id:
            return [text]

        tokens = list(text)
        for pair in self.merges:
            if len(tokens) < 2:
                break  # Nothing left that could be merged
            tokens = self._merge_pair(tokens, pair, pair[0] + pair[1])
        return tokens

    def tokenize(self, wurcs: str, max_length: int = 256) -> Dict:
        """
        Tokenize a WURCS string.

        Args:
            wurcs: The WURCS string.
            max_length: Output length of the per-token lists. Longer
                sequences are cut and closed with [END]; shorter ones are
                padded with [PAD].

        Returns:
            Dict with:
                - tokens: List of token strings (padded to max_length)
                - token_ids: List of token IDs ([UNK] for out-of-vocab tokens)
                - residue_ids: List of residue IDs (-1 special/pad, -2 linkage)
                - branch_depths: List of branch depths
                - linkage_types: List of linkage types
                - attention_mask: 1 for real tokens, 0 for padding
                - distance_matrix: length x length matrix of residue distances
                  between tokens (-1 where either token has no residue);
                  deliberately NOT padded to max_length to save memory
                - length: Number of real (unpadded) tokens
        """
        # Pre-tokenize once; the units carry both the text and the distances.
        units = self.pre_tokenizer.pre_tokenize(wurcs)
        unk_id = self.token_to_id['[UNK]']

        tokens = []
        token_ids = []
        residue_ids = []
        branch_depths = []
        linkage_types = []

        for unit in units:
            # Special tokens stay as-is; everything else goes through BPE.
            if unit['type'] == 'special':
                pieces = [unit['text']]
            else:
                pieces = self.apply_bpe(unit['text'])
            for piece in pieces:
                tokens.append(piece)
                token_ids.append(self.token_to_id.get(piece, unk_id))
                residue_ids.append(unit['residue_id'])
                branch_depths.append(unit['branch_depth'])
                linkage_types.append(unit['linkage_type'])

        # Truncate if needed, keeping a closing [END].
        if len(tokens) > max_length:
            keep = max_length - 1
            tokens = tokens[:keep] + ['[END]']
            token_ids = token_ids[:keep] + [self.token_to_id['[END]']]
            residue_ids = residue_ids[:keep] + [-1]
            branch_depths = branch_depths[:keep] + [0]
            linkage_types = linkage_types[:keep] + [0]

        # Create attention mask and pad
        length = len(tokens)
        attention_mask = [1] * length
        pad_count = max_length - length
        if pad_count > 0:
            tokens.extend(['[PAD]'] * pad_count)
            token_ids.extend([self.token_to_id['[PAD]']] * pad_count)
            residue_ids.extend([-1] * pad_count)
            branch_depths.extend([0] * pad_count)
            linkage_types.extend([0] * pad_count)
            attention_mask.extend([0] * pad_count)

        # Residue-level distances are attached to the first unit ([START]).
        dist_matrix_raw = units[0].get('distance_matrix', [])
        num_residues = len(dist_matrix_raw)

        # Token i inherits the distances of residue_ids[i]. Tokens without a
        # valid residue (special, linkage, out of range) map to -1.
        rows = [r if 0 <= r < num_residues else -1 for r in residue_ids[:length]]
        distance_matrix = [
            [dist_matrix_raw[r_i][r_j] if r_i >= 0 and r_j >= 0 else -1 for r_j in rows]
            for r_i in rows
        ]

        # MEMORY OPTIMIZATION: Do NOT pad matrix here.
        # Pad on-the-fly in Dataset class instead.

        return {
            'tokens': tokens,
            'token_ids': token_ids,
            'residue_ids': residue_ids,
            'branch_depths': branch_depths,
            'linkage_types': linkage_types,
            'attention_mask': attention_mask,
            'distance_matrix': distance_matrix,
            'length': length,
        }

    def save_vocab(self, path: str):
        """Save special tokens, vocabulary, merges and size metadata to a JSON file."""
        data = {
            'special_tokens': self.special_tokens,
            'token_to_id': self.token_to_id,
            'merges': [list(m) for m in self.merges],
            'metadata': {
                'vocab_size': len(self.token_to_id),
                'num_merges': len(self.merges),
            }
        }
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        print(f"Saved vocabulary to {path}")

    def load_vocab(self, path: str):
        """
        Load vocabulary from a JSON file written by save_vocab.

        Replaces special_tokens, token_to_id, id_to_token and merges. JSON has
        no tuples, so merges are converted back from lists.
        """
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        self.special_tokens = data['special_tokens']
        self.token_to_id = data['token_to_id']
        self.id_to_token = {int(v): k for k, v in self.token_to_id.items()}
        self.merges = [tuple(m) for m in data['merges']]

        print(f"Loaded vocabulary with {len(self.token_to_id)} tokens")

    @property
    def vocab_size(self) -> int:
        """Number of entries in the vocabulary (special tokens included)."""
        return len(self.token_to_id)


# ============================================================================
# Testing
# ============================================================================

if __name__ == '__main__':
    # Smoke test: pre-tokenize two sample glycans and show the first units.
    banner = "=" * 80
    preview_limit = 10

    print(banner)
    print("Testing WURCSPreTokenizer")
    print(banner)

    demo_tokenizer = WURCSPreTokenizer()

    samples = (
        "WURCS=2.0/2,2,1/[a2122h-1b_1-5][a2211m-1a_1-5]/1-2/a4-b1",
        "WURCS=2.0/3,3,2/[a2122h-1b_1-5_2*NCC/3=O][a2112h-1a_1-5][a2211m-1a_1-5]/1-2-3/a4-b1_b3-c1",
    )

    for sample in samples:
        # Only the first 60 characters of the input are echoed.
        print(f"\nWURCS: {sample[:60]}...")
        parsed = demo_tokenizer.pre_tokenize(sample)
        total = len(parsed)
        print(f"Units ({total}):")
        for entry in parsed[:preview_limit]:
            print(f"  {entry['type']:10} | res={entry['residue_id']:2} | depth={entry['branch_depth']} | {entry['text']}")
        hidden = total - preview_limit
        if hidden > 0:
            print(f"  ... and {hidden} more")