File size: 11,957 Bytes

"""
Model Saving & Inference Module
===================================
Easy-to-use API for loading and running inference with the embedding model.
"""

import torch
import torch.nn.functional as F
import numpy as np
import json
import os
from pathlib import Path
from typing import List, Dict, Union, Tuple

from .model import MiniTransformerEmbedding
from .tokenizer import SimpleTokenizer


class EmbeddingModelManager:
    """
    Handles saving and loading the embedding model.
    
    Save structure:
    model_dir/
    ├── config.json          # Model architecture config
    ├── model.pt             # Model weights
    ├── tokenizer.json       # Vocabulary
    └── training_info.json   # Training metadata (optional)
    """
    
    @staticmethod
    def save_model(
        model: MiniTransformerEmbedding,
        tokenizer: SimpleTokenizer,
        save_dir: str,
        training_info: dict = None
    ):
        """
        Save model, tokenizer, and config for later use.
        
        Args:
            model: Trained MiniTransformerEmbedding
            tokenizer: SimpleTokenizer with vocabulary
            save_dir: Directory to save model
            training_info: Optional training metadata
        """
        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)
        
        # 1. Save model config
        config = {
            'vocab_size': len(tokenizer.word_to_id),
            'd_model': model.d_model,
            'num_heads': model.layers[0].attention.num_heads,
            'num_layers': len(model.layers),
            'd_ff': model.layers[0].feed_forward.linear1.out_features,
            'max_seq_len': model.positional_encoding.pe.size(1),
            'pad_token_id': model.pad_token_id,
            'size_name': save_dir.name # Use folder name as size name
        }
        
        with open(save_dir / 'config.json', 'w') as f:
            json.dump(config, f, indent=2)
        
        # 2. Save model weights
        torch.save(model.state_dict(), save_dir / 'model.pt')
        
        # 3. Save tokenizer vocabulary
        tokenizer.save(str(save_dir / 'tokenizer.json'))
        
        # 4. Save training info (optional)
        if training_info:
            with open(save_dir / 'training_info.json', 'w') as f:
                json.dump(training_info, f, indent=2)
        
        print(f"Model saved to: {save_dir}")
    
    @staticmethod
    def load_model(model_dir: str, device: str = None) -> Tuple[MiniTransformerEmbedding, SimpleTokenizer]:
        """
        Load model and tokenizer from a local directory or HuggingFace repo.
        
        Args:
            model_dir: Local directory path OR HuggingFace repo ID 
                       (e.g., "surazbhandari/miniembed")
            device: Device to load model on ('cpu', 'cuda', 'mps')
            
        Returns:
            (model, tokenizer) tuple
        """
        # Auto-detect HuggingFace repo ID (contains "/" but is not a local path)
        if '/' in model_dir and not os.path.exists(model_dir):
            model_dir = EmbeddingModelManager._download_from_hub(model_dir)
        
        model_dir = Path(model_dir)
        
        if device is None:
            if torch.cuda.is_available():
                device = 'cuda'
            elif torch.backends.mps.is_available():
                device = 'mps'
            else:
                device = 'cpu'
        
        # 1. Load config
        config_path = model_dir / 'config.json'
                
        if config_path.exists():
            with open(config_path, 'r') as f:
                config = json.load(f)
        else:
            # Fallback defaults matching the MiniEmbed-Mini architecture
            print("Warning: config.json not found. Using default MiniEmbed-Mini configuration.")
            config = {
                "vocab_size": 30000,
                "d_model": 256,
                "num_heads": 4,
                "num_layers": 4,
                "d_ff": 1024,
                "max_seq_len": 128,
                "pad_token_id": 0
            }
        
        # 2. Load tokenizer
        tokenizer_path = model_dir / 'tokenizer.json'

        tokenizer = SimpleTokenizer(vocab_size=config['vocab_size'])
        tokenizer.load(str(tokenizer_path))
        
        # 3. Create and load model
        model = MiniTransformerEmbedding(
            vocab_size=config['vocab_size'],
            d_model=config['d_model'],
            num_heads=config['num_heads'],
            num_layers=config['num_layers'],
            d_ff=config['d_ff'],
            max_seq_len=config['max_seq_len'],
            pad_token_id=config.get('pad_token_id', 0)
        )
        
        # Load weights (prefer safetensors)
        st_path = model_dir / 'model.safetensors'
        pt_path = model_dir / 'model.pt'
        
        if st_path.exists():
            from safetensors.torch import load_file
            state_dict = load_file(str(st_path), device=device)
        elif pt_path.exists():
            state_dict = torch.load(pt_path, map_location=device, weights_only=True)
        else:
            raise FileNotFoundError(f"Neither model.safetensors nor model.pt found in {model_dir}")
            
        model.load_state_dict(state_dict)
        model = model.to(device)
        model.eval()
        
        return model, tokenizer
    
    @staticmethod
    def _download_from_hub(repo_id: str) -> str:
        """
        Download model files from a HuggingFace repository.
        
        Args:
            repo_id: HuggingFace repo ID (e.g., "surazbhandari/miniembed")
            
        Returns:
            Local directory path containing the downloaded files.
        """
        try:
            from huggingface_hub import snapshot_download
        except ImportError:
            raise ImportError(
                "huggingface_hub is required to download models from HuggingFace. "
                "Install it with: pip install huggingface_hub"
            )
        
        # Download the full repo (including src/ for inference code)
        local_dir = snapshot_download(repo_id=repo_id)
        
        return local_dir
    
    @staticmethod
    def list_models(base_dir: str = "models") -> List[str]:
        """
        List available model names in the base directory.
        
        Returns:
            List of directory names containing valid models
        """
        path = Path(base_dir)
        if not path.exists():
            return []
        return sorted([d.name for d in path.iterdir() if d.is_dir() and (d / "model.pt").exists()])

class EmbeddingInference:
    """
    High-level inference API for the embedding model.
    
    Usage:
        # From local directory
        model = EmbeddingInference.from_pretrained("./models/mini")
        
        # From HuggingFace
        model = EmbeddingInference.from_pretrained("surazbhandari/miniembed")
        
        # Encode texts
        embeddings = model.encode(["Hello world", "Machine learning"])
        
        # Compute similarity
        score = model.similarity("query", "document")
        
        # Semantic search
        results = model.search("python programming", documents)
    """
    
    def __init__(
        self, 
        model: MiniTransformerEmbedding, 
        tokenizer: SimpleTokenizer,
        device: str = 'cpu',
        max_length: int = 64
    ):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.max_length = max_length
        self.model.eval()
    
    @classmethod
    def from_pretrained(cls, model_dir: str, device: str = None):
        """
        Load model from a local directory or HuggingFace repo ID.
        
        Args:
            model_dir: Local path (e.g., "models/mini") or 
                       HuggingFace repo ID (e.g., "surazbhandari/miniembed")
            device: Device to load on ('cpu', 'cuda', 'mps'). Auto-detected if None.
        """
        model, tokenizer = EmbeddingModelManager.load_model(model_dir, device)
        if device is None:
            device = next(model.parameters()).device.type
        return cls(model, tokenizer, device)
    
    def encode(
        self, 
        texts: Union[str, List[str]], 
        batch_size: int = 32,
        show_progress: bool = False
    ) -> np.ndarray:
        """
        Encode texts to embeddings.
        
        Args:
            texts: Single text or list of texts
            batch_size: Batch size for encoding
            show_progress: Show progress bar
            
        Returns:
            numpy array of shape (n_texts, d_model)
        """
        if isinstance(texts, str):
            texts = [texts]
        
        all_embeddings = []
        
        # Process in batches
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            
            # Tokenize
            encodings = [
                self.tokenizer.encode(t, self.max_length) 
                for t in batch_texts
            ]
            
            input_ids = torch.stack([e['input_ids'] for e in encodings]).to(self.device)
            attention_mask = torch.stack([e['attention_mask'] for e in encodings]).to(self.device)
            
            # Encode
            with torch.no_grad():
                embeddings = self.model.encode(input_ids, attention_mask)
            
            all_embeddings.append(embeddings.cpu().numpy())
        
        return np.vstack(all_embeddings)
    
    def similarity(self, text1: str, text2: str) -> float:
        """Compute cosine similarity between two texts."""
        emb1 = self.encode(text1)
        emb2 = self.encode(text2)
        return float(np.dot(emb1[0], emb2[0]))
    
    def pairwise_similarity(self, texts1: List[str], texts2: List[str]) -> np.ndarray:
        """
        Compute pairwise similarity between two lists.
        
        Returns:
            Matrix of shape (len(texts1), len(texts2))
        """
        emb1 = self.encode(texts1)
        emb2 = self.encode(texts2)
        return np.dot(emb1, emb2.T)
    
    def search(
        self, 
        query: str, 
        documents: List[str], 
        top_k: int = 5
    ) -> List[Dict]:
        """
        Semantic search: Find most similar documents to query.
        
        Args:
            query: Search query
            documents: List of documents to search
            top_k: Number of results to return
            
        Returns:
            List of dicts with 'text', 'score', 'rank'
        """
        query_emb = self.encode(query)
        doc_embs = self.encode(documents)
        
        # Compute similarities
        scores = np.dot(doc_embs, query_emb.T).flatten()
        
        # Get top-k indices
        top_indices = np.argsort(scores)[::-1][:top_k]
        
        results = []
        for rank, idx in enumerate(top_indices, 1):
            results.append({
                'rank': rank,
                'text': documents[idx],
                'score': float(scores[idx]),
                'index': int(idx)
            })
        
        return results
    
    def cluster_texts(self, texts: List[str], n_clusters: int = 5) -> Dict:
        """
        Cluster texts by embedding similarity.
        
        Returns:
            Dict with 'labels' and 'texts_by_cluster'
        """
        from sklearn.cluster import KMeans
        
        embeddings = self.encode(texts)
        
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        labels = kmeans.fit_predict(embeddings)
        
        return {
            'labels': labels.tolist(),
            'centroids': kmeans.cluster_centers_,
            'texts_by_cluster': {
                i: [texts[j] for j in range(len(texts)) if labels[j] == i]
                for i in range(n_clusters)
            }
        }