# DeepXR/helion-v1-embeddings

"""
Helion-V1-Embeddings training script.

Fine-tunes a lightweight sentence-embedding model for semantic similarity
and retrieval.
"""
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
# Configure root logging at import time: INFO and above, rendered as
# "timestamp - level - message". All log output in this file goes through
# the module-level ``logger`` defined here.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class EmbeddingsTrainer:
    """Fine-tune a sentence-transformers model for Helion-V1-Embeddings.

    ``sentence_transformers`` and ``torch`` are imported lazily inside the
    methods that need them, so constructing the trainer and using the data
    and config helpers does not require them.
    """

    def __init__(
        self,
        base_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        output_path: str = "./helion-embeddings-output",
    ):
        """Store the settings and make sure the output directory exists.

        Args:
            base_model: Model name or path passed to ``SentenceTransformer``.
            output_path: Directory the trained model and config files are
                written to; created (including parents) if missing.
        """
        self.base_model = base_model
        self.output_path = Path(output_path)
        self.output_path.mkdir(parents=True, exist_ok=True)

    def prepare_training_data(self) -> List[Dict]:
        """Return the built-in sentence pairs with similarity scores.

        Returns:
            A list of dicts, each with the keys ``sentence1``, ``sentence2``
            and ``score`` (a float; higher means more similar).
        """
        training_examples = [
            # High similarity pairs
            {
                "sentence1": "How do I reset my password?",
                "sentence2": "What's the password reset process?",
                "score": 0.95,
            },
            {
                "sentence1": "Machine learning training methods",
                "sentence2": "How to train ML models",
                "score": 0.90,
            },
            {
                "sentence1": "Python programming tutorial",
                "sentence2": "Learn Python coding",
                "score": 0.88,
            },
            # Medium similarity pairs
            {
                "sentence1": "Install Python on Windows",
                "sentence2": "Python setup guide",
                "score": 0.70,
            },
            {
                "sentence1": "Best restaurants in Paris",
                "sentence2": "Where to eat in France",
                "score": 0.65,
            },
            # Low similarity pairs
            {
                "sentence1": "How to bake cookies",
                "sentence2": "Machine learning algorithms",
                "score": 0.10,
            },
            {
                "sentence1": "Weather forecast tomorrow",
                "sentence2": "Stock market analysis",
                "score": 0.05,
            },
        ]
        logger.info("Prepared %d training examples", len(training_examples))
        return training_examples

    def create_contrastive_pairs(self) -> List[Tuple[str, str]]:
        """Return the built-in ``(anchor, positive)`` pairs.

        Returns:
            A list of two-string tuples intended for contrastive learning.
        """
        pairs = [
            ("What is machine learning?", "Machine learning explained simply"),
            ("How to learn Python?", "Python learning resources"),
            ("Best coding practices", "Software development best practices"),
            ("Data science tutorial", "Learn data science basics"),
            ("Natural language processing", "NLP fundamentals guide"),
            ("Deep learning introduction", "Getting started with deep learning"),
            ("Web development guide", "How to build websites"),
            ("Database design principles", "SQL database design tutorial"),
            ("Cloud computing basics", "Introduction to cloud services"),
            ("API development guide", "How to create REST APIs"),
        ]
        logger.info("Created %d contrastive pairs", len(pairs))
        return pairs

    def train_model(
        self,
        train_examples: Optional[List[Dict]] = None,
        epochs: int = 3,
        batch_size: int = 16,
        warmup_steps: int = 100,
    ):
        """Fine-tune the base model with a cosine-similarity loss.

        Args:
            train_examples: Dicts with ``sentence1``, ``sentence2`` and
                ``score`` keys. If None, ``prepare_training_data()`` is used.
            epochs: Number of training epochs.
            batch_size: Batch size for the training DataLoader.
            warmup_steps: Warmup steps for the learning rate.

        Returns:
            The fitted model, or None when the examples are malformed, the
            dependencies are missing, or training raises. Failures are
            logged rather than propagated.
        """
        if train_examples is None:
            train_examples = self.prepare_training_data()

        # Validate up front so malformed data is reported with a specific
        # message and before the base model is loaded. Scores are coerced
        # to float so integer labels loaded from JSON are handled too.
        try:
            triples = [
                (
                    example["sentence1"],
                    example["sentence2"],
                    float(example["score"]),
                )
                for example in train_examples
            ]
        except (KeyError, TypeError, ValueError) as e:
            logger.error(
                "Training failed: invalid training example (%s: %s); each "
                "example needs 'sentence1', 'sentence2' and a numeric 'score'",
                type(e).__name__,
                e,
            )
            return None

        # Only the imports are guarded by ImportError, so an unrelated
        # ImportError raised during training is not misreported as a
        # missing installation.
        try:
            from sentence_transformers import (
                InputExample,
                SentenceTransformer,
                losses,
            )
            from torch.utils.data import DataLoader
        except ImportError:
            logger.error("sentence-transformers not installed. Install with: pip install sentence-transformers")
            return None

        try:
            logger.info("Loading base model...")
            model = SentenceTransformer(self.base_model)

            train_data = [
                InputExample(texts=[first, second], label=score)
                for first, second, score in triples
            ]
            train_dataloader = DataLoader(
                train_data,
                shuffle=True,
                batch_size=batch_size,
            )
            train_loss = losses.CosineSimilarityLoss(model)

            logger.info("Starting training...")
            model.fit(
                train_objectives=[(train_dataloader, train_loss)],
                epochs=epochs,
                warmup_steps=warmup_steps,
                output_path=str(self.output_path),
                show_progress_bar=True,
                save_best_model=True,
            )
        except Exception as e:
            # Boundary handler: callers rely on None signalling failure.
            # logger.exception keeps the traceback in the log.
            logger.exception("Training failed: %s", e)
            return None

        logger.info("✅ Training complete! Model saved to %s", self.output_path)
        return model

    def evaluate_model(
        self,
        model,
        test_pairs: Optional[List[Tuple[str, str, float]]] = None,
    ) -> float:
        """Report how far the model's cosine similarities are from targets.

        Args:
            model: Object with an ``encode(sentence)`` method, e.g. the
                model returned by ``train_model``.
            test_pairs: List of ``(sentence1, sentence2, expected_similarity)``.
                If None, three built-in pairs are used.

        Returns:
            The mean absolute difference between computed and expected
            similarity, or 0.0 when ``test_pairs`` is empty (nothing was
            measured; a warning is logged).
        """
        if test_pairs is None:
            # Default test pairs
            test_pairs = [
                ("How to code?", "Coding tutorial", 0.85),
                ("Weather today", "Stock prices", 0.1),
                ("Machine learning", "AI and ML", 0.95),
            ]
        if not test_pairs:
            # An empty list used to reach the division below and raise
            # ZeroDivisionError.
            logger.warning("No test pairs supplied; skipping evaluation")
            return 0.0

        from sentence_transformers import util

        logger.info("Evaluating model...")
        total_error = 0.0
        for sent1, sent2, expected in test_pairs:
            emb1 = model.encode(sent1)
            emb2 = model.encode(sent2)
            # cos_sim returns a 1x1 matrix for two single vectors.
            similarity = float(util.cos_sim(emb1, emb2)[0][0])
            error = abs(similarity - expected)
            total_error += error
            logger.info("'%s' <-> '%s'", sent1, sent2)
            logger.info(
                "  Expected: %.2f, Got: %.2f, Error: %.2f",
                expected,
                similarity,
                error,
            )
        avg_error = total_error / len(test_pairs)
        logger.info("Average error: %.3f", avg_error)
        return avg_error

    def create_config_files(self, overwrite: bool = False):
        """Write default sentence-transformers config files if missing.

        Writes ``config_sentence_transformers.json`` and ``modules.json``
        into ``output_path``.

        Args:
            overwrite: When False (default), a file that already exists is
                left untouched. When True, both files are replaced with the
                hard-coded defaults below.
        """
        # Sentence transformers config
        config = {
            "__version__": {
                "sentence_transformers": "2.2.2",
                "transformers": "4.36.0",
                "pytorch": "2.0.0",
            },
            "prompts": {},
            "default_prompt_name": None,
            "similarity_fn_name": "cosine",
            "max_seq_length": 256,
            "do_lower_case": False,
        }
        # Modules configuration
        modules = [
            {
                "idx": 0,
                "name": "0",
                "path": "",
                "type": "sentence_transformers.models.Transformer",
            },
            {
                "idx": 1,
                "name": "1",
                "path": "1_Pooling",
                "type": "sentence_transformers.models.Pooling",
            },
            {
                "idx": 2,
                "name": "2",
                "path": "2_Normalize",
                "type": "sentence_transformers.models.Normalize",
            },
        ]

        # Training saves into this same directory, and the library's save
        # step is expected to write files with these names. Replacing them
        # with the fixed values above would misdescribe a base model whose
        # module stack or versions differ, so existing files win by default.
        wrote_config = self._write_json(
            "config_sentence_transformers.json", config, overwrite
        )
        wrote_modules = self._write_json("modules.json", modules, overwrite)

        if wrote_config or wrote_modules:
            logger.info("✅ Configuration files created")
        else:
            logger.info("Configuration files already exist; left unchanged")

    def _write_json(self, filename: str, payload, overwrite: bool) -> bool:
        """Dump *payload* as JSON to ``output_path / filename``.

        Returns True when the file was written, False when an existing file
        was kept because *overwrite* is False.
        """
        target = self.output_path / filename
        if target.exists() and not overwrite:
            logger.info("Keeping existing %s", target)
            return False
        with open(target, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)
        return True
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point: parse options, train, evaluate, report.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``; None
            (default) keeps the normal command-line behavior.

    Returns:
        0 when training produced a model, 1 when it did not. Invalid
        ``--data-file`` content exits through ``parser.error`` (status 2).
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Train Helion-V1-Embeddings model"
    )
    parser.add_argument(
        "--base-model",
        default="sentence-transformers/all-MiniLM-L6-v2",
        help="Base model to fine-tune",
    )
    parser.add_argument(
        "--output",
        default="./helion-embeddings-output",
        help="Output directory",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=3,
        help="Number of training epochs",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=16,
        help="Batch size",
    )
    parser.add_argument(
        "--warmup-steps",
        type=int,
        default=100,
        help="Warmup steps for the learning rate",
    )
    parser.add_argument(
        "--data-file",
        type=str,
        help="Path to training data JSON file",
    )
    args = parser.parse_args(argv)

    # Load custom data first so that bad input is reported before the
    # output directory is created or the base model is loaded.
    train_examples = None
    if args.data_file:
        # JSON is read as UTF-8 explicitly; the platform default encoding
        # would mis-decode non-ASCII sentences on some systems.
        with open(args.data_file, "r", encoding="utf-8") as f:
            train_examples = json.load(f)
        if not isinstance(train_examples, list):
            parser.error(
                "--data-file must contain a JSON list of examples, got "
                f"{type(train_examples).__name__}"
            )
        logger.info(f"Loaded {len(train_examples)} examples from {args.data_file}")

    # Create trainer
    trainer = EmbeddingsTrainer(
        base_model=args.base_model,
        output_path=args.output,
    )

    # Train model
    model = trainer.train_model(
        train_examples=train_examples,
        epochs=args.epochs,
        batch_size=args.batch_size,
        warmup_steps=args.warmup_steps,
    )
    # Compare against None explicitly: the failure signal is None, and the
    # truthiness of a model object is not a reliable success test.
    if model is None:
        return 1

    # Evaluate
    trainer.evaluate_model(model)
    # Create config files
    trainer.create_config_files()

    print("\n" + "="*60)
    print("✅ Helion-V1-Embeddings Training Complete!")
    print("="*60)
    print(f"📁 Model saved to: {args.output}")
    print("\n💡 Test your model:")
    print("```python")
    print("from sentence_transformers import SentenceTransformer")
    print(f"model = SentenceTransformer('{args.output}')")
    print("embeddings = model.encode(['Hello world'])")
    print("```")
    print("="*60)
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so a failed run exits non-zero.
    raise SystemExit(main())