{
  "epoch": 0,
  "global_step": 99498,
  "tokens_processed": 99686013,
  "target_tokens": 100000000,
  "best_similarity": 0.34183505177497864,
  "training_config": {
    "model": {
      "vocab_size": 50257,
      "text_encoder_dim": 128,
      "text_encoder_layers": 4,
      "text_encoder_heads": 4,
      "text_decoder_dim": 128,
      "text_decoder_layers": 4,
      "text_decoder_heads": 4,
      "vision_encoder_dim": 768,
      "vision_latent_size": 128,
      "vision_hidden_size": 64,
      "vision_compression_method": "learned_compression",
      "vision_spatial_pooling": true,
      "vision_pool_size": 2,
      "fusion_hidden_size": 128,
      "fusion_num_heads": 4,
      "fusion_num_layers": 2,
      "memory_size": 32,
      "episode_dim": 128,
      "memory_alpha": 0.2,
      "direct_writing": true,
      "memory_compression": true,
      "enable_adaptive_training": true,
      "max_seq_len": 256,
      "dropout": 0.15
    },
    "token_constraints": {
      "total_tokens": 100000000,
      "caption_tokens": 50000000,
      "text_tokens": 50000000,
      "enforce_exact_count": true,
      "uniform_sampling": true,
      "alignment_priority": "perfect_alignment",
      "preserve_image_caption_pairs": true,
      "strict_alignment_validation": true
    },
    "vision_feature_reduction": {
      "enabled": true,
      "method": "learned_compression",
      "target_dim": 64,
      "spatial_pooling": true,
      "pool_method": "attention",
      "hidden_dim": 128,
      "learnable": true,
      "preserve_spatial_info": true
    },
    "data": {
      "dataset_dir": "../babylm_dataset",
      "text_encoder_name": "gpt2",
      "max_seq_length": 256,
      "count_tokens": true,
      "target_caption_tokens": 50000000,
      "target_text_tokens": 50000000,
      "token_counting_method": "gpt2",
      "batch_size": 64,
      "num_workers": 6,
      "pin_memory": true,
      "persistent_workers": true,
      "mix_ratio": 0.5,
      "shuffle_datasets": true,
      "ensure_alignment": true,
      "validate_alignment": true,
      "alignment_verification": "strict",
      "never_break_pairs": true,
      "alignment_check_frequency": 1000,
      "use_validation": false,
      "train_only": true
    },
    "attention_analysis": {
      "track_top_k": 5,
      "log_every_n_steps": 200,
      "viz_every_n_epochs": 3,
      "save_head_patterns": true,
      "analyze_memory_attention": true,
      "analyze_cross_modal": true,
      "track_token_alignment": true
    },
    "adaptive_training": {
      "enabled": true,
      "similarity_window_size": 200,
      "drop_threshold": 0.12,
      "min_steps_between_interventions": 800,
      "freeze_duration_steps": 1500,
      "loss_rebalance_factor": 2.0,
      "similarity_smoothing_alpha": 0.15
    },
    "training": {
      "max_epochs": 10,
      "accumulate_grad_batches": 2,
      "gradient_clip_val": 0.3,
      "val_check_interval": 1000,
      "scheduler": "cosine_with_restarts",
      "min_lr": 5e-05,
      "warmup_steps": 1000,
      "learning_rate": 0.0002,
      "weight_decay": 0.02,
      "optimizer": "adamw8bit",
      "scheduler_config": {
        "T_0": 1000,
        "T_mult": 2,
        "eta_min_ratio": 0.1
      },
      "cross_modal_loss_weight": 1.5,
      "text_generation_loss_weight": 1.0,
      "memory_regularization_weight": 0.1,
      "alignment_consistency_weight": 0.5,
      "track_token_usage": true,
      "log_token_progress": true,
      "stop_at_token_limit": false,
      "validate_alignment_every_n_steps": 500,
      "log_alignment_metrics": true,
      "alignment_loss_scaling": "adaptive"
    },
    "wandb": {
      "project": "bitmar-100M-attention-epochs",
      "entity": "babylm-ntust",
      "api_key": null,
      "log_every_n_steps": 100,
      "log_attention": true,
      "log_memory": true,
      "log_gradients": true,
      "log_token_usage": true,
      "log_cross_modal_similarity": true,
      "log_alignment_quality": true,
      "log_caption_image_matching": true,
      "save_code": true,
      "create_plots": true,
      "plot_attention_heatmaps": true,
      "plot_memory_usage": true,
      "plot_token_distribution": true,
      "plot_alignment_metrics": true,
      "log_memory_evolution": true,
      "plot_memory_evolution_heatmap": true,
      "plot_memory_diversity": true,
      "plot_memory_access_patterns": true,
      "memory_visualization_frequency": 5000,
      "memory_snapshot_frequency": 10000,
      "track_memory_metrics": [
        "memory_diversity_score",
        "memory_specialization_score",
        "memory_usage_entropy",
        "cross_modal_memory_ratio",
        "memory_slot_utilization",
        "memory_update_frequency",
        "memory_retrieval_accuracy"
      ]
    },
    "evaluation": {
      "metrics": [
        "bleu",
        "rouge",
        "cross_modal_similarity",
        "memory_efficiency"
      ],
      "generate_samples": true,
      "num_samples": 20,
      "max_generation_length": 32,
      "temperature": 0.8,
      "top_p": 0.9,
      "evaluate_alignment": true,
      "alignment_metrics": [
        "cosine_similarity",
        "retrieval_accuracy",
        "caption_image_matching",
        "cross_modal_retrieval"
      ],
      "alignment_threshold": 0.8,
      "validate_pairs_during_eval": true
    },
    "output": {
      "checkpoint_dir": "checkpoints_100M_dataset",
      "log_dir": "logs_100M_dataset",
      "attention_dir": "attention_100M_dataset",
      "memory_dir": "memory_100M_dataset",
      "results_dir": "results_100M_dataset",
      "token_logs_dir": "token_logs_100M_dataset"
    },
    "memory_optimization": {
      "use_gradient_checkpointing": true,
      "use_fp16": true,
      "use_int8_vision": false,
      "empty_cache_frequency": 10,
      "max_memory_slots_in_ram": 16,
      "compress_episodic_memory": true,
      "vision_feature_caching": false,
      "vision_batch_processing": true,
      "tie_word_embeddings": true,
      "use_shared_attention": false
    },
    "performance_targets": {
      "max_model_size_mb": 50,
      "target_cross_modal_similarity": 0.75,
      "target_text_generation_quality": 0.6,
      "memory_efficiency_threshold": 0.8
    },
    "flops_tracking": {
      "enabled": true,
      "log_frequency": 100,
      "save_statistics": true,
      "estimate_theoretical": true,
      "track_peak_performance": true,
      "log_to_wandb": true,
      "detailed_breakdown": true,
      "memory_bandwidth_tracking": false,
      "efficiency_analysis": true,
      "track_components": [
        "attention",
        "feedforward",
        "layer_norm",
        "embeddings",
        "vision_encoder",
        "cross_modal_fusion"
      ]
    },
    "token_tracking": {
      "log_frequency": 1000,
      "save_token_distribution": true,
      "monitor_caption_text_ratio": true,
      "enforce_token_limits": false,
      "early_stopping_on_limit": false,
      "track_alignment_quality": true,
      "log_misaligned_samples": true,
      "alignment_quality_threshold": 0.7,
      "save_alignment_statistics": true,
      "correlate_flops_with_tokens": true,
      "log_computational_efficiency": true,
      "track_throughput_vs_quality": true
    },
    "huggingface_hub": {
      "enabled": true,
      "repo_id": "euhidaman/bitmar-attention-multimodal",
      "private": true,
      "upload_after_epoch": true,
      "upload_final_model": true,
      "commit_message_template": "BitMar 100M tokens - Epoch {epoch} - {tokens_processed:,} tokens processed",
      "create_model_card": true,
      "model_card_template": "---\nlanguage: en\nlicense: mit\ntags:\n- bitmar\n- multimodal\n- babylm\n- cross-modal\ndatasets:\n- babylm_multimodal\nmetrics:\n- bleu\n- cross_modal_similarity\n---\n\n# BitMar 100M Token Model\n\nThis model was trained on exactly 100 million tokens as part of the BabyLM challenge.\n\n## Training Details\n- Total tokens: 100,000,000\n- Epochs completed: {epoch}\n- Tokens processed: {tokens_processed:,}\n- Cross-modal similarity: {best_similarity:.4f}\n\n## Model Architecture\n- Text encoder: {text_encoder_layers} layers, {text_encoder_dim} hidden size\n- Vision encoder: DiNOv2 features compressed to {vision_latent_size}\n- Episodic memory: {memory_size} slots\n\n## Usage\n```python\nfrom transformers import AutoModel, AutoTokenizer\n\nmodel = AutoModel.from_pretrained(\"{repo_id}\")\ntokenizer = AutoTokenizer.from_pretrained(\"{repo_id}\")\n```\n"
    },
    "attention_sinks": {
      "enabled": true,
      "attention_sink_size": 4,
      "attention_sink_window_size": 1020,
      "inject_to_text_encoder": true,
      "inject_to_text_decoder": true,
      "position_shift_enabled": true,
      "cache_compression": true,
      "adaptive_window_size": false,
      "memory_efficient_attention": true,
      "preserve_episodic_memory": true,
      "preserve_quantization": true,
      "preserve_cross_modal_fusion": true
    }
  }
}