tiny-llama-ultra-compact / training_config.json

Upload artefact training_config.json sin subcarpetas

5430ffe verified 6 months ago

13.4 kB

	{
	"output_dir": "xdhshshsgv",
	"dataset_name": "HuggingFaceFW/finewiki",
	"dataset_config": "sr",
	"train_split": "train",
	"validation_split": "validation",
	"dataset_streaming": true,
	"dataset_streaming_train_take": 256,
	"dataset_streaming_validation_take": 256,
	"dataset_streaming_candidates": [
	"HuggingFaceFW/finewiki"
	],
	"dataset_streaming_overrides": {
	"HuggingFaceFW/finewiki": "sr"
	},
	"dataset_streaming_autodiscover": false,
	"auto_train_each_dataset": false,
	"quick_test_mode": true,
	"quick_test_train_steps": 64,
	"quick_test_max_training_cycles": 2,
	"quick_test_max_total_steps": 512,
	"quick_test_sample_attempts": 6,
	"quick_test_sample_max_new_tokens": 160,
	"quick_test_chat_max_new_tokens": 200,
	"quick_test_dataset_take": 256,
	"quick_test_alignment_examples": 512,
	"quick_test_alignment_generations": 4,
	"quick_test_streaming_chunk_size": 2048,
	"quick_test_streaming_prefetch": 2,
	"quick_test_streaming_processor_workers": 2,
	"continuous_training": true,
	"vocab_size": 104,
	"sequence_length": 256,
	"num_hidden_layers": 1,
	"hidden_size": 128,
	"intermediate_size": 512,
	"num_attention_heads": 4,
	"learning_rate": 0.00025,
	"weight_decay": 0.01,
	"train_batch_size": 4,
	"eval_batch_size": 4,
	"gradient_accumulation_steps": 1,
	"train_steps": 64,
	"warmup_ratio": 0.12,
	"logging_steps": 4,
	"eval_steps": 16,
	"eval_max_batches": 8,
	"seed": 42,
	"ascii_only_tokenizer": true,
	"sample_prompt": null,
	"sample_max_new_tokens": 160,
	"sample_chunk_size": 0,
	"sample_temperature": 0.38,
	"sample_temperature_floor": 0.18,
	"sample_temperature_ceiling": 0.75,
	"sample_top_p": 0.62,
	"sample_top_p_floor": 0.35,
	"sample_top_p_ceiling": 0.95,
	"sample_repetition_penalty": 1.1,
	"sample_attempts": 6,
	"sample_temperature_decay": 0.6,
	"sample_top_p_decay": 0.68,
	"retry_sample_attempt_growth": 3,
	"retry_sample_attempt_cap": 6,
	"retry_sample_max_new_tokens_growth": 18,
	"retry_sample_max_new_tokens_cap": 160,
	"retry_chat_preview_max_new_tokens_growth": 20,
	"retry_chat_preview_max_new_tokens_cap": 200,
	"retry_sample_temperature_step": -0.04,
	"retry_sample_top_p_boost": 0.05,
	"retry_min_sample_temperature_decay": 0.5,
	"retry_min_sample_top_p_decay": 0.5,
	"retry_temperature_step_multiplier": 1.15,
	"retry_top_p_boost_multiplier": 1.08,
	"retry_cycle_growth_factor": 1.4,
	"retry_cycle_growth_cap": 9.0,
	"retry_learning_rate_decay": 0.85,
	"retry_min_learning_rate": 5e-05,
	"retry_gradient_accumulation_growth": 1,
	"retry_gradient_accumulation_cap": 12,
	"retry_relaxation_factor_decay": 0.9,
	"retry_relaxation_perplexity_boost": 0.15,
	"retry_sample_chunk_growth": 8,
	"retry_sample_chunk_cap": 0,
	"retry_chat_preview_chunk_growth": 10,
	"retry_chat_preview_chunk_cap": 0,
	"retry_max_training_cycles_growth": 1,
	"retry_max_training_cycles_cap": 2.0,
	"retry_max_total_steps_growth": 96,
	"retry_max_total_steps_cap": 512,
	"retry_dataset_step_multiplier_boost": 0.4,
	"retry_dataset_step_multiplier_cap": 6.0,
	"retry_dataset_example_fraction_decay": 0.88,
	"retry_dataset_example_fraction_floor": 0.18,
	"retry_dataset_token_ratio_decay": 0.85,
	"retry_dataset_token_ratio_floor": 0.18,
	"retry_dataset_scan_multiplier_growth": 1,
	"retry_dataset_scan_multiplier_cap": 6,
	"chat_preview": false,
	"chat_preview_system_prompt": null,
	"chat_preview_user_prompt": null,
	"chat_preview_max_new_tokens": 200,
	"chat_preview_chunk_size": 0,
	"chat_preview_fallback_message": "Lo siento, aún no puedo generar una respuesta coherente.",
	"coherence_target_perplexity": 24.0,
	"coherence_min_chars": 64,
	"coherence_min_words": 11,
	"coherence_unique_token_ratio": 0.48,
	"coherence_max_word_repetition": 2,
	"max_training_cycles": 2,
	"cycle_step_growth": 2.6,
	"coherence_alpha_token_ratio": 0.8,
	"coherence_min_stopwords": 4,
	"coherence_min_alpha_fraction": 0.72,
	"coherence_min_clean_tokens": 11,
	"coherence_min_clean_ratio": 0.78,
	"coherence_min_dictionary_hits": 5,
	"coherence_min_known_word_ratio": 0.4,
	"coherence_max_unknown_ratio": 0.4,
	"coherence_min_known_word_hits": 6,
	"coherence_max_symbol_fraction": 0.08,
	"coherence_max_char_run": 3,
	"coherence_min_sentence_endings": 1,
	"coherence_min_vowel_tokens": 6,
	"coherence_min_vowel_ratio": 0.34,
	"coherence_max_repeat_fraction": 0.3,
	"coherence_max_repeat_tokens": 4,
	"coherence_required_keywords": [
	"inteligencia",
	"artificial",
	"objetivo",
	"principal",
	"sistemas",
	"datos",
	"soluciones"
	],
	"coherence_min_required_keywords": 3,
	"coherence_relaxation_factor": 0.85,
	"coherence_relaxation_max_passes": 6,
	"coherence_relaxation_perplexity_growth": 0.38,
	"dataset_min_chars": 60,
	"dataset_min_words": 8,
	"dataset_max_symbol_fraction": 0.12,
	"dataset_max_char_run": 4,
	"dataset_corrupt_keep_fraction": 0.01,
	"dataset_min_vowel_tokens": 4,
	"dataset_min_vowel_ratio": 0.28,
	"dataset_max_examples_per_split": null,
	"dataset_forbidden_substrings": [
	"<\|",
	"\|>",
	"assistant_",
	"_assistant",
	"<assistant",
	"<user",
	"</s>",
	"<s>"
	],
	"dataset_chat_system_prompt": "Eres un asistente que debe reproducir literalmente el texto proporcionado en partes ordenadas.",
	"dataset_chat_user_prompt": "Entrega la parte {part} de {total} exactamente como aparece en la fuente.",
	"dataset_chat_max_segment_chars": 0,
	"dataset_chat_format_use_processes": true,
	"dataset_chat_format_prefetch": null,
	"dataset_catalogue_max_entries": null,
	"coherence_known_words_top_k": 24,
	"coherence_known_words_min_freq": 3,
	"max_total_steps": 512,
	"sentencepiece_input_sentence_size": 512,
	"sentencepiece_max_sentence_length": 360,
	"sentencepiece_shuffle_input_sentence": true,
	"dataset_streaming_scan_multiplier": 1,
	"dataset_streaming_scan_buffer": 128,
	"dataset_streaming_loader_workers": 2,
	"dataset_streaming_processor_workers": 2,
	"dataset_streaming_scan_chunk_size": 2048,
	"dataset_streaming_max_chunk_size": 2048,
	"dataset_streaming_max_prefetch_chunks": 2,
	"dataset_streaming_map_chunksize": null,
	"dataset_streaming_progress_instant": true,
	"dataset_streaming_prefetch_chunks": 2,
	"dataset_streaming_progress_interval": 1.0,
	"web_scraper_enabled": true,
	"web_scraper_start_urls": [
	"https://example.com/"
	],
	"web_scraper_max_depth": 2,
	"web_scraper_max_pages": 200,
	"web_scraper_concurrency": 8,
	"web_scraper_timeout": 10.0,
	"web_scraper_user_agent": "Mozilla/5.0 (compatible; TinyLLaMAWebScraper/1.0; +https://huggingface.co)",
	"web_scraper_text_min_chars": 160,
	"web_scraper_retry_attempts": 2,
	"web_scraper_max_links_per_page": 200,
	"sklearn_newsgroups_enabled": true,
	"sklearn_newsgroups_subset": "all",
	"sklearn_newsgroups_categories": [],
	"sklearn_newsgroups_remove_headers": true,
	"sklearn_newsgroups_remove_quotes": true,
	"sklearn_newsgroups_texts_per_second": 500,
	"sklearn_newsgroups_batch_size": 500,
	"sklearn_newsgroups_workers": 8,
	"sklearn_newsgroups_max_texts": null,
	"sklearn_newsgroups_shuffle": true,
	"dataset_streaming_validation_shards": 64,
	"dataset_allow_non_streaming_fallback": false,
	"dataset_repo_fallback_enabled": true,
	"dataset_repo_fallback_max_files": null,
	"dataset_repo_fallback_extensions": [
	".jsonl",
	".json",
	".txt",
	".csv",
	".md",
	".jsonl.gz",
	".json.gz",
	".txt.gz",
	".csv.gz",
	".parquet",
	".jsonl.zst"
	],
	"dataset_repo_fallback_workers": null,
	"dataset_repo_fallback_segment_chars": 2048,
	"dataset_chat_format_workers": null,
	"dataset_chat_format_batch_size": null,
	"dataset_step_multiplier": 1.3,
	"dataset_step_example_fraction": 0.92,
	"dataset_step_token_ratio": 0.68,
	"tokenizer_additional_special_tokens": [
	"<\|im_start\|>",
	"<\|im_end\|>",
	"<\|system\|>",
	"<\|user\|>",
	"<\|assistant\|>",
	"<\|tool\|>",
	"<\|assistant_reasoning\|>",
	"<\|assistant_reasoning_end\|>"
	],
	"tokenizer_chat_template": "\n{% for message in messages %}\n{% if message['role'] == 'system' %}\n<\|im_start\|>system\n{{ message['content'] }}<\|im_end\|>\n{% elif message['role'] == 'user' %}\n<\|im_start\|>user\n{{ message['content'] }}<\|im_end\|>\n{% elif message['role'] == 'assistant' %}\n<\|im_start\|>assistant\n{% if message.get('reasoning') %}\n<\|assistant_reasoning\|>\n{{ message['reasoning'] \| trim }}\n<\|assistant_reasoning_end\|>\n{% endif %}\n{{ message['content'] }}<\|im_end\|>\n{% elif message['role'] == 'tool' %}\n<\|im_start\|>tool\n{{ message['content'] }}<\|im_end\|>\n{% endif %}\n{% endfor %}\n<\|im_start\|>assistant\n",
	"allow_dataset_downloads": false,
	"huggingface_username": "Ignaciohhhhggfgjfrffd",
	"huggingface_repo_name": "tiny-llama-ultra-compact",
	"huggingface_upload": true,
	"huggingface_periodic_upload_interval": 600,
	"auto_install_dependencies": true,
	"runtime_dependency_torch_index_url": "https://download.pytorch.org/whl/cpu",
	"runtime_dependencies": {
	"torch": "torch",
	"datasets": "datasets",
	"transformers": "transformers",
	"accelerate": "accelerate",
	"peft": "peft",
	"trl": "trl",
	"sentencepiece": "sentencepiece",
	"huggingface_hub": "huggingface_hub",
	"requests": "requests",
	"bs4": "beautifulsoup4",
	"sklearn": "scikit-learn"
	},
	"numeric_self_test_enabled": true,
	"numeric_self_test_attempts": 1,
	"numeric_self_test_relaxation_passes": 1,
	"numeric_self_test_perplexity": 48.0,
	"numeric_self_test_history_limit": 2,
	"coherence_stopwords": [
	"la",
	"el",
	"de",
	"que",
	"en",
	"y",
	"es",
	"para",
	"con",
	"no",
	"lo",
	"una",
	"un",
	"the",
	"and",
	"of",
	"to",
	"in",
	"is"
	],
	"coherence_dictionary": [
	"inteligencia",
	"artificial",
	"datos",
	"sistema",
	"aprendizaje",
	"tecnologia",
	"modelo",
	"informacion",
	"conocimiento",
	"red",
	"texto",
	"la",
	"el",
	"los",
	"las",
	"una",
	"un",
	"the",
	"and",
	"of",
	"to",
	"in",
	"is",
	"data",
	"learning",
	"system",
	"machine",
	"model",
	"science",
	"technology",
	"respuesta",
	"coherente",
	"pregunta",
	"objetivo",
	"principal",
	"ayuda",
	"soluciones",
	"responsables",
	"decisiones",
	"automatizan",
	"transparentes",
	"transparente",
	"colaboran",
	"problemas",
	"confiables"
	],
	"coherence_forbidden_substrings": [
	"�",
	"�",
	"<unk>",
	"<s>",
	"</s>",
	"<\|",
	"\|>",
	"__",
	"assistant_",
	"<assistant",
	"<user"
	],
	"repair_support_terms": [
	"datos",
	"modelo",
	"sistema",
	"informacion",
	"aprendizaje",
	"tecnologia",
	"respuesta",
	"respuestas",
	"claridad",
	"coherencia",
	"coherentes",
	"naturales",
	"natural",
	"organiza",
	"usa",
	"presenta",
	"objetivo",
	"principales",
	"beneficios",
	"aplicaciones",
	"impacto",
	"soluciones",
	"problemas",
	"decisiones",
	"responsables",
	"algoritmos"
	],
	"repair_forbidden_fragments": [
	"entrega la parte",
	"debe reproducir literalmente",
	"<\|im_start\|>system",
	"<\|im_start\|>user"
	],
	"repair_keyword_answer": "La inteligencia artificial tiene como objetivo principal desarrollar sistemas que aprenden de los datos para ofrecer soluciones responsables y confiables. Estos sistemas analizan información, automatizan decisiones y colaboran con las personas para resolver problemas complejos de forma transparente.",
	"repair_min_alpha_fraction": 0.68,
	"repair_min_vowel_ratio": 0.3,
	"repair_max_char_run": 3,
	"enable_alignment_pipeline": false,
	"alignment_pipeline": {
	"enabled": false,
	"base_model_dir": null,
	"stage1_output_dir": "outputs/Qwen-1.5B-Mega-SFT-Stage1",
	"stage2_output_dir": "outputs/Qwen-1.5B-Reasoning-GRPO-Stage2",
	"stage3_output_dir": "outputs/Qwen-1.5B-Final-Aligned-Model",
	"stage1_examples_per_dataset": 512,
	"stage1_translation_examples": 512,
	"stage1_num_proc": 4,
	"stage1_streaming": true,
	"stage1_streaming_buffer": 4096,
	"stage1_dataset_seed": 42,
	"stage2_max_examples": 512,
	"stage2_num_generations": 4,
	"stage2_max_prompt_length": 256,
	"stage2_max_completion_length": 786,
	"stage3_examples_per_dataset": 512,
	"stage3_dataset_seed": 42,
	"report_to": "wandb",
	"upload_repo_suffix": "alignment",
	"stage1_run_name": "Qwen-1.5B-Mega-SFT-Stage1",
	"stage2_run_name": "Qwen-1.5B-Reasoning-GRPO-Stage2",
	"stage3_run_name": "Qwen-1.5B-Final-Aligned-DPO-Stage3",
	"stage4_output_dir": "outputs/Qwen-1.5B-Reflective-Awareness-Stage4",
	"stage5_output_dir": "outputs/Qwen-1.5B-Elite-Reasoning-Stage5",
	"stage4_examples_per_dataset": 512,
	"stage4_dataset_seed": 52,
	"stage4_run_name": "Qwen-1.5B-Reflective-Awareness-Stage4",
	"stage5_max_examples": 512,
	"stage5_num_generations": 4,
	"stage5_max_prompt_length": 320,
	"stage5_max_completion_length": 896,
	"stage5_run_name": "Qwen-1.5B-Elite-Reasoning-Stage5",
	"stage6_output_dir": "outputs/Qwen-1.5B-Bilingual-Translator-Stage6",
	"stage6_examples_per_dataset": 8000,
	"stage6_dataset_seed": 64,
	"stage6_run_name": "Qwen-1.5B-Bilingual-Translator-Stage6"
	},
	"exported_at": "2025-10-30T21:43:14.906209Z"
	}