"""Tests training pipeline konfigurācijai un datu sagatavošanai.""" from __future__ import annotations import asyncio import importlib.util import json import os import re import subprocess import sys import types from importlib.metadata import PackageNotFoundError from pathlib import Path from typing import Any import pytest from maris_core.data.preprocessing import record_to_training_text from maris_core.training.config import ( AVAILABLE_TRAINING_BASE_MODELS, DEFAULT_TRAINING_BASE_MODEL, list_training_base_models, load_training_config, ) from maris_core.training.hf_compat import ( MARIS_COMPATIBILITY_ARTIFACT_NAME, apply_maris_compatibility_identity, write_maris_compatibility_artifact, ) from maris_core.training.preferences import load_preference_dataset from maris_core.training.train import ( _build_benchmark_gate_artifact, _build_distributed_training_argument_overrides, _ensure_runtime_home_dir, _filter_preference_examples_for_branch, _filter_records_for_branch, _run_post_training_benchmark, build_branch_training_configs, evaluate_with_config, train, train_branch_suite, ) FOREIGN_AI_NAME_RE = re.compile( r"(?i)\b(?:anthropic|chatgpt|claude|deepseek|gemini|llama|mistral|openai|qwen|TinyLlama)\b" ) FOREIGN_MODEL_REPO_RE = re.compile( r"(?i)\b(?:deepseek-ai|meta-llama|mistralai|openai|qwen|TinyLlama)/[A-Za-z0-9][\w.-]*\b" ) def _assert_output_dir_uses_only_maris_identity(output_dir: Path) -> None: checked_files = sorted(output_dir.rglob("*")) for path in checked_files: if not path.is_file(): continue if path.name == MARIS_COMPATIBILITY_ARTIFACT_NAME: continue if path.suffix.lower() not in {".json", ".jinja", ".md", ".txt"}: continue content = path.read_text(encoding="utf-8") assert FOREIGN_MODEL_REPO_RE.search(content) is None, path assert FOREIGN_AI_NAME_RE.search(content) is None, path def test_record_to_training_text_formats_conversation_and_generation() -> None: conversation = record_to_training_text({"user": "Sveiki", "assistant": "Čau!"}) generation = record_to_training_text({"prompt": "Uzzīmē kaķi", "metadata": {"style": "anime"}}) assert "<|user|>" in conversation assert "Sveiki" in conversation assert "Čau!" in conversation assert "Uzzīmē kaķi" in generation assert '"style": "anime"' in generation def test_record_to_training_text_formats_structured_coder_record() -> None: formatted = record_to_training_text( { "prompt": "Salabo retry helperi.", "target_file": "core-python/maris_core/retries.py", "buggy_code": "def retry(count):\n return count / 0", "tests": ["assert retry(1) == 1", "assert retry(3) == 3"], "edge_cases": ["0 mēģinājumi", "negatīvs skaits"], "metadata": {"language": "python", "task": "bugfix"}, "completion": "```python\ndef retry(count: int) -> int:\n return max(count, 0)\n```", } ) assert "Mērķa fails" in formatted assert "Esošais vai kļūdainais kods" in formatted assert "Robežgadījumi" in formatted assert "```python" in formatted def test_load_training_config_reads_json_and_env_overrides( tmp_path: Path, monkeypatch, ) -> None: config_path = tmp_path / "training.json" config_path.write_text( json.dumps( { "model_name": "repo/from-json", "branch_name": "coder", "num_epochs": 7, "report_to": ["tensorboard"], } ), encoding="utf-8", ) monkeypatch.setenv("HF_TRAIN_BATCH_SIZE", "3") monkeypatch.setenv("HF_TRAIN_ADAPTER_TYPE", "lora") config = load_training_config(str(config_path), overrides={"learning_rate": 1e-4}) assert config.model_name == "repo/from-json" assert config.branch_name == "coder" assert config.num_epochs == 7 assert config.per_device_train_batch_size == 3 assert config.learning_rate == 1e-4 assert config.adapter_type == "lora" assert config.report_to == ["tensorboard"] assert config.text_model_id == "MarisUK/maris-ai-text" assert config.image_model_id == "MarisUK/maris-ai-image" def test_load_training_config_reads_distributed_runtime_overrides(monkeypatch) -> None: monkeypatch.setenv("HF_TRAIN_DISTRIBUTED_STRATEGY", "deepspeed") monkeypatch.setenv("HF_TRAIN_DISTRIBUTED_CONFIG_PATH", "huggingface/deepspeed-zero3.json") monkeypatch.setenv("HF_TRAIN_NUM_PROCESSES", "8") monkeypatch.setenv("HF_TRAIN_NUM_MACHINES", "2") monkeypatch.setenv("HF_TRAIN_MACHINE_RANK", "1") monkeypatch.setenv("HF_TRAIN_MAIN_PROCESS_IP", "10.0.0.10") monkeypatch.setenv("HF_TRAIN_MAIN_PROCESS_PORT", "29510") config = load_training_config() assert config.distributed_strategy == "deepspeed" assert config.distributed_config_path == "huggingface/deepspeed-zero3.json" assert config.use_accelerate is True assert config.num_processes == 8 assert config.num_machines == 2 assert config.machine_rank == 1 assert config.main_process_ip == "10.0.0.10" assert config.main_process_port == 29510 def test_load_training_config_reads_gradient_checkpointing_use_reentrant_override( monkeypatch, ) -> None: monkeypatch.setenv("HF_TRAIN_GRADIENT_CHECKPOINTING_USE_REENTRANT", "false") config = load_training_config() assert config.gradient_checkpointing_use_reentrant is False def test_load_training_config_reads_runtime_model_repo_overrides(monkeypatch) -> None: monkeypatch.setenv("TEXT_MODEL", "MarisUK/custom-text") monkeypatch.setenv("IMAGE_MODEL", "MarisUK/custom-image") monkeypatch.setenv("MUSIC_MODEL", "MarisUK/custom-music") monkeypatch.setenv("TTS_MODEL", "MarisUK/custom-tts") monkeypatch.setenv("STT_MODEL", "MarisUK/custom-stt") monkeypatch.setenv("VIDEO_MODEL", "MarisUK/custom-video") config = load_training_config() assert config.text_model_id == "MarisUK/custom-text" assert config.image_model_id == "MarisUK/custom-image" assert config.music_model_id == "MarisUK/custom-music" assert config.tts_model_id == "MarisUK/custom-tts" assert config.stt_model_id == "MarisUK/custom-stt" assert config.video_model_id == "MarisUK/custom-video" def test_load_training_config_rejects_conflicting_precision_modes( tmp_path: Path, ) -> None: config_path = tmp_path / "training.json" config_path.write_text( json.dumps({"fp16": True, "bf16": True}), encoding="utf-8", ) try: load_training_config(str(config_path)) except ValueError as exc: assert "fp16 un bf16" in str(exc) else: raise AssertionError("load_training_config() should reject conflicting precision modes") def test_load_training_config_resolves_model_preset( tmp_path: Path, ) -> None: config_path = tmp_path / "training.json" config_path.write_text( json.dumps({"model_preset": "coding"}), encoding="utf-8", ) config = load_training_config(str(config_path)) assert config.model_preset == "coding" assert config.model_name == AVAILABLE_TRAINING_BASE_MODELS["coding"]["model_name"] def test_load_training_config_resolves_extra_model_preset( tmp_path: Path, monkeypatch, ) -> None: config_path = tmp_path / "training.json" config_path.write_text( json.dumps({"model_preset": "qwen-32b"}), encoding="utf-8", ) monkeypatch.setenv( "MARIS_TRAIN_EXTRA_MODELS", json.dumps({"qwen-32b": "Qwen/Qwen2.5-32B-Instruct"}), ) config = load_training_config(str(config_path)) assert config.model_preset == "qwen-32b" assert config.model_name == "Qwen/Qwen2.5-32B-Instruct" def test_load_training_config_rejects_unknown_model_preset( tmp_path: Path, ) -> None: config_path = tmp_path / "training.json" config_path.write_text( json.dumps({"model_preset": "unknown"}), encoding="utf-8", ) try: load_training_config(str(config_path)) except ValueError as exc: assert "model_preset" in str(exc) assert "balanced" in str(exc) else: raise AssertionError("load_training_config() should reject unknown model presets") def test_load_training_config_rejects_non_maris_hub_model_id( tmp_path: Path, ) -> None: config_path = tmp_path / "training.json" config_path.write_text( json.dumps({"hub_model_id": "someone-else/not-maris"}), encoding="utf-8", ) try: load_training_config(str(config_path)) except RuntimeError as exc: assert "Maris AI modeli" in str(exc) else: raise AssertionError("load_training_config() should reject non-Maris output model ids") def test_load_training_config_rejects_non_maris_dataset_repo( tmp_path: Path, ) -> None: config_path = tmp_path / "training.json" config_path.write_text( json.dumps({"dataset_repo": "someone-else/not-maris-memory"}), encoding="utf-8", ) try: load_training_config(str(config_path)) except RuntimeError as exc: assert "dataset repozitorijs" in str(exc) else: raise AssertionError("load_training_config() should reject non-Maris dataset repo ids") def test_load_training_config_reads_optional_eval_dataset_repo( tmp_path: Path, ) -> None: config_path = tmp_path / "training.json" config_path.write_text( json.dumps({"eval_dataset_repo": "MarisUK/maris-ai-evals"}), encoding="utf-8", ) config = load_training_config(str(config_path)) assert config.eval_dataset_repo == "MarisUK/maris-ai-evals" def test_load_training_config_reads_explicit_training_and_eval_dataset_repo_lists( tmp_path: Path, ) -> None: config_path = tmp_path / "training.json" config_path.write_text( json.dumps( { "dataset_repo": "MarisUK/maris-ai-memory", "dataset_repos": [ "MarisUK/maris-ai-memory", "MarisUK/maris-ai-lv-memory", "MarisUK/maris-ai-evals", "MarisUK/maris-ai-benchmark", ], "eval_dataset_repo": "MarisUK/maris-ai-evals", "eval_dataset_repos": [ "MarisUK/maris-ai-evals", "MarisUK/maris-ai-benchmark", ], } ), encoding="utf-8", ) config = load_training_config(str(config_path)) assert config.dataset_repos == [ "MarisUK/maris-ai-memory", "MarisUK/maris-ai-lv-memory", "MarisUK/maris-ai-evals", "MarisUK/maris-ai-benchmark", ] assert config.eval_dataset_repos == [ "MarisUK/maris-ai-evals", "MarisUK/maris-ai-benchmark", ] def test_load_training_config_reads_benchmark_and_preference_paths( tmp_path: Path, ) -> None: config_path = tmp_path / "training.json" config_path.write_text( json.dumps( { "benchmark_dataset_path": "/tmp/benchmarks/release.json", "benchmark_name": "release-gate", "benchmark_levels": ["ci", "release"], "benchmark_min_overall": 0.75, "benchmark_gate_enabled": True, "benchmark_feedback_auto_discover": False, "benchmark_feedback_path": "/tmp/benchmarks/previous.json", "benchmark_feedback_boost_scale": 2.5, "benchmark_feedback_max_multiplier": 1.8, "preference_dataset_path": "/tmp/preferences.json", "branch_benchmark_targets": {"master": {"overall": 0.8, "reasoning": 0.78}}, "branch_benchmark_names": { "master": "memory-quality", "coder": "coder-release-quality", }, "branch_benchmark_dataset_paths": { "coder": "/tmp/benchmarks/coder-release.json", "planner": "/tmp/benchmarks/planner-release.json", }, "branch_preference_dataset_paths": { "coder": "/tmp/preferences/coder-preferences.json" }, "branch_dataset_filter_rules": { "planner": {"include_record_types": ["autonomous"], "allow_unlabeled": False} }, "source_weight_map": {"production": 1.5, "synthetic": 1.0, "noisy": 0.6}, } ), encoding="utf-8", ) config = load_training_config(str(config_path)) assert config.benchmark_dataset_path == "/tmp/benchmarks/release.json" assert config.benchmark_name == "release-gate" assert config.benchmark_levels == ["ci", "release"] assert config.benchmark_min_overall == 0.75 assert config.benchmark_gate_enabled is True assert config.benchmark_feedback_auto_discover is False assert config.benchmark_feedback_path == "/tmp/benchmarks/previous.json" assert config.benchmark_feedback_boost_scale == 2.5 assert config.benchmark_feedback_max_multiplier == 1.8 assert config.preference_dataset_path == "/tmp/preferences.json" assert config.branch_benchmark_targets["master"]["reasoning"] == 0.78 assert config.branch_benchmark_names["master"] == "memory-quality" assert config.branch_benchmark_names["coder"] == "coder-release-quality" assert config.branch_benchmark_dataset_paths["coder"] == "/tmp/benchmarks/coder-release.json" assert ( config.branch_benchmark_dataset_paths["planner"] == "/tmp/benchmarks/planner-release.json" ) assert ( config.branch_preference_dataset_paths["coder"] == "/tmp/preferences/coder-preferences.json" ) assert config.branch_dataset_filter_rules["planner"]["include_record_types"] == ["autonomous"] assert config.source_weight_map["production"] == 1.5 def test_load_training_config_default_coder_targets_include_execution_gate() -> None: config = load_training_config() assert config.branch_benchmark_targets["coder"]["execution"] == 0.7 assert config.branch_benchmark_targets["master"]["memory_retrieval_pass_rate"] == 0.8 assert config.branch_benchmark_names["master"] == "memory-quality" assert config.branch_benchmark_dataset_paths["master"].endswith( "core-python/evals/master_memory_benchmark.json" ) assert config.branch_benchmark_dataset_paths["coder"].endswith( "core-python/evals/coder_release_benchmark.json" ) assert config.branch_preference_dataset_paths["coder"].endswith( "core-python/evals/coder_preference_dataset.json" ) def test_apply_branch_runtime_defaults_prefers_master_memory_suite() -> None: import maris_core.training.train as train_module config = load_training_config( overrides={ "branch_name": "master", "benchmark_dataset_path": "", "benchmark_name": "chat-quality", "benchmark_gate_enabled": True, } ) resolved = train_module._apply_branch_runtime_defaults(config) assert resolved.benchmark_name == "memory-quality" assert resolved.benchmark_dataset_path.endswith( "core-python/evals/master_memory_benchmark.json" ) def test_build_benchmark_gate_artifact_uses_world_class_defaults_and_blocks_regressions() -> None: config = load_training_config( overrides={ "branch_name": "coder", "benchmark_gate_enabled": True, } ) gate = _build_benchmark_gate_artifact( config, { "benchmark_name": "release-gate", "score_manifest": { "overall": 0.8, "coding": 0.81, "reasoning": 0.76, "execution": 0.74, "grounding": 0.78, "safety": 0.93, "judge_overall": 0.78, "judge_task_completion": 0.77, "judge_instruction_following": 0.79, "judge_safety": 0.95, "judge_regression_risk": 0.8, }, "success_rate": 0.88, "production_like_cases": 3, "production_like_pass_rate": 0.8, "execution_cases": 4, "grounding_cases": 3, }, regression_report={"regression_count": 2}, ) assert gate["targets"]["success_rate"] == 0.85 assert gate["targets"]["production_like_pass_rate"] == 0.75 assert gate["targets"]["judge_overall"] == 0.72 assert gate["passed"] is False assert gate["failed_metrics"]["regression_count"]["required"] == 0.0 assert gate["failed_metrics"]["regression_count"]["actual"] == 2.0 def test_build_benchmark_gate_artifact_uses_stricter_execution_threshold() -> None: config = load_training_config( overrides={ "branch_name": "coder", "benchmark_gate_enabled": True, } ) gate = _build_benchmark_gate_artifact( config, { "benchmark_name": "release-gate", "score_manifest": { "overall": 0.8, "coding": 0.82, "reasoning": 0.76, "execution": 0.6, "grounding": 0.78, "safety": 0.94, }, "execution_cases": 4, }, ) assert gate["passed"] is False assert gate["targets"]["execution"] == 0.7 assert gate["failed_metrics"]["execution"]["actual"] == 0.6 def test_load_training_config_reads_category_weight_map(tmp_path: Path) -> None: config_path = tmp_path / "training.json" config_path.write_text( json.dumps({"category_weight_map": {"coding": 1.3, "grounding": 1.2}}), encoding="utf-8", ) config = load_training_config(str(config_path)) assert config.category_weight_map["coding"] == 1.3 assert config.category_weight_map["grounding"] == 1.2 def test_load_training_config_reads_continue_training_settings(monkeypatch) -> None: monkeypatch.setenv("HF_TRAIN_CONTINUE_FROM_LATEST", "true") monkeypatch.setenv("HF_TRAIN_CONTINUE_MODEL_PATH", "/tmp/maris-last-good") config = load_training_config() assert config.continue_from_latest_artifact is True assert config.continue_model_path == "/tmp/maris-last-good" def test_list_training_base_models_returns_copy() -> None: models = list_training_base_models() models["balanced"]["model_name"] = "modified" assert AVAILABLE_TRAINING_BASE_MODELS["balanced"]["model_name"] == DEFAULT_TRAINING_BASE_MODEL def test_list_training_base_models_ignores_invalid_extra_models_json(monkeypatch) -> None: monkeypatch.setenv("MARIS_TRAIN_EXTRA_MODELS", "{not valid json") models = list_training_base_models() assert {"balanced", "reasoning", "coding", "lightweight"}.issubset(models) def test_list_training_base_models_accepts_owner_name_fallback_syntax(monkeypatch) -> None: monkeypatch.setenv( "MARIS_TRAIN_EXTRA_MODELS", "Qwen/Qwen3-Coder-480B-A35B-Instruct, coder-7b=Qwen/Qwen2.5-7B-Instruct", ) models = list_training_base_models() assert models["qwen-qwen3-coder-480b-a35b-instruct"]["model_name"] == ( "Qwen/Qwen3-Coder-480B-A35B-Instruct" ) assert models["coder-7b"]["model_name"] == "Qwen/Qwen2.5-7B-Instruct" def test_list_training_base_models_accepts_string_shorthand(monkeypatch) -> None: monkeypatch.setenv("MARIS_TRAIN_EXTRA_MODELS", '{"qwen-880b":"Qwen/Qwen3-880B-Instruct"}') models = list_training_base_models() assert models["qwen-880b"]["model_name"] == "Qwen/Qwen3-880B-Instruct" assert models["qwen-880b"]["label"] == "Qwen 880B" def test_load_training_config_prefers_explicit_model_name_over_preset( tmp_path: Path, monkeypatch, ) -> None: config_path = tmp_path / "training.json" config_path.write_text( json.dumps({"model_preset": "coding"}), encoding="utf-8", ) monkeypatch.setenv("HF_TRAIN_BASE_MODEL", "custom/model") monkeypatch.setenv("HF_TRAIN_MODEL_PRESET", "reasoning") config = load_training_config(str(config_path)) assert config.model_name == "custom/model" assert config.model_preset == "" def test_huggingface_train_script_resolves_relative_config_from_repo_root( tmp_path: Path, monkeypatch, ) -> None: repo_root = next( parent for parent in Path(__file__).resolve().parents if (parent / "huggingface" / "train.sh").is_file() ) fake_python = tmp_path / "python3" invocation_log = tmp_path / "train-invocation.json" fake_python.write_text( "\n".join( [ f"#!{sys.executable}", "import json", "import os", "import sys", "from pathlib import Path", "", "Path(os.environ['TRAIN_SH_LOG']).write_text(", " json.dumps({'cwd': os.getcwd(), 'argv': sys.argv[1:]}, ensure_ascii=False),", " encoding='utf-8',", ")", ] ), encoding="utf-8", ) fake_python.chmod(0o755) existing_path = os.environ.get("PATH", "") monkeypatch.setenv( "PATH", f"{tmp_path}{os.pathsep}{existing_path}" if existing_path else str(tmp_path), ) monkeypatch.setenv("HF_TRAINING_CONFIG_PATH", "huggingface/training-config.json") monkeypatch.setenv("TRAIN_SH_LOG", str(invocation_log)) subprocess.run( ["bash", str(repo_root / "huggingface" / "train.sh")], check=True, cwd=repo_root, ) logged = json.loads(invocation_log.read_text(encoding="utf-8")) assert logged["cwd"] == str(repo_root / "core-python") assert logged["argv"][0] == str(repo_root / "core-python" / "scripts" / "train_model.py") assert logged["argv"][1:3] == [ "--config", str(repo_root / "huggingface" / "training-config.json"), ] def test_huggingface_train_hf_script_uses_persistent_paths_and_uploads_model( tmp_path: Path, monkeypatch, ) -> None: repo_root = next( parent for parent in Path(__file__).resolve().parents if (parent / "huggingface" / "train-hf.sh").is_file() ) persistent_dir = tmp_path / "persistent" fake_python = tmp_path / "python3" invocation_log = tmp_path / "train-hf-invocations.jsonl" fake_python.write_text( "\n".join( [ f"#!{sys.executable}", "import json", "import os", "import sys", "from pathlib import Path", "", "log_path = Path(os.environ['TRAIN_HF_LOG'])", "with log_path.open('a', encoding='utf-8') as handle:", " handle.write(", " json.dumps({'cwd': os.getcwd(), 'argv': sys.argv[1:]}, ensure_ascii=False) + '\\n'", " )", ] ), encoding="utf-8", ) fake_python.chmod(0o755) existing_path = os.environ.get("PATH", "") monkeypatch.setenv( "PATH", f"{tmp_path}{os.pathsep}{existing_path}" if existing_path else str(tmp_path), ) monkeypatch.setenv("HF_PERSISTENT_DIR", str(persistent_dir)) monkeypatch.setenv("TRAIN_HF_LOG", str(invocation_log)) monkeypatch.delenv("HF_TRAIN_OUTPUT_DIR", raising=False) monkeypatch.delenv("HF_LOCAL_MODEL_DIR", raising=False) monkeypatch.delenv("HF_TRAIN_PUSH_TO_HUB", raising=False) subprocess.run( ["bash", str(repo_root / "huggingface" / "train-hf.sh"), "--model-preset", "coding"], check=True, cwd=repo_root, ) logged = [ json.loads(line) for line in invocation_log.read_text(encoding="utf-8").splitlines() if line.strip() ] assert len(logged) == 2 assert logged[0]["cwd"] == str(repo_root / "core-python") assert logged[0]["argv"][0] == str(repo_root / "core-python" / "scripts" / "train_model.py") assert logged[0]["argv"][1:5] == [ "--config", str(repo_root / "huggingface" / "training-config.hf-jobs.json"), "--model-preset", "coding", ] assert logged[1]["argv"][0] == str(repo_root / "core-python" / "scripts" / "export_to_hf.py") assert logged[1]["argv"][1:3] == [ "--model-path", str(persistent_dir / "maris-ai-master"), ] def test_huggingface_train_hf_script_enables_accelerate_on_gpu_space( tmp_path: Path, monkeypatch, ) -> None: repo_root = next( parent for parent in Path(__file__).resolve().parents if (parent / "huggingface" / "train-hf.sh").is_file() ) persistent_dir = tmp_path / "persistent-gpu" fake_python = tmp_path / "python3" fake_nvidia_smi = tmp_path / "nvidia-smi" invocation_log = tmp_path / "train-hf-gpu-invocations.jsonl" fake_python.write_text( "\n".join( [ f"#!{sys.executable}", "import json", "import os", "import sys", "from pathlib import Path", "", "log_path = Path(os.environ['TRAIN_HF_GPU_LOG'])", "with log_path.open('a', encoding='utf-8') as handle:", " handle.write(", " json.dumps({'cwd': os.getcwd(), 'argv': sys.argv[1:]}, ensure_ascii=False) + '\\n'", " )", ] ), encoding="utf-8", ) fake_python.chmod(0o755) fake_nvidia_smi.write_text("#!/usr/bin/env bash\necho 'GPU 0: Fake GPU'\n", encoding="utf-8") fake_nvidia_smi.chmod(0o755) existing_path = os.environ.get("PATH", "") monkeypatch.setenv( "PATH", f"{tmp_path}{os.pathsep}{existing_path}" if existing_path else str(tmp_path), ) monkeypatch.setenv("HF_PERSISTENT_DIR", str(persistent_dir)) monkeypatch.setenv("TRAIN_HF_GPU_LOG", str(invocation_log)) monkeypatch.delenv("HF_TRAIN_USE_ACCELERATE", raising=False) monkeypatch.delenv("HF_TRAIN_NUM_PROCESSES", raising=False) subprocess.run( ["bash", str(repo_root / "huggingface" / "train-hf.sh"), "--model-preset", "coding"], check=True, cwd=repo_root, ) logged = [ json.loads(line) for line in invocation_log.read_text(encoding="utf-8").splitlines() if line.strip() ] assert logged[0]["argv"][0:2] == ["-m", "accelerate.commands.launch"] assert "--config_file" in logged[0]["argv"] assert str(repo_root / "huggingface" / "accelerate-gpu-config.yaml") in logged[0]["argv"] assert "--num_processes" in logged[0]["argv"] assert logged[0]["argv"][logged[0]["argv"].index("--num_processes") + 1] == "1" assert str(repo_root / "core-python" / "scripts" / "train_model.py") in logged[0]["argv"] assert logged[1]["argv"][0] == str(repo_root / "core-python" / "scripts" / "export_to_hf.py") def test_huggingface_train_job_script_uses_accelerate_for_distributed_launch( tmp_path: Path, monkeypatch, ) -> None: repo_root = next( parent for parent in Path(__file__).resolve().parents if (parent / "huggingface" / "train-job.sh").is_file() ) fake_python = tmp_path / "python3" fake_nvidia_smi = tmp_path / "nvidia-smi" invocation_log = tmp_path / "train-job-invocations.jsonl" fake_python.write_text( "\n".join( [ f"#!{sys.executable}", "import json", "import os", "import sys", "from pathlib import Path", "", "log_path = Path(os.environ['TRAIN_JOB_LOG'])", "with log_path.open('a', encoding='utf-8') as handle:", " handle.write(", " json.dumps({'cwd': os.getcwd(), 'argv': sys.argv[1:]}, ensure_ascii=False) + '\\n'", " )", ] ), encoding="utf-8", ) fake_python.chmod(0o755) fake_nvidia_smi.write_text("#!/usr/bin/env bash\necho 'GPU 0: Fake GPU'\n", encoding="utf-8") fake_nvidia_smi.chmod(0o755) existing_path = os.environ.get("PATH", "") monkeypatch.setenv( "PATH", f"{tmp_path}{os.pathsep}{existing_path}" if existing_path else str(tmp_path), ) monkeypatch.setenv("HF_JOB_WORK_DIR", str(tmp_path / "job-work")) monkeypatch.setenv("TRAIN_JOB_LOG", str(invocation_log)) monkeypatch.setenv("HF_TRAIN_DISTRIBUTED_STRATEGY", "deepspeed") monkeypatch.delenv("HF_TRAIN_USE_ACCELERATE", raising=False) monkeypatch.delenv("HF_TRAIN_NUM_PROCESSES", raising=False) subprocess.run( ["bash", str(repo_root / "huggingface" / "train-job.sh"), "--model-preset", "coding"], check=True, cwd=repo_root, ) logged = [ json.loads(line) for line in invocation_log.read_text(encoding="utf-8").splitlines() if line.strip() ] assert logged[0]["argv"][0:2] == ["-m", "accelerate.commands.launch"] assert str(repo_root / "huggingface" / "accelerate-gpu-config.yaml") in logged[0]["argv"] assert str(repo_root / "huggingface" / "training-config.hf-jobs.json") in logged[0]["argv"] assert str(repo_root / "core-python" / "scripts" / "train_model.py") in logged[0]["argv"] assert logged[1]["argv"][0] == str(repo_root / "core-python" / "scripts" / "export_to_hf.py") def test_configure_tokenizer_expands_large_model_context_window() -> None: import maris_core.training.train as train_module tokenizer = types.SimpleNamespace( pad_token=None, pad_token_id=None, eos_token="", eos_token_id=7, model_max_length=4096, ) config = load_training_config( overrides={ "model_name": "Qwen/Qwen3-Coder-480B-A35B-Instruct", "max_seq_length": 65536, } ) train_module._configure_tokenizer(tokenizer, config) assert tokenizer.pad_token == "" assert tokenizer.pad_token_id == 7 assert tokenizer.model_max_length == 65536 def test_load_tokenizer_forces_remote_snapshot_restore(monkeypatch) -> None: import maris_core.training.train as train_module compat_flags: list[bool | None] = [] class FakeTokenizer: @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name, kwargs return cls() class CompatPath: def __init__(self, model_name: str, *, allow_remote_snapshot: bool | None = None): del model_name compat_flags.append(allow_remote_snapshot) def __enter__(self) -> str: return "/tmp/fake-model" def __exit__(self, exc_type, exc, tb) -> None: del exc_type, exc, tb return None monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace(AutoTokenizer=FakeTokenizer) ) monkeypatch.setattr(train_module, "maris_hf_compatible_path", CompatPath) config = load_training_config(overrides={"model_name": "MarisUK/maris-ai-master"}) tokenizer = train_module._load_tokenizer("MarisUK/maris-ai-master", config) assert isinstance(tokenizer, FakeTokenizer) assert compat_flags == [True] def test_load_tokenizer_falls_back_to_explicit_slow_class(monkeypatch, tmp_path) -> None: import maris_core.training.train as train_module compat_flags: list[bool | None] = [] tokenizer_attempts: list[tuple[str, Any]] = [] model_dir = tmp_path / "trained-model" model_dir.mkdir(parents=True, exist_ok=True) (model_dir / "tokenizer_config.json").write_text( json.dumps({"tokenizer_class": "Qwen2TokenizerFast"}), encoding="utf-8", ) (model_dir / "config.json").write_text( json.dumps( { "tokenizer_class": "Qwen2TokenizerFast", "auto_map": {"AutoTokenizer": ["Qwen2Tokenizer", "Qwen2TokenizerFast"]}, } ), encoding="utf-8", ) class FakeAutoTokenizer: @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name tokenizer_attempts.append(("auto", kwargs.get("use_fast"))) if kwargs.get("use_fast", True): raise ValueError( "Couldn't instantiate the backend tokenizer from one of the available paths." ) raise ValueError("tokenizer config still points to a fast tokenizer class") class FakeSlowTokenizer: @classmethod def from_pretrained(cls, model_name, **kwargs): tokenizer_attempts.append(("slow", kwargs.get("use_fast"))) assert model_name == str(model_dir) assert "use_fast" not in kwargs return cls() class CompatPath: def __init__(self, model_name: str, *, allow_remote_snapshot: bool | None = None): del model_name compat_flags.append(allow_remote_snapshot) def __enter__(self) -> str: return str(model_dir) def __exit__(self, exc_type, exc, tb) -> None: del exc_type, exc, tb return None monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace( AutoTokenizer=FakeAutoTokenizer, Qwen2Tokenizer=FakeSlowTokenizer, ), ) monkeypatch.setattr(train_module, "maris_hf_compatible_path", CompatPath) config = load_training_config(overrides={"model_name": "MarisUK/maris-ai-master"}) tokenizer = train_module._load_tokenizer("MarisUK/maris-ai-master", config) assert isinstance(tokenizer, FakeSlowTokenizer) assert compat_flags == [True] assert tokenizer_attempts == [("auto", True), ("auto", False), ("slow", None)] def test_load_tokenizer_retries_after_installing_missing_backends(monkeypatch, tmp_path) -> None: import maris_core.training.train as train_module compat_flags: list[bool | None] = [] tokenizer_attempts: list[tuple[str, Any]] = [] model_dir = tmp_path / "trained-model" model_dir.mkdir(parents=True, exist_ok=True) class FakeAutoTokenizer: retry_ready = False @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name tokenizer_attempts.append(("auto", kwargs.get("use_fast"))) if cls.retry_ready: return cls() raise ValueError( "You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one." ) class CompatPath: def __init__(self, model_name: str, *, allow_remote_snapshot: bool | None = None): del model_name compat_flags.append(allow_remote_snapshot) def __enter__(self) -> str: return str(model_dir) def __exit__(self, exc_type, exc, tb) -> None: del exc_type, exc, tb install_attempts: list[bool] = [] def fake_install_missing_tokenizer_backends() -> bool: install_attempts.append(True) FakeAutoTokenizer.retry_ready = True return True monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace(AutoTokenizer=FakeAutoTokenizer) ) monkeypatch.setattr(train_module, "maris_hf_compatible_path", CompatPath) monkeypatch.setattr( train_module, "_install_missing_tokenizer_backends", fake_install_missing_tokenizer_backends, ) config = load_training_config(overrides={"model_name": "MarisUK/maris-ai-master"}) tokenizer = train_module._load_tokenizer("MarisUK/maris-ai-master", config) assert isinstance(tokenizer, FakeAutoTokenizer) assert compat_flags == [True] assert install_attempts == [True] assert tokenizer_attempts == [("auto", True), ("auto", False), ("auto", True)] def test_install_missing_tokenizer_backends_only_installs_missing_packages(monkeypatch) -> None: import maris_core.training.train as train_module installed_commands: list[list[str]] = [] available_modules = {"tiktoken"} def fake_find_spec(name: str): return object() if name in available_modules else None monkeypatch.setattr(train_module.importlib.util, "find_spec", fake_find_spec) monkeypatch.setattr(train_module.importlib, "invalidate_caches", lambda: None) monkeypatch.setattr( train_module.subprocess, "run", lambda command, **kwargs: ( installed_commands.append(command) or types.SimpleNamespace(stdout="") ), ) installed = train_module._install_missing_tokenizer_backends() assert installed is True assert installed_commands == [ [sys.executable, "-m", "pip", "install", "--no-cache-dir", "sentencepiece"] ] def test_install_missing_tokenizer_backends_is_noop_when_backends_exist(monkeypatch) -> None: import maris_core.training.train as train_module monkeypatch.setattr(train_module.importlib.util, "find_spec", lambda name: object()) monkeypatch.setattr( train_module.subprocess, "run", lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError("pip should not run")), ) installed = train_module._install_missing_tokenizer_backends() assert installed is False def test_prepare_training_model_passes_use_reentrant_override(monkeypatch) -> None: import maris_core.training.train as train_module class FakeModel: def __init__(self): self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True) self.gradient_checkpointing_kwargs = None def gradient_checkpointing_enable(self, *, gradient_checkpointing_kwargs=None): self.gradient_checkpointing_kwargs = gradient_checkpointing_kwargs model = FakeModel() tokenizer = types.SimpleNamespace(pad_token_id=7) config = load_training_config( overrides={ "gradient_checkpointing": True, "gradient_checkpointing_use_reentrant": False, } ) monkeypatch.setattr(train_module, "_load_model", lambda model_name, config: model) monkeypatch.setattr(train_module, "_apply_peft_adapter", lambda model, config: model) prepared_model = train_module._prepare_training_model( "MarisUK/maris-ai-master", tokenizer, config ) assert prepared_model is model assert model.config.pad_token_id == 7 assert model.config.use_cache is False assert model.gradient_checkpointing_kwargs == {"use_reentrant": False} def test_prepare_training_model_falls_back_when_runtime_rejects_use_reentrant( monkeypatch, caplog ) -> None: import maris_core.training.train as train_module class FakeModel: def __init__(self): self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True) self.gradient_checkpointing_enabled = False def gradient_checkpointing_enable(self): self.gradient_checkpointing_enabled = True model = FakeModel() tokenizer = types.SimpleNamespace(pad_token_id=7) config = load_training_config( overrides={ "gradient_checkpointing": True, "gradient_checkpointing_use_reentrant": False, } ) monkeypatch.setattr(train_module, "_load_model", lambda model_name, config: model) monkeypatch.setattr(train_module, "_apply_peft_adapter", lambda model, config: model) with caplog.at_level("WARNING"): prepared_model = train_module._prepare_training_model( "MarisUK/maris-ai-master", tokenizer, config ) assert prepared_model is model assert model.gradient_checkpointing_enabled is True assert "Ignoring explicit gradient_checkpointing_use_reentrant=False" in caplog.text def test_train_auto_enables_deepspeed_for_giant_long_context_model( tmp_path: Path, monkeypatch ) -> None: class FakeDataset: def __init__(self, items): self.items = list(items) self.column_names = list(self.items[0].keys()) if self.items else [] def train_test_split(self, *, test_size, seed): del test_size, seed return { "train": FakeDataset(self.items[:1]), "test": FakeDataset(self.items[1:]), } def map(self, fn, *, batched, remove_columns, desc): del remove_columns, desc batch = {key: [item.get(key) for item in self.items] for key in self.column_names} transformed = fn(batch if batched else self.items[0]) size = len(next(iter(transformed.values()))) if transformed else 0 return FakeDataset( [{key: transformed[key][index] for key in transformed} for index in range(size)] ) def __len__(self): return len(self.items) monkeypatch.setattr( "maris_core.training.train.load_hf_dataset", lambda _: { "train": FakeDataset( [ {"user": "Sveiki", "assistant": "Čau!"}, {"prompt": "Uzraksti plānu", "completion": "Gatavs."}, ] ) }, ) monkeypatch.setattr( "maris_core.training.train._load_json_object", lambda path_value, *, label: {"config_path": path_value, "label": label}, ) monkeypatch.setattr( "maris_core.training.train._require_runtime_package", lambda *args, **kwargs: None ) class FakeTokenizer: pad_token = None pad_token_id = None eos_token = "" eos_token_id = 0 model_max_length = 4096 @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name, kwargs return cls() def __call__(self, texts, truncation, max_length, padding): del truncation, padding return { "input_ids": [[1] * min(max_length, 4) for _ in texts], "attention_mask": [[1] * min(max_length, 4) for _ in texts], } def save_pretrained(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) class FakeModel: def __init__(self): self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True) @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name, kwargs return cls() def gradient_checkpointing_enable(self): self.gradient_checkpointing_enabled = True class FakeTrainingArguments: def __init__(self, **kwargs): self.kwargs = kwargs class FakeTrainResult: metrics = {"train_loss": 0.2} class FakeTrainer: last_instance = None def __init__(self, *, model, args, train_dataset, eval_dataset=None, data_collator=None): del model, train_dataset, eval_dataset, data_collator self.args = args FakeTrainer.last_instance = self def train(self): return FakeTrainResult() def evaluate(self): return {"eval_loss": 0.1} def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "config.json").write_text("{}", encoding="utf-8") monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=FakeTrainingArguments, BitsAndBytesConfig=lambda **kwargs: kwargs, ), ) monkeypatch.setitem( sys.modules, "peft", types.SimpleNamespace( LoraConfig=lambda **kwargs: kwargs, TaskType=types.SimpleNamespace(CAUSAL_LM="CAUSAL_LM"), get_peft_model=lambda model, peft_config: model, prepare_model_for_kbit_training=lambda model, use_gradient_checkpointing: model, ), ) train( output_dir=str(tmp_path / "giant-long-context"), model_name="Qwen/Qwen3-Coder-480B-A35B-Instruct", max_seq_length=65536, distributed_strategy="none", use_accelerate=False, ) assert FakeTrainer.last_instance is not None assert FakeTrainer.last_instance.args.kwargs["deepspeed"].endswith( "huggingface/deepspeed-zero3.json" ) assert FakeTrainer.last_instance.args.kwargs["ddp_find_unused_parameters"] is False def test_train_uses_eval_split_and_writes_metrics( tmp_path: Path, monkeypatch, ) -> None: class FakeDataset: def __init__(self, items): self.items = list(items) self.column_names = list(self.items[0].keys()) if self.items else [] def train_test_split(self, *, test_size, seed): split_index = max(1, len(self.items) - 1) return { "train": FakeDataset(self.items[:split_index]), "test": FakeDataset(self.items[split_index:]), } def map(self, fn, *, batched, remove_columns, desc): assert batched is True del remove_columns, desc batch = {key: [item.get(key) for item in self.items] for key in self.column_names} transformed = fn(batch) size = len(next(iter(transformed.values()))) if transformed else 0 return FakeDataset( [{key: transformed[key][index] for key in transformed} for index in range(size)] ) def __len__(self): return len(self.items) fake_dataset = { "train": FakeDataset( [ {"user": "Sveiki", "assistant": "Labdien"}, {"user": "Kā iet?", "assistant": "Labi"}, ] ) } monkeypatch.setattr("maris_core.training.train.load_hf_dataset", lambda _: fake_dataset) class FakeTokenizer: pad_token = None pad_token_id = None eos_token = "" eos_token_id = 99 @classmethod def from_pretrained(cls, model_name): assert model_name == DEFAULT_TRAINING_BASE_MODEL return cls() def __call__(self, texts, *, truncation, max_length, padding): assert truncation is True assert max_length == 256 assert padding is False return { "input_ids": [[1, 2, 3] for _ in texts], "attention_mask": [[1, 1, 1] for _ in texts], } def save_pretrained(self, output_dir): Path(output_dir, "tokenizer.json").write_text( json.dumps( { "model": { "type": "BPE", "unk_token": "Qwen/Qwen2.5-7B-Instruct", }, "added_tokens": [ {"content": "Claude"}, {"content": "DeepSeek"}, ], } ), encoding="utf-8", ) Path(output_dir, "tokenizer_config.json").write_text( json.dumps( { "name_or_path": DEFAULT_TRAINING_BASE_MODEL, "tokenizer_class": "Qwen2TokenizerFast", "auto_map": {"AutoTokenizer": ["qwen2.Qwen2Tokenizer", None]}, "chat_template": "You are Qwen, a helpful assistant for Qwen/Qwen2.5-7B-Instruct.", "init_kwargs": { "chat_template": "Respond like TinyLlama and DeepSeek.", }, } ), encoding="utf-8", ) Path(output_dir, "chat_template.jinja").write_text( "System: meta-llama/Llama-3.2-3B-Instruct and Claude must answer here.", encoding="utf-8", ) class FakeModelConfig: pad_token_id = None class FakeModel: config = FakeModelConfig() @classmethod def from_pretrained(cls, model_name): assert model_name == DEFAULT_TRAINING_BASE_MODEL return cls() class FakeTrainingArguments: def __init__(self, **kwargs): self.kwargs = kwargs class FakeTrainResult: metrics = {"train_loss": 0.25} class FakeTrainer: last_instance = None def __init__( self, *, model, args, train_dataset, eval_dataset=None, data_collator=None, ): del model, data_collator self.args = args self.train_dataset = train_dataset self.eval_dataset = eval_dataset FakeTrainer.last_instance = self def train(self): return FakeTrainResult() def evaluate(self): return {"eval_loss": 0.5} def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "model.bin").write_text("ok", encoding="utf-8") Path(output_dir, "config.json").write_text( json.dumps( { "_name_or_path": DEFAULT_TRAINING_BASE_MODEL, "model_type": "qwen2", "architectures": ["Qwen2ForCausalLM"], "tokenizer_class": "Qwen2TokenizerFast", "auto_map": { "AutoConfig": "qwen2.configuration_qwen2.Qwen2Config", "AutoModelForCausalLM": "qwen2.modeling_qwen2.Qwen2ForCausalLM", }, } ), encoding="utf-8", ) Path(output_dir, "adapter_config.json").write_text( json.dumps( { "base_model_name_or_path": DEFAULT_TRAINING_BASE_MODEL, "base_model_class": "Qwen2ForCausalLM", "parent_library": "transformers.models.qwen2.modeling_qwen2", "auto_mapping": { "base_model_class": "Qwen2ForCausalLM", "parent_library": "transformers.models.qwen2.modeling_qwen2", }, "description": "Adapter derived from Qwen and Llama.", } ), encoding="utf-8", ) def push_to_hub(self, **kwargs): self.push_kwargs = kwargs fake_transformers = types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=FakeTrainingArguments, ) monkeypatch.setitem(sys.modules, "transformers", fake_transformers) preference_dataset_path = tmp_path / "preferences.json" preference_dataset_path.write_text( json.dumps( [ { "prompt": "Kurš variants ir labāks?", "chosen": "Variants A", "rejected": "Variants B", "source": "human_review", "tags": ["quality"], } ] ), encoding="utf-8", ) async def fake_benchmark(config, *, model_path): assert model_path.endswith("trained-model") return { "artifact_type": "chat-benchmark-manifest", "benchmark_name": config.benchmark_name, "branch": config.branch_name, "model": config.hub_model_id, "score_manifest": { "overall": 0.81, "reasoning": 0.8, "factuality": 0.79, "latvian_quality": 0.86, "coding": 0.74, "long_context": 0.75, "helpfulness": 0.83, }, } monkeypatch.setattr("maris_core.training.train._run_post_training_benchmark", fake_benchmark) output_dir = tmp_path / "trained-model" metrics = train( output_dir=str(output_dir), max_seq_length=256, benchmark_dataset_path=str(tmp_path / "benchmarks.json"), preference_dataset_path=str(preference_dataset_path), ) assert metrics["train_loss"] == 0.25 assert metrics["eval_loss"] == 0.5 assert metrics["perplexity"] > 1.0 assert FakeTrainer.last_instance is not None assert len(FakeTrainer.last_instance.train_dataset) == 1 assert len(FakeTrainer.last_instance.eval_dataset) == 1 assert FakeTrainer.last_instance.args.kwargs["evaluation_strategy"] == "steps" assert (output_dir / "training-config.json").is_file() assert (output_dir / "training-metrics.json").is_file() assert (output_dir / "maris-metadata.json").is_file() assert (output_dir / "training-provenance.json").is_file() assert (output_dir / "README.md").is_file() assert (output_dir / "benchmark-manifest.json").is_file() assert (output_dir / "benchmark-release-gate.json").is_file() assert (output_dir / "benchmark-history.json").is_file() assert (output_dir / "benchmark-regression-report.json").is_file() assert (output_dir / "benchmark-feedback.json").is_file() assert (output_dir / "preference-summary.json").is_file() assert (output_dir / "human-eval-summary.json").is_file() assert (output_dir / "blind-side-by-side-eval.json").is_file() training_config = json.loads((output_dir / "training-config.json").read_text(encoding="utf-8")) training_metrics = json.loads( (output_dir / "training-metrics.json").read_text(encoding="utf-8") ) benchmark_manifest = json.loads( (output_dir / "benchmark-manifest.json").read_text(encoding="utf-8") ) benchmark_gate = json.loads( (output_dir / "benchmark-release-gate.json").read_text(encoding="utf-8") ) benchmark_history = json.loads( (output_dir / "benchmark-history.json").read_text(encoding="utf-8") ) benchmark_regression = json.loads( (output_dir / "benchmark-regression-report.json").read_text(encoding="utf-8") ) benchmark_feedback = json.loads( (output_dir / "benchmark-feedback.json").read_text(encoding="utf-8") ) preference_summary = json.loads( (output_dir / "preference-summary.json").read_text(encoding="utf-8") ) human_eval_summary = json.loads( (output_dir / "human-eval-summary.json").read_text(encoding="utf-8") ) blind_side_by_side = json.loads( (output_dir / "blind-side-by-side-eval.json").read_text(encoding="utf-8") ) training_provenance = json.loads( (output_dir / "training-provenance.json").read_text(encoding="utf-8") ) saved_model_config = json.loads((output_dir / "config.json").read_text(encoding="utf-8")) saved_tokenizer_config = json.loads( (output_dir / "tokenizer_config.json").read_text(encoding="utf-8") ) saved_tokenizer_json = json.loads((output_dir / "tokenizer.json").read_text(encoding="utf-8")) saved_adapter_config = json.loads( (output_dir / "adapter_config.json").read_text(encoding="utf-8") ) compatibility_manifest = json.loads( (output_dir / MARIS_COMPATIBILITY_ARTIFACT_NAME).read_text(encoding="utf-8") ) saved_chat_template = (output_dir / "chat_template.jinja").read_text(encoding="utf-8") assert training_config["maris_origin"] == "Maris AI" assert training_config["maris_model_id"] == "MarisUK/maris-ai-master" assert "model_name" not in training_config assert training_metrics["maris_origin"] == "Maris AI" assert training_metrics["artifact_type"] == "training-metrics" assert training_metrics["dataset_repo"] == "MarisUK/maris-ai-memory" assert training_metrics["benchmark_regressions"] == 0.0 assert training_provenance["maris_origin"] == "Maris AI" assert training_provenance["train_examples"] == 1 assert training_provenance["eval_examples"] == 1 assert training_provenance["base_model_name"] == "Maris AI" assert training_provenance["base_model_lineage"] == "Maris AI" model_card = (output_dir / "README.md").read_text(encoding="utf-8") assert "Maris AI Model" in model_card assert "Qwen/" not in model_card assert "TinyLlama/" not in model_card assert saved_model_config["_name_or_path"] == "MarisUK/maris-ai-master" assert saved_model_config["model_type"] == "maris" assert saved_model_config["architectures"] == ["MarisCompatibleCausalLM"] assert saved_model_config["tokenizer_class"] == "MarisCompatibleTokenizer" assert saved_tokenizer_config["name_or_path"] == "MarisUK/maris-ai-master" assert saved_tokenizer_config["tokenizer_class"] == "MarisCompatibleTokenizer" assert saved_tokenizer_json["model"]["unk_token"] == "MarisUK/maris-ai-master" assert saved_tokenizer_json["added_tokens"][0]["content"] == "Maris AI" assert saved_tokenizer_json["added_tokens"][1]["content"] == "Maris AI" assert "Maris AI" in saved_tokenizer_config["chat_template"] assert "Qwen" not in saved_tokenizer_config["chat_template"] assert "Maris AI" in saved_tokenizer_config["init_kwargs"]["chat_template"] assert saved_adapter_config["base_model_name_or_path"] == "MarisUK/maris-ai-master" assert saved_adapter_config["base_model_class"] == "MarisCompatibleCausalLM" assert saved_adapter_config["parent_library"] == "maris.compat" assert "Qwen" not in saved_adapter_config["description"] assert "Llama" not in saved_adapter_config["description"] assert compatibility_manifest["artifact_type"] == "maris-hf-compatibility" assert compatibility_manifest["maris_model_id"] == "MarisUK/maris-ai-master" assert "config.json" in compatibility_manifest["artifacts"] assert "tokenizer_config.json" in compatibility_manifest["artifacts"] assert "adapter_config.json" in compatibility_manifest["artifacts"] assert "meta-llama/" not in saved_chat_template assert "Claude" not in saved_chat_template assert "Maris AI" in saved_chat_template assert metrics["perplexity"] < 1000 assert metrics["benchmark_overall"] == 0.81 assert metrics["benchmark_gate_passed"] == 1.0 assert benchmark_manifest["artifact_type"] == "chat-benchmark-manifest" assert benchmark_gate["artifact_type"] == "benchmark-release-gate" assert benchmark_history["artifact_type"] == "chat-benchmark-history" assert benchmark_history["run_count"] == 1 assert benchmark_regression["artifact_type"] == "chat-benchmark-regression-report" assert benchmark_regression["status"] == "no-baseline" assert benchmark_feedback["artifact_type"] == "benchmark-feedback-reweighting" assert training_metrics["scoring_dashboard"]["train"]["sources"]["unknown"]["records"] == 1 assert training_metrics["scoring_dashboard"]["train"]["categories"]["general"]["records"] == 1 assert training_metrics["scoring_dashboard_train_sources_unknown_records"] == 1.0 assert training_metrics["scoring_dashboard_train_categories_general_records"] == 1.0 _assert_output_dir_uses_only_maris_identity(output_dir) assert benchmark_gate["passed"] is True assert preference_summary["artifact_type"] == "preference-dataset-summary" assert benchmark_manifest["score_manifest"]["pairwise_win_rate"] == 1.0 assert human_eval_summary["artifact_type"] == "human-eval-summary" assert blind_side_by_side["artifact_type"] == "blind-side-by-side-eval-set" def test_train_pushes_to_hub_when_enabled( tmp_path: Path, monkeypatch, ) -> None: class FakeDataset: def __init__(self, items): self.items = list(items) self.column_names = list(self.items[0].keys()) if self.items else [] def train_test_split(self, *, test_size, seed): del test_size, seed return { "train": FakeDataset(self.items[:1]), "test": FakeDataset(self.items[1:]), } def map(self, fn, *, batched, remove_columns, desc): assert batched is True del remove_columns, desc batch = {key: [item.get(key) for item in self.items] for key in self.column_names} transformed = fn(batch) size = len(next(iter(transformed.values()))) if transformed else 0 return FakeDataset( [{key: transformed[key][index] for key in transformed} for index in range(size)] ) def __len__(self): return len(self.items) monkeypatch.setenv("HF_TRAIN_PUSH_TO_HUB", "true") monkeypatch.setattr( "maris_core.training.train.load_hf_dataset", lambda _: { "train": FakeDataset( [ {"user": "Sveiki", "assistant": "Labdien"}, {"user": "Kā iet?", "assistant": "Labi"}, ] ) }, ) class FakeTokenizer: pad_token = None pad_token_id = None eos_token = "" eos_token_id = 99 @classmethod def from_pretrained(cls, model_name): assert model_name == DEFAULT_TRAINING_BASE_MODEL return cls() def __call__(self, texts, *, truncation, max_length, padding): del texts, truncation, max_length, padding return {"input_ids": [[1, 2, 3]], "attention_mask": [[1, 1, 1]]} def save_pretrained(self, output_dir): Path(output_dir, "tokenizer_config.json").write_text("{}", encoding="utf-8") class FakeModelConfig: pad_token_id = None class FakeModel: config = FakeModelConfig() @classmethod def from_pretrained(cls, model_name): assert model_name == DEFAULT_TRAINING_BASE_MODEL return cls() class FakeTrainingArguments: def __init__(self, **kwargs): self.kwargs = kwargs class FakeTrainResult: metrics = {"train_loss": 0.1} class FakeTrainer: last_instance = None def __init__(self, **kwargs): self.kwargs = kwargs self.push_kwargs = None FakeTrainer.last_instance = self def train(self): return FakeTrainResult() def evaluate(self): return {"eval_loss": 0.2} def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "config.json").write_text("{}", encoding="utf-8") def push_to_hub(self, **kwargs): self.push_kwargs = kwargs monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=FakeTrainingArguments, ), ) train(output_dir=str(tmp_path / "push-model"), max_seq_length=256) assert FakeTrainer.last_instance is not None assert FakeTrainer.last_instance.push_kwargs == { "commit_message": "Maris AI training sync (master)" } def test_train_prefers_existing_local_artifact_when_continue_mode_enabled( tmp_path: Path, monkeypatch, ) -> None: class FakeDataset: def __init__(self, items): self.items = list(items) self.column_names = list(self.items[0].keys()) if self.items else [] def train_test_split(self, *, test_size, seed): del test_size, seed return { "train": FakeDataset(self.items[:1]), "test": FakeDataset(self.items[1:]), } def map(self, fn, *, batched, remove_columns, desc): del desc, remove_columns assert batched is True batch = {key: [item.get(key) for item in self.items] for key in self.column_names} transformed = fn(batch) size = len(next(iter(transformed.values()))) if transformed else 0 return FakeDataset( [{key: transformed[key][index] for key in transformed} for index in range(size)] ) def __len__(self): return len(self.items) output_dir = tmp_path / "continued-model" output_dir.mkdir(parents=True, exist_ok=True) (output_dir / "config.json").write_text("{}", encoding="utf-8") import maris_core.training.train as train_module (output_dir / "training-config.json").write_text( json.dumps( { train_module.MODEL_SOURCE_FINGERPRINT_KEY: train_module._build_model_source_fingerprint( DEFAULT_TRAINING_BASE_MODEL ) } ), encoding="utf-8", ) monkeypatch.setattr( "maris_core.training.train.load_hf_dataset", lambda _: { "train": FakeDataset( [ {"user": "Sveiki", "assistant": "Labdien"}, {"user": "Kā iet?", "assistant": "Labi"}, ] ) }, ) captured_paths: dict[str, str] = {} class FakeTokenizer: pad_token = None pad_token_id = None eos_token = "" eos_token_id = 99 @classmethod def from_pretrained(cls, model_name): captured_paths["tokenizer"] = model_name return cls() def __call__(self, texts, *, truncation, max_length, padding): del texts, truncation, max_length, padding return {"input_ids": [[1, 2, 3]], "attention_mask": [[1, 1, 1]]} def save_pretrained(self, output_dir): Path(output_dir, "tokenizer_config.json").write_text("{}", encoding="utf-8") class FakeModelConfig: pad_token_id = None class FakeModel: config = FakeModelConfig() @classmethod def from_pretrained(cls, model_name, **kwargs): del kwargs captured_paths["model"] = model_name return cls() class FakeTrainingArguments: def __init__(self, **kwargs): self.kwargs = kwargs class FakeTrainer: def __init__(self, **kwargs): self.kwargs = kwargs def train(self): return types.SimpleNamespace(metrics={"train_loss": 0.1}) def evaluate(self): return {"eval_loss": 0.2} def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "config.json").write_text("{}", encoding="utf-8") monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=FakeTrainingArguments, ), ) train( output_dir=str(output_dir), continue_from_latest_artifact=True, max_seq_length=256, ) assert captured_paths["tokenizer"] == str(output_dir) assert captured_paths["model"] == str(output_dir) def test_train_does_not_auto_resume_from_incompatible_output_artifact( tmp_path: Path, monkeypatch ) -> None: output_dir = tmp_path / "incompatible-output" output_dir.mkdir(parents=True, exist_ok=True) (output_dir / "config.json").write_text("{}", encoding="utf-8") import maris_core.training.train as train_module (output_dir / "training-config.json").write_text( json.dumps( { train_module.MODEL_SOURCE_FINGERPRINT_KEY: train_module._build_model_source_fingerprint( "meta-llama/Llama-3.2-3B-Instruct" ) } ), encoding="utf-8", ) config = load_training_config( overrides={ "output_dir": str(output_dir), "model_name": "Qwen/Qwen2.5-1.5B-Instruct", "continue_from_latest_artifact": True, } ) assert train_module._resolve_training_model_source(config) == "Qwen/Qwen2.5-1.5B-Instruct" def test_train_restores_maris_artifacts_after_push_to_hub(tmp_path: Path, monkeypatch) -> None: class FakeDataset: def __init__(self, items): self.items = list(items) self.column_names = list(self.items[0].keys()) if self.items else [] def train_test_split(self, *, test_size, seed): del test_size, seed return { "train": FakeDataset(self.items[:1]), "test": FakeDataset(self.items[1:]), } def map(self, fn, *, batched, remove_columns, desc): assert batched is True del remove_columns, desc batch = {key: [item.get(key) for item in self.items] for key in self.column_names} transformed = fn(batch) size = len(next(iter(transformed.values()))) if transformed else 0 return FakeDataset( [{key: transformed[key][index] for key in transformed} for index in range(size)] ) def __len__(self): return len(self.items) monkeypatch.setenv("HF_TRAIN_PUSH_TO_HUB", "true") monkeypatch.setenv("HF_TOKEN", "token") monkeypatch.setattr( "maris_core.training.train.load_hf_dataset", lambda _: { "train": FakeDataset( [ {"user": "Sveiki", "assistant": "Labdien"}, {"user": "Kā iet?", "assistant": "Labi"}, ] ) }, ) class FakeTokenizer: pad_token = None pad_token_id = None eos_token = "" eos_token_id = 99 @classmethod def from_pretrained(cls, model_name): assert model_name == DEFAULT_TRAINING_BASE_MODEL return cls() def __call__(self, texts, *, truncation, max_length, padding): del texts, truncation, max_length, padding return {"input_ids": [[1, 2, 3]], "attention_mask": [[1, 1, 1]]} def save_pretrained(self, output_dir): Path(output_dir, "tokenizer_config.json").write_text( json.dumps( { "name_or_path": "MarisUK/maris-ai-master", "tokenizer_class": "Qwen2TokenizerFast", "chat_template": "You are Qwen and Claude in one assistant.", } ), encoding="utf-8", ) class FakeModelConfig: pad_token_id = None class FakeModel: config = FakeModelConfig() @classmethod def from_pretrained(cls, model_name): assert model_name == DEFAULT_TRAINING_BASE_MODEL return cls() class FakeTrainingArguments: def __init__(self, **kwargs): self.kwargs = kwargs class FakeTrainResult: metrics = {"train_loss": 0.1} class FakeTrainer: def __init__(self, **kwargs): self.kwargs = kwargs def train(self): return FakeTrainResult() def evaluate(self): return {"eval_loss": 0.2} def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "config.json").write_text( json.dumps( { "_name_or_path": "MarisUK/maris-ai-master", "model_type": "qwen2", "architectures": ["Qwen2ForCausalLM"], } ), encoding="utf-8", ) Path(output_dir, "adapter_config.json").write_text( json.dumps( { "base_model_name_or_path": "Qwen/Qwen3-Coder-480B-A35B-Instruct", "base_model_class": "Qwen2ForCausalLM", "parent_library": "transformers.models.qwen2.modeling_qwen2", "description": "Adapter built from DeepSeek and Mistral.", } ), encoding="utf-8", ) def push_to_hub(self, **kwargs): del kwargs output_dir = Path(self.kwargs["args"].kwargs["output_dir"]) Path(output_dir, "README.md").write_text( "\n".join( ( "---", "library_name: transformers", "datasets:", "- generator", "---", "# master", ) ) + "\n", encoding="utf-8", ) Path(output_dir, "config.json").write_text( json.dumps( { "_name_or_path": "Qwen/Qwen3-Coder-480B-A35B-Instruct", "model_type": "qwen2", "architectures": ["Qwen2ForCausalLM"], } ), encoding="utf-8", ) Path(output_dir, "tokenizer_config.json").write_text( json.dumps( { "name_or_path": "Qwen/Qwen3-Coder-480B-A35B-Instruct", "tokenizer_class": "Qwen2TokenizerFast", "chat_template": "Use meta-llama/Llama-3.2-3B-Instruct with Gemini.", } ), encoding="utf-8", ) Path(output_dir, "tokenizer.json").write_text( json.dumps( { "model": {"type": "BPE", "unk_token": "DeepSeek-Coder"}, "added_tokens": [{"content": "Anthropic"}], } ), encoding="utf-8", ) Path(output_dir, "chat_template.jinja").write_text( "System prompt from Anthropic Claude and OpenAI ChatGPT.", encoding="utf-8", ) upload_calls: list[dict[str, str]] = [] class FakeHfApi: def __init__(self, token=None): self.token = token def create_repo(self, **kwargs): upload_calls.append({"create_repo": kwargs}) def upload_folder(self, **kwargs): upload_calls.append(kwargs) monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=FakeTrainingArguments, ), ) monkeypatch.setitem(sys.modules, "huggingface_hub", types.SimpleNamespace(HfApi=FakeHfApi)) output_dir = tmp_path / "push-model" train(output_dir=str(output_dir), max_seq_length=256) assert "Maris AI Model" in (output_dir / "README.md").read_text(encoding="utf-8") assert "generated_from_trainer" not in (output_dir / "README.md").read_text(encoding="utf-8") assert json.loads((output_dir / "config.json").read_text(encoding="utf-8"))[ "_name_or_path" ] == ("MarisUK/maris-ai-master") assert ( json.loads((output_dir / "config.json").read_text(encoding="utf-8"))["model_type"] == "maris" ) assert json.loads((output_dir / "config.json").read_text(encoding="utf-8"))[ "architectures" ] == ["MarisCompatibleCausalLM"] assert ( json.loads((output_dir / "tokenizer_config.json").read_text(encoding="utf-8"))[ "name_or_path" ] == "MarisUK/maris-ai-master" ) assert ( json.loads((output_dir / "tokenizer_config.json").read_text(encoding="utf-8"))[ "tokenizer_class" ] == "MarisCompatibleTokenizer" ) assert ( "Maris AI" in json.loads((output_dir / "tokenizer_config.json").read_text(encoding="utf-8"))[ "chat_template" ] ) assert ( json.loads((output_dir / "tokenizer.json").read_text(encoding="utf-8"))["model"][ "unk_token" ] == "Maris AI" ) assert ( json.loads((output_dir / "tokenizer.json").read_text(encoding="utf-8"))["added_tokens"][0][ "content" ] == "Maris AI" ) assert ( json.loads((output_dir / "adapter_config.json").read_text(encoding="utf-8"))[ "base_model_name_or_path" ] == "MarisUK/maris-ai-master" ) assert ( json.loads((output_dir / "adapter_config.json").read_text(encoding="utf-8"))[ "base_model_class" ] == "MarisCompatibleCausalLM" ) assert ( "DeepSeek" not in json.loads((output_dir / "adapter_config.json").read_text(encoding="utf-8"))[ "description" ] ) assert "Anthropic" not in (output_dir / "chat_template.jinja").read_text(encoding="utf-8") assert "ChatGPT" not in (output_dir / "chat_template.jinja").read_text(encoding="utf-8") assert "Maris AI" in (output_dir / "chat_template.jinja").read_text(encoding="utf-8") assert (output_dir / MARIS_COMPATIBILITY_ARTIFACT_NAME).is_file() _assert_output_dir_uses_only_maris_identity(output_dir) assert upload_calls == [ { "create_repo": { "repo_id": "MarisUK/maris-ai-master", "repo_type": "model", "exist_ok": True, } }, { "folder_path": str(output_dir), "repo_id": "MarisUK/maris-ai-master", "repo_type": "model", "commit_message": "Maris AI artifact sync (master)", }, ] def test_export_model_creates_repo_before_upload(tmp_path: Path, monkeypatch) -> None: script_path = Path(__file__).resolve().parents[1] / "scripts" / "export_to_hf.py" spec = importlib.util.spec_from_file_location("export_to_hf", script_path) assert spec is not None and spec.loader is not None export_module = importlib.util.module_from_spec(spec) spec.loader.exec_module(export_module) model_dir = tmp_path / "model" model_dir.mkdir() model_dir.joinpath("config.json").write_text("{}", encoding="utf-8") monkeypatch.setenv("HF_TOKEN", "token") calls: list[dict[str, object]] = [] class FakeHfApi: def __init__(self, token=None): calls.append({"init": token}) def create_repo(self, **kwargs): calls.append({"create_repo": kwargs}) def upload_folder(self, **kwargs): calls.append({"upload_folder": kwargs}) monkeypatch.setitem(sys.modules, "huggingface_hub", types.SimpleNamespace(HfApi=FakeHfApi)) export_module.export_model(str(model_dir), "MarisUK/maris-ai-master") assert calls == [ {"init": "token"}, { "create_repo": { "repo_id": "MarisUK/maris-ai-master", "repo_type": "model", "exist_ok": True, } }, { "upload_folder": { "folder_path": str(model_dir), "repo_id": "MarisUK/maris-ai-master", "repo_type": "model", "commit_message": "Maris AI model export", } }, ] def test_export_model_publishes_branch_suite_to_runtime_repos(tmp_path: Path, monkeypatch) -> None: script_path = Path(__file__).resolve().parents[1] / "scripts" / "export_to_hf.py" spec = importlib.util.spec_from_file_location("export_to_hf", script_path) assert spec is not None and spec.loader is not None export_module = importlib.util.module_from_spec(spec) spec.loader.exec_module(export_module) suite_dir = tmp_path / "suite" suite_dir.mkdir() for branch_name in ("master", "coder", "image", "tts"): branch_dir = suite_dir / branch_name branch_dir.mkdir() branch_dir.joinpath("config.json").write_text("{}", encoding="utf-8") suite_dir.joinpath("branch-suite.json").write_text( json.dumps( { "branches": { "master": {"output_dir": str(suite_dir / "master")}, "coder": {"output_dir": str(suite_dir / "coder")}, "image": {"output_dir": str(suite_dir / "image")}, "tts": {"output_dir": str(suite_dir / "tts")}, } } ), encoding="utf-8", ) monkeypatch.setenv("HF_TOKEN", "token") calls: list[dict[str, object]] = [] class FakeHfApi: def __init__(self, token=None): calls.append({"init": token}) def create_repo(self, **kwargs): calls.append({"create_repo": kwargs}) def upload_folder(self, **kwargs): calls.append({"upload_folder": kwargs}) monkeypatch.setitem(sys.modules, "huggingface_hub", types.SimpleNamespace(HfApi=FakeHfApi)) export_module.export_model(str(suite_dir), "MarisUK/maris-ai-master") assert calls == [ {"init": "token"}, { "create_repo": { "repo_id": "MarisUK/maris-ai-master", "repo_type": "model", "exist_ok": True, } }, { "upload_folder": { "folder_path": str(suite_dir), "repo_id": "MarisUK/maris-ai-master", "repo_type": "model", "commit_message": "Maris AI model export", } }, { "create_repo": { "repo_id": "MarisUK/maris-ai-text", "repo_type": "model", "exist_ok": True, } }, { "upload_folder": { "folder_path": str(suite_dir / "master"), "repo_id": "MarisUK/maris-ai-text", "repo_type": "model", "commit_message": "Maris AI model export (master)", } }, { "create_repo": { "repo_id": "MarisUK/maris-ai-codex", "repo_type": "model", "exist_ok": True, } }, { "upload_folder": { "folder_path": str(suite_dir / "coder"), "repo_id": "MarisUK/maris-ai-codex", "repo_type": "model", "commit_message": "Maris AI model export (coder)", } }, { "create_repo": { "repo_id": "MarisUK/maris-ai-image", "repo_type": "model", "exist_ok": True, } }, { "upload_folder": { "folder_path": str(suite_dir / "image"), "repo_id": "MarisUK/maris-ai-image", "repo_type": "model", "commit_message": "Maris AI model export (image)", } }, { "create_repo": { "repo_id": "MarisUK/maris-tts-runtime", "repo_type": "model", "exist_ok": True, } }, { "upload_folder": { "folder_path": str(suite_dir / "tts"), "repo_id": "MarisUK/maris-tts-runtime", "repo_type": "model", "commit_message": "Maris AI model export (tts)", } }, ] def test_export_model_discovers_fallback_branch_dirs_without_manifest( tmp_path: Path, monkeypatch ) -> None: script_path = Path(__file__).resolve().parents[1] / "scripts" / "export_to_hf.py" spec = importlib.util.spec_from_file_location("export_to_hf", script_path) assert spec is not None and spec.loader is not None export_module = importlib.util.module_from_spec(spec) spec.loader.exec_module(export_module) suite_dir = tmp_path / "suite" suite_dir.mkdir() suite_dir.joinpath("config.json").write_text("{}", encoding="utf-8") for branch_name in ("master", "coder"): branch_dir = suite_dir / branch_name branch_dir.mkdir() branch_dir.joinpath("config.json").write_text("{}", encoding="utf-8") monkeypatch.setenv("HF_TOKEN", "token") calls: list[dict[str, object]] = [] class FakeHfApi: def __init__(self, token=None): calls.append({"init": token}) def create_repo(self, **kwargs): calls.append({"create_repo": kwargs}) def upload_folder(self, **kwargs): calls.append({"upload_folder": kwargs}) monkeypatch.setitem(sys.modules, "huggingface_hub", types.SimpleNamespace(HfApi=FakeHfApi)) export_module.export_model(str(suite_dir), "MarisUK/maris-ai-master") assert calls == [ {"init": "token"}, { "create_repo": { "repo_id": "MarisUK/maris-ai-master", "repo_type": "model", "exist_ok": True, } }, { "upload_folder": { "folder_path": str(suite_dir), "repo_id": "MarisUK/maris-ai-master", "repo_type": "model", "commit_message": "Maris AI model export", } }, { "create_repo": { "repo_id": "MarisUK/maris-ai-text", "repo_type": "model", "exist_ok": True, } }, { "upload_folder": { "folder_path": str(suite_dir / "master"), "repo_id": "MarisUK/maris-ai-text", "repo_type": "model", "commit_message": "Maris AI model export (master)", } }, { "create_repo": { "repo_id": "MarisUK/maris-ai-codex", "repo_type": "model", "exist_ok": True, } }, { "upload_folder": { "folder_path": str(suite_dir / "coder"), "repo_id": "MarisUK/maris-ai-codex", "repo_type": "model", "commit_message": "Maris AI model export (coder)", } }, ] def test_train_filters_unsupported_training_arguments( tmp_path: Path, monkeypatch, ) -> None: class FakeDataset: def __init__(self, items): self.items = list(items) self.column_names = list(self.items[0].keys()) if self.items else [] def train_test_split(self, *, test_size, seed): del test_size, seed return { "train": FakeDataset(self.items[:1]), "test": FakeDataset(self.items[1:]), } def map(self, fn, *, batched, remove_columns, desc): assert batched is True del remove_columns, desc batch = {key: [item.get(key) for item in self.items] for key in self.column_names} transformed = fn(batch) size = len(next(iter(transformed.values()))) if transformed else 0 return FakeDataset( [{key: transformed[key][index] for key in transformed} for index in range(size)] ) def __len__(self): return len(self.items) monkeypatch.setattr( "maris_core.training.train.load_hf_dataset", lambda _: { "train": FakeDataset( [ {"user": "Sveiki", "assistant": "Labdien"}, {"user": "Kā iet?", "assistant": "Labi"}, ] ) }, ) class FakeTokenizer: pad_token = None pad_token_id = None eos_token = "" eos_token_id = 99 @classmethod def from_pretrained(cls, model_name): assert model_name == DEFAULT_TRAINING_BASE_MODEL return cls() def __call__(self, texts, *, truncation, max_length, padding): assert truncation is True assert max_length == 256 assert padding is False return { "input_ids": [[1, 2, 3] for _ in texts], "attention_mask": [[1, 1, 1] for _ in texts], } def save_pretrained(self, output_dir): Path(output_dir, "tokenizer.json").write_text("{}", encoding="utf-8") class FakeModelConfig: pad_token_id = None class FakeModel: config = FakeModelConfig() @classmethod def from_pretrained(cls, model_name): assert model_name == DEFAULT_TRAINING_BASE_MODEL return cls() class StrictTrainingArguments: def __init__( self, *, output_dir, num_train_epochs, learning_rate, per_device_train_batch_size, per_device_eval_batch_size, gradient_accumulation_steps, warmup_ratio, weight_decay, logging_steps, save_steps, eval_steps, save_total_limit, lr_scheduler_type, seed, fp16, bf16, report_to, save_safetensors, remove_unused_columns, eval_strategy, load_best_model_at_end, metric_for_best_model, greater_is_better, ): self.kwargs = { "output_dir": output_dir, "num_train_epochs": num_train_epochs, "learning_rate": learning_rate, "per_device_train_batch_size": per_device_train_batch_size, "per_device_eval_batch_size": per_device_eval_batch_size, "gradient_accumulation_steps": gradient_accumulation_steps, "warmup_ratio": warmup_ratio, "weight_decay": weight_decay, "logging_steps": logging_steps, "save_steps": save_steps, "eval_steps": eval_steps, "save_total_limit": save_total_limit, "lr_scheduler_type": lr_scheduler_type, "seed": seed, "fp16": fp16, "bf16": bf16, "report_to": report_to, "save_safetensors": save_safetensors, "remove_unused_columns": remove_unused_columns, "eval_strategy": eval_strategy, "load_best_model_at_end": load_best_model_at_end, "metric_for_best_model": metric_for_best_model, "greater_is_better": greater_is_better, } class FakeTrainResult: metrics = {"train_loss": 0.25} class FakeTrainer: last_instance = None def __init__( self, *, model, args, train_dataset, eval_dataset=None, data_collator=None, ): del model, data_collator self.args = args self.train_dataset = train_dataset self.eval_dataset = eval_dataset FakeTrainer.last_instance = self def train(self): return FakeTrainResult() def evaluate(self): return {"eval_loss": 0.5} def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "model.bin").write_text("ok", encoding="utf-8") monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=StrictTrainingArguments, ), ) metrics = train(output_dir=str(tmp_path / "trained-model"), max_seq_length=256) assert metrics["eval_loss"] == 0.5 assert FakeTrainer.last_instance is not None assert "overwrite_output_dir" not in FakeTrainer.last_instance.args.kwargs assert FakeTrainer.last_instance.args.kwargs["eval_strategy"] == "steps" def test_build_branch_training_configs_creates_branch_output_dirs() -> None: configs = build_branch_training_configs( load_training_config( overrides={ "output_dir": "/tmp/maris-branch", "eval_dataset_repo": "MarisUK/maris-ai-evals", } ) ) branch_names = {config.branch_name for config in configs} assert {"master", "coder", "planner", "image", "music", "tts", "stt", "video"} == branch_names coder_config = next(config for config in configs if config.branch_name == "coder") assert coder_config.output_dir.endswith("/coder") assert coder_config.eval_dataset_repo == "MarisUK/maris-ai-evals" assert coder_config.benchmark_gate_enabled is True assert coder_config.benchmark_min_overall >= 0.76 assert coder_config.benchmark_dataset_path.endswith( "core-python/evals/coder_release_benchmark.json" ) assert coder_config.preference_dataset_path.endswith( "core-python/evals/coder_preference_dataset.json" ) assert coder_config.quality_min_text_chars >= 18 assert coder_config.category_weight_map["coding"] >= 1.35 assert coder_config.category_weight_map["grounding"] >= 1.25 planner_config = next(config for config in configs if config.branch_name == "planner") assert planner_config.benchmark_gate_enabled is True assert planner_config.benchmark_min_overall >= 0.76 assert planner_config.benchmark_dataset_path.endswith( "core-python/evals/planner_release_benchmark.json" ) master_config = next(config for config in configs if config.branch_name == "master") assert master_config.hub_model_id == "MarisUK/maris-ai-text" assert master_config.benchmark_gate_enabled is True assert master_config.quality_min_text_chars >= 12 image_config = next(config for config in configs if config.branch_name == "image") assert image_config.adapter_type == "specialist_model" assert image_config.hub_model_id == "MarisUK/maris-ai-image" def test_train_branch_suite_writes_external_manifests_for_specialists( tmp_path: Path, monkeypatch, ) -> None: base_config = load_training_config(overrides={"output_dir": str(tmp_path / "branches")}) monkeypatch.setattr( "maris_core.training.train.train_with_config", lambda branch_config: {"branch_len": float(len(branch_config.branch_name))}, ) results = train_branch_suite(base_config) branch_suite = json.loads( (tmp_path / "branches" / "branch-suite.json").read_text(encoding="utf-8") ) assert results["master"]["status"] == "trained" assert results["master"]["maris_origin"] == "Maris AI" assert results["coder"]["status"] == "trained" assert results["image"]["status"] == "external_specialist" assert results["tts"]["status"] == "external_specialist" assert (tmp_path / "branches" / "image" / "branch-config.json").is_file() assert (tmp_path / "branches" / "tts" / "branch-config.json").is_file() assert (tmp_path / "branches" / "branch-suite.json").is_file() assert branch_suite["artifact_type"] == "branch-suite" assert branch_suite["maris_origin"] == "Maris AI" assert branch_suite["dataset_repo"] == "MarisUK/maris-ai-memory" assert branch_suite["branches"]["image"]["maris_origin"] == "Maris AI" assert branch_suite["branches"]["stt"]["maris_model_id"] == "MarisUK/maris-stt-runtime" def test_post_training_benchmark_results_use_maris_model_id(tmp_path: Path, monkeypatch) -> None: benchmark_path = tmp_path / "benchmark.json" benchmark_path.write_text( json.dumps([{"name": "identity", "message": "Kas tu esi?", "expected_terms": ["Maris"]}]), encoding="utf-8", ) config = load_training_config( overrides={ "benchmark_dataset_path": str(benchmark_path), "benchmark_levels": ["ci"], "hub_model_id": "MarisUK/maris-ai-master-trained", } ) class FakePipeline: pass def fake_pipeline(*args, **kwargs): del args, kwargs return FakePipeline() async def fake_run_chat_benchmark_with_responder(cases, *, responder, concurrency): del concurrency response = await responder(cases[0]) return [types.SimpleNamespace(model=response["model"], response=response["response"])] def fake_build_chat_benchmark_manifest(results, *, benchmark_name, branch, model): return { "benchmark_name": benchmark_name, "branch": branch, "model": model, "results": [{"model": results[0].model, "response": results[0].response}], } monkeypatch.setitem(sys.modules, "transformers", types.SimpleNamespace(pipeline=fake_pipeline)) monkeypatch.setattr( "maris_core.training.train.run_chat_benchmark_with_responder", fake_run_chat_benchmark_with_responder, ) monkeypatch.setattr( "maris_core.training.train.call_generation_pipeline", lambda *args, **kwargs: [{"generated_text": "Es esmu Maris AI."}], ) monkeypatch.setattr( "maris_core.training.train.build_chat_benchmark_manifest", fake_build_chat_benchmark_manifest, ) payload = asyncio.run( _run_post_training_benchmark(config, model_path=str(tmp_path / "trained-model")) ) assert payload == { "benchmark_name": config.benchmark_name, "branch": config.branch_name, "model": "MarisUK/maris-ai-master-trained", "results": [ { "model": "MarisUK/maris-ai-master-trained", "response": "Es esmu Maris AI.", } ], } def test_post_training_benchmark_filters_cases_by_branch(tmp_path: Path, monkeypatch) -> None: benchmark_path = tmp_path / "benchmark.json" benchmark_path.write_text( json.dumps( [ {"name": "master-case", "message": "Sveiki", "branches": ["master"], "level": "ci"}, { "name": "coder-case", "message": "Uzraksti Python helperi", "profile": "coder", "branches": ["coder"], "level": "ci", }, ] ), encoding="utf-8", ) config = load_training_config( overrides={ "branch_name": "coder", "benchmark_dataset_path": str(benchmark_path), "benchmark_levels": ["ci"], } ) class FakePipeline: pass captured_case_names: list[str] = [] def fake_pipeline(*args, **kwargs): del args, kwargs return FakePipeline() async def fake_run_chat_benchmark_with_responder(cases, *, responder, concurrency): del responder, concurrency captured_case_names.extend(case.name for case in cases) return [] monkeypatch.setitem(sys.modules, "transformers", types.SimpleNamespace(pipeline=fake_pipeline)) monkeypatch.setattr( "maris_core.training.train.run_chat_benchmark_with_responder", fake_run_chat_benchmark_with_responder, ) monkeypatch.setattr( "maris_core.training.train.build_chat_benchmark_manifest", lambda results, *, benchmark_name, branch, model: { "benchmark_name": benchmark_name, "branch": branch, "model": model, "results": results, }, ) asyncio.run(_run_post_training_benchmark(config, model_path=str(tmp_path / "trained-model"))) assert captured_case_names == ["coder-case"] def test_filter_records_for_branch_keeps_coder_specific_mix() -> None: records = [ { "type": "conversation", "user": "Sveiki", "assistant": "Čau", }, { "type": "code", "prompt": "Salabo parseri", "metadata": {"language": "python", "task": "bugfix", "project_area": "core-python"}, }, { "type": "autonomous", "prompt": "Investigate CI", "metadata": {"workflow": "ci-triage", "project_area": "operations"}, }, ] filtered, report = _filter_records_for_branch( records, branch_name="coder", split_name="train", ) assert len(filtered) == 1 assert filtered[0]["type"] == "code" assert report.kept_records == 1 assert report.dropped_records == 2 def test_filter_records_for_branch_keeps_master_general_mix() -> None: records = [ {"type": "conversation", "user": "Sveiki", "assistant": "Čau"}, {"type": "code", "prompt": "Uzraksti helperi", "profile": "coder"}, {"type": "autonomous", "prompt": "Plan sprint", "branch": "planner"}, ] filtered, report = _filter_records_for_branch( records, branch_name="master", split_name="train", ) assert [record["type"] for record in filtered] == ["conversation"] assert report.kept_records == 1 assert report.dropped_records == 2 def test_filter_records_for_branch_uses_custom_rule_map() -> None: records = [ {"type": "conversation", "user": "Sveiki", "assistant": "Čau"}, {"type": "code", "prompt": "Uzraksti helperi", "profile": "coder"}, ] filtered, report = _filter_records_for_branch( records, branch_name="coder", split_name="train", branch_filter_rules={ "coder": { "include_record_types": ["conversation"], "exclude_explicit_branches": ["planner"], } }, ) assert [record["type"] for record in filtered] == ["conversation"] assert report.kept_records == 1 assert report.dropped_records == 1 def test_filter_preference_examples_for_branch_keeps_coder_examples_only() -> None: examples_path = Path( "/home/runner/work/Maris-MI/Maris-MI/core-python/evals/coder_preference_dataset.json" ) examples = load_preference_dataset(examples_path) filtered = _filter_preference_examples_for_branch( examples, branch_name="coder", ) assert filtered assert all((example.branch or "").lower() == "coder" for example in filtered) def test_filter_preference_examples_for_branch_uses_custom_rule_map() -> None: examples_path = Path( "/home/runner/work/Maris-MI/Maris-MI/core-python/evals/coder_preference_dataset.json" ) examples = load_preference_dataset(examples_path) filtered = _filter_preference_examples_for_branch( examples, branch_name="planner", branch_filter_rules={ "planner": { "include_task_types": ["repo-level"], } }, ) assert len(filtered) >= 1 assert all(example.task_type == "repo-level" for example in filtered) def test_train_uses_external_eval_dataset_when_configured(tmp_path: Path, monkeypatch) -> None: dataset_calls: list[str] = [] class FakeSplit(list): column_names = ["text"] def map(self, function, **kwargs): del kwargs batch = {"text": [item["text"] for item in self]} mapped = function(batch) size = len(next(iter(mapped.values()))) if mapped else 0 return FakeSplit( [{key: value[index] for key, value in mapped.items()} for index in range(size)] ) def train_test_split(self, *, test_size, seed): del test_size, seed midpoint = max(1, len(self) - 1) return {"train": FakeSplit(self[:midpoint]), "test": FakeSplit(self[midpoint:])} def fake_load_hf_dataset(repo_id: str): dataset_calls.append(repo_id) if repo_id == "MarisUK/maris-ai-memory": return {"train": FakeSplit([{"text": "train-1"}, {"text": "train-2"}])} if repo_id == "MarisUK/maris-ai-evals": return {"train": FakeSplit([{"text": "eval-1"}])} raise AssertionError(f"Unexpected repo id: {repo_id}") class FakeTokenizer: pad_token = None eos_token = "" pad_token_id = None eos_token_id = 7 @classmethod def from_pretrained(cls, model_name): del model_name return cls() def __call__(self, texts, **kwargs): del kwargs return { "input_ids": [[index + 1] for index, _ in enumerate(texts)], "attention_mask": [[1] for _ in texts], } def save_pretrained(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "tokenizer.json").write_text("{}", encoding="utf-8") class FakeModel: config = types.SimpleNamespace(pad_token_id=None) @classmethod def from_pretrained(cls, model_name): del model_name return cls() class FakeTrainingArguments: def __init__(self, **kwargs): self.kwargs = kwargs class FakeTrainer: last_instance = None def __init__(self, *, model, args, train_dataset, eval_dataset=None, data_collator=None): del model, data_collator self.args = args self.train_dataset = train_dataset self.eval_dataset = eval_dataset FakeTrainer.last_instance = self def train(self): return types.SimpleNamespace(metrics={"train_loss": 0.1}) def evaluate(self): return {"eval_loss": 0.2} def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "model.bin").write_text("ok", encoding="utf-8") monkeypatch.setattr("maris_core.training.train.load_hf_dataset", fake_load_hf_dataset) monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=FakeTrainingArguments, ), ) metrics = train( output_dir=str(tmp_path / "trained-model"), dataset_repos=["MarisUK/maris-ai-memory"], eval_dataset_repo="MarisUK/maris-ai-evals", eval_dataset_repos=["MarisUK/maris-ai-evals"], ) assert metrics["eval_loss"] == 0.2 assert dataset_calls == ["MarisUK/maris-ai-memory", "MarisUK/maris-ai-evals"] assert FakeTrainer.last_instance is not None assert len(FakeTrainer.last_instance.train_dataset) == 1 assert len(FakeTrainer.last_instance.eval_dataset) == 1 def test_train_merges_multiple_dataset_repos_for_training_and_eval( tmp_path: Path, monkeypatch, ) -> None: dataset_calls: list[str] = [] class FakeSplit(list): column_names = ["text"] def map(self, function, **kwargs): del kwargs batch = {"text": [item["text"] for item in self]} mapped = function(batch) size = len(next(iter(mapped.values()))) if mapped else 0 return FakeSplit( [{key: value[index] for key, value in mapped.items()} for index in range(size)] ) repo_rows = { "MarisUK/maris-ai-memory": { "train": [{"text": "memory-train"}], "validation": [{"text": "memory-val"}], }, "MarisUK/maris-ai-lv-memory": { "train": [{"text": "lv-train"}], "validation": [{"text": "lv-val"}], }, "MarisUK/maris-ai-evals": { "train": [{"text": "eval-train"}], "validation": [{"text": "eval-val"}], }, "MarisUK/maris-ai-benchmark": { "train": [{"text": "bench-train"}], "validation": [{"text": "bench-val"}], }, } def fake_load_hf_dataset(repo_id: str): dataset_calls.append(repo_id) if repo_id not in repo_rows: raise AssertionError(f"Unexpected repo id: {repo_id}") payload = repo_rows[repo_id] return {split_name: FakeSplit(list(records)) for split_name, records in payload.items()} class FakeTokenizer: pad_token = None eos_token = "" pad_token_id = None eos_token_id = 7 @classmethod def from_pretrained(cls, model_name): del model_name return cls() def __call__(self, texts, **kwargs): del kwargs return { "input_ids": [[index + 1] for index, _ in enumerate(texts)], "attention_mask": [[1] for _ in texts], } def save_pretrained(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "tokenizer.json").write_text("{}", encoding="utf-8") class FakeModel: config = types.SimpleNamespace(pad_token_id=None) @classmethod def from_pretrained(cls, model_name): del model_name return cls() class FakeTrainingArguments: def __init__(self, **kwargs): self.kwargs = kwargs class FakeTrainer: last_instance = None def __init__(self, *, model, args, train_dataset, eval_dataset=None, data_collator=None): del model, data_collator self.args = args self.train_dataset = train_dataset self.eval_dataset = eval_dataset FakeTrainer.last_instance = self def train(self): return types.SimpleNamespace(metrics={"train_loss": 0.1}) def evaluate(self): return {"eval_loss": 0.2} def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "model.bin").write_text("ok", encoding="utf-8") monkeypatch.setattr("maris_core.training.train.load_hf_dataset", fake_load_hf_dataset) monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=FakeTrainingArguments, ), ) metrics = train( output_dir=str(tmp_path / "trained-model"), dataset_repo="MarisUK/maris-ai-memory", dataset_repos=[ "MarisUK/maris-ai-memory", "MarisUK/maris-ai-lv-memory", "MarisUK/maris-ai-evals", "MarisUK/maris-ai-benchmark", ], eval_dataset_repo="MarisUK/maris-ai-evals", eval_dataset_repos=[ "MarisUK/maris-ai-evals", "MarisUK/maris-ai-benchmark", ], ) assert metrics["eval_loss"] == 0.2 assert dataset_calls == [ "MarisUK/maris-ai-memory", "MarisUK/maris-ai-lv-memory", "MarisUK/maris-ai-evals", "MarisUK/maris-ai-benchmark", "MarisUK/maris-ai-evals", "MarisUK/maris-ai-benchmark", ] assert FakeTrainer.last_instance is not None assert len(FakeTrainer.last_instance.train_dataset) == 4 assert len(FakeTrainer.last_instance.eval_dataset) == 2 def test_evaluate_with_config_prefers_external_eval_dataset(tmp_path: Path, monkeypatch) -> None: dataset_calls: list[str] = [] trained_model_dir = tmp_path / "trained-model" trained_model_dir.mkdir(parents=True, exist_ok=True) (trained_model_dir / "config.json").write_text( json.dumps( { "_name_or_path": "MarisUK/maris-ai-master", "model_type": "qwen2", "architectures": ["Qwen2ForCausalLM"], "tokenizer_class": "Qwen2TokenizerFast", "auto_map": {"AutoModelForCausalLM": "qwen2.modeling_qwen2.Qwen2ForCausalLM"}, } ), encoding="utf-8", ) (trained_model_dir / "tokenizer_config.json").write_text( json.dumps( { "name_or_path": "MarisUK/maris-ai-master", "tokenizer_class": "Qwen2TokenizerFast", } ), encoding="utf-8", ) write_maris_compatibility_artifact( trained_model_dir, maris_model_id="MarisUK/maris-ai-master", ) apply_maris_compatibility_identity(trained_model_dir) class FakeSplit(list): column_names = ["text"] def map(self, function, **kwargs): del kwargs batch = {"text": [item["text"] for item in self]} mapped = function(batch) size = len(next(iter(mapped.values()))) if mapped else 0 return FakeSplit( [{key: value[index] for key, value in mapped.items()} for index in range(size)] ) def fake_load_hf_dataset(repo_id: str): dataset_calls.append(repo_id) if repo_id == "MarisUK/maris-ai-evals": return {"train": FakeSplit([{"text": "eval-1"}, {"text": "eval-2"}])} raise AssertionError(f"Unexpected repo id: {repo_id}") class FakeTokenizer: pad_token = None eos_token = "" pad_token_id = None eos_token_id = 7 @classmethod def from_pretrained(cls, model_name): loaded_dir = Path(model_name) assert loaded_dir != trained_model_dir tokenizer_config = json.loads( loaded_dir.joinpath("tokenizer_config.json").read_text(encoding="utf-8") ) assert tokenizer_config["tokenizer_class"] == "Qwen2TokenizerFast" return cls() def __call__(self, texts, **kwargs): del kwargs return { "input_ids": [[index + 1] for index, _ in enumerate(texts)], "attention_mask": [[1] for _ in texts], } class FakeModel: @classmethod def from_pretrained(cls, model_name): loaded_dir = Path(model_name) assert loaded_dir != trained_model_dir model_config = json.loads( loaded_dir.joinpath("config.json").read_text(encoding="utf-8") ) assert model_config["model_type"] == "qwen2" assert model_config["architectures"] == ["Qwen2ForCausalLM"] return cls() class FakeTrainingArguments: def __init__(self, **kwargs): self.kwargs = kwargs class FakeTrainer: def __init__(self, *, model, args, eval_dataset=None, data_collator=None): del model, args, data_collator self.eval_dataset = eval_dataset def evaluate(self): return {"eval_loss": 0.3} monkeypatch.setattr("maris_core.training.train.load_hf_dataset", fake_load_hf_dataset) monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=FakeTrainingArguments, ), ) config = load_training_config( overrides={ "output_dir": str(trained_model_dir), "eval_dataset_repo": "MarisUK/maris-ai-evals", "eval_dataset_repos": ["MarisUK/maris-ai-evals"], "benchmark_dataset_path": str(tmp_path / "benchmark.json"), "benchmark_levels": ["ci"], } ) async def fake_benchmark(config, *, model_path): assert model_path.endswith("trained-model") return { "artifact_type": "chat-benchmark-manifest", "benchmark_name": config.benchmark_name, "branch": config.branch_name, "model": config.hub_model_id, "generated_at": "2026-04-16T00:00:00Z", "score_manifest": { "overall": 0.79, "reasoning": 0.76, "factuality": 0.75, "helpfulness": 0.8, "execution": 0.75, }, "category_scores": {"coding": 0.74}, "execution_language_pass_rates": {"python": 1.0}, "execution_language_scores": {"python": 0.74}, "category_execution_pass_rates": {"coding": 1.0}, } monkeypatch.setattr("maris_core.training.train._run_post_training_benchmark", fake_benchmark) metrics = evaluate_with_config(config, model_path=str(trained_model_dir)) assert metrics["eval_loss"] == 0.3 assert metrics["eval_examples"] == 2.0 assert metrics["benchmark_overall"] == 0.79 assert metrics["benchmark_gate_passed"] == 1.0 assert metrics["benchmark_regressions"] == 0.0 assert dataset_calls == ["MarisUK/maris-ai-evals"] assert (trained_model_dir / "benchmark-manifest.json").is_file() assert (trained_model_dir / "benchmark-history.json").is_file() assert (trained_model_dir / "benchmark-regression-report.json").is_file() assert ( json.loads((trained_model_dir / "config.json").read_text(encoding="utf-8"))["model_type"] == "maris" ) assert ( json.loads((trained_model_dir / "tokenizer_config.json").read_text(encoding="utf-8"))[ "tokenizer_class" ] == "MarisCompatibleTokenizer" ) def test_load_training_config_reads_peft_and_preference_optimization_settings( tmp_path: Path, ) -> None: config_path = tmp_path / "training.json" config_path.write_text( json.dumps( { "adapter_type": "qlora", "lora_r": 32, "lora_alpha": 64, "lora_dropout": 0.15, "lora_bias": "all", "peft_target_modules": ["q_proj", "v_proj"], "qlora_quant_type": "fp4", "qlora_use_double_quant": False, "qlora_compute_dtype": "bfloat16", "preference_dataset_path": "/tmp/preferences.json", "preference_optimization": "dpo", "preference_beta": 0.25, "preference_max_prompt_length": 256, "preference_max_length": 768, "preference_reference_model": "MarisUK/maris-ai-master", } ), encoding="utf-8", ) config = load_training_config(str(config_path)) assert config.adapter_type == "qlora" assert config.lora_r == 32 assert config.lora_alpha == 64 assert config.lora_dropout == 0.15 assert config.lora_bias == "all" assert config.peft_target_modules == ["q_proj", "v_proj"] assert config.qlora_quant_type == "fp4" assert config.qlora_use_double_quant is False assert config.qlora_compute_dtype == "bfloat16" assert config.preference_optimization == "dpo" assert config.preference_beta == 0.25 assert config.preference_max_prompt_length == 256 assert config.preference_max_length == 768 assert config.preference_reference_model == "MarisUK/maris-ai-master" def test_train_runs_qlora_and_dpo_preference_stage(tmp_path: Path, monkeypatch) -> None: class FakeDataset: def __init__(self, items): self.items = list(items) self.column_names = list(self.items[0].keys()) if self.items else [] def train_test_split(self, *, test_size, seed): del test_size, seed return { "train": FakeDataset(self.items[:1]), "test": FakeDataset(self.items[1:]), } def map(self, fn, *, batched, remove_columns, desc): assert batched is True del remove_columns, desc batch = {key: [item.get(key) for item in self.items] for key in self.column_names} transformed = fn(batch) size = len(next(iter(transformed.values()))) if transformed else 0 return FakeDataset( [{key: transformed[key][index] for key in transformed} for index in range(size)] ) def __len__(self): return len(self.items) monkeypatch.setattr( "maris_core.training.train.load_hf_dataset", lambda _: { "train": FakeDataset( [ {"user": "Sveiki", "assistant": "Labdien"}, {"user": "Kas jauns?", "assistant": "Viss kārtībā"}, ] ) }, ) preference_dataset_path = tmp_path / "preferences.json" preference_dataset_path.write_text( json.dumps( [ { "prompt": "Atbildi korekti", "chosen": "Šī ir labākā atbilde.", "rejected": "Nē.", "source": "human_review", } ] ), encoding="utf-8", ) model_load_calls: list[dict[str, object]] = [] bnb_calls: list[dict[str, object]] = [] lora_config_calls: list[dict[str, object]] = [] class FakeTokenizer: pad_token = None pad_token_id = None eos_token = "" eos_token_id = 99 @classmethod def from_pretrained(cls, model_name): return cls() def __call__(self, texts, *, truncation, max_length, padding): del truncation, max_length, padding return { "input_ids": [[1, 2, 3] for _ in texts], "attention_mask": [[1, 1, 1] for _ in texts], } def save_pretrained(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "tokenizer_config.json").write_text("{}", encoding="utf-8") class FakeBitsAndBytesConfig: def __init__(self, **kwargs): bnb_calls.append(kwargs) self.kwargs = kwargs class FakeModel: def __init__(self): self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True) self.prepared_for_kbit = False self.peft_config = None self.trainable_parameters_printed = False @classmethod def from_pretrained(cls, model_name, **kwargs): model_load_calls.append({"model_name": model_name, "kwargs": kwargs}) return cls() def print_trainable_parameters(self): self.trainable_parameters_printed = True class FakeTrainingArguments: def __init__(self, **kwargs): self.kwargs = kwargs class FakeTrainResult: metrics = {"train_loss": 0.2} class FakeTrainer: def __init__(self, *, model, args, train_dataset, eval_dataset=None, data_collator=None): del data_collator self.model = model self.args = args self.train_dataset = train_dataset self.eval_dataset = eval_dataset def train(self): return FakeTrainResult() def evaluate(self): return {"eval_loss": 0.4} def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "adapter_config.json").write_text("{}", encoding="utf-8") Path(output_dir, "config.json").write_text("{}", encoding="utf-8") fake_transformers = types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, BitsAndBytesConfig=FakeBitsAndBytesConfig, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=FakeTrainingArguments, ) monkeypatch.setitem(sys.modules, "transformers", fake_transformers) class FakeLoraConfig: def __init__(self, **kwargs): lora_config_calls.append(kwargs) self.kwargs = kwargs class FakeAutoPeftModelForCausalLM: @classmethod def from_pretrained(cls, model_name, **kwargs): model_load_calls.append({"model_name": model_name, "kwargs": kwargs, "auto_peft": True}) return FakeModel() def fake_prepare_model_for_kbit_training(model, use_gradient_checkpointing): model.prepared_for_kbit = use_gradient_checkpointing return model def fake_get_peft_model(model, peft_config): model.peft_config = peft_config return model monkeypatch.setitem( sys.modules, "peft", types.SimpleNamespace( AutoPeftModelForCausalLM=FakeAutoPeftModelForCausalLM, LoraConfig=FakeLoraConfig, TaskType=types.SimpleNamespace(CAUSAL_LM="CAUSAL_LM"), get_peft_model=fake_get_peft_model, prepare_model_for_kbit_training=fake_prepare_model_for_kbit_training, ), ) class FakeDPOConfig: def __init__(self, **kwargs): self.kwargs = kwargs class FakeDPOTrainer: last_instance = None def __init__(self, **kwargs): self.kwargs = kwargs FakeDPOTrainer.last_instance = self def train(self): return types.SimpleNamespace(metrics={"loss": 0.12}) def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "adapter_config.json").write_text("{}", encoding="utf-8") Path(output_dir, "config.json").write_text("{}", encoding="utf-8") monkeypatch.setitem( sys.modules, "trl", types.SimpleNamespace(DPOConfig=FakeDPOConfig, DPOTrainer=FakeDPOTrainer), ) output_dir = tmp_path / "trained-model" metrics = train( output_dir=str(output_dir), max_seq_length=256, adapter_type="qlora", qlora_compute_dtype="float16", qlora_quant_type="nf4", qlora_use_double_quant=True, lora_r=8, lora_alpha=16, lora_dropout=0.1, peft_target_modules=["q_proj", "v_proj"], preference_dataset_path=str(preference_dataset_path), preference_optimization="dpo", preference_beta=0.2, preference_max_prompt_length=128, preference_max_length=512, ) assert metrics["train_loss"] == 0.2 assert metrics["preference_loss"] == 0.12 assert metrics["preference_examples"] == 1.0 assert metrics["preference_stage"] == 1.0 assert bnb_calls[0]["load_in_4bit"] is True assert bnb_calls[0]["bnb_4bit_quant_type"] == "nf4" assert lora_config_calls[0]["r"] == 8 assert lora_config_calls[0]["lora_alpha"] == 16 assert lora_config_calls[0]["target_modules"] == ["q_proj", "v_proj"] assert FakeDPOTrainer.last_instance is not None assert "ref_model" in FakeDPOTrainer.last_instance.kwargs assert len(FakeDPOTrainer.last_instance.kwargs["train_dataset"]) == 1 assert any(call.get("auto_peft") for call in model_load_calls) def test_train_runs_orpo_preference_stage_without_reference_model( tmp_path: Path, monkeypatch, ) -> None: class FakeDataset: def __init__(self, items): self.items = list(items) self.column_names = list(self.items[0].keys()) if self.items else [] def train_test_split(self, *, test_size, seed): del test_size, seed return { "train": FakeDataset(self.items[:1]), "test": FakeDataset(self.items[1:]), } def map(self, fn, *, batched, remove_columns, desc): del remove_columns, desc batch = {key: [item.get(key) for item in self.items] for key in self.column_names} transformed = fn(batch) size = len(next(iter(transformed.values()))) if transformed else 0 return FakeDataset( [{key: transformed[key][index] for key in transformed} for index in range(size)] ) def __len__(self): return len(self.items) monkeypatch.setattr( "maris_core.training.train.load_hf_dataset", lambda _: { "train": FakeDataset( [ {"user": "Sveiki", "assistant": "Labdien"}, {"user": "Kā iet?", "assistant": "Labi"}, ] ) }, ) preference_dataset_path = tmp_path / "preferences.json" preference_dataset_path.write_text( json.dumps( [ { "prompt": "Atbildi pieklājīgi", "chosen": "Protams, palīdzēšu.", "rejected": "Nē.", "source": "human_review", } ] ), encoding="utf-8", ) class FakeTokenizer: pad_token = None pad_token_id = None eos_token = "" eos_token_id = 99 @classmethod def from_pretrained(cls, model_name): del model_name return cls() def __call__(self, texts, *, truncation, max_length, padding): del truncation, max_length, padding return { "input_ids": [[1, 2, 3] for _ in texts], "attention_mask": [[1, 1, 1] for _ in texts], } def save_pretrained(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "tokenizer_config.json").write_text("{}", encoding="utf-8") class FakeModel: def __init__(self): self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True) @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name, kwargs return cls() class FakeTrainingArguments: def __init__(self, **kwargs): self.kwargs = kwargs class FakeTrainer: def __init__(self, *, model, args, train_dataset, eval_dataset=None, data_collator=None): del model, args, train_dataset, eval_dataset, data_collator def train(self): return types.SimpleNamespace(metrics={"train_loss": 0.11}) def evaluate(self): return {"eval_loss": 0.22} def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "config.json").write_text("{}", encoding="utf-8") monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=FakeTrainingArguments, ), ) class FakeORPOConfig: def __init__(self, **kwargs): self.kwargs = kwargs class FakeORPOTrainer: last_instance = None def __init__(self, **kwargs): self.kwargs = kwargs FakeORPOTrainer.last_instance = self def train(self): return types.SimpleNamespace(metrics={"loss": 0.07}) def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "config.json").write_text("{}", encoding="utf-8") monkeypatch.setitem( sys.modules, "trl", types.SimpleNamespace(ORPOConfig=FakeORPOConfig, ORPOTrainer=FakeORPOTrainer), ) metrics = train( output_dir=str(tmp_path / "trained-model"), preference_dataset_path=str(preference_dataset_path), preference_optimization="orpo", ) assert metrics["preference_loss"] == 0.07 assert FakeORPOTrainer.last_instance is not None assert "ref_model" not in FakeORPOTrainer.last_instance.kwargs def test_train_retries_tokenizer_with_slow_backend(tmp_path: Path, monkeypatch) -> None: class FakeDataset: def __init__(self, items): self.items = list(items) self.column_names = list(self.items[0].keys()) if self.items else [] def train_test_split(self, *, test_size, seed): del test_size, seed return { "train": FakeDataset(self.items[:1]), "test": FakeDataset(self.items[1:]), } def map(self, fn, *, batched, remove_columns, desc): del remove_columns, desc batch = {key: [item.get(key) for item in self.items] for key in self.column_names} transformed = fn(batch) size = len(next(iter(transformed.values()))) if transformed else 0 return FakeDataset( [{key: transformed[key][index] for key in transformed} for index in range(size)] ) def __len__(self): return len(self.items) monkeypatch.setattr( "maris_core.training.train.load_hf_dataset", lambda _: { "train": FakeDataset( [ {"user": "Sveiki", "assistant": "Labdien"}, {"user": "Kas jauns?", "assistant": "Viss kārtībā"}, ] ) }, ) tokenizer_fast_attempts: list[bool] = [] class FakeTokenizer: pad_token = None pad_token_id = None eos_token = "" eos_token_id = 99 @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name tokenizer_fast_attempts.append(bool(kwargs.get("use_fast", True))) if kwargs.get("use_fast", True): raise ValueError("fast tokenizer unavailable") return cls() def __call__(self, texts, *, truncation, max_length, padding): del texts, truncation, max_length, padding return {"input_ids": [[1, 2, 3]], "attention_mask": [[1, 1, 1]]} def save_pretrained(self, output_dir): Path(output_dir, "tokenizer_config.json").write_text("{}", encoding="utf-8") class FakeModelConfig: pad_token_id = None use_cache = True class FakeModel: config = FakeModelConfig() @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name, kwargs return cls() class FakeTrainingArguments: def __init__(self, **kwargs): self.kwargs = kwargs class FakeTrainResult: metrics = {"train_loss": 0.1} class FakeTrainer: def __init__(self, **kwargs): self.kwargs = kwargs def train(self): return FakeTrainResult() def evaluate(self): return {"eval_loss": 0.2} def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "config.json").write_text("{}", encoding="utf-8") monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=FakeTrainingArguments, ), ) train( output_dir=str(tmp_path / "slow-tokenizer"), model_name="custom/model", max_seq_length=256 ) assert tokenizer_fast_attempts == [True, False] def test_train_auto_switches_giant_models_to_resource_saver_mode( tmp_path: Path, monkeypatch, ) -> None: class FakeDataset: def __init__(self, items): self.items = list(items) self.column_names = list(self.items[0].keys()) if self.items else [] def train_test_split(self, *, test_size, seed): del test_size, seed return { "train": FakeDataset(self.items[:1]), "test": FakeDataset(self.items[1:]), } def map(self, fn, *, batched, remove_columns, desc): del remove_columns, desc batch = {key: [item.get(key) for item in self.items] for key in self.column_names} transformed = fn(batch) size = len(next(iter(transformed.values()))) if transformed else 0 return FakeDataset( [{key: transformed[key][index] for key in transformed} for index in range(size)] ) def __len__(self): return len(self.items) monkeypatch.setattr( "maris_core.training.train.load_hf_dataset", lambda _: { "train": FakeDataset( [ {"user": "Sveiki", "assistant": "Labdien"}, {"user": "Kas jauns?", "assistant": "Viss kārtībā"}, ] ) }, ) monkeypatch.setenv("HF_TRAIN_BATCH_SIZE", "4") monkeypatch.setenv("HF_TRAIN_EVAL_BATCH_SIZE", "2") monkeypatch.setenv("HF_TRAIN_GRADIENT_ACCUMULATION_STEPS", "4") model_load_calls: list[dict[str, object]] = [] bnb_calls: list[dict[str, object]] = [] class FakeTokenizer: pad_token = None pad_token_id = None eos_token = "" eos_token_id = 99 @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name, kwargs return cls() def __call__(self, texts, *, truncation, max_length, padding): del texts, truncation, max_length, padding return {"input_ids": [[1, 2, 3]], "attention_mask": [[1, 1, 1]]} def save_pretrained(self, output_dir): Path(output_dir, "tokenizer_config.json").write_text("{}", encoding="utf-8") class FakeBitsAndBytesConfig: def __init__(self, **kwargs): bnb_calls.append(kwargs) self.kwargs = kwargs class FakeModel: def __init__(self): self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True) @classmethod def from_pretrained(cls, model_name, **kwargs): model_load_calls.append({"model_name": model_name, "kwargs": kwargs}) return cls() def print_trainable_parameters(self): return None class FakeTrainingArguments: def __init__(self, **kwargs): self.kwargs = kwargs class FakeTrainResult: metrics = {"train_loss": 0.2} class FakeTrainer: last_instance = None def __init__(self, **kwargs): self.kwargs = kwargs self.args = kwargs["args"] FakeTrainer.last_instance = self def train(self): return FakeTrainResult() def evaluate(self): return {"eval_loss": 0.4} def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "adapter_config.json").write_text("{}", encoding="utf-8") Path(output_dir, "config.json").write_text("{}", encoding="utf-8") monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, BitsAndBytesConfig=FakeBitsAndBytesConfig, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=FakeTrainingArguments, ), ) class FakeLoraConfig: def __init__(self, **kwargs): self.kwargs = kwargs def fake_prepare_model_for_kbit_training(model, use_gradient_checkpointing): del use_gradient_checkpointing return model def fake_get_peft_model(model, peft_config): del peft_config return model monkeypatch.setitem( sys.modules, "peft", types.SimpleNamespace( LoraConfig=FakeLoraConfig, TaskType=types.SimpleNamespace(CAUSAL_LM="CAUSAL_LM"), get_peft_model=fake_get_peft_model, prepare_model_for_kbit_training=fake_prepare_model_for_kbit_training, ), ) output_dir = tmp_path / "giant-model" train( output_dir=str(output_dir), model_name="Qwen/Qwen3-Coder-480B-A35B-Instruct", adapter_type="full", max_seq_length=256, ) assert bnb_calls[0]["load_in_4bit"] is True assert model_load_calls[0]["model_name"] == "Qwen/Qwen3-Coder-480B-A35B-Instruct" assert model_load_calls[0]["kwargs"]["device_map"] == "auto" assert model_load_calls[0]["kwargs"]["low_cpu_mem_usage"] is True assert "quantization_config" in model_load_calls[0]["kwargs"] assert FakeTrainer.last_instance is not None assert FakeTrainer.last_instance.args.kwargs["per_device_train_batch_size"] == 1 assert FakeTrainer.last_instance.args.kwargs["per_device_eval_batch_size"] == 1 assert FakeTrainer.last_instance.args.kwargs["gradient_accumulation_steps"] == 16 training_config = json.loads((output_dir / "training-config.json").read_text(encoding="utf-8")) assert training_config["adapter_type"] == "qlora" def test_train_disables_pin_memory_and_tqdm_in_non_interactive_environment( tmp_path: Path, monkeypatch ) -> None: class FakeDataset: def __init__(self, items): self.items = list(items) self.column_names = list(self.items[0].keys()) if self.items else [] def train_test_split(self, *, test_size, seed): del test_size, seed return { "train": FakeDataset(self.items[:1]), "test": FakeDataset(self.items[1:]), } def map(self, fn, *, batched, remove_columns, desc): del remove_columns, desc batch = {key: [item.get(key) for item in self.items] for key in self.column_names} transformed = fn(batch if batched else self.items[0]) size = len(next(iter(transformed.values()))) if transformed else 0 return FakeDataset( [{key: transformed[key][index] for key in transformed} for index in range(size)] ) def __len__(self): return len(self.items) monkeypatch.setattr( "maris_core.training.train.load_hf_dataset", lambda _: { "train": FakeDataset( [ {"user": "Sveiki", "assistant": "Čau!"}, {"prompt": "Uzraksti plānu", "completion": "Gatavs."}, ] ) }, ) import maris_core.training.train as train_module monkeypatch.setattr(train_module.sys, "stderr", types.SimpleNamespace(isatty=lambda: False)) monkeypatch.setitem( sys.modules, "torch", types.SimpleNamespace( cuda=types.SimpleNamespace(is_available=lambda: False), backends=types.SimpleNamespace(mps=types.SimpleNamespace(is_available=lambda: False)), ), ) class FakeTokenizer: pad_token_id = 0 eos_token_id = 1 pad_token = "" eos_token = "" @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name, kwargs return cls() def __call__(self, texts, truncation, padding, max_length): del truncation, padding, max_length if isinstance(texts, str): texts = [texts] return { "input_ids": [[1, 2, 3] for _ in texts], "attention_mask": [[1, 1, 1] for _ in texts], } def save_pretrained(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "tokenizer_config.json").write_text("{}", encoding="utf-8") class FakeModel: def __init__(self): self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True) @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name, kwargs return cls() class FakeTrainingArguments: def __init__(self, **kwargs): self.kwargs = kwargs class FakeTrainResult: metrics = {"train_loss": 0.2} class FakeTrainer: last_instance = None def __init__(self, **kwargs): self.args = kwargs["args"] FakeTrainer.last_instance = self def train(self): return FakeTrainResult() def evaluate(self): return {"eval_loss": 0.4} def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "config.json").write_text("{}", encoding="utf-8") monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=FakeTrainingArguments, ), ) train(output_dir=str(tmp_path / "cpu-runtime"), model_name="custom/model", max_seq_length=256) assert FakeTrainer.last_instance is not None assert FakeTrainer.last_instance.args.kwargs["dataloader_pin_memory"] is False assert FakeTrainer.last_instance.args.kwargs["disable_tqdm"] is True assert FakeTrainer.last_instance.args.kwargs["logging_first_step"] is True def test_train_enables_bf16_by_default_when_cuda_supports_it(tmp_path: Path, monkeypatch) -> None: class FakeDataset: def __init__(self, items): self.items = list(items) self.column_names = list(self.items[0].keys()) if self.items else [] def train_test_split(self, *, test_size, seed): del test_size, seed return { "train": FakeDataset(self.items[:1]), "test": FakeDataset(self.items[1:]), } def map(self, fn, *, batched, remove_columns, desc): del remove_columns, desc batch = {key: [item.get(key) for item in self.items] for key in self.column_names} transformed = fn(batch if batched else self.items[0]) size = len(next(iter(transformed.values()))) if transformed else 0 return FakeDataset( [{key: transformed[key][index] for key in transformed} for index in range(size)] ) def __len__(self): return len(self.items) monkeypatch.setattr( "maris_core.training.train.load_hf_dataset", lambda _: { "train": FakeDataset( [ {"user": "Sveiki", "assistant": "Čau!"}, {"prompt": "Uzraksti plānu", "completion": "Gatavs."}, ] ) }, ) import maris_core.training.train as train_module monkeypatch.setattr(train_module.sys, "stderr", types.SimpleNamespace(isatty=lambda: True)) monkeypatch.setitem( sys.modules, "torch", types.SimpleNamespace( cuda=types.SimpleNamespace( is_available=lambda: True, is_bf16_supported=lambda: True, ), backends=types.SimpleNamespace(mps=types.SimpleNamespace(is_available=lambda: False)), ), ) class FakeTokenizer: pad_token_id = 0 eos_token_id = 1 pad_token = "" eos_token = "" @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name, kwargs return cls() def __call__(self, texts, truncation, padding, max_length): del truncation, padding, max_length if isinstance(texts, str): texts = [texts] return { "input_ids": [[1, 2, 3] for _ in texts], "attention_mask": [[1, 1, 1] for _ in texts], } def save_pretrained(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "tokenizer_config.json").write_text("{}", encoding="utf-8") class FakeModel: def __init__(self): self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True) @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name, kwargs return cls() class FakeTrainingArguments: def __init__(self, **kwargs): self.kwargs = kwargs class FakeTrainResult: metrics = {"train_loss": 0.2} class FakeTrainer: last_instance = None def __init__(self, **kwargs): self.args = kwargs["args"] FakeTrainer.last_instance = self def train(self): return FakeTrainResult() def evaluate(self): return {"eval_loss": 0.4} def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "config.json").write_text("{}", encoding="utf-8") monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=FakeTrainingArguments, ), ) train(output_dir=str(tmp_path / "cuda-runtime"), model_name="custom/model", max_seq_length=256) assert FakeTrainer.last_instance is not None assert FakeTrainer.last_instance.args.kwargs["bf16"] is True assert FakeTrainer.last_instance.args.kwargs["fp16"] is False def test_train_uses_fsdp_training_arguments_when_requested(tmp_path: Path, monkeypatch) -> None: class FakeDataset: def __init__(self, items): self.items = list(items) self.column_names = list(self.items[0].keys()) if self.items else [] def train_test_split(self, *, test_size, seed): del test_size, seed return { "train": FakeDataset(self.items[:1]), "test": FakeDataset(self.items[1:]), } def map(self, fn, *, batched, remove_columns, desc): del remove_columns, desc batch = {key: [item.get(key) for item in self.items] for key in self.column_names} transformed = fn(batch if batched else self.items[0]) size = len(next(iter(transformed.values()))) if transformed else 0 return FakeDataset( [{key: transformed[key][index] for key in transformed} for index in range(size)] ) def __len__(self): return len(self.items) monkeypatch.setattr( "maris_core.training.train.load_hf_dataset", lambda _: { "train": FakeDataset( [ {"user": "Sveiki", "assistant": "Čau!"}, {"prompt": "Uzraksti plānu", "completion": "Gatavs."}, ] ) }, ) class FakeTokenizer: pad_token_id = 0 eos_token_id = 1 pad_token = "" eos_token = "" @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name, kwargs return cls() def __call__(self, texts, truncation, padding, max_length): del truncation, padding, max_length if isinstance(texts, str): texts = [texts] return { "input_ids": [[1, 2, 3] for _ in texts], "attention_mask": [[1, 1, 1] for _ in texts], } def save_pretrained(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "tokenizer_config.json").write_text("{}", encoding="utf-8") class FakeModel: def __init__(self): self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True) @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name, kwargs return cls() class FakeTrainingArguments: def __init__(self, **kwargs): self.kwargs = kwargs class FakeTrainResult: metrics = {"train_loss": 0.2} class FakeTrainer: last_instance = None def __init__(self, **kwargs): self.args = kwargs["args"] FakeTrainer.last_instance = self def train(self): return FakeTrainResult() def evaluate(self): return {"eval_loss": 0.4} def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "config.json").write_text("{}", encoding="utf-8") monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=FakeTrainingArguments, ), ) fsdp_config_path = tmp_path / "fsdp-config.json" fsdp_config_path.write_text( json.dumps({"activation_checkpointing": False, "limit_all_gathers": False}), encoding="utf-8", ) train( output_dir=str(tmp_path / "fsdp-runtime"), model_name="custom/model", max_seq_length=256, distributed_strategy="fsdp", distributed_config_path=str(fsdp_config_path), fsdp_transformer_layer_cls_to_wrap=["Qwen2DecoderLayer"], ) assert FakeTrainer.last_instance is not None assert FakeTrainer.last_instance.args.kwargs["fsdp"] == "full_shard auto_wrap" assert FakeTrainer.last_instance.args.kwargs["fsdp_config"]["activation_checkpointing"] is False assert FakeTrainer.last_instance.args.kwargs["fsdp_config"]["limit_all_gathers"] is False assert FakeTrainer.last_instance.args.kwargs["fsdp_config"]["min_num_params"] == 100_000_000 assert FakeTrainer.last_instance.args.kwargs["fsdp_config"][ "transformer_layer_cls_to_wrap" ] == ["Qwen2DecoderLayer"] assert FakeTrainer.last_instance.args.kwargs["ddp_find_unused_parameters"] is False def test_train_uses_deepspeed_training_arguments_when_requested( tmp_path: Path, monkeypatch ) -> None: monkeypatch.setattr( "maris_core.training.train.get_installed_package_version", lambda package_name: "0.18.9", ) class FakeDataset: def __init__(self, items): self.items = list(items) self.column_names = list(self.items[0].keys()) if self.items else [] def train_test_split(self, *, test_size, seed): del test_size, seed return { "train": FakeDataset(self.items[:1]), "test": FakeDataset(self.items[1:]), } def map(self, fn, *, batched, remove_columns, desc): del remove_columns, desc batch = {key: [item.get(key) for item in self.items] for key in self.column_names} transformed = fn(batch if batched else self.items[0]) size = len(next(iter(transformed.values()))) if transformed else 0 return FakeDataset( [{key: transformed[key][index] for key in transformed} for index in range(size)] ) def __len__(self): return len(self.items) monkeypatch.setattr( "maris_core.training.train.load_hf_dataset", lambda _: { "train": FakeDataset( [ {"user": "Sveiki", "assistant": "Čau!"}, {"prompt": "Uzraksti plānu", "completion": "Gatavs."}, ] ) }, ) class FakeTokenizer: pad_token_id = 0 eos_token_id = 1 pad_token = "" eos_token = "" @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name, kwargs return cls() def __call__(self, texts, truncation, padding, max_length): del truncation, padding, max_length if isinstance(texts, str): texts = [texts] return { "input_ids": [[1, 2, 3] for _ in texts], "attention_mask": [[1, 1, 1] for _ in texts], } def save_pretrained(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "tokenizer_config.json").write_text("{}", encoding="utf-8") class FakeModel: def __init__(self): self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True) @classmethod def from_pretrained(cls, model_name, **kwargs): del model_name, kwargs return cls() class FakeTrainingArguments: def __init__(self, **kwargs): self.kwargs = kwargs class FakeTrainResult: metrics = {"train_loss": 0.2} class FakeTrainer: last_instance = None def __init__(self, **kwargs): self.args = kwargs["args"] FakeTrainer.last_instance = self def train(self): return FakeTrainResult() def evaluate(self): return {"eval_loss": 0.4} def save_model(self, output_dir): Path(output_dir).mkdir(parents=True, exist_ok=True) Path(output_dir, "config.json").write_text("{}", encoding="utf-8") monkeypatch.setitem( sys.modules, "transformers", types.SimpleNamespace( AutoModelForCausalLM=FakeModel, AutoTokenizer=FakeTokenizer, DataCollatorForLanguageModeling=lambda **kwargs: kwargs, Trainer=FakeTrainer, TrainingArguments=FakeTrainingArguments, ), ) deepspeed_config_path = tmp_path / "deepspeed.json" deepspeed_config_path.write_text( json.dumps({"zero_optimization": {"stage": 3}}), encoding="utf-8" ) train( output_dir=str(tmp_path / "deepspeed-runtime"), model_name="custom/model", max_seq_length=256, distributed_strategy="deepspeed", distributed_config_path=str(deepspeed_config_path), ) assert FakeTrainer.last_instance is not None assert FakeTrainer.last_instance.args.kwargs["deepspeed"] == str(deepspeed_config_path) assert FakeTrainer.last_instance.args.kwargs["ddp_find_unused_parameters"] is False def test_deepspeed_training_arguments_raise_clear_error_when_dependency_missing( tmp_path: Path, monkeypatch ) -> None: deepspeed_config_path = tmp_path / "deepspeed.json" deepspeed_config_path.write_text( json.dumps({"zero_optimization": {"stage": 3}}), encoding="utf-8" ) config = load_training_config( overrides={ "distributed_strategy": "deepspeed", "distributed_config_path": str(deepspeed_config_path), } ) def _raise_missing_package(package_name: str) -> None: raise PackageNotFoundError(package_name) monkeypatch.setattr( "maris_core.training.train.get_installed_package_version", _raise_missing_package, ) with pytest.raises( ImportError, match="DeepSpeed režīms nepieciešams instalēt 'deepspeed'", ): _build_distributed_training_argument_overrides(config) def test_deepspeed_training_arguments_raise_clear_error_when_metadata_lookup_stops( tmp_path: Path, monkeypatch ) -> None: deepspeed_config_path = tmp_path / "deepspeed.json" deepspeed_config_path.write_text( json.dumps({"zero_optimization": {"stage": 3}}), encoding="utf-8" ) config = load_training_config( overrides={ "distributed_strategy": "deepspeed", "distributed_config_path": str(deepspeed_config_path), } ) def _raise_stop_iteration(package_name: str) -> None: raise StopIteration(package_name) monkeypatch.setattr( "maris_core.training.train.get_installed_package_version", _raise_stop_iteration, ) with pytest.raises( ImportError, match="DeepSpeed režīms nepieciešams instalēt 'deepspeed'", ): _build_distributed_training_argument_overrides(config) def test_train_model_cli_exits_cleanly_when_runtime_dependency_missing(monkeypatch, capsys) -> None: script_path = Path(__file__).resolve().parents[1] / "scripts" / "train_model.py" spec = importlib.util.spec_from_file_location("train_model", script_path) assert spec is not None and spec.loader is not None train_model_module = importlib.util.module_from_spec(spec) spec.loader.exec_module(train_model_module) def _raise_missing_dependency(_config: object) -> dict[str, object]: raise ImportError("DeepSpeed režīms nepieciešams instalēt 'deepspeed' Python pakotni.") monkeypatch.setattr( train_model_module, "load_training_config", lambda *args, **kwargs: object() ) monkeypatch.setitem( sys.modules, "maris_core.training.train", types.SimpleNamespace( train_branch_suite=lambda _config: {}, train_with_config=_raise_missing_dependency, ), ) monkeypatch.setattr(sys, "argv", [str(script_path)]) with pytest.raises(SystemExit) as exc_info: train_model_module.main() assert exc_info.value.code == 2 captured = capsys.readouterr() assert "DeepSpeed režīms nepieciešams instalēt 'deepspeed' Python pakotni." in captured.err assert "Traceback" not in captured.err def test_train_model_cli_exits_cleanly_for_branch_suite_dependency_missing( monkeypatch, capsys ) -> None: script_path = Path(__file__).resolve().parents[1] / "scripts" / "train_model.py" spec = importlib.util.spec_from_file_location("train_model", script_path) assert spec is not None and spec.loader is not None train_model_module = importlib.util.module_from_spec(spec) spec.loader.exec_module(train_model_module) def _raise_missing_dependency(_config: object) -> dict[str, object]: raise ImportError("DeepSpeed režīms nepieciešams instalēt 'deepspeed' Python pakotni.") monkeypatch.setattr( train_model_module, "load_training_config", lambda *args, **kwargs: object() ) monkeypatch.setattr(train_model_module, "replace", lambda config, **kwargs: config) monkeypatch.setitem( sys.modules, "maris_core.training.train", types.SimpleNamespace( train_branch_suite=_raise_missing_dependency, train_with_config=lambda _config: {}, ), ) monkeypatch.setattr(sys, "argv", [str(script_path), "--all-branches"]) with pytest.raises(SystemExit) as exc_info: train_model_module.main() assert exc_info.value.code == 2 captured = capsys.readouterr() assert "DeepSpeed režīms nepieciešams instalēt 'deepspeed' Python pakotni." in captured.err assert "Traceback" not in captured.err def test_ensure_runtime_home_dir_sets_temp_home_when_missing(tmp_path: Path, monkeypatch) -> None: monkeypatch.delenv("HOME", raising=False) monkeypatch.delenv("USER", raising=False) monkeypatch.delenv("LOGNAME", raising=False) monkeypatch.delenv("USERNAME", raising=False) monkeypatch.setattr("maris_core.training.train.tempfile.gettempdir", lambda: str(tmp_path)) monkeypatch.setattr("maris_core.training.train.os.getuid", lambda: 1000) resolved = _ensure_runtime_home_dir() expected = tmp_path / "maris-home-1000" assert resolved == str(expected) assert os.environ["HOME"] == str(expected) assert os.environ["USER"] == "maris-1000" assert os.environ["LOGNAME"] == "maris-1000" assert os.environ["USERNAME"] == "maris-1000" assert expected.is_dir() def test_ensure_runtime_home_dir_keeps_existing_home_and_user(monkeypatch) -> None: monkeypatch.setenv("HOME", "/existing/home") monkeypatch.setenv("USER", "existing-user") monkeypatch.setenv("LOGNAME", "existing-user") monkeypatch.setenv("USERNAME", "existing-user") resolved = _ensure_runtime_home_dir() assert resolved == "/existing/home" assert os.environ["HOME"] == "/existing/home" assert os.environ["USER"] == "existing-user" assert os.environ["LOGNAME"] == "existing-user" assert os.environ["USERNAME"] == "existing-user" def test_ensure_runtime_home_dir_uses_unknown_suffix_when_getuid_fails( tmp_path: Path, monkeypatch ) -> None: monkeypatch.setenv("HOME", " ") monkeypatch.delenv("USER", raising=False) monkeypatch.delenv("LOGNAME", raising=False) monkeypatch.delenv("USERNAME", raising=False) monkeypatch.setattr("maris_core.training.train.tempfile.gettempdir", lambda: str(tmp_path)) def _raise_os_error() -> int: raise OSError("uid not available") monkeypatch.setattr("maris_core.training.train.os.getuid", _raise_os_error) resolved = _ensure_runtime_home_dir() expected = tmp_path / "maris-home-unknown" assert resolved == str(expected) assert os.environ["HOME"] == str(expected) assert os.environ["USER"] == "maris-unknown" assert os.environ["LOGNAME"] == "maris-unknown" assert os.environ["USERNAME"] == "maris-unknown" assert expected.is_dir()