| """Tests training pipeline konfigurācijai un datu sagatavošanai.""" |
|
|
| from __future__ import annotations |
|
|
| import asyncio |
| import importlib.util |
| import json |
| import os |
| import re |
| import subprocess |
| import sys |
| import types |
| from importlib.metadata import PackageNotFoundError |
| from pathlib import Path |
| from typing import Any |
|
|
| import pytest |
|
|
| from maris_core.data.preprocessing import record_to_training_text |
| from maris_core.training.config import ( |
| AVAILABLE_TRAINING_BASE_MODELS, |
| DEFAULT_TRAINING_BASE_MODEL, |
| list_training_base_models, |
| load_training_config, |
| ) |
| from maris_core.training.hf_compat import ( |
| MARIS_COMPATIBILITY_ARTIFACT_NAME, |
| apply_maris_compatibility_identity, |
| write_maris_compatibility_artifact, |
| ) |
| from maris_core.training.preferences import load_preference_dataset |
| from maris_core.training.train import ( |
| _build_benchmark_gate_artifact, |
| _build_distributed_training_argument_overrides, |
| _ensure_runtime_home_dir, |
| _filter_preference_examples_for_branch, |
| _filter_records_for_branch, |
| _run_post_training_benchmark, |
| build_branch_training_configs, |
| evaluate_with_config, |
| train, |
| train_branch_suite, |
| ) |
|
|
# Case-insensitive, whole-word match for third-party assistant / model brand names.
FOREIGN_AI_NAME_RE = re.compile(
    r"(?i)\b(?:anthropic|chatgpt|claude|deepseek|gemini|llama|mistral|openai|qwen|TinyLlama)\b"
)

# Case-insensitive match for "<owner>/<repo>" ids whose owner is one of the listed publishers.
FOREIGN_MODEL_REPO_RE = re.compile(
    r"(?i)\b(?:deepseek-ai|meta-llama|mistralai|openai|qwen|TinyLlama)/[A-Za-z0-9][\w.-]*\b"
)
|
|
|
|
def _assert_output_dir_uses_only_maris_identity(output_dir: Path) -> None:
    """Assert that text artifacts under *output_dir* contain no foreign model names.

    Walks the directory recursively in sorted order and inspects only regular
    files with a ``.json``, ``.jinja``, ``.md`` or ``.txt`` suffix, skipping the
    file named ``MARIS_COMPATIBILITY_ARTIFACT_NAME``. Each inspected file is
    read as UTF-8 and must not match ``FOREIGN_MODEL_REPO_RE`` or
    ``FOREIGN_AI_NAME_RE``; the offending path is used as the assertion message.
    """
    text_suffixes = {".json", ".jinja", ".md", ".txt"}
    for candidate in sorted(output_dir.rglob("*")):
        inspectable = (
            candidate.is_file()
            and candidate.name != MARIS_COMPATIBILITY_ARTIFACT_NAME
            and candidate.suffix.lower() in text_suffixes
        )
        if not inspectable:
            continue
        text = candidate.read_text(encoding="utf-8")
        assert FOREIGN_MODEL_REPO_RE.search(text) is None, candidate
        assert FOREIGN_AI_NAME_RE.search(text) is None, candidate
|
|
|
|
def test_record_to_training_text_formats_conversation_and_generation() -> None:
    """Conversation and generation records keep their key content when rendered."""
    chat_text = record_to_training_text({"user": "Sveiki", "assistant": "Čau!"})
    prompt_text = record_to_training_text(
        {"prompt": "Uzzīmē kaķi", "metadata": {"style": "anime"}}
    )

    for fragment in ("<|user|>", "Sveiki", "Čau!"):
        assert fragment in chat_text
    for fragment in ("Uzzīmē kaķi", '"style": "anime"'):
        assert fragment in prompt_text
|
|
|
|
def test_record_to_training_text_formats_structured_coder_record() -> None:
    """A structured coder record renders its labelled sections and a code fence."""
    coder_record = {
        "prompt": "Salabo retry helperi.",
        "target_file": "core-python/maris_core/retries.py",
        "buggy_code": "def retry(count):\n return count / 0",
        "tests": ["assert retry(1) == 1", "assert retry(3) == 3"],
        "edge_cases": ["0 mēģinājumi", "negatīvs skaits"],
        "metadata": {"language": "python", "task": "bugfix"},
        "completion": "```python\ndef retry(count: int) -> int:\n return max(count, 0)\n```",
    }

    rendered = record_to_training_text(coder_record)

    expected_markers = (
        "Mērķa fails",
        "Esošais vai kļūdainais kods",
        "Robežgadījumi",
        "```python",
    )
    for marker in expected_markers:
        assert marker in rendered
|
|
|
|
def test_load_training_config_reads_json_and_env_overrides(
    tmp_path: Path,
    monkeypatch,
) -> None:
    """Values from the JSON file, environment and explicit overrides all reach the config."""
    file_settings = {
        "model_name": "repo/from-json",
        "branch_name": "coder",
        "num_epochs": 7,
        "report_to": ["tensorboard"],
    }
    config_path = tmp_path / "training.json"
    config_path.write_text(json.dumps(file_settings), encoding="utf-8")

    env_settings = {"HF_TRAIN_BATCH_SIZE": "3", "HF_TRAIN_ADAPTER_TYPE": "lora"}
    for env_name, env_value in env_settings.items():
        monkeypatch.setenv(env_name, env_value)

    config = load_training_config(str(config_path), overrides={"learning_rate": 1e-4})

    expected_fields = {
        "model_name": "repo/from-json",
        "branch_name": "coder",
        "num_epochs": 7,
        "per_device_train_batch_size": 3,
        "learning_rate": 1e-4,
        "adapter_type": "lora",
        "report_to": ["tensorboard"],
        "text_model_id": "MarisUK/maris-ai-text",
        "image_model_id": "MarisUK/maris-ai-image",
    }
    for field_name, expected in expected_fields.items():
        assert getattr(config, field_name) == expected, field_name
|
|
|
|
def test_load_training_config_reads_distributed_runtime_overrides(monkeypatch) -> None:
    """The HF_TRAIN_* distributed environment variables populate the matching config fields."""
    distributed_env = {
        "HF_TRAIN_DISTRIBUTED_STRATEGY": "deepspeed",
        "HF_TRAIN_DISTRIBUTED_CONFIG_PATH": "huggingface/deepspeed-zero3.json",
        "HF_TRAIN_NUM_PROCESSES": "8",
        "HF_TRAIN_NUM_MACHINES": "2",
        "HF_TRAIN_MACHINE_RANK": "1",
        "HF_TRAIN_MAIN_PROCESS_IP": "10.0.0.10",
        "HF_TRAIN_MAIN_PROCESS_PORT": "29510",
    }
    for env_name, env_value in distributed_env.items():
        monkeypatch.setenv(env_name, env_value)

    config = load_training_config()

    assert config.distributed_strategy == "deepspeed"
    assert config.distributed_config_path == "huggingface/deepspeed-zero3.json"
    # No HF_TRAIN_USE_ACCELERATE is set here, yet the flag is expected to be on.
    assert config.use_accelerate is True
    numeric_and_network = {
        "num_processes": 8,
        "num_machines": 2,
        "machine_rank": 1,
        "main_process_ip": "10.0.0.10",
        "main_process_port": 29510,
    }
    for field_name, expected in numeric_and_network.items():
        assert getattr(config, field_name) == expected, field_name
|
|
|
|
def test_load_training_config_reads_gradient_checkpointing_use_reentrant_override(
    monkeypatch,
) -> None:
    """The string "false" in the environment becomes a real ``False`` on the config."""
    monkeypatch.setenv("HF_TRAIN_GRADIENT_CHECKPOINTING_USE_REENTRANT", "false")

    loaded = load_training_config()

    assert loaded.gradient_checkpointing_use_reentrant is False
|
|
|
|
def test_load_training_config_reads_runtime_model_repo_overrides(monkeypatch) -> None:
    """Each *_MODEL environment variable overrides the corresponding *_model_id field."""
    # Environment variable -> (config attribute, repo id written to the environment).
    overrides_by_env = {
        "TEXT_MODEL": ("text_model_id", "MarisUK/custom-text"),
        "IMAGE_MODEL": ("image_model_id", "MarisUK/custom-image"),
        "MUSIC_MODEL": ("music_model_id", "MarisUK/custom-music"),
        "TTS_MODEL": ("tts_model_id", "MarisUK/custom-tts"),
        "STT_MODEL": ("stt_model_id", "MarisUK/custom-stt"),
        "VIDEO_MODEL": ("video_model_id", "MarisUK/custom-video"),
    }
    for env_name, (_, repo_id) in overrides_by_env.items():
        monkeypatch.setenv(env_name, repo_id)

    config = load_training_config()

    for attribute, repo_id in overrides_by_env.values():
        assert getattr(config, attribute) == repo_id, attribute
|
|
|
|
def test_load_training_config_rejects_conflicting_precision_modes(
    tmp_path: Path,
) -> None:
    """Enabling both ``fp16`` and ``bf16`` must raise ``ValueError`` naming both modes."""
    config_path = tmp_path / "training.json"
    config_path.write_text(
        json.dumps({"fp16": True, "bf16": True}),
        encoding="utf-8",
    )

    # pytest.raises fails the test by itself when nothing is raised, so no
    # manual try/except/else bookkeeping is needed.
    with pytest.raises(ValueError) as exc_info:
        load_training_config(str(config_path))

    assert "fp16 un bf16" in str(exc_info.value)
|
|
|
|
def test_load_training_config_resolves_model_preset(
    tmp_path: Path,
) -> None:
    """A built-in preset name resolves to the model name registered for that preset."""
    preset = "coding"
    config_path = tmp_path / "training.json"
    config_path.write_text(json.dumps({"model_preset": preset}), encoding="utf-8")

    loaded = load_training_config(str(config_path))

    assert loaded.model_preset == "coding"
    assert loaded.model_name == AVAILABLE_TRAINING_BASE_MODELS["coding"]["model_name"]
|
|
|
|
def test_load_training_config_resolves_extra_model_preset(
    tmp_path: Path,
    monkeypatch,
) -> None:
    """A preset declared through MARIS_TRAIN_EXTRA_MODELS can be selected from the JSON file."""
    extra_models = {"qwen-32b": "Qwen/Qwen2.5-32B-Instruct"}
    config_path = tmp_path / "training.json"
    config_path.write_text(json.dumps({"model_preset": "qwen-32b"}), encoding="utf-8")
    monkeypatch.setenv("MARIS_TRAIN_EXTRA_MODELS", json.dumps(extra_models))

    loaded = load_training_config(str(config_path))

    assert loaded.model_preset == "qwen-32b"
    assert loaded.model_name == "Qwen/Qwen2.5-32B-Instruct"
|
|
|
|
def test_load_training_config_rejects_unknown_model_preset(
    tmp_path: Path,
) -> None:
    """An unknown preset raises ``ValueError`` mentioning the field and a valid preset."""
    config_path = tmp_path / "training.json"
    config_path.write_text(
        json.dumps({"model_preset": "unknown"}),
        encoding="utf-8",
    )

    # pytest.raises fails the test by itself when nothing is raised.
    with pytest.raises(ValueError) as exc_info:
        load_training_config(str(config_path))

    message = str(exc_info.value)
    assert "model_preset" in message
    assert "balanced" in message
|
|
|
|
def test_load_training_config_rejects_non_maris_hub_model_id(
    tmp_path: Path,
) -> None:
    """A ``hub_model_id`` of "someone-else/not-maris" is rejected with ``RuntimeError``."""
    config_path = tmp_path / "training.json"
    config_path.write_text(
        json.dumps({"hub_model_id": "someone-else/not-maris"}),
        encoding="utf-8",
    )

    # pytest.raises fails the test by itself when nothing is raised.
    with pytest.raises(RuntimeError) as exc_info:
        load_training_config(str(config_path))

    assert "Maris AI modeli" in str(exc_info.value)
|
|
|
|
def test_load_training_config_rejects_non_maris_dataset_repo(
    tmp_path: Path,
) -> None:
    """A ``dataset_repo`` of "someone-else/not-maris-memory" is rejected with ``RuntimeError``."""
    config_path = tmp_path / "training.json"
    config_path.write_text(
        json.dumps({"dataset_repo": "someone-else/not-maris-memory"}),
        encoding="utf-8",
    )

    # pytest.raises fails the test by itself when nothing is raised.
    with pytest.raises(RuntimeError) as exc_info:
        load_training_config(str(config_path))

    assert "dataset repozitorijs" in str(exc_info.value)
|
|
|
|
def test_load_training_config_reads_optional_eval_dataset_repo(
    tmp_path: Path,
) -> None:
    """``eval_dataset_repo`` from the JSON file is exposed unchanged on the config."""
    settings = {"eval_dataset_repo": "MarisUK/maris-ai-evals"}
    config_path = tmp_path / "training.json"
    config_path.write_text(json.dumps(settings), encoding="utf-8")

    loaded = load_training_config(str(config_path))

    assert loaded.eval_dataset_repo == "MarisUK/maris-ai-evals"
|
|
|
|
def test_load_training_config_reads_explicit_training_and_eval_dataset_repo_lists(
    tmp_path: Path,
) -> None:
    """Explicit ``dataset_repos`` / ``eval_dataset_repos`` lists are kept in order."""
    training_repos = [
        "MarisUK/maris-ai-memory",
        "MarisUK/maris-ai-lv-memory",
        "MarisUK/maris-ai-evals",
        "MarisUK/maris-ai-benchmark",
    ]
    evaluation_repos = [
        "MarisUK/maris-ai-evals",
        "MarisUK/maris-ai-benchmark",
    ]
    settings = {
        "dataset_repo": "MarisUK/maris-ai-memory",
        "dataset_repos": training_repos,
        "eval_dataset_repo": "MarisUK/maris-ai-evals",
        "eval_dataset_repos": evaluation_repos,
    }
    config_path = tmp_path / "training.json"
    config_path.write_text(json.dumps(settings), encoding="utf-8")

    loaded = load_training_config(str(config_path))

    assert loaded.dataset_repos == [
        "MarisUK/maris-ai-memory",
        "MarisUK/maris-ai-lv-memory",
        "MarisUK/maris-ai-evals",
        "MarisUK/maris-ai-benchmark",
    ]
    assert loaded.eval_dataset_repos == [
        "MarisUK/maris-ai-evals",
        "MarisUK/maris-ai-benchmark",
    ]
|
|
|
|
def test_load_training_config_reads_benchmark_and_preference_paths(
    tmp_path: Path,
) -> None:
    """Benchmark, preference, per-branch and source-weight settings load from the JSON file."""
    settings = {
        "benchmark_dataset_path": "/tmp/benchmarks/release.json",
        "benchmark_name": "release-gate",
        "benchmark_levels": ["ci", "release"],
        "benchmark_min_overall": 0.75,
        "benchmark_gate_enabled": True,
        "benchmark_feedback_auto_discover": False,
        "benchmark_feedback_path": "/tmp/benchmarks/previous.json",
        "benchmark_feedback_boost_scale": 2.5,
        "benchmark_feedback_max_multiplier": 1.8,
        "preference_dataset_path": "/tmp/preferences.json",
        "branch_benchmark_targets": {"master": {"overall": 0.8, "reasoning": 0.78}},
        "branch_benchmark_names": {
            "master": "memory-quality",
            "coder": "coder-release-quality",
        },
        "branch_benchmark_dataset_paths": {
            "coder": "/tmp/benchmarks/coder-release.json",
            "planner": "/tmp/benchmarks/planner-release.json",
        },
        "branch_preference_dataset_paths": {
            "coder": "/tmp/preferences/coder-preferences.json"
        },
        "branch_dataset_filter_rules": {
            "planner": {"include_record_types": ["autonomous"], "allow_unlabeled": False}
        },
        "source_weight_map": {"production": 1.5, "synthetic": 1.0, "noisy": 0.6},
    }
    config_path = tmp_path / "training.json"
    config_path.write_text(json.dumps(settings), encoding="utf-8")

    config = load_training_config(str(config_path))

    # Top-level fields compared by equality.
    plain_fields = {
        "benchmark_dataset_path": "/tmp/benchmarks/release.json",
        "benchmark_name": "release-gate",
        "benchmark_levels": ["ci", "release"],
        "benchmark_min_overall": 0.75,
        "benchmark_feedback_path": "/tmp/benchmarks/previous.json",
        "benchmark_feedback_boost_scale": 2.5,
        "benchmark_feedback_max_multiplier": 1.8,
        "preference_dataset_path": "/tmp/preferences.json",
    }
    for field_name, expected in plain_fields.items():
        assert getattr(config, field_name) == expected, field_name

    # Boolean flags are checked by identity.
    assert config.benchmark_gate_enabled is True
    assert config.benchmark_feedback_auto_discover is False

    # Per-branch mappings.
    assert config.branch_benchmark_targets["master"]["reasoning"] == 0.78
    branch_names = config.branch_benchmark_names
    assert branch_names["master"] == "memory-quality"
    assert branch_names["coder"] == "coder-release-quality"
    branch_paths = config.branch_benchmark_dataset_paths
    assert branch_paths["coder"] == "/tmp/benchmarks/coder-release.json"
    assert branch_paths["planner"] == "/tmp/benchmarks/planner-release.json"
    preference_paths = config.branch_preference_dataset_paths
    assert preference_paths["coder"] == "/tmp/preferences/coder-preferences.json"
    planner_rules = config.branch_dataset_filter_rules["planner"]
    assert planner_rules["include_record_types"] == ["autonomous"]
    assert config.source_weight_map["production"] == 1.5
|
|
|
|
def test_load_training_config_default_coder_targets_include_execution_gate() -> None:
    """The default config carries per-branch benchmark targets, names and dataset paths."""
    defaults = load_training_config()

    targets = defaults.branch_benchmark_targets
    assert targets["coder"]["execution"] == 0.7
    assert targets["master"]["memory_retrieval_pass_rate"] == 0.8
    assert defaults.branch_benchmark_names["master"] == "memory-quality"

    master_benchmark = defaults.branch_benchmark_dataset_paths["master"]
    assert master_benchmark.endswith("core-python/evals/master_memory_benchmark.json")
    coder_benchmark = defaults.branch_benchmark_dataset_paths["coder"]
    assert coder_benchmark.endswith("core-python/evals/coder_release_benchmark.json")
    coder_preferences = defaults.branch_preference_dataset_paths["coder"]
    assert coder_preferences.endswith("core-python/evals/coder_preference_dataset.json")
|
|
|
|
def test_apply_branch_runtime_defaults_prefers_master_memory_suite() -> None:
    """For the master branch, an empty dataset path resolves to the memory benchmark suite."""
    import maris_core.training.train as train_module

    master_overrides = {
        "branch_name": "master",
        "benchmark_dataset_path": "",
        "benchmark_name": "chat-quality",
        "benchmark_gate_enabled": True,
    }
    base_config = load_training_config(overrides=master_overrides)

    effective = train_module._apply_branch_runtime_defaults(base_config)

    assert effective.benchmark_name == "memory-quality"
    expected_suffix = "core-python/evals/master_memory_benchmark.json"
    assert effective.benchmark_dataset_path.endswith(expected_suffix)
|
|
|
|
def test_build_benchmark_gate_artifact_uses_world_class_defaults_and_blocks_regressions() -> None:
    """A non-zero regression count fails the gate; default targets are reported in the artifact."""
    coder_config = load_training_config(
        overrides={"branch_name": "coder", "benchmark_gate_enabled": True}
    )
    score_manifest = {
        "overall": 0.8,
        "coding": 0.81,
        "reasoning": 0.76,
        "execution": 0.74,
        "grounding": 0.78,
        "safety": 0.93,
        "judge_overall": 0.78,
        "judge_task_completion": 0.77,
        "judge_instruction_following": 0.79,
        "judge_safety": 0.95,
        "judge_regression_risk": 0.8,
    }
    benchmark_report = {
        "benchmark_name": "release-gate",
        "score_manifest": score_manifest,
        "success_rate": 0.88,
        "production_like_cases": 3,
        "production_like_pass_rate": 0.8,
        "execution_cases": 4,
        "grounding_cases": 3,
    }

    gate = _build_benchmark_gate_artifact(
        coder_config,
        benchmark_report,
        regression_report={"regression_count": 2},
    )

    gate_targets = gate["targets"]
    assert gate_targets["success_rate"] == 0.85
    assert gate_targets["production_like_pass_rate"] == 0.75
    assert gate_targets["judge_overall"] == 0.72
    assert gate["passed"] is False
    regression_failure = gate["failed_metrics"]["regression_count"]
    assert regression_failure["required"] == 0.0
    assert regression_failure["actual"] == 2.0
|
|
|
|
def test_build_benchmark_gate_artifact_uses_stricter_execution_threshold() -> None:
    """An execution score of 0.6 fails the coder gate, whose execution target is 0.7."""
    coder_config = load_training_config(
        overrides={"branch_name": "coder", "benchmark_gate_enabled": True}
    )
    benchmark_report = {
        "benchmark_name": "release-gate",
        "score_manifest": {
            "overall": 0.8,
            "coding": 0.82,
            "reasoning": 0.76,
            "execution": 0.6,
            "grounding": 0.78,
            "safety": 0.94,
        },
        "execution_cases": 4,
    }

    gate = _build_benchmark_gate_artifact(coder_config, benchmark_report)

    assert gate["passed"] is False
    assert gate["targets"]["execution"] == 0.7
    assert gate["failed_metrics"]["execution"]["actual"] == 0.6
|
|
|
|
def test_load_training_config_reads_category_weight_map(tmp_path: Path) -> None:
    """``category_weight_map`` entries from the JSON file are available on the config."""
    weights = {"coding": 1.3, "grounding": 1.2}
    config_path = tmp_path / "training.json"
    config_path.write_text(json.dumps({"category_weight_map": weights}), encoding="utf-8")

    loaded = load_training_config(str(config_path))

    loaded_weights = loaded.category_weight_map
    assert loaded_weights["coding"] == 1.3
    assert loaded_weights["grounding"] == 1.2
|
|
|
|
def test_load_training_config_reads_continue_training_settings(monkeypatch) -> None:
    """The continue-training environment variables set the matching config fields."""
    continue_env = (
        ("HF_TRAIN_CONTINUE_FROM_LATEST", "true"),
        ("HF_TRAIN_CONTINUE_MODEL_PATH", "/tmp/maris-last-good"),
    )
    for env_name, env_value in continue_env:
        monkeypatch.setenv(env_name, env_value)

    loaded = load_training_config()

    assert loaded.continue_from_latest_artifact is True
    assert loaded.continue_model_path == "/tmp/maris-last-good"
|
|
|
|
def test_list_training_base_models_returns_copy() -> None:
    """Mutating the returned mapping must not leak into the module-level registry."""
    snapshot = list_training_base_models()
    snapshot["balanced"]["model_name"] = "modified"

    registry_name = AVAILABLE_TRAINING_BASE_MODELS["balanced"]["model_name"]
    assert registry_name == DEFAULT_TRAINING_BASE_MODEL
|
|
|
|
def test_list_training_base_models_ignores_invalid_extra_models_json(monkeypatch) -> None:
    """Malformed MARIS_TRAIN_EXTRA_MODELS content still leaves the built-in presets listed."""
    monkeypatch.setenv("MARIS_TRAIN_EXTRA_MODELS", "{not valid json")

    listed = list_training_base_models()

    builtin_presets = {"balanced", "reasoning", "coding", "lightweight"}
    assert builtin_presets <= set(listed)
|
|
|
|
def test_list_training_base_models_accepts_owner_name_fallback_syntax(monkeypatch) -> None:
    """A comma-separated list accepts both bare repo ids and ``alias=repo`` entries."""
    comma_separated = "Qwen/Qwen3-Coder-480B-A35B-Instruct, coder-7b=Qwen/Qwen2.5-7B-Instruct"
    monkeypatch.setenv("MARIS_TRAIN_EXTRA_MODELS", comma_separated)

    listed = list_training_base_models()

    # The bare repo id is expected under a lower-cased, dash-joined key.
    derived_entry = listed["qwen-qwen3-coder-480b-a35b-instruct"]
    assert derived_entry["model_name"] == "Qwen/Qwen3-Coder-480B-A35B-Instruct"
    assert listed["coder-7b"]["model_name"] == "Qwen/Qwen2.5-7B-Instruct"
|
|
|
|
def test_list_training_base_models_accepts_string_shorthand(monkeypatch) -> None:
    """A JSON ``{"alias": "repo"}`` entry yields a model name and a derived label."""
    monkeypatch.setenv("MARIS_TRAIN_EXTRA_MODELS", '{"qwen-880b":"Qwen/Qwen3-880B-Instruct"}')

    listed = list_training_base_models()

    shorthand_entry = listed["qwen-880b"]
    assert shorthand_entry["model_name"] == "Qwen/Qwen3-880B-Instruct"
    assert shorthand_entry["label"] == "Qwen 880B"
|
|
|
|
def test_load_training_config_prefers_explicit_model_name_over_preset(
    tmp_path: Path,
    monkeypatch,
) -> None:
    """HF_TRAIN_BASE_MODEL wins over presets from both the file and the environment."""
    config_path = tmp_path / "training.json"
    config_path.write_text(json.dumps({"model_preset": "coding"}), encoding="utf-8")
    env_settings = {
        "HF_TRAIN_BASE_MODEL": "custom/model",
        "HF_TRAIN_MODEL_PRESET": "reasoning",
    }
    for env_name, env_value in env_settings.items():
        monkeypatch.setenv(env_name, env_value)

    loaded = load_training_config(str(config_path))

    assert loaded.model_name == "custom/model"
    # The preset is expected to be cleared when an explicit model name is used.
    assert loaded.model_preset == ""
|
|
|
|
def test_huggingface_train_script_resolves_relative_config_from_repo_root(
    tmp_path: Path,
    monkeypatch,
) -> None:
    """Run ``huggingface/train.sh`` with a stub ``python3`` and inspect how it was invoked.

    The stub is placed first on ``PATH`` and writes its working directory and
    arguments to the file named by ``TRAIN_SH_LOG``. The test then checks the
    working directory, the training script path and the absolute ``--config``
    path derived from the relative ``HF_TRAINING_CONFIG_PATH``.
    """
    repo_root = next(
        ancestor
        for ancestor in Path(__file__).resolve().parents
        if (ancestor / "huggingface" / "train.sh").is_file()
    )
    core_dir = repo_root / "core-python"
    invocation_log = tmp_path / "train-invocation.json"

    # Stub interpreter: records cwd and argv instead of running anything.
    stub_source = [
        f"#!{sys.executable}",
        "import json",
        "import os",
        "import sys",
        "from pathlib import Path",
        "",
        "Path(os.environ['TRAIN_SH_LOG']).write_text(",
        " json.dumps({'cwd': os.getcwd(), 'argv': sys.argv[1:]}, ensure_ascii=False),",
        " encoding='utf-8',",
        ")",
    ]
    fake_python = tmp_path / "python3"
    fake_python.write_text("\n".join(stub_source), encoding="utf-8")
    fake_python.chmod(0o755)

    # Put the stub ahead of any real interpreter on PATH.
    inherited_path = os.environ.get("PATH", "")
    search_path = str(tmp_path)
    if inherited_path:
        search_path = f"{tmp_path}{os.pathsep}{inherited_path}"
    monkeypatch.setenv("PATH", search_path)
    monkeypatch.setenv("HF_TRAINING_CONFIG_PATH", "huggingface/training-config.json")
    monkeypatch.setenv("TRAIN_SH_LOG", str(invocation_log))

    subprocess.run(
        ["bash", str(repo_root / "huggingface" / "train.sh")],
        check=True,
        cwd=repo_root,
    )

    recorded = json.loads(invocation_log.read_text(encoding="utf-8"))
    recorded_argv = recorded["argv"]
    assert recorded["cwd"] == str(core_dir)
    assert recorded_argv[0] == str(core_dir / "scripts" / "train_model.py")
    assert recorded_argv[1:3] == [
        "--config",
        str(repo_root / "huggingface" / "training-config.json"),
    ]
|
|
|
|
def test_huggingface_train_hf_script_uses_persistent_paths_and_uploads_model(
    tmp_path: Path,
    monkeypatch,
) -> None:
    """Run ``huggingface/train-hf.sh`` with a stub ``python3`` and check both invocations.

    The stub appends one JSON line per call to the file named by
    ``TRAIN_HF_LOG``. Exactly two calls are expected: the training script with
    the hf-jobs config and the forwarded preset, then the export script
    pointed at a model directory under ``HF_PERSISTENT_DIR``.
    """
    repo_root = next(
        ancestor
        for ancestor in Path(__file__).resolve().parents
        if (ancestor / "huggingface" / "train-hf.sh").is_file()
    )
    scripts_dir = repo_root / "core-python" / "scripts"
    persistent_dir = tmp_path / "persistent"
    invocation_log = tmp_path / "train-hf-invocations.jsonl"

    # Stub interpreter: appends cwd and argv as one JSON line per invocation.
    stub_source = [
        f"#!{sys.executable}",
        "import json",
        "import os",
        "import sys",
        "from pathlib import Path",
        "",
        "log_path = Path(os.environ['TRAIN_HF_LOG'])",
        "with log_path.open('a', encoding='utf-8') as handle:",
        " handle.write(",
        " json.dumps({'cwd': os.getcwd(), 'argv': sys.argv[1:]}, ensure_ascii=False) + '\\n'",
        " )",
    ]
    fake_python = tmp_path / "python3"
    fake_python.write_text("\n".join(stub_source), encoding="utf-8")
    fake_python.chmod(0o755)

    # Put the stub ahead of any real interpreter on PATH.
    inherited_path = os.environ.get("PATH", "")
    search_path = str(tmp_path)
    if inherited_path:
        search_path = f"{tmp_path}{os.pathsep}{inherited_path}"
    monkeypatch.setenv("PATH", search_path)
    monkeypatch.setenv("HF_PERSISTENT_DIR", str(persistent_dir))
    monkeypatch.setenv("TRAIN_HF_LOG", str(invocation_log))
    for unset_name in ("HF_TRAIN_OUTPUT_DIR", "HF_LOCAL_MODEL_DIR", "HF_TRAIN_PUSH_TO_HUB"):
        monkeypatch.delenv(unset_name, raising=False)

    subprocess.run(
        ["bash", str(repo_root / "huggingface" / "train-hf.sh"), "--model-preset", "coding"],
        check=True,
        cwd=repo_root,
    )

    raw_lines = invocation_log.read_text(encoding="utf-8").splitlines()
    logged = [json.loads(entry) for entry in raw_lines if entry.strip()]
    assert len(logged) == 2
    train_call, export_call = logged

    assert train_call["cwd"] == str(repo_root / "core-python")
    assert train_call["argv"][0] == str(scripts_dir / "train_model.py")
    assert train_call["argv"][1:5] == [
        "--config",
        str(repo_root / "huggingface" / "training-config.hf-jobs.json"),
        "--model-preset",
        "coding",
    ]
    assert export_call["argv"][0] == str(scripts_dir / "export_to_hf.py")
    assert export_call["argv"][1:3] == [
        "--model-path",
        str(persistent_dir / "maris-ai-master"),
    ]
|
|
|
|
def test_huggingface_train_hf_script_enables_accelerate_on_gpu_space(
    tmp_path: Path,
    monkeypatch,
) -> None:
    """Run ``huggingface/train-hf.sh`` with stub ``python3`` and ``nvidia-smi`` on ``PATH``.

    With the fake ``nvidia-smi`` reporting one GPU and the accelerate-related
    environment variables unset, the first recorded call is expected to go
    through ``-m accelerate.commands.launch`` with the GPU config file and
    ``--num_processes 1``; the second call is the export script.
    """
    repo_root = next(
        ancestor
        for ancestor in Path(__file__).resolve().parents
        if (ancestor / "huggingface" / "train-hf.sh").is_file()
    )
    scripts_dir = repo_root / "core-python" / "scripts"
    persistent_dir = tmp_path / "persistent-gpu"
    invocation_log = tmp_path / "train-hf-gpu-invocations.jsonl"

    # Stub interpreter: appends cwd and argv as one JSON line per invocation.
    stub_source = [
        f"#!{sys.executable}",
        "import json",
        "import os",
        "import sys",
        "from pathlib import Path",
        "",
        "log_path = Path(os.environ['TRAIN_HF_GPU_LOG'])",
        "with log_path.open('a', encoding='utf-8') as handle:",
        " handle.write(",
        " json.dumps({'cwd': os.getcwd(), 'argv': sys.argv[1:]}, ensure_ascii=False) + '\\n'",
        " )",
    ]
    fake_python = tmp_path / "python3"
    fake_python.write_text("\n".join(stub_source), encoding="utf-8")
    fake_python.chmod(0o755)

    # Stub nvidia-smi that prints a single fake GPU line.
    fake_nvidia_smi = tmp_path / "nvidia-smi"
    fake_nvidia_smi.write_text("#!/usr/bin/env bash\necho 'GPU 0: Fake GPU'\n", encoding="utf-8")
    fake_nvidia_smi.chmod(0o755)

    # Put both stubs ahead of any real executables on PATH.
    inherited_path = os.environ.get("PATH", "")
    search_path = str(tmp_path)
    if inherited_path:
        search_path = f"{tmp_path}{os.pathsep}{inherited_path}"
    monkeypatch.setenv("PATH", search_path)
    monkeypatch.setenv("HF_PERSISTENT_DIR", str(persistent_dir))
    monkeypatch.setenv("TRAIN_HF_GPU_LOG", str(invocation_log))
    for unset_name in ("HF_TRAIN_USE_ACCELERATE", "HF_TRAIN_NUM_PROCESSES"):
        monkeypatch.delenv(unset_name, raising=False)

    subprocess.run(
        ["bash", str(repo_root / "huggingface" / "train-hf.sh"), "--model-preset", "coding"],
        check=True,
        cwd=repo_root,
    )

    raw_lines = invocation_log.read_text(encoding="utf-8").splitlines()
    logged = [json.loads(entry) for entry in raw_lines if entry.strip()]

    launch_argv = logged[0]["argv"]
    assert launch_argv[0:2] == ["-m", "accelerate.commands.launch"]
    assert "--config_file" in launch_argv
    assert str(repo_root / "huggingface" / "accelerate-gpu-config.yaml") in launch_argv
    assert "--num_processes" in launch_argv
    process_count_index = launch_argv.index("--num_processes") + 1
    assert launch_argv[process_count_index] == "1"
    assert str(scripts_dir / "train_model.py") in launch_argv
    assert logged[1]["argv"][0] == str(scripts_dir / "export_to_hf.py")
|
|
|
|
def test_huggingface_train_job_script_uses_accelerate_for_distributed_launch(
    tmp_path: Path,
    monkeypatch,
) -> None:
    """Run ``huggingface/train-job.sh`` with the deepspeed strategy and stub executables.

    The stub ``python3`` appends one JSON line per call to the file named by
    ``TRAIN_JOB_LOG``. The first call is expected to go through
    ``-m accelerate.commands.launch`` and to mention the GPU accelerate config,
    the hf-jobs training config and the training script; the second call is
    the export script.
    """
    repo_root = next(
        ancestor
        for ancestor in Path(__file__).resolve().parents
        if (ancestor / "huggingface" / "train-job.sh").is_file()
    )
    hf_dir = repo_root / "huggingface"
    scripts_dir = repo_root / "core-python" / "scripts"
    invocation_log = tmp_path / "train-job-invocations.jsonl"

    # Stub interpreter: appends cwd and argv as one JSON line per invocation.
    stub_source = [
        f"#!{sys.executable}",
        "import json",
        "import os",
        "import sys",
        "from pathlib import Path",
        "",
        "log_path = Path(os.environ['TRAIN_JOB_LOG'])",
        "with log_path.open('a', encoding='utf-8') as handle:",
        " handle.write(",
        " json.dumps({'cwd': os.getcwd(), 'argv': sys.argv[1:]}, ensure_ascii=False) + '\\n'",
        " )",
    ]
    fake_python = tmp_path / "python3"
    fake_python.write_text("\n".join(stub_source), encoding="utf-8")
    fake_python.chmod(0o755)

    # Stub nvidia-smi that prints a single fake GPU line.
    fake_nvidia_smi = tmp_path / "nvidia-smi"
    fake_nvidia_smi.write_text("#!/usr/bin/env bash\necho 'GPU 0: Fake GPU'\n", encoding="utf-8")
    fake_nvidia_smi.chmod(0o755)

    # Put both stubs ahead of any real executables on PATH.
    inherited_path = os.environ.get("PATH", "")
    search_path = str(tmp_path)
    if inherited_path:
        search_path = f"{tmp_path}{os.pathsep}{inherited_path}"
    monkeypatch.setenv("PATH", search_path)
    monkeypatch.setenv("HF_JOB_WORK_DIR", str(tmp_path / "job-work"))
    monkeypatch.setenv("TRAIN_JOB_LOG", str(invocation_log))
    monkeypatch.setenv("HF_TRAIN_DISTRIBUTED_STRATEGY", "deepspeed")
    for unset_name in ("HF_TRAIN_USE_ACCELERATE", "HF_TRAIN_NUM_PROCESSES"):
        monkeypatch.delenv(unset_name, raising=False)

    subprocess.run(
        ["bash", str(hf_dir / "train-job.sh"), "--model-preset", "coding"],
        check=True,
        cwd=repo_root,
    )

    raw_lines = invocation_log.read_text(encoding="utf-8").splitlines()
    logged = [json.loads(entry) for entry in raw_lines if entry.strip()]

    launch_argv = logged[0]["argv"]
    assert launch_argv[0:2] == ["-m", "accelerate.commands.launch"]
    assert str(hf_dir / "accelerate-gpu-config.yaml") in launch_argv
    assert str(hf_dir / "training-config.hf-jobs.json") in launch_argv
    assert str(scripts_dir / "train_model.py") in launch_argv
    assert logged[1]["argv"][0] == str(scripts_dir / "export_to_hf.py")
|
|
|
|
def test_configure_tokenizer_expands_large_model_context_window() -> None:
    """``_configure_tokenizer`` fills the pad token from EOS and raises ``model_max_length``."""
    import maris_core.training.train as train_module

    # Minimal tokenizer stand-in: no pad token and a 4096-token limit.
    stub_tokenizer = types.SimpleNamespace(
        pad_token=None,
        pad_token_id=None,
        eos_token="<eos>",
        eos_token_id=7,
        model_max_length=4096,
    )
    large_model_overrides = {
        "model_name": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
        "max_seq_length": 65536,
    }
    config = load_training_config(overrides=large_model_overrides)

    train_module._configure_tokenizer(stub_tokenizer, config)

    assert stub_tokenizer.pad_token == "<eos>"
    assert stub_tokenizer.pad_token_id == 7
    assert stub_tokenizer.model_max_length == 65536
|
|
|
|
def test_load_tokenizer_forces_remote_snapshot_restore(monkeypatch) -> None:
    """``_load_tokenizer`` opens the compat path once with ``allow_remote_snapshot=True``."""
    import maris_core.training.train as train_module

    compat_flags: list[bool | None] = []

    class FakeTokenizer:
        """Stand-in for ``AutoTokenizer`` whose loader always succeeds."""

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            # Arguments are accepted but deliberately ignored.
            del model_name, kwargs
            return cls()

    class CompatPath:
        """Context-manager stub that records the ``allow_remote_snapshot`` flag."""

        def __init__(self, model_name: str, *, allow_remote_snapshot: bool | None = None):
            del model_name
            compat_flags.append(allow_remote_snapshot)

        def __enter__(self) -> str:
            return "/tmp/fake-model"

        def __exit__(self, exc_type, exc, tb) -> None:
            del exc_type, exc, tb
            return None

    fake_transformers = types.SimpleNamespace(AutoTokenizer=FakeTokenizer)
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)
    monkeypatch.setattr(train_module, "maris_hf_compatible_path", CompatPath)

    config = load_training_config(overrides={"model_name": "MarisUK/maris-ai-master"})

    loaded = train_module._load_tokenizer("MarisUK/maris-ai-master", config)

    assert isinstance(loaded, FakeTokenizer)
    assert compat_flags == [True]
|
|
|
|
def test_load_tokenizer_falls_back_to_explicit_slow_class(monkeypatch, tmp_path) -> None:
    """``_load_tokenizer`` falls back to the slow tokenizer class when ``AutoTokenizer`` fails.

    The ``AutoTokenizer`` stub raises for every call. The expected attempt
    order is auto with ``use_fast=True``, auto with ``use_fast=False``, then
    the ``Qwen2Tokenizer`` stub called without a ``use_fast`` argument.
    """
    import maris_core.training.train as train_module

    compat_flags: list[bool | None] = []
    tokenizer_attempts: list[tuple[str, Any]] = []

    # On-disk model directory whose configs name a fast tokenizer class.
    model_dir = tmp_path / "trained-model"
    model_dir.mkdir(parents=True, exist_ok=True)
    tokenizer_settings = {"tokenizer_class": "Qwen2TokenizerFast"}
    model_settings = {
        "tokenizer_class": "Qwen2TokenizerFast",
        "auto_map": {"AutoTokenizer": ["Qwen2Tokenizer", "Qwen2TokenizerFast"]},
    }
    (model_dir / "tokenizer_config.json").write_text(
        json.dumps(tokenizer_settings),
        encoding="utf-8",
    )
    (model_dir / "config.json").write_text(json.dumps(model_settings), encoding="utf-8")

    class FakeAutoTokenizer:
        """``AutoTokenizer`` stub that records each attempt and always raises."""

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            del model_name
            tokenizer_attempts.append(("auto", kwargs.get("use_fast")))
            wants_fast = kwargs.get("use_fast", True)
            if wants_fast:
                message = (
                    "Couldn't instantiate the backend tokenizer from one of the available paths."
                )
            else:
                message = "tokenizer config still points to a fast tokenizer class"
            raise ValueError(message)

    class FakeSlowTokenizer:
        """Slow-tokenizer stub that must be loaded from ``model_dir`` without ``use_fast``."""

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            tokenizer_attempts.append(("slow", kwargs.get("use_fast")))
            assert model_name == str(model_dir)
            assert "use_fast" not in kwargs
            return cls()

    class CompatPath:
        """Context-manager stub that yields ``model_dir`` and records the snapshot flag."""

        def __init__(self, model_name: str, *, allow_remote_snapshot: bool | None = None):
            del model_name
            compat_flags.append(allow_remote_snapshot)

        def __enter__(self) -> str:
            return str(model_dir)

        def __exit__(self, exc_type, exc, tb) -> None:
            del exc_type, exc, tb
            return None

    fake_transformers = types.SimpleNamespace(
        AutoTokenizer=FakeAutoTokenizer,
        Qwen2Tokenizer=FakeSlowTokenizer,
    )
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)
    monkeypatch.setattr(train_module, "maris_hf_compatible_path", CompatPath)

    config = load_training_config(overrides={"model_name": "MarisUK/maris-ai-master"})

    loaded = train_module._load_tokenizer("MarisUK/maris-ai-master", config)

    assert isinstance(loaded, FakeSlowTokenizer)
    assert compat_flags == [True]
    assert tokenizer_attempts == [("auto", True), ("auto", False), ("slow", None)]
|
|
|
|
def test_load_tokenizer_retries_after_installing_missing_backends(monkeypatch, tmp_path) -> None:
    """Check that `_load_tokenizer` retries once the missing backends are installed.

    The fake ``AutoTokenizer`` keeps failing with the "sentencepiece or tiktoken"
    error until the patched ``_install_missing_tokenizer_backends`` hook flips its
    ``retry_ready`` flag, after which the next load succeeds.
    """
    import maris_core.training.train as train_module

    snapshot_flags: list[bool | None] = []
    load_calls: list[tuple[str, Any]] = []
    install_calls: list[bool] = []
    local_model_dir = tmp_path / "trained-model"
    local_model_dir.mkdir(parents=True, exist_ok=True)

    class FakeAutoTokenizer:
        """Tokenizer loader that only succeeds after the install hook has run."""

        retry_ready = False

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            del model_name
            # Record every attempt together with the requested `use_fast` value.
            load_calls.append(("auto", kwargs.get("use_fast")))
            if not cls.retry_ready:
                raise ValueError(
                    "You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one."
                )
            return cls()

    class CompatPath:
        """Context manager replacing `maris_hf_compatible_path`; yields the local dir."""

        def __init__(self, model_name: str, *, allow_remote_snapshot: bool | None = None):
            del model_name
            snapshot_flags.append(allow_remote_snapshot)

        def __enter__(self) -> str:
            return str(local_model_dir)

        def __exit__(self, exc_type, exc, tb) -> None:
            del exc_type, exc, tb

    def fake_install_missing_tokenizer_backends() -> bool:
        # Pretend the install succeeded and unblock the next tokenizer load.
        install_calls.append(True)
        FakeAutoTokenizer.retry_ready = True
        return True

    fake_transformers = types.SimpleNamespace(AutoTokenizer=FakeAutoTokenizer)
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)
    monkeypatch.setattr(train_module, "maris_hf_compatible_path", CompatPath)
    monkeypatch.setattr(
        train_module,
        "_install_missing_tokenizer_backends",
        fake_install_missing_tokenizer_backends,
    )

    config = load_training_config(overrides={"model_name": "MarisUK/maris-ai-master"})

    loaded = train_module._load_tokenizer("MarisUK/maris-ai-master", config)

    assert isinstance(loaded, FakeAutoTokenizer)
    assert snapshot_flags == [True]
    assert install_calls == [True]
    # Fast attempt, slow attempt, then a fast retry after the install.
    assert load_calls == [("auto", True), ("auto", False), ("auto", True)]
|
|
|
|
def test_install_missing_tokenizer_backends_only_installs_missing_packages(monkeypatch) -> None:
    """Check that only the backend without an import spec is passed to pip.

    ``find_spec`` is patched so that ``tiktoken`` looks available and anything else
    looks missing; the single recorded pip command must target ``sentencepiece``.
    """
    import maris_core.training.train as train_module

    pip_commands: list[list[str]] = []
    present_modules = {"tiktoken"}

    def fake_find_spec(name: str):
        if name in present_modules:
            return object()
        return None

    def fake_run(command, **kwargs):
        # Capture the command instead of spawning a real subprocess.
        pip_commands.append(command)
        return types.SimpleNamespace(stdout="")

    monkeypatch.setattr(train_module.importlib.util, "find_spec", fake_find_spec)
    monkeypatch.setattr(train_module.importlib, "invalidate_caches", lambda: None)
    monkeypatch.setattr(train_module.subprocess, "run", fake_run)

    outcome = train_module._install_missing_tokenizer_backends()

    expected_command = [sys.executable, "-m", "pip", "install", "--no-cache-dir", "sentencepiece"]
    assert outcome is True
    assert pip_commands == [expected_command]
|
|
|
|
def test_install_missing_tokenizer_backends_is_noop_when_backends_exist(monkeypatch) -> None:
    """Check that nothing is installed when every backend already has an import spec."""
    import maris_core.training.train as train_module

    def always_found(name):
        return object()

    def forbid_pip(*args, **kwargs):
        # Any subprocess call here means the helper tried to install something.
        raise AssertionError("pip should not run")

    monkeypatch.setattr(train_module.importlib.util, "find_spec", always_found)
    monkeypatch.setattr(train_module.subprocess, "run", forbid_pip)

    outcome = train_module._install_missing_tokenizer_backends()

    assert outcome is False
|
|
|
|
def test_prepare_training_model_passes_use_reentrant_override(monkeypatch) -> None:
    """Check that the configured ``use_reentrant`` value reaches gradient checkpointing."""
    import maris_core.training.train as train_module

    class FakeModel:
        """Model double that records the kwargs given to gradient checkpointing."""

        def __init__(self):
            self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True)
            self.gradient_checkpointing_kwargs = None

        def gradient_checkpointing_enable(self, *, gradient_checkpointing_kwargs=None):
            self.gradient_checkpointing_kwargs = gradient_checkpointing_kwargs

    fake_model = FakeModel()
    fake_tokenizer = types.SimpleNamespace(pad_token_id=7)
    overrides = {
        "gradient_checkpointing": True,
        "gradient_checkpointing_use_reentrant": False,
    }
    training_config = load_training_config(overrides=overrides)

    # Bypass real model loading and PEFT wrapping; both hand back the fake.
    monkeypatch.setattr(train_module, "_load_model", lambda model_name, config: fake_model)
    monkeypatch.setattr(train_module, "_apply_peft_adapter", lambda model, config: fake_model)

    result = train_module._prepare_training_model(
        "MarisUK/maris-ai-master", fake_tokenizer, training_config
    )

    assert result is fake_model
    assert fake_model.config.pad_token_id == 7
    assert fake_model.config.use_cache is False
    assert fake_model.gradient_checkpointing_kwargs == {"use_reentrant": False}
|
|
|
|
def test_prepare_training_model_falls_back_when_runtime_rejects_use_reentrant(
    monkeypatch, caplog
) -> None:
    """Check the fallback when the model's checkpointing hook accepts no kwargs.

    The fake model's ``gradient_checkpointing_enable`` takes no arguments, so the
    explicit ``use_reentrant`` override cannot be forwarded; checkpointing must still
    be enabled and a warning about the ignored override must be logged.
    """
    import maris_core.training.train as train_module

    class FakeModel:
        """Model double whose checkpointing hook rejects any keyword argument."""

        def __init__(self):
            self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True)
            self.gradient_checkpointing_enabled = False

        def gradient_checkpointing_enable(self):
            self.gradient_checkpointing_enabled = True

    fake_model = FakeModel()
    fake_tokenizer = types.SimpleNamespace(pad_token_id=7)
    overrides = {
        "gradient_checkpointing": True,
        "gradient_checkpointing_use_reentrant": False,
    }
    training_config = load_training_config(overrides=overrides)

    monkeypatch.setattr(train_module, "_load_model", lambda model_name, config: fake_model)
    monkeypatch.setattr(train_module, "_apply_peft_adapter", lambda model, config: fake_model)

    with caplog.at_level("WARNING"):
        result = train_module._prepare_training_model(
            "MarisUK/maris-ai-master", fake_tokenizer, training_config
        )

    assert result is fake_model
    assert fake_model.gradient_checkpointing_enabled is True
    assert "Ignoring explicit gradient_checkpointing_use_reentrant=False" in caplog.text
|
|
|
|
def test_train_auto_enables_deepspeed_for_giant_long_context_model(
    tmp_path: Path, monkeypatch
) -> None:
    """Check that `train` hands a DeepSpeed ZeRO-3 config to ``TrainingArguments``.

    The run uses a giant model name and a 65536-token sequence length while asking
    for ``distributed_strategy="none"``; the captured training arguments must still
    carry the zero3 JSON path and ``ddp_find_unused_parameters=False``.
    """

    class FakeDataset:
        """In-memory stand-in for the dataset returned by the patched loader."""

        def __init__(self, items):
            self.items = list(items)
            if self.items:
                self.column_names = list(self.items[0].keys())
            else:
                self.column_names = []

        def train_test_split(self, *, test_size, seed):
            del test_size, seed
            # First record trains, the remainder evaluates.
            head, tail = self.items[:1], self.items[1:]
            return {"train": FakeDataset(head), "test": FakeDataset(tail)}

        def map(self, fn, *, batched, remove_columns, desc):
            del remove_columns, desc
            columns = {
                name: [record.get(name) for record in self.items] for name in self.column_names
            }
            transformed = fn(columns if batched else self.items[0])
            if transformed:
                row_count = len(next(iter(transformed.values())))
            else:
                row_count = 0
            # Turn the column-oriented result back into a list of row dicts.
            rows = [
                {name: transformed[name][row] for name in transformed} for row in range(row_count)
            ]
            return FakeDataset(rows)

        def __len__(self):
            return len(self.items)

    monkeypatch.setattr(
        "maris_core.training.train.load_hf_dataset",
        lambda _: {
            "train": FakeDataset(
                [
                    {"user": "Sveiki", "assistant": "Čau!"},
                    {"prompt": "Uzraksti plānu", "completion": "Gatavs."},
                ]
            )
        },
    )
    monkeypatch.setattr(
        "maris_core.training.train._load_json_object",
        lambda path_value, *, label: {"config_path": path_value, "label": label},
    )
    monkeypatch.setattr(
        "maris_core.training.train._require_runtime_package", lambda *args, **kwargs: None
    )

    class FakeTokenizer:
        """Tokenizer double emitting fixed-width sequences of ones."""

        pad_token = None
        pad_token_id = None
        eos_token = "<eos>"
        eos_token_id = 0
        model_max_length = 4096

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            del model_name, kwargs
            return cls()

        def __call__(self, texts, truncation, max_length, padding):
            del truncation, padding
            width = min(max_length, 4)
            return {
                "input_ids": [[1] * width for _ in texts],
                "attention_mask": [[1] * width for _ in texts],
            }

        def save_pretrained(self, output_dir):
            Path(output_dir).mkdir(parents=True, exist_ok=True)

    class FakeModel:
        """Model double with just the attributes and hooks this test provides."""

        def __init__(self):
            self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True)

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            del model_name, kwargs
            return cls()

        def gradient_checkpointing_enable(self):
            self.gradient_checkpointing_enabled = True

    class FakeTrainingArguments:
        """Captures the keyword arguments so the test can inspect them."""

        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeTrainResult:
        metrics = {"train_loss": 0.2}

    class FakeTrainer:
        """Trainer double that remembers its most recent instance."""

        last_instance = None

        def __init__(self, *, model, args, train_dataset, eval_dataset=None, data_collator=None):
            del model, train_dataset, eval_dataset, data_collator
            self.args = args
            FakeTrainer.last_instance = self

        def train(self):
            return FakeTrainResult()

        def evaluate(self):
            return {"eval_loss": 0.1}

        def save_model(self, output_dir):
            target = Path(output_dir)
            target.mkdir(parents=True, exist_ok=True)
            Path(output_dir, "config.json").write_text("{}", encoding="utf-8")

    fake_transformers = types.SimpleNamespace(
        AutoModelForCausalLM=FakeModel,
        AutoTokenizer=FakeTokenizer,
        DataCollatorForLanguageModeling=lambda **kwargs: kwargs,
        Trainer=FakeTrainer,
        TrainingArguments=FakeTrainingArguments,
        BitsAndBytesConfig=lambda **kwargs: kwargs,
    )
    fake_peft = types.SimpleNamespace(
        LoraConfig=lambda **kwargs: kwargs,
        TaskType=types.SimpleNamespace(CAUSAL_LM="CAUSAL_LM"),
        get_peft_model=lambda model, peft_config: model,
        prepare_model_for_kbit_training=lambda model, use_gradient_checkpointing: model,
    )
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)
    monkeypatch.setitem(sys.modules, "peft", fake_peft)

    train(
        output_dir=str(tmp_path / "giant-long-context"),
        model_name="Qwen/Qwen3-Coder-480B-A35B-Instruct",
        max_seq_length=65536,
        distributed_strategy="none",
        use_accelerate=False,
    )

    assert FakeTrainer.last_instance is not None
    captured_kwargs = FakeTrainer.last_instance.args.kwargs
    assert captured_kwargs["deepspeed"].endswith("huggingface/deepspeed-zero3.json")
    assert captured_kwargs["ddp_find_unused_parameters"] is False
|
|
|
|
def test_train_uses_eval_split_and_writes_metrics(
    tmp_path: Path,
    monkeypatch,
) -> None:
    """End-to-end check of `train` against fake datasets, tokenizer, model and trainer.

    The fakes deliberately write artifacts that mention other vendors' model names.
    After the run, the test asserts on the returned metrics, on the set of files in
    the output directory, and on the Maris identity values found in the saved
    config, tokenizer, adapter, chat-template and benchmark artifacts.
    """

    class FakeDataset:
        """In-memory stand-in for the dataset returned by the patched loader."""

        def __init__(self, items):
            self.items = list(items)
            if self.items:
                self.column_names = list(self.items[0].keys())
            else:
                self.column_names = []

        def train_test_split(self, *, test_size, seed):
            # Keep the last record for evaluation, but never leave train empty.
            cut = max(1, len(self.items) - 1)
            head, tail = self.items[:cut], self.items[cut:]
            return {"train": FakeDataset(head), "test": FakeDataset(tail)}

        def map(self, fn, *, batched, remove_columns, desc):
            assert batched is True
            del remove_columns, desc
            columns = {
                name: [record.get(name) for record in self.items] for name in self.column_names
            }
            transformed = fn(columns)
            if transformed:
                row_count = len(next(iter(transformed.values())))
            else:
                row_count = 0
            rows = [
                {name: transformed[name][row] for name in transformed} for row in range(row_count)
            ]
            return FakeDataset(rows)

        def __len__(self):
            return len(self.items)

    fake_dataset = {
        "train": FakeDataset(
            [
                {"user": "Sveiki", "assistant": "Labdien"},
                {"user": "Kā iet?", "assistant": "Labi"},
            ]
        )
    }

    monkeypatch.setattr("maris_core.training.train.load_hf_dataset", lambda _: fake_dataset)

    class FakeTokenizer:
        """Tokenizer double whose saved files contain foreign model identities."""

        pad_token = None
        pad_token_id = None
        eos_token = "<eos>"
        eos_token_id = 99

        @classmethod
        def from_pretrained(cls, model_name):
            assert model_name == DEFAULT_TRAINING_BASE_MODEL
            return cls()

        def __call__(self, texts, *, truncation, max_length, padding):
            assert truncation is True
            assert max_length == 256
            assert padding is False
            return {
                "input_ids": [[1, 2, 3] for _ in texts],
                "attention_mask": [[1, 1, 1] for _ in texts],
            }

        def save_pretrained(self, output_dir):
            tokenizer_payload = {
                "model": {
                    "type": "BPE",
                    "unk_token": "Qwen/Qwen2.5-7B-Instruct",
                },
                "added_tokens": [
                    {"content": "Claude"},
                    {"content": "DeepSeek"},
                ],
            }
            tokenizer_config_payload = {
                "name_or_path": DEFAULT_TRAINING_BASE_MODEL,
                "tokenizer_class": "Qwen2TokenizerFast",
                "auto_map": {"AutoTokenizer": ["qwen2.Qwen2Tokenizer", None]},
                "chat_template": "You are Qwen, a helpful assistant for Qwen/Qwen2.5-7B-Instruct.",
                "init_kwargs": {
                    "chat_template": "Respond like TinyLlama and DeepSeek.",
                },
            }
            Path(output_dir, "tokenizer.json").write_text(
                json.dumps(tokenizer_payload), encoding="utf-8"
            )
            Path(output_dir, "tokenizer_config.json").write_text(
                json.dumps(tokenizer_config_payload), encoding="utf-8"
            )
            Path(output_dir, "chat_template.jinja").write_text(
                "System: meta-llama/Llama-3.2-3B-Instruct and Claude must answer here.",
                encoding="utf-8",
            )

    class FakeModelConfig:
        pad_token_id = None

    class FakeModel:
        """Model double that only accepts the default training base model."""

        config = FakeModelConfig()

        @classmethod
        def from_pretrained(cls, model_name):
            assert model_name == DEFAULT_TRAINING_BASE_MODEL
            return cls()

    class FakeTrainingArguments:
        """Captures the keyword arguments so the test can inspect them."""

        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeTrainResult:
        metrics = {"train_loss": 0.25}

    class FakeTrainer:
        """Trainer double that records its datasets and writes foreign-branded files."""

        last_instance = None

        def __init__(
            self,
            *,
            model,
            args,
            train_dataset,
            eval_dataset=None,
            data_collator=None,
        ):
            del model, data_collator
            self.args = args
            self.train_dataset = train_dataset
            self.eval_dataset = eval_dataset
            FakeTrainer.last_instance = self

        def train(self):
            return FakeTrainResult()

        def evaluate(self):
            return {"eval_loss": 0.5}

        def save_model(self, output_dir):
            Path(output_dir).mkdir(parents=True, exist_ok=True)
            Path(output_dir, "model.bin").write_text("ok", encoding="utf-8")
            model_config_payload = {
                "_name_or_path": DEFAULT_TRAINING_BASE_MODEL,
                "model_type": "qwen2",
                "architectures": ["Qwen2ForCausalLM"],
                "tokenizer_class": "Qwen2TokenizerFast",
                "auto_map": {
                    "AutoConfig": "qwen2.configuration_qwen2.Qwen2Config",
                    "AutoModelForCausalLM": "qwen2.modeling_qwen2.Qwen2ForCausalLM",
                },
            }
            adapter_config_payload = {
                "base_model_name_or_path": DEFAULT_TRAINING_BASE_MODEL,
                "base_model_class": "Qwen2ForCausalLM",
                "parent_library": "transformers.models.qwen2.modeling_qwen2",
                "auto_mapping": {
                    "base_model_class": "Qwen2ForCausalLM",
                    "parent_library": "transformers.models.qwen2.modeling_qwen2",
                },
                "description": "Adapter derived from Qwen and Llama.",
            }
            Path(output_dir, "config.json").write_text(
                json.dumps(model_config_payload), encoding="utf-8"
            )
            Path(output_dir, "adapter_config.json").write_text(
                json.dumps(adapter_config_payload), encoding="utf-8"
            )

        def push_to_hub(self, **kwargs):
            self.push_kwargs = kwargs

    fake_transformers = types.SimpleNamespace(
        AutoModelForCausalLM=FakeModel,
        AutoTokenizer=FakeTokenizer,
        DataCollatorForLanguageModeling=lambda **kwargs: kwargs,
        Trainer=FakeTrainer,
        TrainingArguments=FakeTrainingArguments,
    )
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)

    preference_dataset_path = tmp_path / "preferences.json"
    preference_records = [
        {
            "prompt": "Kurš variants ir labāks?",
            "chosen": "Variants A",
            "rejected": "Variants B",
            "source": "human_review",
            "tags": ["quality"],
        }
    ]
    preference_dataset_path.write_text(json.dumps(preference_records), encoding="utf-8")

    async def fake_benchmark(config, *, model_path):
        assert model_path.endswith("trained-model")
        scores = {
            "overall": 0.81,
            "reasoning": 0.8,
            "factuality": 0.79,
            "latvian_quality": 0.86,
            "coding": 0.74,
            "long_context": 0.75,
            "helpfulness": 0.83,
        }
        return {
            "artifact_type": "chat-benchmark-manifest",
            "benchmark_name": config.benchmark_name,
            "branch": config.branch_name,
            "model": config.hub_model_id,
            "score_manifest": scores,
        }

    monkeypatch.setattr("maris_core.training.train._run_post_training_benchmark", fake_benchmark)

    output_dir = tmp_path / "trained-model"
    metrics = train(
        output_dir=str(output_dir),
        max_seq_length=256,
        benchmark_dataset_path=str(tmp_path / "benchmarks.json"),
        preference_dataset_path=str(preference_dataset_path),
    )

    def read_json(file_name):
        # Every artifact is read as UTF-8 JSON from the run's output directory.
        return json.loads((output_dir / file_name).read_text(encoding="utf-8"))

    # --- returned metrics and trainer wiring -------------------------------------
    assert metrics["train_loss"] == 0.25
    assert metrics["eval_loss"] == 0.5
    assert metrics["perplexity"] > 1.0
    trainer = FakeTrainer.last_instance
    assert trainer is not None
    assert len(trainer.train_dataset) == 1
    assert len(trainer.eval_dataset) == 1
    assert trainer.args.kwargs["evaluation_strategy"] == "steps"

    # --- expected artifact files --------------------------------------------------
    expected_files = (
        "training-config.json",
        "training-metrics.json",
        "maris-metadata.json",
        "training-provenance.json",
        "README.md",
        "benchmark-manifest.json",
        "benchmark-release-gate.json",
        "benchmark-history.json",
        "benchmark-regression-report.json",
        "benchmark-feedback.json",
        "preference-summary.json",
        "human-eval-summary.json",
        "blind-side-by-side-eval.json",
    )
    for file_name in expected_files:
        assert (output_dir / file_name).is_file()

    training_config = read_json("training-config.json")
    training_metrics = read_json("training-metrics.json")
    benchmark_manifest = read_json("benchmark-manifest.json")
    benchmark_gate = read_json("benchmark-release-gate.json")
    benchmark_history = read_json("benchmark-history.json")
    benchmark_regression = read_json("benchmark-regression-report.json")
    benchmark_feedback = read_json("benchmark-feedback.json")
    preference_summary = read_json("preference-summary.json")
    human_eval_summary = read_json("human-eval-summary.json")
    blind_side_by_side = read_json("blind-side-by-side-eval.json")
    training_provenance = read_json("training-provenance.json")
    saved_model_config = read_json("config.json")
    saved_tokenizer_config = read_json("tokenizer_config.json")
    saved_tokenizer_json = read_json("tokenizer.json")
    saved_adapter_config = read_json("adapter_config.json")
    compatibility_manifest = read_json(MARIS_COMPATIBILITY_ARTIFACT_NAME)
    saved_chat_template = (output_dir / "chat_template.jinja").read_text(encoding="utf-8")

    # --- training config / metrics / provenance -----------------------------------
    assert training_config["maris_origin"] == "Maris AI"
    assert training_config["maris_model_id"] == "MarisUK/maris-ai-master"
    assert "model_name" not in training_config
    assert training_metrics["maris_origin"] == "Maris AI"
    assert training_metrics["artifact_type"] == "training-metrics"
    assert training_metrics["dataset_repo"] == "MarisUK/maris-ai-memory"
    assert training_metrics["benchmark_regressions"] == 0.0
    assert training_provenance["maris_origin"] == "Maris AI"
    assert training_provenance["train_examples"] == 1
    assert training_provenance["eval_examples"] == 1
    assert training_provenance["base_model_name"] == "Maris AI"
    assert training_provenance["base_model_lineage"] == "Maris AI"

    # --- model card ---------------------------------------------------------------
    model_card = (output_dir / "README.md").read_text(encoding="utf-8")
    assert "Maris AI Model" in model_card
    assert "Qwen/" not in model_card
    assert "TinyLlama/" not in model_card

    # --- saved model and tokenizer identity ---------------------------------------
    assert saved_model_config["_name_or_path"] == "MarisUK/maris-ai-master"
    assert saved_model_config["model_type"] == "maris"
    assert saved_model_config["architectures"] == ["MarisCompatibleCausalLM"]
    assert saved_model_config["tokenizer_class"] == "MarisCompatibleTokenizer"
    assert saved_tokenizer_config["name_or_path"] == "MarisUK/maris-ai-master"
    assert saved_tokenizer_config["tokenizer_class"] == "MarisCompatibleTokenizer"
    assert saved_tokenizer_json["model"]["unk_token"] == "MarisUK/maris-ai-master"
    for position in (0, 1):
        assert saved_tokenizer_json["added_tokens"][position]["content"] == "Maris AI"
    assert "Maris AI" in saved_tokenizer_config["chat_template"]
    assert "Qwen" not in saved_tokenizer_config["chat_template"]
    assert "Maris AI" in saved_tokenizer_config["init_kwargs"]["chat_template"]

    # --- adapter config and compatibility manifest --------------------------------
    assert saved_adapter_config["base_model_name_or_path"] == "MarisUK/maris-ai-master"
    assert saved_adapter_config["base_model_class"] == "MarisCompatibleCausalLM"
    assert saved_adapter_config["parent_library"] == "maris.compat"
    assert "Qwen" not in saved_adapter_config["description"]
    assert "Llama" not in saved_adapter_config["description"]
    assert compatibility_manifest["artifact_type"] == "maris-hf-compatibility"
    assert compatibility_manifest["maris_model_id"] == "MarisUK/maris-ai-master"
    for artifact_name in ("config.json", "tokenizer_config.json", "adapter_config.json"):
        assert artifact_name in compatibility_manifest["artifacts"]

    # --- chat template ------------------------------------------------------------
    assert "meta-llama/" not in saved_chat_template
    assert "Claude" not in saved_chat_template
    assert "Maris AI" in saved_chat_template

    # --- benchmark outputs --------------------------------------------------------
    assert metrics["perplexity"] < 1000
    assert metrics["benchmark_overall"] == 0.81
    assert metrics["benchmark_gate_passed"] == 1.0
    assert benchmark_manifest["artifact_type"] == "chat-benchmark-manifest"
    assert benchmark_gate["artifact_type"] == "benchmark-release-gate"
    assert benchmark_history["artifact_type"] == "chat-benchmark-history"
    assert benchmark_history["run_count"] == 1
    assert benchmark_regression["artifact_type"] == "chat-benchmark-regression-report"
    assert benchmark_regression["status"] == "no-baseline"
    assert benchmark_feedback["artifact_type"] == "benchmark-feedback-reweighting"

    # --- scoring dashboard --------------------------------------------------------
    train_dashboard = training_metrics["scoring_dashboard"]["train"]
    assert train_dashboard["sources"]["unknown"]["records"] == 1
    assert train_dashboard["categories"]["general"]["records"] == 1
    assert training_metrics["scoring_dashboard_train_sources_unknown_records"] == 1.0
    assert training_metrics["scoring_dashboard_train_categories_general_records"] == 1.0

    _assert_output_dir_uses_only_maris_identity(output_dir)

    # --- gate, preference and human-eval artifacts --------------------------------
    assert benchmark_gate["passed"] is True
    assert preference_summary["artifact_type"] == "preference-dataset-summary"
    assert benchmark_manifest["score_manifest"]["pairwise_win_rate"] == 1.0
    assert human_eval_summary["artifact_type"] == "human-eval-summary"
    assert blind_side_by_side["artifact_type"] == "blind-side-by-side-eval-set"
|
|
|
|
def test_train_pushes_to_hub_when_enabled(
    tmp_path: Path,
    monkeypatch,
) -> None:
    """Check that `train` calls ``push_to_hub`` when ``HF_TRAIN_PUSH_TO_HUB`` is "true".

    The fake trainer stores the keyword arguments of the push call; they must be
    exactly the Maris sync commit message.
    """

    class FakeDataset:
        """In-memory stand-in for the dataset returned by the patched loader."""

        def __init__(self, items):
            self.items = list(items)
            if self.items:
                self.column_names = list(self.items[0].keys())
            else:
                self.column_names = []

        def train_test_split(self, *, test_size, seed):
            del test_size, seed
            head, tail = self.items[:1], self.items[1:]
            return {"train": FakeDataset(head), "test": FakeDataset(tail)}

        def map(self, fn, *, batched, remove_columns, desc):
            assert batched is True
            del remove_columns, desc
            columns = {
                name: [record.get(name) for record in self.items] for name in self.column_names
            }
            transformed = fn(columns)
            if transformed:
                row_count = len(next(iter(transformed.values())))
            else:
                row_count = 0
            rows = [
                {name: transformed[name][row] for name in transformed} for row in range(row_count)
            ]
            return FakeDataset(rows)

        def __len__(self):
            return len(self.items)

    monkeypatch.setenv("HF_TRAIN_PUSH_TO_HUB", "true")
    monkeypatch.setattr(
        "maris_core.training.train.load_hf_dataset",
        lambda _: {
            "train": FakeDataset(
                [
                    {"user": "Sveiki", "assistant": "Labdien"},
                    {"user": "Kā iet?", "assistant": "Labi"},
                ]
            )
        },
    )

    class FakeTokenizer:
        """Tokenizer double that always returns one fixed encoded row."""

        pad_token = None
        pad_token_id = None
        eos_token = "<eos>"
        eos_token_id = 99

        @classmethod
        def from_pretrained(cls, model_name):
            assert model_name == DEFAULT_TRAINING_BASE_MODEL
            return cls()

        def __call__(self, texts, *, truncation, max_length, padding):
            del texts, truncation, max_length, padding
            return {"input_ids": [[1, 2, 3]], "attention_mask": [[1, 1, 1]]}

        def save_pretrained(self, output_dir):
            Path(output_dir, "tokenizer_config.json").write_text("{}", encoding="utf-8")

    class FakeModelConfig:
        pad_token_id = None

    class FakeModel:
        """Model double that only accepts the default training base model."""

        config = FakeModelConfig()

        @classmethod
        def from_pretrained(cls, model_name):
            assert model_name == DEFAULT_TRAINING_BASE_MODEL
            return cls()

    class FakeTrainingArguments:
        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeTrainResult:
        metrics = {"train_loss": 0.1}

    class FakeTrainer:
        """Trainer double that records the arguments of the hub push."""

        last_instance = None

        def __init__(self, **kwargs):
            self.kwargs = kwargs
            self.push_kwargs = None
            FakeTrainer.last_instance = self

        def train(self):
            return FakeTrainResult()

        def evaluate(self):
            return {"eval_loss": 0.2}

        def save_model(self, output_dir):
            Path(output_dir).mkdir(parents=True, exist_ok=True)
            Path(output_dir, "config.json").write_text("{}", encoding="utf-8")

        def push_to_hub(self, **kwargs):
            self.push_kwargs = kwargs

    fake_transformers = types.SimpleNamespace(
        AutoModelForCausalLM=FakeModel,
        AutoTokenizer=FakeTokenizer,
        DataCollatorForLanguageModeling=lambda **kwargs: kwargs,
        Trainer=FakeTrainer,
        TrainingArguments=FakeTrainingArguments,
    )
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)

    train(output_dir=str(tmp_path / "push-model"), max_seq_length=256)

    trainer = FakeTrainer.last_instance
    assert trainer is not None
    assert trainer.push_kwargs == {"commit_message": "Maris AI training sync (master)"}
|
|
|
|
def test_train_prefers_existing_local_artifact_when_continue_mode_enabled(
    tmp_path: Path,
    monkeypatch,
) -> None:
    """Check that continue mode loads tokenizer and model from the output directory.

    The output directory is pre-seeded with a ``config.json`` and a
    ``training-config.json`` whose source fingerprint is built from the default base
    model; with ``continue_from_latest_artifact=True`` both fakes must be asked to
    load from that directory.
    """
    import maris_core.training.train as train_module

    class FakeDataset:
        """In-memory stand-in for the dataset returned by the patched loader."""

        def __init__(self, items):
            self.items = list(items)
            if self.items:
                self.column_names = list(self.items[0].keys())
            else:
                self.column_names = []

        def train_test_split(self, *, test_size, seed):
            del test_size, seed
            head, tail = self.items[:1], self.items[1:]
            return {"train": FakeDataset(head), "test": FakeDataset(tail)}

        def map(self, fn, *, batched, remove_columns, desc):
            del desc, remove_columns
            assert batched is True
            columns = {
                name: [record.get(name) for record in self.items] for name in self.column_names
            }
            transformed = fn(columns)
            if transformed:
                row_count = len(next(iter(transformed.values())))
            else:
                row_count = 0
            rows = [
                {name: transformed[name][row] for name in transformed} for row in range(row_count)
            ]
            return FakeDataset(rows)

        def __len__(self):
            return len(self.items)

    # Seed an existing artifact that matches the default base model's fingerprint.
    output_dir = tmp_path / "continued-model"
    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / "config.json").write_text("{}", encoding="utf-8")
    matching_fingerprint = train_module._build_model_source_fingerprint(
        DEFAULT_TRAINING_BASE_MODEL
    )
    (output_dir / "training-config.json").write_text(
        json.dumps({train_module.MODEL_SOURCE_FINGERPRINT_KEY: matching_fingerprint}),
        encoding="utf-8",
    )

    monkeypatch.setattr(
        "maris_core.training.train.load_hf_dataset",
        lambda _: {
            "train": FakeDataset(
                [
                    {"user": "Sveiki", "assistant": "Labdien"},
                    {"user": "Kā iet?", "assistant": "Labi"},
                ]
            )
        },
    )

    captured_paths: dict[str, str] = {}

    class FakeTokenizer:
        """Tokenizer double that records where it was loaded from."""

        pad_token = None
        pad_token_id = None
        eos_token = "<eos>"
        eos_token_id = 99

        @classmethod
        def from_pretrained(cls, model_name):
            captured_paths["tokenizer"] = model_name
            return cls()

        def __call__(self, texts, *, truncation, max_length, padding):
            del texts, truncation, max_length, padding
            return {"input_ids": [[1, 2, 3]], "attention_mask": [[1, 1, 1]]}

        def save_pretrained(self, output_dir):
            Path(output_dir, "tokenizer_config.json").write_text("{}", encoding="utf-8")

    class FakeModelConfig:
        pad_token_id = None

    class FakeModel:
        """Model double that records where it was loaded from."""

        config = FakeModelConfig()

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            del kwargs
            captured_paths["model"] = model_name
            return cls()

    class FakeTrainingArguments:
        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeTrainer:
        """Trainer double returning canned metrics and writing a stub config."""

        def __init__(self, **kwargs):
            self.kwargs = kwargs

        def train(self):
            return types.SimpleNamespace(metrics={"train_loss": 0.1})

        def evaluate(self):
            return {"eval_loss": 0.2}

        def save_model(self, output_dir):
            Path(output_dir).mkdir(parents=True, exist_ok=True)
            Path(output_dir, "config.json").write_text("{}", encoding="utf-8")

    fake_transformers = types.SimpleNamespace(
        AutoModelForCausalLM=FakeModel,
        AutoTokenizer=FakeTokenizer,
        DataCollatorForLanguageModeling=lambda **kwargs: kwargs,
        Trainer=FakeTrainer,
        TrainingArguments=FakeTrainingArguments,
    )
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)

    train(
        output_dir=str(output_dir),
        continue_from_latest_artifact=True,
        max_seq_length=256,
    )

    expected_source = str(output_dir)
    assert captured_paths["tokenizer"] == expected_source
    assert captured_paths["model"] == expected_source
|
|
|
|
def test_train_does_not_auto_resume_from_incompatible_output_artifact(
    tmp_path: Path, monkeypatch
) -> None:
    """Check that a mismatching source fingerprint prevents resuming from the output dir.

    The stored fingerprint is built from a different model name than the configured
    ``model_name``, so `_resolve_training_model_source` must return the configured
    name even though ``continue_from_latest_artifact`` is enabled.
    """
    import maris_core.training.train as train_module

    output_dir = tmp_path / "incompatible-output"
    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / "config.json").write_text("{}", encoding="utf-8")

    # Fingerprint of a model other than the one requested below.
    foreign_fingerprint = train_module._build_model_source_fingerprint(
        "meta-llama/Llama-3.2-3B-Instruct"
    )
    (output_dir / "training-config.json").write_text(
        json.dumps({train_module.MODEL_SOURCE_FINGERPRINT_KEY: foreign_fingerprint}),
        encoding="utf-8",
    )

    overrides = {
        "output_dir": str(output_dir),
        "model_name": "Qwen/Qwen2.5-1.5B-Instruct",
        "continue_from_latest_artifact": True,
    }
    config = load_training_config(overrides=overrides)

    resolved_source = train_module._resolve_training_model_source(config)
    assert resolved_source == "Qwen/Qwen2.5-1.5B-Instruct"
|
|
|
|
def test_train_restores_maris_artifacts_after_push_to_hub(tmp_path: Path, monkeypatch) -> None:
    """Check the artifacts left in the output directory after a fake Hub push.

    The fake trainer's ``push_to_hub`` overwrites the README, model config and
    tokenizer files with third-party identifiers. After ``train`` returns, the
    assertions require Maris identity values in those files and exactly one
    ``create_repo`` followed by one ``upload_folder`` on the fake Hub API.
    """

    def write_json(directory, filename, payload) -> None:
        # All fake artifacts are plain UTF-8 JSON documents.
        Path(directory, filename).write_text(json.dumps(payload), encoding="utf-8")

    class FakeDataset:
        """Minimal dataset stand-in exposing the methods this test relies on."""

        def __init__(self, items):
            self.items = list(items)
            self.column_names = list(self.items[0].keys()) if self.items else []

        def train_test_split(self, *, test_size, seed):
            del test_size, seed
            # First row is the train split; the remainder is the test split.
            head, tail = self.items[:1], self.items[1:]
            return {"train": FakeDataset(head), "test": FakeDataset(tail)}

        def map(self, fn, *, batched, remove_columns, desc):
            assert batched is True
            del remove_columns, desc
            columns = {
                name: [row.get(name) for row in self.items] for name in self.column_names
            }
            mapped = fn(columns)
            row_count = len(next(iter(mapped.values()))) if mapped else 0
            rows = []
            for position in range(row_count):
                rows.append({name: mapped[name][position] for name in mapped})
            return FakeDataset(rows)

        def __len__(self):
            return len(self.items)

    class FakeTokenizer:
        pad_token = None
        pad_token_id = None
        eos_token = "<eos>"
        eos_token_id = 99

        @classmethod
        def from_pretrained(cls, model_name):
            assert model_name == DEFAULT_TRAINING_BASE_MODEL
            return cls()

        def __call__(self, texts, *, truncation, max_length, padding):
            del texts, truncation, max_length, padding
            return {"input_ids": [[1, 2, 3]], "attention_mask": [[1, 1, 1]]}

        def save_pretrained(self, output_dir):
            write_json(
                output_dir,
                "tokenizer_config.json",
                {
                    "name_or_path": "MarisUK/maris-ai-master",
                    "tokenizer_class": "Qwen2TokenizerFast",
                    "chat_template": "You are Qwen and Claude in one assistant.",
                },
            )

    class FakeModelConfig:
        pad_token_id = None

    class FakeModel:
        config = FakeModelConfig()

        @classmethod
        def from_pretrained(cls, model_name):
            assert model_name == DEFAULT_TRAINING_BASE_MODEL
            return cls()

    class FakeTrainingArguments:
        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeTrainResult:
        metrics = {"train_loss": 0.1}

    class FakeTrainer:
        """Trainer stand-in whose ``push_to_hub`` rewrites artifacts with foreign names."""

        def __init__(self, **kwargs):
            self.kwargs = kwargs

        def train(self):
            return FakeTrainResult()

        def evaluate(self):
            return {"eval_loss": 0.2}

        def save_model(self, output_dir):
            Path(output_dir).mkdir(parents=True, exist_ok=True)
            write_json(
                output_dir,
                "config.json",
                {
                    "_name_or_path": "MarisUK/maris-ai-master",
                    "model_type": "qwen2",
                    "architectures": ["Qwen2ForCausalLM"],
                },
            )
            write_json(
                output_dir,
                "adapter_config.json",
                {
                    "base_model_name_or_path": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
                    "base_model_class": "Qwen2ForCausalLM",
                    "parent_library": "transformers.models.qwen2.modeling_qwen2",
                    "description": "Adapter built from DeepSeek and Mistral.",
                },
            )

        def push_to_hub(self, **kwargs):
            del kwargs
            target_dir = Path(self.kwargs["args"].kwargs["output_dir"])
            readme_text = (
                "---\n"
                "library_name: transformers\n"
                "datasets:\n"
                "- generator\n"
                "---\n"
                "# master\n"
            )
            Path(target_dir, "README.md").write_text(readme_text, encoding="utf-8")
            write_json(
                target_dir,
                "config.json",
                {
                    "_name_or_path": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
                    "model_type": "qwen2",
                    "architectures": ["Qwen2ForCausalLM"],
                },
            )
            write_json(
                target_dir,
                "tokenizer_config.json",
                {
                    "name_or_path": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
                    "tokenizer_class": "Qwen2TokenizerFast",
                    "chat_template": "Use meta-llama/Llama-3.2-3B-Instruct with Gemini.",
                },
            )
            write_json(
                target_dir,
                "tokenizer.json",
                {
                    "model": {"type": "BPE", "unk_token": "DeepSeek-Coder"},
                    "added_tokens": [{"content": "Anthropic"}],
                },
            )
            Path(target_dir, "chat_template.jinja").write_text(
                "System prompt from Anthropic Claude and OpenAI ChatGPT.",
                encoding="utf-8",
            )

    upload_calls: list[dict[str, str]] = []

    class FakeHfApi:
        """Records ``create_repo`` and ``upload_folder`` calls in ``upload_calls``."""

        def __init__(self, token=None):
            self.token = token

        def create_repo(self, **kwargs):
            upload_calls.append({"create_repo": kwargs})

        def upload_folder(self, **kwargs):
            upload_calls.append(kwargs)

    def fake_load_hf_dataset(_):
        rows = [
            {"user": "Sveiki", "assistant": "Labdien"},
            {"user": "Kā iet?", "assistant": "Labi"},
        ]
        return {"train": FakeDataset(rows)}

    fake_transformers = types.SimpleNamespace(
        AutoModelForCausalLM=FakeModel,
        AutoTokenizer=FakeTokenizer,
        DataCollatorForLanguageModeling=lambda **kwargs: kwargs,
        Trainer=FakeTrainer,
        TrainingArguments=FakeTrainingArguments,
    )

    monkeypatch.setenv("HF_TRAIN_PUSH_TO_HUB", "true")
    monkeypatch.setenv("HF_TOKEN", "token")
    monkeypatch.setattr("maris_core.training.train.load_hf_dataset", fake_load_hf_dataset)
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)
    monkeypatch.setitem(sys.modules, "huggingface_hub", types.SimpleNamespace(HfApi=FakeHfApi))

    output_dir = tmp_path / "push-model"
    train(output_dir=str(output_dir), max_seq_length=256)

    def artifact_text(filename):
        return (output_dir / filename).read_text(encoding="utf-8")

    def artifact_json(filename):
        return json.loads(artifact_text(filename))

    readme = artifact_text("README.md")
    assert "Maris AI Model" in readme
    assert "generated_from_trainer" not in readme

    model_config = artifact_json("config.json")
    assert model_config["_name_or_path"] == "MarisUK/maris-ai-master"
    assert model_config["model_type"] == "maris"
    assert model_config["architectures"] == ["MarisCompatibleCausalLM"]

    tokenizer_config = artifact_json("tokenizer_config.json")
    assert tokenizer_config["name_or_path"] == "MarisUK/maris-ai-master"
    assert tokenizer_config["tokenizer_class"] == "MarisCompatibleTokenizer"
    assert "Maris AI" in tokenizer_config["chat_template"]

    tokenizer_payload = artifact_json("tokenizer.json")
    assert tokenizer_payload["model"]["unk_token"] == "Maris AI"
    assert tokenizer_payload["added_tokens"][0]["content"] == "Maris AI"

    adapter_config = artifact_json("adapter_config.json")
    assert adapter_config["base_model_name_or_path"] == "MarisUK/maris-ai-master"
    assert adapter_config["base_model_class"] == "MarisCompatibleCausalLM"
    assert "DeepSeek" not in adapter_config["description"]

    chat_template = artifact_text("chat_template.jinja")
    assert "Anthropic" not in chat_template
    assert "ChatGPT" not in chat_template
    assert "Maris AI" in chat_template

    assert (output_dir / MARIS_COMPATIBILITY_ARTIFACT_NAME).is_file()
    _assert_output_dir_uses_only_maris_identity(output_dir)

    expected_calls = [
        {
            "create_repo": {
                "repo_id": "MarisUK/maris-ai-master",
                "repo_type": "model",
                "exist_ok": True,
            }
        },
        {
            "folder_path": str(output_dir),
            "repo_id": "MarisUK/maris-ai-master",
            "repo_type": "model",
            "commit_message": "Maris AI artifact sync (master)",
        },
    ]
    assert upload_calls == expected_calls
|
|
|
|
def test_export_model_creates_repo_before_upload(tmp_path: Path, monkeypatch) -> None:
    """Export a single model directory and check the order of fake Hub API calls.

    The export script is loaded from ``scripts/export_to_hf.py`` and run against
    a recording ``HfApi`` fake; the expected sequence is API construction with
    the token, ``create_repo``, then ``upload_folder``.
    """
    export_script = Path(__file__).resolve().parents[1] / "scripts" / "export_to_hf.py"
    module_spec = importlib.util.spec_from_file_location("export_to_hf", export_script)
    assert module_spec is not None and module_spec.loader is not None
    export_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(export_module)

    model_dir = tmp_path / "model"
    model_dir.mkdir()
    (model_dir / "config.json").write_text("{}", encoding="utf-8")
    monkeypatch.setenv("HF_TOKEN", "token")

    calls: list[dict[str, object]] = []

    class FakeHfApi:
        """Appends every interaction to ``calls`` keyed by the method used."""

        def __init__(self, token=None):
            calls.append({"init": token})

        def create_repo(self, **kwargs):
            calls.append({"create_repo": kwargs})

        def upload_folder(self, **kwargs):
            calls.append({"upload_folder": kwargs})

    fake_hub = types.SimpleNamespace(HfApi=FakeHfApi)
    monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hub)

    export_module.export_model(str(model_dir), "MarisUK/maris-ai-master")

    expected_create = {
        "repo_id": "MarisUK/maris-ai-master",
        "repo_type": "model",
        "exist_ok": True,
    }
    expected_upload = {
        "folder_path": str(model_dir),
        "repo_id": "MarisUK/maris-ai-master",
        "repo_type": "model",
        "commit_message": "Maris AI model export",
    }
    assert calls == [
        {"init": "token"},
        {"create_repo": expected_create},
        {"upload_folder": expected_upload},
    ]
|
|
|
|
def test_export_model_publishes_branch_suite_to_runtime_repos(tmp_path: Path, monkeypatch) -> None:
    """Export a suite directory with a ``branch-suite.json`` manifest.

    The expected fake Hub API call sequence is: construction with the token,
    the suite directory uploaded to the requested repo, then one
    ``create_repo``/``upload_folder`` pair per branch in manifest order, each
    targeting that branch's runtime repo.
    """
    export_script = Path(__file__).resolve().parents[1] / "scripts" / "export_to_hf.py"
    module_spec = importlib.util.spec_from_file_location("export_to_hf", export_script)
    assert module_spec is not None and module_spec.loader is not None
    export_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(export_module)

    # Branch name -> repo the assertions expect that branch to be uploaded to.
    branch_repos = (
        ("master", "MarisUK/maris-ai-text"),
        ("coder", "MarisUK/maris-ai-codex"),
        ("image", "MarisUK/maris-ai-image"),
        ("tts", "MarisUK/maris-tts-runtime"),
    )

    suite_dir = tmp_path / "suite"
    suite_dir.mkdir()
    for branch_name, _ in branch_repos:
        branch_dir = suite_dir / branch_name
        branch_dir.mkdir()
        (branch_dir / "config.json").write_text("{}", encoding="utf-8")

    manifest = {
        "branches": {
            branch_name: {"output_dir": str(suite_dir / branch_name)}
            for branch_name, _ in branch_repos
        }
    }
    (suite_dir / "branch-suite.json").write_text(json.dumps(manifest), encoding="utf-8")
    monkeypatch.setenv("HF_TOKEN", "token")

    calls: list[dict[str, object]] = []

    class FakeHfApi:
        """Appends every interaction to ``calls`` keyed by the method used."""

        def __init__(self, token=None):
            calls.append({"init": token})

        def create_repo(self, **kwargs):
            calls.append({"create_repo": kwargs})

        def upload_folder(self, **kwargs):
            calls.append({"upload_folder": kwargs})

    monkeypatch.setitem(sys.modules, "huggingface_hub", types.SimpleNamespace(HfApi=FakeHfApi))

    export_module.export_model(str(suite_dir), "MarisUK/maris-ai-master")

    def expected_publish(repo_id, folder, commit_message):
        # One create_repo followed by one upload_folder for the same repo.
        return [
            {"create_repo": {"repo_id": repo_id, "repo_type": "model", "exist_ok": True}},
            {
                "upload_folder": {
                    "folder_path": str(folder),
                    "repo_id": repo_id,
                    "repo_type": "model",
                    "commit_message": commit_message,
                }
            },
        ]

    expected_calls = [{"init": "token"}]
    expected_calls += expected_publish(
        "MarisUK/maris-ai-master", suite_dir, "Maris AI model export"
    )
    for branch_name, repo_id in branch_repos:
        expected_calls += expected_publish(
            repo_id, suite_dir / branch_name, f"Maris AI model export ({branch_name})"
        )

    assert calls == expected_calls
|
|
|
|
def test_export_model_discovers_fallback_branch_dirs_without_manifest(
    tmp_path: Path, monkeypatch
) -> None:
    """Export a suite directory that has branch subdirectories but no manifest file.

    Only ``config.json`` files are created (suite root, ``master``, ``coder``).
    The expected fake Hub API calls are the suite upload followed by one
    ``create_repo``/``upload_folder`` pair for ``master`` and one for ``coder``.
    """
    export_script = Path(__file__).resolve().parents[1] / "scripts" / "export_to_hf.py"
    module_spec = importlib.util.spec_from_file_location("export_to_hf", export_script)
    assert module_spec is not None and module_spec.loader is not None
    export_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(export_module)

    # Branch name -> repo the assertions expect that branch to be uploaded to.
    branch_repos = (
        ("master", "MarisUK/maris-ai-text"),
        ("coder", "MarisUK/maris-ai-codex"),
    )

    suite_dir = tmp_path / "suite"
    suite_dir.mkdir()
    (suite_dir / "config.json").write_text("{}", encoding="utf-8")
    for branch_name, _ in branch_repos:
        branch_dir = suite_dir / branch_name
        branch_dir.mkdir()
        (branch_dir / "config.json").write_text("{}", encoding="utf-8")
    monkeypatch.setenv("HF_TOKEN", "token")

    calls: list[dict[str, object]] = []

    class FakeHfApi:
        """Appends every interaction to ``calls`` keyed by the method used."""

        def __init__(self, token=None):
            calls.append({"init": token})

        def create_repo(self, **kwargs):
            calls.append({"create_repo": kwargs})

        def upload_folder(self, **kwargs):
            calls.append({"upload_folder": kwargs})

    monkeypatch.setitem(sys.modules, "huggingface_hub", types.SimpleNamespace(HfApi=FakeHfApi))

    export_module.export_model(str(suite_dir), "MarisUK/maris-ai-master")

    def expected_publish(repo_id, folder, commit_message):
        # One create_repo followed by one upload_folder for the same repo.
        return [
            {"create_repo": {"repo_id": repo_id, "repo_type": "model", "exist_ok": True}},
            {
                "upload_folder": {
                    "folder_path": str(folder),
                    "repo_id": repo_id,
                    "repo_type": "model",
                    "commit_message": commit_message,
                }
            },
        ]

    expected_calls = [{"init": "token"}]
    expected_calls += expected_publish(
        "MarisUK/maris-ai-master", suite_dir, "Maris AI model export"
    )
    for branch_name, repo_id in branch_repos:
        expected_calls += expected_publish(
            repo_id, suite_dir / branch_name, f"Maris AI model export ({branch_name})"
        )

    assert calls == expected_calls
|
|
|
|
def test_train_filters_unsupported_training_arguments(
    tmp_path: Path,
    monkeypatch,
) -> None:
    """Run ``train`` against a ``TrainingArguments`` fake with a fixed keyword set.

    ``StrictTrainingArguments`` declares its keywords explicitly, so any extra
    keyword raises ``TypeError``. The test then checks that
    ``overwrite_output_dir`` was not passed and ``eval_strategy`` is ``"steps"``.
    """

    class FakeDataset:
        """Minimal dataset stand-in exposing the methods this test relies on."""

        def __init__(self, items):
            self.items = list(items)
            self.column_names = list(self.items[0].keys()) if self.items else []

        def train_test_split(self, *, test_size, seed):
            del test_size, seed
            # First row is the train split; the remainder is the test split.
            head, tail = self.items[:1], self.items[1:]
            return {"train": FakeDataset(head), "test": FakeDataset(tail)}

        def map(self, fn, *, batched, remove_columns, desc):
            assert batched is True
            del remove_columns, desc
            columns = {
                name: [row.get(name) for row in self.items] for name in self.column_names
            }
            mapped = fn(columns)
            row_count = len(next(iter(mapped.values()))) if mapped else 0
            rows = []
            for position in range(row_count):
                rows.append({name: mapped[name][position] for name in mapped})
            return FakeDataset(rows)

        def __len__(self):
            return len(self.items)

    class FakeTokenizer:
        pad_token = None
        pad_token_id = None
        eos_token = "<eos>"
        eos_token_id = 99

        @classmethod
        def from_pretrained(cls, model_name):
            assert model_name == DEFAULT_TRAINING_BASE_MODEL
            return cls()

        def __call__(self, texts, *, truncation, max_length, padding):
            assert truncation is True
            assert max_length == 256
            assert padding is False
            input_ids = [[1, 2, 3] for _ in texts]
            attention_mask = [[1, 1, 1] for _ in texts]
            return {"input_ids": input_ids, "attention_mask": attention_mask}

        def save_pretrained(self, output_dir):
            Path(output_dir, "tokenizer.json").write_text("{}", encoding="utf-8")

    class FakeModelConfig:
        pad_token_id = None

    class FakeModel:
        config = FakeModelConfig()

        @classmethod
        def from_pretrained(cls, model_name):
            assert model_name == DEFAULT_TRAINING_BASE_MODEL
            return cls()

    class StrictTrainingArguments:
        """Accepts exactly the keywords listed below and stores them in ``kwargs``."""

        def __init__(
            self,
            *,
            output_dir,
            num_train_epochs,
            learning_rate,
            per_device_train_batch_size,
            per_device_eval_batch_size,
            gradient_accumulation_steps,
            warmup_ratio,
            weight_decay,
            logging_steps,
            save_steps,
            eval_steps,
            save_total_limit,
            lr_scheduler_type,
            seed,
            fp16,
            bf16,
            report_to,
            save_safetensors,
            remove_unused_columns,
            eval_strategy,
            load_best_model_at_end,
            metric_for_best_model,
            greater_is_better,
        ):
            # Snapshot taken before any other local is bound: it holds ``self``
            # plus every keyword argument, in declaration order.
            captured = dict(locals())
            del captured["self"]
            self.kwargs = captured

    class FakeTrainResult:
        metrics = {"train_loss": 0.25}

    class FakeTrainer:
        """Trainer stand-in that remembers the most recently created instance."""

        last_instance = None

        def __init__(
            self,
            *,
            model,
            args,
            train_dataset,
            eval_dataset=None,
            data_collator=None,
        ):
            del model, data_collator
            self.args = args
            self.train_dataset = train_dataset
            self.eval_dataset = eval_dataset
            FakeTrainer.last_instance = self

        def train(self):
            return FakeTrainResult()

        def evaluate(self):
            return {"eval_loss": 0.5}

        def save_model(self, output_dir):
            model_dir = Path(output_dir)
            model_dir.mkdir(parents=True, exist_ok=True)
            (model_dir / "model.bin").write_text("ok", encoding="utf-8")

    def fake_load_hf_dataset(_):
        rows = [
            {"user": "Sveiki", "assistant": "Labdien"},
            {"user": "Kā iet?", "assistant": "Labi"},
        ]
        return {"train": FakeDataset(rows)}

    fake_transformers = types.SimpleNamespace(
        AutoModelForCausalLM=FakeModel,
        AutoTokenizer=FakeTokenizer,
        DataCollatorForLanguageModeling=lambda **kwargs: kwargs,
        Trainer=FakeTrainer,
        TrainingArguments=StrictTrainingArguments,
    )
    monkeypatch.setattr("maris_core.training.train.load_hf_dataset", fake_load_hf_dataset)
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)

    metrics = train(output_dir=str(tmp_path / "trained-model"), max_seq_length=256)

    assert metrics["eval_loss"] == 0.5
    trainer = FakeTrainer.last_instance
    assert trainer is not None
    passed_arguments = trainer.args.kwargs
    assert "overwrite_output_dir" not in passed_arguments
    assert passed_arguments["eval_strategy"] == "steps"
|
|
|
|
def test_build_branch_training_configs_creates_branch_output_dirs() -> None:
    """Check the per-branch configs derived from one base training config.

    Covers the set of branch names plus selected fields of the ``coder``,
    ``planner``, ``master`` and ``image`` configs.
    """
    base_config = load_training_config(
        overrides={
            "output_dir": "/tmp/maris-branch",
            "eval_dataset_repo": "MarisUK/maris-ai-evals",
        }
    )
    configs = build_branch_training_configs(base_config)

    def config_for(branch_name):
        # First config carrying the given branch name (StopIteration if absent).
        return next(config for config in configs if config.branch_name == branch_name)

    expected_branches = {"master", "coder", "planner", "image", "music", "tts", "stt", "video"}
    branch_names = {config.branch_name for config in configs}
    assert expected_branches == branch_names

    coder = config_for("coder")
    assert coder.output_dir.endswith("/coder")
    assert coder.eval_dataset_repo == "MarisUK/maris-ai-evals"
    assert coder.benchmark_gate_enabled is True
    assert coder.benchmark_min_overall >= 0.76
    assert coder.benchmark_dataset_path.endswith(
        "core-python/evals/coder_release_benchmark.json"
    )
    assert coder.preference_dataset_path.endswith(
        "core-python/evals/coder_preference_dataset.json"
    )
    assert coder.quality_min_text_chars >= 18
    assert coder.category_weight_map["coding"] >= 1.35
    assert coder.category_weight_map["grounding"] >= 1.25

    planner = config_for("planner")
    assert planner.benchmark_gate_enabled is True
    assert planner.benchmark_min_overall >= 0.76
    assert planner.benchmark_dataset_path.endswith(
        "core-python/evals/planner_release_benchmark.json"
    )

    master = config_for("master")
    assert master.hub_model_id == "MarisUK/maris-ai-text"
    assert master.benchmark_gate_enabled is True
    assert master.quality_min_text_chars >= 12

    image = config_for("image")
    assert image.adapter_type == "specialist_model"
    assert image.hub_model_id == "MarisUK/maris-ai-image"
|
|
|
|
def test_train_branch_suite_writes_external_manifests_for_specialists(
    tmp_path: Path,
    monkeypatch,
) -> None:
    """Run the branch suite with ``train_with_config`` stubbed out.

    Checks the per-branch statuses in the returned results, the
    ``branch-config.json`` files for ``image`` and ``tts``, and the contents of
    the ``branch-suite.json`` written to the suite output directory.
    """
    suite_root = tmp_path / "branches"
    base_config = load_training_config(overrides={"output_dir": str(suite_root)})

    def fake_train_with_config(branch_config):
        # Any metrics dict will do; the value just depends on the branch name.
        return {"branch_len": float(len(branch_config.branch_name))}

    monkeypatch.setattr("maris_core.training.train.train_with_config", fake_train_with_config)

    results = train_branch_suite(base_config)

    suite_manifest_path = suite_root / "branch-suite.json"
    branch_suite = json.loads(suite_manifest_path.read_text(encoding="utf-8"))

    assert results["master"]["status"] == "trained"
    assert results["master"]["maris_origin"] == "Maris AI"
    assert results["coder"]["status"] == "trained"
    for specialist in ("image", "tts"):
        assert results[specialist]["status"] == "external_specialist"
    for specialist in ("image", "tts"):
        assert (suite_root / specialist / "branch-config.json").is_file()
    assert suite_manifest_path.is_file()

    assert branch_suite["artifact_type"] == "branch-suite"
    assert branch_suite["maris_origin"] == "Maris AI"
    assert branch_suite["dataset_repo"] == "MarisUK/maris-ai-memory"
    suite_branches = branch_suite["branches"]
    assert suite_branches["image"]["maris_origin"] == "Maris AI"
    assert suite_branches["stt"]["maris_model_id"] == "MarisUK/maris-stt-runtime"
|
|
|
|
def test_post_training_benchmark_results_use_maris_model_id(tmp_path: Path, monkeypatch) -> None:
    """Run the post-training benchmark with the pipeline and helpers faked.

    The payload returned must report the configured ``hub_model_id`` as the
    model, both at the top level and in the single result entry.
    """
    hub_model_id = "MarisUK/maris-ai-master-trained"

    benchmark_cases = [
        {"name": "identity", "message": "Kas tu esi?", "expected_terms": ["Maris"]}
    ]
    benchmark_path = tmp_path / "benchmark.json"
    benchmark_path.write_text(json.dumps(benchmark_cases), encoding="utf-8")

    config = load_training_config(
        overrides={
            "benchmark_dataset_path": str(benchmark_path),
            "benchmark_levels": ["ci"],
            "hub_model_id": hub_model_id,
        }
    )

    class FakePipeline:
        pass

    def fake_pipeline(*args, **kwargs):
        del args, kwargs
        return FakePipeline()

    async def fake_run_chat_benchmark_with_responder(cases, *, responder, concurrency):
        del concurrency
        # Only the first case is answered; its reply becomes the single result.
        reply = await responder(cases[0])
        result = types.SimpleNamespace(model=reply["model"], response=reply["response"])
        return [result]

    def fake_build_chat_benchmark_manifest(results, *, benchmark_name, branch, model):
        first = results[0]
        return {
            "benchmark_name": benchmark_name,
            "branch": branch,
            "model": model,
            "results": [{"model": first.model, "response": first.response}],
        }

    def fake_call_generation_pipeline(*args, **kwargs):
        return [{"generated_text": "Es esmu Maris AI."}]

    monkeypatch.setitem(sys.modules, "transformers", types.SimpleNamespace(pipeline=fake_pipeline))
    replacements = {
        "run_chat_benchmark_with_responder": fake_run_chat_benchmark_with_responder,
        "call_generation_pipeline": fake_call_generation_pipeline,
        "build_chat_benchmark_manifest": fake_build_chat_benchmark_manifest,
    }
    for attribute, replacement in replacements.items():
        monkeypatch.setattr(f"maris_core.training.train.{attribute}", replacement)

    payload = asyncio.run(
        _run_post_training_benchmark(config, model_path=str(tmp_path / "trained-model"))
    )

    expected_payload = {
        "benchmark_name": config.benchmark_name,
        "branch": config.branch_name,
        "model": hub_model_id,
        "results": [{"model": hub_model_id, "response": "Es esmu Maris AI."}],
    }
    assert payload == expected_payload
|
|
|
|
def test_post_training_benchmark_filters_cases_by_branch(tmp_path: Path, monkeypatch) -> None:
    """Run the post-training benchmark for the ``coder`` branch.

    The benchmark file holds one ``master`` case and one ``coder`` case; only
    the ``coder`` case may reach the (faked) benchmark runner.
    """
    master_case = {
        "name": "master-case",
        "message": "Sveiki",
        "branches": ["master"],
        "level": "ci",
    }
    coder_case = {
        "name": "coder-case",
        "message": "Uzraksti Python helperi",
        "profile": "coder",
        "branches": ["coder"],
        "level": "ci",
    }
    benchmark_path = tmp_path / "benchmark.json"
    benchmark_path.write_text(json.dumps([master_case, coder_case]), encoding="utf-8")

    config = load_training_config(
        overrides={
            "branch_name": "coder",
            "benchmark_dataset_path": str(benchmark_path),
            "benchmark_levels": ["ci"],
        }
    )

    class FakePipeline:
        pass

    captured_case_names: list[str] = []

    def fake_pipeline(*args, **kwargs):
        del args, kwargs
        return FakePipeline()

    async def fake_run_chat_benchmark_with_responder(cases, *, responder, concurrency):
        del responder, concurrency
        # Record which cases were handed to the runner; produce no results.
        for case in cases:
            captured_case_names.append(case.name)
        return []

    def fake_build_chat_benchmark_manifest(results, *, benchmark_name, branch, model):
        return {
            "benchmark_name": benchmark_name,
            "branch": branch,
            "model": model,
            "results": results,
        }

    monkeypatch.setitem(sys.modules, "transformers", types.SimpleNamespace(pipeline=fake_pipeline))
    monkeypatch.setattr(
        "maris_core.training.train.run_chat_benchmark_with_responder",
        fake_run_chat_benchmark_with_responder,
    )
    monkeypatch.setattr(
        "maris_core.training.train.build_chat_benchmark_manifest",
        fake_build_chat_benchmark_manifest,
    )

    trained_model_path = str(tmp_path / "trained-model")
    asyncio.run(_run_post_training_benchmark(config, model_path=trained_model_path))

    assert captured_case_names == ["coder-case"]
|
|
|
|
def test_filter_records_for_branch_keeps_coder_specific_mix() -> None:
    """Filter a conversation, a code and an autonomous record for the ``coder`` branch.

    Only the ``code`` record is expected to remain; the report must count one
    kept and two dropped records.
    """
    conversation_record = {
        "type": "conversation",
        "user": "Sveiki",
        "assistant": "Čau",
    }
    code_record = {
        "type": "code",
        "prompt": "Salabo parseri",
        "metadata": {"language": "python", "task": "bugfix", "project_area": "core-python"},
    }
    autonomous_record = {
        "type": "autonomous",
        "prompt": "Investigate CI",
        "metadata": {"workflow": "ci-triage", "project_area": "operations"},
    }

    filtered, report = _filter_records_for_branch(
        [conversation_record, code_record, autonomous_record],
        branch_name="coder",
        split_name="train",
    )

    assert len(filtered) == 1
    assert filtered[0]["type"] == "code"
    assert report.kept_records == 1
    assert report.dropped_records == 2
|
|
|
|
def test_filter_records_for_branch_keeps_master_general_mix() -> None:
    """Filter three records for the ``master`` branch.

    Only the plain conversation record is expected to remain; the records
    tagged with the ``coder`` profile or the ``planner`` branch are dropped.
    """
    conversation_record = {"type": "conversation", "user": "Sveiki", "assistant": "Čau"}
    coder_record = {"type": "code", "prompt": "Uzraksti helperi", "profile": "coder"}
    planner_record = {"type": "autonomous", "prompt": "Plan sprint", "branch": "planner"}

    filtered, report = _filter_records_for_branch(
        [conversation_record, coder_record, planner_record],
        branch_name="master",
        split_name="train",
    )

    kept_types = [record["type"] for record in filtered]
    assert kept_types == ["conversation"]
    assert report.kept_records == 1
    assert report.dropped_records == 2
|
|
|
|
def test_filter_records_for_branch_uses_custom_rule_map() -> None:
    """Filter for ``coder`` with a caller-supplied rule map.

    The custom rules include only ``conversation`` records for the ``coder``
    branch, so the conversation record is kept and the code record is dropped.
    """
    custom_rules = {
        "coder": {
            "include_record_types": ["conversation"],
            "exclude_explicit_branches": ["planner"],
        }
    }
    conversation_record = {"type": "conversation", "user": "Sveiki", "assistant": "Čau"}
    coder_record = {"type": "code", "prompt": "Uzraksti helperi", "profile": "coder"}

    filtered, report = _filter_records_for_branch(
        [conversation_record, coder_record],
        branch_name="coder",
        split_name="train",
        branch_filter_rules=custom_rules,
    )

    kept_types = [record["type"] for record in filtered]
    assert kept_types == ["conversation"]
    assert report.kept_records == 1
    assert report.dropped_records == 1
|
|
|
|
def test_filter_preference_examples_for_branch_keeps_coder_examples_only() -> None:
    """Filter the coder preference dataset for the ``coder`` branch.

    The result must be non-empty and every remaining example must carry the
    ``coder`` branch (compared case-insensitively).
    """
    # Locate the dataset relative to this test file rather than through an
    # absolute CI-runner path, so the test also runs on developer checkouts.
    # The nearest ancestor directory containing ``evals/<dataset>`` wins.
    dataset_relpath = Path("evals") / "coder_preference_dataset.json"
    examples_path = next(
        (
            ancestor / dataset_relpath
            for ancestor in Path(__file__).resolve().parents
            if (ancestor / dataset_relpath).is_file()
        ),
        None,
    )
    assert examples_path is not None, f"{dataset_relpath} not found above {__file__}"
    examples = load_preference_dataset(examples_path)

    filtered = _filter_preference_examples_for_branch(
        examples,
        branch_name="coder",
    )

    assert filtered
    assert all((example.branch or "").lower() == "coder" for example in filtered)
|
|
|
|
def test_filter_preference_examples_for_branch_uses_custom_rule_map() -> None:
    """Filter the coder preference dataset for ``planner`` with custom rules.

    The custom rule map includes only the ``repo-level`` task type, so at least
    one example must remain and every remaining example must have that type.
    """
    # Locate the dataset relative to this test file rather than through an
    # absolute CI-runner path, so the test also runs on developer checkouts.
    # The nearest ancestor directory containing ``evals/<dataset>`` wins.
    dataset_relpath = Path("evals") / "coder_preference_dataset.json"
    examples_path = next(
        (
            ancestor / dataset_relpath
            for ancestor in Path(__file__).resolve().parents
            if (ancestor / dataset_relpath).is_file()
        ),
        None,
    )
    assert examples_path is not None, f"{dataset_relpath} not found above {__file__}"
    examples = load_preference_dataset(examples_path)

    filtered = _filter_preference_examples_for_branch(
        examples,
        branch_name="planner",
        branch_filter_rules={
            "planner": {
                "include_task_types": ["repo-level"],
            }
        },
    )

    assert len(filtered) >= 1
    assert all(example.task_type == "repo-level" for example in filtered)
|
|
|
|
def test_train_uses_external_eval_dataset_when_configured(tmp_path: Path, monkeypatch) -> None:
    """Run ``train`` with a separate evaluation dataset repo configured.

    The fake loader records every requested repo id. The test expects the
    memory repo to be loaded first and the evals repo second, and checks the
    sizes of the datasets handed to the fake trainer.
    """
    dataset_calls: list[str] = []

    class FakeSplit(list):
        """List-backed split with the ``map``/``train_test_split`` methods used here."""

        column_names = ["text"]

        def map(self, function, **kwargs):
            del kwargs
            texts = [row["text"] for row in self]
            mapped = function({"text": texts})
            row_count = len(next(iter(mapped.values()))) if mapped else 0
            rows = []
            for position in range(row_count):
                rows.append({name: column[position] for name, column in mapped.items()})
            return FakeSplit(rows)

        def train_test_split(self, *, test_size, seed):
            del test_size, seed
            # Everything but the last row trains (at least one row); the rest evaluates.
            cut = max(1, len(self) - 1)
            return {"train": FakeSplit(self[:cut]), "test": FakeSplit(self[cut:])}

    def fake_load_hf_dataset(repo_id: str):
        dataset_calls.append(repo_id)
        if repo_id == "MarisUK/maris-ai-memory":
            rows = [{"text": "train-1"}, {"text": "train-2"}]
        elif repo_id == "MarisUK/maris-ai-evals":
            rows = [{"text": "eval-1"}]
        else:
            raise AssertionError(f"Unexpected repo id: {repo_id}")
        return {"train": FakeSplit(rows)}

    class FakeTokenizer:
        pad_token = None
        eos_token = "<eos>"
        pad_token_id = None
        eos_token_id = 7

        @classmethod
        def from_pretrained(cls, model_name):
            del model_name
            return cls()

        def __call__(self, texts, **kwargs):
            del kwargs
            input_ids = [[position + 1] for position, _ in enumerate(texts)]
            attention_mask = [[1] for _ in texts]
            return {"input_ids": input_ids, "attention_mask": attention_mask}

        def save_pretrained(self, output_dir):
            tokenizer_dir = Path(output_dir)
            tokenizer_dir.mkdir(parents=True, exist_ok=True)
            (tokenizer_dir / "tokenizer.json").write_text("{}", encoding="utf-8")

    class FakeModel:
        config = types.SimpleNamespace(pad_token_id=None)

        @classmethod
        def from_pretrained(cls, model_name):
            del model_name
            return cls()

    class FakeTrainingArguments:
        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeTrainer:
        """Trainer stand-in that remembers the most recently created instance."""

        last_instance = None

        def __init__(self, *, model, args, train_dataset, eval_dataset=None, data_collator=None):
            del model, data_collator
            self.args = args
            self.train_dataset = train_dataset
            self.eval_dataset = eval_dataset
            FakeTrainer.last_instance = self

        def train(self):
            return types.SimpleNamespace(metrics={"train_loss": 0.1})

        def evaluate(self):
            return {"eval_loss": 0.2}

        def save_model(self, output_dir):
            model_dir = Path(output_dir)
            model_dir.mkdir(parents=True, exist_ok=True)
            (model_dir / "model.bin").write_text("ok", encoding="utf-8")

    fake_transformers = types.SimpleNamespace(
        AutoModelForCausalLM=FakeModel,
        AutoTokenizer=FakeTokenizer,
        DataCollatorForLanguageModeling=lambda **kwargs: kwargs,
        Trainer=FakeTrainer,
        TrainingArguments=FakeTrainingArguments,
    )
    monkeypatch.setattr("maris_core.training.train.load_hf_dataset", fake_load_hf_dataset)
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)

    metrics = train(
        output_dir=str(tmp_path / "trained-model"),
        dataset_repos=["MarisUK/maris-ai-memory"],
        eval_dataset_repo="MarisUK/maris-ai-evals",
        eval_dataset_repos=["MarisUK/maris-ai-evals"],
    )

    assert metrics["eval_loss"] == 0.2
    assert dataset_calls == ["MarisUK/maris-ai-memory", "MarisUK/maris-ai-evals"]
    trainer = FakeTrainer.last_instance
    assert trainer is not None
    assert len(trainer.train_dataset) == 1
    assert len(trainer.eval_dataset) == 1
|
|
|
|
def test_train_merges_multiple_dataset_repos_for_training_and_eval(
    tmp_path: Path,
    monkeypatch,
) -> None:
    """Verify ``train`` requests every listed repo and hands merged rows to the trainer.

    An in-memory loader serves four training repos and two eval repos. The test
    checks the order of loader calls and that the fake trainer receives four
    training rows and two eval rows.
    """
    training_repos = (
        "MarisUK/maris-ai-memory",
        "MarisUK/maris-ai-lv-memory",
        "MarisUK/maris-ai-evals",
        "MarisUK/maris-ai-benchmark",
    )
    eval_repos = (
        "MarisUK/maris-ai-evals",
        "MarisUK/maris-ai-benchmark",
    )
    requested_repo_ids: list[str] = []

    class FakeSplit(list):
        """List-backed dataset split that exposes a single ``text`` column."""

        column_names = ["text"]

        def map(self, function, **kwargs):
            """Run ``function`` once over the whole split and rebuild row dicts."""
            del kwargs
            texts = [row["text"] for row in self]
            columns = function({"text": texts})
            # The first returned column decides how many rows are rebuilt.
            if columns:
                row_count = len(next(iter(columns.values())))
            else:
                row_count = 0
            rebuilt = []
            for position in range(row_count):
                rebuilt.append({name: values[position] for name, values in columns.items()})
            return FakeSplit(rebuilt)

    repo_rows = {
        "MarisUK/maris-ai-memory": {
            "train": [{"text": "memory-train"}],
            "validation": [{"text": "memory-val"}],
        },
        "MarisUK/maris-ai-lv-memory": {
            "train": [{"text": "lv-train"}],
            "validation": [{"text": "lv-val"}],
        },
        "MarisUK/maris-ai-evals": {
            "train": [{"text": "eval-train"}],
            "validation": [{"text": "eval-val"}],
        },
        "MarisUK/maris-ai-benchmark": {
            "train": [{"text": "bench-train"}],
            "validation": [{"text": "bench-val"}],
        },
    }

    def fake_load_hf_dataset(repo_id: str):
        """Record the requested repo and return its splits as ``FakeSplit`` objects."""
        requested_repo_ids.append(repo_id)
        payload = repo_rows.get(repo_id)
        if payload is None:
            raise AssertionError(f"Unexpected repo id: {repo_id}")
        splits = {}
        for split_name, records in payload.items():
            splits[split_name] = FakeSplit(records)
        return splits

    class FakeTokenizer:
        """Tokenizer stand-in that emits one single-token encoding per text."""

        eos_token = "<eos>"
        eos_token_id = 7
        pad_token = None
        pad_token_id = None

        @classmethod
        def from_pretrained(cls, model_name):
            del model_name
            return cls()

        def __call__(self, texts, **kwargs):
            del kwargs
            token_ids = [[ordinal] for ordinal, _ in enumerate(texts, start=1)]
            masks = [[1] for _ in texts]
            return {"input_ids": token_ids, "attention_mask": masks}

        def save_pretrained(self, output_dir):
            destination = Path(output_dir)
            destination.mkdir(parents=True, exist_ok=True)
            (destination / "tokenizer.json").write_text("{}", encoding="utf-8")

    class FakeModel:
        """Model stand-in carrying only the ``config.pad_token_id`` attribute."""

        config = types.SimpleNamespace(pad_token_id=None)

        @classmethod
        def from_pretrained(cls, model_name):
            del model_name
            return cls()

    class FakeTrainingArguments:
        """Keeps whatever keyword arguments it was constructed with."""

        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeTrainer:
        """Trainer stand-in that remembers its datasets for later inspection."""

        last_instance = None

        def __init__(self, *, model, args, train_dataset, eval_dataset=None, data_collator=None):
            del model, data_collator
            self.args = args
            self.train_dataset = train_dataset
            self.eval_dataset = eval_dataset
            FakeTrainer.last_instance = self

        def train(self):
            return types.SimpleNamespace(metrics={"train_loss": 0.1})

        def evaluate(self):
            return {"eval_loss": 0.2}

        def save_model(self, output_dir):
            destination = Path(output_dir)
            destination.mkdir(parents=True, exist_ok=True)
            (destination / "model.bin").write_text("ok", encoding="utf-8")

    monkeypatch.setattr("maris_core.training.train.load_hf_dataset", fake_load_hf_dataset)
    fake_transformers = types.SimpleNamespace(
        AutoModelForCausalLM=FakeModel,
        AutoTokenizer=FakeTokenizer,
        DataCollatorForLanguageModeling=lambda **kwargs: kwargs,
        Trainer=FakeTrainer,
        TrainingArguments=FakeTrainingArguments,
    )
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)

    # Fresh lists are passed so the expected order below cannot be affected
    # by anything the callee does with its arguments.
    metrics = train(
        output_dir=str(tmp_path / "trained-model"),
        dataset_repo="MarisUK/maris-ai-memory",
        dataset_repos=list(training_repos),
        eval_dataset_repo="MarisUK/maris-ai-evals",
        eval_dataset_repos=list(eval_repos),
    )

    assert metrics["eval_loss"] == 0.2
    assert requested_repo_ids == [*training_repos, *eval_repos]
    trainer = FakeTrainer.last_instance
    assert trainer is not None
    assert len(trainer.train_dataset) == 4
    assert len(trainer.eval_dataset) == 2
|
|
|
|
def test_evaluate_with_config_prefers_external_eval_dataset(tmp_path: Path, monkeypatch) -> None:
    """Verify ``evaluate_with_config`` evaluates on the configured eval repo only.

    A model directory is seeded with Qwen2-style config files and then passed
    through the Maris compatibility helpers. The fake loaders assert that they
    are pointed at a different directory whose files still carry the Qwen2
    values, while the final assertions confirm the original directory keeps
    the Maris identity and gains the benchmark artifacts.
    """
    requested_repo_ids: list[str] = []
    trained_model_dir = tmp_path / "trained-model"
    trained_model_dir.mkdir(parents=True, exist_ok=True)

    model_config_payload = {
        "_name_or_path": "MarisUK/maris-ai-master",
        "model_type": "qwen2",
        "architectures": ["Qwen2ForCausalLM"],
        "tokenizer_class": "Qwen2TokenizerFast",
        "auto_map": {"AutoModelForCausalLM": "qwen2.modeling_qwen2.Qwen2ForCausalLM"},
    }
    tokenizer_config_payload = {
        "name_or_path": "MarisUK/maris-ai-master",
        "tokenizer_class": "Qwen2TokenizerFast",
    }
    (trained_model_dir / "config.json").write_text(
        json.dumps(model_config_payload),
        encoding="utf-8",
    )
    (trained_model_dir / "tokenizer_config.json").write_text(
        json.dumps(tokenizer_config_payload),
        encoding="utf-8",
    )
    write_maris_compatibility_artifact(
        trained_model_dir,
        maris_model_id="MarisUK/maris-ai-master",
    )
    apply_maris_compatibility_identity(trained_model_dir)

    class FakeSplit(list):
        """List-backed dataset split that exposes a single ``text`` column."""

        column_names = ["text"]

        def map(self, function, **kwargs):
            """Run ``function`` once over the whole split and rebuild row dicts."""
            del kwargs
            texts = [row["text"] for row in self]
            columns = function({"text": texts})
            if columns:
                row_count = len(next(iter(columns.values())))
            else:
                row_count = 0
            rebuilt = []
            for position in range(row_count):
                rebuilt.append({name: values[position] for name, values in columns.items()})
            return FakeSplit(rebuilt)

    def fake_load_hf_dataset(repo_id: str):
        """Serve two eval rows for the eval repo and reject any other repo."""
        requested_repo_ids.append(repo_id)
        if repo_id != "MarisUK/maris-ai-evals":
            raise AssertionError(f"Unexpected repo id: {repo_id}")
        return {"train": FakeSplit([{"text": "eval-1"}, {"text": "eval-2"}])}

    class FakeTokenizer:
        """Tokenizer stand-in that validates the directory it is loaded from."""

        eos_token = "<eos>"
        eos_token_id = 7
        pad_token = None
        pad_token_id = None

        @classmethod
        def from_pretrained(cls, model_name):
            source_dir = Path(model_name)
            # Loading must not happen straight from the identity-rewritten directory.
            assert source_dir != trained_model_dir
            raw_config = (source_dir / "tokenizer_config.json").read_text(encoding="utf-8")
            tokenizer_config = json.loads(raw_config)
            assert tokenizer_config["tokenizer_class"] == "Qwen2TokenizerFast"
            return cls()

        def __call__(self, texts, **kwargs):
            del kwargs
            token_ids = [[ordinal] for ordinal, _ in enumerate(texts, start=1)]
            masks = [[1] for _ in texts]
            return {"input_ids": token_ids, "attention_mask": masks}

    class FakeModel:
        """Model stand-in that validates the directory it is loaded from."""

        @classmethod
        def from_pretrained(cls, model_name):
            source_dir = Path(model_name)
            assert source_dir != trained_model_dir
            raw_config = (source_dir / "config.json").read_text(encoding="utf-8")
            model_config = json.loads(raw_config)
            assert model_config["model_type"] == "qwen2"
            assert model_config["architectures"] == ["Qwen2ForCausalLM"]
            return cls()

    class FakeTrainingArguments:
        """Keeps whatever keyword arguments it was constructed with."""

        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeTrainer:
        """Evaluation-only trainer stand-in returning a fixed loss."""

        def __init__(self, *, model, args, eval_dataset=None, data_collator=None):
            del model, args, data_collator
            self.eval_dataset = eval_dataset

        def evaluate(self):
            return {"eval_loss": 0.3}

    monkeypatch.setattr("maris_core.training.train.load_hf_dataset", fake_load_hf_dataset)
    fake_transformers = types.SimpleNamespace(
        AutoModelForCausalLM=FakeModel,
        AutoTokenizer=FakeTokenizer,
        DataCollatorForLanguageModeling=lambda **kwargs: kwargs,
        Trainer=FakeTrainer,
        TrainingArguments=FakeTrainingArguments,
    )
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)

    config_overrides = {
        "output_dir": str(trained_model_dir),
        "eval_dataset_repo": "MarisUK/maris-ai-evals",
        "eval_dataset_repos": ["MarisUK/maris-ai-evals"],
        "benchmark_dataset_path": str(tmp_path / "benchmark.json"),
        "benchmark_levels": ["ci"],
    }
    config = load_training_config(overrides=config_overrides)

    async def fake_benchmark(config, *, model_path):
        """Return a canned benchmark manifest for the trained model path."""
        assert model_path.endswith("trained-model")
        score_manifest = {
            "overall": 0.79,
            "reasoning": 0.76,
            "factuality": 0.75,
            "helpfulness": 0.8,
            "execution": 0.75,
        }
        return {
            "artifact_type": "chat-benchmark-manifest",
            "benchmark_name": config.benchmark_name,
            "branch": config.branch_name,
            "model": config.hub_model_id,
            "generated_at": "2026-04-16T00:00:00Z",
            "score_manifest": score_manifest,
            "category_scores": {"coding": 0.74},
            "execution_language_pass_rates": {"python": 1.0},
            "execution_language_scores": {"python": 0.74},
            "category_execution_pass_rates": {"coding": 1.0},
        }

    monkeypatch.setattr("maris_core.training.train._run_post_training_benchmark", fake_benchmark)

    metrics = evaluate_with_config(config, model_path=str(trained_model_dir))

    assert metrics["eval_loss"] == 0.3
    assert metrics["eval_examples"] == 2.0
    assert metrics["benchmark_overall"] == 0.79
    assert metrics["benchmark_gate_passed"] == 1.0
    assert metrics["benchmark_regressions"] == 0.0
    assert requested_repo_ids == ["MarisUK/maris-ai-evals"]
    assert (trained_model_dir / "benchmark-manifest.json").is_file()
    assert (trained_model_dir / "benchmark-history.json").is_file()
    assert (trained_model_dir / "benchmark-regression-report.json").is_file()

    # After evaluation the on-disk files must still show the Maris identity.
    final_model_config = json.loads(
        (trained_model_dir / "config.json").read_text(encoding="utf-8")
    )
    assert final_model_config["model_type"] == "maris"
    final_tokenizer_config = json.loads(
        (trained_model_dir / "tokenizer_config.json").read_text(encoding="utf-8")
    )
    assert final_tokenizer_config["tokenizer_class"] == "MarisCompatibleTokenizer"
|
|
|
|
def test_load_training_config_reads_peft_and_preference_optimization_settings(
    tmp_path: Path,
) -> None:
    """Verify PEFT/QLoRA and preference settings survive a JSON config round trip.

    The settings are written to a JSON file, loaded through
    ``load_training_config`` and compared field by field.
    """
    settings = {
        "adapter_type": "qlora",
        "lora_r": 32,
        "lora_alpha": 64,
        "lora_dropout": 0.15,
        "lora_bias": "all",
        "peft_target_modules": ["q_proj", "v_proj"],
        "qlora_quant_type": "fp4",
        "qlora_use_double_quant": False,
        "qlora_compute_dtype": "bfloat16",
        "preference_dataset_path": "/tmp/preferences.json",
        "preference_optimization": "dpo",
        "preference_beta": 0.25,
        "preference_max_prompt_length": 256,
        "preference_max_length": 768,
        "preference_reference_model": "MarisUK/maris-ai-master",
    }
    config_path = tmp_path / "training.json"
    config_path.write_text(json.dumps(settings), encoding="utf-8")

    config = load_training_config(str(config_path))

    equality_checked_fields = (
        "adapter_type",
        "lora_r",
        "lora_alpha",
        "lora_dropout",
        "lora_bias",
        "peft_target_modules",
        "qlora_quant_type",
        "qlora_compute_dtype",
        "preference_optimization",
        "preference_beta",
        "preference_max_prompt_length",
        "preference_max_length",
        "preference_reference_model",
    )
    for field_name in equality_checked_fields:
        assert getattr(config, field_name) == settings[field_name], field_name
    # The boolean is checked by identity so a falsy non-bool value would not pass.
    assert config.qlora_use_double_quant is False
|
|
|
|
def test_train_runs_qlora_and_dpo_preference_stage(tmp_path: Path, monkeypatch) -> None:
    """Run ``train`` with a QLoRA adapter plus a DPO preference stage on fakes.

    ``transformers``, ``peft`` and ``trl`` are replaced with recording
    stand-ins. The assertions cover the returned metrics, the 4-bit
    quantization and LoRA settings that were requested, and that the DPO
    trainer received a ``ref_model`` and a one-example preference dataset.
    """

    class FakeDataset:
        """Minimal dataset wrapper over a list of record dicts."""

        def __init__(self, items):
            self.items = list(items)
            if self.items:
                self.column_names = list(self.items[0].keys())
            else:
                self.column_names = []

        def train_test_split(self, *, test_size, seed):
            """Put the first record in ``train`` and the remainder in ``test``."""
            del test_size, seed
            head, tail = self.items[:1], self.items[1:]
            return {"train": FakeDataset(head), "test": FakeDataset(tail)}

        def map(self, fn, *, batched, remove_columns, desc):
            """Apply ``fn`` to all records as one column batch and rebuild rows."""
            assert batched is True
            del remove_columns, desc
            columns = {
                name: [record.get(name) for record in self.items] for name in self.column_names
            }
            transformed = fn(columns)
            if transformed:
                row_count = len(next(iter(transformed.values())))
            else:
                row_count = 0
            rebuilt = []
            for position in range(row_count):
                rebuilt.append({name: transformed[name][position] for name in transformed})
            return FakeDataset(rebuilt)

        def __len__(self):
            return len(self.items)

    def fake_load_hf_dataset(_):
        """Return a fresh two-record training split for any repo id."""
        records = [
            {"user": "Sveiki", "assistant": "Labdien"},
            {"user": "Kas jauns?", "assistant": "Viss kārtībā"},
        ]
        return {"train": FakeDataset(records)}

    monkeypatch.setattr("maris_core.training.train.load_hf_dataset", fake_load_hf_dataset)

    preference_examples = [
        {
            "prompt": "Atbildi korekti",
            "chosen": "Šī ir labākā atbilde.",
            "rejected": "Nē.",
            "source": "human_review",
        }
    ]
    preference_dataset_path = tmp_path / "preferences.json"
    preference_dataset_path.write_text(json.dumps(preference_examples), encoding="utf-8")

    # Call logs filled in by the fakes below and inspected at the end.
    model_load_calls: list[dict[str, object]] = []
    bnb_calls: list[dict[str, object]] = []
    lora_config_calls: list[dict[str, object]] = []

    class FakeTokenizer:
        """Tokenizer stand-in that emits a fixed three-token encoding per text."""

        pad_token = None
        pad_token_id = None
        eos_token = "<eos>"
        eos_token_id = 99

        @classmethod
        def from_pretrained(cls, model_name):
            return cls()

        def __call__(self, texts, *, truncation, max_length, padding):
            del truncation, max_length, padding
            token_ids = [[1, 2, 3] for _ in texts]
            masks = [[1, 1, 1] for _ in texts]
            return {"input_ids": token_ids, "attention_mask": masks}

        def save_pretrained(self, output_dir):
            destination = Path(output_dir)
            destination.mkdir(parents=True, exist_ok=True)
            (destination / "tokenizer_config.json").write_text("{}", encoding="utf-8")

    class FakeBitsAndBytesConfig:
        """Records the quantization keyword arguments it is built with."""

        def __init__(self, **kwargs):
            bnb_calls.append(kwargs)
            self.kwargs = kwargs

    class FakeModel:
        """Model stand-in exposing the attributes the PEFT fakes write to."""

        def __init__(self):
            self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True)
            self.prepared_for_kbit = False
            self.peft_config = None
            self.trainable_parameters_printed = False

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            model_load_calls.append({"model_name": model_name, "kwargs": kwargs})
            return cls()

        def print_trainable_parameters(self):
            self.trainable_parameters_printed = True

    class FakeTrainingArguments:
        """Keeps whatever keyword arguments it was constructed with."""

        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeTrainResult:
        """Canned result object returned by ``FakeTrainer.train``."""

        metrics = {"train_loss": 0.2}

    class FakeTrainer:
        """Supervised trainer stand-in with fixed train/eval metrics."""

        def __init__(self, *, model, args, train_dataset, eval_dataset=None, data_collator=None):
            del data_collator
            self.model = model
            self.args = args
            self.train_dataset = train_dataset
            self.eval_dataset = eval_dataset

        def train(self):
            return FakeTrainResult()

        def evaluate(self):
            return {"eval_loss": 0.4}

        def save_model(self, output_dir):
            destination = Path(output_dir)
            destination.mkdir(parents=True, exist_ok=True)
            (destination / "adapter_config.json").write_text("{}", encoding="utf-8")
            (destination / "config.json").write_text("{}", encoding="utf-8")

    fake_transformers = types.SimpleNamespace(
        AutoModelForCausalLM=FakeModel,
        AutoTokenizer=FakeTokenizer,
        BitsAndBytesConfig=FakeBitsAndBytesConfig,
        DataCollatorForLanguageModeling=lambda **kwargs: kwargs,
        Trainer=FakeTrainer,
        TrainingArguments=FakeTrainingArguments,
    )
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)

    class FakeLoraConfig:
        """Records the LoRA keyword arguments it is built with."""

        def __init__(self, **kwargs):
            lora_config_calls.append(kwargs)
            self.kwargs = kwargs

    class FakeAutoPeftModelForCausalLM:
        """Adapter-aware loader whose calls are tagged with ``auto_peft``."""

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            model_load_calls.append({"model_name": model_name, "kwargs": kwargs, "auto_peft": True})
            return FakeModel()

    def fake_prepare_model_for_kbit_training(model, use_gradient_checkpointing):
        model.prepared_for_kbit = use_gradient_checkpointing
        return model

    def fake_get_peft_model(model, peft_config):
        model.peft_config = peft_config
        return model

    fake_peft = types.SimpleNamespace(
        AutoPeftModelForCausalLM=FakeAutoPeftModelForCausalLM,
        LoraConfig=FakeLoraConfig,
        TaskType=types.SimpleNamespace(CAUSAL_LM="CAUSAL_LM"),
        get_peft_model=fake_get_peft_model,
        prepare_model_for_kbit_training=fake_prepare_model_for_kbit_training,
    )
    monkeypatch.setitem(sys.modules, "peft", fake_peft)

    class FakeDPOConfig:
        """Keeps whatever keyword arguments it was constructed with."""

        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeDPOTrainer:
        """DPO trainer stand-in that remembers its constructor arguments."""

        last_instance = None

        def __init__(self, **kwargs):
            self.kwargs = kwargs
            FakeDPOTrainer.last_instance = self

        def train(self):
            return types.SimpleNamespace(metrics={"loss": 0.12})

        def save_model(self, output_dir):
            destination = Path(output_dir)
            destination.mkdir(parents=True, exist_ok=True)
            (destination / "adapter_config.json").write_text("{}", encoding="utf-8")
            (destination / "config.json").write_text("{}", encoding="utf-8")

    fake_trl = types.SimpleNamespace(DPOConfig=FakeDPOConfig, DPOTrainer=FakeDPOTrainer)
    monkeypatch.setitem(sys.modules, "trl", fake_trl)

    output_dir = tmp_path / "trained-model"
    metrics = train(
        output_dir=str(output_dir),
        max_seq_length=256,
        adapter_type="qlora",
        qlora_compute_dtype="float16",
        qlora_quant_type="nf4",
        qlora_use_double_quant=True,
        lora_r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        peft_target_modules=["q_proj", "v_proj"],
        preference_dataset_path=str(preference_dataset_path),
        preference_optimization="dpo",
        preference_beta=0.2,
        preference_max_prompt_length=128,
        preference_max_length=512,
    )

    assert metrics["train_loss"] == 0.2
    assert metrics["preference_loss"] == 0.12
    assert metrics["preference_examples"] == 1.0
    assert metrics["preference_stage"] == 1.0

    first_bnb_call = bnb_calls[0]
    assert first_bnb_call["load_in_4bit"] is True
    assert first_bnb_call["bnb_4bit_quant_type"] == "nf4"

    first_lora_call = lora_config_calls[0]
    assert first_lora_call["r"] == 8
    assert first_lora_call["lora_alpha"] == 16
    assert first_lora_call["target_modules"] == ["q_proj", "v_proj"]

    dpo_trainer = FakeDPOTrainer.last_instance
    assert dpo_trainer is not None
    assert "ref_model" in dpo_trainer.kwargs
    assert len(dpo_trainer.kwargs["train_dataset"]) == 1
    # At least one load must have gone through the adapter-aware loader.
    assert any(call.get("auto_peft") for call in model_load_calls)
|
|
|
|
def test_train_runs_orpo_preference_stage_without_reference_model(
    tmp_path: Path,
    monkeypatch,
) -> None:
    """Run ``train`` with an ORPO preference stage on fakes.

    The assertions check the reported preference loss and that the ORPO
    trainer was constructed without a ``ref_model`` keyword.
    """

    class FakeDataset:
        """Minimal dataset wrapper over a list of record dicts."""

        def __init__(self, items):
            self.items = list(items)
            if self.items:
                self.column_names = list(self.items[0].keys())
            else:
                self.column_names = []

        def train_test_split(self, *, test_size, seed):
            """Put the first record in ``train`` and the remainder in ``test``."""
            del test_size, seed
            head, tail = self.items[:1], self.items[1:]
            return {"train": FakeDataset(head), "test": FakeDataset(tail)}

        def map(self, fn, *, batched, remove_columns, desc):
            """Apply ``fn`` to all records as one column batch and rebuild rows."""
            del batched, remove_columns, desc
            columns = {
                name: [record.get(name) for record in self.items] for name in self.column_names
            }
            transformed = fn(columns)
            if transformed:
                row_count = len(next(iter(transformed.values())))
            else:
                row_count = 0
            rebuilt = []
            for position in range(row_count):
                rebuilt.append({name: transformed[name][position] for name in transformed})
            return FakeDataset(rebuilt)

        def __len__(self):
            return len(self.items)

    def fake_load_hf_dataset(_):
        """Return a fresh two-record training split for any repo id."""
        records = [
            {"user": "Sveiki", "assistant": "Labdien"},
            {"user": "Kā iet?", "assistant": "Labi"},
        ]
        return {"train": FakeDataset(records)}

    monkeypatch.setattr("maris_core.training.train.load_hf_dataset", fake_load_hf_dataset)

    preference_examples = [
        {
            "prompt": "Atbildi pieklājīgi",
            "chosen": "Protams, palīdzēšu.",
            "rejected": "Nē.",
            "source": "human_review",
        }
    ]
    preference_dataset_path = tmp_path / "preferences.json"
    preference_dataset_path.write_text(json.dumps(preference_examples), encoding="utf-8")

    class FakeTokenizer:
        """Tokenizer stand-in that emits a fixed three-token encoding per text."""

        pad_token = None
        pad_token_id = None
        eos_token = "<eos>"
        eos_token_id = 99

        @classmethod
        def from_pretrained(cls, model_name):
            del model_name
            return cls()

        def __call__(self, texts, *, truncation, max_length, padding):
            del truncation, max_length, padding
            token_ids = [[1, 2, 3] for _ in texts]
            masks = [[1, 1, 1] for _ in texts]
            return {"input_ids": token_ids, "attention_mask": masks}

        def save_pretrained(self, output_dir):
            destination = Path(output_dir)
            destination.mkdir(parents=True, exist_ok=True)
            (destination / "tokenizer_config.json").write_text("{}", encoding="utf-8")

    class FakeModel:
        """Model stand-in carrying only a ``config`` namespace."""

        def __init__(self):
            self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True)

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            del model_name, kwargs
            return cls()

    class FakeTrainingArguments:
        """Keeps whatever keyword arguments it was constructed with."""

        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeTrainer:
        """Supervised trainer stand-in with fixed train/eval metrics."""

        def __init__(self, *, model, args, train_dataset, eval_dataset=None, data_collator=None):
            del model, args, train_dataset, eval_dataset, data_collator

        def train(self):
            return types.SimpleNamespace(metrics={"train_loss": 0.11})

        def evaluate(self):
            return {"eval_loss": 0.22}

        def save_model(self, output_dir):
            destination = Path(output_dir)
            destination.mkdir(parents=True, exist_ok=True)
            (destination / "config.json").write_text("{}", encoding="utf-8")

    fake_transformers = types.SimpleNamespace(
        AutoModelForCausalLM=FakeModel,
        AutoTokenizer=FakeTokenizer,
        DataCollatorForLanguageModeling=lambda **kwargs: kwargs,
        Trainer=FakeTrainer,
        TrainingArguments=FakeTrainingArguments,
    )
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)

    class FakeORPOConfig:
        """Keeps whatever keyword arguments it was constructed with."""

        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeORPOTrainer:
        """ORPO trainer stand-in that remembers its constructor arguments."""

        last_instance = None

        def __init__(self, **kwargs):
            self.kwargs = kwargs
            FakeORPOTrainer.last_instance = self

        def train(self):
            return types.SimpleNamespace(metrics={"loss": 0.07})

        def save_model(self, output_dir):
            destination = Path(output_dir)
            destination.mkdir(parents=True, exist_ok=True)
            (destination / "config.json").write_text("{}", encoding="utf-8")

    fake_trl = types.SimpleNamespace(ORPOConfig=FakeORPOConfig, ORPOTrainer=FakeORPOTrainer)
    monkeypatch.setitem(sys.modules, "trl", fake_trl)

    metrics = train(
        output_dir=str(tmp_path / "trained-model"),
        preference_dataset_path=str(preference_dataset_path),
        preference_optimization="orpo",
    )

    assert metrics["preference_loss"] == 0.07
    orpo_trainer = FakeORPOTrainer.last_instance
    assert orpo_trainer is not None
    assert "ref_model" not in orpo_trainer.kwargs
|
|
|
|
def test_train_retries_tokenizer_with_slow_backend(tmp_path: Path, monkeypatch) -> None:
    """Verify ``train`` retries tokenizer loading with ``use_fast=False``.

    The fake tokenizer raises ``ValueError`` whenever it is loaded with
    ``use_fast`` truthy or absent, and records every attempt. The test passes
    when exactly two attempts were made: fast first, then slow.
    """

    class FakeDataset:
        """Minimal dataset wrapper over a list of record dicts."""

        def __init__(self, items):
            self.items = list(items)
            self.column_names = list(self.items[0].keys()) if self.items else []

        def train_test_split(self, *, test_size, seed):
            """Put the first record in ``train`` and the remainder in ``test``."""
            del test_size, seed
            return {
                "train": FakeDataset(self.items[:1]),
                "test": FakeDataset(self.items[1:]),
            }

        def map(self, fn, *, batched, remove_columns, desc):
            """Apply ``fn`` to all records as one column batch and rebuild rows."""
            del remove_columns, desc
            batch = {key: [item.get(key) for item in self.items] for key in self.column_names}
            transformed = fn(batch)
            # The first returned column decides how many rows are rebuilt.
            size = len(next(iter(transformed.values()))) if transformed else 0
            return FakeDataset(
                [{key: transformed[key][index] for key in transformed} for index in range(size)]
            )

        def __len__(self):
            return len(self.items)

    monkeypatch.setattr(
        "maris_core.training.train.load_hf_dataset",
        lambda _: {
            "train": FakeDataset(
                [
                    {"user": "Sveiki", "assistant": "Labdien"},
                    {"user": "Kas jauns?", "assistant": "Viss kārtībā"},
                ]
            )
        },
    )

    # One entry per ``from_pretrained`` call: True for a fast attempt, False for slow.
    tokenizer_fast_attempts: list[bool] = []

    class FakeTokenizer:
        """Tokenizer stand-in that only loads when the slow backend is requested."""

        pad_token = None
        pad_token_id = None
        eos_token = "<eos>"
        eos_token_id = 99

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            """Record the attempt and fail unless ``use_fast`` is explicitly falsy."""
            del model_name
            tokenizer_fast_attempts.append(bool(kwargs.get("use_fast", True)))
            if kwargs.get("use_fast", True):
                raise ValueError("fast tokenizer unavailable")
            return cls()

        def __call__(self, texts, *, truncation, max_length, padding):
            """Return a single fixed encoding regardless of the input texts."""
            del texts, truncation, max_length, padding
            return {"input_ids": [[1, 2, 3]], "attention_mask": [[1, 1, 1]]}

        def save_pretrained(self, output_dir):
            """Write a stub tokenizer config, creating ``output_dir`` if needed."""
            # Create the directory first, as the other tokenizer fakes in this
            # file do, so this fake does not depend on the model being saved
            # before the tokenizer.
            Path(output_dir).mkdir(parents=True, exist_ok=True)
            Path(output_dir, "tokenizer_config.json").write_text("{}", encoding="utf-8")

    class FakeModelConfig:
        """Config stand-in with the two attributes set on the fake model."""

        pad_token_id = None
        use_cache = True

    class FakeModel:
        """Model stand-in carrying a ``FakeModelConfig`` instance."""

        config = FakeModelConfig()

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            del model_name, kwargs
            return cls()

    class FakeTrainingArguments:
        """Keeps whatever keyword arguments it was constructed with."""

        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeTrainResult:
        """Canned result object returned by ``FakeTrainer.train``."""

        metrics = {"train_loss": 0.1}

    class FakeTrainer:
        """Trainer stand-in with fixed train/eval metrics."""

        def __init__(self, **kwargs):
            self.kwargs = kwargs

        def train(self):
            return FakeTrainResult()

        def evaluate(self):
            return {"eval_loss": 0.2}

        def save_model(self, output_dir):
            Path(output_dir).mkdir(parents=True, exist_ok=True)
            Path(output_dir, "config.json").write_text("{}", encoding="utf-8")

    monkeypatch.setitem(
        sys.modules,
        "transformers",
        types.SimpleNamespace(
            AutoModelForCausalLM=FakeModel,
            AutoTokenizer=FakeTokenizer,
            DataCollatorForLanguageModeling=lambda **kwargs: kwargs,
            Trainer=FakeTrainer,
            TrainingArguments=FakeTrainingArguments,
        ),
    )

    train(
        output_dir=str(tmp_path / "slow-tokenizer"), model_name="custom/model", max_seq_length=256
    )

    assert tokenizer_fast_attempts == [True, False]
|
|
|
|
def test_train_auto_switches_giant_models_to_resource_saver_mode(
    tmp_path: Path,
    monkeypatch,
) -> None:
    """Verify ``train`` falls back to QLoRA settings for a 480B model name.

    ``train`` is called with ``adapter_type="full"`` and batch-size environment
    variables of 4/2/4. The assertions expect 4-bit quantized loading with
    ``device_map="auto"``, per-device batch sizes of 1, sixteen gradient
    accumulation steps, and ``adapter_type`` recorded as ``"qlora"`` in the
    written ``training-config.json``.
    """

    class FakeDataset:
        """Minimal dataset wrapper over a list of record dicts."""

        def __init__(self, items):
            self.items = list(items)
            self.column_names = list(self.items[0].keys()) if self.items else []

        def train_test_split(self, *, test_size, seed):
            """Put the first record in ``train`` and the remainder in ``test``."""
            del test_size, seed
            return {
                "train": FakeDataset(self.items[:1]),
                "test": FakeDataset(self.items[1:]),
            }

        def map(self, fn, *, batched, remove_columns, desc):
            """Apply ``fn`` to all records as one column batch and rebuild rows."""
            del remove_columns, desc
            batch = {key: [item.get(key) for item in self.items] for key in self.column_names}
            transformed = fn(batch)
            # The first returned column decides how many rows are rebuilt.
            size = len(next(iter(transformed.values()))) if transformed else 0
            return FakeDataset(
                [{key: transformed[key][index] for key in transformed} for index in range(size)]
            )

        def __len__(self):
            return len(self.items)

    monkeypatch.setattr(
        "maris_core.training.train.load_hf_dataset",
        lambda _: {
            "train": FakeDataset(
                [
                    {"user": "Sveiki", "assistant": "Labdien"},
                    {"user": "Kas jauns?", "assistant": "Viss kārtībā"},
                ]
            )
        },
    )
    # Requested sizes that the assertions below expect to be overridden.
    monkeypatch.setenv("HF_TRAIN_BATCH_SIZE", "4")
    monkeypatch.setenv("HF_TRAIN_EVAL_BATCH_SIZE", "2")
    monkeypatch.setenv("HF_TRAIN_GRADIENT_ACCUMULATION_STEPS", "4")

    # Call logs filled in by the fakes below and inspected at the end.
    model_load_calls: list[dict[str, object]] = []
    bnb_calls: list[dict[str, object]] = []

    class FakeTokenizer:
        """Tokenizer stand-in returning a single fixed encoding."""

        pad_token = None
        pad_token_id = None
        eos_token = "<eos>"
        eos_token_id = 99

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            del model_name, kwargs
            return cls()

        def __call__(self, texts, *, truncation, max_length, padding):
            """Return a single fixed encoding regardless of the input texts."""
            del texts, truncation, max_length, padding
            return {"input_ids": [[1, 2, 3]], "attention_mask": [[1, 1, 1]]}

        def save_pretrained(self, output_dir):
            """Write a stub tokenizer config, creating ``output_dir`` if needed."""
            # Create the directory first, as the other tokenizer fakes in this
            # file do, so this fake does not depend on the model being saved
            # before the tokenizer.
            Path(output_dir).mkdir(parents=True, exist_ok=True)
            Path(output_dir, "tokenizer_config.json").write_text("{}", encoding="utf-8")

    class FakeBitsAndBytesConfig:
        """Records the quantization keyword arguments it is built with."""

        def __init__(self, **kwargs):
            bnb_calls.append(kwargs)
            self.kwargs = kwargs

    class FakeModel:
        """Model stand-in that logs how it was loaded."""

        def __init__(self):
            self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True)

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            model_load_calls.append({"model_name": model_name, "kwargs": kwargs})
            return cls()

        def print_trainable_parameters(self):
            return None

    class FakeTrainingArguments:
        """Keeps whatever keyword arguments it was constructed with."""

        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeTrainResult:
        """Canned result object returned by ``FakeTrainer.train``."""

        metrics = {"train_loss": 0.2}

    class FakeTrainer:
        """Trainer stand-in that remembers its arguments for later inspection."""

        last_instance = None

        def __init__(self, **kwargs):
            self.kwargs = kwargs
            self.args = kwargs["args"]
            FakeTrainer.last_instance = self

        def train(self):
            return FakeTrainResult()

        def evaluate(self):
            return {"eval_loss": 0.4}

        def save_model(self, output_dir):
            Path(output_dir).mkdir(parents=True, exist_ok=True)
            Path(output_dir, "adapter_config.json").write_text("{}", encoding="utf-8")
            Path(output_dir, "config.json").write_text("{}", encoding="utf-8")

    monkeypatch.setitem(
        sys.modules,
        "transformers",
        types.SimpleNamespace(
            AutoModelForCausalLM=FakeModel,
            AutoTokenizer=FakeTokenizer,
            BitsAndBytesConfig=FakeBitsAndBytesConfig,
            DataCollatorForLanguageModeling=lambda **kwargs: kwargs,
            Trainer=FakeTrainer,
            TrainingArguments=FakeTrainingArguments,
        ),
    )

    class FakeLoraConfig:
        """Keeps whatever keyword arguments it was constructed with."""

        def __init__(self, **kwargs):
            self.kwargs = kwargs

    def fake_prepare_model_for_kbit_training(model, use_gradient_checkpointing):
        del use_gradient_checkpointing
        return model

    def fake_get_peft_model(model, peft_config):
        del peft_config
        return model

    monkeypatch.setitem(
        sys.modules,
        "peft",
        types.SimpleNamespace(
            LoraConfig=FakeLoraConfig,
            TaskType=types.SimpleNamespace(CAUSAL_LM="CAUSAL_LM"),
            get_peft_model=fake_get_peft_model,
            prepare_model_for_kbit_training=fake_prepare_model_for_kbit_training,
        ),
    )

    output_dir = tmp_path / "giant-model"
    train(
        output_dir=str(output_dir),
        model_name="Qwen/Qwen3-Coder-480B-A35B-Instruct",
        adapter_type="full",
        max_seq_length=256,
    )

    assert bnb_calls[0]["load_in_4bit"] is True
    assert model_load_calls[0]["model_name"] == "Qwen/Qwen3-Coder-480B-A35B-Instruct"
    assert model_load_calls[0]["kwargs"]["device_map"] == "auto"
    assert model_load_calls[0]["kwargs"]["low_cpu_mem_usage"] is True
    assert "quantization_config" in model_load_calls[0]["kwargs"]
    assert FakeTrainer.last_instance is not None
    assert FakeTrainer.last_instance.args.kwargs["per_device_train_batch_size"] == 1
    assert FakeTrainer.last_instance.args.kwargs["per_device_eval_batch_size"] == 1
    assert FakeTrainer.last_instance.args.kwargs["gradient_accumulation_steps"] == 16
    training_config = json.loads((output_dir / "training-config.json").read_text(encoding="utf-8"))
    assert training_config["adapter_type"] == "qlora"
|
|
|
|
def test_train_disables_pin_memory_and_tqdm_in_non_interactive_environment(
    tmp_path: Path, monkeypatch
) -> None:
    """Check the TrainingArguments built when stderr is not a TTY and no accelerator exists.

    ``torch`` and ``transformers`` are swapped for in-memory stand-ins so the
    keyword arguments passed to ``TrainingArguments`` can be inspected once
    ``train`` has returned.
    """
    import maris_core.training.train as train_module

    def _never():
        return False

    # List-backed stand-in for the dataset object returned by the loader.
    class FakeDataset:
        def __init__(self, items):
            self.items = list(items)
            # Column names are taken from the first row only.
            self.column_names = list(self.items[0].keys()) if self.items else []

        def __len__(self):
            return len(self.items)

        def train_test_split(self, *, test_size, seed):
            # Fixed split: the first row trains, everything else evaluates.
            del test_size, seed
            head, tail = self.items[:1], self.items[1:]
            return {"train": FakeDataset(head), "test": FakeDataset(tail)}

        def map(self, fn, *, batched, remove_columns, desc):
            del remove_columns, desc
            columns = {
                name: [row.get(name) for row in self.items] for name in self.column_names
            }
            result = fn(columns if batched else self.items[0])
            if not result:
                return FakeDataset([])
            # The first returned column decides how many rows are rebuilt.
            row_count = len(next(iter(result.values())))
            rows = [
                {name: result[name][position] for name in result}
                for position in range(row_count)
            ]
            return FakeDataset(rows)

    # Tokenizer stand-in that emits the same three tokens for every text.
    class FakeTokenizer:
        pad_token = "<pad>"
        eos_token = "</s>"
        pad_token_id = 0
        eos_token_id = 1

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            del model_name, kwargs
            return cls()

        def __call__(self, texts, truncation, padding, max_length):
            del truncation, padding, max_length
            samples = [texts] if isinstance(texts, str) else texts
            input_ids = [[1, 2, 3] for _ in samples]
            attention_mask = [[1, 1, 1] for _ in samples]
            return {"input_ids": input_ids, "attention_mask": attention_mask}

        def save_pretrained(self, output_dir):
            target = Path(output_dir)
            target.mkdir(parents=True, exist_ok=True)
            (target / "tokenizer_config.json").write_text("{}", encoding="utf-8")

    class FakeModel:
        def __init__(self):
            self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True)

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            del model_name, kwargs
            return cls()

    # Captures whatever keyword arguments train() passes in.
    class FakeTrainingArguments:
        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeTrainResult:
        metrics = {"train_loss": 0.2}

    # Remembers the most recently constructed trainer for the assertions below.
    class FakeTrainer:
        last_instance = None

        def __init__(self, **kwargs):
            self.args = kwargs["args"]
            FakeTrainer.last_instance = self

        def train(self):
            return FakeTrainResult()

        def evaluate(self):
            return {"eval_loss": 0.4}

        def save_model(self, output_dir):
            target = Path(output_dir)
            target.mkdir(parents=True, exist_ok=True)
            (target / "config.json").write_text("{}", encoding="utf-8")

    def _fake_load_hf_dataset(_):
        return {
            "train": FakeDataset(
                [
                    {"user": "Sveiki", "assistant": "Čau!"},
                    {"prompt": "Uzraksti plānu", "completion": "Gatavs."},
                ]
            )
        }

    def _fake_data_collator(**kwargs):
        return kwargs

    fake_torch = types.SimpleNamespace(
        cuda=types.SimpleNamespace(is_available=_never),
        backends=types.SimpleNamespace(mps=types.SimpleNamespace(is_available=_never)),
    )
    fake_transformers = types.SimpleNamespace(
        AutoModelForCausalLM=FakeModel,
        AutoTokenizer=FakeTokenizer,
        DataCollatorForLanguageModeling=_fake_data_collator,
        Trainer=FakeTrainer,
        TrainingArguments=FakeTrainingArguments,
    )

    monkeypatch.setattr("maris_core.training.train.load_hf_dataset", _fake_load_hf_dataset)
    # stderr claims not to be a terminal, i.e. a non-interactive run.
    monkeypatch.setattr(train_module.sys, "stderr", types.SimpleNamespace(isatty=_never))
    monkeypatch.setitem(sys.modules, "torch", fake_torch)
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)

    train(output_dir=str(tmp_path / "cpu-runtime"), model_name="custom/model", max_seq_length=256)

    assert FakeTrainer.last_instance is not None
    captured = FakeTrainer.last_instance.args.kwargs
    assert captured["dataloader_pin_memory"] is False
    assert captured["disable_tqdm"] is True
    assert captured["logging_first_step"] is True
|
|
|
|
def test_train_enables_bf16_by_default_when_cuda_supports_it(tmp_path: Path, monkeypatch) -> None:
    """Check that bf16 is on and fp16 is off when the torch stub reports CUDA with bf16 support.

    ``torch`` and ``transformers`` are swapped for in-memory stand-ins so the
    keyword arguments passed to ``TrainingArguments`` can be inspected once
    ``train`` has returned.
    """
    import maris_core.training.train as train_module

    def _always():
        return True

    def _never():
        return False

    # List-backed stand-in for the dataset object returned by the loader.
    class FakeDataset:
        def __init__(self, items):
            self.items = list(items)
            # Column names are taken from the first row only.
            self.column_names = list(self.items[0].keys()) if self.items else []

        def __len__(self):
            return len(self.items)

        def train_test_split(self, *, test_size, seed):
            # Fixed split: the first row trains, everything else evaluates.
            del test_size, seed
            head, tail = self.items[:1], self.items[1:]
            return {"train": FakeDataset(head), "test": FakeDataset(tail)}

        def map(self, fn, *, batched, remove_columns, desc):
            del remove_columns, desc
            columns = {
                name: [row.get(name) for row in self.items] for name in self.column_names
            }
            result = fn(columns if batched else self.items[0])
            if not result:
                return FakeDataset([])
            # The first returned column decides how many rows are rebuilt.
            row_count = len(next(iter(result.values())))
            rows = [
                {name: result[name][position] for name in result}
                for position in range(row_count)
            ]
            return FakeDataset(rows)

    # Tokenizer stand-in that emits the same three tokens for every text.
    class FakeTokenizer:
        pad_token = "<pad>"
        eos_token = "</s>"
        pad_token_id = 0
        eos_token_id = 1

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            del model_name, kwargs
            return cls()

        def __call__(self, texts, truncation, padding, max_length):
            del truncation, padding, max_length
            samples = [texts] if isinstance(texts, str) else texts
            input_ids = [[1, 2, 3] for _ in samples]
            attention_mask = [[1, 1, 1] for _ in samples]
            return {"input_ids": input_ids, "attention_mask": attention_mask}

        def save_pretrained(self, output_dir):
            target = Path(output_dir)
            target.mkdir(parents=True, exist_ok=True)
            (target / "tokenizer_config.json").write_text("{}", encoding="utf-8")

    class FakeModel:
        def __init__(self):
            self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True)

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            del model_name, kwargs
            return cls()

    # Captures whatever keyword arguments train() passes in.
    class FakeTrainingArguments:
        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeTrainResult:
        metrics = {"train_loss": 0.2}

    # Remembers the most recently constructed trainer for the assertions below.
    class FakeTrainer:
        last_instance = None

        def __init__(self, **kwargs):
            self.args = kwargs["args"]
            FakeTrainer.last_instance = self

        def train(self):
            return FakeTrainResult()

        def evaluate(self):
            return {"eval_loss": 0.4}

        def save_model(self, output_dir):
            target = Path(output_dir)
            target.mkdir(parents=True, exist_ok=True)
            (target / "config.json").write_text("{}", encoding="utf-8")

    def _fake_load_hf_dataset(_):
        return {
            "train": FakeDataset(
                [
                    {"user": "Sveiki", "assistant": "Čau!"},
                    {"prompt": "Uzraksti plānu", "completion": "Gatavs."},
                ]
            )
        }

    def _fake_data_collator(**kwargs):
        return kwargs

    # CUDA is reported as present and bf16-capable; MPS is reported as absent.
    fake_torch = types.SimpleNamespace(
        cuda=types.SimpleNamespace(is_available=_always, is_bf16_supported=_always),
        backends=types.SimpleNamespace(mps=types.SimpleNamespace(is_available=_never)),
    )
    fake_transformers = types.SimpleNamespace(
        AutoModelForCausalLM=FakeModel,
        AutoTokenizer=FakeTokenizer,
        DataCollatorForLanguageModeling=_fake_data_collator,
        Trainer=FakeTrainer,
        TrainingArguments=FakeTrainingArguments,
    )

    monkeypatch.setattr("maris_core.training.train.load_hf_dataset", _fake_load_hf_dataset)
    monkeypatch.setattr(train_module.sys, "stderr", types.SimpleNamespace(isatty=_always))
    monkeypatch.setitem(sys.modules, "torch", fake_torch)
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)

    train(output_dir=str(tmp_path / "cuda-runtime"), model_name="custom/model", max_seq_length=256)

    assert FakeTrainer.last_instance is not None
    captured = FakeTrainer.last_instance.args.kwargs
    assert captured["bf16"] is True
    assert captured["fp16"] is False
|
|
|
|
def test_train_uses_fsdp_training_arguments_when_requested(tmp_path: Path, monkeypatch) -> None:
    """Check the FSDP-related TrainingArguments produced for ``distributed_strategy="fsdp"``.

    A small JSON file supplies two FSDP options; the assertions verify those
    values, the wrapped layer class, ``min_num_params`` and
    ``ddp_find_unused_parameters`` in the captured keyword arguments.
    """

    # List-backed stand-in for the dataset object returned by the loader.
    class FakeDataset:
        def __init__(self, items):
            self.items = list(items)
            # Column names are taken from the first row only.
            self.column_names = list(self.items[0].keys()) if self.items else []

        def __len__(self):
            return len(self.items)

        def train_test_split(self, *, test_size, seed):
            # Fixed split: the first row trains, everything else evaluates.
            del test_size, seed
            head, tail = self.items[:1], self.items[1:]
            return {"train": FakeDataset(head), "test": FakeDataset(tail)}

        def map(self, fn, *, batched, remove_columns, desc):
            del remove_columns, desc
            columns = {
                name: [row.get(name) for row in self.items] for name in self.column_names
            }
            result = fn(columns if batched else self.items[0])
            if not result:
                return FakeDataset([])
            # The first returned column decides how many rows are rebuilt.
            row_count = len(next(iter(result.values())))
            rows = [
                {name: result[name][position] for name in result}
                for position in range(row_count)
            ]
            return FakeDataset(rows)

    # Tokenizer stand-in that emits the same three tokens for every text.
    class FakeTokenizer:
        pad_token = "<pad>"
        eos_token = "</s>"
        pad_token_id = 0
        eos_token_id = 1

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            del model_name, kwargs
            return cls()

        def __call__(self, texts, truncation, padding, max_length):
            del truncation, padding, max_length
            samples = [texts] if isinstance(texts, str) else texts
            input_ids = [[1, 2, 3] for _ in samples]
            attention_mask = [[1, 1, 1] for _ in samples]
            return {"input_ids": input_ids, "attention_mask": attention_mask}

        def save_pretrained(self, output_dir):
            target = Path(output_dir)
            target.mkdir(parents=True, exist_ok=True)
            (target / "tokenizer_config.json").write_text("{}", encoding="utf-8")

    class FakeModel:
        def __init__(self):
            self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True)

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            del model_name, kwargs
            return cls()

    # Captures whatever keyword arguments train() passes in.
    class FakeTrainingArguments:
        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeTrainResult:
        metrics = {"train_loss": 0.2}

    # Remembers the most recently constructed trainer for the assertions below.
    class FakeTrainer:
        last_instance = None

        def __init__(self, **kwargs):
            self.args = kwargs["args"]
            FakeTrainer.last_instance = self

        def train(self):
            return FakeTrainResult()

        def evaluate(self):
            return {"eval_loss": 0.4}

        def save_model(self, output_dir):
            target = Path(output_dir)
            target.mkdir(parents=True, exist_ok=True)
            (target / "config.json").write_text("{}", encoding="utf-8")

    def _fake_load_hf_dataset(_):
        return {
            "train": FakeDataset(
                [
                    {"user": "Sveiki", "assistant": "Čau!"},
                    {"prompt": "Uzraksti plānu", "completion": "Gatavs."},
                ]
            )
        }

    def _fake_data_collator(**kwargs):
        return kwargs

    fake_transformers = types.SimpleNamespace(
        AutoModelForCausalLM=FakeModel,
        AutoTokenizer=FakeTokenizer,
        DataCollatorForLanguageModeling=_fake_data_collator,
        Trainer=FakeTrainer,
        TrainingArguments=FakeTrainingArguments,
    )
    monkeypatch.setattr("maris_core.training.train.load_hf_dataset", _fake_load_hf_dataset)
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)

    fsdp_options = {"activation_checkpointing": False, "limit_all_gathers": False}
    fsdp_config_path = tmp_path / "fsdp-config.json"
    fsdp_config_path.write_text(json.dumps(fsdp_options), encoding="utf-8")

    train(
        output_dir=str(tmp_path / "fsdp-runtime"),
        model_name="custom/model",
        max_seq_length=256,
        distributed_strategy="fsdp",
        distributed_config_path=str(fsdp_config_path),
        fsdp_transformer_layer_cls_to_wrap=["Qwen2DecoderLayer"],
    )

    assert FakeTrainer.last_instance is not None
    captured = FakeTrainer.last_instance.args.kwargs
    assert captured["fsdp"] == "full_shard auto_wrap"
    fsdp_config = captured["fsdp_config"]
    assert fsdp_config["activation_checkpointing"] is False
    assert fsdp_config["limit_all_gathers"] is False
    assert fsdp_config["min_num_params"] == 100_000_000
    assert fsdp_config["transformer_layer_cls_to_wrap"] == ["Qwen2DecoderLayer"]
    assert captured["ddp_find_unused_parameters"] is False
|
|
|
|
def test_train_uses_deepspeed_training_arguments_when_requested(
    tmp_path: Path, monkeypatch
) -> None:
    """Check the TrainingArguments produced for ``distributed_strategy="deepspeed"``.

    The installed-version lookup is stubbed to report ``0.18.9``, and the
    assertions verify that the config path is forwarded as ``deepspeed`` and
    that ``ddp_find_unused_parameters`` is disabled.
    """

    def _fake_installed_version(package_name):
        del package_name
        return "0.18.9"

    # List-backed stand-in for the dataset object returned by the loader.
    class FakeDataset:
        def __init__(self, items):
            self.items = list(items)
            # Column names are taken from the first row only.
            self.column_names = list(self.items[0].keys()) if self.items else []

        def __len__(self):
            return len(self.items)

        def train_test_split(self, *, test_size, seed):
            # Fixed split: the first row trains, everything else evaluates.
            del test_size, seed
            head, tail = self.items[:1], self.items[1:]
            return {"train": FakeDataset(head), "test": FakeDataset(tail)}

        def map(self, fn, *, batched, remove_columns, desc):
            del remove_columns, desc
            columns = {
                name: [row.get(name) for row in self.items] for name in self.column_names
            }
            result = fn(columns if batched else self.items[0])
            if not result:
                return FakeDataset([])
            # The first returned column decides how many rows are rebuilt.
            row_count = len(next(iter(result.values())))
            rows = [
                {name: result[name][position] for name in result}
                for position in range(row_count)
            ]
            return FakeDataset(rows)

    # Tokenizer stand-in that emits the same three tokens for every text.
    class FakeTokenizer:
        pad_token = "<pad>"
        eos_token = "</s>"
        pad_token_id = 0
        eos_token_id = 1

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            del model_name, kwargs
            return cls()

        def __call__(self, texts, truncation, padding, max_length):
            del truncation, padding, max_length
            samples = [texts] if isinstance(texts, str) else texts
            input_ids = [[1, 2, 3] for _ in samples]
            attention_mask = [[1, 1, 1] for _ in samples]
            return {"input_ids": input_ids, "attention_mask": attention_mask}

        def save_pretrained(self, output_dir):
            target = Path(output_dir)
            target.mkdir(parents=True, exist_ok=True)
            (target / "tokenizer_config.json").write_text("{}", encoding="utf-8")

    class FakeModel:
        def __init__(self):
            self.config = types.SimpleNamespace(pad_token_id=None, use_cache=True)

        @classmethod
        def from_pretrained(cls, model_name, **kwargs):
            del model_name, kwargs
            return cls()

    # Captures whatever keyword arguments train() passes in.
    class FakeTrainingArguments:
        def __init__(self, **kwargs):
            self.kwargs = kwargs

    class FakeTrainResult:
        metrics = {"train_loss": 0.2}

    # Remembers the most recently constructed trainer for the assertions below.
    class FakeTrainer:
        last_instance = None

        def __init__(self, **kwargs):
            self.args = kwargs["args"]
            FakeTrainer.last_instance = self

        def train(self):
            return FakeTrainResult()

        def evaluate(self):
            return {"eval_loss": 0.4}

        def save_model(self, output_dir):
            target = Path(output_dir)
            target.mkdir(parents=True, exist_ok=True)
            (target / "config.json").write_text("{}", encoding="utf-8")

    def _fake_load_hf_dataset(_):
        return {
            "train": FakeDataset(
                [
                    {"user": "Sveiki", "assistant": "Čau!"},
                    {"prompt": "Uzraksti plānu", "completion": "Gatavs."},
                ]
            )
        }

    def _fake_data_collator(**kwargs):
        return kwargs

    fake_transformers = types.SimpleNamespace(
        AutoModelForCausalLM=FakeModel,
        AutoTokenizer=FakeTokenizer,
        DataCollatorForLanguageModeling=_fake_data_collator,
        Trainer=FakeTrainer,
        TrainingArguments=FakeTrainingArguments,
    )
    monkeypatch.setattr(
        "maris_core.training.train.get_installed_package_version",
        _fake_installed_version,
    )
    monkeypatch.setattr("maris_core.training.train.load_hf_dataset", _fake_load_hf_dataset)
    monkeypatch.setitem(sys.modules, "transformers", fake_transformers)

    deepspeed_options = {"zero_optimization": {"stage": 3}}
    deepspeed_config_path = tmp_path / "deepspeed.json"
    deepspeed_config_path.write_text(json.dumps(deepspeed_options), encoding="utf-8")

    train(
        output_dir=str(tmp_path / "deepspeed-runtime"),
        model_name="custom/model",
        max_seq_length=256,
        distributed_strategy="deepspeed",
        distributed_config_path=str(deepspeed_config_path),
    )

    assert FakeTrainer.last_instance is not None
    captured = FakeTrainer.last_instance.args.kwargs
    assert captured["deepspeed"] == str(deepspeed_config_path)
    assert captured["ddp_find_unused_parameters"] is False
|
|
|
|
def test_deepspeed_training_arguments_raise_clear_error_when_dependency_missing(
    tmp_path: Path, monkeypatch
) -> None:
    """Expect an ImportError with the install hint when the version lookup reports no package.

    The lookup is replaced by a stub that raises ``PackageNotFoundError``.
    """
    zero_stage_three = {"zero_optimization": {"stage": 3}}
    deepspeed_config_path = tmp_path / "deepspeed.json"
    deepspeed_config_path.write_text(json.dumps(zero_stage_three), encoding="utf-8")

    overrides = {
        "distributed_strategy": "deepspeed",
        "distributed_config_path": str(deepspeed_config_path),
    }
    config = load_training_config(overrides=overrides)

    def _lookup_reports_missing(package_name: str) -> None:
        raise PackageNotFoundError(package_name)

    monkeypatch.setattr(
        "maris_core.training.train.get_installed_package_version",
        _lookup_reports_missing,
    )

    expected_message = "DeepSpeed režīms nepieciešams instalēt 'deepspeed'"
    with pytest.raises(ImportError, match=expected_message):
        _build_distributed_training_argument_overrides(config)
|
|
|
|
def test_deepspeed_training_arguments_raise_clear_error_when_metadata_lookup_stops(
    tmp_path: Path, monkeypatch
) -> None:
    """Expect the same ImportError when the version lookup raises ``StopIteration``.

    The lookup is replaced by a stub that raises ``StopIteration`` instead of
    returning a version string.
    """
    zero_stage_three = {"zero_optimization": {"stage": 3}}
    deepspeed_config_path = tmp_path / "deepspeed.json"
    deepspeed_config_path.write_text(json.dumps(zero_stage_three), encoding="utf-8")

    overrides = {
        "distributed_strategy": "deepspeed",
        "distributed_config_path": str(deepspeed_config_path),
    }
    config = load_training_config(overrides=overrides)

    def _lookup_stops(package_name: str) -> None:
        raise StopIteration(package_name)

    monkeypatch.setattr(
        "maris_core.training.train.get_installed_package_version",
        _lookup_stops,
    )

    expected_message = "DeepSpeed režīms nepieciešams instalēt 'deepspeed'"
    with pytest.raises(ImportError, match=expected_message):
        _build_distributed_training_argument_overrides(config)
|
|
|
|
def test_train_model_cli_exits_cleanly_when_runtime_dependency_missing(monkeypatch, capsys) -> None:
    """Expect exit code 2 and a traceback-free stderr when ``train_with_config`` raises ImportError.

    The CLI script is loaded from ``scripts/train_model.py`` and run with no
    extra arguments against a stubbed ``maris_core.training.train`` module.
    """
    error_text = "DeepSpeed režīms nepieciešams instalēt 'deepspeed' Python pakotni."
    script_path = Path(__file__).resolve().parents[1] / "scripts" / "train_model.py"

    module_spec = importlib.util.spec_from_file_location("train_model", script_path)
    assert module_spec is not None and module_spec.loader is not None
    train_model_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(train_model_module)

    def _raise_missing_dependency(_config: object) -> dict[str, object]:
        raise ImportError(error_text)

    def _fake_load_training_config(*args, **kwargs):
        return object()

    # Only the single-run entry point fails; the branch-suite stub succeeds.
    stub_training_module = types.SimpleNamespace(
        train_branch_suite=lambda _config: {},
        train_with_config=_raise_missing_dependency,
    )
    monkeypatch.setattr(train_model_module, "load_training_config", _fake_load_training_config)
    monkeypatch.setitem(sys.modules, "maris_core.training.train", stub_training_module)
    monkeypatch.setattr(sys, "argv", [str(script_path)])

    with pytest.raises(SystemExit) as exc_info:
        train_model_module.main()

    assert exc_info.value.code == 2
    stderr_text = capsys.readouterr().err
    assert error_text in stderr_text
    assert "Traceback" not in stderr_text
|
|
|
|
def test_train_model_cli_exits_cleanly_for_branch_suite_dependency_missing(
    monkeypatch, capsys
) -> None:
    """Expect exit code 2 and a traceback-free stderr when ``train_branch_suite`` raises ImportError.

    The CLI script is loaded from ``scripts/train_model.py`` and run with
    ``--all-branches`` against a stubbed ``maris_core.training.train`` module.
    """
    error_text = "DeepSpeed režīms nepieciešams instalēt 'deepspeed' Python pakotni."
    script_path = Path(__file__).resolve().parents[1] / "scripts" / "train_model.py"

    module_spec = importlib.util.spec_from_file_location("train_model", script_path)
    assert module_spec is not None and module_spec.loader is not None
    train_model_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(train_model_module)

    def _raise_missing_dependency(_config: object) -> dict[str, object]:
        raise ImportError(error_text)

    def _fake_load_training_config(*args, **kwargs):
        return object()

    def _passthrough_replace(config, **kwargs):
        return config

    # Only the branch-suite entry point fails; the single-run stub succeeds.
    stub_training_module = types.SimpleNamespace(
        train_branch_suite=_raise_missing_dependency,
        train_with_config=lambda _config: {},
    )
    monkeypatch.setattr(train_model_module, "load_training_config", _fake_load_training_config)
    # The config stub is a bare object(), so ``replace`` is stubbed to hand it back unchanged.
    monkeypatch.setattr(train_model_module, "replace", _passthrough_replace)
    monkeypatch.setitem(sys.modules, "maris_core.training.train", stub_training_module)
    monkeypatch.setattr(sys, "argv", [str(script_path), "--all-branches"])

    with pytest.raises(SystemExit) as exc_info:
        train_model_module.main()

    assert exc_info.value.code == 2
    stderr_text = capsys.readouterr().err
    assert error_text in stderr_text
    assert "Traceback" not in stderr_text
|
|
|
|
def test_ensure_runtime_home_dir_sets_temp_home_when_missing(tmp_path: Path, monkeypatch) -> None:
    """Expect a temp-dir home and ``maris-<uid>`` identity when no home/user variables are set.

    With ``HOME``, ``USER``, ``LOGNAME`` and ``USERNAME`` removed, the temp
    directory pointed at ``tmp_path`` and ``getuid`` stubbed to ``1000``, the
    helper must return ``<tmp_path>/maris-home-1000``, create that directory
    and export the four variables.
    """
    for name in ("HOME", "USER", "LOGNAME", "USERNAME"):
        # The helper writes these variables directly into os.environ. Setting
        # before deleting makes monkeypatch record the pre-test state even when
        # the variable is absent, so its teardown removes the helper's values
        # instead of leaking them into later tests.
        monkeypatch.setenv(name, "")
        monkeypatch.delenv(name)
    monkeypatch.setattr("maris_core.training.train.tempfile.gettempdir", lambda: str(tmp_path))
    monkeypatch.setattr("maris_core.training.train.os.getuid", lambda: 1000)

    resolved = _ensure_runtime_home_dir()

    expected = tmp_path / "maris-home-1000"
    assert resolved == str(expected)
    assert os.environ["HOME"] == str(expected)
    assert os.environ["USER"] == "maris-1000"
    assert os.environ["LOGNAME"] == "maris-1000"
    assert os.environ["USERNAME"] == "maris-1000"
    assert expected.is_dir()
|
|
|
|
def test_ensure_runtime_home_dir_keeps_existing_home_and_user(monkeypatch) -> None:
    """Expect the helper to return the existing HOME and leave all four variables unchanged."""
    preset = {
        "HOME": "/existing/home",
        "USER": "existing-user",
        "LOGNAME": "existing-user",
        "USERNAME": "existing-user",
    }
    for name, value in preset.items():
        monkeypatch.setenv(name, value)

    resolved = _ensure_runtime_home_dir()

    assert resolved == "/existing/home"
    for name, value in preset.items():
        assert os.environ[name] == value
|
|
|
|
def test_ensure_runtime_home_dir_uses_unknown_suffix_when_getuid_fails(
    tmp_path: Path, monkeypatch
) -> None:
    """Expect the ``unknown`` suffix when ``getuid`` raises and HOME is only whitespace.

    With ``HOME`` set to a single space, the user variables removed and
    ``getuid`` stubbed to raise ``OSError``, the helper must return
    ``<tmp_path>/maris-home-unknown``, create that directory and export the
    four variables with the ``maris-unknown`` identity.
    """
    monkeypatch.setenv("HOME", " ")
    for name in ("USER", "LOGNAME", "USERNAME"):
        # The helper writes these variables directly into os.environ. Setting
        # before deleting makes monkeypatch record the pre-test state even when
        # the variable is absent, so its teardown removes the helper's values
        # instead of leaking them into later tests.
        monkeypatch.setenv(name, "")
        monkeypatch.delenv(name)
    monkeypatch.setattr("maris_core.training.train.tempfile.gettempdir", lambda: str(tmp_path))

    def _raise_os_error() -> int:
        raise OSError("uid not available")

    monkeypatch.setattr("maris_core.training.train.os.getuid", _raise_os_error)

    resolved = _ensure_runtime_home_dir()

    expected = tmp_path / "maris-home-unknown"
    assert resolved == str(expected)
    assert os.environ["HOME"] == str(expected)
    assert os.environ["USER"] == "maris-unknown"
    assert os.environ["LOGNAME"] == "maris-unknown"
    assert os.environ["USERNAME"] == "maris-unknown"
    assert expected.is_dir()
|
|