Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /world_pure_source_registry.py
| """High-provenance source registry for TinyMind data acquisition. | |
| This module does not download large corpora by default. It records where each | |
| source may be used, which gates must pass first, and which sources are reserved | |
| for evaluation only so training cannot silently contaminate benchmarks. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| from dataclasses import asdict, dataclass | |
| from pathlib import Path | |
| from typing import Iterable | |
# Tiers under which a source may receive training weight. Any other tier
# (for example "eval_only") fails the WorldPureSource.train_allowed check.
TRAIN_TIERS = {
    "quarantine_then_final_answer_distill",
    "strict_train_after_license_review",
    "streaming_train",
}
@dataclass(frozen=True)
class WorldPureSource:
    """One candidate data source together with the gates that govern its use.

    The class is a dataclass because ``_sources`` constructs entries with
    keyword arguments and the report builder serializes them with
    ``dataclasses.asdict``. Entries are frozen: the registry is a table of
    constants and nothing in this module mutates a row after construction.
    """

    id: str  # short registry key, e.g. "fineweb_edu"
    name: str  # human-readable display name
    repo_or_url: str  # where the source is published
    domain: str  # grouping key used for phase selection and domain share caps
    modality: str  # e.g. "text", "code", "audio_text", "image_text"
    scale_hint: str  # free-text size description; not parsed by this module
    license_gate: str  # name of the license check that must pass first
    allowed_tier: str  # usage tier; compared against TRAIN_TIERS and "eval_only"
    purity_score: float  # multiplies the initial mix weight and loss multiplier
    contamination_risk: float  # reduces the initial mix weight (clamped at 0.72 there)
    ingest_policy: str  # free-text handling instructions copied into reports
    notes: str  # free-text remarks

    @property
    def train_allowed(self) -> bool:
        """Return True when ``allowed_tier`` is one of ``TRAIN_TIERS``.

        Exposed as a property because every caller in this module reads
        ``row.train_allowed`` as a plain attribute; as a bare method the bound
        method object would always be truthy and eval-only sources would pass
        the training filter.
        """
        return self.allowed_tier in TRAIN_TIERS
def _sources() -> list[WorldPureSource]:
    """Return every registry entry, in the fixed order listed below.

    Each tuple in ``table`` holds one source's values in the order given by
    ``columns``; entries are still constructed by keyword so the mapping to
    ``WorldPureSource`` fields stays explicit.
    """
    columns = (
        "id", "name", "repo_or_url", "domain", "modality", "scale_hint",
        "license_gate", "allowed_tier", "purity_score", "contamination_risk",
        "ingest_policy", "notes",
    )
    table = (
        (
            "fineweb_edu", "FineWeb-Edu",
            "https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu",
            "education_web", "text", "trillion-token educational web subset",
            "dataset_card_terms_required", "streaming_train", 0.94, 0.34,
            "stream shards; score-filter; remove benchmark n-grams; cap per domain",
            "Use as broad knowledge background, never as eval replacement.",
        ),
        (
            "cosmopedia_v2", "Cosmopedia v2",
            "https://huggingface.co/datasets/HuggingFaceTB/cosmopedia-v2",
            "synthetic_textbook", "text", "large synthetic textbook corpus",
            "dataset_card_terms_required", "streaming_train", 0.93, 0.22,
            "prefer textbook-style samples; strip hidden traces; dedupe against local SFT",
            "Good for fluency and structured explanations.",
        ),
        (
            "allenai_dolma", "Dolma",
            "https://huggingface.co/datasets/allenai/dolma",
            "mixed_open_corpus", "text_code", "multi-trillion-token transparent corpus",
            "source_mixture_review_required", "strict_train_after_license_review", 0.88, 0.42,
            "use documented subsets only; require URL/provenance manifest; heavy dedupe",
            "Transparent enough to audit, but mixed-source terms need strict review.",
        ),
        (
            "fineweb_2_multilingual", "FineWeb 2",
            "https://huggingface.co/datasets/HuggingFaceFW/fineweb-2",
            "multilingual_web", "text", "large multilingual cleaned web corpus",
            "dataset_card_terms_required", "streaming_train", 0.90, 0.40,
            "language-id route; keep Thai/English high-score slices; block benchmark overlap",
            "Use to widen multilingual general knowledge without dominating Thai SFT.",
        ),
        (
            "wikimedia_wikipedia", "Wikimedia Wikipedia",
            "https://huggingface.co/datasets/wikimedia/wikipedia",
            "encyclopedic_reference", "text", "multilingual encyclopedia snapshots",
            "cc_by_sa_attribution_required", "strict_train_after_license_review", 0.89, 0.26,
            "prefer current snapshots; keep attribution/provenance; remove low-quality markup",
            "Strong factual backbone if attribution and versioning are preserved.",
        ),
        (
            "allenai_pes2o", "peS2o",
            "https://huggingface.co/datasets/allenai/peS2o",
            "scientific_papers", "text", "scientific-paper text corpus",
            "source_mixture_review_required", "strict_train_after_license_review", 0.91, 0.35,
            "license-filter papers; extract abstracts/methods; dedupe against arxiv-like corpora",
            "Useful for scientific style and terminology.",
        ),
        (
            "openwebmath", "OpenWebMath",
            "https://huggingface.co/datasets/open-web-math/open-web-math",
            "web_mathematics", "text_math", "math-heavy web corpus",
            "dataset_card_terms_required", "strict_train_after_license_review", 0.88, 0.38,
            "formula-preserving parser; remove copied benchmark statements; cap web forum noise",
            "Complements curated math with broad notation exposure.",
        ),
        (
            "bigcode_stack_v2", "The Stack v2",
            "https://huggingface.co/datasets/bigcode/the-stack-v2",
            "source_code", "code", "tens of TB of source code metadata/content",
            "permissive_source_license_required", "strict_train_after_license_review", 0.90, 0.39,
            "only permissive-license files; remove secrets; remove benchmark solutions",
            "Main large-scale code source, but never bypass license or secret scans.",
        ),
        (
            "starcoderdata", "StarCoderData",
            "https://huggingface.co/datasets/bigcode/starcoderdata",
            "source_code", "code", "curated code training corpus lineage",
            "permissive_source_license_required", "strict_train_after_license_review", 0.88, 0.37,
            "prefer permissive files; run secret scan; remove generated/vendor duplicates",
            "Secondary code source for diversity after The Stack v2.",
        ),
        (
            "nvidia_opencodeinstruct", "OpenCodeInstruct",
            "https://huggingface.co/datasets/nvidia/OpenCodeInstruct",
            "code_instruction", "text_code", "code instruction corpus",
            "dataset_card_terms_required", "strict_train_after_license_review", 0.91, 0.31,
            "keep executable examples; unit-test snippets when possible; cap template repeats",
            "Useful for instruction-following on software tasks.",
        ),
        (
            "humaneval", "HumanEval",
            "https://huggingface.co/datasets/openai/openai_humaneval",
            "coding_eval", "text_code", "Python coding benchmark",
            "dataset_card_terms_required", "eval_only", 0.94, 0.95,
            "eval only; block prompt, canonical solution, and tests from train data",
            "Classic code benchmark; very high contamination risk.",
        ),
        (
            "mbpp", "Mostly Basic Python Problems",
            "https://huggingface.co/datasets/google-research-datasets/mbpp",
            "coding_eval", "text_code", "Python programming benchmark",
            "dataset_card_terms_required", "eval_only", 0.93, 0.93,
            "eval only; use separate synthetic coding SFT instead",
            "Do not train if measuring coding generalization.",
        ),
        (
            "numina_math_1_5", "NuminaMath 1.5",
            "https://huggingface.co/datasets/AI-MO/NuminaMath-1.5",
            "mathematics", "text_math", "large verified math reasoning set",
            "dataset_card_terms_required", "quarantine_then_final_answer_distill", 0.92, 0.28,
            "verify answers; strip raw hidden reasoning when required; keep final derivations",
            "High-value math source, handled carefully to avoid trace imitation.",
        ),
        (
            "openr1_math_220k", "OpenR1 Math 220k",
            "https://huggingface.co/datasets/open-r1/OpenR1-Math-220k",
            "mathematics", "text_math", "220k reasoning/math examples",
            "dataset_card_terms_required", "quarantine_then_final_answer_distill", 0.89, 0.33,
            "quarantine raw traces; distill final answer style; exclude benchmark overlap",
            "Use to improve problem solving without training on raw private-style thoughts.",
        ),
        (
            "gpqa_diamond", "GPQA Diamond",
            "https://huggingface.co/datasets/Idavidrein/gpqa",
            "expert_science_eval", "text", "graduate-level science QA benchmark",
            "dataset_card_terms_required", "eval_only", 0.95, 0.94,
            "eval only; benchmark-overlap block required",
            "Hard science eval, not training material for claims.",
        ),
        (
            "hle", "Humanity's Last Exam",
            "https://huggingface.co/datasets/cais/hle",
            "frontier_eval", "text_multimodal", "hard cross-domain benchmark",
            "dataset_card_terms_required", "eval_only", 0.95, 0.96,
            "eval only; keep sealed if used for claims",
            "Reserved for high-stakes external-style evaluation.",
        ),
        (
            "ultrachat_200k", "UltraChat 200k",
            "https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k",
            "instruction_chat", "text", "chat SFT corpus",
            "dataset_card_terms_required", "strict_train_after_license_review", 0.84, 0.32,
            "quality-filter assistant turns; remove boilerplate; balance with Thai/tool data",
            "Can repair instruction following if kept below cap.",
        ),
        (
            "tulu_3_sft_mixture", "Tulu 3 SFT Mixture",
            "https://huggingface.co/datasets/allenai/tulu-3-sft-mixture",
            "instruction_chat", "text", "post-training SFT mixture",
            "source_mixture_review_required", "strict_train_after_license_review", 0.86, 0.36,
            "source-level license audit; strip traces; cap any benchmark-derived rows",
            "Good alignment source only after mixture audit.",
        ),
        (
            "aya_collection", "Aya Collection",
            "https://huggingface.co/datasets/CohereForAI/aya_collection",
            "multilingual_instruction", "text", "multilingual instruction data",
            "dataset_card_terms_required", "strict_train_after_license_review", 0.87, 0.30,
            "select Thai/English and neighboring-language tasks; dedupe translations",
            "Useful for natural multilingual instruction following.",
        ),
        (
            "xp3x", "xP3x",
            "https://huggingface.co/datasets/bigscience/xP3x",
            "multilingual_instruction", "text", "multilingual prompted tasks",
            "source_mixture_review_required", "strict_train_after_license_review", 0.83, 0.44,
            "task-source audit; exclude eval datasets; retain only license-clear tasks",
            "Broad multilingual task format source with high audit needs.",
        ),
        (
            "scb_mt_en_th_2020", "SCB MT EN-TH 2020",
            "https://huggingface.co/datasets/airesearch/scb_mt_enth_2020",
            "thai_translation", "text", "Thai-English parallel corpus",
            "dataset_card_terms_required", "strict_train_after_license_review", 0.90, 0.19,
            "balance directions; filter noisy pairs; preserve Thai punctuation and spacing",
            "Core Thai-English grounding candidate.",
        ),
        (
            "vistec_mt_opus", "VISTEC MT OPUS",
            "https://huggingface.co/datasets/vistec-AI/mt-opus",
            "thai_translation", "text", "Thai multilingual translation data",
            "source_mixture_review_required", "strict_train_after_license_review", 0.86, 0.27,
            "language-id filter; dedupe against SCB; reject bad alignment",
            "Supplemental Thai coverage after alignment filtering.",
        ),
        (
            "pythainlp_thai_synonym", "PyThaiNLP Thai Synonym",
            "https://github.com/PyThaiNLP/thai-synonym",
            "thai_lexical", "text", "Thai lexical resource",
            "repository_license_required", "strict_train_after_license_review", 0.87, 0.10,
            "convert to lexical contrastive tasks; keep provenance rows",
            "Small but useful for Thai nuance and paraphrase.",
        ),
        (
            "thai_ner_data", "PyThaiNLP Thai NER Data",
            "https://github.com/PyThaiNLP/thai-named-entity-recognition-data",
            "thai_ner", "text", "Thai named entity data",
            "repository_license_required", "strict_train_after_license_review", 0.86, 0.14,
            "convert to entity extraction and grounded answer tasks; balance with general Thai",
            "Improves Thai factual span handling when not over-weighted.",
        ),
        (
            "common_voice_th", "Common Voice Thai",
            "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0",
            "thai_speech", "audio_text", "crowdsourced speech corpus",
            "cc0_or_dataset_terms_required", "strict_train_after_license_review", 0.84, 0.18,
            "use transcripts for ASR grounding; filter clips by validation votes; keep consent metadata",
            "Speech bridge data for future multimodal path.",
        ),
        (
            "fleurs", "FLEURS",
            "https://huggingface.co/datasets/google/fleurs",
            "multilingual_speech", "audio_text", "multilingual speech benchmark/corpus",
            "dataset_card_terms_required", "strict_train_after_license_review", 0.86, 0.24,
            "split train/eval by language; protect official test splits",
            "Multilingual audio-text alignment source if split hygiene is strict.",
        ),
        (
            "the_cauldron", "The Cauldron",
            "https://huggingface.co/datasets/HuggingFaceM4/the_cauldron",
            "vision_language_instruction", "image_text", "large multimodal instruction mixture",
            "source_mixture_review_required", "strict_train_after_license_review", 0.84, 0.46,
            "source-level audit; strip eval subsets; image safety/license filter",
            "Candidate for multimodal adapter training, not text-only base mixing.",
        ),
        (
            "docvqa", "DocVQA",
            "https://huggingface.co/datasets/HuggingFaceM4/DocVQA",
            "document_vision_eval", "image_text", "document visual question answering",
            "dataset_card_terms_required", "eval_only", 0.93, 0.87,
            "eval only unless using official train split with separate claim protocol",
            "Protects document-understanding measurement.",
        ),
        (
            "kaggle_scicode", "SciCode",
            "https://www.kaggle.com/datasets/open-benchmarks/scicode",
            "scientific_code_eval", "text_code", "scientific coding benchmark",
            "kaggle_terms_required", "eval_only", 0.95, 0.86,
            "hold out only; never train on tasks or reference solutions",
            "Use for external-style coding measurement.",
        ),
        (
            "livecodebench", "LiveCodeBench",
            "https://www.kaggle.com/datasets/open-benchmarks/livecodebench",
            "coding_eval", "text_code", "live coding benchmark",
            "kaggle_terms_required", "eval_only", 0.95, 0.90,
            "eval only; keep prompts and tests out of training pool",
            "Protects real coding generalization signal.",
        ),
        (
            "deepmind_simpleqa_verified", "SimpleQA Verified",
            "https://www.kaggle.com/datasets/deepmind/simpleqa-verified",
            "factual_eval", "text", "verified factual QA benchmark",
            "kaggle_terms_required", "eval_only", 0.96, 0.88,
            "eval only; use answers for scoring, not SFT",
            "Good factuality probe; must stay outside training.",
        ),
        (
            "tiger_mmlu_pro", "MMLU-Pro",
            "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
            "knowledge_eval", "text", "hard multi-choice knowledge benchmark",
            "dataset_card_terms_required", "eval_only", 0.94, 0.92,
            "eval only; n-gram block all overlaps in train corpora",
            "Never train on this if using it for claims.",
        ),
        (
            "global_mmlu_lite", "Global MMLU Lite",
            "https://www.kaggle.com/benchmarks/cohere-labs/global-mmlu-lite",
            "multilingual_eval", "text", "global multilingual benchmark",
            "kaggle_terms_required", "eval_only", 0.94, 0.91,
            "eval only; add Thai/English reporting splits",
            "External-style multilingual knowledge measurement.",
        ),
        (
            "mgsm_multilingual", "MGSM Multilingual Grade School Math",
            "https://www.kaggle.com/datasets/open-benchmarks/mgsm-multilingual-grade-school-math-benchmark",
            "math_eval", "text_math", "multilingual grade-school math benchmark",
            "kaggle_terms_required", "eval_only", 0.94, 0.88,
            "eval only; block translations and answer strings from SFT",
            "Useful Thai math grounding measurement.",
        ),
        (
            "parsebench", "ParseBench",
            "https://www.kaggle.com/datasets/llamaindex-org/parsebench",
            "document_parsing_eval", "document_text", "document parsing benchmark",
            "kaggle_terms_required", "eval_only", 0.93, 0.83,
            "eval only; use synthetic separate parser SFT for training",
            "Measures document/file understanding without contaminating it.",
        ),
        (
            "multiloko", "MultiLoKo",
            "https://www.kaggle.com/datasets/metaresearch/multiloko",
            "multilingual_local_knowledge", "text", "multilingual local-knowledge benchmark",
            "kaggle_terms_required", "eval_only", 0.93, 0.87,
            "eval only; track Thai split separately",
            "Good external-style local knowledge probe.",
        ),
    )
    return [WorldPureSource(**dict(zip(columns, entry))) for entry in table]
def _summarize(sources: Iterable[WorldPureSource]) -> dict:
    """Aggregate headline counts and scores for a collection of sources.

    Args:
        sources: Any iterable of registry rows; it is materialized once, so a
            generator is acceptable. Each row's ``train_allowed`` is read as a
            plain attribute (truthy means the row counts as trainable).

    Returns:
        A dict with ``source_count``, ``domain_count``, ``train_allowed_count``,
        ``eval_only_count``, ``quarantine_count``, ``avg_purity_score``
        (rounded to 4 places) and ``max_contamination_risk``. For an empty
        input every value is zero instead of raising on the average or max.
    """
    rows = list(sources)
    source_count = len(rows)
    # Guard the mean explicitly: an empty registry has no meaningful average.
    avg_purity = (
        round(sum(row.purity_score for row in rows) / source_count, 4)
        if source_count
        else 0.0
    )
    return {
        "source_count": source_count,
        "domain_count": len({row.domain for row in rows}),
        "train_allowed_count": sum(1 for row in rows if row.train_allowed),
        "eval_only_count": sum(1 for row in rows if row.allowed_tier == "eval_only"),
        # Substring match so any tier whose name mentions quarantine is counted.
        "quarantine_count": sum(1 for row in rows if "quarantine" in row.allowed_tier),
        "avg_purity_score": avg_purity,
        "max_contamination_risk": max(
            (row.contamination_risk for row in rows), default=0.0
        ),
    }
| def _write_markdown(report: dict, path: Path) -> None: | |
| lines = [ | |
| "# TinyMind World Pure Source Registry", | |
| "", | |
| "This registry separates training sources, quarantine sources, and evaluation-only sources.", | |
| "No world-best or rank claim is enabled by source discovery alone.", | |
| "", | |
| "## Summary", | |
| "", | |
| ] | |
| for key, value in report["summary"].items(): | |
| lines.append(f"- `{key}`: {value}") | |
| lines.extend(["", "## Sources", ""]) | |
| for row in report["sources"]: | |
| lines.append( | |
| f"- `{row['id']}` ({row['domain']}): `{row['allowed_tier']}`; " | |
| f"license gate `{row['license_gate']}`; purity {row['purity_score']}; " | |
| f"risk {row['contamination_risk']}; {row['repo_or_url']}" | |
| ) | |
| lines.extend( | |
| [ | |
| "", | |
| "## Gates", | |
| "", | |
| "- Training corpora must pass license review, secret scan, dedupe, and benchmark-overlap purge.", | |
| "- Eval-only sources are blocked from training by default.", | |
| "- Raw reasoning traces remain quarantine-only unless converted to final-answer-only data.", | |
| ] | |
| ) | |
| path.write_text("\n".join(lines) + "\n", encoding="utf-8") | |
def _phase_for_source(row: WorldPureSource) -> str:
    """Pick the curriculum phase name for a source.

    Checks run in priority order: multimodal modalities first, then the
    reasoning domains, then anything instruction-like or Thai-specific;
    everything else is "backbone".
    """
    bridge_modalities = {"audio_text", "image_text", "text_multimodal"}
    reasoning_domains = {"mathematics", "web_mathematics", "scientific_papers"}
    alignment_domains = {"code_instruction", "thai_translation", "thai_lexical", "thai_ner"}

    if row.modality in bridge_modalities:
        phase = "multimodal_bridge"
    elif row.domain in reasoning_domains:
        phase = "reasoning"
    elif "instruction" in row.domain or row.domain in alignment_domains:
        phase = "alignment"
    else:
        phase = "backbone"
    return phase
def _initial_weight(row: WorldPureSource) -> float:
    """Compute a source's uncapped, unnormalized mixing weight.

    Starts from a per-phase base, discounts quarantine and license-review
    tiers, then scales by purity and by a contamination penalty. The risk is
    clamped at 0.72 before the penalty is applied, and the result never drops
    below 0.01.
    """
    phase_base = {
        "backbone": 1.0,
        "reasoning": 0.95,
        "alignment": 1.05,
        "multimodal_bridge": 0.45,
    }
    weight = phase_base[_phase_for_source(row)]

    tier = row.allowed_tier
    if tier == "quarantine_then_final_answer_distill":
        weight *= 0.55
    if "license_review" in tier:
        weight *= 0.72

    clamped_risk = min(row.contamination_risk, 0.72)
    return max(0.01, weight * row.purity_score * (1.0 - clamped_risk * 0.45))
def _cap_domain_shares(weights: dict[str, float], sources_by_id: dict[str, WorldPureSource], max_share: float = 0.22) -> dict[str, float]:
    """Return a copy of ``weights`` with the heaviest domain scaled toward ``max_share``.

    Runs at most eight rounds. Each round totals the weights per domain
    (looked up through ``sources_by_id[source_id].domain``), stops if the
    heaviest domain's share is already within ``max_share``, and otherwise
    multiplies every weight in that domain by ``max_share * total / domain``.
    The input mapping is not modified. A non-positive total returns the copy
    unchanged.
    """
    result = dict(weights)
    rounds_left = 8
    while rounds_left > 0:
        rounds_left -= 1
        grand_total = sum(result.values())
        if grand_total <= 0:
            return result

        per_domain: dict[str, float] = {}
        for sid, value in result.items():
            key = sources_by_id[sid].domain
            per_domain[key] = per_domain.get(key, 0.0) + value

        # On ties max() keeps the first domain encountered.
        heaviest = max(per_domain, key=per_domain.get)
        heaviest_total = per_domain[heaviest]
        if heaviest_total / grand_total <= max_share:
            break

        factor = (max_share * grand_total) / heaviest_total
        result = {
            sid: value * factor if sources_by_id[sid].domain == heaviest else value
            for sid, value in result.items()
        }
    return result
def _write_curriculum_markdown(plan: dict, path: Path) -> None:
    """Render the streaming curriculum plan as Markdown and write it to ``path``.

    Reads ``plan["summary"]``, ``plan["train_streams"]`` and
    ``plan["eval_reserves"]`` and emits one bullet per entry under matching
    headings. Mix weights are printed with six decimal places. Output is UTF-8
    with a trailing newline.
    """
    intro = [
        "# TinyMind World Pure Streaming Curriculum",
        "",
        "This plan converts the source registry into bounded streaming weights.",
        "Evaluation-only sources are recorded as reserves and are not used for training.",
        "",
        "## Summary",
        "",
    ]
    summary_bullets = [
        f"- `{key}`: {value}" for key, value in plan["summary"].items()
    ]
    stream_bullets = [
        f"- `{row['source_id']}` phase `{row['phase']}` domain `{row['domain']}` "
        f"weight {row['mix_weight']:.6f}, budget {row['token_budget']}"
        for row in plan["train_streams"]
    ]
    reserve_bullets = [
        f"- `{row['source_id']}` domain `{row['domain']}` risk {row['contamination_risk']}"
        for row in plan["eval_reserves"]
    ]
    document = (
        intro
        + summary_bullets
        + ["", "## Train Streams", ""]
        + stream_bullets
        + ["", "## Eval Reserves", ""]
        + reserve_bullets
    )
    path.write_text("\n".join(document) + "\n", encoding="utf-8")
def build_world_pure_source_registry(out_dir: str | Path) -> dict:
    """Build the source registry report and write it as JSON and Markdown.

    Creates ``out_dir`` if needed, then writes
    ``world_pure_source_registry.json`` and ``world_pure_source_registry.md``
    inside it.

    Returns:
        The report dict that was serialized: schema tag, summary, claim gate,
        global ingest gates, and one record per source (its fields plus a
        ``train_allowed`` flag).
    """
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    entries = _sources()
    summary = _summarize(entries)

    records = []
    for entry in entries:
        record = asdict(entry)
        record["train_allowed"] = entry.train_allowed
        records.append(record)

    # Discovering sources never unlocks performance claims.
    claim_gate = {
        "external_rank_claim_allowed": False,
        "world_best_claim_allowed": False,
        "reason": "Source discovery is not model performance evidence. Official external evals are still required.",
    }
    ingest_gates = {
        "strict_provenance_required": True,
        "license_review_required": True,
        "secret_scan_required": True,
        "dedupe_required": True,
        "benchmark_overlap_purge_required": True,
        "raw_reasoning_trace_main_train_allowed": False,
        "streaming_first": True,
    }
    report = {
        "schema": "tinymind.world_pure_source_registry.v1",
        "summary": summary,
        "claim_gate": claim_gate,
        "global_ingest_gates": ingest_gates,
        "sources": records,
    }

    json_path = target_dir / "world_pure_source_registry.json"
    json_path.write_text(
        json.dumps(report, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )
    _write_markdown(report, target_dir / "world_pure_source_registry.md")
    return report
def build_world_pure_streaming_curriculum(out_dir: str | Path, token_budget: int = 10_000_000) -> dict:
    """Turn the registry into a normalized streaming mix and write it to disk.

    Only sources whose ``train_allowed`` flag is set receive weight; sources in
    the ``eval_only`` tier are listed separately as reserves. Weights come from
    ``_initial_weight``, pass through ``_cap_domain_shares``, and are then
    normalized to sum to one. Writes ``world_pure_streaming_curriculum.json``
    and ``world_pure_streaming_curriculum.md`` into ``out_dir`` (created if
    missing).

    Args:
        out_dir: Directory that receives the two output files.
        token_budget: Total tokens to split across train streams.

    Returns:
        The plan dict that was serialized.

    Raises:
        ValueError: If ``token_budget`` is not positive.
    """
    if token_budget <= 0:
        raise ValueError("token_budget must be positive")

    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    registry = _sources()
    trainable = {entry.id: entry for entry in registry if entry.train_allowed}
    held_out = [entry for entry in registry if entry.allowed_tier == "eval_only"]

    starting = {sid: _initial_weight(entry) for sid, entry in trainable.items()}
    bounded = _cap_domain_shares(starting, trainable)
    weight_sum = sum(bounded.values())
    shares = {sid: value / weight_sum for sid, value in bounded.items()}

    # Heaviest stream first; ties are broken alphabetically by source id.
    ranked = sorted(shares.items(), key=lambda pair: (-pair[1], pair[0]))

    train_streams = []
    domain_shares: dict[str, float] = {}
    for sid, share in ranked:
        entry = trainable[sid]
        domain_shares[entry.domain] = domain_shares.get(entry.domain, 0.0) + share
        stream = {
            "source_id": entry.id,
            "name": entry.name,
            "repo_or_url": entry.repo_or_url,
            "domain": entry.domain,
            "phase": _phase_for_source(entry),
            "allowed_tier": entry.allowed_tier,
            "license_gate": entry.license_gate,
            "mix_weight": round(share, 10),
            "token_budget": int(round(token_budget * share)),
            "loss_multiplier": round(0.55 + entry.purity_score * 0.45, 4),
            "contamination_guard": "dedupe+benchmark_ngram_purge+secret_scan",
            "ingest_policy": entry.ingest_policy,
        }
        train_streams.append(stream)

    # Fold the rounding residue into the top stream so the weights still sum to 1.
    if train_streams:
        residue = 1.0 - sum(stream["mix_weight"] for stream in train_streams)
        top_stream = train_streams[0]
        top_stream["mix_weight"] = round(top_stream["mix_weight"] + residue, 10)

    eval_reserves = []
    for entry in held_out:
        eval_reserves.append(
            {
                "source_id": entry.id,
                "name": entry.name,
                "repo_or_url": entry.repo_or_url,
                "domain": entry.domain,
                "contamination_risk": entry.contamination_risk,
                "policy": entry.ingest_policy,
            }
        )

    largest_share = max(domain_shares.values()) if domain_shares else 0.0
    summary = {
        "token_budget": token_budget,
        "train_source_count": len(train_streams),
        "eval_source_count": len(eval_reserves),
        "domain_count": len({stream["domain"] for stream in train_streams}),
        "largest_domain_share": round(largest_share, 6),
    }
    claim_gate = {
        "main_training_allowed": bool(train_streams),
        "world_best_claim_allowed": False,
        "official_rank_claim_allowed": False,
        "reason": "Curriculum planning is not performance evidence; external held-out eval is required.",
    }
    plan = {
        "schema": "tinymind.world_pure_streaming_curriculum.v1",
        "summary": summary,
        "claim_gate": claim_gate,
        "train_streams": train_streams,
        "eval_reserves": eval_reserves,
    }

    json_path = target_dir / "world_pure_streaming_curriculum.json"
    json_path.write_text(
        json.dumps(plan, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )
    _write_curriculum_markdown(plan, target_dir / "world_pure_streaming_curriculum.md")
    return plan
if __name__ == "__main__":
    # Script entry point: regenerate both report bundles in one directory.
    out_dir = Path("reports", "world_pure_source_registry")
    for _builder in (
        build_world_pure_source_registry,
        build_world_pure_streaming_curriculum,
    ):
        _builder(out_dir)
Xet Storage Details
- Size:
- 34.8 kB
- Xet hash:
- 7c4b2fe65448ad6a048c6f83487462e0be9a7946dbad5455b56a49e7bc81f024
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.