bbkdevops's picture
download
raw
34.8 kB
"""High-provenance source registry for TinyMind data acquisition.
This module does not download large corpora by default. It records where each
source may be used, which gates must pass first, and which sources are reserved
for evaluation only so training cannot silently contaminate benchmarks.
"""
from __future__ import annotations
import json
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Iterable
# Usage tiers that count as "training allowed". WorldPureSource.train_allowed
# tests membership in this set, so any tier not listed here (for example
# "eval_only") keeps a source out of the training streams.
TRAIN_TIERS = {
    "streaming_train",
    "strict_train_after_license_review",
    "quarantine_then_final_answer_distill",
}
@dataclass(frozen=True)
class WorldPureSource:
    """Immutable registry entry describing one candidate data source.

    Each entry records where the source lives, which license gate applies to
    it, and which usage tier it has been assigned. Whether the source may be
    used for training is derived from ``allowed_tier`` via ``train_allowed``.
    """

    # Identifier of the entry; used as the lookup key when the curriculum is built.
    id: str
    # Human-readable name of the source.
    name: str
    # Location of the dataset or repository.
    repo_or_url: str
    # Topical domain label; domain shares are capped per label in the curriculum.
    domain: str
    # Modality label (e.g. "text", "text_code", "audio_text"); drives phase selection.
    modality: str
    # Free-text description of the approximate size or nature of the source.
    scale_hint: str
    # Name of the license check recorded for this source.
    license_gate: str
    # Usage tier; compared against TRAIN_TIERS and the literal "eval_only".
    allowed_tier: str
    # Quality estimate; registry values lie between 0 and 1 (not enforced here).
    purity_score: float
    # Benchmark-contamination estimate; registry values lie between 0 and 1 (not enforced here).
    contamination_risk: float
    # Free-text description of how the source is meant to be ingested.
    ingest_policy: str
    # Free-text remarks about the source.
    notes: str

    @property
    def train_allowed(self) -> bool:
        """Return True when ``allowed_tier`` is one of ``TRAIN_TIERS``."""
        return self.allowed_tier in TRAIN_TIERS
def _sources() -> list[WorldPureSource]:
    """Return the hard-coded registry of candidate data sources.

    A fresh list is built on every call. Entries whose ``allowed_tier`` is
    ``"eval_only"`` are benchmarks recorded so they can be kept out of
    training; every other entry uses one of the training tiers.
    """
    return [
        WorldPureSource(
            id="fineweb_edu",
            name="FineWeb-Edu",
            repo_or_url="https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu",
            domain="education_web",
            modality="text",
            scale_hint="trillion-token educational web subset",
            license_gate="dataset_card_terms_required",
            allowed_tier="streaming_train",
            purity_score=0.94,
            contamination_risk=0.34,
            ingest_policy="stream shards; score-filter; remove benchmark n-grams; cap per domain",
            notes="Use as broad knowledge background, never as eval replacement.",
        ),
        WorldPureSource(
            id="cosmopedia_v2",
            name="Cosmopedia v2",
            repo_or_url="https://huggingface.co/datasets/HuggingFaceTB/cosmopedia-v2",
            domain="synthetic_textbook",
            modality="text",
            scale_hint="large synthetic textbook corpus",
            license_gate="dataset_card_terms_required",
            allowed_tier="streaming_train",
            purity_score=0.93,
            contamination_risk=0.22,
            ingest_policy="prefer textbook-style samples; strip hidden traces; dedupe against local SFT",
            notes="Good for fluency and structured explanations.",
        ),
        WorldPureSource(
            id="allenai_dolma",
            name="Dolma",
            repo_or_url="https://huggingface.co/datasets/allenai/dolma",
            domain="mixed_open_corpus",
            modality="text_code",
            scale_hint="multi-trillion-token transparent corpus",
            license_gate="source_mixture_review_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.88,
            contamination_risk=0.42,
            ingest_policy="use documented subsets only; require URL/provenance manifest; heavy dedupe",
            notes="Transparent enough to audit, but mixed-source terms need strict review.",
        ),
        WorldPureSource(
            id="fineweb_2_multilingual",
            name="FineWeb 2",
            repo_or_url="https://huggingface.co/datasets/HuggingFaceFW/fineweb-2",
            domain="multilingual_web",
            modality="text",
            scale_hint="large multilingual cleaned web corpus",
            license_gate="dataset_card_terms_required",
            allowed_tier="streaming_train",
            purity_score=0.90,
            contamination_risk=0.40,
            ingest_policy="language-id route; keep Thai/English high-score slices; block benchmark overlap",
            notes="Use to widen multilingual general knowledge without dominating Thai SFT.",
        ),
        WorldPureSource(
            id="wikimedia_wikipedia",
            name="Wikimedia Wikipedia",
            repo_or_url="https://huggingface.co/datasets/wikimedia/wikipedia",
            domain="encyclopedic_reference",
            modality="text",
            scale_hint="multilingual encyclopedia snapshots",
            license_gate="cc_by_sa_attribution_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.89,
            contamination_risk=0.26,
            ingest_policy="prefer current snapshots; keep attribution/provenance; remove low-quality markup",
            notes="Strong factual backbone if attribution and versioning are preserved.",
        ),
        WorldPureSource(
            id="allenai_pes2o",
            name="peS2o",
            repo_or_url="https://huggingface.co/datasets/allenai/peS2o",
            domain="scientific_papers",
            modality="text",
            scale_hint="scientific-paper text corpus",
            license_gate="source_mixture_review_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.91,
            contamination_risk=0.35,
            ingest_policy="license-filter papers; extract abstracts/methods; dedupe against arxiv-like corpora",
            notes="Useful for scientific style and terminology.",
        ),
        WorldPureSource(
            id="openwebmath",
            name="OpenWebMath",
            repo_or_url="https://huggingface.co/datasets/open-web-math/open-web-math",
            domain="web_mathematics",
            modality="text_math",
            scale_hint="math-heavy web corpus",
            license_gate="dataset_card_terms_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.88,
            contamination_risk=0.38,
            ingest_policy="formula-preserving parser; remove copied benchmark statements; cap web forum noise",
            notes="Complements curated math with broad notation exposure.",
        ),
        WorldPureSource(
            id="bigcode_stack_v2",
            name="The Stack v2",
            repo_or_url="https://huggingface.co/datasets/bigcode/the-stack-v2",
            domain="source_code",
            modality="code",
            scale_hint="tens of TB of source code metadata/content",
            license_gate="permissive_source_license_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.90,
            contamination_risk=0.39,
            ingest_policy="only permissive-license files; remove secrets; remove benchmark solutions",
            notes="Main large-scale code source, but never bypass license or secret scans.",
        ),
        WorldPureSource(
            id="starcoderdata",
            name="StarCoderData",
            repo_or_url="https://huggingface.co/datasets/bigcode/starcoderdata",
            domain="source_code",
            modality="code",
            scale_hint="curated code training corpus lineage",
            license_gate="permissive_source_license_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.88,
            contamination_risk=0.37,
            ingest_policy="prefer permissive files; run secret scan; remove generated/vendor duplicates",
            notes="Secondary code source for diversity after The Stack v2.",
        ),
        WorldPureSource(
            id="nvidia_opencodeinstruct",
            name="OpenCodeInstruct",
            repo_or_url="https://huggingface.co/datasets/nvidia/OpenCodeInstruct",
            domain="code_instruction",
            modality="text_code",
            scale_hint="code instruction corpus",
            license_gate="dataset_card_terms_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.91,
            contamination_risk=0.31,
            ingest_policy="keep executable examples; unit-test snippets when possible; cap template repeats",
            notes="Useful for instruction-following on software tasks.",
        ),
        WorldPureSource(
            id="humaneval",
            name="HumanEval",
            repo_or_url="https://huggingface.co/datasets/openai/openai_humaneval",
            domain="coding_eval",
            modality="text_code",
            scale_hint="Python coding benchmark",
            license_gate="dataset_card_terms_required",
            allowed_tier="eval_only",
            purity_score=0.94,
            contamination_risk=0.95,
            ingest_policy="eval only; block prompt, canonical solution, and tests from train data",
            notes="Classic code benchmark; very high contamination risk.",
        ),
        WorldPureSource(
            id="mbpp",
            name="Mostly Basic Python Problems",
            repo_or_url="https://huggingface.co/datasets/google-research-datasets/mbpp",
            domain="coding_eval",
            modality="text_code",
            scale_hint="Python programming benchmark",
            license_gate="dataset_card_terms_required",
            allowed_tier="eval_only",
            purity_score=0.93,
            contamination_risk=0.93,
            ingest_policy="eval only; use separate synthetic coding SFT instead",
            notes="Do not train if measuring coding generalization.",
        ),
        WorldPureSource(
            id="numina_math_1_5",
            name="NuminaMath 1.5",
            repo_or_url="https://huggingface.co/datasets/AI-MO/NuminaMath-1.5",
            domain="mathematics",
            modality="text_math",
            scale_hint="large verified math reasoning set",
            license_gate="dataset_card_terms_required",
            allowed_tier="quarantine_then_final_answer_distill",
            purity_score=0.92,
            contamination_risk=0.28,
            ingest_policy="verify answers; strip raw hidden reasoning when required; keep final derivations",
            notes="High-value math source, handled carefully to avoid trace imitation.",
        ),
        WorldPureSource(
            id="openr1_math_220k",
            name="OpenR1 Math 220k",
            repo_or_url="https://huggingface.co/datasets/open-r1/OpenR1-Math-220k",
            domain="mathematics",
            modality="text_math",
            scale_hint="220k reasoning/math examples",
            license_gate="dataset_card_terms_required",
            allowed_tier="quarantine_then_final_answer_distill",
            purity_score=0.89,
            contamination_risk=0.33,
            ingest_policy="quarantine raw traces; distill final answer style; exclude benchmark overlap",
            notes="Use to improve problem solving without training on raw private-style thoughts.",
        ),
        WorldPureSource(
            id="gpqa_diamond",
            name="GPQA Diamond",
            repo_or_url="https://huggingface.co/datasets/Idavidrein/gpqa",
            domain="expert_science_eval",
            modality="text",
            scale_hint="graduate-level science QA benchmark",
            license_gate="dataset_card_terms_required",
            allowed_tier="eval_only",
            purity_score=0.95,
            contamination_risk=0.94,
            ingest_policy="eval only; benchmark-overlap block required",
            notes="Hard science eval, not training material for claims.",
        ),
        WorldPureSource(
            id="hle",
            name="Humanity's Last Exam",
            repo_or_url="https://huggingface.co/datasets/cais/hle",
            domain="frontier_eval",
            modality="text_multimodal",
            scale_hint="hard cross-domain benchmark",
            license_gate="dataset_card_terms_required",
            allowed_tier="eval_only",
            purity_score=0.95,
            contamination_risk=0.96,
            ingest_policy="eval only; keep sealed if used for claims",
            notes="Reserved for high-stakes external-style evaluation.",
        ),
        WorldPureSource(
            id="ultrachat_200k",
            name="UltraChat 200k",
            repo_or_url="https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k",
            domain="instruction_chat",
            modality="text",
            scale_hint="chat SFT corpus",
            license_gate="dataset_card_terms_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.84,
            contamination_risk=0.32,
            ingest_policy="quality-filter assistant turns; remove boilerplate; balance with Thai/tool data",
            notes="Can repair instruction following if kept below cap.",
        ),
        WorldPureSource(
            id="tulu_3_sft_mixture",
            name="Tulu 3 SFT Mixture",
            repo_or_url="https://huggingface.co/datasets/allenai/tulu-3-sft-mixture",
            domain="instruction_chat",
            modality="text",
            scale_hint="post-training SFT mixture",
            license_gate="source_mixture_review_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.86,
            contamination_risk=0.36,
            ingest_policy="source-level license audit; strip traces; cap any benchmark-derived rows",
            notes="Good alignment source only after mixture audit.",
        ),
        WorldPureSource(
            id="aya_collection",
            name="Aya Collection",
            repo_or_url="https://huggingface.co/datasets/CohereForAI/aya_collection",
            domain="multilingual_instruction",
            modality="text",
            scale_hint="multilingual instruction data",
            license_gate="dataset_card_terms_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.87,
            contamination_risk=0.30,
            ingest_policy="select Thai/English and neighboring-language tasks; dedupe translations",
            notes="Useful for natural multilingual instruction following.",
        ),
        WorldPureSource(
            id="xp3x",
            name="xP3x",
            repo_or_url="https://huggingface.co/datasets/bigscience/xP3x",
            domain="multilingual_instruction",
            modality="text",
            scale_hint="multilingual prompted tasks",
            license_gate="source_mixture_review_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.83,
            contamination_risk=0.44,
            ingest_policy="task-source audit; exclude eval datasets; retain only license-clear tasks",
            notes="Broad multilingual task format source with high audit needs.",
        ),
        WorldPureSource(
            id="scb_mt_en_th_2020",
            name="SCB MT EN-TH 2020",
            repo_or_url="https://huggingface.co/datasets/airesearch/scb_mt_enth_2020",
            domain="thai_translation",
            modality="text",
            scale_hint="Thai-English parallel corpus",
            license_gate="dataset_card_terms_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.90,
            contamination_risk=0.19,
            ingest_policy="balance directions; filter noisy pairs; preserve Thai punctuation and spacing",
            notes="Core Thai-English grounding candidate.",
        ),
        WorldPureSource(
            id="vistec_mt_opus",
            name="VISTEC MT OPUS",
            repo_or_url="https://huggingface.co/datasets/vistec-AI/mt-opus",
            domain="thai_translation",
            modality="text",
            scale_hint="Thai multilingual translation data",
            license_gate="source_mixture_review_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.86,
            contamination_risk=0.27,
            ingest_policy="language-id filter; dedupe against SCB; reject bad alignment",
            notes="Supplemental Thai coverage after alignment filtering.",
        ),
        WorldPureSource(
            id="pythainlp_thai_synonym",
            name="PyThaiNLP Thai Synonym",
            repo_or_url="https://github.com/PyThaiNLP/thai-synonym",
            domain="thai_lexical",
            modality="text",
            scale_hint="Thai lexical resource",
            license_gate="repository_license_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.87,
            contamination_risk=0.10,
            ingest_policy="convert to lexical contrastive tasks; keep provenance rows",
            notes="Small but useful for Thai nuance and paraphrase.",
        ),
        WorldPureSource(
            id="thai_ner_data",
            name="PyThaiNLP Thai NER Data",
            repo_or_url="https://github.com/PyThaiNLP/thai-named-entity-recognition-data",
            domain="thai_ner",
            modality="text",
            scale_hint="Thai named entity data",
            license_gate="repository_license_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.86,
            contamination_risk=0.14,
            ingest_policy="convert to entity extraction and grounded answer tasks; balance with general Thai",
            notes="Improves Thai factual span handling when not over-weighted.",
        ),
        WorldPureSource(
            id="common_voice_th",
            name="Common Voice Thai",
            repo_or_url="https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0",
            domain="thai_speech",
            modality="audio_text",
            scale_hint="crowdsourced speech corpus",
            license_gate="cc0_or_dataset_terms_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.84,
            contamination_risk=0.18,
            ingest_policy="use transcripts for ASR grounding; filter clips by validation votes; keep consent metadata",
            notes="Speech bridge data for future multimodal path.",
        ),
        WorldPureSource(
            id="fleurs",
            name="FLEURS",
            repo_or_url="https://huggingface.co/datasets/google/fleurs",
            domain="multilingual_speech",
            modality="audio_text",
            scale_hint="multilingual speech benchmark/corpus",
            license_gate="dataset_card_terms_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.86,
            contamination_risk=0.24,
            ingest_policy="split train/eval by language; protect official test splits",
            notes="Multilingual audio-text alignment source if split hygiene is strict.",
        ),
        WorldPureSource(
            id="the_cauldron",
            name="The Cauldron",
            repo_or_url="https://huggingface.co/datasets/HuggingFaceM4/the_cauldron",
            domain="vision_language_instruction",
            modality="image_text",
            scale_hint="large multimodal instruction mixture",
            license_gate="source_mixture_review_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.84,
            contamination_risk=0.46,
            ingest_policy="source-level audit; strip eval subsets; image safety/license filter",
            notes="Candidate for multimodal adapter training, not text-only base mixing.",
        ),
        WorldPureSource(
            id="docvqa",
            name="DocVQA",
            repo_or_url="https://huggingface.co/datasets/HuggingFaceM4/DocVQA",
            domain="document_vision_eval",
            modality="image_text",
            scale_hint="document visual question answering",
            license_gate="dataset_card_terms_required",
            allowed_tier="eval_only",
            purity_score=0.93,
            contamination_risk=0.87,
            ingest_policy="eval only unless using official train split with separate claim protocol",
            notes="Protects document-understanding measurement.",
        ),
        WorldPureSource(
            id="kaggle_scicode",
            name="SciCode",
            repo_or_url="https://www.kaggle.com/datasets/open-benchmarks/scicode",
            domain="scientific_code_eval",
            modality="text_code",
            scale_hint="scientific coding benchmark",
            license_gate="kaggle_terms_required",
            allowed_tier="eval_only",
            purity_score=0.95,
            contamination_risk=0.86,
            ingest_policy="hold out only; never train on tasks or reference solutions",
            notes="Use for external-style coding measurement.",
        ),
        WorldPureSource(
            id="livecodebench",
            name="LiveCodeBench",
            repo_or_url="https://www.kaggle.com/datasets/open-benchmarks/livecodebench",
            domain="coding_eval",
            modality="text_code",
            scale_hint="live coding benchmark",
            license_gate="kaggle_terms_required",
            allowed_tier="eval_only",
            purity_score=0.95,
            contamination_risk=0.90,
            ingest_policy="eval only; keep prompts and tests out of training pool",
            notes="Protects real coding generalization signal.",
        ),
        WorldPureSource(
            id="deepmind_simpleqa_verified",
            name="SimpleQA Verified",
            repo_or_url="https://www.kaggle.com/datasets/deepmind/simpleqa-verified",
            domain="factual_eval",
            modality="text",
            scale_hint="verified factual QA benchmark",
            license_gate="kaggle_terms_required",
            allowed_tier="eval_only",
            purity_score=0.96,
            contamination_risk=0.88,
            ingest_policy="eval only; use answers for scoring, not SFT",
            notes="Good factuality probe; must stay outside training.",
        ),
        WorldPureSource(
            id="tiger_mmlu_pro",
            name="MMLU-Pro",
            repo_or_url="https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
            domain="knowledge_eval",
            modality="text",
            scale_hint="hard multi-choice knowledge benchmark",
            license_gate="dataset_card_terms_required",
            allowed_tier="eval_only",
            purity_score=0.94,
            contamination_risk=0.92,
            ingest_policy="eval only; n-gram block all overlaps in train corpora",
            notes="Never train on this if using it for claims.",
        ),
        WorldPureSource(
            id="global_mmlu_lite",
            name="Global MMLU Lite",
            repo_or_url="https://www.kaggle.com/benchmarks/cohere-labs/global-mmlu-lite",
            domain="multilingual_eval",
            modality="text",
            scale_hint="global multilingual benchmark",
            license_gate="kaggle_terms_required",
            allowed_tier="eval_only",
            purity_score=0.94,
            contamination_risk=0.91,
            ingest_policy="eval only; add Thai/English reporting splits",
            notes="External-style multilingual knowledge measurement.",
        ),
        WorldPureSource(
            id="mgsm_multilingual",
            name="MGSM Multilingual Grade School Math",
            repo_or_url="https://www.kaggle.com/datasets/open-benchmarks/mgsm-multilingual-grade-school-math-benchmark",
            domain="math_eval",
            modality="text_math",
            scale_hint="multilingual grade-school math benchmark",
            license_gate="kaggle_terms_required",
            allowed_tier="eval_only",
            purity_score=0.94,
            contamination_risk=0.88,
            ingest_policy="eval only; block translations and answer strings from SFT",
            notes="Useful Thai math grounding measurement.",
        ),
        WorldPureSource(
            id="parsebench",
            name="ParseBench",
            repo_or_url="https://www.kaggle.com/datasets/llamaindex-org/parsebench",
            domain="document_parsing_eval",
            modality="document_text",
            scale_hint="document parsing benchmark",
            license_gate="kaggle_terms_required",
            allowed_tier="eval_only",
            purity_score=0.93,
            contamination_risk=0.83,
            ingest_policy="eval only; use synthetic separate parser SFT for training",
            notes="Measures document/file understanding without contaminating it.",
        ),
        WorldPureSource(
            id="multiloko",
            name="MultiLoKo",
            repo_or_url="https://www.kaggle.com/datasets/metaresearch/multiloko",
            domain="multilingual_local_knowledge",
            modality="text",
            scale_hint="multilingual local-knowledge benchmark",
            license_gate="kaggle_terms_required",
            allowed_tier="eval_only",
            purity_score=0.93,
            contamination_risk=0.87,
            ingest_policy="eval only; track Thai split separately",
            notes="Good external-style local knowledge probe.",
        ),
    ]
def _summarize(sources: Iterable[WorldPureSource]) -> dict:
    """Compute aggregate counts and score statistics for a set of sources.

    Args:
        sources: Registry entries to summarize. The iterable is consumed once.

    Returns:
        A dict with the number of sources and distinct domains, the number of
        train-allowed, eval-only and quarantine-tier entries, the mean purity
        score rounded to four decimals, and the highest contamination risk.
        For an empty input every count is 0 and both statistics are 0.0.
    """
    rows = list(sources)
    domains = {row.domain for row in rows}
    train_allowed = [row for row in rows if row.train_allowed]
    eval_only = [row for row in rows if row.allowed_tier == "eval_only"]
    # Any tier whose name mentions "quarantine" counts as a quarantine source.
    quarantine = [row for row in rows if "quarantine" in row.allowed_tier]
    # An empty registry must not crash the report: guard the division and
    # give max() a default instead of letting it raise on an empty sequence.
    if rows:
        avg_purity = round(sum(row.purity_score for row in rows) / len(rows), 4)
    else:
        avg_purity = 0.0
    max_risk = max((row.contamination_risk for row in rows), default=0.0)
    return {
        "source_count": len(rows),
        "domain_count": len(domains),
        "train_allowed_count": len(train_allowed),
        "eval_only_count": len(eval_only),
        "quarantine_count": len(quarantine),
        "avg_purity_score": avg_purity,
        "max_contamination_risk": max_risk,
    }
def _write_markdown(report: dict, path: Path) -> None:
    """Render the registry report as a Markdown file.

    Args:
        report: Mapping with a ``"summary"`` dict and a ``"sources"`` list of
            per-source dicts (id, domain, allowed_tier, license_gate,
            purity_score, contamination_risk, repo_or_url).
        path: Destination file; written as UTF-8 and overwritten if present.
    """
    preamble = [
        "# TinyMind World Pure Source Registry",
        "",
        "This registry separates training sources, quarantine sources, and evaluation-only sources.",
        "No world-best or rank claim is enabled by source discovery alone.",
        "",
        "## Summary",
        "",
    ]
    # One bullet per summary statistic, in the order the summary dict provides.
    summary_bullets = [f"- `{key}`: {value}" for key, value in report["summary"].items()]
    # One bullet per source with its tier, license gate, scores and location.
    source_bullets = [
        (
            f"- `{entry['id']}` ({entry['domain']}): `{entry['allowed_tier']}`; "
            f"license gate `{entry['license_gate']}`; purity {entry['purity_score']}; "
            f"risk {entry['contamination_risk']}; {entry['repo_or_url']}"
        )
        for entry in report["sources"]
    ]
    gates_section = [
        "",
        "## Gates",
        "",
        "- Training corpora must pass license review, secret scan, dedupe, and benchmark-overlap purge.",
        "- Eval-only sources are blocked from training by default.",
        "- Raw reasoning traces remain quarantine-only unless converted to final-answer-only data.",
    ]
    document = [
        *preamble,
        *summary_bullets,
        "",
        "## Sources",
        "",
        *source_bullets,
        *gates_section,
    ]
    path.write_text("\n".join(document) + "\n", encoding="utf-8")
def _phase_for_source(row: WorldPureSource) -> str:
    """Classify a source into a curriculum phase.

    The checks run in priority order: modality first, then reasoning-style
    domains, then instruction/Thai-language domains; everything else is
    ``"backbone"``.

    Returns:
        One of ``"multimodal_bridge"``, ``"reasoning"``, ``"alignment"`` or
        ``"backbone"``.
    """
    multimodal_modalities = {"audio_text", "image_text", "text_multimodal"}
    reasoning_domains = {"mathematics", "web_mathematics", "scientific_papers"}
    alignment_domains = {"code_instruction", "thai_translation", "thai_lexical", "thai_ner"}

    if row.modality in multimodal_modalities:
        phase = "multimodal_bridge"
    elif row.domain in reasoning_domains:
        phase = "reasoning"
    elif "instruction" in row.domain or row.domain in alignment_domains:
        phase = "alignment"
    else:
        phase = "backbone"
    return phase
def _initial_weight(row: WorldPureSource) -> float:
    """Compute the uncapped mixture weight for one source.

    The weight starts from a per-phase base, is discounted for quarantine and
    license-review tiers, and is then scaled by the purity score and by a
    penalty derived from the contamination risk.

    Returns:
        The weight, never lower than 0.01.
    """
    phase_base = {
        "backbone": 1.0,
        "reasoning": 0.95,
        "alignment": 1.05,
        "multimodal_bridge": 0.45,
    }
    weight = phase_base[_phase_for_source(row)]

    tier = row.allowed_tier
    if tier == "quarantine_then_final_answer_distill":
        weight *= 0.55
    if "license_review" in tier:
        weight *= 0.72

    # Risk above 0.72 is treated as 0.72, so the penalty factor never drops
    # below 1 - 0.72 * 0.45.
    risk_factor = 1.0 - min(row.contamination_risk, 0.72) * 0.45
    return max(0.01, weight * row.purity_score * risk_factor)
def _cap_domain_shares(weights: dict[str, float], sources_by_id: dict[str, WorldPureSource], max_share: float = 0.22) -> dict[str, float]:
    """Shrink over-represented domains toward a maximum share of the total.

    Runs at most eight passes. Each pass finds the domain with the largest
    summed weight and, if its share of the total exceeds ``max_share``, scales
    every source in that domain by the same factor. The input mapping is not
    modified.

    Args:
        weights: Weight per source id.
        sources_by_id: Lookup from source id to an object exposing ``domain``.
        max_share: Target upper bound for any single domain's share.

    Returns:
        A new dict of adjusted weights (not renormalized). Because the pass
        count is bounded, the largest share may still sit marginally above
        ``max_share`` when the loop ends.
    """
    adjusted = dict(weights)
    passes_left = 8
    while passes_left > 0:
        passes_left -= 1
        grand_total = sum(adjusted.values())
        if grand_total <= 0:
            return adjusted

        per_domain: dict[str, float] = {}
        for source_id, weight in adjusted.items():
            domain = sources_by_id[source_id].domain
            per_domain[domain] = per_domain.get(domain, 0.0) + weight

        # max() keeps the first domain seen on ties, matching insertion order.
        heaviest = max(per_domain, key=per_domain.get)
        heaviest_weight = per_domain[heaviest]
        if heaviest_weight / grand_total <= max_share:
            break

        shrink = (max_share * grand_total) / heaviest_weight
        adjusted = {
            source_id: weight * shrink if sources_by_id[source_id].domain == heaviest else weight
            for source_id, weight in adjusted.items()
        }
    return adjusted
def _write_curriculum_markdown(plan: dict, path: Path) -> None:
    """Render the streaming curriculum plan as a Markdown file.

    Args:
        plan: Mapping with a ``"summary"`` dict, a ``"train_streams"`` list
            (source_id, phase, domain, mix_weight, token_budget) and an
            ``"eval_reserves"`` list (source_id, domain, contamination_risk).
        path: Destination file; written as UTF-8 and overwritten if present.
    """
    intro = [
        "# TinyMind World Pure Streaming Curriculum",
        "",
        "This plan converts the source registry into bounded streaming weights.",
        "Evaluation-only sources are recorded as reserves and are not used for training.",
        "",
        "## Summary",
        "",
    ]
    summary_bullets = [f"- `{key}`: {value}" for key, value in plan["summary"].items()]
    # Mix weights are printed with six decimals; budgets are printed as stored.
    stream_bullets = [
        (
            f"- `{stream['source_id']}` phase `{stream['phase']}` domain `{stream['domain']}` "
            f"weight {stream['mix_weight']:.6f}, budget {stream['token_budget']}"
        )
        for stream in plan["train_streams"]
    ]
    reserve_bullets = [
        f"- `{reserve['source_id']}` domain `{reserve['domain']}` risk {reserve['contamination_risk']}"
        for reserve in plan["eval_reserves"]
    ]
    document = [
        *intro,
        *summary_bullets,
        "",
        "## Train Streams",
        "",
        *stream_bullets,
        "",
        "## Eval Reserves",
        "",
        *reserve_bullets,
    ]
    path.write_text("\n".join(document) + "\n", encoding="utf-8")
def build_world_pure_source_registry(out_dir: str | Path) -> dict:
    """Write the source registry report as JSON and Markdown and return it.

    Args:
        out_dir: Directory for the report files; created (with parents) when
            missing.

    Returns:
        The report dict that was serialized to
        ``world_pure_source_registry.json`` and rendered to
        ``world_pure_source_registry.md``.
    """
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    registry = _sources()
    summary = _summarize(registry)

    # asdict() only covers dataclass fields, so the derived train_allowed
    # property is appended to each serialized row explicitly.
    source_rows = []
    for entry in registry:
        record = asdict(entry)
        record["train_allowed"] = entry.train_allowed
        source_rows.append(record)

    claim_gate = {
        "external_rank_claim_allowed": False,
        "world_best_claim_allowed": False,
        "reason": "Source discovery is not model performance evidence. Official external evals are still required.",
    }
    global_ingest_gates = {
        "strict_provenance_required": True,
        "license_review_required": True,
        "secret_scan_required": True,
        "dedupe_required": True,
        "benchmark_overlap_purge_required": True,
        "raw_reasoning_trace_main_train_allowed": False,
        "streaming_first": True,
    }
    report = {
        "schema": "tinymind.world_pure_source_registry.v1",
        "summary": summary,
        "claim_gate": claim_gate,
        "global_ingest_gates": global_ingest_gates,
        "sources": source_rows,
    }

    json_path = target_dir / "world_pure_source_registry.json"
    serialized = json.dumps(report, ensure_ascii=False, indent=2)
    json_path.write_text(serialized + "\n", encoding="utf-8")
    _write_markdown(report, target_dir / "world_pure_source_registry.md")
    return report
def _allocate_token_budgets(shares: dict[str, float], token_budget: int) -> dict[str, int]:
    """Split ``token_budget`` into whole-token allocations proportional to ``shares``.

    Uses the largest-remainder method: every key first receives the floor of
    its exact allocation, then the leftover tokens are handed out one each to
    the keys with the largest fractional parts (ties broken by key so the
    result is deterministic). When ``shares`` are non-negative and sum to 1,
    the returned budgets sum to exactly ``token_budget``.
    """
    exact = {key: token_budget * share for key, share in shares.items()}
    # int() truncates toward zero, which is the floor for non-negative values.
    budgets = {key: int(amount) for key, amount in exact.items()}
    leftover = token_budget - sum(budgets.values())
    if leftover > 0:
        by_fraction = sorted(exact, key=lambda key: (budgets[key] - exact[key], key))
        for key in by_fraction[:leftover]:
            budgets[key] += 1
    return budgets


def build_world_pure_streaming_curriculum(out_dir: str | Path, token_budget: int = 10_000_000) -> dict:
    """Build a bounded streaming curriculum from the registry.

    The returned plan is intentionally conservative: no eval-only source is
    assigned training weight, raw trace-like sources remain low weight, and no
    single domain is allowed to dominate the mixture.

    Args:
        out_dir: Directory for ``world_pure_streaming_curriculum.json`` and
            ``.md``; created (with parents) when missing.
        token_budget: Total number of training tokens to distribute across
            the train streams.

    Returns:
        The plan dict that was written to disk. Per-stream ``token_budget``
        values are allocated so that they sum to the requested budget instead
        of being rounded independently.

    Raises:
        ValueError: If ``token_budget`` is not positive.
    """
    if token_budget <= 0:
        raise ValueError("token_budget must be positive")
    out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)
    sources = _sources()
    train_sources = [row for row in sources if row.train_allowed]
    eval_sources = [row for row in sources if row.allowed_tier == "eval_only"]
    sources_by_id = {row.id: row for row in train_sources}
    raw_weights = {row.id: _initial_weight(row) for row in train_sources}
    capped_weights = _cap_domain_shares(raw_weights, sources_by_id)
    total = sum(capped_weights.values())
    normalized = {source_id: weight / total for source_id, weight in capped_weights.items()}
    # Independent rounding per stream lets the budgets drift away from the
    # requested total; allocate them jointly so the plan stays bounded.
    budgets = _allocate_token_budgets(normalized, token_budget)
    train_streams = []
    domain_shares: dict[str, float] = {}
    # Heaviest streams first; ties are ordered by source id for stable output.
    for source_id, mix_weight in sorted(normalized.items(), key=lambda item: (-item[1], item[0])):
        row = sources_by_id[source_id]
        domain_shares[row.domain] = domain_shares.get(row.domain, 0.0) + mix_weight
        train_streams.append(
            {
                "source_id": row.id,
                "name": row.name,
                "repo_or_url": row.repo_or_url,
                "domain": row.domain,
                "phase": _phase_for_source(row),
                "allowed_tier": row.allowed_tier,
                "license_gate": row.license_gate,
                "mix_weight": round(mix_weight, 10),
                "token_budget": budgets[source_id],
                "loss_multiplier": round(0.55 + row.purity_score * 0.45, 4),
                "contamination_guard": "dedupe+benchmark_ngram_purge+secret_scan",
                "ingest_policy": row.ingest_policy,
            }
        )
    # Keep exact normalized sum stable after rounding.
    drift = 1.0 - sum(row["mix_weight"] for row in train_streams)
    if train_streams:
        train_streams[0]["mix_weight"] = round(train_streams[0]["mix_weight"] + drift, 10)
    eval_reserves = [
        {
            "source_id": row.id,
            "name": row.name,
            "repo_or_url": row.repo_or_url,
            "domain": row.domain,
            "contamination_risk": row.contamination_risk,
            "policy": row.ingest_policy,
        }
        for row in eval_sources
    ]
    plan = {
        "schema": "tinymind.world_pure_streaming_curriculum.v1",
        "summary": {
            "token_budget": token_budget,
            "train_source_count": len(train_streams),
            "eval_source_count": len(eval_reserves),
            "domain_count": len({row["domain"] for row in train_streams}),
            "largest_domain_share": round(max(domain_shares.values()) if domain_shares else 0.0, 6),
        },
        "claim_gate": {
            "main_training_allowed": bool(train_streams),
            "world_best_claim_allowed": False,
            "official_rank_claim_allowed": False,
            "reason": "Curriculum planning is not performance evidence; external held-out eval is required.",
        },
        "train_streams": train_streams,
        "eval_reserves": eval_reserves,
    }
    (out_path / "world_pure_streaming_curriculum.json").write_text(
        json.dumps(plan, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )
    _write_curriculum_markdown(plan, out_path / "world_pure_streaming_curriculum.md")
    return plan
if __name__ == "__main__":
    # Script entry point: regenerate the registry report and the streaming
    # curriculum in the default reports directory (relative to the CWD).
    report_dir = Path("reports", "world_pure_source_registry")
    build_world_pure_source_registry(report_dir)
    build_world_pure_streaming_curriculum(report_dir)

Xet Storage Details

Size:
34.8 kB
·
Xet hash:
7c4b2fe65448ad6a048c6f83487462e0be9a7946dbad5455b56a49e7bc81f024

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.