bbkdevops's picture
download
raw
16.2 kB
from __future__ import annotations
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
import json
from pathlib import Path
from typing import Any
# Tier labels that count as "trainable once the gates have passed".
# Any ``allowed_tier`` value outside this set (for example the "eval_only" and
# "quarantine_then_strict_sample" tiers used by entries in this file) is
# treated as not trainable by ``CodeSource.train_allowed_after_gates``.
TRAIN_TIERS: set[str] = {
    "instruction_train_after_decontamination",
    "strict_train_after_license_review",
    "streaming_train_after_license_review",
}
@dataclass(frozen=True)
class CodeSource:
    """Immutable registry record describing one code dataset or benchmark.

    Instances are frozen, so attributes cannot be reassigned after
    construction. Field order is part of the interface: it fixes the
    positional constructor signature and the key order of ``asdict``.
    """

    # Identity and location of the source.
    id: str
    name: str
    repo_or_url: str
    # Free-form classification and size description.
    category: str
    scale_hint: str
    # Licensing condition label and the tier the source is assigned to.
    license_gate: str
    allowed_tier: str
    # Numeric ratings; the entries defined in this file all use values
    # between 0 and 1 (no range check is enforced here).
    purity_score: float
    contamination_risk: float
    rarity_score: float
    # Human-readable handling instructions, remarks and a supporting link.
    ingest_policy: str
    notes: str
    evidence_url: str

    @property
    def train_allowed_after_gates(self) -> bool:
        """Return True when ``allowed_tier`` is a member of ``TRAIN_TIERS``."""
        tier = self.allowed_tier
        return tier in TRAIN_TIERS
def code_sources() -> list[CodeSource]:
    """Return the hard-coded catalogue of code datasets and benchmarks.

    A new list of new ``CodeSource`` instances is built on every call.
    The entries are grouped below for readability only; the returned list
    is the plain concatenation of the groups in the order they appear.

    Returns:
        The registry entries in a fixed order: pretraining corpora,
        instruction-tuning sets, specialised sources, then eval-only
        benchmarks.
    """
    # Large code corpora intended for pretraining.
    pretraining_corpora = [
        CodeSource(
            id="bigcode_stack_v2",
            name="The Stack v2",
            repo_or_url="https://huggingface.co/datasets/bigcode/the-stack-v2",
            category="massive_source_code_pretraining",
            scale_hint="67.5TB / 600+ languages, Software Heritage backed ids/content workflow",
            license_gate="permissive_source_license_required_per_file",
            allowed_tier="streaming_train_after_license_review",
            purity_score=0.91,
            contamination_risk=0.39,
            rarity_score=0.96,
            ingest_policy="stream only licensed/permissive files; run secret scan, generated/vendor filter, benchmark-solution purge, and repo/path dedupe",
            notes="Primary large-scale source. Do not ingest without license and secret gates.",
            evidence_url="https://huggingface.co/datasets/bigcode/the-stack-v2",
        ),
        CodeSource(
            id="stack_edu",
            name="Stack-Edu / StarCoder2Data",
            repo_or_url="https://huggingface.co/datasets/HuggingFaceTB/stack-edu",
            category="educational_code_pretraining",
            scale_hint="125B token educational code filter from The Stack v2",
            license_gate="inherits_the_stack_v2_license_review",
            allowed_tier="streaming_train_after_license_review",
            purity_score=0.95,
            contamination_risk=0.28,
            rarity_score=0.93,
            ingest_policy="prefer high score/int_score samples; retrieve content through Software Heritage path; cap per language; retain license metadata",
            notes="Highest-priority code source for compact model quality because it filters for educational programming value.",
            evidence_url="https://huggingface.co/datasets/HuggingFaceTB/stack-edu",
        ),
        CodeSource(
            id="starcoderdata",
            name="StarCoderData",
            repo_or_url="https://huggingface.co/datasets/bigcode/starcoderdata",
            category="curated_code_pretraining",
            scale_hint="StarCoder lineage code corpus, 80+ languages / large token scale",
            license_gate="permissive_source_license_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.89,
            contamination_risk=0.37,
            rarity_score=0.88,
            ingest_policy="use as secondary diversity source; remove overlap with Stack-Edu/Stack v2; preserve attribution metadata",
            notes="Good for language diversity after primary educational code slices.",
            evidence_url="https://huggingface.co/datasets/bigcode/starcoderdata",
        ),
    ]

    # Instruction / SFT style datasets.
    instruction_sets = [
        CodeSource(
            id="nvidia_opencodeinstruct",
            name="OpenCodeInstruct",
            repo_or_url="https://huggingface.co/datasets/nvidia/OpenCodeInstruct",
            category="code_instruction_sft",
            scale_hint="5M diverse code instruction samples",
            license_gate="cc_by_4_0_attribution_required",
            allowed_tier="instruction_train_after_decontamination",
            purity_score=0.92,
            contamination_risk=0.31,
            rarity_score=0.90,
            ingest_policy="strip templates, dedupe prompts, unit-test runnable snippets where possible, cap to supplemental share",
            notes="Large instruction source for tool/code behavior; attribution must be retained.",
            evidence_url="https://huggingface.co/datasets/nvidia/OpenCodeInstruct",
        ),
        CodeSource(
            id="magicoder_oss_instruct_75k",
            name="Magicoder-OSS-Instruct-75K",
            repo_or_url="https://huggingface.co/datasets/ise-uiuc/Magicoder-OSS-Instruct-75K",
            category="oss_snippet_instruction_sft",
            scale_hint="75K OSS-Instruct code instruction rows, decontaminated file available",
            license_gate="mit_license_review",
            allowed_tier="instruction_train_after_decontamination",
            purity_score=0.93,
            contamination_risk=0.24,
            rarity_score=0.86,
            ingest_policy="prefer decontaminated JSONL; benchmark overlap scan; mix as high-quality code SFT seed",
            notes="Small but dense. Good for targeted code-following repair.",
            evidence_url="https://huggingface.co/datasets/ise-uiuc/Magicoder-OSS-Instruct-75K",
        ),
        CodeSource(
            id="bigcode_commitpackft",
            name="CommitPackFT",
            repo_or_url="https://huggingface.co/datasets/bigcode/commitpackft",
            category="commit_instruction_sft",
            scale_hint="instruction-filtered code commit messages / about GB-scale",
            license_gate="mit_license_review",
            allowed_tier="instruction_train_after_decontamination",
            purity_score=0.88,
            contamination_risk=0.33,
            rarity_score=0.89,
            ingest_policy="keep intent-diff pairs; strip credentials; cap repetitive commit-message patterns",
            notes="Rare signal for realistic maintenance, patch intent, and repository evolution.",
            evidence_url="https://huggingface.co/datasets/bigcode/commitpackft",
        ),
    ]

    # Legacy, language-specific, repository-context and comprehension sources.
    specialised_sources = [
        CodeSource(
            id="codeparrot_github_code",
            name="CodeParrot GitHub Code",
            repo_or_url="https://huggingface.co/datasets/codeparrot/github-code",
            category="github_code_pretraining_legacy",
            scale_hint="~1TB GitHub code / 32 languages with metadata including license field",
            license_gate="per_file_license_field_must_be_whitelisted",
            allowed_tier="quarantine_then_strict_sample",
            purity_score=0.74,
            contamination_risk=0.55,
            rarity_score=0.72,
            ingest_policy="quarantine first; whitelist permissive licenses; heavy dedupe vs Stack; reject missing/unclear license",
            notes="Useful only as fallback diversity. Not pure enough for default main mix.",
            evidence_url="https://huggingface.co/spaces/codeparrot/code-generation-models/blob/main/datasets/github_code.md",
        ),
        CodeSource(
            id="codeparrot_clean_valid",
            name="CodeParrot Clean Valid",
            repo_or_url="https://huggingface.co/datasets/codeparrot/codeparrot-clean-valid",
            category="cleaned_python_code",
            scale_hint="cleaned Python code with license/copy/autogenerated metadata",
            license_gate="per_file_license_field_must_be_whitelisted",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.82,
            contamination_risk=0.44,
            rarity_score=0.70,
            ingest_policy="filter autogenerated=false; cap copies; license whitelist; syntax parse before train",
            notes="Good targeted Python source after filters, not broad enough alone.",
            evidence_url="https://huggingface.co/datasets/codeparrot/codeparrot-clean-valid",
        ),
        CodeSource(
            id="codesearchnet",
            name="CodeSearchNet",
            repo_or_url="https://huggingface.co/datasets/code-search-net/code_search_net",
            category="code_search_comprehension",
            scale_hint="millions of comment-code pairs across major languages",
            license_gate="upstream_license_resolution_required",
            allowed_tier="quarantine_then_strict_sample",
            purity_score=0.78,
            contamination_risk=0.46,
            rarity_score=0.75,
            ingest_policy="use for retrieval/docstring comprehension after license resolution; remove benchmark query/test overlap",
            notes="Strong code-understanding signal but license metadata is not consistently example-wise.",
            evidence_url="https://huggingface.co/datasets?search=CodeSearchNet&sort=downloads",
        ),
        CodeSource(
            id="repofusion_stack_repo",
            name="RepoFusion / Stack-Repo",
            repo_or_url="https://huggingface.co/RepoFusion",
            category="repository_context_code",
            scale_hint="200 permissively licensed Java repositories with repo-level context",
            license_gate="permissive_repository_license_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.86,
            contamination_risk=0.36,
            rarity_score=0.92,
            ingest_policy="preserve repo graph context; train folder-scale retrieval and import/class grounding; dedupe files",
            notes="High rarity because it teaches repository context rather than isolated snippets.",
            evidence_url="https://arxiv.org/abs/2306.10998",
        ),
        CodeSource(
            id="codeqa",
            name="CodeQA",
            repo_or_url="https://arxiv.org/abs/2109.08365",
            category="source_code_question_answering",
            scale_hint="Java 119k QA pairs and Python 70k QA pairs",
            license_gate="dataset_host_license_review_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.84,
            contamination_risk=0.40,
            rarity_score=0.84,
            ingest_policy="convert to explain-code QA; keep train/eval split separate; decontam against code QA probes",
            notes="Good for code comprehension, not raw generation only.",
            evidence_url="https://arxiv.org/abs/2109.08365",
        ),
    ]

    # Benchmarks: every entry here carries the "eval_only" tier.
    eval_benchmarks = [
        CodeSource(
            id="bigcodebench",
            name="BigCodeBench",
            repo_or_url="https://huggingface.co/datasets/bigcode/bigcodebench",
            category="coding_eval",
            scale_hint="~5.7k rows, practical function-call programming benchmark",
            license_gate="apache_2_0_eval_only_due_contamination",
            allowed_tier="eval_only",
            purity_score=0.94,
            contamination_risk=0.97,
            rarity_score=0.91,
            ingest_policy="eval only; block prompts, solutions, tests, and derived variants from train data",
            notes="High-quality benchmark, but training on it invalidates coding claims.",
            evidence_url="https://huggingface.co/datasets/bigcode/bigcodebench",
        ),
        CodeSource(
            id="humaneval",
            name="HumanEval",
            repo_or_url="https://huggingface.co/datasets/openai/openai_humaneval",
            category="coding_eval",
            scale_hint="classic Python code generation benchmark",
            license_gate="eval_only_due_high_contamination_risk",
            allowed_tier="eval_only",
            purity_score=0.93,
            contamination_risk=0.98,
            rarity_score=0.70,
            ingest_policy="eval only; maintain n-gram and AST contamination blocklist",
            notes="Never include in training mix if claiming coding generalization.",
            evidence_url="https://huggingface.co/datasets/openai/openai_humaneval",
        ),
        CodeSource(
            id="mbpp",
            name="MBPP",
            repo_or_url="https://huggingface.co/datasets/google-research-datasets/mbpp",
            category="coding_eval",
            scale_hint="Python programming benchmark",
            license_gate="eval_only_due_high_contamination_risk",
            allowed_tier="eval_only",
            purity_score=0.92,
            contamination_risk=0.96,
            rarity_score=0.68,
            ingest_policy="eval only; use as decontamination blocklist for train corpora",
            notes="Useful for measurement, not main training.",
            evidence_url="https://huggingface.co/datasets/google-research-datasets/mbpp",
        ),
    ]

    return [
        *pretraining_corpora,
        *instruction_sets,
        *specialised_sources,
        *eval_benchmarks,
    ]
def build_code_source_registry(out_dir: str | Path) -> dict[str, Any]:
    """Build the registry report, write it to disk, and return it.

    The output directory is created (including parents) when missing. Two
    files are written into it as UTF-8: ``code_source_registry.json`` (the
    report, keys sorted, 2-space indent, trailing newline) and
    ``code_source_registry.md`` (the report rendered by ``_markdown``).

    Args:
        out_dir: Directory that receives the two registry files.

    Returns:
        The report dictionary, including ``json_path`` and
        ``markdown_path`` entries holding the written file paths as strings.
    """
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # One plain dict per source, extended with the derived trainability flag.
    rows: list[dict[str, Any]] = []
    for source in code_sources():
        row = asdict(source)
        row["train_allowed_after_gates"] = source.train_allowed_after_gates
        rows.append(row)

    trainable = [row for row in rows if row["train_allowed_after_gates"]]
    quarantine = [row for row in rows if row["allowed_tier"].startswith("quarantine")]
    eval_only = [row for row in rows if row["allowed_tier"] == "eval_only"]

    # Clamp the divisor to 1 so an empty trainable list yields 0.0
    # rather than raising ZeroDivisionError.
    divisor = max(1, len(trainable))
    summary = {
        "sources_total": len(rows),
        "trainable_after_gates": len(trainable),
        "quarantine_sources": len(quarantine),
        "eval_only_sources": len(eval_only),
        "avg_purity_trainable": sum(row["purity_score"] for row in trainable) / divisor,
        "avg_rarity_trainable": sum(row["rarity_score"] for row in trainable) / divisor,
    }
    policy = {
        "default_action": "do_not_download_full_corpora_without_streaming_filters",
        "required_gates": [
            "license whitelist",
            "secret scan",
            "generated/vendor duplicate filter",
            "benchmark contamination blocklist",
            "syntax/parser validation",
            "per-language/domain caps",
            "source manifest with hashes",
        ],
        "blocked_by_default": ["malware payload corpora", "exploit-only offensive code", "unclear-license full dumps"],
    }
    claim_gate = {
        "code_source_registry_ready": True,
        "code_training_allowed_without_gates": False,
        "world_rare_code_complete_claim_allowed": False,
        "reason": "Registry identifies high-value sources and gates; it does not prove all rare code in the world has been acquired.",
    }

    json_path = target_dir / "code_source_registry.json"
    md_path = target_dir / "code_source_registry.md"

    report = {
        "schema": "tinymind.code_source_registry.v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "summary": summary,
        "policy": policy,
        "sources": rows,
        "claim_gate": claim_gate,
        "json_path": str(json_path),
        "markdown_path": str(md_path),
    }

    serialized = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)
    json_path.write_text(serialized + "\n", encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report
def _markdown(report: dict[str, Any]) -> str:
lines = [
"# TinyMind Code Source Registry",
"",
f"- Created: `{report['created_at']}`",
f"- Sources total: `{report['summary']['sources_total']}`",
f"- Trainable after gates: `{report['summary']['trainable_after_gates']}`",
f"- Quarantine sources: `{report['summary']['quarantine_sources']}`",
f"- Eval-only sources: `{report['summary']['eval_only_sources']}`",
"",
"## Required Gates",
"",
]
lines.extend(f"- {gate}" for gate in report["policy"]["required_gates"])
lines.extend(
[
"",
"## Sources",
"",
"| Source | Tier | Purity | Rarity | Risk | Policy |",
"| --- | --- | ---: | ---: | ---: | --- |",
]
)
for row in report["sources"]:
lines.append(
f"| [{row['name']}]({row['repo_or_url']}) | `{row['allowed_tier']}` | "
f"{row['purity_score']:.2f} | {row['rarity_score']:.2f} | {row['contamination_risk']:.2f} | {row['ingest_policy']} |"
)
lines.extend(["", "## Claim Boundary", "", report["claim_gate"]["reason"], ""])
return "\n".join(lines)

Xet Storage Details

Size:
16.2 kB
·
Xet hash:
f3f68d4372953aed127b06c1a198cfd154f99002244d4c62872dbdfd1e76b467

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.