Buckets:
bbkdevops/unicosys-hypergraph-bucket/tinymind-native-8b-remote-handoff/bundle/data/code_source_registry.py
from __future__ import annotations

from dataclasses import asdict, dataclass
from datetime import datetime, timezone
import json
from pathlib import Path
from typing import Any
# Values of ``allowed_tier`` that count as trainable once the source's gates
# have been passed. Any other tier (for example "eval_only" or the
# "quarantine_*" tiers used in the registry) is treated as not trainable.
TRAIN_TIERS = {
    "streaming_train_after_license_review",
    "strict_train_after_license_review",
    "instruction_train_after_decontamination",
}
@dataclass
class CodeSource:
    """One entry of the code source registry.

    Instances are created with keyword arguments and converted to plain
    dictionaries with ``dataclasses.asdict`` when the registry report is
    built, so this class must be a dataclass.
    """

    # Stable identifier and human-readable name of the source.
    id: str
    name: str
    # Location of the dataset or repository.
    repo_or_url: str
    category: str
    scale_hint: str
    # Free-form label naming the license condition attached to the source.
    license_gate: str
    # Tier label; compared against TRAIN_TIERS to decide trainability.
    allowed_tier: str
    # Scores are floats; the registry entries in this file use values
    # between 0 and 1 (NOTE(review): the range is not enforced here).
    purity_score: float
    contamination_risk: float
    rarity_score: float
    ingest_policy: str
    notes: str
    evidence_url: str

    @property
    def train_allowed_after_gates(self) -> bool:
        """Return True when ``allowed_tier`` is one of the TRAIN_TIERS."""
        return self.allowed_tier in TRAIN_TIERS
def code_sources() -> list[CodeSource]:
    """Return the catalog of code datasets tracked by the registry.

    A new list of ``CodeSource`` entries is built on every call. Entries are
    grouped below as pretraining corpora, instruction/SFT sets, legacy or
    comprehension sources, and eval-only benchmarks.
    """
    return [
        # --- Large-scale pretraining corpora ---
        CodeSource(
            id="bigcode_stack_v2",
            name="The Stack v2",
            repo_or_url="https://huggingface.co/datasets/bigcode/the-stack-v2",
            category="massive_source_code_pretraining",
            scale_hint="67.5TB / 600+ languages, Software Heritage backed ids/content workflow",
            license_gate="permissive_source_license_required_per_file",
            allowed_tier="streaming_train_after_license_review",
            purity_score=0.91,
            contamination_risk=0.39,
            rarity_score=0.96,
            ingest_policy="stream only licensed/permissive files; run secret scan, generated/vendor filter, benchmark-solution purge, and repo/path dedupe",
            notes="Primary large-scale source. Do not ingest without license and secret gates.",
            evidence_url="https://huggingface.co/datasets/bigcode/the-stack-v2",
        ),
        CodeSource(
            id="stack_edu",
            name="Stack-Edu / StarCoder2Data",
            repo_or_url="https://huggingface.co/datasets/HuggingFaceTB/stack-edu",
            category="educational_code_pretraining",
            scale_hint="125B token educational code filter from The Stack v2",
            license_gate="inherits_the_stack_v2_license_review",
            allowed_tier="streaming_train_after_license_review",
            purity_score=0.95,
            contamination_risk=0.28,
            rarity_score=0.93,
            ingest_policy="prefer high score/int_score samples; retrieve content through Software Heritage path; cap per language; retain license metadata",
            notes="Highest-priority code source for compact model quality because it filters for educational programming value.",
            evidence_url="https://huggingface.co/datasets/HuggingFaceTB/stack-edu",
        ),
        CodeSource(
            id="starcoderdata",
            name="StarCoderData",
            repo_or_url="https://huggingface.co/datasets/bigcode/starcoderdata",
            category="curated_code_pretraining",
            scale_hint="StarCoder lineage code corpus, 80+ languages / large token scale",
            license_gate="permissive_source_license_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.89,
            contamination_risk=0.37,
            rarity_score=0.88,
            ingest_policy="use as secondary diversity source; remove overlap with Stack-Edu/Stack v2; preserve attribution metadata",
            notes="Good for language diversity after primary educational code slices.",
            evidence_url="https://huggingface.co/datasets/bigcode/starcoderdata",
        ),
        # --- Instruction / SFT sources ---
        CodeSource(
            id="nvidia_opencodeinstruct",
            name="OpenCodeInstruct",
            repo_or_url="https://huggingface.co/datasets/nvidia/OpenCodeInstruct",
            category="code_instruction_sft",
            scale_hint="5M diverse code instruction samples",
            license_gate="cc_by_4_0_attribution_required",
            allowed_tier="instruction_train_after_decontamination",
            purity_score=0.92,
            contamination_risk=0.31,
            rarity_score=0.90,
            ingest_policy="strip templates, dedupe prompts, unit-test runnable snippets where possible, cap to supplemental share",
            notes="Large instruction source for tool/code behavior; attribution must be retained.",
            evidence_url="https://huggingface.co/datasets/nvidia/OpenCodeInstruct",
        ),
        CodeSource(
            id="magicoder_oss_instruct_75k",
            name="Magicoder-OSS-Instruct-75K",
            repo_or_url="https://huggingface.co/datasets/ise-uiuc/Magicoder-OSS-Instruct-75K",
            category="oss_snippet_instruction_sft",
            scale_hint="75K OSS-Instruct code instruction rows, decontaminated file available",
            license_gate="mit_license_review",
            allowed_tier="instruction_train_after_decontamination",
            purity_score=0.93,
            contamination_risk=0.24,
            rarity_score=0.86,
            ingest_policy="prefer decontaminated JSONL; benchmark overlap scan; mix as high-quality code SFT seed",
            notes="Small but dense. Good for targeted code-following repair.",
            evidence_url="https://huggingface.co/datasets/ise-uiuc/Magicoder-OSS-Instruct-75K",
        ),
        CodeSource(
            id="bigcode_commitpackft",
            name="CommitPackFT",
            repo_or_url="https://huggingface.co/datasets/bigcode/commitpackft",
            category="commit_instruction_sft",
            scale_hint="instruction-filtered code commit messages / about GB-scale",
            license_gate="mit_license_review",
            allowed_tier="instruction_train_after_decontamination",
            purity_score=0.88,
            contamination_risk=0.33,
            rarity_score=0.89,
            ingest_policy="keep intent-diff pairs; strip credentials; cap repetitive commit-message patterns",
            notes="Rare signal for realistic maintenance, patch intent, and repository evolution.",
            evidence_url="https://huggingface.co/datasets/bigcode/commitpackft",
        ),
        # --- Legacy, targeted and comprehension sources ---
        CodeSource(
            id="codeparrot_github_code",
            name="CodeParrot GitHub Code",
            repo_or_url="https://huggingface.co/datasets/codeparrot/github-code",
            category="github_code_pretraining_legacy",
            scale_hint="~1TB GitHub code / 32 languages with metadata including license field",
            license_gate="per_file_license_field_must_be_whitelisted",
            allowed_tier="quarantine_then_strict_sample",
            purity_score=0.74,
            contamination_risk=0.55,
            rarity_score=0.72,
            ingest_policy="quarantine first; whitelist permissive licenses; heavy dedupe vs Stack; reject missing/unclear license",
            notes="Useful only as fallback diversity. Not pure enough for default main mix.",
            evidence_url="https://huggingface.co/spaces/codeparrot/code-generation-models/blob/main/datasets/github_code.md",
        ),
        CodeSource(
            id="codeparrot_clean_valid",
            name="CodeParrot Clean Valid",
            repo_or_url="https://huggingface.co/datasets/codeparrot/codeparrot-clean-valid",
            category="cleaned_python_code",
            scale_hint="cleaned Python code with license/copy/autogenerated metadata",
            license_gate="per_file_license_field_must_be_whitelisted",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.82,
            contamination_risk=0.44,
            rarity_score=0.70,
            ingest_policy="filter autogenerated=false; cap copies; license whitelist; syntax parse before train",
            notes="Good targeted Python source after filters, not broad enough alone.",
            evidence_url="https://huggingface.co/datasets/codeparrot/codeparrot-clean-valid",
        ),
        CodeSource(
            id="codesearchnet",
            name="CodeSearchNet",
            repo_or_url="https://huggingface.co/datasets/code-search-net/code_search_net",
            category="code_search_comprehension",
            scale_hint="millions of comment-code pairs across major languages",
            license_gate="upstream_license_resolution_required",
            allowed_tier="quarantine_then_strict_sample",
            purity_score=0.78,
            contamination_risk=0.46,
            rarity_score=0.75,
            ingest_policy="use for retrieval/docstring comprehension after license resolution; remove benchmark query/test overlap",
            notes="Strong code-understanding signal but license metadata is not consistently example-wise.",
            evidence_url="https://huggingface.co/datasets?search=CodeSearchNet&sort=downloads",
        ),
        CodeSource(
            id="repofusion_stack_repo",
            name="RepoFusion / Stack-Repo",
            repo_or_url="https://huggingface.co/RepoFusion",
            category="repository_context_code",
            scale_hint="200 permissively licensed Java repositories with repo-level context",
            license_gate="permissive_repository_license_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.86,
            contamination_risk=0.36,
            rarity_score=0.92,
            ingest_policy="preserve repo graph context; train folder-scale retrieval and import/class grounding; dedupe files",
            notes="High rarity because it teaches repository context rather than isolated snippets.",
            evidence_url="https://arxiv.org/abs/2306.10998",
        ),
        CodeSource(
            id="codeqa",
            name="CodeQA",
            repo_or_url="https://arxiv.org/abs/2109.08365",
            category="source_code_question_answering",
            scale_hint="Java 119k QA pairs and Python 70k QA pairs",
            license_gate="dataset_host_license_review_required",
            allowed_tier="strict_train_after_license_review",
            purity_score=0.84,
            contamination_risk=0.40,
            rarity_score=0.84,
            ingest_policy="convert to explain-code QA; keep train/eval split separate; decontam against code QA probes",
            notes="Good for code comprehension, not raw generation only.",
            evidence_url="https://arxiv.org/abs/2109.08365",
        ),
        # --- Benchmarks: eval only, never part of the training mix ---
        CodeSource(
            id="bigcodebench",
            name="BigCodeBench",
            repo_or_url="https://huggingface.co/datasets/bigcode/bigcodebench",
            category="coding_eval",
            scale_hint="~5.7k rows, practical function-call programming benchmark",
            license_gate="apache_2_0_eval_only_due_contamination",
            allowed_tier="eval_only",
            purity_score=0.94,
            contamination_risk=0.97,
            rarity_score=0.91,
            ingest_policy="eval only; block prompts, solutions, tests, and derived variants from train data",
            notes="High-quality benchmark, but training on it invalidates coding claims.",
            evidence_url="https://huggingface.co/datasets/bigcode/bigcodebench",
        ),
        CodeSource(
            id="humaneval",
            name="HumanEval",
            repo_or_url="https://huggingface.co/datasets/openai/openai_humaneval",
            category="coding_eval",
            scale_hint="classic Python code generation benchmark",
            license_gate="eval_only_due_high_contamination_risk",
            allowed_tier="eval_only",
            purity_score=0.93,
            contamination_risk=0.98,
            rarity_score=0.70,
            ingest_policy="eval only; maintain n-gram and AST contamination blocklist",
            notes="Never include in training mix if claiming coding generalization.",
            evidence_url="https://huggingface.co/datasets/openai/openai_humaneval",
        ),
        CodeSource(
            id="mbpp",
            name="MBPP",
            repo_or_url="https://huggingface.co/datasets/google-research-datasets/mbpp",
            category="coding_eval",
            scale_hint="Python programming benchmark",
            license_gate="eval_only_due_high_contamination_risk",
            allowed_tier="eval_only",
            purity_score=0.92,
            contamination_risk=0.96,
            rarity_score=0.68,
            ingest_policy="eval only; use as decontamination blocklist for train corpora",
            notes="Useful for measurement, not main training.",
            evidence_url="https://huggingface.co/datasets/google-research-datasets/mbpp",
        ),
    ]
def build_code_source_registry(out_dir: str | Path) -> dict[str, Any]:
    """Build the code source registry report and write it to *out_dir*.

    The directory is created if it does not exist. Two files are written:
    ``code_source_registry.json`` (the full report) and
    ``code_source_registry.md`` (a Markdown rendering from ``_markdown``).

    Args:
        out_dir: Directory that receives the two report files.

    Returns:
        The report dictionary, including the ``json_path`` and
        ``markdown_path`` of the files that were written.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    rows = []
    for source in code_sources():
        allowed = source.train_allowed_after_gates
        # Accept either a property or a plain method so that a bound method
        # never ends up in the rows (it is always truthy and cannot be
        # serialized to JSON).
        if callable(allowed):
            allowed = allowed()
        rows.append({**asdict(source), "train_allowed_after_gates": bool(allowed)})

    trainable = [row for row in rows if row["train_allowed_after_gates"]]
    quarantine = [row for row in rows if row["allowed_tier"].startswith("quarantine")]
    eval_only = [row for row in rows if row["allowed_tier"] == "eval_only"]
    # max(1, ...) keeps the averages at 0 instead of dividing by zero when no
    # source is trainable.
    trainable_count = max(1, len(trainable))

    report = {
        "schema": "tinymind.code_source_registry.v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "summary": {
            "sources_total": len(rows),
            "trainable_after_gates": len(trainable),
            "quarantine_sources": len(quarantine),
            "eval_only_sources": len(eval_only),
            "avg_purity_trainable": sum(row["purity_score"] for row in trainable) / trainable_count,
            "avg_rarity_trainable": sum(row["rarity_score"] for row in trainable) / trainable_count,
        },
        "policy": {
            "default_action": "do_not_download_full_corpora_without_streaming_filters",
            "required_gates": [
                "license whitelist",
                "secret scan",
                "generated/vendor duplicate filter",
                "benchmark contamination blocklist",
                "syntax/parser validation",
                "per-language/domain caps",
                "source manifest with hashes",
            ],
            "blocked_by_default": ["malware payload corpora", "exploit-only offensive code", "unclear-license full dumps"],
        },
        "sources": rows,
        "claim_gate": {
            "code_source_registry_ready": True,
            "code_training_allowed_without_gates": False,
            "world_rare_code_complete_claim_allowed": False,
            "reason": "Registry identifies high-value sources and gates; it does not prove all rare code in the world has been acquired.",
        },
    }

    json_path = out / "code_source_registry.json"
    md_path = out / "code_source_registry.md"
    # The paths are recorded before serialization so they also appear in the
    # JSON file itself.
    report["json_path"] = str(json_path)
    report["markdown_path"] = str(md_path)
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report
| def _markdown(report: dict[str, Any]) -> str: | |
| lines = [ | |
| "# TinyMind Code Source Registry", | |
| "", | |
| f"- Created: `{report['created_at']}`", | |
| f"- Sources total: `{report['summary']['sources_total']}`", | |
| f"- Trainable after gates: `{report['summary']['trainable_after_gates']}`", | |
| f"- Quarantine sources: `{report['summary']['quarantine_sources']}`", | |
| f"- Eval-only sources: `{report['summary']['eval_only_sources']}`", | |
| "", | |
| "## Required Gates", | |
| "", | |
| ] | |
| lines.extend(f"- {gate}" for gate in report["policy"]["required_gates"]) | |
| lines.extend( | |
| [ | |
| "", | |
| "## Sources", | |
| "", | |
| "| Source | Tier | Purity | Rarity | Risk | Policy |", | |
| "| --- | --- | ---: | ---: | ---: | --- |", | |
| ] | |
| ) | |
| for row in report["sources"]: | |
| lines.append( | |
| f"| [{row['name']}]({row['repo_or_url']}) | `{row['allowed_tier']}` | " | |
| f"{row['purity_score']:.2f} | {row['rarity_score']:.2f} | {row['contamination_risk']:.2f} | {row['ingest_policy']} |" | |
| ) | |
| lines.extend(["", "## Claim Boundary", "", report["claim_gate"]["reason"], ""]) | |
| return "\n".join(lines) | |
Xet Storage Details
- Size: 16.2 kB
- Xet hash: f3f68d4372953aed127b06c1a198cfd154f99002244d4c62872dbdfd1e76b467
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.