ropedia-xperience-10m-task-baselines / scripts /validate_scope_claims.py
cy0307's picture
Publish Ropedia Xperience-10M task baseline cards
cf07180 verified
#!/usr/bin/env python3
"""Validate Qwen3-Omni scale-up status against the actual Xperience-10M artifacts.
This check exists because several setup/provenance files retain historical
`32ep` run identifiers in their paths. Those identifiers are useful provenance,
but public project surfaces should present them as setup artifacts until the
held-out 32-episode pilot is actually completed.
"""
from __future__ import annotations
import json
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
# Repository root: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]

# Where the JSON audit report is written by ``main``.
OUTPUT = ROOT.joinpath("docs", "data", "scope_claims_audit.json")

# Public-facing files (relative to ROOT) scanned by ``scan_public_docs``.
# Kept as a list because it is also embedded verbatim as check evidence.
PUBLIC_PRESENTATION_FILES = [
    # Top-level documents.
    "README.md",
    "ARTIFACT_GUIDE.md",
    "EVIDENCE_CONTRACT.md",
    "REPRODUCIBILITY.md",
    # Published site and its data files.
    "docs/index.html",
    "docs/data/artifact_index.json",
    "docs/data/evidence_contract.json",
    "docs/data/project_manifest.json",
    "docs/data/mirror_parity.json",
    "docs/data/reproducibility_matrix.json",
    "docs/data/project_packet.json",
    "docs/data/summary_metrics.json",
]

# File suffixes (lower-case) treated as text when scanning result artifacts.
RESULT_TEXT_SUFFIXES = {".csv", ".json", ".jsonl", ".md", ".txt", ".yaml", ".yml"}

# Historical run identifiers: flagged as failures when found in public files,
# and recorded when found under ``results/omni_finetune``.
HISTORICAL_PATTERNS = [
    "qwen3_omni_32ep",
    "xperience10m_qwen3_omni_32ep",
    "ropedia-episode-task-suite",
]

# Case-insensitive phrasings that would present a 32-episode result as done.
# The pattern sources are reported verbatim, so they are spelled out in full.
MISLEADING_PHRASES = [
    re.compile(source, re.IGNORECASE)
    for source in (
        r"\breal\s+32-episode\s+(?:result|metric|fine-?tune)\b",
        r"\b32-episode\s+(?:result|metric|fine-?tune)\s+is\s+claimed\b",
        r"\bfull\s+32-episode\s+(?:result|metric|fine-?tune)\b",
    )
]

# Hints that ``has_negation`` looks for in a lower-cased sentence; a sentence
# containing one is not reported as a misleading 32-episode phrase.
NEGATION_HINTS = {
    "not", "no", "never",
    "blocked", "pending", "gated",
    "until", "after", "requires",
    "must not", "not yet", "no real",
}
def read_json(relative_path: str):
    """Load and parse a UTF-8 JSON file located at ``ROOT / relative_path``.

    Returns whatever ``json.loads`` produces for the file's contents. Errors
    from reading the file or parsing the JSON are not caught here.
    """
    source = ROOT / relative_path
    raw = source.read_text(encoding="utf-8")
    return json.loads(raw)
def check(name: str, passed: bool, detail: str, evidence: list[str]) -> dict:
    """Build one check record for the audit report.

    Args:
        name: Identifier of the check.
        passed: Outcome; any truthy value is recorded as ``"pass"``,
            anything else as ``"fail"``.
        detail: Human-readable description of what was observed.
        evidence: Paths backing the check.

    Returns:
        A dict with the keys ``name``, ``status``, ``detail`` and
        ``evidence``, in that order.
    """
    status = "pass" if passed else "fail"
    return dict(name=name, status=status, detail=detail, evidence=evidence)
def sentence_windows(text: str) -> list[str]:
    """Split *text* into stripped, non-empty sentence-like fragments.

    A split happens at any run of whitespace that directly follows ``.``,
    ``!``, ``?`` or a newline. A single newline followed immediately by a
    non-whitespace character therefore does not split, so a sentence that is
    wrapped across lines stays in one fragment.
    """
    windows: list[str] = []
    for fragment in re.split(r"(?<=[.!?\n])\s+", text):
        trimmed = fragment.strip()
        if trimmed:
            windows.append(trimmed)
    return windows
def has_negation(sentence: str) -> bool:
    """Return True when *sentence* contains a negation/gating hint as a word.

    Each entry of ``NEGATION_HINTS`` is searched for case-insensitively and
    must be delimited by word boundaries. A plain substring test would let
    "no" match inside "now" or "know", "not" inside "notebook", and "gated"
    inside "aggregated", which would wrongly mark an affirmative sentence as
    negated and hide a misleading claim.

    Multi-word hints (for example "must not") match when their words are
    separated by a single space. Because matching is whole-word, variants
    such as "cannot" only count if they are listed in ``NEGATION_HINTS``.
    """
    lowered = sentence.lower()
    return any(
        re.search(rf"\b{re.escape(hint)}\b", lowered) is not None
        for hint in NEGATION_HINTS
    )
def scan_public_docs() -> tuple[list[dict], list[dict]]:
    """Scan every file in ``PUBLIC_PRESENTATION_FILES`` for scope problems.

    For each listed file (resolved against ``ROOT``):

    * a missing file is recorded as a ``missing_public_file`` failure and
      skipped;
    * each entry of ``HISTORICAL_PATTERNS`` found in the text is recorded as
      a ``historical_identifier_in_public_presentation`` failure;
    * each sentence window that matches one of ``MISLEADING_PHRASES`` and for
      which ``has_negation`` is false is recorded as a
      ``misleading_32_episode_phrase`` failure (sentence truncated to 260
      characters);
    * a file whose text contains ``"32-episode"`` is added to the
      observations list.

    Returns:
        ``(failures, observations)``.
    """
    problems: list[dict] = []
    status_mentions: list[dict] = []

    for rel in PUBLIC_PRESENTATION_FILES:
        target = ROOT / rel
        if not target.exists():
            problems.append({"kind": "missing_public_file", "path": rel})
            continue

        # Undecodable bytes are dropped rather than aborting the scan.
        body = target.read_text(encoding="utf-8", errors="ignore")

        problems.extend(
            {
                "kind": "historical_identifier_in_public_presentation",
                "path": rel,
                "pattern": identifier,
            }
            for identifier in HISTORICAL_PATTERNS
            if identifier in body
        )

        for window in sentence_windows(body):
            # A negated/gated sentence is never reported, whichever phrase it matches.
            if has_negation(window):
                continue
            problems.extend(
                {
                    "kind": "misleading_32_episode_phrase",
                    "path": rel,
                    "phrase": regex.pattern,
                    "sentence": window[:260],
                }
                for regex in MISLEADING_PHRASES
                if regex.search(window)
            )

        if "32-episode" in body:
            status_mentions.append({"path": rel, "contains_32_episode_status_text": True})

    return problems, status_mentions
def scan_historical_result_identifiers() -> list[dict]:
    """Record every line under ``results/omni_finetune`` naming a historical run.

    Candidate files come from ``git ls-files`` run against ``ROOT``. If git
    cannot be started or exits non-zero, every regular file found
    recursively under the results directory is used instead. Only existing
    files whose lower-cased suffix is in ``RESULT_TEXT_SUFFIXES`` are read.

    Returns:
        One record per matching line, in sorted path order and then line
        order, holding the path relative to ``ROOT``, the 1-based line
        number, the matched ``HISTORICAL_PATTERNS`` entries and the stripped
        line truncated to 260 characters. The list is empty when the results
        directory does not exist.
    """
    results_root = ROOT / "results/omni_finetune"
    if not results_root.exists():
        return []

    try:
        listing = subprocess.run(
            ["git", "-C", str(ROOT), "ls-files", "results/omni_finetune"],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
        )
        candidates = [ROOT / entry for entry in listing.stdout.splitlines() if entry]
    except (OSError, subprocess.CalledProcessError):
        # No usable git checkout: fall back to walking the directory.
        candidates = [entry for entry in results_root.rglob("*") if entry.is_file()]

    findings: list[dict] = []
    # Listed paths may no longer exist on disk, so re-check before reading.
    existing_files = sorted(candidate for candidate in candidates if candidate.is_file())
    for result_file in existing_files:
        if result_file.suffix.lower() not in RESULT_TEXT_SUFFIXES:
            continue
        rel = result_file.relative_to(ROOT).as_posix()
        with result_file.open("r", encoding="utf-8", errors="ignore") as stream:
            for number, raw_line in enumerate(stream, start=1):
                hits = [identifier for identifier in HISTORICAL_PATTERNS if identifier in raw_line]
                if hits:
                    findings.append(
                        {
                            "classification": "historical_identifier_in_readiness_artifact",
                            "path": rel,
                            "line": number,
                            "patterns": hits,
                            "example": raw_line.strip()[:260],
                        }
                    )
    return findings
def build_report() -> dict:
    """Run every scope check and assemble the audit report.

    Loads six JSON documents through ``read_json``, evaluates the checks in a
    fixed order, runs both file scans, and collects failures: first the
    public-document scan failures, then one ``failed_check`` entry per check
    whose status is not ``"pass"``.

    Returns:
        A dict with the overall ``status`` ("pass" only when there are no
        failures), a UTC timestamp, a ``summary`` section, all ``checks``,
        the public status observations, at most the first 30 historical
        identifier records plus their total count, and the ``failures``.

    Raises:
        KeyError: if the project manifest has no ``scope_boundary`` key or
            the project packet has no ``scope_status`` key. Errors raised by
            ``read_json`` also propagate.
    """
    manifest = read_json("docs/data/project_manifest.json")
    packet = read_json("docs/data/project_packet.json")
    metrics = read_json("docs/data/summary_metrics.json")
    dataset = read_json("results/omni_finetune/dataset_manifest.json")
    training = read_json("results/omni_finetune/training_metadata.json")
    discovery = read_json("results/omni_finetune/source_discovery.json")

    results: list[dict] = []
    problems: list[dict] = []

    # The claim flag must be the literal ``False``; a missing key (None) fails.
    manifest_claim = manifest["scope_boundary"].get("qwen3_omni_32_episode_claim")
    results.append(
        check(
            "project_manifest_records_pending_32_episode_qwen_result",
            manifest_claim is False,
            f"project_manifest scope_boundary.qwen3_omni_32_episode_claim={manifest_claim!r}",
            ["docs/data/project_manifest.json"],
        )
    )

    packet_claim = packet["scope_status"].get("qwen3_omni_32_episode_claim")
    results.append(
        check(
            "project_packet_records_pending_32_episode_qwen_result",
            packet_claim is False,
            f"project_packet scope_status.qwen3_omni_32_episode_claim={packet_claim!r}",
            ["docs/data/project_packet.json"],
        )
    )

    notes = " ".join(packet.get("current_reading_notes", []))
    mentions_pilot = "32-episode" in notes
    mentions_setup = "setup" in notes or "gated data" in notes
    results.append(
        check(
            "project_packet_describes_32_episode_setup_status",
            mentions_pilot and mentions_setup,
            "project packet describes the setup-stage Qwen3-Omni run separately from the planned 32-episode fine-tune",
            ["docs/data/project_packet.json"],
        )
    )

    scope_text = metrics.get("omni_relay", {}).get("current_scope", "")
    results.append(
        check(
            "summary_metrics_preserves_omni_scale_up_status",
            "32-episode Qwen3-Omni fine-tune requires gated data staging" in scope_text,
            scope_text,
            ["docs/data/summary_metrics.json"],
        )
    )

    episode_count = dataset.get("num_episodes")
    sample_count = dataset.get("num_samples")
    split_counts = dataset.get("split_counts", {})
    results.append(
        check(
            "omni_dataset_manifest_is_setup_stage",
            episode_count == 1 and sample_count == 128 and split_counts == {"train": 128},
            f"episodes={episode_count}, samples={sample_count}, split_counts={split_counts}",
            ["results/omni_finetune/dataset_manifest.json"],
        )
    )

    train_count = training.get("num_train_samples")
    val_count = training.get("num_val_samples")
    process_count = training.get("num_processes")
    results.append(
        check(
            "omni_training_metadata_is_setup_stage",
            train_count == 128 and val_count == 0,
            f"train={train_count}, val={val_count}, processes={process_count}",
            ["results/omni_finetune/training_metadata.json"],
        )
    )

    pilot_ready = discovery.get("ready_for_32_episode_pilot")
    local_valid = discovery.get("local", {}).get("num_degraded_valid_episodes")
    results.append(
        check(
            "source_discovery_gate_is_closed",
            pilot_ready is False and local_valid == 1,
            f"ready_for_32_episode_pilot={pilot_ready}, local_valid={local_valid}",
            ["results/omni_finetune/source_discovery.json"],
        )
    )

    doc_problems, status_mentions = scan_public_docs()
    problems.extend(doc_problems)
    results.append(
        check(
            "public_presentation_has_no_historical_32ep_identifiers",
            not doc_problems,
            f"public presentation scan failures={len(doc_problems)}",
            PUBLIC_PRESENTATION_FILES,
        )
    )

    # This check passes only when at least one historical identifier is found
    # in the result artifacts.
    provenance_hits = scan_historical_result_identifiers()
    results.append(
        check(
            "historical_32ep_identifiers_are_confined_to_readiness_artifacts",
            bool(provenance_hits),
            f"historical identifiers found in result provenance files={len(provenance_hits)}",
            ["results/omni_finetune/"],
        )
    )

    problems.extend(
        [
            {
                "kind": "failed_check",
                "name": entry["name"],
                "detail": entry["detail"],
                "evidence": entry["evidence"],
            }
            for entry in results
            if entry["status"] != "pass"
        ]
    )

    summary = {
        "qwen3_omni_32_episode_claim": False,
        "dataset_manifest_num_episodes": dataset.get("num_episodes"),
        "dataset_manifest_num_samples": dataset.get("num_samples"),
        "training_metadata_num_train_samples": training.get("num_train_samples"),
        "source_discovery_ready_for_32_episode_pilot": discovery.get("ready_for_32_episode_pilot"),
        "historical_identifier_count": len(provenance_hits),
        "public_32_episode_status_file_count": len(status_mentions),
        "failure_count": len(problems),
    }
    return {
        "status": "fail" if problems else "pass",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "summary": summary,
        "checks": results,
        "public_status_observations": status_mentions,
        # Only a sample of the records is embedded; the total is kept alongside.
        "historical_identifiers": provenance_hits[:30],
        "historical_identifier_total_count": len(provenance_hits),
        "failures": problems,
    }
def main() -> int:
    """Write the audit report to ``OUTPUT`` and return a process exit code.

    The report is serialised as indented JSON with a trailing newline, and
    the parent directory of ``OUTPUT`` is created when missing. A status line
    is always printed; on a non-passing report up to 30 failures are printed
    as well.

    Returns:
        ``0`` when the report status is ``"pass"``, otherwise ``1``.
    """
    report = build_report()

    OUTPUT.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(report, indent=2) + "\n"
    OUTPUT.write_text(serialized, encoding="utf-8")

    outcome = report["status"]
    print(f"{outcome.upper()}: wrote {OUTPUT}")
    if outcome == "pass":
        return 0

    for failure in report["failures"][:30]:
        print(f"- {failure}")
    return 1


if __name__ == "__main__":
    raise SystemExit(main())