gemma-bucket-sync / app /validation.py
cmpatino's picture
cmpatino HF Staff
Replicate bucket-sync API for the Gemma collab
6cc71c7
from __future__ import annotations
from app.config import Settings
from app.errors import InvalidPath
from app.naming import (
AGENT_ID_RE,
SLUG_RE,
SourceURI,
agent_id_from_bucket,
parse_source_uri,
)
# Exact target paths that check_dest_not_blocked() refuses; they are compared
# against the target after its leading "/" characters have been stripped.
BLOCKED_TARGETS = {
    "README.md",
    "LEADERBOARD.md",
    "shared_resources/README.md",
}
# Target path prefixes that check_dest_not_blocked() refuses; matched with
# str.startswith, so the trailing "/" limits the match to paths below "audit/".
BLOCKED_PREFIXES = ("audit/",)
def validate_agent_id(agent_id: str) -> None:
    """Accept ``agent_id`` only if ``AGENT_ID_RE`` matches it.

    Raises:
        InvalidPath: when ``AGENT_ID_RE.match(agent_id)`` yields no match.
    """
    if AGENT_ID_RE.match(agent_id):
        return
    raise InvalidPath(f"invalid agent_id: {agent_id!r}")
def validate_slug(slug: str) -> None:
    """Accept ``slug`` only if ``SLUG_RE`` matches it.

    Raises:
        InvalidPath: when ``SLUG_RE.match(slug)`` yields no match.
    """
    if SLUG_RE.match(slug):
        return
    raise InvalidPath(f"invalid slug: {slug!r}")
def validate_path_components(path: str) -> None:
    """Check that ``path`` is a relative path made of plain components.

    Trailing "/" characters are ignored; the remainder is split on "/" and
    each component is checked in turn.

    Raises:
        InvalidPath: if ``path`` is empty, starts with "/", or has a
            component that is empty, ".", "..", starts with ".", or contains
            a character with a code point below 32.
    """
    if path == "" or not path:
        raise InvalidPath("empty path")
    if path[0] == "/":
        raise InvalidPath("path must not be absolute")

    trimmed = path.rstrip("/")
    for part in trimmed.split("/"):
        # Checks run per component, in this order, so the first offending
        # component determines which error is reported.
        if part == "" or part == "." or part == "..":
            raise InvalidPath(f"invalid path component: {part!r}")
        if part[0] == ".":
            raise InvalidPath(f"path component must not start with '.': {part!r}")
        # Code points 0-31 sort before the space character (32).
        for ch in part:
            if ch < " ":
                raise InvalidPath("path contains control characters")
def check_dest_not_blocked(target: str) -> None:
    """Refuse targets that are reserved exactly or fall under a reserved prefix.

    Leading "/" characters are stripped from ``target`` before it is compared
    with ``BLOCKED_TARGETS`` (exact match) and ``BLOCKED_PREFIXES``
    (``str.startswith``).

    Raises:
        InvalidPath: if the normalized target is blocked; the hint names the
            first matching prefix when the block comes from a prefix.
    """
    norm = target.lstrip("/")

    if norm in BLOCKED_TARGETS:
        raise InvalidPath(f"target path blocked: {norm}", hint="this path is reserved")

    # First reserved prefix the target falls under, or None when there is none.
    prefix = next((p for p in BLOCKED_PREFIXES if norm.startswith(p)), None)
    if prefix is not None:
        raise InvalidPath(f"target path blocked: {norm}", hint=f"prefix '{prefix}' is reserved")
def resolve_source(settings: Settings, source: str) -> tuple[SourceURI, str]:
    """Resolve ``source`` to its parsed URI and the agent id of its bucket.

    The checks run in this order, and the first failure is the one reported:

    1. ``parse_source_uri`` must return a parsed URI (not ``None``).
    2. The URI's org must equal ``settings.org``.
    3. ``agent_id_from_bucket`` must derive an agent id from the bucket name
       and ``settings.collab_slug``.
    4. A non-empty URI path must pass ``validate_path_components``.

    Returns:
        ``(parsed_uri, agent_id)``.

    Raises:
        InvalidPath: if any of the checks above fails.
    """
    parsed = parse_source_uri(source)
    if parsed is None:
        raise InvalidPath(f"source must be an hf://buckets/... URI, got: {source!r}")

    if parsed.org != settings.org:
        raise InvalidPath(
            f"source must be under org '{settings.org}', got '{parsed.org}'",
            hint="agents post from buckets in this org only",
        )

    owner = agent_id_from_bucket(parsed.bucket, settings.collab_slug)
    if owner is None:
        raise InvalidPath(
            f"source bucket '{parsed.bucket}' does not match '{settings.collab_slug}-<agent_id>'",
            hint="source must be under your own scratch bucket",
        )

    # An empty path is accepted as-is; only a non-empty one is validated.
    if not parsed.path:
        return parsed, owner
    validate_path_components(parsed.path)
    return parsed, owner
def validate_shared_dest_path(dest_path: str, agent_id: str) -> None:
    """Validate a destination path, relative to ``shared_resources/``, for an agent.

    The path must pass ``validate_path_components``, must contain the marker
    ``_<agent_id>``, and ``shared_resources/<dest_path>`` must not be rejected
    by ``check_dest_not_blocked``.

    Args:
        dest_path: destination path relative to ``shared_resources/``.
        agent_id: id of the agent whose marker must appear in the path.

    Raises:
        InvalidPath: if any of the checks above fails.
    """
    validate_path_components(dest_path)

    marker = f"_{agent_id}"
    # The previous condition also tested the extension-less leaf, but that
    # leaf is a substring of dest_path, so the test reduced to this single
    # membership check. The accepted set is unchanged.
    # NOTE(review): the message says "in the leaf component", yet a marker in
    # a directory component is accepted too -- confirm the intended policy
    # before tightening this to the leaf only.
    if marker not in dest_path:
        raise InvalidPath(
            f"shared_resources dest path must include '_{agent_id}' in the leaf component",
            # Both examples must themselves contain "_<agent_id>"; the old
            # first example ('<agent_id>_bpe.json') did not and was rejected.
            hint=f"e.g. 'tokenizers/bpe_{agent_id}.json' or 'plots/curve_{agent_id}.png'",
        )

    check_dest_not_blocked(f"shared_resources/{dest_path}")