"""Redact internal reviews into publishable ICSAC-branded review artifacts. Operates on the authoritative internal review markdown in ``reviews/`` and emits a sanitized version that is safe to publish on icsacinstitute.org. The redaction removes all vendor/model identifiers, renames reviewers generically ("Reviewer 1", "Reviewer 2", ...), drops internal workflow detail (raw API error payloads, slot indices, fallback chains), and replaces the disagreement flag with a human-readable consensus label. A grep-gate (``assert_clean``) fails hard if any forbidden token survives redacting. Callers must catch the exception and abort publication. """ from __future__ import annotations import os import re from dataclasses import dataclass, field # Hard-fail vendor/model identifiers. Any case-insensitive substring hit # indicates a leak of panel composition and must abort publication. # # Tokens chosen to catch identity leaks (OpenRouter route paths, specific # model family IDs, vendor names rarely legitimate in academic prose) # WITHOUT catching subject-matter discussion of published transformers. # A paper reviewing "GPT-2 and Gemma-2 activations" must pass the gate — # those are scientific subjects, not panel self-references. FORBIDDEN_VENDOR_TOKENS: tuple[str, ...] = ( # Infrastructure names — never appear in legitimate academic prose. "openrouter", "anthropic", # OpenRouter route prefixes — the "/" guarantees a path, not a word. "openai/", "nvidia/", "google/gemma", "meta-llama/", "z-ai/", "minimax/", "nousresearch/", "qwen/", "mistralai/", "deepseek/", "cognitivecomputations/", "liquid/", # Specific panel model families — narrow enough that a hit is a leak. "nemotron", "gpt-oss", # Panelist name — small false-positive risk ("Claude Shannon") accepted # to catch "As Claude, I..." self-reference leaks from slot 0. "claude", ) # Hard-fail credential/infra phrases. Substring match, case-insensitive. 
# These are structurally always leaks — there's no legitimate prose
# reason for a paper review to contain these compounds.
FORBIDDEN_SECRET_PHRASES: tuple[str, ...] = (
    "api key",
    "api keys",
    "access token",
    "auth token",
    "auth key",
    "bearer token",
    "secret key",
    "private key",
    "api token",
    "api tokens",
    "bearer ",
)

# Soft-warn tokens — bare "key", "api", "token", "google" that appear
# regularly in academic prose ("key findings", "Google Scholar", "tokenizer").
# Surfaced in the redaction report but do not abort. Operators can grep the
# published review manually if they want extra assurance.
SOFT_WARN_TOKENS: tuple[str, ...] = (
    "google",
    "api",
    "token",
    "key",
)

# Regex patterns indicating attempted exfiltration via review output.
# Added 2026-04-18 after prompt-injection attack-surface audit. Triggered
# by file paths pointing at our hosts, env-var assignments, and known
# credential prefixes. Match anywhere in the redacted review text.
FORBIDDEN_EXFIL_PATTERNS: tuple[str, ...] = (
    # Absolute filesystem paths likely pointing at our hosts
    r"/home/orangepi\b",
    r"/home/dietpi\b",
    r"/opt/orchestrator\b",
    r"/etc/passwd\b",
    r"/etc/shadow\b",
    r"/root/",
    r"\.config/[a-z][a-z0-9_-]*\.env\b",
    # Windows profile paths. `\\+` accepts one or more backslashes so both
    # the plain form (C:\Users\) and the escaped form (C:\\Users\\) that
    # shows up in JSON-ish payloads are caught.
    r"C:\\+Users\\+",
    # Env-var assignments of the form UPPER_SNAKE=longvalue
    r"\b[A-Z][A-Z0-9_]{3,}=\S{8,}",
    # Known credential prefixes
    r"\bsk-ant-api03-[A-Za-z0-9_-]{8,}",
    r"\bsk-[A-Za-z0-9]{20,}",
    r"\bghp_[A-Za-z0-9]{10,}",
    r"\bgho_[A-Za-z0-9]{10,}",
    r"\bAKIA[0-9A-Z]{16}\b",
    # Bearer tokens of non-trivial length following the keyword
    r"\bBearer\s+[A-Za-z0-9._-]{32,}",
    # Internal rubric filenames — reviewers and the RQC auditor occasionally
    # echo filenames from the prompt ("drift from tone.md"). Public output
    # must reference rubrics by prose name, never by repo filename. Rewriting
    # pass runs first (_rewrite_rubric_filenames); this gate catches anything
    # that slipped through.
    r"\b(?:rubrics/)?(?:scope|methodology|ai-provenance|tone|calibration|review_quality_control)\.md\b",
)


# --------------------------------------------------------------------------
# Rubric filename → prose rewrite
# --------------------------------------------------------------------------
#
# The RQC rubric references sibling rubrics by filename ("the standard
# mirrors tone.md"). Audit justifications inherit that phrasing and leak
# internal filenames into public-facing text ("a soft but consistent drift
# from tone.md"). Rewrite before rendering; the hard gate above catches any
# of these filenames that slips past the rewrite. NOTE: the gate regex lists
# the rubric names explicitly, so a new rubric must be added both to this
# map and to the gate pattern.
#
# Order matters: "rubrics/"-prefixed entries come first so the directory
# prefix is consumed together with the filename.
RUBRIC_FILENAME_PROSE: tuple[tuple[str, str], ...] = (
    ("rubrics/review_quality_control.md", "the audit rubric"),
    ("rubrics/ai-provenance.md", "the AI provenance rubric"),
    ("rubrics/calibration.md", "the calibration rubric"),
    ("rubrics/methodology.md", "the methodology rubric"),
    ("rubrics/scope.md", "the scope rubric"),
    ("rubrics/tone.md", "the tone rubric"),
    ("review_quality_control.md", "the audit rubric"),
    ("ai-provenance.md", "the AI provenance rubric"),
    ("calibration.md", "the calibration rubric"),
    ("methodology.md", "the methodology rubric"),
    ("scope.md", "the scope rubric"),
    ("tone.md", "the tone rubric"),
)

# Compiled once at import time. The `\b` anchors mirror the gate pattern in
# FORBIDDEN_EXFIL_PATTERNS so unrelated names that merely end in a rubric
# filename ("keystone.md", "telescope.md") are left untouched.
_RUBRIC_FILENAME_REWRITES: tuple[tuple[re.Pattern[str], str], ...] = tuple(
    (re.compile(rf"\b{re.escape(needle)}\b", re.IGNORECASE), prose)
    for needle, prose in RUBRIC_FILENAME_PROSE
)


def _rewrite_rubric_filenames(text: str) -> str:
    """Rewrite rubric filename references to prose descriptions.

    Matching is case-insensitive and anchored on word boundaries. Falsy
    input (empty string or ``None``) is returned unchanged.
    """
    if not text:
        return text
    out = text
    # Applied in table order: prefixed forms before bare filenames.
    for pattern, prose in _RUBRIC_FILENAME_REWRITES:
        out = pattern.sub(prose, out)
    return out


@dataclass
class ParsedReview:
    """Structured view of a reviews/_*.md file."""

    record_id: str
    title: str
    doi: str
    review_date: str
    recommendation: str
    disagreement: bool
    # Same row shape that _parse_aggregate_table returns:
    # (first cell, second cell, comma-split third cell).
    dimension_rows: list[tuple[str, str, list[str]]] = field(default_factory=list)
    # One dict per reviewer section — presumably the output of the
    # per-reviewer body parser; confirm against the caller.
    reviewers: list[dict] = field(default_factory=list)


def _parse_frontmatter(body: str) -> tuple[dict, str]:
    """Strip YAML frontmatter; return (fields, remainder).

    This is a flat ``key: value`` line parser, not a YAML parser: each line
    is split on its first colon, and surrounding whitespace and quote
    characters are stripped from the value. Lines without a colon are
    ignored.

    If ``body`` does not open with a ``---`` line, or the closing ``---``
    line is missing, returns ``({}, body)`` with the text untouched.
    """
    if not body.startswith("---\n"):
        return {}, body
    # Start at offset 3 (the newline ending the opening delimiter) so an
    # empty frontmatter block ("---\n---\n") is recognised as well.
    end = body.find("\n---\n", 3)
    if end < 0:
        return {}, body
    raw = body[4:end]
    rest = body[end + 5 :]
    fields: dict = {}
    for line in raw.splitlines():
        if ":" not in line:
            continue
        k, v = line.split(":", 1)
        fields[k.strip()] = v.strip().strip('"').strip("'")
    return fields, rest


def _parse_aggregate_table(body: str) -> list[tuple[str, str, list[str]]]:
    """Pull rows out of the 'Aggregate Scores' markdown table.

    Scans from the ``## Aggregate Scores`` heading to the next ``## ``
    heading. Each data row yields ``(cell0, cell1, scores)`` where
    ``scores`` is the third cell split on commas. The header row (first
    cell "dimension", case-insensitive), separator rows, and rows with
    fewer than three cells are skipped.
    """
    rows: list[tuple[str, str, list[str]]] = []
    in_table = False
    for line in body.splitlines():
        stripped = line.strip()
        if stripped.startswith("## Aggregate Scores"):
            in_table = True
            continue
        if in_table and stripped.startswith("## "):
            break  # next section begins; the table is over
        if not in_table or not stripped.startswith("|"):
            continue
        # Separator row: only dashes, spaces and optional alignment colons
        # (e.g. "|---|---|" or "|:---|:---:|").
        if set(stripped.replace("|", "").strip()) <= set("-: "):
            continue
        cells = [c.strip() for c in stripped.strip("|").split("|")]
        if len(cells) < 3 or cells[0].lower() == "dimension":
            continue
        scores = [s.strip() for s in cells[2].split(",") if s.strip()]
        rows.append((cells[0], cells[1], scores))
    return rows


def _split_reviewer_sections(body: str) -> list[tuple[str, str]]:
    """Extract [(heading, content), ...] for each '### ' block.

    Only text after the ``## Individual Model Reviews`` heading is
    considered, and only up to the first ``---`` rule line that follows it.
    Text before the first ``### `` heading is discarded. Each content string
    is stripped of surrounding whitespace. Returns ``[]`` when the section
    heading is absent.
    """
    marker = "\n## Individual Model Reviews\n"
    idx = body.find(marker)
    if idx < 0:
        return []
    remainder = body[idx + len(marker) :]
    end = remainder.find("\n---\n")
    if end >= 0:
        remainder = remainder[:end]

    sections: list[tuple[str, str]] = []
    current_head: str | None = None
    current_lines: list[str] = []
    for line in remainder.splitlines():
        if line.startswith("### "):
            # A new heading closes the section collected so far.
            if current_head is not None:
                sections.append((current_head, "\n".join(current_lines).strip()))
            current_head = line[4:].strip()
            current_lines = []
        else:
            current_lines.append(line)
    if current_head is not None:
        sections.append((current_head, "\n".join(current_lines).strip()))
    return sections