post-audit / rules.py
pasternake's picture
Post Audit MVP — Gradio Space (hybrid rules + Gemma 4 E4B on Modal)
931cd2b verified
Raw
History Blame Contribute Delete
8.78 kB
"""Deterministic rule-based warnings for social post audit."""
from __future__ import annotations
import re
from typing import Any
from platform_config import ACTIVATING_GOAL_KEYWORDS, TIME_PATTERNS, hashtag_limit, normalize_platform
# Matches an http:// or https:// URL up to the next whitespace (case-insensitive).
URL_RE = re.compile(r"https?://\S+", re.I)
# Matches "#" followed by one or more word characters.
HASHTAG_RE = re.compile(r"#\w+", re.UNICODE)
# Matches a bracketed chat-export timestamp such as "[1/2/24 10:30 AM]";
# the AM/PM suffix is optional and matched case-insensitively.
CHAT_TIMESTAMP_RE = re.compile(
r"\[\d{1,2}/\d{1,2}/\d{2,4}\s+\d{1,2}:\d{2}\s*(?:AM|PM)?\]",
re.I,
)
# Matches a line that starts with bracketed text followed by a short
# "Name:" prefix (e.g. "[10:30] Anna:"); re.M lets "^" match at every line start.
CHAT_NAME_RE = re.compile(r"^\[[^\]]+\]\s+\w[\w\s.-]{0,40}:", re.M)
# Matches a fixed list of English engagement-bait phrases (case-insensitive).
# NOTE(review): "drop a .? below" allows at most ONE arbitrary character
# between "drop a " and " below" — presumably meant for a single emoji, so
# "drop a comment below" does not match. Confirm this is intended.
ENGAGEMENT_BAIT_RE = re.compile(
r"\b(tag a friend|comment below|double tap|like and share|drop a .? below|"
r"share this post|repost if you agree)\b",
re.I,
)
# Matches any visible structure marker: a bullet ("-", "*", "•") at the start
# of a line, a numbered list item ("1. "), one to three "#" followed by
# whitespace (heading), or a **bold** span.
STRUCTURE_MARKERS_RE = re.compile(
r"(^[\s]*[-*•]\s|\n[\s]*[-*•]\s|^\d+\.\s|\n\d+\.\s|^#{1,3}\s|\*\*[^*]+\*\*)",
re.M,
)
def detect_post_language(post: str) -> str:
    """Guess whether *post* is Russian (``"ru"``) or English (``"en"``).

    A blank post is treated as English. Otherwise the post counts as
    Russian when characters in the Cyrillic block (U+0400..U+04FF) make up
    more than 15% of its total length, whitespace and punctuation included.
    """
    if post.strip() == "":
        return "en"

    cyrillic_count = 0
    for ch in post:
        if "\u0400" <= ch <= "\u04FF":
            cyrillic_count += 1

    if cyrillic_count > len(post) * 0.15:
        return "ru"
    return "en"
def _msg(lang: str, en: str, ru: str) -> str:
return ru if lang == "ru" else en
def _warning(
code: str,
severity: str,
message: str,
evidence: str | None = None,
) -> dict[str, Any]:
w: dict[str, Any] = {
"code": code,
"severity": severity,
"source": "rule",
"message": message,
}
if evidence:
w["evidence"] = evidence[:120]
return w
def _first_line(post: str) -> str:
lines = [ln.strip() for ln in post.strip().splitlines() if ln.strip()]
return lines[0] if lines else ""
def _evidence_snippet(text: str, max_words: int = 12) -> str:
words = text.split()
return " ".join(words[:max_words])
def _goal_is_activating(goal: str) -> bool:
    """Tell whether *goal* mentions any keyword from ``ACTIVATING_GOAL_KEYWORDS``.

    Matching is a plain substring test against the lower-cased goal text.
    """
    lowered = goal.lower()
    for keyword in ACTIVATING_GOAL_KEYWORDS:
        if keyword in lowered:
            return True
    return False
def _post_has_deadline(post: str) -> bool:
    """Tell whether any pattern in ``TIME_PATTERNS`` matches *post*.

    Each pattern is searched case-insensitively; the first hit wins.
    """
    return any(re.search(pattern, post, re.I) for pattern in TIME_PATTERNS)
def _check_hashtag_stuffing(post: str, platform: str, lang: str) -> dict[str, Any] | None:
    """Warn when the post carries more hashtags than the platform allows.

    Args:
        post: Raw post text.
        platform: Platform identifier, passed to ``hashtag_limit`` for the
            threshold and to ``normalize_platform`` for the label shown in
            the message.
        lang: Message language (``"ru"`` or anything else for English).

    Returns:
        A ``HASHTAG_STUFFING`` warning whose evidence lists up to the first
        six tags, or ``None`` when the count is within the limit.
    """
    # URL fragments such as ``https://site/page#section`` look like hashtags
    # to HASHTAG_RE but are not; blank out links before counting so anchors
    # do not inflate the total.
    text_without_links = URL_RE.sub(" ", post)
    tags = HASHTAG_RE.findall(text_without_links)
    limit = hashtag_limit(platform)
    count = len(tags)
    if count <= limit:
        return None

    platform_label = normalize_platform(platform)
    return _warning(
        "HASHTAG_STUFFING",
        "warning",
        _msg(
            lang,
            f"{count} hashtags exceed the {platform_label} limit ({limit}).",
            f"{count} хэштегов — больше порога для {platform_label} ({limit}).",
        ),
        _evidence_snippet(" ".join(tags[:6])),
    )
def _check_chat_dump(post: str, lang: str) -> dict[str, Any] | None:
    """Warn when the post looks like a pasted chat log.

    A bracketed timestamp (``CHAT_TIMESTAMP_RE``) is looked for first; if
    absent, a bracket-plus-``Name:`` line prefix (``CHAT_NAME_RE``) is tried.
    The matched text becomes the warning's evidence.
    """
    hit = CHAT_TIMESTAMP_RE.search(post)
    if hit is None:
        hit = CHAT_NAME_RE.search(post)
    if hit is None:
        return None

    message = _msg(
        lang,
        "Timestamps or chat-style name prefixes look like a pasted conversation, not a composed post.",
        "Таймстемпы и подписи «Имя:» — похоже на копипаст чата, а не собранный пост.",
    )
    return _warning("CHAT_DUMP_FORMAT", "warning", message, _evidence_snippet(hit.group(0)))
def _check_weak_opening(post: str, lang: str) -> dict[str, Any] | None:
    """Warn when the first non-blank line is just a link or a link plus a few words.

    The opening is considered weak when it is entirely a URL, or when it
    contains a URL and has at most four whitespace-separated tokens.
    """
    opener = _first_line(post)
    if not opener:
        return None

    is_only_link = URL_RE.fullmatch(opener.strip()) is not None
    is_thin_link = URL_RE.search(opener) is not None and len(opener.split()) <= 4
    if not (is_only_link or is_thin_link):
        return None

    message = _msg(
        lang,
        "First line is a bare link or too thin to hook the reader.",
        "Первая строка — голая ссылка или слабая зацепка.",
    )
    return _warning("WEAK_OPENING", "warning", message, _evidence_snippet(opener))
def _check_wall_of_text(post: str, lang: str) -> dict[str, Any] | None:
    """Warn about the first long paragraph that has no line breaks.

    Paragraphs are the pieces of *post* separated by a blank line
    (``"\\n\\n"``). A paragraph qualifies when, after stripping, it is longer
    than 400 characters and contains no newline.
    """
    stripped_paragraphs = (chunk.strip() for chunk in post.split("\n\n"))
    wall = next(
        (text for text in stripped_paragraphs if len(text) > 400 and "\n" not in text),
        None,
    )
    if wall is None:
        return None

    message = _msg(
        lang,
        "Long paragraph without line breaks is hard to scan.",
        "Длинный абзац без переносов — тяжело читать.",
    )
    return _warning("WALL_OF_TEXT", "warning", message, _evidence_snippet(wall))
def _check_no_structure(post: str, lang: str) -> dict[str, Any] | None:
    """Warn when a long post shows no visible structure.

    The check is skipped for posts shorter than 280 characters, for posts
    containing any ``STRUCTURE_MARKERS_RE`` match, and for posts with four or
    more newline characters.
    """
    too_short = len(post) < 280
    if too_short or STRUCTURE_MARKERS_RE.search(post) or post.count("\n") >= 4:
        return None

    message = _msg(
        lang,
        "Long post lacks bullets, numbers, or headings — reads as a flat dump.",
        "Длинный пост без списков и иерархии — плоский дамп.",
    )
    # Fall back to the raw head of the post when no non-blank line exists.
    sample = _first_line(post) or post[:80]
    return _warning("NO_STRUCTURE", "warning", message, _evidence_snippet(sample))
def _check_engagement_bait(post: str, lang: str) -> dict[str, Any] | None:
    """Warn on the first engagement-bait phrase found by ``ENGAGEMENT_BAIT_RE``."""
    hit = ENGAGEMENT_BAIT_RE.search(post)
    if hit is None:
        return None

    message = _msg(
        lang,
        "Engagement-bait phrasing detected.",
        "Обнаружена «приманка» для вовлечения.",
    )
    return _warning("ENGAGEMENT_BAIT", "warning", message, _evidence_snippet(hit.group(0)))
def _check_bare_link(post: str, lang: str) -> dict[str, Any] | None:
    """Flag the first URL that has almost no surrounding text.

    For every URL occurrence, up to 80 characters on each side are taken as
    context. If that context holds fewer than six whitespace-separated
    words, a ``BARE_LINK`` info-level warning is returned with the URL as
    evidence.

    Args:
        post: Raw post text.
        lang: Message language (``"ru"`` or anything else for English).

    Returns:
        The warning dict for the first bare link, or ``None`` when the post
        has no URLs or every URL is sufficiently framed.
    """
    # Iterate over match objects rather than looking each URL string up with
    # ``post.find``: ``find`` always lands on the FIRST occurrence, so a
    # repeated URL (or one that is a prefix of an earlier URL) would have the
    # wrong context inspected and a later bare occurrence would be missed.
    for match in URL_RE.finditer(post):
        start, end = match.span()
        before = post[max(0, start - 80) : start].strip()
        after = post[end : end + 80].strip()
        context = (before + " " + after).strip()
        if len(context.split()) < 6:
            return _warning(
                "BARE_LINK",
                "info",
                _msg(
                    lang,
                    "Link appears without framing — say what's inside and why to open it.",
                    "Ссылка без рамки — неясно, что внутри и зачем открывать.",
                ),
                _evidence_snippet(match.group(0)),
            )
    return None
def _check_dense_parenthetical(post: str, lang: str) -> dict[str, Any] | None:
    """Flag the first parenthetical whose content runs 80 characters or more.

    Only the first such span is reported; it becomes the warning's evidence.
    """
    long_aside = re.search(r"\([^)]{80,}\)", post)
    if long_aside is None:
        return None

    message = _msg(
        lang,
        "Long parenthetical breaks reading flow.",
        "Длинная вставка в скобках тормозит чтение.",
    )
    return _warning("DENSE_PARENTHETICAL", "info", message, _evidence_snippet(long_aside.group(0)))
def _check_no_deadline(post: str, goal: str, lang: str) -> dict[str, Any] | None:
    """Warn when an activating goal is paired with a post that names no time.

    Nothing is reported unless ``_goal_is_activating(goal)`` holds and
    ``_post_has_deadline(post)`` finds no match.
    """
    if not _goal_is_activating(goal) or _post_has_deadline(post):
        return None

    message = _msg(
        lang,
        "Goal implies a time-bound action, but the post has no date or deadline.",
        "Цель требует действия ко времени, но в посте нет срока.",
    )
    # Fall back to the raw head of the post when no non-blank line exists.
    sample = _first_line(post) or post[:60]
    return _warning("NO_DEADLINE", "warning", message, _evidence_snippet(sample))
def _check_unresolved_reference(post: str, lang: str) -> dict[str, Any] | None:
    """Flag a "link in bio"-style reference when the post contains no URL.

    The phrase list is matched case-insensitively on word boundaries. If the
    post also contains any URL, the reference is considered resolved and
    nothing is reported.
    """
    reference_pattern = (
        r"\b(link in bio|link in profile|see profile|check my profile|"
        r"registration link|sign up link)\b"
    )
    mention = re.search(reference_pattern, post, re.I)
    if mention is None:
        return None
    if URL_RE.search(post) is not None:
        return None

    message = _msg(
        lang,
        "Registration or resource is referenced but no URL is in the post.",
        "Регистрация упомянута, но ссылки в тексте нет.",
    )
    return _warning("UNRESOLVED_REFERENCE", "info", message, _evidence_snippet(mention.group(0)))
def run_rules(
    platform: str,
    goal: str,
    audience: str,
    post: str,
) -> list[dict[str, Any]]:
    """Run every rule check against *post* and collect the warnings raised.

    The message language is detected from the post itself. Checks run in a
    fixed order and the returned list keeps that order, omitting checks that
    found nothing. *audience* is accepted but currently unused.
    """
    del audience  # reserved for future rule checks
    lang = detect_post_language(post)

    outcomes = (
        _check_hashtag_stuffing(post, platform, lang),
        _check_chat_dump(post, lang),
        _check_weak_opening(post, lang),
        _check_wall_of_text(post, lang),
        _check_no_structure(post, lang),
        _check_engagement_bait(post, lang),
        _check_bare_link(post, lang),
        _check_dense_parenthetical(post, lang),
        _check_no_deadline(post, goal, lang),
        _check_unresolved_reference(post, lang),
    )

    warnings_found: list[dict[str, Any]] = []
    for outcome in outcomes:
        if outcome is not None:
            warnings_found.append(outcome)
    return warnings_found