"""Deterministic rule-based warnings for social post audit.""" from __future__ import annotations import re from typing import Any from platform_config import ACTIVATING_GOAL_KEYWORDS, TIME_PATTERNS, hashtag_limit, normalize_platform URL_RE = re.compile(r"https?://\S+", re.I) HASHTAG_RE = re.compile(r"#\w+", re.UNICODE) CHAT_TIMESTAMP_RE = re.compile( r"\[\d{1,2}/\d{1,2}/\d{2,4}\s+\d{1,2}:\d{2}\s*(?:AM|PM)?\]", re.I, ) CHAT_NAME_RE = re.compile(r"^\[[^\]]+\]\s+\w[\w\s.-]{0,40}:", re.M) ENGAGEMENT_BAIT_RE = re.compile( r"\b(tag a friend|comment below|double tap|like and share|drop a .? below|" r"share this post|repost if you agree)\b", re.I, ) STRUCTURE_MARKERS_RE = re.compile( r"(^[\s]*[-*•]\s|\n[\s]*[-*•]\s|^\d+\.\s|\n\d+\.\s|^#{1,3}\s|\*\*[^*]+\*\*)", re.M, ) def detect_post_language(post: str) -> str: if not post.strip(): return "en" cyrillic = sum(1 for c in post if "\u0400" <= c <= "\u04FF") return "ru" if cyrillic > len(post) * 0.15 else "en" def _msg(lang: str, en: str, ru: str) -> str: return ru if lang == "ru" else en def _warning( code: str, severity: str, message: str, evidence: str | None = None, ) -> dict[str, Any]: w: dict[str, Any] = { "code": code, "severity": severity, "source": "rule", "message": message, } if evidence: w["evidence"] = evidence[:120] return w def _first_line(post: str) -> str: lines = [ln.strip() for ln in post.strip().splitlines() if ln.strip()] return lines[0] if lines else "" def _evidence_snippet(text: str, max_words: int = 12) -> str: words = text.split() return " ".join(words[:max_words]) def _goal_is_activating(goal: str) -> bool: g = goal.lower() return any(kw in g for kw in ACTIVATING_GOAL_KEYWORDS) def _post_has_deadline(post: str) -> bool: for pat in TIME_PATTERNS: if re.search(pat, post, re.I): return True return False def _check_hashtag_stuffing(post: str, platform: str, lang: str) -> dict[str, Any] | None: tags = HASHTAG_RE.findall(post) limit = hashtag_limit(platform) if len(tags) <= limit: return None return _warning( "HASHTAG_STUFFING", "warning", _msg( lang, f"{len(tags)} hashtags exceed the {normalize_platform(platform)} limit ({limit}).", f"{len(tags)} хэштегов — больше порога для {normalize_platform(platform)} ({limit}).", ), _evidence_snippet(" ".join(tags[:6])), ) def _check_chat_dump(post: str, lang: str) -> dict[str, Any] | None: m = CHAT_TIMESTAMP_RE.search(post) or CHAT_NAME_RE.search(post) if not m: return None return _warning( "CHAT_DUMP_FORMAT", "warning", _msg( lang, "Timestamps or chat-style name prefixes look like a pasted conversation, not a composed post.", "Таймстемпы и подписи «Имя:» — похоже на копипаст чата, а не собранный пост.", ), _evidence_snippet(m.group(0)), ) def _check_weak_opening(post: str, lang: str) -> dict[str, Any] | None: first = _first_line(post) if not first: return None if URL_RE.fullmatch(first.strip()) or ( URL_RE.search(first) and len(first.split()) <= 4 ): return _warning( "WEAK_OPENING", "warning", _msg( lang, "First line is a bare link or too thin to hook the reader.", "Первая строка — голая ссылка или слабая зацепка.", ), _evidence_snippet(first), ) return None def _check_wall_of_text(post: str, lang: str) -> dict[str, Any] | None: for para in post.split("\n\n"): p = para.strip() if len(p) > 400 and "\n" not in p: return _warning( "WALL_OF_TEXT", "warning", _msg( lang, "Long paragraph without line breaks is hard to scan.", "Длинный абзац без переносов — тяжело читать.", ), _evidence_snippet(p), ) return None def _check_no_structure(post: str, lang: str) -> dict[str, Any] | None: if len(post) < 280: return None if STRUCTURE_MARKERS_RE.search(post): return None if post.count("\n") >= 4: return None return _warning( "NO_STRUCTURE", "warning", _msg( lang, "Long post lacks bullets, numbers, or headings — reads as a flat dump.", "Длинный пост без списков и иерархии — плоский дамп.", ), _evidence_snippet(_first_line(post) or post[:80]), ) def _check_engagement_bait(post: str, lang: str) -> dict[str, Any] | None: m = ENGAGEMENT_BAIT_RE.search(post) if not m: return None return _warning( "ENGAGEMENT_BAIT", "warning", _msg( lang, "Engagement-bait phrasing detected.", "Обнаружена «приманка» для вовлечения.", ), _evidence_snippet(m.group(0)), ) def _check_bare_link(post: str, lang: str) -> dict[str, Any] | None: urls = URL_RE.findall(post) if not urls: return None for url in urls: idx = post.find(url) before = post[max(0, idx - 80) : idx].strip() after = post[idx + len(url) : idx + len(url) + 80].strip() context = (before + " " + after).strip() if len(context.split()) < 6: return _warning( "BARE_LINK", "info", _msg( lang, "Link appears without framing — say what's inside and why to open it.", "Ссылка без рамки — неясно, что внутри и зачем открывать.", ), _evidence_snippet(url), ) return None def _check_dense_parenthetical(post: str, lang: str) -> dict[str, Any] | None: for m in re.finditer(r"\([^)]{80,}\)", post): return _warning( "DENSE_PARENTHETICAL", "info", _msg( lang, "Long parenthetical breaks reading flow.", "Длинная вставка в скобках тормозит чтение.", ), _evidence_snippet(m.group(0)), ) return None def _check_no_deadline(post: str, goal: str, lang: str) -> dict[str, Any] | None: if not _goal_is_activating(goal): return None if _post_has_deadline(post): return None return _warning( "NO_DEADLINE", "warning", _msg( lang, "Goal implies a time-bound action, but the post has no date or deadline.", "Цель требует действия ко времени, но в посте нет срока.", ), _evidence_snippet(_first_line(post) or post[:60]), ) def _check_unresolved_reference(post: str, lang: str) -> dict[str, Any] | None: refs = re.search( r"\b(link in bio|link in profile|see profile|check my profile|" r"registration link|sign up link)\b", post, re.I, ) if not refs: return None if URL_RE.search(post): return None return _warning( "UNRESOLVED_REFERENCE", "info", _msg( lang, "Registration or resource is referenced but no URL is in the post.", "Регистрация упомянута, но ссылки в тексте нет.", ), _evidence_snippet(refs.group(0)), ) def run_rules( platform: str, goal: str, audience: str, post: str, ) -> list[dict[str, Any]]: """Run all rule linters and return warning objects.""" del audience # reserved for future rule checks lang = detect_post_language(post) checks = [ _check_hashtag_stuffing(post, platform, lang), _check_chat_dump(post, lang), _check_weak_opening(post, lang), _check_wall_of_text(post, lang), _check_no_structure(post, lang), _check_engagement_bait(post, lang), _check_bare_link(post, lang), _check_dense_parenthetical(post, lang), _check_no_deadline(post, goal, lang), _check_unresolved_reference(post, lang), ] return [w for w in checks if w is not None]