""" Text cleaning utilities shared across all loaders. Applied before tokenization — keeps text usable for both classical NLP pipelines and transformer tokenizers. """ import re from typing import List from data.schema import Argument, Debate # Reddit-specific noise _REDDIT_QUOTE = re.compile(r"^>.*$", re.MULTILINE) _URL = re.compile(r"https?://\S+|www\.\S+") _SUBREDDIT_MENTION = re.compile(r"r/\w+") _USER_MENTION = re.compile(r"u/\w+") _EDIT_NOTE = re.compile(r"\*?edit\*?:.*", re.IGNORECASE | re.DOTALL) _WHITESPACE = re.compile(r"\s+") # Deleted/removed placeholder strings _DELETED = {"[deleted]", "[removed]", ""} def clean_text(text: str) -> str: text = _REDDIT_QUOTE.sub("", text) text = _URL.sub("", text) text = _SUBREDDIT_MENTION.sub("", text) text = _USER_MENTION.sub("", text) text = _EDIT_NOTE.sub("", text) text = _WHITESPACE.sub(" ", text) return text.strip() def is_valid(text: str, min_tokens: int = 5) -> bool: """Return False for deleted posts or suspiciously short text.""" if text in _DELETED: return False return len(text.split()) >= min_tokens def clean_debate(debate: Debate, min_tokens: int = 5) -> Debate: """Return a new Debate with cleaned argument texts, dropping invalid ones.""" cleaned_args = [] for arg in debate.arguments: text = clean_text(arg.text) if is_valid(text, min_tokens): cleaned_args.append( Argument( id=arg.id, text=text, arg_type=arg.arg_type, parent_id=arg.parent_id, author=arg.author, score=arg.score, metadata=arg.metadata, ) ) return Debate( id=debate.id, title=debate.title, source=debate.source, arguments=cleaned_args, metadata=debate.metadata, ) def clean_debates(debates: List[Debate], min_tokens: int = 5) -> List[Debate]: cleaned = [clean_debate(d, min_tokens) for d in debates] # Drop debates that lost their root claim during cleaning return [d for d in cleaned if d.root() is not None]