Spaces:
Sleeping
Sleeping
| """ | |
| Text cleaning utilities shared across all loaders. | |
| Applied before tokenization — keeps text usable for both | |
| classical NLP pipelines and transformer tokenizers. | |
| """ | |
| import re | |
| from typing import List | |
| from data.schema import Argument, Debate | |
# Reddit-specific noise.
# Markdown block-quote lines; "&gt;" also covers text where ">" arrives
# HTML-escaped.
_REDDIT_QUOTE = re.compile(r"^(?:>|&gt;).*$", re.MULTILINE)
_URL = re.compile(r"https?://\S+|www\.\S+")
# The look-behind stops mentions from matching inside ordinary words such as
# "either/or" or "you/me"; the optional leading slash covers "/r/x" and "/u/x".
_SUBREDDIT_MENTION = re.compile(r"(?<![\w/])/?r/\w+")
# Usernames may contain hyphens, so they are part of the name class.
_USER_MENTION = re.compile(r"(?<![\w/])/?u/[\w-]+")
# DOTALL on purpose: everything from the edit marker to the end of the text is
# dropped. The look-behind keeps words like "credit:" from triggering it, and
# up to two asterisks on each side cover "*edit*:" and "**edit**:".
_EDIT_NOTE = re.compile(
    r"(?<!\w)\*{0,2}edit\*{0,2}:.*", re.IGNORECASE | re.DOTALL
)
_WHITESPACE = re.compile(r"\s+")

# Deleted/removed placeholder strings
_DELETED = {"[deleted]", "[removed]", ""}


def clean_text(text: str) -> str:
    """Strip Reddit-specific noise from *text* and normalise whitespace.

    Removal order matters: quoted lines go first (they are line-anchored),
    then URLs (so paths such as ``.../r/name`` disappear with the URL rather
    than being half-eaten by the mention patterns), then subreddit and user
    mentions, then any trailing edit note.

    Args:
        text: Raw post or comment body.

    Returns:
        The cleaned text with every whitespace run collapsed to a single
        space and leading/trailing whitespace removed. May be empty.
    """
    for noise in (
        _REDDIT_QUOTE,
        _URL,
        _SUBREDDIT_MENTION,
        _USER_MENTION,
        _EDIT_NOTE,
    ):
        text = noise.sub("", text)
    # Collapse last so the gaps left by the removals above are normalised too.
    return _WHITESPACE.sub(" ", text).strip()
def is_valid(text: str, min_tokens: int = 5) -> bool:
    """Tell whether *text* is worth keeping as an argument.

    Args:
        text: Candidate text, compared verbatim against the placeholder set.
        min_tokens: Minimum number of whitespace-separated tokens required.

    Returns:
        False when the text is exactly one of the deleted/removed
        placeholders (including the empty string) or has fewer than
        ``min_tokens`` tokens; True otherwise.
    """
    return text not in _DELETED and len(text.split()) >= min_tokens
def clean_debate(debate: Debate, min_tokens: int = 5) -> Debate:
    """Build a cleaned copy of *debate*, leaving out arguments that fail validation.

    Each argument's text is passed through ``clean_text``; the argument is
    kept only when ``is_valid`` accepts the cleaned text. Kept arguments are
    rebuilt with the cleaned text and every other field carried over as-is.

    Args:
        debate: The debate whose arguments should be cleaned.
        min_tokens: Minimum token count forwarded to ``is_valid``.

    Returns:
        A new ``Debate`` with the same id, title, source and metadata and
        only the surviving arguments, in their original order.
    """
    surviving = []
    for original in debate.arguments:
        cleaned_text = clean_text(original.text)
        if not is_valid(cleaned_text, min_tokens):
            continue
        # NOTE(review): parent_id is copied unchanged, so a kept reply can
        # point at an argument that was dropped here.
        rebuilt = Argument(
            id=original.id,
            text=cleaned_text,
            arg_type=original.arg_type,
            parent_id=original.parent_id,
            author=original.author,
            score=original.score,
            metadata=original.metadata,
        )
        surviving.append(rebuilt)

    return Debate(
        id=debate.id,
        title=debate.title,
        source=debate.source,
        arguments=surviving,
        metadata=debate.metadata,
    )
def clean_debates(debates: List[Debate], min_tokens: int = 5) -> List[Debate]:
    """Clean every debate and keep only those that still have a root.

    Args:
        debates: Debates to clean.
        min_tokens: Minimum token count forwarded to ``clean_debate``.

    Returns:
        The cleaned debates, in input order, for which ``root()`` does not
        return ``None``.
    """
    # Phase 1: clean everything.
    processed = []
    for debate in debates:
        processed.append(clean_debate(debate, min_tokens))

    # Phase 2: discard debates whose root claim did not survive cleaning.
    with_root = []
    for candidate in processed:
        if candidate.root() is not None:
            with_root.append(candidate)
    return with_root