Spaces:
Running
Running
| """ | |
| text_utils.py — Shared text cleaning helpers for the Fake News Detection pipeline. | |
| Provides functions for normalizing, cleaning, and featurizing raw text | |
| before it enters any model stage. | |
| """ | |
| import re | |
| import logging | |
| from typing import Optional | |
| import pandas as pd | |
| logger = logging.getLogger(__name__) | |
# Compiled once at import time: clean_text is typically called once per row.
_HTML_TAG_RE = re.compile(r"<[^>]+>")
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
_DISALLOWED_CHARS_RE = re.compile(r"[^a-z0-9\s.,!?;:'\"-]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text: str) -> str:
    """Clean a single text string for downstream processing.

    Steps applied (in order):
        1. Remove HTML tags
        2. Decode HTML entities (``&amp;``, ``&nbsp;``, ``&#39;`` ...)
        3. Lowercase
        4. Remove URLs
        5. Remove special characters (keep alphanumeric + basic punctuation)
        6. Normalize whitespace

    Note: Stopwords are **not** removed because LSTM / BERT models need them.

    Args:
        text: Raw input text.

    Returns:
        Cleaned text string. Non-string or blank input yields ``""``.
    """
    # Local import keeps this block self-contained; the module header does
    # not import ``html``.
    import html

    if not isinstance(text, str) or not text.strip():
        return ""

    # Strip markup first so that escaped angle brackets (``&lt;b&gt;``), once
    # decoded below, are treated as literal text rather than as tags.
    text = _HTML_TAG_RE.sub(" ", text)

    # Decode entities before lowercasing: otherwise ``&amp;`` leaves a stray
    # "amp;" token behind and ``&#65;`` leaves digits instead of a letter.
    text = html.unescape(text)

    text = text.lower()
    text = _URL_RE.sub(" ", text)

    # Anything outside letters, digits, whitespace and basic punctuation
    # becomes a space (not deleted) so adjacent words do not get glued.
    text = _DISALLOWED_CHARS_RE.sub(" ", text)

    # Collapse whitespace runs (including decoded non-breaking spaces).
    return _WHITESPACE_RE.sub(" ", text).strip()
def build_full_text(title: Optional[str], text: Optional[str]) -> str:
    """Join an article's title and body into one string.

    Missing values (``None`` / NaN) are treated as empty, both parts are
    stripped of surrounding whitespace, and only the non-empty parts are
    joined.

    Args:
        title: Article title (may be None, NaN or empty).
        text: Article body (may be None, NaN or empty).

    Returns:
        ``"title. text"`` when both parts are present, the single non-empty
        part when only one is, and ``""`` when neither is.
    """
    present_parts = []
    for piece in (title, text):
        normalized = "" if pd.isna(piece) else str(piece).strip()
        if normalized:
            present_parts.append(normalized)
    # Joining zero, one or two parts yields "", the lone part, or
    # "title. text" respectively.
    return ". ".join(present_parts)
def word_count(text: str) -> int:
    """Count the whitespace-separated tokens in *text*.

    Args:
        text: String to measure (raw or already cleaned).

    Returns:
        Number of tokens; ``0`` for any falsy input such as ``""`` or None.
    """
    tokens = text.split() if text else []
    return len(tokens)
def text_length_bucket(wc: int) -> str:
    """Map a word count onto a coarse length category.

    Args:
        wc: Word count (non-negative integer).

    Returns:
        ``"short"`` for fewer than 50 words, ``"medium"`` for 50-300 words
        (both bounds inclusive), and ``"long"`` for more than 300 words.
    """
    return "short" if wc < 50 else ("medium" if wc <= 300 else "long")
def clean_empty_texts(
    df: pd.DataFrame,
    min_word_count: int = 3,
) -> pd.DataFrame:
    """Remove rows with missing or near-empty text content.

    Rules:
        - Fill NaN in ``title`` and ``text`` columns with empty string.
        - Combine title and body per row via ``build_full_text``.
        - Drop rows whose combined word count is < *min_word_count*.
        - Reset index after dropping.
        - Overwrite ``text`` in the result with the combined title + body
          string (no separate ``full_text`` column is created).

    The input frame is not modified; a copy is returned. An empty input
    frame yields an empty frame with the same columns.

    Args:
        df: Input DataFrame (must contain ``title`` and ``text``).
        min_word_count: Minimum number of words required to keep a row.

    Returns:
        Cleaned DataFrame with empty/near-empty rows removed.
        Logs how many rows were dropped.

    Raises:
        KeyError: If ``title`` or ``text`` is missing from *df*.
    """
    before = len(df)
    df = df.copy()
    df["title"] = df["title"].fillna("").astype(str)
    df["text"] = df["text"].fillna("").astype(str)

    # Build the combined text from the two columns directly instead of a
    # row-wise ``df.apply(axis=1)``: on an empty frame that call returns a
    # DataFrame rather than a Series, which breaks the mask logic below, and
    # it is also much slower on large frames.
    full = pd.Series(
        [
            build_full_text(title, body)
            for title, body in zip(df["title"], df["text"])
        ],
        index=df.index,
        dtype=df["text"].dtype,
    )
    wc = pd.Series(
        [word_count(combined) for combined in full],
        index=df.index,
        dtype="int64",
    )

    keep_mask = wc >= min_word_count
    dropped = int((~keep_mask).sum())
    df_out = df.loc[keep_mask].reset_index(drop=True)

    # Overwrite 'text' with the combined text so model tokenization and the
    # validation script do not encounter empty strings in 'text'.
    df_out["text"] = full.loc[keep_mask].reset_index(drop=True)

    logger.info(
        "clean_empty_texts: dropped %d / %d rows with word_count < %d",
        dropped, before, min_word_count,
    )
    return df_out
# --- standalone test ---
if __name__ == "__main__":
    # Smoke-test the helpers end to end on one HTML-wrapped sample that
    # contains a URL and characters outside the allowed set.
    raw_sample = (
        '<p>WASHINGTON (Reuters) β The U.S. military said on Friday '
        'https://example.com/article that it would begin accepting '
        'transgender recruits & moreβ¦</p>'
    )

    scrubbed = clean_text(raw_sample)
    print(f"Cleaned : {scrubbed}")

    combined = build_full_text("Breaking News", scrubbed)
    print(f"Full : {combined}")

    n_words = word_count(combined)
    print(f"Words : {n_words}")

    bucket = text_length_bucket(n_words)
    print(f"Bucket : {bucket}")