""" preprocessing.py — Text cleaning and combined_text creation for topic modelling pipeline. Two text columns are produced: - combined_text_raw : Title + Abstract with ORIGINAL casing → used for SPECTER2 embeddings - combined_text : cleaned / lowercased version → used for TF-IDF / display """ import re import pandas as pd from typing import Optional, Tuple def clean_text(text: str) -> str: """ Clean a single text string: - Lowercase - Remove extra whitespace - Preserve domain-specific terms (hyphens, slashes, acronyms) """ if not isinstance(text, str): return "" text = text.lower() text = re.sub(r"[^\w\s\-/]", " ", text) text = re.sub(r"\s+", " ", text).strip() return text def load_and_preprocess(filepath: str) -> Tuple[pd.DataFrame, dict]: """ Load a Scopus-format CSV and return a cleaned DataFrame plus a stats dict. The DataFrame contains: - 'combined_text_raw' : original-casing Title + Abstract (for SPECTER2) - 'combined_text' : lowercased cleaned version (for TF-IDF / display) - 'Title', 'Abstract', 'DOI' columns preserved Stats dict keys: total_raw, duplicates_removed, missing_title, missing_abstract, too_short_removed, final_count, avg_text_length Raises ValueError if required columns are missing or dataset is too small. """ df = pd.read_csv(filepath) # Normalize column names df.columns = [c.strip() for c in df.columns] # ── Required columns ────────────────────────────────────────────────────── required = {"Title", "Abstract"} missing_cols = required - set(df.columns) if missing_cols: raise ValueError(f"CSV is missing required columns: {missing_cols}") total_raw = len(df) # ── DOI fallback ────────────────────────────────────────────────────────── if "DOI" not in df.columns: df["DOI"] = df.index.astype(str) print("[Preprocessing] DOI column not found — using row index as identifier.") # ── Drop rows where Title is missing ────────────────────────────────────── missing_title = df["Title"].isna().sum() df = df.dropna(subset=["Title"]).copy() # ── Deduplication by DOI ────────────────────────────────────────────────── before_dedup = len(df) df = df.drop_duplicates(subset=["DOI"]).reset_index(drop=True) duplicates_removed = before_dedup - len(df) if duplicates_removed: print(f"[Preprocessing] Removed {duplicates_removed} duplicate DOIs.") # ── Fill missing abstracts ──────────────────────────────────────────────── missing_abstract = int(df["Abstract"].isna().sum()) df["Abstract"] = df["Abstract"].fillna("") # ── Build combined_text_raw (original casing — for SPECTER2) ───────────── df["combined_text_raw"] = ( df["Title"].str.strip() + " " + df["Abstract"].str.strip() ) df["combined_text_raw"] = df["combined_text_raw"].str.strip() # ── Build combined_text (cleaned / lowercased — for TF-IDF / display) ──── df["combined_text"] = ( df["Title"].apply(clean_text) + " " + df["Abstract"].apply(clean_text) ) df["combined_text"] = df["combined_text"].str.strip() # ── Remove rows with insufficient text (≥100 chars in raw text) ─────────── before_short = len(df) df = df[df["combined_text_raw"].str.len() >= 100].reset_index(drop=True) too_short_removed = before_short - len(df) if too_short_removed: print(f"[Preprocessing] Removed {too_short_removed} papers with <100 char combined text.") if len(df) < 50: raise ValueError( f"Dataset too small after preprocessing: {len(df)} papers. Need at least 50." ) avg_len = int(df["combined_text_raw"].str.len().mean()) print(f"[Preprocessing] Final dataset: {len(df)} papers | avg text length: {avg_len} chars") stats = { "total_raw": total_raw, "missing_title": int(missing_title), "duplicates_removed": duplicates_removed, "missing_abstract": missing_abstract, "too_short_removed": too_short_removed, "final_count": len(df), "avg_text_length": avg_len, "columns_detected": list(df.columns), } return df[["DOI", "Title", "Abstract", "combined_text_raw", "combined_text"]], stats