Spaces:
Sleeping
Sleeping
"""
preprocessing.py — Text cleaning and combined_text creation for topic modelling pipeline.
Two text columns are produced:
- combined_text_raw : Title + Abstract with ORIGINAL casing — used for SPECTER2 embeddings
- combined_text : cleaned / lowercased version — used for TF-IDF / display
"""
| import re | |
| import pandas as pd | |
| from typing import Optional, Tuple | |
def clean_text(text: str) -> str:
    """
    Normalise one text value for TF-IDF / display use.

    Steps, in order:
    - lowercase the value
    - replace every character that is not a word character, whitespace,
      hyphen or slash with a single space
    - collapse runs of whitespace to one space and trim both ends

    Hyphens and slashes are kept so compound terms such as "covid-19" or
    "and/or" stay in one piece. Any non-string input (None, NaN, numbers)
    yields an empty string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Punctuation becomes a space rather than vanishing, so the words on
        # either side of it are not fused together.
        depunctuated = re.sub(r"[^\w\s\-/]", " ", lowered)
        return re.sub(r"\s+", " ", depunctuated).strip()
    return ""
def load_and_preprocess(filepath: str) -> Tuple[pd.DataFrame, dict]:
    """
    Load a Scopus-format CSV and return a cleaned DataFrame plus a stats dict.

    The returned DataFrame contains exactly these columns:
    - 'DOI', 'Title', 'Abstract' : taken from the input (if the CSV has no
      DOI column, the row index as a string is used instead; missing
      abstracts are replaced by "")
    - 'combined_text_raw' : original-casing Title + Abstract (for SPECTER2)
    - 'combined_text' : lowercased cleaned version (for TF-IDF / display)

    Processing order: drop rows without a Title, de-duplicate by DOI, fill
    missing abstracts, build the two combined columns, then drop rows whose
    raw combined text is shorter than 100 characters.

    Stats dict keys:
    total_raw, missing_title, duplicates_removed, missing_abstract,
    too_short_removed, final_count, avg_text_length, columns_detected

    Raises ValueError if required columns are missing or fewer than 50 papers
    remain after preprocessing.
    """
    df = pd.read_csv(filepath)

    # Strip surrounding whitespace from header names so the lookups below match.
    df.columns = [c.strip() for c in df.columns]

    # ── Required columns ──────────────────────────────────────────────────────
    required = {"Title", "Abstract"}
    missing_cols = required - set(df.columns)
    if missing_cols:
        raise ValueError(f"CSV is missing required columns: {missing_cols}")

    total_raw = len(df)

    # ── DOI fallback ──────────────────────────────────────────────────────────
    if "DOI" not in df.columns:
        df["DOI"] = df.index.astype(str)
        print("[Preprocessing] DOI column not found — using row index as identifier.")

    # ── Drop rows where Title is missing ──────────────────────────────────────
    missing_title = int(df["Title"].isna().sum())
    df = df.dropna(subset=["Title"]).copy()

    # ── Deduplication by DOI ──────────────────────────────────────────────────
    before_dedup = len(df)
    # Only rows that actually carry a DOI can be duplicates of one another.
    # A plain drop_duplicates would treat every missing DOI (NaN) as the same
    # key and silently discard all but the first DOI-less paper.
    has_doi = df["DOI"].notna()
    repeated = df.duplicated(subset=["DOI"], keep="first")
    df = df[~(has_doi & repeated)].reset_index(drop=True)
    duplicates_removed = before_dedup - len(df)
    if duplicates_removed:
        print(f"[Preprocessing] Removed {duplicates_removed} duplicate DOIs.")

    # ── Fill missing abstracts ────────────────────────────────────────────────
    # Counted after the title/duplicate filters, so this reflects surviving rows only.
    missing_abstract = int(df["Abstract"].isna().sum())
    df["Abstract"] = df["Abstract"].fillna("")

    # ── Build combined_text_raw (original casing — for SPECTER2) ──────────────
    df["combined_text_raw"] = (
        df["Title"].str.strip() + " " + df["Abstract"].str.strip()
    )
    # Second strip removes the trailing space left behind by an empty abstract.
    df["combined_text_raw"] = df["combined_text_raw"].str.strip()

    # ── Build combined_text (cleaned / lowercased — for TF-IDF / display) ─────
    df["combined_text"] = (
        df["Title"].apply(clean_text) + " " + df["Abstract"].apply(clean_text)
    )
    df["combined_text"] = df["combined_text"].str.strip()

    # ── Remove rows with insufficient text (need ≥100 chars in raw text) ──────
    before_short = len(df)
    df = df[df["combined_text_raw"].str.len() >= 100].reset_index(drop=True)
    too_short_removed = before_short - len(df)
    if too_short_removed:
        print(f"[Preprocessing] Removed {too_short_removed} papers with <100 char combined text.")

    if len(df) < 50:
        raise ValueError(
            f"Dataset too small after preprocessing: {len(df)} papers. Need at least 50."
        )

    avg_len = int(df["combined_text_raw"].str.len().mean())
    print(f"[Preprocessing] Final dataset: {len(df)} papers | avg text length: {avg_len} chars")

    stats = {
        "total_raw": total_raw,
        "missing_title": missing_title,
        "duplicates_removed": duplicates_removed,
        "missing_abstract": missing_abstract,
        "too_short_removed": too_short_removed,
        "final_count": len(df),
        "avg_text_length": avg_len,
        # Taken from the frame at this point, so it also lists the DOI
        # fallback column (if added) and the two combined_text columns.
        "columns_detected": list(df.columns),
    }
    return df[["DOI", "Title", "Abstract", "combined_text_raw", "combined_text"]], stats