agent-2-specter / preprocessing.py
reyansh2005's picture
clean ok1 without history
11d364a
"""
preprocessing.py — Text cleaning and combined_text creation for the topic modelling pipeline.
Two text columns are produced:
- combined_text_raw : Title + Abstract with ORIGINAL casing → used for SPECTER2 embeddings
- combined_text : cleaned / lowercased version → used for TF-IDF / display
"""
import re
import pandas as pd
from typing import Optional, Tuple
def clean_text(text: str) -> str:
    """
    Normalise one text value for the lowercased ``combined_text`` column.

    Steps, in order:
    - Lowercase the string.
    - Replace every character that is not a word character, whitespace,
      hyphen or slash with a space (so hyphenated and slash-joined terms
      stay intact while other punctuation is dropped).
    - Collapse whitespace runs to a single space and trim both ends.

    Any non-string input (e.g. NaN or None) yields an empty string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Keep word chars, whitespace, "-" and "/"; blank out everything else.
        without_punct = re.sub(r"[^\w\s\-/]", " ", lowered)
        return re.sub(r"\s+", " ", without_punct).strip()
    return ""
def load_and_preprocess(filepath: str) -> Tuple[pd.DataFrame, dict]:
    """
    Load a Scopus-format CSV and return a cleaned DataFrame plus a stats dict.

    Processing order: strip column names, validate required columns, add a
    DOI fallback, drop rows without a Title, de-duplicate by DOI, fill
    missing abstracts, build the two combined text columns, drop rows whose
    raw combined text is shorter than 100 characters.

    The returned DataFrame contains exactly these columns:
    - 'DOI', 'Title', 'Abstract' : taken from the CSV (DOI is the row index
      as a string when the CSV has no DOI column)
    - 'combined_text_raw' : original-casing Title + Abstract (for SPECTER2)
    - 'combined_text' : lowercased cleaned version (for TF-IDF / display)

    Stats dict keys:
        total_raw, missing_title, duplicates_removed, missing_abstract,
        too_short_removed, final_count, avg_text_length, columns_detected
    'columns_detected' lists the working DataFrame's columns at the end of
    preprocessing, i.e. the CSV columns plus any added DOI column and the
    two combined text columns.

    Rows whose DOI is missing are never treated as duplicates of each other;
    they are kept and their DOI stays missing in the output.

    Raises ValueError if required columns are missing or fewer than 50
    papers remain after preprocessing.
    """
    df = pd.read_csv(filepath)

    # Normalize column names
    df.columns = [c.strip() for c in df.columns]

    # ── Required columns ─────────────────────────────────────────────────
    required = {"Title", "Abstract"}
    missing_cols = required - set(df.columns)
    if missing_cols:
        raise ValueError(f"CSV is missing required columns: {missing_cols}")

    total_raw = len(df)

    # ── DOI fallback ─────────────────────────────────────────────────────
    if "DOI" not in df.columns:
        df["DOI"] = df.index.astype(str)
        print("[Preprocessing] DOI column not found — using row index as identifier.")

    # ── Drop rows where Title is missing ─────────────────────────────────
    missing_title = int(df["Title"].isna().sum())
    df = df.dropna(subset=["Title"]).copy()

    # ── Deduplication by DOI ─────────────────────────────────────────────
    # pandas treats missing values as equal when looking for duplicates, so
    # a plain drop_duplicates would discard every DOI-less paper but one.
    # Only rows whose *present* DOI repeats an earlier one are removed.
    before_dedup = len(df)
    repeated_doi = df["DOI"].notna() & df["DOI"].duplicated(keep="first")
    df = df[~repeated_doi].reset_index(drop=True)
    duplicates_removed = before_dedup - len(df)
    if duplicates_removed:
        print(f"[Preprocessing] Removed {duplicates_removed} duplicate DOIs.")

    # ── Fill missing abstracts ───────────────────────────────────────────
    missing_abstract = int(df["Abstract"].isna().sum())
    df["Abstract"] = df["Abstract"].fillna("")

    # ── Build combined_text_raw (original casing — for SPECTER2) ─────────
    df["combined_text_raw"] = (
        df["Title"].str.strip() + " " + df["Abstract"].str.strip()
    )
    # Trim again: an empty abstract leaves a trailing space after the title.
    df["combined_text_raw"] = df["combined_text_raw"].str.strip()

    # ── Build combined_text (cleaned / lowercased — for TF-IDF / display) ─
    df["combined_text"] = (
        df["Title"].apply(clean_text) + " " + df["Abstract"].apply(clean_text)
    )
    df["combined_text"] = df["combined_text"].str.strip()

    # ── Remove rows with insufficient text (≥100 chars in raw text) ──────
    before_short = len(df)
    df = df[df["combined_text_raw"].str.len() >= 100].reset_index(drop=True)
    too_short_removed = before_short - len(df)
    if too_short_removed:
        print(f"[Preprocessing] Removed {too_short_removed} papers with <100 char combined text.")

    if len(df) < 50:
        raise ValueError(
            f"Dataset too small after preprocessing: {len(df)} papers. Need at least 50."
        )

    avg_len = int(df["combined_text_raw"].str.len().mean())
    print(f"[Preprocessing] Final dataset: {len(df)} papers | avg text length: {avg_len} chars")

    stats = {
        "total_raw": total_raw,
        "missing_title": missing_title,
        "duplicates_removed": duplicates_removed,
        "missing_abstract": missing_abstract,
        "too_short_removed": too_short_removed,
        "final_count": len(df),
        "avg_text_length": avg_len,
        "columns_detected": list(df.columns),
    }
    return df[["DOI", "Title", "Abstract", "combined_text_raw", "combined_text"]], stats