Spaces:
Sleeping
Sleeping
File size: 4,928 Bytes
11d364a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | """
preprocessing.py — Text cleaning and combined_text creation for topic modelling pipeline.
Two text columns are produced:
- combined_text_raw : Title + Abstract with ORIGINAL casing — used for SPECTER2 embeddings
- combined_text : cleaned / lowercased version — used for TF-IDF / display
"""
import re
import pandas as pd
from typing import Optional, Tuple
def clean_text(text: str) -> str:
    """
    Clean a single text string for TF-IDF / display use.

    Steps, in order:
    - Lowercase the text (acronyms are kept as tokens, but not their casing)
    - Replace every character that is not a word character, whitespace,
      hyphen or slash with a space, so terms such as "covid-19" or "a/b"
      stay intact
    - Collapse runs of whitespace to one space and trim both ends

    Returns "" when `text` is not a str (e.g. None or a NaN float coming
    from a pandas column).
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Keep \w (letters, digits, underscore), whitespace, "-" and "/";
    # everything else becomes a space so adjacent words are not glued together.
    text = re.sub(r"[^\w\s\-/]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text
def load_and_preprocess(filepath: str) -> Tuple[pd.DataFrame, dict]:
    """
    Load a Scopus-format CSV and return a cleaned DataFrame plus a stats dict.

    The returned DataFrame is an independent copy containing exactly:
    - 'DOI'               : DOI from the file, or the row index as a string
                            when the file has no DOI column
    - 'Title', 'Abstract' : original values (missing abstracts become "")
    - 'combined_text_raw' : original-casing Title + Abstract (for SPECTER2)
    - 'combined_text'     : lowercased cleaned version (for TF-IDF / display)

    Stats dict keys:
        total_raw, missing_title, duplicates_removed, missing_abstract,
        too_short_removed, final_count, avg_text_length, columns_detected
    ('columns_detected' lists every column of the working DataFrame at the
    end of processing, including the derived text columns.)

    Rows with a missing Title are dropped. Rows that repeat an already-seen
    DOI are dropped; rows whose DOI is missing are never treated as
    duplicates of one another. Rows whose raw combined text is shorter than
    100 characters are dropped.

    Raises ValueError if required columns are missing or dataset is too small.
    """
    df = pd.read_csv(filepath)

    # Normalize column names (headers may carry stray whitespace)
    df.columns = [c.strip() for c in df.columns]

    # ── Required columns ──
    required = {"Title", "Abstract"}
    missing_cols = required - set(df.columns)
    if missing_cols:
        raise ValueError(f"CSV is missing required columns: {missing_cols}")

    total_raw = len(df)

    # ── DOI fallback ──
    if "DOI" not in df.columns:
        df["DOI"] = df.index.astype(str)
        print("[Preprocessing] DOI column not found — using row index as identifier.")

    # ── Drop rows where Title is missing ──
    missing_title = int(df["Title"].isna().sum())
    df = df.dropna(subset=["Title"]).copy()

    # ── Deduplication by DOI ──
    # pandas treats missing values as equal when looking for duplicates, so a
    # plain drop_duplicates(subset=["DOI"]) would discard every paper without
    # a DOI except the first. Only rows that actually carry a DOI may be
    # flagged as repeats.
    before_dedup = len(df)
    repeated_doi = df["DOI"].notna() & df.duplicated(subset=["DOI"])
    df = df[~repeated_doi].reset_index(drop=True)
    duplicates_removed = before_dedup - len(df)
    if duplicates_removed:
        print(f"[Preprocessing] Removed {duplicates_removed} duplicate DOIs.")

    # ── Fill missing abstracts ──
    missing_abstract = int(df["Abstract"].isna().sum())
    df["Abstract"] = df["Abstract"].fillna("")

    # ── Build combined_text_raw (original casing, for SPECTER2) ──
    df["combined_text_raw"] = (
        df["Title"].str.strip() + " " + df["Abstract"].str.strip()
    )
    # A title with an empty abstract leaves a trailing space; trim it.
    df["combined_text_raw"] = df["combined_text_raw"].str.strip()

    # ── Build combined_text (cleaned / lowercased, for TF-IDF / display) ──
    df["combined_text"] = (
        df["Title"].apply(clean_text) + " " + df["Abstract"].apply(clean_text)
    )
    df["combined_text"] = df["combined_text"].str.strip()

    # ── Remove rows with insufficient text (keep >= 100 chars of raw text) ──
    before_short = len(df)
    df = df[df["combined_text_raw"].str.len() >= 100].reset_index(drop=True)
    too_short_removed = before_short - len(df)
    if too_short_removed:
        print(f"[Preprocessing] Removed {too_short_removed} papers with <100 char combined text.")

    if len(df) < 50:
        raise ValueError(
            f"Dataset too small after preprocessing: {len(df)} papers. Need at least 50."
        )

    avg_len = int(df["combined_text_raw"].str.len().mean())
    print(f"[Preprocessing] Final dataset: {len(df)} papers | avg text length: {avg_len} chars")

    stats = {
        "total_raw": total_raw,
        "missing_title": missing_title,
        "duplicates_removed": duplicates_removed,
        "missing_abstract": missing_abstract,
        "too_short_removed": too_short_removed,
        "final_count": len(df),
        "avg_text_length": avg_len,
        "columns_detected": list(df.columns),
    }

    # Return a copy so callers can add or modify columns without pandas
    # chained-assignment warnings on a slice of the working frame.
    result = df[["DOI", "Title", "Abstract", "combined_text_raw", "combined_text"]].copy()
    return result, stats
|