File size: 4,928 Bytes
11d364a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""
preprocessing.py — Text cleaning and combined_text creation for topic modelling pipeline.

Two text columns are produced:
  - combined_text_raw : Title + Abstract with ORIGINAL casing  → used for SPECTER2 embeddings
  - combined_text     : cleaned / lowercased version           → used for TF-IDF / display
"""

import re
import pandas as pd
from typing import Optional, Tuple


def clean_text(text: str) -> str:
    """
    Normalize one text value for TF-IDF / display use.

    Steps applied to string input:
    - convert to lowercase
    - replace every character that is not a word character, whitespace,
      a hyphen or a slash with a space (so "covid-19" and "and/or" survive)
    - collapse whitespace runs to a single space and trim both ends

    Any non-string input (None, NaN, numbers, ...) yields an empty string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Punctuation becomes a space rather than vanishing, so neighbouring
        # words do not get glued together.
        without_punct = re.sub(r"[^\w\s\-/]", " ", lowered)
        single_spaced = re.sub(r"\s+", " ", without_punct)
        return single_spaced.strip()
    return ""


def load_and_preprocess(
    filepath: str,
    *,
    min_text_length: int = 100,
    min_papers: int = 50,
) -> Tuple[pd.DataFrame, dict]:
    """
    Load a Scopus-format CSV and return a cleaned DataFrame plus a stats dict.

    Processing order: strip header names, validate required columns, add a
    DOI fallback, drop rows without a Title, de-duplicate by DOI, fill
    missing abstracts, build the two text columns, drop rows whose raw text
    is too short, then check the final dataset size.

    Args:
      filepath        : path passed to ``pd.read_csv``.
      min_text_length : minimum length (in characters) of 'combined_text_raw'
                        for a row to be kept. Defaults to 100.
      min_papers      : minimum number of rows that must remain after
                        preprocessing. Defaults to 50.

    The returned DataFrame contains exactly these columns:
      - 'DOI', 'Title', 'Abstract'
      - 'combined_text_raw' : original-casing Title + Abstract (for SPECTER2)
      - 'combined_text'     : lowercased cleaned version (for TF-IDF / display)

    Stats dict keys:
      total_raw, missing_title, duplicates_removed, missing_abstract,
      too_short_removed, final_count, avg_text_length, columns_detected
      (columns_detected lists the working frame's columns, i.e. the CSV
      columns plus any DOI fallback and the two derived text columns).

    Rows with a missing or blank DOI are never treated as duplicates of each
    other; only rows sharing the same non-empty DOI (compared after stripping
    surrounding whitespace) are collapsed, keeping the first occurrence.

    Raises ValueError if required columns are missing or dataset is too small.
    """
    df = pd.read_csv(filepath)

    # Strip surrounding whitespace from header names so the lookups below match.
    df.columns = [str(c).strip() for c in df.columns]

    # ── Required columns ──────────────────────────────────────────────────────
    required = {"Title", "Abstract"}
    missing_cols = required - set(df.columns)
    if missing_cols:
        raise ValueError(f"CSV is missing required columns: {missing_cols}")

    total_raw = len(df)

    # ── DOI fallback ──────────────────────────────────────────────────────────
    if "DOI" not in df.columns:
        df["DOI"] = df.index.astype(str)
        print("[Preprocessing] DOI column not found — using row index as identifier.")

    # ── Drop rows where Title is missing ──────────────────────────────────────
    missing_title = int(df["Title"].isna().sum())
    df = df.dropna(subset=["Title"]).copy()
    # Coerce to str so the .str accessor below works even when pandas parsed
    # the column as numeric, and so non-string titles are not turned into NaN.
    df["Title"] = df["Title"].astype(str)

    # ── Deduplication by DOI ──────────────────────────────────────────────────
    # drop_duplicates() would treat every NaN DOI as equal and silently discard
    # all but one paper lacking a DOI, so only rows that actually carry a DOI
    # take part in the duplicate check.
    before_dedup = len(df)
    doi_key = df["DOI"].astype(str).str.strip()
    has_doi = df["DOI"].notna() & (doi_key != "")
    is_duplicate = (
        has_doi & doi_key.where(has_doi).duplicated(keep="first")
    ).to_numpy()
    df = df[~is_duplicate].reset_index(drop=True)
    duplicates_removed = before_dedup - len(df)
    if duplicates_removed:
        print(f"[Preprocessing] Removed {duplicates_removed} duplicate DOIs.")

    # ── Fill missing abstracts ────────────────────────────────────────────────
    missing_abstract = int(df["Abstract"].isna().sum())
    df["Abstract"] = df["Abstract"].fillna("").astype(str)

    # ── Build combined_text_raw (original casing, for SPECTER2) ──────────────
    df["combined_text_raw"] = (
        df["Title"].str.strip() + " " + df["Abstract"].str.strip()
    ).str.strip()

    # ── Build combined_text (cleaned / lowercased, for TF-IDF / display) ─────
    df["combined_text"] = (
        df["Title"].apply(clean_text) + " " + df["Abstract"].apply(clean_text)
    ).str.strip()

    # ── Remove rows with insufficient text (measured on the raw text) ────────
    before_short = len(df)
    df = df[df["combined_text_raw"].str.len() >= min_text_length].reset_index(drop=True)
    too_short_removed = before_short - len(df)
    if too_short_removed:
        print(
            f"[Preprocessing] Removed {too_short_removed} papers "
            f"with <{min_text_length} char combined text."
        )

    if len(df) < min_papers:
        raise ValueError(
            f"Dataset too small after preprocessing: {len(df)} papers. "
            f"Need at least {min_papers}."
        )

    # mean() of an empty column is NaN, which int() cannot convert; this is
    # only reachable when the caller passes min_papers=0.
    avg_len = int(df["combined_text_raw"].str.len().mean()) if len(df) else 0
    print(f"[Preprocessing] Final dataset: {len(df)} papers | avg text length: {avg_len} chars")

    stats = {
        "total_raw": total_raw,
        "missing_title": missing_title,
        "duplicates_removed": duplicates_removed,
        "missing_abstract": missing_abstract,
        "too_short_removed": too_short_removed,
        "final_count": len(df),
        "avg_text_length": avg_len,
        "columns_detected": list(df.columns),
    }

    return df[["DOI", "Title", "Abstract", "combined_text_raw", "combined_text"]], stats