Spaces:

reyansh2005
/

agent-2-specter

Sleeping

App Files Files Community

agent-2-specter / preprocessing.py

reyansh2005

clean ok1 without history

11d364a about 1 month ago

raw

history blame contribute delete

4.93 kB

	"""
	preprocessing.py — Text cleaning and combined_text creation for topic modelling pipeline.

	Two text columns are produced:
	- combined_text_raw : Title + Abstract with ORIGINAL casing → used for SPECTER2 embeddings
	- combined_text : cleaned / lowercased version → used for TF-IDF / display
	"""

	import re
	import pandas as pd
	from typing import Optional, Tuple


	def clean_text(text: str) -> str:
	    """
	    Normalise a single text string for TF-IDF / display use.

	    Steps, in order:
	    - Lowercase the text (acronyms survive as tokens but lose their casing)
	    - Replace every character that is not a word character, whitespace,
	      hyphen or slash with a space, so hyphenated / slashed terms stay intact
	    - Collapse runs of whitespace to a single space and trim both ends

	    Returns "" for any non-string input (e.g. the float NaN pandas uses for
	    an empty CSV cell).
	    """
	    if not isinstance(text, str):
	        return ""
	    lowered = text.lower()
	    # Keep \w, whitespace, "-" and "/"; everything else becomes a separator.
	    kept = re.sub(r"[^\w\s\-/]", " ", lowered)
	    return re.sub(r"\s+", " ", kept).strip()


	def load_and_preprocess(
	    filepath: str,
	    *,
	    min_text_length: int = 100,
	    min_papers: int = 50,
	) -> Tuple[pd.DataFrame, dict]:
	    """
	    Load a Scopus-format CSV and return a cleaned DataFrame plus a stats dict.

	    Parameters:
	        filepath        : path (or anything pd.read_csv accepts) of the CSV.
	        min_text_length : minimum length, in characters, of Title + Abstract
	                          (original casing) for a paper to be kept.
	        min_papers      : minimum number of papers that must remain after
	                          filtering.

	    The returned DataFrame contains exactly these columns:
	    - 'DOI', 'Title', 'Abstract' : taken from the CSV ('DOI' falls back to
	                                   the row index when the column is absent)
	    - 'combined_text_raw' : original-casing Title + Abstract (for SPECTER2)
	    - 'combined_text'     : lowercased cleaned version (for TF-IDF / display)

	    Stats dict keys:
	        total_raw, missing_title, duplicates_removed, missing_abstract,
	        too_short_removed, final_count, avg_text_length, columns_detected

	    Raises ValueError if required columns are missing or dataset is too small.
	    """
	    df = pd.read_csv(filepath)

	    # Normalize column names
	    df.columns = [c.strip() for c in df.columns]
	    # Snapshot the header now, before any derived column is added, so the
	    # stat reports what was actually present in the CSV.
	    columns_detected = list(df.columns)

	    # ── Required columns ──────────────────────────────────────────────────
	    required = {"Title", "Abstract"}
	    missing_cols = required - set(df.columns)
	    if missing_cols:
	        raise ValueError(f"CSV is missing required columns: {missing_cols}")

	    total_raw = len(df)

	    # ── DOI fallback ──────────────────────────────────────────────────────
	    if "DOI" not in df.columns:
	        df["DOI"] = df.index.astype(str)
	        print("[Preprocessing] DOI column not found — using row index as identifier.")

	    # ── Drop rows where Title is missing ──────────────────────────────────
	    missing_title = int(df["Title"].isna().sum())
	    df = df.dropna(subset=["Title"]).copy()

	    # ── Deduplication by DOI ──────────────────────────────────────────────
	    # pandas treats all NaN values as equal when looking for duplicates, so a
	    # plain drop_duplicates would discard every DOI-less paper but the first.
	    # Only rows that really carry a DOI may be dropped as repeats.
	    before_dedup = len(df)
	    repeated_doi = df["DOI"].notna() & df.duplicated(subset=["DOI"], keep="first")
	    df = df[~repeated_doi].reset_index(drop=True)
	    duplicates_removed = before_dedup - len(df)
	    if duplicates_removed:
	        print(f"[Preprocessing] Removed {duplicates_removed} duplicate DOIs.")

	    # ── Fill missing abstracts ────────────────────────────────────────────
	    missing_abstract = int(df["Abstract"].isna().sum())
	    df["Abstract"] = df["Abstract"].fillna("")

	    # ── Build combined_text_raw (original casing — for SPECTER2) ─────────
	    df["combined_text_raw"] = (
	        df["Title"].str.strip() + " " + df["Abstract"].str.strip()
	    )
	    # A second strip removes the trailing space left by an empty abstract.
	    df["combined_text_raw"] = df["combined_text_raw"].str.strip()

	    # ── Build combined_text (cleaned / lowercased — for TF-IDF / display) ─
	    df["combined_text"] = (
	        df["Title"].apply(clean_text) + " " + df["Abstract"].apply(clean_text)
	    )
	    df["combined_text"] = df["combined_text"].str.strip()

	    # ── Remove rows with insufficient text (measured on the raw text) ─────
	    before_short = len(df)
	    df = df[df["combined_text_raw"].str.len() >= min_text_length].reset_index(drop=True)
	    too_short_removed = before_short - len(df)
	    if too_short_removed:
	        print(
	            f"[Preprocessing] Removed {too_short_removed} papers "
	            f"with <{min_text_length} char combined text."
	        )

	    if len(df) < min_papers:
	        raise ValueError(
	            f"Dataset too small after preprocessing: {len(df)} papers. "
	            f"Need at least {min_papers}."
	        )

	    avg_len = int(df["combined_text_raw"].str.len().mean())
	    print(f"[Preprocessing] Final dataset: {len(df)} papers | avg text length: {avg_len} chars")

	    stats = {
	        "total_raw": total_raw,
	        "missing_title": missing_title,
	        "duplicates_removed": duplicates_removed,
	        "missing_abstract": missing_abstract,
	        "too_short_removed": too_short_removed,
	        "final_count": len(df),
	        "avg_text_length": avg_len,
	        "columns_detected": columns_detected,
	    }

	    return df[["DOI", "Title", "Abstract", "combined_text_raw", "combined_text"]], stats