Sina1138 committed on
Commit ·
e425a8a
1
Parent(s): 9068195
Add scoring utilities and unified scoring pipeline for ICLR review data
Browse files- dependencies/scoring_utils.py +262 -0
- interface/Demo.py +13 -11
- run_polarity_scoring.py +215 -0
- run_scoring.py +222 -0
- run_topic_scoring.py +218 -0
- scored_reviews_builder.py +24 -32
dependencies/scoring_utils.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Shared utilities for polarity and topic scoring pipelines.
|
| 3 |
+
Provides common functions for model loading, prediction, and result saving.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import re
|
| 7 |
+
import torch
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def find_available_years(data_dir: Path) -> list:
    """
    Auto-detect years by scanning a directory for all_reviews_<YYYY>.csv files.

    Only regular files whose complete name is ``all_reviews_`` followed by
    exactly four digits and ``.csv`` are counted, and each year is reported once.

    Args:
        data_dir: Path to directory containing processed review data

    Returns:
        Sorted list of years (ints) found; empty if data_dir is not a directory
    """
    if not data_dir.is_dir():
        return []

    # fullmatch anchors both ends, so names that merely contain the pattern
    # (e.g. "all_reviews_x_all_reviews_2020.csv") are rejected.
    name_pattern = re.compile(r"all_reviews_(\d{4})\.csv")

    years = set()
    for file in data_dir.glob("all_reviews_*.csv"):
        match = name_pattern.fullmatch(file.name)
        # Skip directories that happen to carry a matching name.
        if match and file.is_file():
            years.add(int(match.group(1)))

    return sorted(years)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def load_model_and_tokenizer(model_dir: Path, device: str = "cuda"):
    """
    Load a sequence-classification model and its tokenizer from a local directory.

    The directory must contain ``config.json`` plus a weights file. Weights may
    be stored as ``pytorch_model.bin`` or ``model.safetensors``, or as a sharded
    checkpoint identified by the matching ``*.index.json`` file.

    Args:
        model_dir: Path to directory containing the saved model
        device: Requested device ("cuda" or "cpu"). Falls back to CPU when
            CUDA is not available.

    Returns:
        Tuple of (tokenizer, model, device_obj), where the model is in eval
        mode and already moved to device_obj

    Raises:
        FileNotFoundError: If model directory doesn't exist or is missing model files
        RuntimeError: If the tokenizer or model fails to load
    """
    if not model_dir.exists():
        raise FileNotFoundError(f"Model directory not found: {model_dir}")

    if not (model_dir / "config.json").exists():
        raise FileNotFoundError(f"Missing config.json in {model_dir}")

    # Any one of these is enough; requiring pytorch_model.bin alone would
    # reject checkpoints saved in safetensors or sharded form.
    weight_files = (
        "pytorch_model.bin",
        "model.safetensors",
        "pytorch_model.bin.index.json",
        "model.safetensors.index.json",
    )
    if not any((model_dir / name).exists() for name in weight_files):
        raise FileNotFoundError(
            f"Missing model weights (one of {', '.join(weight_files)}) in {model_dir}"
        )

    try:
        tokenizer = AutoTokenizer.from_pretrained(str(model_dir))
        model = AutoModelForSequenceClassification.from_pretrained(str(model_dir))
        model.eval()

        # Move to device, silently using CPU when CUDA is unavailable
        device_obj = torch.device(device if torch.cuda.is_available() else "cpu")
        model.to(device_obj)
    except Exception as e:
        # Chain the cause so the original traceback is preserved
        raise RuntimeError(f"Failed to load model from {model_dir}: {e}") from e

    return tokenizer, model, device_obj
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def predict_batch(sentences: list, tokenizer, model, device, max_length: int = 512,
                  batch_size: int = None) -> list:
    """
    Run batch predictions on a list of sentences.

    Args:
        sentences: List of sentence strings to predict
        tokenizer: Tokenizer instance (called with padding and truncation enabled)
        model: Model instance whose output exposes ``.logits``
        device: Device object for computation
        max_length: Maximum token length (default: 512 for BERT-like models)
        batch_size: Maximum number of sentences per forward pass. None
            (default) sends all sentences through in a single pass.

    Returns:
        List of predicted class IDs (integers), one per input sentence, in order

    Raises:
        ValueError: If batch_size is given but smaller than 1
        RuntimeError: If tokenization or the forward pass fails
    """
    if not sentences:
        return []

    if batch_size is not None and batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    step = len(sentences) if batch_size is None else batch_size
    predictions = []

    try:
        with torch.no_grad():
            for start in range(0, len(sentences), step):
                chunk = sentences[start:start + step]
                inputs = tokenizer(
                    chunk,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=max_length
                ).to(device)

                outputs = model(**inputs)
                # argmax over the class dimension gives one label per sentence
                predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
    except Exception as e:
        # Chain the cause so the underlying traceback is not lost
        raise RuntimeError(f"Prediction failed: {e}") from e

    return predictions
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def save_polarity_results(output_path: Path, results: list) -> None:
    """
    Save polarity scoring results to CSV (without an index column).

    Expected result format (the keys written by the polarity scoring script):
    [
        {"id": review_id, "sentence": sentence_text, "polarity": int},
        ...
    ]

    Parent directories are created as needed. When results is empty, a
    header-only CSV is written so the file can still be read back.

    Args:
        output_path: Path to output CSV file
        results: List of result dictionaries
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    if results:
        df = pd.DataFrame(results)
    else:
        # An empty DataFrame without columns would produce a CSV with no header
        df = pd.DataFrame(columns=["id", "sentence", "polarity"])
    df.to_csv(output_path, index=False)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def save_topic_results(output_path: Path, results: list) -> None:
    """
    Save topic scoring results to CSV (without an index column).

    Expected result format:
    [
        {"id": review_id, "sentence": sentence_text, "topic_id": int, "topic_label": str},
        ...
    ]

    Parent directories are created as needed. When results is empty, a
    header-only CSV with the columns above is written so the file can still
    be read back.

    Args:
        output_path: Path to output CSV file
        results: List of result dictionaries
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    if results:
        df = pd.DataFrame(results)
    else:
        # An empty DataFrame without columns would produce a CSV with no header
        df = pd.DataFrame(columns=["id", "sentence", "topic_id", "topic_label"])
    df.to_csv(output_path, index=False)
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def validate_input_file(input_path: Path, required_columns: list) -> pd.DataFrame:
    """
    Validate that input CSV file exists and has required columns.

    Args:
        input_path: Path to CSV file
        required_columns: List of column names that must exist

    Returns:
        Loaded DataFrame

    Raises:
        FileNotFoundError: If file doesn't exist
        ValueError: If the file cannot be parsed as CSV or required columns
            are missing
    """
    if not input_path.exists():
        raise FileNotFoundError(f"Input file not found: {input_path}")

    try:
        df = pd.read_csv(input_path)
    except Exception as e:
        # Chain the cause so the parser's traceback is preserved
        raise ValueError(f"Failed to read CSV {input_path}: {e}") from e

    # Keep the caller's ordering (deduplicated) so the message is deterministic
    missing_cols = [col for col in dict.fromkeys(required_columns) if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols} (in {input_path})")

    return df
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def load_polarity_model(model_variant: str, base_dir: Path, device: str = "cuda"):
    """
    Factory function to load polarity model by variant name.

    Supported variants (paths relative to base_dir):
    - "scibert": scibert/scibert_polarity/final_model
    - "deberta": alternative_polarity/deberta/deberta_v3_base_polarity_final_model
    - "scideberta": alternative_polarity/scideberta/scideberta_full_polarity_final_model

    Args:
        model_variant: Name of model variant
        base_dir: Base directory of project
        device: Device to load onto

    Returns:
        Tuple of (tokenizer, model, device_obj)

    Raises:
        ValueError: If model_variant not supported
        FileNotFoundError: If model directory doesn't exist or lacks model files
        RuntimeError: If the model or tokenizer fails to load
    """
    polarity_alt_dir = base_dir / "alternative_polarity"
    variant_map = {
        "scibert": base_dir / "scibert" / "scibert_polarity" / "final_model",
        "deberta": polarity_alt_dir / "deberta" / "deberta_v3_base_polarity_final_model",
        "scideberta": polarity_alt_dir / "scideberta" / "scideberta_full_polarity_final_model",
    }

    if model_variant not in variant_map:
        raise ValueError(
            f"Unknown polarity model variant: {model_variant}. "
            f"Supported: {list(variant_map.keys())}"
        )

    # Existence checks and the actual loading are delegated to the shared loader
    return load_model_and_tokenizer(variant_map[model_variant], device)
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def load_topic_model(model_variant: str, base_dir: Path, device: str = "cuda"):
    """
    Resolve a topic model variant name to its checkpoint directory and load it.

    Known variants (paths relative to base_dir):
    - "scibert": scibert/scibert_topic/final_model
    - "deberta": alternative_topic/deberta/final_model
    - "scideberta": alternative_topic/scideberta/final_model

    Args:
        model_variant: Name of model variant
        base_dir: Base directory of project
        device: Device to load onto

    Returns:
        Whatever load_model_and_tokenizer returns for the resolved directory

    Raises:
        ValueError: If model_variant is not one of the known variants
    """
    topic_alt_dir = base_dir / "alternative_topic"
    checkpoints = {
        "scibert": base_dir / "scibert" / "scibert_topic" / "final_model",
        "deberta": topic_alt_dir / "deberta" / "final_model",
        "scideberta": topic_alt_dir / "scideberta" / "final_model",
    }

    chosen_dir = checkpoints.get(model_variant)
    if chosen_dir is None:
        supported = list(checkpoints.keys())
        raise ValueError(
            f"Unknown topic model variant: {model_variant}. "
            f"Supported: {supported}"
        )

    return load_model_and_tokenizer(chosen_dir, device)
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
# Topic label mapping (integer topic id -> display label); ids follow tuple order
TOPIC_ID_TO_LABEL = dict(enumerate((
    "Substance",
    "Clarity",
    "Soundness/Correctness",
    "Originality",
    "Motivation/Impact",
    "Meaningful Comparison",
    "Replicability",
    "NONE",
)))

# Reverse lookup (display label -> integer topic id)
TOPIC_LABEL_TO_ID = {label: topic_id for topic_id, label in TOPIC_ID_TO_LABEL.items()}
|
interface/Demo.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import math
|
| 2 |
-
|
| 3 |
import sys, os.path
|
|
|
|
| 4 |
|
| 5 |
import torch
|
| 6 |
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
|
|
@@ -11,20 +11,15 @@ from dependencies.rsa_reranker import RSAReranking
|
|
| 11 |
import gradio as gr
|
| 12 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
|
| 13 |
import pandas as pd
|
| 14 |
-
from pathlib import Path
|
| 15 |
import ast
|
| 16 |
from tqdm import tqdm
|
| 17 |
|
| 18 |
-
from scored_reviews_builder import load_scored_reviews
|
| 19 |
from dependencies.Glimpse_tokenizer import glimpse_tokenizer
|
| 20 |
# from scibert.scibert_polarity.scibert_polarity import predict_polarity
|
| 21 |
|
| 22 |
-
# Load
|
| 23 |
-
years_legacy, df_legacy = load_scored_reviews()
|
| 24 |
-
|
| 25 |
-
# Load new reviews with rebuttals (2022-2025) - if available
|
| 26 |
def load_scored_reviews_with_rebuttals(
|
| 27 |
-
csv_path: Path = BASE_DIR / "data" / "
|
| 28 |
):
|
| 29 |
"""Load 2022-2025 dataset with rebuttal metadata."""
|
| 30 |
if not csv_path.exists():
|
|
@@ -47,8 +42,13 @@ def load_scored_reviews_with_rebuttals(
|
|
| 47 |
|
| 48 |
years_new, df_new = load_scored_reviews_with_rebuttals()
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
# -----------------------------------
|
| 54 |
# Pre-processed Tab
|
|
@@ -311,7 +311,9 @@ with gr.Blocks(title="ReView") as demo:
|
|
| 311 |
# -----------------------------------
|
| 312 |
with gr.Tab("Pre-processed Reviews"):
|
| 313 |
# Initialize state for this session.
|
| 314 |
-
|
|
|
|
|
|
|
| 315 |
initial_scored_reviews = get_preprocessed_scores(initial_year)
|
| 316 |
initial_review_ids = list(initial_scored_reviews.keys())
|
| 317 |
initial_review = initial_scored_reviews[initial_review_ids[0]]
|
|
|
|
| 1 |
import math
|
|
|
|
| 2 |
import sys, os.path
|
| 3 |
+
from pathlib import Path
|
| 4 |
|
| 5 |
import torch
|
| 6 |
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
|
|
|
|
| 11 |
import gradio as gr
|
| 12 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
|
| 13 |
import pandas as pd
|
|
|
|
| 14 |
import ast
|
| 15 |
from tqdm import tqdm
|
| 16 |
|
|
|
|
| 17 |
from dependencies.Glimpse_tokenizer import glimpse_tokenizer
|
| 18 |
# from scibert.scibert_polarity.scibert_polarity import predict_polarity
|
| 19 |
|
| 20 |
+
# Load new reviews with rebuttals (2020-2025) - if available
|
|
|
|
|
|
|
|
|
|
| 21 |
def load_scored_reviews_with_rebuttals(
|
| 22 |
+
csv_path: Path = BASE_DIR / "data" / "preprocessed_scored_reviews_2020-2025.csv"
|
| 23 |
):
|
| 24 |
"""Load 2022-2025 dataset with rebuttal metadata."""
|
| 25 |
if not csv_path.exists():
|
|
|
|
| 42 |
|
| 43 |
years_new, df_new = load_scored_reviews_with_rebuttals()
|
| 44 |
|
| 45 |
+
if df_new.empty:
|
| 46 |
+
raise FileNotFoundError(
|
| 47 |
+
"New dataset not found or empty. Expected data/preprocessed_scored_reviews_2020-2025.csv"
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
# Use new data only
|
| 51 |
+
years, all_scored_reviews_df = years_new, df_new
|
| 52 |
|
| 53 |
# -----------------------------------
|
| 54 |
# Pre-processed Tab
|
|
|
|
| 311 |
# -----------------------------------
|
| 312 |
with gr.Tab("Pre-processed Reviews"):
|
| 313 |
# Initialize state for this session.
|
| 314 |
+
if not years:
|
| 315 |
+
raise ValueError("No years available in new dataset")
|
| 316 |
+
initial_year = years[0]
|
| 317 |
initial_scored_reviews = get_preprocessed_scores(initial_year)
|
| 318 |
initial_review_ids = list(initial_scored_reviews.keys())
|
| 319 |
initial_review = initial_scored_reviews[initial_review_ids[0]]
|
run_polarity_scoring.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Clean polarity scoring pipeline for ICLR review data.
|
| 4 |
+
Supports multiple model variants (SciBERT, DeBERTa, SciDeBERTa) and auto-detects available years.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import argparse
|
| 8 |
+
import sys
|
| 9 |
+
import torch
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from tqdm import tqdm
|
| 12 |
+
|
| 13 |
+
# Add this script's directory to the import path
|
| 14 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 15 |
+
|
| 16 |
+
from config import Config
|
| 17 |
+
from dependencies.Glimpse_tokenizer import glimpse_tokenizer
|
| 18 |
+
from dependencies.scoring_utils import (
|
| 19 |
+
find_available_years,
|
| 20 |
+
load_polarity_model,
|
| 21 |
+
predict_batch,
|
| 22 |
+
save_polarity_results,
|
| 23 |
+
validate_input_file,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def score_reviews_polarity(
    year: int,
    model_variant: str = "scibert",
    device: str = "cuda",
    input_dir: Path = None,
    output_dir: Path = None,
    skip_if_exists: bool = True,
    limit: int = None,
) -> Path:
    """
    Score reviews for polarity using specified model variant.

    Reads ``all_reviews_<year>.csv`` from input_dir, splits each review into
    sentences with glimpse_tokenizer, predicts one polarity label per sentence
    and writes ``polarity_scored_reviews_<year>.csv`` to output_dir with the
    columns id, sentence, polarity.

    Args:
        year: Year of reviews to score
        model_variant: Model to use ("scibert", "deberta", "scideberta")
        device: Device for computation ("cuda" or "cpu")
        input_dir: Directory containing preprocessed reviews
            (default: Config.BASE_DIR / "data" / "processed")
        output_dir: Directory to save scored results (default: Config.POLARITY_DIR)
        skip_if_exists: Skip if output already exists
        limit: Limit to first N reviews (None = process all)

    Returns:
        Path to output CSV file

    Raises:
        FileNotFoundError: If the input file or model directory is missing
        ValueError: If the input CSV is invalid or the model variant is unknown
        RuntimeError: If model loading or prediction fails
    """
    if input_dir is None:
        input_dir = Config.BASE_DIR / "data" / "processed"
    if output_dir is None:
        output_dir = Config.POLARITY_DIR

    output_path = output_dir / f"polarity_scored_reviews_{year}.csv"

    # Skip if already exists and not forced
    if skip_if_exists and output_path.exists():
        print(f"⏩ Polarity scores already exist for {year}: {output_path}")
        return output_path

    print(f"\n{'='*60}")
    print(f"Polarity Scoring: {year}")
    print(f" Model: {model_variant}")
    print(f" Device: {device}")
    if limit:
        print(f" Limit: {limit} reviews")
    print(f"{'='*60}")

    # Validate input file
    input_path = input_dir / f"all_reviews_{year}.csv"
    try:
        df = validate_input_file(input_path, required_columns=["id", "text"])
    except (FileNotFoundError, ValueError) as e:
        print(f"✗ Input validation failed: {e}")
        raise

    # Apply limit if specified
    if limit:
        df = df.head(limit)
        print(f"Limited to {len(df)} reviews")
        # The limited output uses the same filename as a full run, so later
        # runs with skip_if_exists=True will treat it as complete.
        print("⚠️ Partial output shares the full-run filename; rerun with --force for complete scores")

    # Load model. The shared loader wraps load errors in RuntimeError, so it
    # is caught here too for a consistent failure message.
    try:
        print(f"Loading {model_variant} model...")
        tokenizer, model, device_obj = load_polarity_model(
            model_variant, Config.BASE_DIR, device
        )
    except (ValueError, FileNotFoundError, RuntimeError) as e:
        print(f"✗ Model loading failed: {e}")
        raise

    # Process reviews
    all_results = []
    skipped_reviews = 0

    review_rows = zip(df["id"], df["text"])
    for review_id, text in tqdm(review_rows, total=len(df), desc="Processing reviews"):
        # pandas reads empty CSV cells as NaN (a float); presumably
        # glimpse_tokenizer expects a str - TODO confirm against the tokenizer.
        if not isinstance(text, str) or not text.strip():
            skipped_reviews += 1
            continue

        # Tokenize into sentences
        sentences = glimpse_tokenizer(text)
        if not sentences:
            continue

        # Predict polarity for all sentences in batch
        try:
            predictions = predict_batch(sentences, tokenizer, model, device_obj)
        except RuntimeError as e:
            print(f"✗ Prediction failed for review {review_id}: {e}")
            raise

        # Store results
        all_results.extend(
            {"id": review_id, "sentence": sentence, "polarity": polarity_label}
            for sentence, polarity_label in zip(sentences, predictions)
        )

    if skipped_reviews:
        print(f"Skipped {skipped_reviews} reviews with empty or non-text content")

    # Save results
    try:
        save_polarity_results(output_path, all_results)
        print(f"✓ Polarity scores saved: {output_path}")
        print(f" Scored sentences: {len(all_results)}")
    except Exception as e:
        print(f"✗ Failed to save results: {e}")
        raise

    return output_path
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def main():
    """
    Command-line entry point for polarity scoring.

    Scores the year given by --year, or every year auto-detected in
    data/processed when --year is omitted, then prints a summary. Exits with
    status 1 if any year failed.
    """
    parser = argparse.ArgumentParser(
        description="Polarity scoring pipeline for ICLR review data"
    )
    parser.add_argument(
        "--year",
        type=int,
        help="Single year to process (if not specified, auto-detects all available years)",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="scibert",
        choices=["scibert", "deberta", "scideberta"],
        help="Model variant to use (default: scibert)",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        choices=["cuda", "cpu"],
        help="Device for computation (default: cuda)",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Force reprocessing even if results exist",
    )

    args = parser.parse_args()

    # Determine years to process. Compare against None rather than relying on
    # truthiness, so an explicitly supplied year is never mistaken for "unset".
    if args.year is not None:
        years = [args.year]
    else:
        processed_dir = Config.BASE_DIR / "data" / "processed"
        years = find_available_years(processed_dir)
        if not years:
            print("⚠️ No preprocessed data found in data/processed/")
            print(" Run preprocess_data.py first")
            return

    # Print summary
    print(f"\n{'='*60}")
    print("Polarity Scoring Pipeline")
    print(f"Years: {years}")
    print(f"Model: {args.model}")
    print(f"Device: {args.device}")
    print(f"{'='*60}")

    # Process each year; a failure in one year does not stop the others
    success_count = 0
    failed_years = []

    for year in years:
        try:
            score_reviews_polarity(
                year,
                model_variant=args.model,
                device=args.device,
                skip_if_exists=not args.force,
            )
            success_count += 1
        except Exception as e:
            print(f"\n⚠️ Failed to process {year}: {e}")
            failed_years.append(year)

    # Final summary
    print(f"\n{'='*60}")
    print("Pipeline Summary")
    print(f"{'='*60}")
    print(f"✓ Successful: {success_count}/{len(years)} years")
    if failed_years:
        print(f"✗ Failed: {failed_years}")
    print(f"{'='*60}\n")

    # Exit with error if any failed
    if failed_years:
        sys.exit(1)
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
if __name__ == "__main__":
|
| 215 |
+
main()
|
run_scoring.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Unified scoring pipeline - End-to-end data pipeline for ICLR review analysis.
|
| 4 |
+
Runs all scoring steps (GLIMPSE, polarity, topic) and builds final integrated dataset.
|
| 5 |
+
Automatically skips existing results unless --force is used.
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
python run_scoring.py --year 2020 # Score single year
|
| 9 |
+
python run_scoring.py # Auto-detect all available years
|
| 10 |
+
python run_scoring.py --force # Reprocess everything
|
| 11 |
+
python run_scoring.py --skip-glimpse # Skip GLIMPSE, just polarity/topic
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import argparse
|
| 15 |
+
import sys
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
# Add this script's directory to the import path
|
| 19 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 20 |
+
|
| 21 |
+
from config import Config
|
| 22 |
+
from dependencies.scoring_utils import find_available_years
|
| 23 |
+
|
| 24 |
+
# Import scoring functions
|
| 25 |
+
from run_glimpse_scoring import run_glimpse_pipeline
|
| 26 |
+
from run_polarity_scoring import score_reviews_polarity
|
| 27 |
+
from run_topic_scoring import score_reviews_topic
|
| 28 |
+
from scored_reviews_builder import build_2020_2025_dataset
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def run_full_pipeline(
    year: int,
    model_variant_polarity: str = "scibert",
    model_variant_topic: str = "scibert",
    device: str = "cuda",
    skip_if_exists: bool = True,
    skip_glimpse: bool = False,
    limit: int = None,
) -> bool:
    """
    Run complete scoring pipeline for a single year.

    Steps: GLIMPSE scoring (optional), polarity scoring, topic scoring, then a
    rebuild of the integrated dataset. Any exception is reported with its
    traceback and turned into a False return value instead of propagating.

    Args:
        year: Year to process
        model_variant_polarity: Polarity model ("scibert", "deberta", "scideberta")
        model_variant_topic: Topic model ("scibert", "deberta", "scideberta")
        device: Device for computation ("cuda" or "cpu")
        skip_if_exists: Skip if results already exist
        skip_glimpse: Skip GLIMPSE scoring step
        limit: Limit to first N reviews (None = process all). Forwarded to the
            polarity and topic steps only.

    Returns:
        True if successful, False if failed
    """
    # Imported locally so the traceback can be shown without touching the
    # module's import block.
    import traceback

    limit_str = f" (limit: {limit})" if limit else ""
    print(f"\n{'#'*60}")
    print(f"# Full Scoring Pipeline: {year}{limit_str}")
    print(f"{'#'*60}")

    try:
        # Step 1: GLIMPSE Scoring
        if not skip_glimpse:
            print("\n[1/4] GLIMPSE Scoring...")
            run_glimpse_pipeline(
                year,
                model_name="facebook/bart-large-cnn",
                device=device,
                skip_if_exists=skip_if_exists,
            )
        else:
            print("\n[1/4] Skipping GLIMPSE scoring (--skip-glimpse)")

        # Step 2: Polarity Scoring
        print(f"\n[2/4] Polarity Scoring ({model_variant_polarity})...")
        score_reviews_polarity(
            year,
            model_variant=model_variant_polarity,
            device=device,
            skip_if_exists=skip_if_exists,
            limit=limit,
        )

        # Step 3: Topic Scoring
        print(f"\n[3/4] Topic Scoring ({model_variant_topic})...")
        score_reviews_topic(
            year,
            model_variant=model_variant_topic,
            device=device,
            skip_if_exists=skip_if_exists,
            limit=limit,
        )

        # Step 4: Build Final Dataset (always rebuild to ensure latest data)
        print("\n[4/4] Building Final Integrated Dataset...")
        build_2020_2025_dataset()

        print(f"\n{'='*60}")
        print(f"✓ Pipeline complete for {year}")
        print(f"{'='*60}")

        return True

    except Exception as e:
        print(f"\n{'='*60}")
        print(f"✗ Pipeline failed for {year}: {e}")
        print(f"{'='*60}")
        # The message alone (e.g. a bare KeyError key) rarely identifies the
        # failing step, so show where the error came from.
        traceback.print_exc()
        return False
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def main():
    """
    Command-line entry point for the unified scoring pipeline.

    Runs run_full_pipeline for the year given by --year, or for every year
    auto-detected in data/processed when --year is omitted, then prints a
    summary. Exits with status 1 if any year failed.
    """
    parser = argparse.ArgumentParser(
        description="Unified scoring pipeline - End-to-end processing for all review data"
    )
    parser.add_argument(
        "--year",
        type=int,
        help="Single year to process (if not specified, auto-detects all available years)",
    )
    parser.add_argument(
        "--model-polarity",
        type=str,
        default="scibert",
        choices=["scibert", "deberta", "scideberta"],
        help="Model variant for polarity scoring (default: scibert)",
    )
    parser.add_argument(
        "--model-topic",
        type=str,
        default="scibert",
        choices=["scibert", "deberta", "scideberta"],
        help="Model variant for topic scoring (default: scibert)",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        choices=["cuda", "cpu"],
        help="Device for computation (default: cuda)",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Force reprocessing even if results exist",
    )
    parser.add_argument(
        "--skip-glimpse",
        action="store_true",
        help="Skip GLIMPSE scoring (assume results already exist)",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit to first N reviews (None = process all)",
    )

    args = parser.parse_args()

    # Determine years to process. Compare against None rather than relying on
    # truthiness, so an explicitly supplied year is never mistaken for "unset".
    if args.year is not None:
        years = [args.year]
    else:
        processed_dir = Config.BASE_DIR / "data" / "processed"
        years = find_available_years(processed_dir)
        if not years:
            print("⚠️ No preprocessed data found in data/processed/")
            print(" Run preprocess_data.py first")
            return

    # Print summary
    print(f"\n{'='*60}")
    print("Unified Scoring Pipeline")
    print(f"{'='*60}")
    print(f"Years: {years}")
    print(f"Polarity model: {args.model_polarity}")
    print(f"Topic model: {args.model_topic}")
    print(f"Device: {args.device}")
    print(f"Skip if exists: {not args.force}")
    print(f"Include GLIMPSE: {not args.skip_glimpse}")
    if args.limit:
        print(f"Limit: {args.limit} reviews per year")
    print(f"{'='*60}")

    # Process each year; run_full_pipeline reports failure via its return value
    success_count = 0
    failed_years = []

    for year in years:
        success = run_full_pipeline(
            year,
            model_variant_polarity=args.model_polarity,
            model_variant_topic=args.model_topic,
            device=args.device,
            skip_if_exists=not args.force,
            skip_glimpse=args.skip_glimpse,
            limit=args.limit,
        )

        if success:
            success_count += 1
        else:
            failed_years.append(year)

    # Final summary
    print(f"\n{'='*60}")
    print("Pipeline Summary")
    print(f"{'='*60}")
    print(f"✓ Successful: {success_count}/{len(years)} years")
    if failed_years:
        print(f"✗ Failed: {failed_years}")
    # Only advertise the dataset when at least one year made it through the
    # final build step; otherwise the "ready" message would be misleading.
    if success_count:
        print("\n📊 Final dataset: data/preprocessed_scored_reviews_2020-2025.csv")
        print(" Ready for interface: python interface/Demo.py")
    print(f"{'='*60}\n")

    # Exit with error if any failed
    if failed_years:
        sys.exit(1)
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
if __name__ == "__main__":
|
| 222 |
+
main()
|
run_topic_scoring.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Clean topic scoring pipeline for ICLR review data.
|
| 4 |
+
Supports multiple model variants (SciBERT, DeBERTa, SciDeBERTa) and auto-detects available years.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import argparse
|
| 8 |
+
import sys
|
| 9 |
+
import torch
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from tqdm import tqdm
|
| 12 |
+
|
| 13 |
+
# Add parent directory to path for imports
|
| 14 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 15 |
+
|
| 16 |
+
from config import Config
|
| 17 |
+
from dependencies.Glimpse_tokenizer import glimpse_tokenizer
|
| 18 |
+
from dependencies.scoring_utils import (
|
| 19 |
+
find_available_years,
|
| 20 |
+
load_topic_model,
|
| 21 |
+
predict_batch,
|
| 22 |
+
save_topic_results,
|
| 23 |
+
validate_input_file,
|
| 24 |
+
TOPIC_ID_TO_LABEL,
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def score_reviews_topic(
    year: int,
    model_variant: str = "scibert",
    device: str = "cuda",
    input_dir: Path = None,
    output_dir: Path = None,
    skip_if_exists: bool = True,
    limit: int = None,
) -> Path:
    """
    Score every sentence of one year's reviews with a topic classifier.

    Each review's text is split into sentences with ``glimpse_tokenizer``,
    the sentences of a review are classified together via ``predict_batch``,
    and one result row per sentence is written through ``save_topic_results``.

    Args:
        year: Year of reviews to score; selects ``all_reviews_{year}.csv``
            in ``input_dir``.
        model_variant: Model to use ("scibert", "deberta", "scideberta").
        device: Device for computation ("cuda" or "cpu").
        input_dir: Directory containing preprocessed reviews. Defaults to
            ``Config.BASE_DIR / "data" / "processed"``.
        output_dir: Directory to save scored results. Defaults to
            ``Config.TOPIC_DIR``.
        skip_if_exists: When True and the output CSV already exists, return
            its path immediately without loading data or the model.
        limit: Limit to first N reviews (None = process all).

    Returns:
        Path to output CSV file (``topic_scored_reviews_{year}.csv``).

    Raises:
        FileNotFoundError, ValueError: Re-raised from input validation or
            model loading after a diagnostic line is printed.
        RuntimeError: Re-raised when prediction fails for a review.
        Exception: Any error raised while saving results is re-raised.
    """
    if input_dir is None:
        input_dir = Config.BASE_DIR / "data" / "processed"
    if output_dir is None:
        output_dir = Config.TOPIC_DIR

    output_path = output_dir / f"topic_scored_reviews_{year}.csv"

    # Skip if already exists and not forced
    if skip_if_exists and output_path.exists():
        print(f"⏩ Topic scores already exist for {year}: {output_path}")
        return output_path

    print(f"\n{'='*60}")
    print(f"Topic Scoring: {year}")
    print(f" Model: {model_variant}")
    print(f" Device: {device}")
    if limit:
        print(f" Limit: {limit} reviews")
    print(f"{'='*60}")

    # Validate input file
    input_path = input_dir / f"all_reviews_{year}.csv"
    try:
        df = validate_input_file(input_path, required_columns=["id", "text"])
    except (FileNotFoundError, ValueError) as e:
        print(f"✗ Input validation failed: {e}")
        raise

    # Apply limit if specified
    if limit:
        df = df.head(limit)
        print(f"Limited to {len(df)} reviews")

    # Load model
    try:
        print(f"Loading {model_variant} model...")
        tokenizer, model, device_obj = load_topic_model(
            model_variant, Config.BASE_DIR, device
        )
    except (ValueError, FileNotFoundError) as e:
        print(f"✗ Model loading failed: {e}")
        raise

    # Process reviews
    all_results = []
    skipped_non_text = 0

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing reviews"):
        review_id = row["id"]
        text = row["text"]

        # NOTE(review): assumes the frame comes from a pandas CSV read, where
        # an empty cell arrives as a float NaN rather than "". Skip such rows
        # instead of letting one bad review abort the whole year.
        if not isinstance(text, str):
            skipped_non_text += 1
            continue

        # Tokenize into sentences
        sentences = glimpse_tokenizer(text)
        if not sentences:
            continue

        # Predict topic for all sentences in batch
        try:
            predictions = predict_batch(sentences, tokenizer, model, device_obj)
        except RuntimeError as e:
            print(f"✗ Prediction failed for review {review_id}: {e}")
            raise

        # Store results with both numeric ID and label
        for sentence, topic_id in zip(sentences, predictions):
            all_results.append({
                "id": review_id,
                "sentence": sentence,
                "topic_id": topic_id,
                # Ids missing from the label table are kept, marked "UNKNOWN".
                "topic": TOPIC_ID_TO_LABEL.get(topic_id, "UNKNOWN"),
            })

    if skipped_non_text:
        print(f"⚠️ Skipped {skipped_non_text} reviews with missing or non-text content")

    # Save results
    try:
        save_topic_results(output_path, all_results)
        print(f"✓ Topic scores saved: {output_path}")
        print(f" Scored sentences: {len(all_results)}")
    except Exception as e:
        print(f"✗ Failed to save results: {e}")
        raise

    return output_path
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def main():
    """
    Command-line entry point for the topic scoring pipeline.

    Parses ``--year``, ``--model``, ``--device``, ``--force`` and ``--limit``,
    then calls ``score_reviews_topic`` once per year. Without ``--year`` the
    years are discovered with ``find_available_years`` under
    ``Config.BASE_DIR / "data" / "processed"``; if none are found a hint is
    printed and the function returns without scoring.

    A failure in one year is reported and recorded, and the remaining years
    are still processed. The process exits with status 1 if any year failed.
    """
    parser = argparse.ArgumentParser(
        description="Topic scoring pipeline for ICLR review data"
    )
    parser.add_argument(
        "--year",
        type=int,
        help="Single year to process (if not specified, auto-detects all available years)",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="scibert",
        choices=["scibert", "deberta", "scideberta"],
        help="Model variant to use (default: scibert)",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        choices=["cuda", "cpu"],
        help="Device for computation (default: cuda)",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Force reprocessing even if results exist",
    )
    # Exposes the `limit` parameter score_reviews_topic already accepts, so a
    # quick smoke run is possible from the command line.
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit to first N reviews per year (default: process all)",
    )

    args = parser.parse_args()

    # Determine years to process
    if args.year:
        years = [args.year]
    else:
        processed_dir = Config.BASE_DIR / "data" / "processed"
        years = find_available_years(processed_dir)
        if not years:
            print("⚠️ No preprocessed data found in data/processed/")
            print(" Run preprocess_data.py first")
            return

    # Print summary
    print(f"\n{'='*60}")
    print(f"Topic Scoring Pipeline")
    print(f"Years: {years}")
    print(f"Model: {args.model}")
    print(f"Device: {args.device}")
    if args.limit:
        print(f"Limit: {args.limit} reviews per year")
    print(f"{'='*60}")

    # Process each year
    success_count = 0
    failed_years = []

    for year in years:
        try:
            score_reviews_topic(
                year,
                model_variant=args.model,
                device=args.device,
                skip_if_exists=not args.force,
                limit=args.limit,
            )
            success_count += 1
        except Exception as e:
            # Top-level boundary: report and continue so one bad year does
            # not prevent the others from being scored.
            print(f"\n⚠️ Failed to process {year}: {e}")
            failed_years.append(year)

    # Final summary
    print(f"\n{'='*60}")
    print(f"Pipeline Summary")
    print(f"{'='*60}")
    print(f"✓ Successful: {success_count}/{len(years)} years")
    if failed_years:
        print(f"✗ Failed: {failed_years}")
    print(f"{'='*60}\n")

    # Exit with error if any failed
    if failed_years:
        sys.exit(1)
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
if __name__ == "__main__":
|
| 218 |
+
main()
|
scored_reviews_builder.py
CHANGED
|
@@ -165,10 +165,16 @@ def build_2020_2025_dataset(
|
|
| 165 |
review_metadata = {}
|
| 166 |
for _, row in original_df.iterrows():
|
| 167 |
review_id = row["id"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
review_metadata[review_id] = {
|
| 169 |
-
'rebuttal':
|
| 170 |
'paper_title': row.get('paper_title', '') if 'paper_title' in original_df.columns else '',
|
| 171 |
-
'has_rebuttal': bool(
|
| 172 |
}
|
| 173 |
|
| 174 |
all_scored_reviews.append({
|
|
@@ -202,34 +208,20 @@ if __name__ == "__main__":
|
|
| 202 |
years, all_scored_reviews_df = load_scored_reviews()
|
| 203 |
print (years)
|
| 204 |
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
|
| 222 |
-
|
| 223 |
-
# --- Testing code ---
|
| 224 |
-
# scored_reviews_2017 = all_scored_reviews_df[all_scored_reviews_df["year"] == 2017]
|
| 225 |
-
# print(scored_reviews_2017)
|
| 226 |
-
# scored_reviews_2017 = scored_reviews_2017["scored_dict"].iloc[0]
|
| 227 |
-
# # scored_reviews_2017 = ast.literal_eval(scored_reviews_2017)
|
| 228 |
-
# print(type(scored_reviews_2017))
|
| 229 |
-
# print(scored_reviews_2017.keys())
|
| 230 |
-
# sample = scored_reviews_2017["https://openreview.net/forum?id=r1rhWnZkg"]
|
| 231 |
-
# print(sample[0])
|
| 232 |
-
|
| 233 |
-
# print(years)
|
| 234 |
-
# for id in scored_reviews_2017.keys():
|
| 235 |
-
# print(len(scored_reviews_2017[id]))
|
|
|
|
| 165 |
review_metadata = {}
|
| 166 |
for _, row in original_df.iterrows():
|
| 167 |
review_id = row["id"]
|
| 168 |
+
rebuttal = row.get('rebuttal', '') if 'rebuttal' in original_df.columns else ''
|
| 169 |
+
# Handle NaN values from pandas
|
| 170 |
+
if pd.isna(rebuttal):
|
| 171 |
+
rebuttal = ''
|
| 172 |
+
rebuttal_str = str(rebuttal) if rebuttal else ''
|
| 173 |
+
|
| 174 |
review_metadata[review_id] = {
|
| 175 |
+
'rebuttal': rebuttal_str,
|
| 176 |
'paper_title': row.get('paper_title', '') if 'paper_title' in original_df.columns else '',
|
| 177 |
+
'has_rebuttal': bool(rebuttal_str.strip()) if rebuttal_str else False,
|
| 178 |
}
|
| 179 |
|
| 180 |
all_scored_reviews.append({
|
|
|
|
| 208 |
years, all_scored_reviews_df = load_scored_reviews()
|
| 209 |
print (years)
|
| 210 |
|
| 211 |
+
# Debugging sample output
|
| 212 |
+
sample_year = 2021
|
| 213 |
+
|
| 214 |
+
sample_df = all_scored_reviews_df[all_scored_reviews_df["year"] == sample_year]
|
| 215 |
+
review_dict = sample_df["scored_dict"].iloc[0]
|
| 216 |
+
|
| 217 |
+
print(f"\n=== Sample Review from {sample_year} ===")
|
| 218 |
+
for review_id, sentence_data_list in review_dict.items():
|
| 219 |
+
print(f"\nReview ID: {review_id}")
|
| 220 |
+
for sentence_dict in sentence_data_list:
|
| 221 |
+
for sentence, data in sentence_dict.items():
|
| 222 |
+
print(f" Sentence: {sentence}")
|
| 223 |
+
for key, value in data.items():
|
| 224 |
+
print(f" → {key}: {value}")
|
| 225 |
+
break # print only the first review's sentences
|
| 226 |
+
break # only one review
|
| 227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|