Sina1138 commited on
Commit
e425a8a
·
1 Parent(s): 9068195

Add scoring utilities and unified scoring pipeline for ICLR review data

Browse files
dependencies/scoring_utils.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Shared utilities for polarity and topic scoring pipelines.
3
+ Provides common functions for model loading, prediction, and result saving.
4
+ """
5
+
6
+ import re
7
+ import torch
8
+ import pandas as pd
9
+ from pathlib import Path
10
+ from tqdm import tqdm
11
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
12
+
13
+
14
def find_available_years(data_dir: Path) -> list:
    """
    Discover which review years are present in a processed-data directory.

    A year is recognised from any file matching ``all_reviews_*.csv`` whose
    name contains ``all_reviews_<4 digits>.csv``.

    Args:
        data_dir: Directory expected to hold the ``all_reviews_<year>.csv`` files.

    Returns:
        The detected years as integers in ascending order; an empty list when
        the directory does not exist or holds no matching file.
    """
    if not data_dir.exists():
        return []

    year_pattern = re.compile(r'all_reviews_(\d{4})\.csv')
    hits = (
        year_pattern.search(entry.name)
        for entry in data_dir.glob("all_reviews_*.csv")
    )
    return sorted(int(hit.group(1)) for hit in hits if hit)
32
+
33
+
34
def load_model_and_tokenizer(model_dir: Path, device: str = "cuda"):
    """
    Load a sequence-classification model and its tokenizer from a local directory.

    Args:
        model_dir: Directory holding the model: ``config.json`` plus a weights
            file (``pytorch_model.bin``, ``model.safetensors`` or the index
            file of a sharded checkpoint).
        device: Requested device ("cuda" or "cpu"). Falls back to CPU when
            CUDA is not available.

    Returns:
        Tuple of (tokenizer, model, device_obj). The model is switched to eval
        mode and moved to ``device_obj``.

    Raises:
        FileNotFoundError: If the directory, ``config.json`` or every known
            weights file is missing.
        RuntimeError: If the tokenizer or model cannot be loaded.
    """
    if not model_dir.exists():
        raise FileNotFoundError(f"Model directory not found: {model_dir}")

    if not (model_dir / "config.json").exists():
        raise FileNotFoundError(f"Missing config.json in {model_dir}")

    # Checkpoints may be stored as a legacy .bin file, as safetensors, or
    # sharded (in which case only the index file has a fixed name).
    weight_files = (
        "pytorch_model.bin",
        "model.safetensors",
        "pytorch_model.bin.index.json",
        "model.safetensors.index.json",
    )
    if not any((model_dir / name).exists() for name in weight_files):
        raise FileNotFoundError(
            f"Missing model weights in {model_dir} "
            f"(expected one of: {', '.join(weight_files)})"
        )

    try:
        tokenizer = AutoTokenizer.from_pretrained(str(model_dir))
        model = AutoModelForSequenceClassification.from_pretrained(str(model_dir))
        model.eval()

        # Honour the requested device only when CUDA is actually usable.
        device_obj = torch.device(device if torch.cuda.is_available() else "cpu")
        model.to(device_obj)

        return tokenizer, model, device_obj

    except Exception as e:
        raise RuntimeError(f"Failed to load model from {model_dir}: {e}") from e
70
+
71
+
72
def predict_batch(
    sentences: list,
    tokenizer,
    model,
    device,
    max_length: int = 512,
    batch_size: int = None,
) -> list:
    """
    Predict a class ID for every sentence in a list.

    Args:
        sentences: List of sentence strings to predict
        tokenizer: Tokenizer instance
        model: Model instance
        device: Device object for computation
        max_length: Maximum token length (default: 512 for BERT-like models)
        batch_size: Maximum number of sentences per forward pass. ``None``
            (default) sends all sentences through the model at once; set a
            positive value to bound memory use on very long inputs.

    Returns:
        List of predicted class IDs (integers), in the order of ``sentences``

    Raises:
        ValueError: If ``batch_size`` is given but not positive
        RuntimeError: If tokenization or the forward pass fails
    """
    if not sentences:
        return []

    if batch_size is not None and batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    step = len(sentences) if batch_size is None else batch_size
    predictions = []

    try:
        with torch.no_grad():
            for start in range(0, len(sentences), step):
                inputs = tokenizer(
                    sentences[start:start + step],
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=max_length
                ).to(device)

                outputs = model(**inputs)
                # Highest-scoring class per row; moved to CPU for plain ints.
                predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())

    except Exception as e:
        raise RuntimeError(f"Prediction failed: {e}") from e

    return predictions
106
+
107
+
108
def _write_results_csv(output_path: Path, results: list, columns: list) -> None:
    """Write result rows to CSV, emitting a header-only file when there are no rows."""
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # An empty DataFrame without columns is written as a header-less file that
    # pd.read_csv cannot parse, so fall back to the expected header.
    df = pd.DataFrame(results) if results else pd.DataFrame(columns=columns)
    df.to_csv(output_path, index=False)


def save_polarity_results(output_path: Path, results: list) -> None:
    """
    Save polarity scoring results to CSV.

    Expected result format (as produced by the polarity scoring pipeline):
        [
            {"id": review_id, "sentence": sentence_text, "polarity": int},
            ...
        ]

    Parent directories are created when missing. When ``results`` is empty a
    CSV containing only the header ``id,sentence,polarity`` is written.

    Args:
        output_path: Path to output CSV file
        results: List of result dictionaries
    """
    _write_results_csv(output_path, results, ["id", "sentence", "polarity"])


def save_topic_results(output_path: Path, results: list) -> None:
    """
    Save topic scoring results to CSV.

    Expected result format (as produced by the topic scoring pipeline):
        [
            {"id": review_id, "sentence": sentence_text, "topic_id": int, "topic": str},
            ...
        ]

    Parent directories are created when missing. When ``results`` is empty a
    CSV containing only the header ``id,sentence,topic_id,topic`` is written.

    Args:
        output_path: Path to output CSV file
        results: List of result dictionaries
    """
    _write_results_csv(output_path, results, ["id", "sentence", "topic_id", "topic"])
144
+
145
+
146
def validate_input_file(input_path: Path, required_columns: list) -> pd.DataFrame:
    """
    Load a CSV file and verify that it contains the required columns.

    Args:
        input_path: Path to CSV file
        required_columns: List of column names that must exist

    Returns:
        Loaded DataFrame

    Raises:
        FileNotFoundError: If file doesn't exist
        ValueError: If the file cannot be parsed as CSV (the original error is
            attached as ``__cause__``) or required columns are missing
    """
    if not input_path.exists():
        raise FileNotFoundError(f"Input file not found: {input_path}")

    try:
        df = pd.read_csv(input_path)
    except Exception as e:
        raise ValueError(f"Failed to read CSV {input_path}: {e}") from e

    # Sorted so the error message is stable across runs (set order is not).
    missing_cols = sorted(set(required_columns) - set(df.columns), key=str)
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    return df
174
+
175
+
176
def load_polarity_model(model_variant: str, base_dir: Path, device: str = "cuda"):
    """
    Factory function to load polarity model by variant name.

    Supported variants (paths relative to ``base_dir``):
        - "scibert": scibert/scibert_polarity/final_model
        - "deberta": alternative_polarity/deberta/deberta_v3_base_polarity_final_model
        - "scideberta": alternative_polarity/scideberta/scideberta_full_polarity_final_model

    Args:
        model_variant: Name of model variant
        base_dir: Base directory of project
        device: Device to load onto

    Returns:
        Whatever ``load_model_and_tokenizer`` returns for the resolved
        directory: a tuple of (tokenizer, model, device_obj)

    Raises:
        ValueError: If model_variant not supported
        FileNotFoundError: If model directory doesn't exist
    """
    variant_map = {
        "scibert": base_dir / "scibert" / "scibert_polarity" / "final_model",
        "deberta": base_dir / "alternative_polarity" / "deberta" / "deberta_v3_base_polarity_final_model",
        "scideberta": base_dir / "alternative_polarity" / "scideberta" / "scideberta_full_polarity_final_model",
    }

    if model_variant not in variant_map:
        raise ValueError(
            f"Unknown polarity model variant: {model_variant}. "
            f"Supported: {list(variant_map.keys())}"
        )

    return load_model_and_tokenizer(variant_map[model_variant], device)
211
+
212
+
213
def load_topic_model(model_variant: str, base_dir: Path, device: str = "cuda"):
    """
    Resolve a topic-model variant name to its directory and load it.

    Known variants (paths relative to ``base_dir``):
        - "scibert": scibert/scibert_topic/final_model
        - "deberta": alternative_topic/deberta/final_model
        - "scideberta": alternative_topic/scideberta/final_model

    Args:
        model_variant: Name of the variant to load
        base_dir: Project root the model directories live under
        device: Device to load the model onto

    Returns:
        The result of ``load_model_and_tokenizer`` for the resolved directory:
        a tuple of (tokenizer, model, device_obj)

    Raises:
        ValueError: If ``model_variant`` is not one of the known variants
        FileNotFoundError: If the model directory doesn't exist
    """
    scibert_root = base_dir / "scibert"
    alternative_root = base_dir / "alternative_topic"
    variant_map = {
        "scibert": scibert_root / "scibert_topic" / "final_model",
        "deberta": alternative_root / "deberta" / "final_model",
        "scideberta": alternative_root / "scideberta" / "final_model",
    }

    chosen_dir = variant_map.get(model_variant)
    if chosen_dir is None:
        supported = list(variant_map.keys())
        raise ValueError(
            f"Unknown topic model variant: {model_variant}. "
            f"Supported: {supported}"
        )

    return load_model_and_tokenizer(chosen_dir, device)
248
+
249
+
250
# Topic label mapping: position in this tuple is the numeric topic ID.
_TOPIC_LABELS = (
    "Substance",
    "Clarity",
    "Soundness/Correctness",
    "Originality",
    "Motivation/Impact",
    "Meaningful Comparison",
    "Replicability",
    "NONE",
)

TOPIC_ID_TO_LABEL = dict(enumerate(_TOPIC_LABELS))

# Reverse lookup: label text -> numeric topic ID.
TOPIC_LABEL_TO_ID = {label: topic_id for topic_id, label in TOPIC_ID_TO_LABEL.items()}
interface/Demo.py CHANGED
@@ -1,6 +1,6 @@
1
  import math
2
-
3
  import sys, os.path
 
4
 
5
  import torch
6
  sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
@@ -11,20 +11,15 @@ from dependencies.rsa_reranker import RSAReranking
11
  import gradio as gr
12
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
13
  import pandas as pd
14
- from pathlib import Path
15
  import ast
16
  from tqdm import tqdm
17
 
18
- from scored_reviews_builder import load_scored_reviews
19
  from dependencies.Glimpse_tokenizer import glimpse_tokenizer
20
  # from scibert.scibert_polarity.scibert_polarity import predict_polarity
21
 
22
- # Load scored reviews - LEGACY (2017-2021)
23
- years_legacy, df_legacy = load_scored_reviews()
24
-
25
- # Load new reviews with rebuttals (2022-2025) - if available
26
  def load_scored_reviews_with_rebuttals(
27
- csv_path: Path = BASE_DIR / "data" / "preprocessed_scored_reviews_2022-2025.csv"
28
  ):
29
  """Load 2022-2025 dataset with rebuttal metadata."""
30
  if not csv_path.exists():
@@ -47,8 +42,13 @@ def load_scored_reviews_with_rebuttals(
47
 
48
  years_new, df_new = load_scored_reviews_with_rebuttals()
49
 
50
- # For backward compatibility, use legacy as default
51
- years, all_scored_reviews_df = years_legacy, df_legacy
 
 
 
 
 
52
 
53
  # -----------------------------------
54
  # Pre-processed Tab
@@ -311,7 +311,9 @@ with gr.Blocks(title="ReView") as demo:
311
  # -----------------------------------
312
  with gr.Tab("Pre-processed Reviews"):
313
  # Initialize state for this session.
314
- initial_year = 2017
 
 
315
  initial_scored_reviews = get_preprocessed_scores(initial_year)
316
  initial_review_ids = list(initial_scored_reviews.keys())
317
  initial_review = initial_scored_reviews[initial_review_ids[0]]
 
1
  import math
 
2
  import sys, os.path
3
+ from pathlib import Path
4
 
5
  import torch
6
  sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
 
11
  import gradio as gr
12
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
13
  import pandas as pd
 
14
  import ast
15
  from tqdm import tqdm
16
 
 
17
  from dependencies.Glimpse_tokenizer import glimpse_tokenizer
18
  # from scibert.scibert_polarity.scibert_polarity import predict_polarity
19
 
20
+ # Load new reviews with rebuttals (2020-2025) - if available
 
 
 
21
  def load_scored_reviews_with_rebuttals(
22
+ csv_path: Path = BASE_DIR / "data" / "preprocessed_scored_reviews_2020-2025.csv"
23
  ):
24
  """Load 2022-2025 dataset with rebuttal metadata."""
25
  if not csv_path.exists():
 
42
 
43
  years_new, df_new = load_scored_reviews_with_rebuttals()
44
 
45
+ if df_new.empty:
46
+ raise FileNotFoundError(
47
+ "New dataset not found or empty. Expected data/preprocessed_scored_reviews_2020-2025.csv"
48
+ )
49
+
50
+ # Use new data only
51
+ years, all_scored_reviews_df = years_new, df_new
52
 
53
  # -----------------------------------
54
  # Pre-processed Tab
 
311
  # -----------------------------------
312
  with gr.Tab("Pre-processed Reviews"):
313
  # Initialize state for this session.
314
+ if not years:
315
+ raise ValueError("No years available in new dataset")
316
+ initial_year = years[0]
317
  initial_scored_reviews = get_preprocessed_scores(initial_year)
318
  initial_review_ids = list(initial_scored_reviews.keys())
319
  initial_review = initial_scored_reviews[initial_review_ids[0]]
run_polarity_scoring.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Clean polarity scoring pipeline for ICLR review data.
4
+ Supports multiple model variants (SciBERT, DeBERTa, SciDeBERTa) and auto-detects available years.
5
+ """
6
+
7
+ import argparse
8
+ import sys
9
+ import torch
10
+ from pathlib import Path
11
+ from tqdm import tqdm
12
+
13
+ # Add parent directory to path for imports
14
+ sys.path.insert(0, str(Path(__file__).parent))
15
+
16
+ from config import Config
17
+ from dependencies.Glimpse_tokenizer import glimpse_tokenizer
18
+ from dependencies.scoring_utils import (
19
+ find_available_years,
20
+ load_polarity_model,
21
+ predict_batch,
22
+ save_polarity_results,
23
+ validate_input_file,
24
+ )
25
+
26
+
27
def score_reviews_polarity(
    year: int,
    model_variant: str = "scibert",
    device: str = "cuda",
    input_dir: Path = None,
    output_dir: Path = None,
    skip_if_exists: bool = True,
    limit: int = None,
) -> Path:
    """
    Score reviews for polarity using specified model variant.

    Reads ``all_reviews_{year}.csv`` from ``input_dir``, splits each review
    into sentences with ``glimpse_tokenizer``, predicts one class ID per
    sentence and writes ``polarity_scored_reviews_{year}.csv`` to
    ``output_dir`` with the columns ``id``, ``sentence`` and ``polarity``.

    Args:
        year: Year of reviews to score
        model_variant: Model to use ("scibert", "deberta", "scideberta")
        device: Device for computation ("cuda" or "cpu")
        input_dir: Directory containing preprocessed reviews
            (default: ``Config.BASE_DIR / "data" / "processed"``)
        output_dir: Directory to save scored results
            (default: ``Config.POLARITY_DIR``)
        skip_if_exists: Skip if output already exists
        limit: Limit to first N reviews (None = process all)

    Returns:
        Path to output CSV file

    Raises:
        FileNotFoundError: If the input CSV or the model directory is missing
        ValueError: If the input CSV is unreadable, lacks ``id``/``text``, or
            the model variant is unknown
        RuntimeError: If prediction fails for a review
    """
    if input_dir is None:
        input_dir = Config.BASE_DIR / "data" / "processed"
    if output_dir is None:
        output_dir = Config.POLARITY_DIR

    output_path = output_dir / f"polarity_scored_reviews_{year}.csv"

    # Skip if already exists and not forced
    if skip_if_exists and output_path.exists():
        print(f"⏩ Polarity scores already exist for {year}: {output_path}")
        return output_path

    print(f"\n{'='*60}")
    print(f"Polarity Scoring: {year}")
    print(f" Model: {model_variant}")
    print(f" Device: {device}")
    if limit:
        print(f" Limit: {limit} reviews")
    print(f"{'='*60}")

    # Validate input file
    input_path = input_dir / f"all_reviews_{year}.csv"
    try:
        df = validate_input_file(input_path, required_columns=["id", "text"])
    except (FileNotFoundError, ValueError) as e:
        print(f"✗ Input validation failed: {e}")
        raise

    # Apply limit if specified
    if limit:
        df = df.head(limit)
        print(f"Limited to {len(df)} reviews")

    # Load model
    try:
        print(f"Loading {model_variant} model...")
        tokenizer, model, device_obj = load_polarity_model(
            model_variant, Config.BASE_DIR, device
        )
    except (ValueError, FileNotFoundError) as e:
        print(f"✗ Model loading failed: {e}")
        raise

    # Process reviews
    all_results = []

    review_rows = zip(df["id"], df["text"])
    for review_id, text in tqdm(review_rows, total=len(df), desc="Processing reviews"):
        # Empty CSV cells are loaded by pandas as float NaN; only real,
        # non-blank text can be split into sentences.
        if not isinstance(text, str) or not text.strip():
            continue

        # Tokenize into sentences
        sentences = glimpse_tokenizer(text)
        if not sentences:
            continue

        # Predict polarity for all sentences in batch
        try:
            predictions = predict_batch(sentences, tokenizer, model, device_obj)
        except RuntimeError as e:
            print(f"✗ Prediction failed for review {review_id}: {e}")
            raise

        # Store one row per sentence
        all_results.extend(
            {"id": review_id, "sentence": sentence, "polarity": polarity_label}
            for sentence, polarity_label in zip(sentences, predictions)
        )

    # Save results
    try:
        save_polarity_results(output_path, all_results)
        print(f"✓ Polarity scores saved: {output_path}")
        print(f" Scored sentences: {len(all_results)}")
    except Exception as e:
        print(f"✗ Failed to save results: {e}")
        raise

    return output_path
131
+
132
+
133
def main():
    """
    Command-line entry point for polarity scoring.

    Scores the year given with ``--year`` or, when omitted, every year found
    in ``data/processed``. Prints a per-run summary and exits with status 1
    when at least one year failed.
    """
    cli = argparse.ArgumentParser(
        description="Polarity scoring pipeline for ICLR review data"
    )
    cli.add_argument(
        "--year",
        type=int,
        help="Single year to process (if not specified, auto-detects all available years)",
    )
    cli.add_argument(
        "--model",
        type=str,
        default="scibert",
        choices=["scibert", "deberta", "scideberta"],
        help="Model variant to use (default: scibert)",
    )
    cli.add_argument(
        "--device",
        type=str,
        default="cuda",
        choices=["cuda", "cpu"],
        help="Device for computation (default: cuda)",
    )
    cli.add_argument(
        "--force",
        action="store_true",
        help="Force reprocessing even if results exist",
    )
    opts = cli.parse_args()

    # Work out which years to score
    if opts.year:
        targets = [opts.year]
    else:
        targets = find_available_years(Config.BASE_DIR / "data" / "processed")
        if not targets:
            print("⚠️ No preprocessed data found in data/processed/")
            print(" Run preprocess_data.py first")
            return

    rule = "=" * 60

    # Run header
    print(f"\n{rule}")
    print("Polarity Scoring Pipeline")
    print(f"Years: {targets}")
    print(f"Model: {opts.model}")
    print(f"Device: {opts.device}")
    print(rule)

    # Score year by year; one failure must not stop the remaining years
    failures = []
    for target in targets:
        try:
            score_reviews_polarity(
                target,
                model_variant=opts.model,
                device=opts.device,
                skip_if_exists=not opts.force,
            )
        except Exception as err:
            print(f"\n⚠️ Failed to process {target}: {err}")
            failures.append(target)
    succeeded = len(targets) - len(failures)

    # Run footer
    print(f"\n{rule}")
    print("Pipeline Summary")
    print(rule)
    print(f"✓ Successful: {succeeded}/{len(targets)} years")
    if failures:
        print(f"✗ Failed: {failures}")
    print(f"{rule}\n")

    # Non-zero exit status signals partial failure to the caller
    if failures:
        sys.exit(1)
212
+
213
+
214
+ if __name__ == "__main__":
215
+ main()
run_scoring.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Unified scoring pipeline - End-to-end data pipeline for ICLR review analysis.
4
+ Runs all scoring steps (GLIMPSE, polarity, topic) and builds final integrated dataset.
5
+ Automatically skips existing results unless --force is used.
6
+
7
+ Usage:
8
+ python run_scoring.py --year 2020 # Score single year
9
+ python run_scoring.py # Auto-detect all available years
10
+ python run_scoring.py --force # Reprocess everything
11
+ python run_scoring.py --skip-glimpse # Skip GLIMPSE, just polarity/topic
12
+ """
13
+
14
+ import argparse
15
+ import sys
16
+ from pathlib import Path
17
+
18
+ # Add parent directory to path for imports
19
+ sys.path.insert(0, str(Path(__file__).parent))
20
+
21
+ from config import Config
22
+ from dependencies.scoring_utils import find_available_years
23
+
24
+ # Import scoring functions
25
+ from run_glimpse_scoring import run_glimpse_pipeline
26
+ from run_polarity_scoring import score_reviews_polarity
27
+ from run_topic_scoring import score_reviews_topic
28
+ from scored_reviews_builder import build_2020_2025_dataset
29
+
30
+
31
def run_full_pipeline(
    year: int,
    model_variant_polarity: str = "scibert",
    model_variant_topic: str = "scibert",
    device: str = "cuda",
    skip_if_exists: bool = True,
    skip_glimpse: bool = False,
    limit: int = None,
) -> bool:
    """
    Execute all scoring stages for one year, then rebuild the merged dataset.

    Stages, in order: GLIMPSE scoring (optional), polarity scoring, topic
    scoring, and ``build_2020_2025_dataset``. Any exception raised by a stage
    is printed and reported through the return value instead of propagating.

    Args:
        year: Year to process
        model_variant_polarity: Polarity model ("scibert", "deberta", "scideberta")
        model_variant_topic: Topic model ("scibert", "deberta", "scideberta")
        device: Device for computation ("cuda" or "cpu")
        skip_if_exists: Skip stages whose results already exist
        skip_glimpse: Skip the GLIMPSE scoring stage
        limit: Limit polarity/topic scoring to the first N reviews (None = all)

    Returns:
        True if every stage completed, False if any stage raised
    """
    hash_rule = '#' * 60
    rule = '=' * 60
    limit_note = "" if not limit else f" (limit: {limit})"

    print(f"\n{hash_rule}")
    print(f"# Full Scoring Pipeline: {year}{limit_note}")
    print(hash_rule)

    try:
        # Stage 1: GLIMPSE
        if skip_glimpse:
            print("\n[1/4] Skipping GLIMPSE scoring (--skip-glimpse)")
        else:
            print("\n[1/4] GLIMPSE Scoring...")
            run_glimpse_pipeline(
                year,
                model_name="facebook/bart-large-cnn",
                device=device,
                skip_if_exists=skip_if_exists,
            )

        # Options shared by the polarity and topic stages
        shared_options = {
            "device": device,
            "skip_if_exists": skip_if_exists,
            "limit": limit,
        }

        # Stage 2: polarity
        print(f"\n[2/4] Polarity Scoring ({model_variant_polarity})...")
        score_reviews_polarity(year, model_variant=model_variant_polarity, **shared_options)

        # Stage 3: topic
        print(f"\n[3/4] Topic Scoring ({model_variant_topic})...")
        score_reviews_topic(year, model_variant=model_variant_topic, **shared_options)

        # Stage 4: the merged dataset is rebuilt on every run
        print("\n[4/4] Building Final Integrated Dataset...")
        build_2020_2025_dataset()

        print(f"\n{rule}")
        print(f"✓ Pipeline complete for {year}")
        print(rule)

        return True

    except Exception as err:
        print(f"\n{rule}")
        print(f"✗ Pipeline failed for {year}: {err}")
        print(rule)
        return False
109
+
110
+
111
def main():
    """
    Command-line entry point for the unified scoring pipeline.

    Runs ``run_full_pipeline`` for the year given with ``--year`` or, when
    omitted, for every year found in ``data/processed``. Prints a summary and
    exits with status 1 when at least one year failed.
    """
    variant_choices = ["scibert", "deberta", "scideberta"]

    cli = argparse.ArgumentParser(
        description="Unified scoring pipeline - End-to-end processing for all review data"
    )
    cli.add_argument(
        "--year",
        type=int,
        help="Single year to process (if not specified, auto-detects all available years)",
    )
    cli.add_argument(
        "--model-polarity",
        type=str,
        default="scibert",
        choices=list(variant_choices),
        help="Model variant for polarity scoring (default: scibert)",
    )
    cli.add_argument(
        "--model-topic",
        type=str,
        default="scibert",
        choices=list(variant_choices),
        help="Model variant for topic scoring (default: scibert)",
    )
    cli.add_argument(
        "--device",
        type=str,
        default="cuda",
        choices=["cuda", "cpu"],
        help="Device for computation (default: cuda)",
    )
    cli.add_argument(
        "--force",
        action="store_true",
        help="Force reprocessing even if results exist",
    )
    cli.add_argument(
        "--skip-glimpse",
        action="store_true",
        help="Skip GLIMPSE scoring (assume results already exist)",
    )
    cli.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit to first N reviews (None = process all)",
    )
    opts = cli.parse_args()

    # Work out which years to process
    if opts.year:
        targets = [opts.year]
    else:
        targets = find_available_years(Config.BASE_DIR / "data" / "processed")
        if not targets:
            print("⚠️ No preprocessed data found in data/processed/")
            print(" Run preprocess_data.py first")
            return

    rule = "=" * 60
    reuse_existing = not opts.force

    # Run header
    print(f"\n{rule}")
    print("Unified Scoring Pipeline")
    print(rule)
    print(f"Years: {targets}")
    print(f"Polarity model: {opts.model_polarity}")
    print(f"Topic model: {opts.model_topic}")
    print(f"Device: {opts.device}")
    print(f"Skip if exists: {reuse_existing}")
    print(f"Include GLIMPSE: {not opts.skip_glimpse}")
    if opts.limit:
        print(f"Limit: {opts.limit} reviews per year")
    print(rule)

    # Run the pipeline year by year and remember the ones that failed
    failures = []
    for target in targets:
        completed = run_full_pipeline(
            target,
            model_variant_polarity=opts.model_polarity,
            model_variant_topic=opts.model_topic,
            device=opts.device,
            skip_if_exists=reuse_existing,
            skip_glimpse=opts.skip_glimpse,
            limit=opts.limit,
        )
        if not completed:
            failures.append(target)
    succeeded = len(targets) - len(failures)

    # Run footer
    print(f"\n{rule}")
    print("Pipeline Summary")
    print(rule)
    print(f"✓ Successful: {succeeded}/{len(targets)} years")
    if failures:
        print(f"✗ Failed: {failures}")
    print("\n📊 Final dataset: data/preprocessed_scored_reviews_2020-2025.csv")
    print(" Ready for interface: python interface/Demo.py")
    print(f"{rule}\n")

    # Non-zero exit status signals partial failure to the caller
    if failures:
        sys.exit(1)
219
+
220
+
221
+ if __name__ == "__main__":
222
+ main()
run_topic_scoring.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Clean topic scoring pipeline for ICLR review data.
4
+ Supports multiple model variants (SciBERT, DeBERTa, SciDeBERTa) and auto-detects available years.
5
+ """
6
+
7
+ import argparse
8
+ import sys
9
+ import torch
10
+ from pathlib import Path
11
+ from tqdm import tqdm
12
+
13
+ # Add parent directory to path for imports
14
+ sys.path.insert(0, str(Path(__file__).parent))
15
+
16
+ from config import Config
17
+ from dependencies.Glimpse_tokenizer import glimpse_tokenizer
18
+ from dependencies.scoring_utils import (
19
+ find_available_years,
20
+ load_topic_model,
21
+ predict_batch,
22
+ save_topic_results,
23
+ validate_input_file,
24
+ TOPIC_ID_TO_LABEL,
25
+ )
26
+
27
+
28
def score_reviews_topic(
    year: int,
    model_variant: str = "scibert",
    device: str = "cuda",
    input_dir: Path = None,
    output_dir: Path = None,
    skip_if_exists: bool = True,
    limit: int = None,
) -> Path:
    """
    Score reviews for topic using specified model variant.

    Reads ``all_reviews_{year}.csv`` from ``input_dir``, splits each review
    into sentences with ``glimpse_tokenizer``, predicts one topic ID per
    sentence and writes ``topic_scored_reviews_{year}.csv`` to ``output_dir``
    with the columns ``id``, ``sentence``, ``topic_id`` and ``topic``.
    IDs missing from ``TOPIC_ID_TO_LABEL`` are labelled "UNKNOWN".

    Args:
        year: Year of reviews to score
        model_variant: Model to use ("scibert", "deberta", "scideberta")
        device: Device for computation ("cuda" or "cpu")
        input_dir: Directory containing preprocessed reviews
            (default: ``Config.BASE_DIR / "data" / "processed"``)
        output_dir: Directory to save scored results
            (default: ``Config.TOPIC_DIR``)
        skip_if_exists: Skip if output already exists
        limit: Limit to first N reviews (None = process all)

    Returns:
        Path to output CSV file

    Raises:
        FileNotFoundError: If the input CSV or the model directory is missing
        ValueError: If the input CSV is unreadable, lacks ``id``/``text``, or
            the model variant is unknown
        RuntimeError: If prediction fails for a review
    """
    if input_dir is None:
        input_dir = Config.BASE_DIR / "data" / "processed"
    if output_dir is None:
        output_dir = Config.TOPIC_DIR

    output_path = output_dir / f"topic_scored_reviews_{year}.csv"

    # Skip if already exists and not forced
    if skip_if_exists and output_path.exists():
        print(f"⏩ Topic scores already exist for {year}: {output_path}")
        return output_path

    print(f"\n{'='*60}")
    print(f"Topic Scoring: {year}")
    print(f" Model: {model_variant}")
    print(f" Device: {device}")
    if limit:
        print(f" Limit: {limit} reviews")
    print(f"{'='*60}")

    # Validate input file
    input_path = input_dir / f"all_reviews_{year}.csv"
    try:
        df = validate_input_file(input_path, required_columns=["id", "text"])
    except (FileNotFoundError, ValueError) as e:
        print(f"✗ Input validation failed: {e}")
        raise

    # Apply limit if specified
    if limit:
        df = df.head(limit)
        print(f"Limited to {len(df)} reviews")

    # Load model
    try:
        print(f"Loading {model_variant} model...")
        tokenizer, model, device_obj = load_topic_model(
            model_variant, Config.BASE_DIR, device
        )
    except (ValueError, FileNotFoundError) as e:
        print(f"✗ Model loading failed: {e}")
        raise

    # Process reviews
    all_results = []

    review_rows = zip(df["id"], df["text"])
    for review_id, text in tqdm(review_rows, total=len(df), desc="Processing reviews"):
        # Empty CSV cells are loaded by pandas as float NaN; only real,
        # non-blank text can be split into sentences.
        if not isinstance(text, str) or not text.strip():
            continue

        # Tokenize into sentences
        sentences = glimpse_tokenizer(text)
        if not sentences:
            continue

        # Predict topic for all sentences in batch
        try:
            predictions = predict_batch(sentences, tokenizer, model, device_obj)
        except RuntimeError as e:
            print(f"✗ Prediction failed for review {review_id}: {e}")
            raise

        # Store results with both numeric ID and label
        all_results.extend(
            {
                "id": review_id,
                "sentence": sentence,
                "topic_id": topic_id,
                "topic": TOPIC_ID_TO_LABEL.get(topic_id, "UNKNOWN"),
            }
            for sentence, topic_id in zip(sentences, predictions)
        )

    # Save results
    try:
        save_topic_results(output_path, all_results)
        print(f"✓ Topic scores saved: {output_path}")
        print(f" Scored sentences: {len(all_results)}")
    except Exception as e:
        print(f"✗ Failed to save results: {e}")
        raise

    return output_path
134
+
135
+
136
def main():
    """
    Command-line entry point for topic scoring.

    Scores the year given with ``--year`` or, when omitted, every year found
    in ``data/processed``. Prints a per-run summary and exits with status 1
    when at least one year failed.
    """
    cli = argparse.ArgumentParser(
        description="Topic scoring pipeline for ICLR review data"
    )
    cli.add_argument(
        "--year",
        type=int,
        help="Single year to process (if not specified, auto-detects all available years)",
    )
    cli.add_argument(
        "--model",
        type=str,
        default="scibert",
        choices=["scibert", "deberta", "scideberta"],
        help="Model variant to use (default: scibert)",
    )
    cli.add_argument(
        "--device",
        type=str,
        default="cuda",
        choices=["cuda", "cpu"],
        help="Device for computation (default: cuda)",
    )
    cli.add_argument(
        "--force",
        action="store_true",
        help="Force reprocessing even if results exist",
    )
    opts = cli.parse_args()

    # Work out which years to score
    if opts.year:
        targets = [opts.year]
    else:
        targets = find_available_years(Config.BASE_DIR / "data" / "processed")
        if not targets:
            print("⚠️ No preprocessed data found in data/processed/")
            print(" Run preprocess_data.py first")
            return

    rule = "=" * 60

    # Run header
    print(f"\n{rule}")
    print("Topic Scoring Pipeline")
    print(f"Years: {targets}")
    print(f"Model: {opts.model}")
    print(f"Device: {opts.device}")
    print(rule)

    # Score year by year; one failure must not stop the remaining years
    failures = []
    for target in targets:
        try:
            score_reviews_topic(
                target,
                model_variant=opts.model,
                device=opts.device,
                skip_if_exists=not opts.force,
            )
        except Exception as err:
            print(f"\n⚠️ Failed to process {target}: {err}")
            failures.append(target)
    succeeded = len(targets) - len(failures)

    # Run footer
    print(f"\n{rule}")
    print("Pipeline Summary")
    print(rule)
    print(f"✓ Successful: {succeeded}/{len(targets)} years")
    if failures:
        print(f"✗ Failed: {failures}")
    print(f"{rule}\n")

    # Non-zero exit status signals partial failure to the caller
    if failures:
        sys.exit(1)
215
+
216
+
217
+ if __name__ == "__main__":
218
+ main()
scored_reviews_builder.py CHANGED
@@ -165,10 +165,16 @@ def build_2020_2025_dataset(
165
  review_metadata = {}
166
  for _, row in original_df.iterrows():
167
  review_id = row["id"]
 
 
 
 
 
 
168
  review_metadata[review_id] = {
169
- 'rebuttal': row.get('rebuttal', ''),
170
  'paper_title': row.get('paper_title', '') if 'paper_title' in original_df.columns else '',
171
- 'has_rebuttal': bool(row.get('rebuttal', '').strip()) if 'rebuttal' in original_df.columns else False,
172
  }
173
 
174
  all_scored_reviews.append({
@@ -202,34 +208,20 @@ if __name__ == "__main__":
202
  years, all_scored_reviews_df = load_scored_reviews()
203
  print (years)
204
 
205
- # Debugging sample output
206
- sample_year = 2021
207
-
208
- sample_df = all_scored_reviews_df[all_scored_reviews_df["year"] == sample_year]
209
- review_dict = sample_df["scored_dict"].iloc[0]
210
-
211
- print(f"\n=== Sample Review from {sample_year} ===")
212
- for review_id, sentence_data_list in review_dict.items():
213
- print(f"\nReview ID: {review_id}")
214
- for sentence_dict in sentence_data_list:
215
- for sentence, data in sentence_dict.items():
216
- print(f" Sentence: {sentence}")
217
- for key, value in data.items():
218
- print(f" → {key}: {value}")
219
- break # print only the first review's sentences
220
- break # only one review
221
 
222
-
223
- # --- Testing code ---
224
- # scored_reviews_2017 = all_scored_reviews_df[all_scored_reviews_df["year"] == 2017]
225
- # print(scored_reviews_2017)
226
- # scored_reviews_2017 = scored_reviews_2017["scored_dict"].iloc[0]
227
- # # scored_reviews_2017 = ast.literal_eval(scored_reviews_2017)
228
- # print(type(scored_reviews_2017))
229
- # print(scored_reviews_2017.keys())
230
- # sample = scored_reviews_2017["https://openreview.net/forum?id=r1rhWnZkg"]
231
- # print(sample[0])
232
-
233
- # print(years)
234
- # for id in scored_reviews_2017.keys():
235
- # print(len(scored_reviews_2017[id]))
 
165
  review_metadata = {}
166
  for _, row in original_df.iterrows():
167
  review_id = row["id"]
168
+ rebuttal = row.get('rebuttal', '') if 'rebuttal' in original_df.columns else ''
169
+ # Handle NaN values from pandas
170
+ if pd.isna(rebuttal):
171
+ rebuttal = ''
172
+ rebuttal_str = str(rebuttal) if rebuttal else ''
173
+
174
  review_metadata[review_id] = {
175
+ 'rebuttal': rebuttal_str,
176
  'paper_title': row.get('paper_title', '') if 'paper_title' in original_df.columns else '',
177
+ 'has_rebuttal': bool(rebuttal_str.strip()) if rebuttal_str else False,
178
  }
179
 
180
  all_scored_reviews.append({
 
208
  years, all_scored_reviews_df = load_scored_reviews()
209
  print (years)
210
 
211
+ # Debugging sample output
212
+ sample_year = 2021
213
+
214
+ sample_df = all_scored_reviews_df[all_scored_reviews_df["year"] == sample_year]
215
+ review_dict = sample_df["scored_dict"].iloc[0]
216
+
217
+ print(f"\n=== Sample Review from {sample_year} ===")
218
+ for review_id, sentence_data_list in review_dict.items():
219
+ print(f"\nReview ID: {review_id}")
220
+ for sentence_dict in sentence_data_list:
221
+ for sentence, data in sentence_dict.items():
222
+ print(f" Sentence: {sentence}")
223
+ for key, value in data.items():
224
+ print(f" → {key}: {value}")
225
+ break # print only the first review's sentences
226
+ break # only one review
227