Spaces:

Sina1138
/

ReView

Paused

App Files Files Community

Sina1138 committed on Feb 6

Commit

e425a8a

1 Parent(s): 9068195

Add scoring utilities and unified scoring pipeline for ICLR review data

Browse files

Files changed (6) hide show

dependencies/scoring_utils.py +262 -0
interface/Demo.py +13 -11
run_polarity_scoring.py +215 -0
run_scoring.py +222 -0
run_topic_scoring.py +218 -0
scored_reviews_builder.py +24 -32

dependencies/scoring_utils.py ADDED Viewed

	@@ -0,0 +1,262 @@

+"""
+Shared utilities for polarity and topic scoring pipelines.
+Provides common functions for model loading, prediction, and result saving.
+"""
+import re
+import torch
+import pandas as pd
+from pathlib import Path
+from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+def find_available_years(data_dir: Path) -> list:
+    """
+    Discover which review years have preprocessed data on disk.
+    Looks in ``data_dir`` for files named ``all_reviews_<YYYY>.csv`` and
+    extracts the four-digit year from each matching file name.
+    Args:
+        data_dir: Directory that holds the processed review CSV files
+    Returns:
+        Years found, as integers in ascending order (empty list when the
+        directory does not exist or holds no matching files)
+    """
+    if not data_dir.exists():
+        return []
+    year_pattern = re.compile(r'all_reviews_(\d{4})\.csv')
+    hits = (year_pattern.search(candidate.name) for candidate in data_dir.glob("all_reviews_*.csv"))
+    # Files caught by the glob but without a 4-digit year yield None and are dropped.
+    return sorted(int(hit.group(1)) for hit in hits if hit)
+def load_model_and_tokenizer(model_dir: Path, device: str = "cuda"):
+    """
+    Load a sequence-classification model and its tokenizer from a local directory.
+    Args:
+        model_dir: Directory containing config.json plus the model weights
+            (pytorch_model.bin or model.safetensors, single-file or sharded)
+        device: Requested device ("cuda" or "cpu"); CPU is used whenever CUDA
+            is not available
+    Returns:
+        Tuple of (tokenizer, model, device_obj); the model is in eval mode and
+        already moved to device_obj
+    Raises:
+        FileNotFoundError: If model directory doesn't exist or is missing model files
+        RuntimeError: If the tokenizer or model cannot be loaded
+    """
+    if not model_dir.exists():
+        raise FileNotFoundError(f"Model directory not found: {model_dir}")
+    if not (model_dir / "config.json").exists():
+        raise FileNotFoundError(f"Missing config.json in {model_dir}")
+    # Checkpoints are not always a single pytorch_model.bin: they may be stored
+    # as safetensors and/or split into shards described by an index file.
+    # Requiring only the .bin file rejected otherwise loadable directories.
+    weight_files = (
+        "pytorch_model.bin",
+        "model.safetensors",
+        "pytorch_model.bin.index.json",
+        "model.safetensors.index.json",
+    )
+    if not any((model_dir / name).exists() for name in weight_files):
+        raise FileNotFoundError(
+            f"Missing model weights ({' or '.join(weight_files[:2])}) in {model_dir}"
+        )
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(str(model_dir))
+        model = AutoModelForSequenceClassification.from_pretrained(str(model_dir))
+        model.eval()
+        # Honour the requested device only when CUDA is actually usable.
+        device_obj = torch.device(device if torch.cuda.is_available() else "cpu")
+        model.to(device_obj)
+        return tokenizer, model, device_obj
+    except Exception as e:
+        # Chain explicitly so the original traceback is kept as the cause.
+        raise RuntimeError(f"Failed to load model from {model_dir}: {e}") from e
+def predict_batch(sentences: list, tokenizer, model, device, max_length: int = 512, batch_size: int = 32) -> list:
+    """
+    Run predictions on a list of sentences, in mini-batches.
+    Args:
+        sentences: List of sentence strings to predict
+        tokenizer: Tokenizer instance
+        model: Model instance
+        device: Device object for computation
+        max_length: Maximum token length (default: 512 for BERT-like models)
+        batch_size: Maximum number of sentences per forward pass (default: 32)
+    Returns:
+        List of predicted class IDs (integers), one per input sentence, in input order
+    Raises:
+        ValueError: If batch_size is smaller than 1
+        RuntimeError: If tokenization or the forward pass fails
+    """
+    if not sentences:
+        return []
+    if batch_size < 1:
+        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
+    sentences = list(sentences)
+    predictions = []
+    try:
+        with torch.no_grad():
+            # Feed the model bounded chunks: a single review can contain a very
+            # large number of sentences, and one forward pass over all of them
+            # (each padded up to max_length tokens) can exhaust device memory.
+            for start in range(0, len(sentences), batch_size):
+                chunk = sentences[start:start + batch_size]
+                inputs = tokenizer(
+                    chunk,
+                    return_tensors="pt",
+                    padding=True,
+                    truncation=True,
+                    max_length=max_length
+                ).to(device)
+                outputs = model(**inputs)
+                predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
+    except Exception as e:
+        raise RuntimeError(f"Prediction failed: {e}") from e
+    return predictions
+def save_polarity_results(output_path: Path, results: list) -> None:
+    """
+    Save polarity scoring results to CSV.
+    Expected result format (one dict per scored sentence):
+    [
+        {"id": review_id, "sentence": sentence_text, "polarity": int},
+        ...
+    ]
+    Args:
+        output_path: Path to output CSV file (parent directories are created)
+        results: List of result dictionaries; may be empty
+    """
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    if results:
+        df = pd.DataFrame(results)
+    else:
+        # An empty DataFrame() would be written as a file with no header,
+        # which pd.read_csv refuses to parse. Emit the header row instead so
+        # that "nothing scored" still produces a readable CSV.
+        df = pd.DataFrame(columns=["id", "sentence", "polarity"])
+    df.to_csv(output_path, index=False)
+def save_topic_results(output_path: Path, results: list) -> None:
+    """
+    Save topic scoring results to CSV.
+    Expected result format (one dict per scored sentence):
+    [
+        {"id": review_id, "sentence": sentence_text, "topic_id": int, "topic": str},
+        ...
+    ]
+    Args:
+        output_path: Path to output CSV file (parent directories are created)
+        results: List of result dictionaries; may be empty
+    """
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    if results:
+        df = pd.DataFrame(results)
+    else:
+        # An empty DataFrame() would be written as a file with no header,
+        # which pd.read_csv refuses to parse. Emit the header row instead so
+        # that "nothing scored" still produces a readable CSV.
+        df = pd.DataFrame(columns=["id", "sentence", "topic_id", "topic"])
+    df.to_csv(output_path, index=False)
+def validate_input_file(input_path: Path, required_columns: list) -> pd.DataFrame:
+    """
+    Read an input CSV and make sure it carries every column the caller needs.
+    Args:
+        input_path: Location of the CSV file to load
+        required_columns: Column names that have to be present in the file
+    Returns:
+        The parsed DataFrame
+    Raises:
+        FileNotFoundError: If input_path does not exist
+        ValueError: If the file cannot be parsed as CSV, or a required column is absent
+    """
+    if not input_path.exists():
+        raise FileNotFoundError(f"Input file not found: {input_path}")
+    try:
+        frame = pd.read_csv(input_path)
+    except Exception as e:
+        # Any parser failure is reported to callers as ValueError.
+        raise ValueError(f"Failed to read CSV {input_path}: {e}")
+    missing_cols = set(required_columns) - set(frame.columns)
+    if not missing_cols:
+        return frame
+    raise ValueError(f"Missing required columns: {missing_cols}")
+def load_polarity_model(model_variant: str, base_dir: Path, device: str = "cuda"):
+    """
+    Resolve a polarity model variant to its checkpoint directory and load it.
+    Variant directories, relative to base_dir:
+      - "scibert": scibert/scibert_polarity/final_model
+      - "deberta": alternative_polarity/deberta/deberta_v3_base_polarity_final_model
+      - "scideberta": alternative_polarity/scideberta/scideberta_full_polarity_final_model
+    Args:
+        model_variant: Name of model variant
+        base_dir: Base directory of project
+        device: Device to load onto
+    Returns:
+        The result of load_model_and_tokenizer for the resolved directory
+    Raises:
+        ValueError: If model_variant not supported
+        FileNotFoundError: If model directory doesn't exist
+    """
+    alternatives = base_dir / "alternative_polarity"
+    checkpoints = {
+        "scibert": base_dir / "scibert" / "scibert_polarity" / "final_model",
+        "deberta": alternatives / "deberta" / "deberta_v3_base_polarity_final_model",
+        "scideberta": alternatives / "scideberta" / "scideberta_full_polarity_final_model",
+    }
+    if model_variant in checkpoints:
+        return load_model_and_tokenizer(checkpoints[model_variant], device)
+    supported = list(checkpoints.keys())
+    raise ValueError(
+        f"Unknown polarity model variant: {model_variant}. "
+        f"Supported: {supported}"
+    )
+def load_topic_model(model_variant: str, base_dir: Path, device: str = "cuda"):
+    """
+    Resolve a topic model variant to its checkpoint directory and load it.
+    Variant directories, relative to base_dir:
+      - "scibert": scibert/scibert_topic/final_model
+      - "deberta": alternative_topic/deberta/final_model
+      - "scideberta": alternative_topic/scideberta/final_model
+    Args:
+        model_variant: Name of model variant
+        base_dir: Base directory of project
+        device: Device to load onto
+    Returns:
+        The result of load_model_and_tokenizer for the resolved directory
+    Raises:
+        ValueError: If model_variant not supported
+        FileNotFoundError: If model directory doesn't exist
+    """
+    alternatives = base_dir / "alternative_topic"
+    checkpoints = {
+        "scibert": base_dir / "scibert" / "scibert_topic" / "final_model",
+        "deberta": alternatives / "deberta" / "final_model",
+        "scideberta": alternatives / "scideberta" / "final_model",
+    }
+    if model_variant in checkpoints:
+        return load_model_and_tokenizer(checkpoints[model_variant], device)
+    supported = list(checkpoints.keys())
+    raise ValueError(
+        f"Unknown topic model variant: {model_variant}. "
+        f"Supported: {supported}"
+    )
+# Topic label mapping
+# Position in the tuple is the numeric topic ID; the value is its display label.
+TOPIC_ID_TO_LABEL = dict(enumerate((
+    "Substance",
+    "Clarity",
+    "Soundness/Correctness",
+    "Originality",
+    "Motivation/Impact",
+    "Meaningful Comparison",
+    "Replicability",
+    "NONE",
+)))
+# Reverse lookup: display label -> numeric topic ID.
+TOPIC_LABEL_TO_ID = {label: topic_id for topic_id, label in TOPIC_ID_TO_LABEL.items()}

interface/Demo.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import math
 import sys, os.path
 import torch
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
@@ -11,20 +11,15 @@ from dependencies.rsa_reranker import RSAReranking
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
 import pandas as pd
-from pathlib import Path
 import ast
 from tqdm import tqdm
-from scored_reviews_builder import load_scored_reviews
 from dependencies.Glimpse_tokenizer import glimpse_tokenizer
 # from scibert.scibert_polarity.scibert_polarity import predict_polarity
-# Load scored reviews - LEGACY (2017-2021)
-years_legacy, df_legacy = load_scored_reviews()
-# Load new reviews with rebuttals (2022-2025) - if available
 def load_scored_reviews_with_rebuttals(
-    csv_path: Path = BASE_DIR / "data" / "preprocessed_scored_reviews_2022-2025.csv"
 ):
     """Load 2022-2025 dataset with rebuttal metadata."""
     if not csv_path.exists():
@@ -47,8 +42,13 @@ def load_scored_reviews_with_rebuttals(
 years_new, df_new = load_scored_reviews_with_rebuttals()
-# For backward compatibility, use legacy as default
-years, all_scored_reviews_df = years_legacy, df_legacy
 # -----------------------------------
 # Pre-processed Tab
@@ -311,7 +311,9 @@ with gr.Blocks(title="ReView") as demo:
     # -----------------------------------
     with gr.Tab("Pre-processed Reviews"):
         # Initialize state for this session.
-        initial_year = 2017
         initial_scored_reviews = get_preprocessed_scores(initial_year)
         initial_review_ids = list(initial_scored_reviews.keys())
         initial_review = initial_scored_reviews[initial_review_ids[0]]

 import math
 import sys, os.path
+from pathlib import Path
 import torch
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
 import pandas as pd
 import ast
 from tqdm import tqdm
 from dependencies.Glimpse_tokenizer import glimpse_tokenizer
 # from scibert.scibert_polarity.scibert_polarity import predict_polarity
+# Load new reviews with rebuttals (2020-2025) - if available
 def load_scored_reviews_with_rebuttals(
+    csv_path: Path = BASE_DIR / "data" / "preprocessed_scored_reviews_2020-2025.csv"
 ):
     """Load 2022-2025 dataset with rebuttal metadata."""
     if not csv_path.exists():
 years_new, df_new = load_scored_reviews_with_rebuttals()
+if df_new.empty:
+    raise FileNotFoundError(
+        "New dataset not found or empty. Expected data/preprocessed_scored_reviews_2020-2025.csv"
+    )
+# Use new data only
+years, all_scored_reviews_df = years_new, df_new
 # -----------------------------------
 # Pre-processed Tab
     # -----------------------------------
     with gr.Tab("Pre-processed Reviews"):
         # Initialize state for this session.
+        if not years:
+            raise ValueError("No years available in new dataset")
+        initial_year = years[0]
         initial_scored_reviews = get_preprocessed_scores(initial_year)
         initial_review_ids = list(initial_scored_reviews.keys())
         initial_review = initial_scored_reviews[initial_review_ids[0]]

run_polarity_scoring.py ADDED Viewed

	@@ -0,0 +1,215 @@

+#!/usr/bin/env python3
+"""
+Clean polarity scoring pipeline for ICLR review data.
+Supports multiple model variants (SciBERT, DeBERTa, SciDeBERTa) and auto-detects available years.
+"""
+import argparse
+import sys
+import torch
+from pathlib import Path
+from tqdm import tqdm
+# Add parent directory to path for imports
+sys.path.insert(0, str(Path(__file__).parent))
+from config import Config
+from dependencies.Glimpse_tokenizer import glimpse_tokenizer
+from dependencies.scoring_utils import (
+    find_available_years,
+    load_polarity_model,
+    predict_batch,
+    save_polarity_results,
+    validate_input_file,
+)
+def score_reviews_polarity(
+    year: int,
+    model_variant: str = "scibert",
+    device: str = "cuda",
+    input_dir: Path = None,
+    output_dir: Path = None,
+    skip_if_exists: bool = True,
+    limit: int = None,
+) -> Path:
+    """
+    Score reviews for polarity using specified model variant.
+    Reads all_reviews_<year>.csv from input_dir, splits each review's text
+    into sentences, predicts one class ID per sentence and writes rows with
+    the columns "id", "sentence" and "polarity" to
+    polarity_scored_reviews_<year>.csv in output_dir.
+    Args:
+        year: Year of reviews to score
+        model_variant: Model to use ("scibert", "deberta", "scideberta")
+        device: Device for computation ("cuda" or "cpu")
+        input_dir: Directory containing preprocessed reviews
+            (default: Config.BASE_DIR / "data" / "processed")
+        output_dir: Directory to save scored results (default: Config.POLARITY_DIR)
+        skip_if_exists: Skip if output already exists
+        limit: Limit to first N reviews (None = process all)
+    Returns:
+        Path to output CSV file
+    Raises:
+        FileNotFoundError: If the input CSV or the model directory is missing
+        ValueError: If the input CSV is unreadable or lacks "id"/"text", or
+            the model variant is unknown
+        RuntimeError: If prediction fails for a review
+    """
+    if input_dir is None:
+        input_dir = Config.BASE_DIR / "data" / "processed"
+    if output_dir is None:
+        output_dir = Config.POLARITY_DIR
+    output_path = output_dir / f"polarity_scored_reviews_{year}.csv"
+    # Skip if already exists and not forced
+    # NOTE(review): the output name does not depend on `limit`, so a file
+    # written by a limited run is reused here by a later unlimited run.
+    if skip_if_exists and output_path.exists():
+        print(f"⏩ Polarity scores already exist for {year}: {output_path}")
+        return output_path
+    print(f"\n{'='*60}")
+    print(f"Polarity Scoring: {year}")
+    print(f"  Model: {model_variant}")
+    print(f"  Device: {device}")
+    if limit:
+        print(f"  Limit: {limit} reviews")
+    print(f"{'='*60}")
+    # Validate input file
+    input_path = input_dir / f"all_reviews_{year}.csv"
+    try:
+        df = validate_input_file(input_path, required_columns=["id", "text"])
+    except (FileNotFoundError, ValueError) as e:
+        print(f"✗ Input validation failed: {e}")
+        raise
+    # Apply limit if specified
+    if limit:
+        df = df.head(limit)
+        print(f"Limited to {len(df)} reviews")
+    # Load model
+    try:
+        print(f"Loading {model_variant} model...")
+        tokenizer, model, device_obj = load_polarity_model(
+            model_variant, Config.BASE_DIR, device
+        )
+    except (ValueError, FileNotFoundError) as e:
+        print(f"✗ Model loading failed: {e}")
+        raise
+    # Process reviews
+    all_results = []
+    skipped = 0
+    # Only two columns are needed, so iterate them directly rather than
+    # building a Series per row with iterrows().
+    for review_id, text in tqdm(zip(df["id"], df["text"]), total=len(df), desc="Processing reviews"):
+        # pandas reads an empty CSV cell as float NaN; the sentence tokenizer
+        # expects a string, so such rows are skipped instead of failing the year.
+        if not isinstance(text, str):
+            skipped += 1
+            continue
+        # Tokenize into sentences
+        sentences = glimpse_tokenizer(text)
+        if not sentences:
+            continue
+        # Predict polarity for all sentences in batch
+        try:
+            predictions = predict_batch(sentences, tokenizer, model, device_obj)
+        except RuntimeError as e:
+            print(f"✗ Prediction failed for review {review_id}: {e}")
+            raise
+        # Store results
+        for sentence, polarity_label in zip(sentences, predictions):
+            all_results.append({
+                "id": review_id,
+                "sentence": sentence,
+                "polarity": polarity_label,
+            })
+    if skipped:
+        print(f"  Skipped {skipped} reviews with missing text")
+    # Save results
+    try:
+        save_polarity_results(output_path, all_results)
+        print(f"✓ Polarity scores saved: {output_path}")
+        print(f"  Scored sentences: {len(all_results)}")
+    except Exception as e:
+        print(f"✗ Failed to save results: {e}")
+        raise
+    return output_path
+def main():
+    """
+    Command-line entry point for polarity scoring.
+    Scores the year given with --year, or every year detected in
+    data/processed when no year is given, and exits with status 1 if any
+    year failed.
+    """
+    cli = argparse.ArgumentParser(description="Polarity scoring pipeline for ICLR review data")
+    cli.add_argument("--year", type=int, help="Single year to process (if not specified, auto-detects all available years)")
+    cli.add_argument("--model", type=str, default="scibert", choices=["scibert", "deberta", "scideberta"], help="Model variant to use (default: scibert)")
+    cli.add_argument("--device", type=str, default="cuda", choices=["cuda", "cpu"], help="Device for computation (default: cuda)")
+    cli.add_argument("--force", action="store_true", help="Force reprocessing even if results exist")
+    args = cli.parse_args()
+    rule = "=" * 60
+    # An explicit --year wins; otherwise scan the processed-data directory.
+    years = [args.year] if args.year else find_available_years(Config.BASE_DIR / "data" / "processed")
+    if not years:
+        print("⚠️  No preprocessed data found in data/processed/")
+        print("   Run preprocess_data.py first")
+        return
+    # Print summary
+    print(f"\n{rule}")
+    print("Polarity Scoring Pipeline")
+    print(f"Years: {years}")
+    print(f"Model: {args.model}")
+    print(f"Device: {args.device}")
+    print(rule)
+    # Process each year; one failing year must not stop the others.
+    failed_years = []
+    for year in years:
+        try:
+            score_reviews_polarity(year, model_variant=args.model, device=args.device, skip_if_exists=not args.force)
+        except Exception as e:
+            print(f"\n⚠️  Failed to process {year}: {e}")
+            failed_years.append(year)
+    succeeded = len(years) - len(failed_years)
+    # Final summary
+    print(f"\n{rule}")
+    print("Pipeline Summary")
+    print(rule)
+    print(f"✓ Successful: {succeeded}/{len(years)} years")
+    if failed_years:
+        print(f"✗ Failed: {failed_years}")
+    print(f"{rule}\n")
+    # Exit with error if any failed
+    if failed_years:
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

run_scoring.py ADDED Viewed

	@@ -0,0 +1,222 @@

+#!/usr/bin/env python3
+"""
+Unified scoring pipeline - End-to-end data pipeline for ICLR review analysis.
+Runs all scoring steps (GLIMPSE, polarity, topic) and builds final integrated dataset.
+Automatically skips existing results unless --force is used.
+Usage:
+    python run_scoring.py --year 2020              # Score single year
+    python run_scoring.py                          # Auto-detect all available years
+    python run_scoring.py --force                  # Reprocess everything
+    python run_scoring.py --skip-glimpse           # Skip GLIMPSE, just polarity/topic
+"""
+import argparse
+import sys
+from pathlib import Path
+# Add parent directory to path for imports
+sys.path.insert(0, str(Path(__file__).parent))
+from config import Config
+from dependencies.scoring_utils import find_available_years
+# Import scoring functions
+from run_glimpse_scoring import run_glimpse_pipeline
+from run_polarity_scoring import score_reviews_polarity
+from run_topic_scoring import score_reviews_topic
+from scored_reviews_builder import build_2020_2025_dataset
+def run_full_pipeline(
+    year: int,
+    model_variant_polarity: str = "scibert",
+    model_variant_topic: str = "scibert",
+    device: str = "cuda",
+    skip_if_exists: bool = True,
+    skip_glimpse: bool = False,
+    limit: int = None,
+) -> bool:
+    """
+    Run the four scoring stages for one year: GLIMPSE, polarity, topic, dataset build.
+    Any exception raised by a stage is caught, reported on stdout, and turned
+    into a False return value.
+    Args:
+        year: Year to process
+        model_variant_polarity: Polarity model ("scibert", "deberta", "scideberta")
+        model_variant_topic: Topic model ("scibert", "deberta", "scideberta")
+        device: Device for computation ("cuda" or "cpu")
+        skip_if_exists: Passed to each stage so existing results are reused
+        skip_glimpse: Skip GLIMPSE scoring step
+        limit: Limit to first N reviews for the polarity and topic stages (None = process all)
+    Returns:
+        True if successful, False if failed
+    """
+    hashes = "#" * 60
+    rule = "=" * 60
+    suffix = "" if not limit else f" (limit: {limit})"
+    print(f"\n{hashes}")
+    print(f"# Full Scoring Pipeline: {year}{suffix}")
+    print(hashes)
+    try:
+        # Step 1: GLIMPSE Scoring
+        if skip_glimpse:
+            print("\n[1/4] Skipping GLIMPSE scoring (--skip-glimpse)")
+        else:
+            print("\n[1/4] GLIMPSE Scoring...")
+            run_glimpse_pipeline(year, model_name="facebook/bart-large-cnn", device=device, skip_if_exists=skip_if_exists)
+        # Step 2: Polarity Scoring
+        print(f"\n[2/4] Polarity Scoring ({model_variant_polarity})...")
+        score_reviews_polarity(year, model_variant=model_variant_polarity, device=device, skip_if_exists=skip_if_exists, limit=limit)
+        # Step 3: Topic Scoring
+        print(f"\n[3/4] Topic Scoring ({model_variant_topic})...")
+        score_reviews_topic(year, model_variant=model_variant_topic, device=device, skip_if_exists=skip_if_exists, limit=limit)
+        # Step 4: Build Final Dataset (always rebuild to ensure latest data)
+        print("\n[4/4] Building Final Integrated Dataset...")
+        build_2020_2025_dataset()
+    except Exception as e:
+        print(f"\n{rule}")
+        print(f"✗ Pipeline failed for {year}: {e}")
+        print(rule)
+        return False
+    print(f"\n{rule}")
+    print(f"✓ Pipeline complete for {year}")
+    print(rule)
+    return True
+def main():
+    """
+    Command-line entry point for the unified scoring pipeline.
+    Runs run_full_pipeline for the year given with --year, or for every year
+    detected in data/processed, and exits with status 1 if any year failed.
+    """
+    variants = ["scibert", "deberta", "scideberta"]
+    cli = argparse.ArgumentParser(description="Unified scoring pipeline - End-to-end processing for all review data")
+    cli.add_argument("--year", type=int, help="Single year to process (if not specified, auto-detects all available years)")
+    cli.add_argument("--model-polarity", type=str, default="scibert", choices=variants, help="Model variant for polarity scoring (default: scibert)")
+    cli.add_argument("--model-topic", type=str, default="scibert", choices=variants, help="Model variant for topic scoring (default: scibert)")
+    cli.add_argument("--device", type=str, default="cuda", choices=["cuda", "cpu"], help="Device for computation (default: cuda)")
+    cli.add_argument("--force", action="store_true", help="Force reprocessing even if results exist")
+    cli.add_argument("--skip-glimpse", action="store_true", help="Skip GLIMPSE scoring (assume results already exist)")
+    cli.add_argument("--limit", type=int, default=None, help="Limit to first N reviews (None = process all)")
+    args = cli.parse_args()
+    rule = "=" * 60
+    # An explicit --year wins; otherwise scan the processed-data directory.
+    years = [args.year] if args.year else find_available_years(Config.BASE_DIR / "data" / "processed")
+    if not years:
+        print("⚠️  No preprocessed data found in data/processed/")
+        print("   Run preprocess_data.py first")
+        return
+    # Print summary
+    print(f"\n{rule}")
+    print("Unified Scoring Pipeline")
+    print(rule)
+    print(f"Years: {years}")
+    print(f"Polarity model: {args.model_polarity}")
+    print(f"Topic model: {args.model_topic}")
+    print(f"Device: {args.device}")
+    print(f"Skip if exists: {not args.force}")
+    print(f"Include GLIMPSE: {not args.skip_glimpse}")
+    if args.limit:
+        print(f"Limit: {args.limit} reviews per year")
+    print(rule)
+    # Process each year; run_full_pipeline reports failure via its return value.
+    failed_years = []
+    for year in years:
+        finished = run_full_pipeline(
+            year,
+            model_variant_polarity=args.model_polarity,
+            model_variant_topic=args.model_topic,
+            device=args.device,
+            skip_if_exists=not args.force,
+            skip_glimpse=args.skip_glimpse,
+            limit=args.limit,
+        )
+        if not finished:
+            failed_years.append(year)
+    succeeded = len(years) - len(failed_years)
+    # Final summary
+    print(f"\n{rule}")
+    print("Pipeline Summary")
+    print(rule)
+    print(f"✓ Successful: {succeeded}/{len(years)} years")
+    if failed_years:
+        print(f"✗ Failed: {failed_years}")
+    print("\n📊 Final dataset: data/preprocessed_scored_reviews_2020-2025.csv")
+    print("   Ready for interface: python interface/Demo.py")
+    print(f"{rule}\n")
+    # Exit with error if any failed
+    if failed_years:
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

run_topic_scoring.py ADDED Viewed

	@@ -0,0 +1,218 @@

+#!/usr/bin/env python3
+"""
+Clean topic scoring pipeline for ICLR review data.
+Supports multiple model variants (SciBERT, DeBERTa, SciDeBERTa) and auto-detects available years.
+"""
+import argparse
+import sys
+import torch
+from pathlib import Path
+from tqdm import tqdm
+# Add parent directory to path for imports
+sys.path.insert(0, str(Path(__file__).parent))
+from config import Config
+from dependencies.Glimpse_tokenizer import glimpse_tokenizer
+from dependencies.scoring_utils import (
+    find_available_years,
+    load_topic_model,
+    predict_batch,
+    save_topic_results,
+    validate_input_file,
+    TOPIC_ID_TO_LABEL,
+)
+def score_reviews_topic(
+    year: int,
+    model_variant: str = "scibert",
+    device: str = "cuda",
+    input_dir: Path = None,
+    output_dir: Path = None,
+    skip_if_exists: bool = True,
+    limit: int = None,
+) -> Path:
+    """
+    Score reviews for topic using specified model variant.
+    Reads all_reviews_<year>.csv from input_dir, splits each review's text
+    into sentences, predicts one topic ID per sentence and writes rows with
+    the columns "id", "sentence", "topic_id" and "topic" to
+    topic_scored_reviews_<year>.csv in output_dir.
+    Args:
+        year: Year of reviews to score
+        model_variant: Model to use ("scibert", "deberta", "scideberta")
+        device: Device for computation ("cuda" or "cpu")
+        input_dir: Directory containing preprocessed reviews
+            (default: Config.BASE_DIR / "data" / "processed")
+        output_dir: Directory to save scored results (default: Config.TOPIC_DIR)
+        skip_if_exists: Skip if output already exists
+        limit: Limit to first N reviews (None = process all)
+    Returns:
+        Path to output CSV file
+    Raises:
+        FileNotFoundError: If the input CSV or the model directory is missing
+        ValueError: If the input CSV is unreadable or lacks "id"/"text", or
+            the model variant is unknown
+        RuntimeError: If prediction fails for a review
+    """
+    if input_dir is None:
+        input_dir = Config.BASE_DIR / "data" / "processed"
+    if output_dir is None:
+        output_dir = Config.TOPIC_DIR
+    output_path = output_dir / f"topic_scored_reviews_{year}.csv"
+    # Skip if already exists and not forced
+    # NOTE(review): the output name does not depend on `limit`, so a file
+    # written by a limited run is reused here by a later unlimited run.
+    if skip_if_exists and output_path.exists():
+        print(f"⏩ Topic scores already exist for {year}: {output_path}")
+        return output_path
+    print(f"\n{'='*60}")
+    print(f"Topic Scoring: {year}")
+    print(f"  Model: {model_variant}")
+    print(f"  Device: {device}")
+    if limit:
+        print(f"  Limit: {limit} reviews")
+    print(f"{'='*60}")
+    # Validate input file
+    input_path = input_dir / f"all_reviews_{year}.csv"
+    try:
+        df = validate_input_file(input_path, required_columns=["id", "text"])
+    except (FileNotFoundError, ValueError) as e:
+        print(f"✗ Input validation failed: {e}")
+        raise
+    # Apply limit if specified
+    if limit:
+        df = df.head(limit)
+        print(f"Limited to {len(df)} reviews")
+    # Load model
+    try:
+        print(f"Loading {model_variant} model...")
+        tokenizer, model, device_obj = load_topic_model(
+            model_variant, Config.BASE_DIR, device
+        )
+    except (ValueError, FileNotFoundError) as e:
+        print(f"✗ Model loading failed: {e}")
+        raise
+    # Process reviews
+    all_results = []
+    skipped = 0
+    # Only two columns are needed, so iterate them directly rather than
+    # building a Series per row with iterrows().
+    for review_id, text in tqdm(zip(df["id"], df["text"]), total=len(df), desc="Processing reviews"):
+        # pandas reads an empty CSV cell as float NaN; the sentence tokenizer
+        # expects a string, so such rows are skipped instead of failing the year.
+        if not isinstance(text, str):
+            skipped += 1
+            continue
+        # Tokenize into sentences
+        sentences = glimpse_tokenizer(text)
+        if not sentences:
+            continue
+        # Predict topic for all sentences in batch
+        try:
+            predictions = predict_batch(sentences, tokenizer, model, device_obj)
+        except RuntimeError as e:
+            print(f"✗ Prediction failed for review {review_id}: {e}")
+            raise
+        # Store results with both numeric ID and label; IDs outside the
+        # known mapping are labelled "UNKNOWN" rather than raising.
+        for sentence, topic_id in zip(sentences, predictions):
+            topic_label = TOPIC_ID_TO_LABEL.get(topic_id, "UNKNOWN")
+            all_results.append({
+                "id": review_id,
+                "sentence": sentence,
+                "topic_id": topic_id,
+                "topic": topic_label,
+            })
+    if skipped:
+        print(f"  Skipped {skipped} reviews with missing text")
+    # Save results
+    try:
+        save_topic_results(output_path, all_results)
+        print(f"✓ Topic scores saved: {output_path}")
+        print(f"  Scored sentences: {len(all_results)}")
+    except Exception as e:
+        print(f"✗ Failed to save results: {e}")
+        raise
+    return output_path
+def main():
+    """
+    Command-line entry point for topic scoring.
+    Scores the year given with --year, or every year detected in
+    data/processed when no year is given, and exits with status 1 if any
+    year failed.
+    """
+    cli = argparse.ArgumentParser(description="Topic scoring pipeline for ICLR review data")
+    cli.add_argument("--year", type=int, help="Single year to process (if not specified, auto-detects all available years)")
+    cli.add_argument("--model", type=str, default="scibert", choices=["scibert", "deberta", "scideberta"], help="Model variant to use (default: scibert)")
+    cli.add_argument("--device", type=str, default="cuda", choices=["cuda", "cpu"], help="Device for computation (default: cuda)")
+    cli.add_argument("--force", action="store_true", help="Force reprocessing even if results exist")
+    args = cli.parse_args()
+    rule = "=" * 60
+    # An explicit --year wins; otherwise scan the processed-data directory.
+    years = [args.year] if args.year else find_available_years(Config.BASE_DIR / "data" / "processed")
+    if not years:
+        print("⚠️  No preprocessed data found in data/processed/")
+        print("   Run preprocess_data.py first")
+        return
+    # Print summary
+    print(f"\n{rule}")
+    print("Topic Scoring Pipeline")
+    print(f"Years: {years}")
+    print(f"Model: {args.model}")
+    print(f"Device: {args.device}")
+    print(rule)
+    # Process each year; one failing year must not stop the others.
+    failed_years = []
+    for year in years:
+        try:
+            score_reviews_topic(year, model_variant=args.model, device=args.device, skip_if_exists=not args.force)
+        except Exception as e:
+            print(f"\n⚠️  Failed to process {year}: {e}")
+            failed_years.append(year)
+    succeeded = len(years) - len(failed_years)
+    # Final summary
+    print(f"\n{rule}")
+    print("Pipeline Summary")
+    print(rule)
+    print(f"✓ Successful: {succeeded}/{len(years)} years")
+    if failed_years:
+        print(f"✗ Failed: {failed_years}")
+    print(f"{rule}\n")
+    # Exit with error if any failed
+    if failed_years:
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

scored_reviews_builder.py CHANGED Viewed

@@ -165,10 +165,16 @@ def build_2020_2025_dataset(
             review_metadata = {}
             for _, row in original_df.iterrows():
                 review_id = row["id"]
                 review_metadata[review_id] = {
-                    'rebuttal': row.get('rebuttal', ''),
                     'paper_title': row.get('paper_title', '') if 'paper_title' in original_df.columns else '',
-                    'has_rebuttal': bool(row.get('rebuttal', '').strip()) if 'rebuttal' in original_df.columns else False,
                 }
             all_scored_reviews.append({
@@ -202,34 +208,20 @@ if __name__ == "__main__":
         years, all_scored_reviews_df = load_scored_reviews()
         print (years)
-    # Debugging sample output
-    sample_year = 2021
-    sample_df = all_scored_reviews_df[all_scored_reviews_df["year"] == sample_year]
-    review_dict = sample_df["scored_dict"].iloc[0]
-    print(f"\n=== Sample Review from {sample_year} ===")
-    for review_id, sentence_data_list in review_dict.items():
-        print(f"\nReview ID: {review_id}")
-        for sentence_dict in sentence_data_list:
-            for sentence, data in sentence_dict.items():
-                print(f"  Sentence: {sentence}")
-                for key, value in data.items():
-                    print(f"    → {key}: {value}")
-            break  # print only the first review's sentences
-        break  # only one review
-    # --- Testing code ---
-    # scored_reviews_2017 = all_scored_reviews_df[all_scored_reviews_df["year"] == 2017]
-    # print(scored_reviews_2017)
-    # scored_reviews_2017 = scored_reviews_2017["scored_dict"].iloc[0]
-    # # scored_reviews_2017 = ast.literal_eval(scored_reviews_2017)
-    # print(type(scored_reviews_2017))
-    # print(scored_reviews_2017.keys())
-    # sample = scored_reviews_2017["https://openreview.net/forum?id=r1rhWnZkg"]
-    # print(sample[0])
-    # print(years)
-    # for id in scored_reviews_2017.keys():
-    #     print(len(scored_reviews_2017[id]))

             review_metadata = {}
             for _, row in original_df.iterrows():
                 review_id = row["id"]
+                rebuttal = row.get('rebuttal', '') if 'rebuttal' in original_df.columns else ''
+                # Handle NaN values from pandas
+                if pd.isna(rebuttal):
+                    rebuttal = ''
+                rebuttal_str = str(rebuttal) if rebuttal else ''
                 review_metadata[review_id] = {
+                    'rebuttal': rebuttal_str,
                     'paper_title': row.get('paper_title', '') if 'paper_title' in original_df.columns else '',
+                    'has_rebuttal': bool(rebuttal_str.strip()) if rebuttal_str else False,
                 }
             all_scored_reviews.append({
         years, all_scored_reviews_df = load_scored_reviews()
         print (years)
+        # Debugging sample output
+        sample_year = 2021
+        sample_df = all_scored_reviews_df[all_scored_reviews_df["year"] == sample_year]
+        review_dict = sample_df["scored_dict"].iloc[0]
+        print(f"\n=== Sample Review from {sample_year} ===")
+        for review_id, sentence_data_list in review_dict.items():
+            print(f"\nReview ID: {review_id}")
+            for sentence_dict in sentence_data_list:
+                for sentence, data in sentence_dict.items():
+                    print(f"  Sentence: {sentence}")
+                    for key, value in data.items():
+                        print(f"    → {key}: {value}")
+                break  # print only the first review's sentences
+            break  # only one review