"""
Fetch ICLR reviews and rebuttals from OpenReview API.

Based on DISAPERE data_prep_lib.py patterns.
"""

import argparse
import csv
import logging
import os
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

import openreview
import pandas as pd
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Schema of one output row (one row per review). Used to build the DataFrame
# so that the columns exist even when no review was collected.
REVIEW_COLUMNS = [
    'id',
    'paper_title',
    'abstract',
    'reviewer',
    'review',
    'rating',
    'conf_rev',
    'metareview',
    'conf_meta',
    'recommendation',
    'rebuttal',
]

# Environment variables temporarily hidden while the guest client is created.
_CREDENTIAL_ENV_VARS = ('OPENREVIEW_USERNAME', 'OPENREVIEW_PASSWORD')

# Invitation names that identify an author-specific comment. This is applied
# as a regular expression (a plain substring test would never match '.*').
_AUTHOR_INVITATION_RE = re.compile(r'Author.*Comment')

# Separator placed between several author responses to the same review.
_REBUTTAL_SEPARATOR = "\n\n--- ADDITIONAL RESPONSE ---\n\n"


class ICLRDataFetcher:
    """Fetch ICLR reviews and rebuttals from OpenReview API."""

    def __init__(self, base_url='https://api.openreview.net'):
        """
        Initialize OpenReview client.

        Note: No authentication needed for public ICLR data.
        Uses guest client (V1 API) following DISAPERE patterns.

        Args:
            base_url: Base URL passed to ``openreview.Client``.
        """
        logger.info(f"Connecting to OpenReview API at {base_url}")

        # Clear environment credentials to force guest access
        # (V1 Client also picks up these environment variables)
        env_backup = {}
        for key in _CREDENTIAL_ENV_VARS:
            if key in os.environ:
                env_backup[key] = os.environ.pop(key)

        try:
            # Use V1 API client (same as DISAPERE)
            # No credentials needed for guest access to public data
            self.client = openreview.Client(baseurl=base_url)
        finally:
            # Restore the caller's environment even if the client
            # constructor raised.
            os.environ.update(env_backup)

    def get_venue_id(self, year: int) -> str:
        """Get OpenReview venue ID for ICLR in a given year."""
        return f'ICLR.cc/{year}/Conference'

    def fetch_submissions(self, venue_id: str) -> List:
        """
        Fetch all submissions for a venue.

        Invitation names are tried in order and the first one that yields
        at least one note wins.

        Args:
            venue_id: Venue identifier, e.g. the value of ``get_venue_id``.

        Returns:
            List of submission notes; empty if no pattern returned anything.
        """
        logger.info(f"Fetching submissions for {venue_id}")

        # Try different invitation patterns (format changed over years)
        invitation_patterns = [
            f'{venue_id}/-/Blind_Submission',
            f'{venue_id}/-/Submission',
        ]

        submissions = []
        for pattern in invitation_patterns:
            try:
                submissions = list(openreview.tools.iterget_notes(
                    self.client,
                    invitation=pattern
                ))
            except Exception as e:
                # Best effort: move on to the next pattern, but make the
                # failure visible (the logger runs at INFO, so a DEBUG
                # message would be dropped).
                logger.warning(f"Pattern {pattern} failed: {e}")
                continue

            if submissions:
                logger.info(f"Found {len(submissions)} submissions with pattern: {pattern}")
                break

        if not submissions:
            logger.warning(f"No submissions found for {venue_id}")

        return submissions

    @staticmethod
    def _parse_rating(rating_field) -> str:
        """Return the rating as a string, keeping only the part before ':'."""
        if not isinstance(rating_field, str):
            return str(rating_field)
        if ':' in rating_field:
            # Format like "6: Marginally above acceptance threshold"
            return rating_field.split(':')[0].strip()
        return rating_field

    def extract_reviews_and_rebuttals(self, submission) -> List[Dict]:
        """
        Extract reviews and rebuttals from a submission.

        Returns list of dicts, one per review, with schema:
        - id: forum URL
        - paper_title: title
        - abstract: abstract
        - reviewer: reviewer name/ID
        - review: review text
        - rating: numerical rating
        - conf_rev: confidence
        - metareview: meta-review text (if available)
        - conf_meta: meta-review confidence
        - recommendation: accept/reject
        - rebuttal: author response to this review (NEW)
        """
        forum_id = submission.id
        forum_url = f"https://openreview.net/forum?id={forum_id}"

        # Get paper metadata (V1 API format - direct values, not nested dicts)
        paper_title = submission.content.get('title', '')
        abstract = submission.content.get('abstract', '')

        # Get all notes for this forum to find reviews, meta-reviews, and rebuttals
        forum_notes = self.client.get_notes(forum=forum_id)

        def invitation_of(note) -> str:
            # A missing/None invitation must not break the substring tests.
            return getattr(note, 'invitation', None) or ''

        # Find meta-review (the first matching note is used)
        metareview_text = ''
        metareview_conf = ''
        decision = ''
        for note in forum_notes:
            if 'Meta_Review' in invitation_of(note):
                metareview_text = note.content.get('metareview', '')
                metareview_conf = note.content.get('confidence', '')
                decision = note.content.get('recommendation', '')
                break

        # Find all official reviews
        reviews = [
            note for note in forum_notes
            if 'Official_Review' in invitation_of(note)
        ]

        rows = []
        for review_note in reviews:
            # Extract review content (V1 API - direct values)
            review_content = review_note.content

            # Get reviewer signature
            reviewer = review_note.signatures[0] if review_note.signatures else 'Anonymous'

            rows.append({
                'id': forum_url,
                'paper_title': paper_title,
                'abstract': abstract,
                'reviewer': reviewer,
                'review': review_content.get('review', ''),
                'rating': self._parse_rating(review_content.get('rating', '')),
                'conf_rev': review_content.get('confidence', ''),
                'metareview': metareview_text,
                'conf_meta': metareview_conf,
                'recommendation': decision,
                # Author response to this review
                'rebuttal': self._find_rebuttal_for_review(forum_notes, review_note.id),
            })

        return rows

    def _find_rebuttal_for_review(self, forum_notes: List, review_id: str) -> str:
        """
        Find author rebuttal that replies to a specific review.

        A note counts as a rebuttal when it:
        1. Replies directly to the review (``replyto == review_id``), and
        2. Is posted by the authors: one of its signatures contains
           'Authors', or its invitation matches ``Author.*Comment``.

        Generic comment invitations alone are NOT sufficient, because
        reviewers and area chairs use them as well.

        If multiple rebuttals found, concatenate with separator.
        If none found, return empty string.
        """
        rebuttals = []

        for note in forum_notes:
            # Only direct replies to the review are considered.
            if getattr(note, 'replyto', None) != review_id:
                continue

            invitation = getattr(note, 'invitation', None) or ''
            signatures = getattr(note, 'signatures', None) or []

            is_author_reply = (
                any('Authors' in sig for sig in signatures)
                or _AUTHOR_INVITATION_RE.search(invitation) is not None
            )
            if not is_author_reply:
                continue

            # Extract comment text (field name varies: 'comment', 'rebuttal', 'title')
            content = note.content or {}
            rebuttal_content = (
                content.get('comment', '')
                or content.get('rebuttal', '')
                or content.get('title', '')
            )
            if rebuttal_content:
                rebuttals.append(rebuttal_content)

        # join() yields '' for no rebuttal and the bare text for exactly one.
        return _REBUTTAL_SEPARATOR.join(rebuttals)

    def fetch_year(self, year: int, output_path: Optional[Path] = None,
                   limit: Optional[int] = None) -> pd.DataFrame:
        """
        Fetch all ICLR data for a given year.

        Args:
            year: Conference year
            output_path: Where to save CSV (optional)
            limit: Limit number of papers to process (for testing)

        Returns:
            DataFrame with all reviews and rebuttals. It always has the
            columns in ``REVIEW_COLUMNS``, even when no review was found.
        """
        logger.info(f"Starting fetch for ICLR {year}")

        venue_id = self.get_venue_id(year)
        submissions = self.fetch_submissions(venue_id)

        # Limit submissions for testing if specified
        if limit is not None:
            logger.info(f"⚠️ TEST MODE: Limiting to {limit} submissions (out of {len(submissions)})")
            submissions = submissions[:limit]

        all_rows = []

        for submission in tqdm(submissions, desc=f"Processing {year} submissions"):
            try:
                all_rows.extend(self.extract_reviews_and_rebuttals(submission))
                # Rate limiting: sleep briefly between submissions
                time.sleep(0.1)
            except Exception as e:
                # One bad forum must not abort the whole year.
                logger.warning(f"Error processing submission {submission.id}: {e}")

        # Explicit columns keep the schema stable for an empty result;
        # without them df['rebuttal'] below raises KeyError.
        df = pd.DataFrame(all_rows, columns=REVIEW_COLUMNS)

        logger.info(f"Extracted {len(df)} reviews for {year}")
        logger.info(f"Reviews with rebuttals: {(df['rebuttal'] != '').sum()}")

        # Save to CSV if path provided
        if output_path:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            # Use QUOTE_ALL to properly escape newlines and quotes in text fields
            df.to_csv(output_path, index=False, quoting=csv.QUOTE_ALL)
            logger.info(f"Saved to {output_path}")

        return df

    def validate_dataframe(self, df: pd.DataFrame, year: int):
        """
        Validate fetched data quality and log a summary.

        Args:
            df: DataFrame as returned by ``fetch_year``.
            year: Conference year (used in log messages only).

        Raises:
            ValueError: If a required column is missing from ``df``.
        """
        logger.info(f"Validating data for {year}")

        # Check row count
        if len(df) < 100:
            logger.warning(f"Low review count for {year}: {len(df)} (expected 500+)")

        # Check required columns
        required_cols = ['id', 'paper_title', 'review', 'rating', 'rebuttal']
        missing = [col for col in required_cols if col not in df.columns]
        if missing:
            raise ValueError(f"Missing required columns: {missing}")

        # Check for empty reviews. Missing text is stored as '' by the
        # extractor, so blank strings are counted in addition to NaN.
        review_col = df['review']
        is_blank = review_col.isna() | (review_col.astype(str).str.strip() == '')
        empty_reviews = int(is_blank.sum())
        if empty_reviews > 0:
            logger.warning(f"{empty_reviews} reviews have missing text")

        # Rebuttal statistics
        total_reviews = len(df)
        with_rebuttals = (df['rebuttal'] != '').sum()
        rebuttal_pct = (with_rebuttals / total_reviews * 100) if total_reviews > 0 else 0

        logger.info(f"Validation summary for {year}:")
        logger.info(f" Total reviews: {total_reviews}")
        logger.info(f" With rebuttals: {with_rebuttals} ({rebuttal_pct:.1f}%)")
        logger.info(f" Unique papers: {df['id'].nunique()}")


def main(argv: Optional[List[str]] = None):
    """
    Fetch ICLR data with rebuttals.

    Args:
        argv: Command-line arguments to parse; ``None`` (the default) makes
            argparse read ``sys.argv``.
    """
    current_year = datetime.now().year

    parser = argparse.ArgumentParser(
        description='Fetch ICLR review data from OpenReview API'
    )
    parser.add_argument(
        '--year',
        type=int,
        help='Fetch single year only'
    )
    parser.add_argument(
        '--start-year',
        type=int,
        default=2020,
        help='Start year for batch fetch (default: 2020)'
    )
    parser.add_argument(
        '--end-year',
        type=int,
        default=current_year,
        help=f'End year for batch fetch (default: {current_year})'
    )
    parser.add_argument(
        '--output-dir',
        type=Path,
        default=Path(__file__).resolve().parent.parent / 'data',
        help='Output directory for CSV files'
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=None,
        help='Limit number of papers to process (for testing)'
    )

    args = parser.parse_args(argv)

    # Determine years to fetch
    if args.year:
        years_to_fetch = [args.year]
    else:
        years_to_fetch = list(range(args.start_year, args.end_year + 1))

    output_dir = args.output_dir
    output_dir.mkdir(parents=True, exist_ok=True)

    # Initialize fetcher
    fetcher = ICLRDataFetcher()

    failed_years = []

    # Fetch each year; a failure in one year does not stop the others.
    for year in years_to_fetch:
        logger.info(f"\n{'='*60}")
        logger.info(f"FETCHING ICLR {year}")
        logger.info('='*60)

        output_path = output_dir / f'all_reviews_{year}.csv'

        try:
            df = fetcher.fetch_year(year, output_path=output_path, limit=args.limit)
            fetcher.validate_dataframe(df, year)
        except Exception as e:
            logger.error(f"Failed to fetch {year}: {e}")
            failed_years.append(year)

    if failed_years:
        # Do not report success when some (or all) years failed.
        logger.warning(f"Data fetching finished with failures for years: {failed_years}")
    else:
        logger.info("\n✓ Data fetching complete!")
    logger.info(f"Files saved to: {output_dir}")


if __name__ == '__main__':
    main()