| """ |
| Fetch ICLR reviews and rebuttals from OpenReview API. |
| Based on DISAPERE data_prep_lib.py patterns. |
| """ |
|
|
| import openreview |
| import pandas as pd |
| from pathlib import Path |
| from tqdm import tqdm |
| import time |
| import os |
| from typing import List, Dict, Optional |
| import logging |
|
|
# Module-scoped logger used by every function and method in this file.
logger = logging.getLogger(__name__)

# Configure the root logger so INFO-level progress messages are emitted.
logging.basicConfig(level=logging.INFO)
|
|
|
|
class ICLRDataFetcher:
    """Fetch ICLR reviews and author rebuttals from the OpenReview API.

    One output row is produced per official review. Paper-level fields
    (title, abstract, meta-review, decision) are repeated on every row that
    belongs to the same paper.
    """

    # Output schema, in column order. Declared once so that a fetch which
    # yields no rows still produces a DataFrame carrying every column.
    COLUMNS = [
        'id',
        'paper_title',
        'abstract',
        'reviewer',
        'review',
        'rating',
        'conf_rev',
        'metareview',
        'conf_meta',
        'recommendation',
        'rebuttal',
    ]

    # Separator placed between several author responses to the same review.
    REBUTTAL_SEPARATOR = "\n\n--- ADDITIONAL RESPONSE ---\n\n"

    def __init__(self, base_url='https://api.openreview.net'):
        """
        Initialize the OpenReview client.

        Args:
            base_url: Base URL of the OpenReview API to connect to.

        Note: No authentication is needed for public ICLR data, so the
        client is created without credentials.
        """
        logger.info(f"Connecting to OpenReview API at {base_url}")

        # Hide any credentials found in the environment while the client is
        # constructed, then put them back untouched.
        # NOTE(review): presumably openreview.Client falls back to these
        # variables when no username/password is passed -- confirm.
        credential_keys = ('OPENREVIEW_USERNAME', 'OPENREVIEW_PASSWORD')
        env_backup = {
            key: os.environ.pop(key)
            for key in credential_keys
            if key in os.environ
        }

        try:
            self.client = openreview.Client(baseurl=base_url)
        finally:
            # Restore even when client construction fails.
            os.environ.update(env_backup)

    def get_venue_id(self, year: int) -> str:
        """Return the OpenReview venue ID for ICLR in the given year."""
        return f'ICLR.cc/{year}/Conference'

    def fetch_submissions(self, venue_id: str) -> List:
        """
        Fetch all submissions for a venue.

        The known submission invitations are tried in order and the first
        one that returns any notes wins.

        Args:
            venue_id: Venue ID as produced by get_venue_id().

        Returns:
            List of submission notes; empty when no invitation matched or
            every request failed.
        """
        logger.info(f"Fetching submissions for {venue_id}")

        invitation_patterns = [
            f'{venue_id}/-/Blind_Submission',
            f'{venue_id}/-/Submission',
        ]

        submissions = []
        for pattern in invitation_patterns:
            try:
                submissions = list(openreview.tools.iterget_notes(
                    self.client,
                    invitation=pattern,
                ))
            except Exception as e:
                # Logged at WARNING: at the INFO level configured by this
                # script a DEBUG message would hide the reason (network,
                # auth, ...) behind a bare "No submissions found".
                logger.warning(f"Pattern {pattern} failed: {e}")
                continue

            if submissions:
                logger.info(f"Found {len(submissions)} submissions with pattern: {pattern}")
                break

        if not submissions:
            logger.warning(f"No submissions found for {venue_id}")

        return submissions

    @staticmethod
    def _invitation_of(note) -> str:
        """Return the note's invitation string, or '' when absent/None."""
        return getattr(note, 'invitation', '') or ''

    @staticmethod
    def _parse_rating(rating_field) -> str:
        """Reduce a rating such as '6: Weak Accept' to its leading score '6'."""
        if rating_field is None:
            # Avoid storing the literal string 'None' as a rating.
            return ''
        if not isinstance(rating_field, str):
            return str(rating_field)
        if ':' in rating_field:
            return rating_field.split(':', 1)[0].strip()
        return rating_field

    def _paper_level_fields(self, forum_notes: List):
        """Return (metareview text, metareview confidence, decision) for a forum."""
        metareview_text = ''
        metareview_conf = ''
        decision = ''

        for note in forum_notes:
            if 'Meta_Review' in self._invitation_of(note):
                metareview_text = note.content.get('metareview', '')
                metareview_conf = note.content.get('confidence', '')
                decision = note.content.get('recommendation', '')
                break

        if not decision:
            # NOTE(review): some ICLR years appear to publish the outcome in
            # a separate Decision note ('decision' field) rather than as a
            # meta-review recommendation -- confirm field names per year.
            for note in forum_notes:
                if 'Decision' in self._invitation_of(note):
                    decision = note.content.get('decision', '')
                    break

        return metareview_text, metareview_conf, decision

    def extract_reviews_and_rebuttals(self, submission) -> List[Dict]:
        """
        Extract reviews and rebuttals from a submission.

        Args:
            submission: Submission note; its ``id`` is used as the forum ID
                and ``content`` supplies title and abstract.

        Returns list of dicts, one per review, with schema:
        - id: forum URL
        - paper_title: title
        - abstract: abstract
        - reviewer: first signature of the review, or 'Anonymous'
        - review: review text
        - rating: leading score of the rating field, as a string
        - conf_rev: confidence
        - metareview: meta-review text (if available)
        - conf_meta: meta-review confidence
        - recommendation: accept/reject
        - rebuttal: author response(s) replying directly to this review
        """
        forum_id = submission.id
        forum_url = f"https://openreview.net/forum?id={forum_id}"

        paper_title = submission.content.get('title', '')
        abstract = submission.content.get('abstract', '')

        # Every note of the forum as returned by one get_notes call.
        # NOTE(review): get_notes may cap the number of results; confirm
        # that very large forums are not truncated.
        forum_notes = self.client.get_notes(forum=forum_id)

        metareview_text, metareview_conf, decision = self._paper_level_fields(forum_notes)

        rows = []
        for review_note in forum_notes:
            if 'Official_Review' not in self._invitation_of(review_note):
                continue

            review_content = review_note.content
            signatures = review_note.signatures

            rows.append({
                'id': forum_url,
                'paper_title': paper_title,
                'abstract': abstract,
                'reviewer': signatures[0] if signatures else 'Anonymous',
                'review': review_content.get('review', ''),
                'rating': self._parse_rating(review_content.get('rating', '')),
                'conf_rev': review_content.get('confidence', ''),
                'metareview': metareview_text,
                'conf_meta': metareview_conf,
                'recommendation': decision,
                'rebuttal': self._find_rebuttal_for_review(forum_notes, review_note.id),
            })

        return rows

    @staticmethod
    def _is_author_note(note) -> bool:
        """Return True when the note is signed by, or reserved for, the authors."""
        signatures = getattr(note, 'signatures', None) or []
        if any('Authors' in sig for sig in signatures):
            return True

        # Literal equivalent of the pattern "Author.*Comment": an invitation
        # whose name says it is an author comment.
        invitation = getattr(note, 'invitation', '') or ''
        author_pos = invitation.find('Author')
        return author_pos != -1 and 'Comment' in invitation[author_pos:]

    def _find_rebuttal_for_review(self, forum_notes: List, review_id: str) -> str:
        """
        Find the author rebuttal(s) that reply to a specific review.

        A note counts as a rebuttal when it:
        1. Replies directly to the review (replyto == review_id), and
        2. Comes from the authors (an 'Authors' signature or an
           author-comment invitation).

        Generic comment invitations alone are not enough: reviewers and
        other participants may also comment on a review, and their replies
        must not be recorded as rebuttals.

        Args:
            forum_notes: All notes of the forum the review belongs to.
            review_id: ID of the review note.

        Returns:
            The rebuttal text; several rebuttals are joined with
            REBUTTAL_SEPARATOR in the order they appear in ``forum_notes``.
            Empty string when none is found.
        """
        rebuttals = []

        for note in forum_notes:
            if getattr(note, 'replyto', None) != review_id:
                continue
            if not self._is_author_note(note):
                continue

            content = getattr(note, 'content', None) or {}
            # First non-empty field wins; the title is a last resort.
            rebuttal_content = (content.get('comment', '') or
                                content.get('rebuttal', '') or
                                content.get('title', ''))
            if rebuttal_content:
                rebuttals.append(rebuttal_content)

        return self.REBUTTAL_SEPARATOR.join(rebuttals)

    def fetch_year(self, year: int, output_path: Optional[Path] = None, limit: Optional[int] = None) -> pd.DataFrame:
        """
        Fetch all ICLR data for a given year.

        Submissions that raise while being processed are logged and skipped.

        Args:
            year: Conference year
            output_path: Where to save CSV (optional); nothing is written
                when no review was extracted
            limit: Limit number of papers to process (for testing)

        Returns:
            DataFrame with all reviews and rebuttals. It always has the
            columns listed in COLUMNS, even when it has no rows.
        """
        logger.info(f"Starting fetch for ICLR {year}")

        venue_id = self.get_venue_id(year)
        submissions = self.fetch_submissions(venue_id)

        if limit is not None:
            logger.info(f"⚠️ TEST MODE: Limiting to {limit} submissions (out of {len(submissions)})")
            submissions = submissions[:limit]

        all_rows = []
        failed = 0

        for submission in tqdm(submissions, desc=f"Processing {year} submissions"):
            try:
                all_rows.extend(self.extract_reviews_and_rebuttals(submission))
            except Exception as e:
                failed += 1
                logger.warning(f"Error processing submission {submission.id}: {e}")
                continue

            # Short pause between forums to go easy on the API.
            time.sleep(0.1)

        if failed:
            logger.warning(f"Skipped {failed} of {len(submissions)} submissions due to errors")

        # Explicit columns keep the schema when nothing was extracted;
        # without them df['rebuttal'] below raises KeyError on an empty fetch.
        df = pd.DataFrame(all_rows, columns=self.COLUMNS)

        logger.info(f"Extracted {len(df)} reviews for {year}")
        logger.info(f"Reviews with rebuttals: {(df['rebuttal'] != '').sum()}")

        if output_path:
            if df.empty:
                # Do not create (or overwrite) a file with a header-only CSV.
                logger.warning(f"No reviews extracted for {year}; not writing {output_path}")
            else:
                output_path.parent.mkdir(parents=True, exist_ok=True)

                import csv
                # Quote every field: review text contains commas and newlines.
                df.to_csv(output_path, index=False, quoting=csv.QUOTE_ALL)
                logger.info(f"Saved to {output_path}")

        return df

    def validate_dataframe(self, df: pd.DataFrame, year: int):
        """
        Validate fetched data quality and log a summary.

        Args:
            df: DataFrame as returned by fetch_year().
            year: Conference year, used in log messages only.

        Raises:
            ValueError: If a required column is missing.
        """
        logger.info(f"Validating data for {year}")

        if len(df) < 100:
            logger.warning(f"Low review count for {year}: {len(df)} (expected 500+)")

        required_cols = ['id', 'paper_title', 'review', 'rating', 'rebuttal']
        missing = [col for col in required_cols if col not in df.columns]
        if missing:
            raise ValueError(f"Missing required columns: {missing}")

        # Missing text is stored as '' by the extractor, and may come back
        # as NaN after a CSV round-trip; count both as missing.
        review_text = df['review'].fillna('').astype(str).str.strip()
        empty_reviews = int((review_text == '').sum())
        if empty_reviews > 0:
            logger.warning(f"{empty_reviews} reviews have missing text")

        total_reviews = len(df)
        rebuttal_text = df['rebuttal'].fillna('').astype(str)
        with_rebuttals = int((rebuttal_text != '').sum())
        rebuttal_pct = (with_rebuttals / total_reviews * 100) if total_reviews > 0 else 0

        logger.info(f"Validation summary for {year}:")
        logger.info(f" Total reviews: {total_reviews}")
        logger.info(f" With rebuttals: {with_rebuttals} ({rebuttal_pct:.1f}%)")
        logger.info(f" Unique papers: {df['id'].nunique()}")
|
|
|
|
def main():
    """
    Command-line entry point: fetch ICLR data with rebuttals.

    Fetches either a single ``--year`` or every year from ``--start-year``
    to ``--end-year`` inclusive, writing one ``all_reviews_<year>.csv`` per
    year into ``--output-dir``. A year that fails is logged and skipped so
    the remaining years are still processed.
    """
    import argparse
    from datetime import datetime

    current_year = datetime.now().year

    parser = argparse.ArgumentParser(
        description='Fetch ICLR review data from OpenReview API'
    )
    parser.add_argument(
        '--year',
        type=int,
        help='Fetch single year only'
    )
    parser.add_argument(
        '--start-year',
        type=int,
        default=2020,
        help='Start year for batch fetch (default: 2020)'
    )
    parser.add_argument(
        '--end-year',
        type=int,
        default=current_year,
        help=f'End year for batch fetch (default: {current_year})'
    )
    parser.add_argument(
        '--output-dir',
        type=Path,
        default=Path(__file__).resolve().parent.parent / 'data',
        help='Output directory for CSV files'
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=None,
        help='Limit number of papers to process (for testing)'
    )

    args = parser.parse_args()

    # --year takes precedence over the start/end range. Compare against None
    # so the choice depends on whether the flag was given, not on its value.
    if args.year is not None:
        years_to_fetch = [args.year]
    else:
        if args.start_year > args.end_year:
            # A reversed range would otherwise silently fetch nothing.
            parser.error(
                f'--start-year ({args.start_year}) must not be after '
                f'--end-year ({args.end_year})'
            )
        years_to_fetch = list(range(args.start_year, args.end_year + 1))

    output_dir = args.output_dir
    output_dir.mkdir(parents=True, exist_ok=True)

    fetcher = ICLRDataFetcher()

    failed_years = []
    for year in years_to_fetch:
        logger.info(f"\n{'='*60}")
        logger.info(f"FETCHING ICLR {year}")
        logger.info('='*60)

        output_path = output_dir / f'all_reviews_{year}.csv'

        try:
            df = fetcher.fetch_year(year, output_path=output_path, limit=args.limit)
            fetcher.validate_dataframe(df, year)
        except Exception as e:
            # Best effort: one bad year must not abort the whole batch.
            logger.error(f"Failed to fetch {year}: {e}")
            failed_years.append(year)
            continue

    # Only claim success when every requested year went through.
    if failed_years:
        logger.warning(f"Data fetching finished with failures for: {failed_years}")
    else:
        logger.info("\n✓ Data fetching complete!")
    logger.info(f"Files saved to: {output_dir}")
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
|
|