#!/usr/bin/env python3
"""
Standalone preprocessing script for ICLR data with rebuttal support.

Keeps glimpse-ui independent from the glimpse repository.
"""

import os
import re
import sys
from pathlib import Path

import pandas as pd

# Ensure sibling modules and project root are importable
_dir = Path(__file__).resolve().parent
sys.path[:0] = [str(_dir), str(_dir.parent)]

from config import Config

# Convenience alias
BASE_DIR = Config.BASE_DIR

# Compiled once at import time; clean_text() runs once per row.
_HASH_SEPARATOR_LINE = re.compile(r'#{2,}[\s]*\n')
_HASH_RUN = re.compile(r'#{2,}')
_EXCESS_BLANK_LINES = re.compile(r'\n\s*\n\s*\n+')
_YEAR_FILE_PATTERN = re.compile(r'all_reviews_(\d{4})\.csv')

# Columns every raw all_reviews_{year}.csv must provide.
_REQUIRED_COLUMNS = ('id', 'review', 'metareview')


def clean_text(text):
    """
    Clean review/rebuttal text by removing formatting artifacts.

    Removes:
    - Hash separator lines (##########...)
    - Any remaining run of two or more '#' characters
    - Runs of two or more blank lines (collapsed to a single blank line)
    - Leading/trailing whitespace

    Args:
        text: Raw text from OpenReview. Non-string values (for example the
            NaN pandas produces for empty cells) are treated as empty.

    Returns:
        Cleaned text with normalized whitespace, or "" when the input is not
        a string or contains only whitespace.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    # Remove hash separator lines (common in rebuttals). The trailing
    # whitespace/newline is consumed so the separator leaves one newline.
    text = _HASH_SEPARATOR_LINE.sub('\n', text)
    # Remove any other run of two or more hashes. NOTE: this also strips the
    # leading markers of markdown headings such as "## Title".
    text = _HASH_RUN.sub('', text)

    # Collapse excessive blank lines down to a single blank line
    text = _EXCESS_BLANK_LINES.sub('\n\n', text)

    # Strip leading/trailing whitespace
    return text.strip()


def preprocess_reviews_with_rebuttals(year: int, input_dir: Path = None, output_dir: Path = None):
    """
    Preprocess raw review data for a given year, including rebuttals.

    Reads ``all_reviews_{year}.csv`` from ``input_dir``, keeps the ``id``,
    ``review``, ``metareview`` and (when present) ``rebuttal`` columns, renames
    ``review`` -> ``text`` and ``metareview`` -> ``gold``, cleans the ``text``
    and ``rebuttal`` columns with ``clean_text`` and writes the result to
    ``output_dir`` under the same file name.

    Args:
        year: Year to process
        input_dir: Directory containing raw all_reviews_{year}.csv files
            (defaults to BASE_DIR / "data")
        output_dir: Directory to write processed files
            (defaults to BASE_DIR / "data" / "processed")

    Returns:
        True when the processed file was written; False when the input file
        does not exist or lacks one of the required columns.
    """
    if input_dir is None:
        input_dir = BASE_DIR / "data"
    if output_dir is None:
        output_dir = BASE_DIR / "data" / "processed"

    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    input_file = input_dir / f"all_reviews_{year}.csv"
    output_file = output_dir / f"all_reviews_{year}.csv"

    if not input_file.exists():
        print(f"⚠️ Skipping {year}: {input_file} not found")
        return False

    print(f"Processing {year}...")
    dataset = pd.read_csv(input_file)

    # A malformed file should be skipped like a missing one rather than
    # aborting the whole multi-year run with a KeyError.
    missing = [col for col in _REQUIRED_COLUMNS if col not in dataset.columns]
    if missing:
        print(f"⚠️ Skipping {year}: {input_file} is missing column(s) {missing}")
        return False

    # Check if rebuttal column exists
    has_rebuttal = 'rebuttal' in dataset.columns
    selected = list(_REQUIRED_COLUMNS)
    if has_rebuttal:
        selected.append('rebuttal')

    # Non-inplace rename returns a new frame, so the assignments below never
    # write into a slice of `dataset` (avoids SettingWithCopyWarning).
    sub_dataset = dataset[selected].rename(columns={
        "review": "text",
        "metareview": "gold",
    })

    if has_rebuttal:
        print(f" ✓ Found {len(dataset)} reviews with rebuttals")
    else:
        # Fallback for data without rebuttals (legacy compatibility)
        sub_dataset['rebuttal'] = ''
        print(f" ✓ Found {len(dataset)} reviews (no rebuttals)")

    # Clean text and rebuttal columns
    print(" → Cleaning review text...")
    sub_dataset['text'] = sub_dataset['text'].apply(clean_text)
    sub_dataset['rebuttal'] = sub_dataset['rebuttal'].apply(clean_text)

    sub_dataset.to_csv(output_file, index=False)
    print(f" → Saved to {output_file}")
    return True


def find_available_years(data_dir: Path = None):
    """
    Auto-detect years by scanning data directory for all_reviews_YYYY.csv files.

    Args:
        data_dir: Directory to scan (defaults to BASE_DIR / "data")

    Returns:
        Sorted list of the four-digit years (as ints) found in matching
        file names.
    """
    if data_dir is None:
        data_dir = BASE_DIR / "data"

    years = []
    for file in data_dir.glob("all_reviews_*.csv"):
        match = _YEAR_FILE_PATTERN.search(file.name)
        if match:
            years.append(int(match.group(1)))

    return sorted(years)


def main():
    """Preprocess one year (--year) or all years auto-detected from the data directory."""
    import argparse

    parser = argparse.ArgumentParser(
        description='Preprocess ICLR review data with rebuttal support'
    )
    parser.add_argument('--year', type=int, help='Process single year only')
    args = parser.parse_args()

    # Compare against None: a bare truthiness test would treat `--year 0` as
    # "no year given" and silently process every year instead.
    if args.year is not None:
        # Process single year
        print(f"\nProcessing {args.year}...")
        if preprocess_reviews_with_rebuttals(args.year):
            print(f"✓ Successfully preprocessed {args.year}")
        else:
            print(f"✗ Failed to preprocess {args.year}")
        return

    # Auto-detect and process all available years
    available_years = find_available_years()

    if not available_years:
        print("⚠️ No data files found in data/ directory")
        print(" Run fetch_iclr_data.py first to download data")
        return

    print(f"\n{'='*60}")
    print("Preprocessing ICLR data")
    print(f"Auto-detected years: {available_years}")
    print(f"{'='*60}\n")

    processed_count = 0
    for year in available_years:
        if preprocess_reviews_with_rebuttals(year):
            processed_count += 1

    print(f"\n{'='*60}")
    print(f"✓ Preprocessing complete: {processed_count}/{len(available_years)} years processed")
    print(f"{'='*60}\n")


if __name__ == "__main__":
    main()