| |
| """ |
| Standalone preprocessing script for ICLR data with rebuttal support. |
| Keeps glimpse-ui independent from the glimpse repository. |
| """ |
|
|
| import sys |
| import pandas as pd |
| import os |
| import re |
| from pathlib import Path |
|
|
| |
# Prepend this script's directory and its parent to the module search path
# (presumably so that the ``config`` import that follows can be resolved
# regardless of the current working directory -- confirm against the repo layout).
_dir = Path(__file__).resolve().parent
sys.path[:0] = [str(candidate) for candidate in (_dir, _dir.parent)]
|
|
| from config import Config |
|
|
| |
# Base directory taken from the project Config; the default input/output data
# locations used by the functions in this file are built relative to it.
BASE_DIR = Config.BASE_DIR
|
|
|
|
def clean_text(text):
    """
    Normalize a review/rebuttal string by stripping formatting noise.

    The following substitutions are applied, in order:
    - A run of two or more ``#`` followed by optional whitespace and a
      newline is collapsed to a single newline (separator lines).
    - Any remaining run of two or more ``#`` is deleted (a single ``#``
      is left untouched).
    - Three or more line breaks, possibly interleaved with whitespace,
      are reduced to exactly one blank line.
    Leading and trailing whitespace is then trimmed.

    Args:
        text: Raw text value; anything that is not a non-blank ``str``
            (e.g. ``None`` or a NaN float) is treated as empty.

    Returns:
        The cleaned string, or ``""`` for non-string / blank input.
    """
    if not (isinstance(text, str) and text.strip()):
        return ""

    # (pattern, replacement) pairs; order matters because the second rule
    # would otherwise eat the hashes the first rule anchors on.
    substitutions = (
        (r'#{2,}[\s]*\n', '\n'),
        (r'#{2,}', ''),
        (r'\n\s*\n\s*\n+', '\n\n'),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()
|
|
|
|
def preprocess_reviews_with_rebuttals(year: int,
                                      input_dir: Path = None,
                                      output_dir: Path = None):
    """
    Preprocess raw review data for a given year, including rebuttals.

    Reads ``all_reviews_{year}.csv`` from ``input_dir``, keeps the ``id``,
    ``review``, ``metareview`` and (when present) ``rebuttal`` columns,
    renames ``review`` -> ``text`` and ``metareview`` -> ``gold``, runs
    ``clean_text`` over the ``text`` and ``rebuttal`` columns and writes the
    result to ``output_dir`` under the same file name. When the input has no
    ``rebuttal`` column, an empty-string ``rebuttal`` column is added so the
    output always has the columns ``id, text, gold, rebuttal``.

    Args:
        year: Year to process
        input_dir: Directory containing raw all_reviews_{year}.csv files
            (``Path`` or ``str``). Defaults to ``BASE_DIR / "data"``.
        output_dir: Directory to write processed files (``Path`` or ``str``).
            Defaults to ``BASE_DIR / "data" / "processed"``. It is created
            (with parents) if it does not exist, even when the input file
            turns out to be missing.

    Returns:
        True when the file was processed and saved, False when the input
        file does not exist.
    """
    # Accept plain strings as well as Path objects for both directories.
    input_dir = BASE_DIR / "data" if input_dir is None else Path(input_dir)
    if output_dir is None:
        output_dir = BASE_DIR / "data" / "processed"
    else:
        output_dir = Path(output_dir)

    # Create output directory if needed
    output_dir.mkdir(parents=True, exist_ok=True)

    input_file = input_dir / f"all_reviews_{year}.csv"
    output_file = output_dir / f"all_reviews_{year}.csv"

    if not input_file.exists():
        print(f"β οΈ Skipping {year}: {input_file} not found")
        return False

    print(f"Processing {year}...")
    dataset = pd.read_csv(input_file)

    has_rebuttal = 'rebuttal' in dataset.columns
    columns = ['id', 'review', 'metareview']
    if has_rebuttal:
        columns.append('rebuttal')

    # rename() without inplace=True returns a new, independent frame, so the
    # column assignments below never write into a selection of `dataset`
    # (which is what triggers SettingWithCopyWarning on pandas < 3).
    sub_dataset = dataset[columns].rename(columns={
        "review": "text",
        "metareview": "gold",
    })

    if has_rebuttal:
        print(f" β Found {len(dataset)} reviews with rebuttals")
    else:
        # Keep the output schema stable when the source has no rebuttals.
        sub_dataset['rebuttal'] = ''
        print(f" β Found {len(dataset)} reviews (no rebuttals)")

    # Clean text fields; non-string cells (e.g. NaN) become "".
    print(f" β Cleaning review text...")
    sub_dataset['text'] = sub_dataset['text'].apply(clean_text)
    sub_dataset['rebuttal'] = sub_dataset['rebuttal'].apply(clean_text)

    sub_dataset.to_csv(output_file, index=False)
    print(f" β Saved to {output_file}")
    return True
|
|
|
|
def find_available_years(data_dir: Path = None):
    """
    Discover which years have raw data on disk.

    Globs ``data_dir`` (default: ``BASE_DIR / "data"``) for files named
    ``all_reviews_*.csv`` and extracts the four-digit year from every name
    that contains ``all_reviews_YYYY.csv``.

    Args:
        data_dir: Directory to scan (not searched recursively).

    Returns:
        The detected years as a list of ints in ascending order.
    """
    search_root = BASE_DIR / "data" if data_dir is None else data_dir

    # Lazily match each candidate file name; non-matching names yield None.
    hits = (
        re.search(r'all_reviews_(\d{4})\.csv', entry.name)
        for entry in search_root.glob("all_reviews_*.csv")
    )
    return sorted(int(hit.group(1)) for hit in hits if hit)
|
|
|
|
def main():
    """
    Command-line entry point.

    With ``--year YEAR`` only that year is preprocessed and a success or
    failure line is printed. Without it (or with a falsy value such as 0),
    every year found by ``find_available_years()`` is preprocessed in
    ascending order and a summary of how many succeeded is printed.
    """
    import argparse

    cli = argparse.ArgumentParser(
        description='Preprocess ICLR review data with rebuttal support'
    )
    cli.add_argument('--year', type=int, help='Process single year only')
    args = cli.parse_args()

    # Single-year mode: handle it and leave early.
    if args.year:
        print(f"\nProcessing {args.year}...")
        succeeded = preprocess_reviews_with_rebuttals(args.year)
        if succeeded:
            print(f"β Successfully preprocessed {args.year}")
        else:
            print(f"β Failed to preprocess {args.year}")
        return

    # All-years mode: discover what is available on disk.
    available_years = find_available_years()
    if not available_years:
        print("β οΈ No data files found in data/ directory")
        print(" Run fetch_iclr_data.py first to download data")
        return

    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Preprocessing ICLR data")
    print(f"Auto-detected years: {available_years}")
    print(f"{banner}\n")

    # Count the years whose preprocessing reported success.
    processed_count = sum(
        1 for year in available_years
        if preprocess_reviews_with_rebuttals(year)
    )

    print(f"\n{banner}")
    print(f"β Preprocessing complete: {processed_count}/{len(available_years)} years processed")
    print(f"{banner}\n")


if __name__ == "__main__":
    main()
|
|