#!/usr/bin/env python3
"""
Standalone preprocessing script for ICLR data with rebuttal support.
Keeps glimpse-ui independent from the glimpse repository.
"""
import sys
import pandas as pd
import os
import re
from pathlib import Path
# Make `config` importable no matter where the script is launched from:
# the directory containing this file and its parent are prepended to
# sys.path, so they are searched before any other import location.
_dir = Path(__file__).resolve().parent
sys.path[:0] = [str(_dir), str(_dir.parent)]
from config import Config
# Module-level shorthand for Config.BASE_DIR; the default data paths used by
# the functions in this file are built from it.
BASE_DIR = Config.BASE_DIR
def clean_text(text):
    """
    Normalize a review/rebuttal string by stripping formatting artifacts.

    The following clean-ups are applied, in order:
    - A run of two or more '#' characters followed only by whitespace up to a
      line break is replaced by a single newline (separator lines such as
      "##########").
    - Any remaining run of two or more '#' characters is deleted. This also
      removes markdown heading markers like "##"; a lone '#' is kept.
    - Three or more line breaks (optionally separated by whitespace) are
      collapsed to exactly two, i.e. at most one blank line remains.
    - Leading and trailing whitespace is stripped.

    Args:
        text: Raw text from OpenReview

    Returns:
        The cleaned text, or "" when `text` is not a string or contains only
        whitespace.
    """
    # Non-string values and blank strings both map to the empty string.
    if not (isinstance(text, str) and text.strip()):
        return ""

    # (pattern, replacement) pairs; the order matters because the separator
    # rule must run before the generic hash-run removal.
    substitutions = (
        (r'#{2,}[\s]*\n', '\n'),
        (r'#{2,}', ''),
        (r'\n\s*\n\s*\n+', '\n\n'),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()
def preprocess_reviews_with_rebuttals(year: int,
                                      input_dir: Path = None,
                                      output_dir: Path = None):
    """
    Preprocess raw review data for a given year, including rebuttals.

    Reads all_reviews_{year}.csv from `input_dir`, keeps the `id`, `review`,
    `metareview` and (when present) `rebuttal` columns, renames
    `review` -> `text` and `metareview` -> `gold`, runs `clean_text` over the
    `text` and `rebuttal` columns and writes the result, without the index,
    to a file of the same name in `output_dir`. When the input has no
    `rebuttal` column, an empty one is added so the output schema is the same
    either way.

    Args:
        year: Year to process
        input_dir: Directory containing raw all_reviews_{year}.csv files.
            Defaults to BASE_DIR / "data". A plain string path is accepted.
        output_dir: Directory to write processed files. Defaults to
            BASE_DIR / "data" / "processed". A plain string path is accepted.
            The directory is created if it does not exist.

    Returns:
        True when the year was processed and written, False when the input
        file does not exist.

    Raises:
        KeyError: If the input CSV lacks one of the `id`, `review` or
            `metareview` columns.
    """
    # Coerce to Path so callers may also pass plain strings.
    input_dir = BASE_DIR / "data" if input_dir is None else Path(input_dir)
    if output_dir is None:
        output_dir = BASE_DIR / "data" / "processed"
    else:
        output_dir = Path(output_dir)

    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    input_file = input_dir / f"all_reviews_{year}.csv"
    output_file = output_dir / f"all_reviews_{year}.csv"

    if not input_file.exists():
        print(f"β οΈ Skipping {year}: {input_file} not found")
        return False

    print(f"Processing {year}...")
    dataset = pd.read_csv(input_file)

    has_rebuttal = 'rebuttal' in dataset.columns
    columns = ['id', 'review', 'metareview']
    if has_rebuttal:
        columns.append('rebuttal')

    # rename() without inplace=True returns a new DataFrame, so the column
    # assignments below never write into a slice of `dataset`. This avoids
    # pandas' SettingWithCopyWarning / chained-assignment pitfalls.
    sub_dataset = dataset[columns].rename(columns={
        "review": "text",
        "metareview": "gold",
    })

    if has_rebuttal:
        print(f" β Found {len(dataset)} reviews with rebuttals")
    else:
        # Fallback for data without rebuttals (legacy compatibility)
        sub_dataset['rebuttal'] = ''
        print(f" β Found {len(dataset)} reviews (no rebuttals)")

    # Clean text and rebuttal columns
    print(" β Cleaning review text...")
    sub_dataset['text'] = sub_dataset['text'].apply(clean_text)
    sub_dataset['rebuttal'] = sub_dataset['rebuttal'].apply(clean_text)

    sub_dataset.to_csv(output_file, index=False)
    print(f" β Saved to {output_file}")
    return True
def find_available_years(data_dir: Path = None):
    """
    Discover which years have raw data by looking for all_reviews_YYYY.csv files.

    Args:
        data_dir: Directory to scan. Defaults to BASE_DIR / "data".

    Returns:
        The four-digit years found in matching file names, as a list of ints
        in ascending order.
    """
    if data_dir is None:
        data_dir = BASE_DIR / "data"

    year_pattern = re.compile(r'all_reviews_(\d{4})\.csv')
    # The glob is only a coarse filter; the regex decides which candidates
    # really carry a four-digit year.
    hits = (
        year_pattern.search(entry.name)
        for entry in data_dir.glob("all_reviews_*.csv")
    )
    return sorted(int(hit.group(1)) for hit in hits if hit)
def main():
    """
    Preprocess all available years (auto-detected from data directory).

    Command-line interface:
        --year YEAR   Process only the given year instead of every year
                      found by find_available_years().

    Prints progress and a summary to stdout and returns None in every case.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Preprocess ICLR review data with rebuttal support'
    )
    parser.add_argument('--year', type=int, help='Process single year only')
    args = parser.parse_args()

    # argparse leaves `year` as None when the flag is absent. Compare against
    # None rather than relying on truthiness, so an explicit `--year 0` is
    # handled as a single-year request instead of silently falling through
    # to the process-everything branch.
    if args.year is not None:
        # Process single year
        print(f"\nProcessing {args.year}...")
        if preprocess_reviews_with_rebuttals(args.year):
            print(f"β Successfully preprocessed {args.year}")
        else:
            print(f"β Failed to preprocess {args.year}")
        return

    # Auto-detect and process all available years
    available_years = find_available_years()
    if not available_years:
        print("β οΈ No data files found in data/ directory")
        print(" Run fetch_iclr_data.py first to download data")
        return

    print(f"\n{'='*60}")
    print("Preprocessing ICLR data")
    print(f"Auto-detected years: {available_years}")
    print(f"{'='*60}\n")

    processed_count = 0
    for year in available_years:
        if preprocess_reviews_with_rebuttals(year):
            processed_count += 1

    print(f"\n{'='*60}")
    print(f"β Preprocessing complete: {processed_count}/{len(available_years)} years processed")
    print(f"{'='*60}\n")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|