ReView / pipeline /preprocess_data.py
Sina1138
Move all pipeline scripts into pipeline/ directory for cleaner project structure
b9432ba
#!/usr/bin/env python3
"""
Standalone preprocessing script for ICLR data with rebuttal support.
Keeps glimpse-ui independent from the glimpse repository.
"""
import sys
import pandas as pd
import os
import re
from pathlib import Path
# Make sibling modules and the project root importable when this file is run
# directly as a script: prepend this file's directory and its parent directory
# to sys.path so they take precedence over other entries.
_dir = Path(__file__).resolve().parent
sys.path[:0] = [str(_dir), str(_dir.parent)]
# NOTE: this import must stay below the sys.path tweak above; `config` is
# presumably resolved from one of the two directories just added.
from config import Config
# Convenience alias used as the default root for the data directories below.
BASE_DIR = Config.BASE_DIR
def clean_text(text):
    """
    Normalize raw OpenReview review/rebuttal text.

    The following substitutions are applied in order:
      1. A run of two or more '#' followed by optional whitespace and a
         newline (a separator line) is collapsed to a single newline.
      2. Any remaining run of two or more '#' is deleted.
      3. Three or more newlines, optionally interleaved with whitespace,
         are collapsed to exactly one blank line.
    Finally, leading and trailing whitespace is stripped.

    Args:
        text: Raw text from OpenReview. Anything that is not a non-blank
            string (None, NaN, numbers, whitespace-only strings) is treated
            as missing.

    Returns:
        The cleaned string, or "" when the input is missing or blank.
    """
    if isinstance(text, str) and text.strip():
        # (pattern, replacement) pairs; order matters because the second
        # rule would otherwise consume the separators the first rule targets.
        substitutions = (
            (r'#{2,}[\s]*\n', '\n'),
            (r'#{2,}', ''),
            (r'\n\s*\n\s*\n+', '\n\n'),
        )
        cleaned = text
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()
    return ""
def preprocess_reviews_with_rebuttals(year: int,
                                      input_dir: Path = None,
                                      output_dir: Path = None):
    """
    Preprocess raw review data for a given year, including rebuttals.

    Reads ``all_reviews_{year}.csv`` from ``input_dir``, keeps the ``id``,
    ``review``, ``metareview`` and (when present) ``rebuttal`` columns,
    renames ``review`` -> ``text`` and ``metareview`` -> ``gold``, cleans the
    ``text`` and ``rebuttal`` columns with ``clean_text`` and writes the
    result to ``all_reviews_{year}.csv`` in ``output_dir``.

    Args:
        year: Year to process.
        input_dir: Directory containing raw all_reviews_{year}.csv files.
            Defaults to ``BASE_DIR / "data"``. Strings are accepted too.
        output_dir: Directory to write processed files. Defaults to
            ``BASE_DIR / "data" / "processed"``. Created if missing.
            Strings are accepted too.

    Returns:
        True when the processed file was written, False when the input file
        for ``year`` does not exist.

    Raises:
        KeyError: If the input CSV lacks one of the required columns
            (``id``, ``review``, ``metareview``).
    """
    # Path() is a no-op for Path inputs and lets callers pass plain strings.
    input_dir = BASE_DIR / "data" if input_dir is None else Path(input_dir)
    output_dir = (
        BASE_DIR / "data" / "processed" if output_dir is None else Path(output_dir)
    )

    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    input_file = input_dir / f"all_reviews_{year}.csv"
    output_file = output_dir / f"all_reviews_{year}.csv"

    if not input_file.exists():
        print(f"⚠️ Skipping {year}: {input_file} not found")
        return False

    print(f"Processing {year}...")
    dataset = pd.read_csv(input_file)

    has_rebuttal = 'rebuttal' in dataset.columns
    columns = ['id', 'review', 'metareview']
    if has_rebuttal:
        columns.append('rebuttal')

    # Non-inplace rename returns a new DataFrame, so the column assignments
    # below operate on an independent object rather than on a slice of
    # `dataset` (which is what triggers pandas' SettingWithCopyWarning).
    sub_dataset = dataset[columns].rename(columns={
        "review": "text",
        "metareview": "gold",
    })

    if has_rebuttal:
        print(f" βœ“ Found {len(dataset)} reviews with rebuttals")
    else:
        # Fallback for data without rebuttals (legacy compatibility):
        # keep the output schema stable by adding an empty rebuttal column.
        sub_dataset['rebuttal'] = ''
        print(f" βœ“ Found {len(dataset)} reviews (no rebuttals)")

    # Clean text and rebuttal columns
    print(" β†’ Cleaning review text...")
    sub_dataset['text'] = sub_dataset['text'].apply(clean_text)
    sub_dataset['rebuttal'] = sub_dataset['rebuttal'].apply(clean_text)

    sub_dataset.to_csv(output_file, index=False)
    print(f" β†’ Saved to {output_file}")
    return True
def find_available_years(data_dir: Path = None):
    """
    Auto-detect which years have raw data available.

    Scans ``data_dir`` (non-recursively) for files named like
    ``all_reviews_YYYY.csv`` and extracts the four-digit year from each.

    Args:
        data_dir: Directory to scan. Defaults to ``BASE_DIR / "data"``.

    Returns:
        The detected years as a list of ints in ascending order; empty when
        no matching file is found.
    """
    search_root = BASE_DIR / "data" if data_dir is None else data_dir
    year_pattern = re.compile(r'all_reviews_(\d{4})\.csv')
    # The glob is only a coarse pre-filter; the regex decides what counts.
    hits = (
        year_pattern.search(entry.name)
        for entry in search_root.glob("all_reviews_*.csv")
    )
    return sorted(int(hit.group(1)) for hit in hits if hit)
def main():
    """
    Command-line entry point: preprocess one year or every detected year.

    With ``--year YYYY`` only that year is processed. Without it, the data
    directory is scanned via ``find_available_years`` and every detected
    year is processed in ascending order.

    Returns:
        Process exit status: 0 when everything requested was processed,
        1 when the requested year could not be processed, no data files
        were found, or at least one detected year was skipped.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Preprocess ICLR review data with rebuttal support'
    )
    parser.add_argument('--year', type=int, help='Process single year only')
    args = parser.parse_args()

    # Compare against None so that an explicit `--year 0` is not mistaken
    # for "no year given" and silently turned into an all-years run.
    if args.year is not None:
        # Process single year
        print(f"\nProcessing {args.year}...")
        if preprocess_reviews_with_rebuttals(args.year):
            print(f"βœ“ Successfully preprocessed {args.year}")
            return 0
        print(f"βœ— Failed to preprocess {args.year}")
        return 1

    # Auto-detect and process all available years
    available_years = find_available_years()
    if not available_years:
        print("⚠️ No data files found in data/ directory")
        print(" Run fetch_iclr_data.py first to download data")
        return 1

    print(f"\n{'='*60}")
    print(f"Preprocessing ICLR data")
    print(f"Auto-detected years: {available_years}")
    print(f"{'='*60}\n")

    processed_count = 0
    for year in available_years:
        if preprocess_reviews_with_rebuttals(year):
            processed_count += 1

    print(f"\n{'='*60}")
    print(f"βœ“ Preprocessing complete: {processed_count}/{len(available_years)} years processed")
    print(f"{'='*60}\n")
    return 0 if processed_count == len(available_years) else 1


if __name__ == "__main__":
    # Propagate main()'s status so shell pipelines can detect failures.
    sys.exit(main())