#!/usr/bin/env python3
"""
Standalone preprocessing script for ICLR data with rebuttal support.
Keeps glimpse-ui independent from the glimpse repository.
"""

import sys
import pandas as pd
import os
import re
from pathlib import Path

# Ensure sibling modules and project root are importable.
# The absolute path of this file's directory and of its parent are prepended
# to sys.path (in that order), so they take precedence over existing entries.
# NOTE(review): `config` is presumably found in one of these two directories
# - confirm against the repository layout.
_dir = Path(__file__).resolve().parent
sys.path[:0] = [str(_dir), str(_dir.parent)]

# Must come after the sys.path adjustment above.
from config import Config

# Convenience alias; used as the root for the default data directories below.
BASE_DIR = Config.BASE_DIR


def clean_text(text):
    """
    Normalise a review/rebuttal string scraped from OpenReview.

    The following artifacts are stripped, in this order:
    - runs of two or more '#' that end a line (separator lines such as
      ##########), together with the trailing whitespace and newline
    - any remaining run of two or more '#'
    - stretches of three or more newlines (optionally separated by
      whitespace), which collapse to a single blank line
    - leading and trailing whitespace

    Args:
        text: Raw text from OpenReview; may be a non-string value

    Returns:
        The cleaned string, or "" when the input is not a string or is
        blank
    """
    # Anything that is not a string is treated as missing.
    if not isinstance(text, str):
        return ""
    if not text.strip():
        return ""

    # (pattern, replacement) pairs; order matters because the first rule
    # keeps the line break that the second would otherwise leave dangling.
    substitutions = (
        (r'#{2,}[\s]*\n', '\n'),
        (r'#{2,}', ''),
        (r'\n\s*\n\s*\n+', '\n\n'),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()


def preprocess_reviews_with_rebuttals(year: int,
                                       input_dir: Path = None,
                                       output_dir: Path = None):
    """
    Preprocess raw review data for a given year, including rebuttals.

    Reads ``all_reviews_{year}.csv`` from ``input_dir``, keeps the ``id``,
    ``review`` and ``metareview`` columns (plus ``rebuttal`` when present),
    renames ``review`` -> ``text`` and ``metareview`` -> ``gold``, runs
    ``clean_text`` over the ``text`` and ``rebuttal`` columns and writes the
    result to ``output_dir`` under the same file name. When the input has no
    ``rebuttal`` column an empty one is added so the output schema is stable.

    Args:
        year: Year to process
        input_dir: Directory containing raw all_reviews_{year}.csv files
            (defaults to BASE_DIR / "data"); a str path is accepted as well
        output_dir: Directory to write processed files (defaults to
            BASE_DIR / "data" / "processed"); created if it does not exist

    Returns:
        True when the processed file was written, False when the input file
        does not exist.

    Raises:
        KeyError: If the input CSV lacks an id, review or metareview column.
    """
    # Coerce so callers may pass plain strings as well as Path objects.
    input_dir = BASE_DIR / "data" if input_dir is None else Path(input_dir)
    if output_dir is None:
        output_dir = BASE_DIR / "data" / "processed"
    else:
        output_dir = Path(output_dir)

    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    input_file = input_dir / f"all_reviews_{year}.csv"
    output_file = output_dir / f"all_reviews_{year}.csv"

    if not input_file.exists():
        print(f"⚠️  Skipping {year}: {input_file} not found")
        return False

    print(f"Processing {year}...")
    dataset = pd.read_csv(input_file)

    has_rebuttal = 'rebuttal' in dataset.columns
    columns = ['id', 'review', 'metareview']
    if has_rebuttal:
        columns.append('rebuttal')

    # rename() without inplace returns a new frame, so the column assignments
    # below never write into a slice of `dataset` (which would trigger
    # pandas' SettingWithCopyWarning).
    sub_dataset = dataset[columns].rename(columns={
        "review": "text",
        "metareview": "gold",
    })

    if has_rebuttal:
        print(f"  ✓ Found {len(dataset)} reviews with rebuttals")
    else:
        # Fallback for data without rebuttals (legacy compatibility)
        sub_dataset['rebuttal'] = ''
        print(f"  ✓ Found {len(dataset)} reviews (no rebuttals)")

    # Clean text and rebuttal columns (the gold column is left untouched)
    print("  → Cleaning review text...")
    for column in ('text', 'rebuttal'):
        sub_dataset[column] = sub_dataset[column].apply(clean_text)

    sub_dataset.to_csv(output_file, index=False)
    print(f"  → Saved to {output_file}")
    return True


def find_available_years(data_dir: Path = None):
    """Auto-detect years by scanning data directory for all_reviews_YYYY.csv files.

    Only names that are exactly ``all_reviews_`` + four digits + ``.csv``
    count; names that merely contain such a substring (for example
    ``all_reviews_2019.csv.bak.csv``) are ignored, because no matching
    ``all_reviews_2019.csv`` input would exist for them.

    Args:
        data_dir: Directory to scan (defaults to BASE_DIR / "data"); a str
            path is accepted as well

    Returns:
        Sorted list of the distinct years found, as ints; empty when nothing
        matches.
    """
    data_dir = BASE_DIR / "data" if data_dir is None else Path(data_dir)

    years = set()
    for file in data_dir.glob("all_reviews_*.csv"):
        # fullmatch: the glob's '*' is broader than the YYYY we accept.
        match = re.fullmatch(r'all_reviews_(\d{4})\.csv', file.name)
        if match:
            years.add(int(match.group(1)))

    return sorted(years)


def main():
    """Preprocess all available years (auto-detected from data directory).

    Command line:
        --year YEAR   Process only that year instead of auto-detecting.

    With --year, the outcome for that single year is printed. Without it,
    every year returned by find_available_years() is processed and a summary
    of how many succeeded is printed; if none are found a hint is printed
    and nothing is processed.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Preprocess ICLR review data with rebuttal support'
    )
    parser.add_argument('--year', type=int, help='Process single year only')
    args = parser.parse_args()

    # Compare against None so that an explicit "--year 0" is not mistaken
    # for "no year given".
    if args.year is not None:
        # Process single year
        print(f"\nProcessing {args.year}...")
        if preprocess_reviews_with_rebuttals(args.year):
            print(f"✓ Successfully preprocessed {args.year}")
        else:
            print(f"✗ Failed to preprocess {args.year}")
        return

    # Auto-detect and process all available years
    available_years = find_available_years()

    if not available_years:
        print("⚠️  No data files found in data/ directory")
        print("   Run fetch_iclr_data.py first to download data")
        return

    banner = '=' * 60
    print(f"\n{banner}")
    print("Preprocessing ICLR data")
    print(f"Auto-detected years: {available_years}")
    print(f"{banner}\n")

    processed_count = 0
    for year in available_years:
        if preprocess_reviews_with_rebuttals(year):
            processed_count += 1

    print(f"\n{banner}")
    print(f"✓ Preprocessing complete: {processed_count}/{len(available_years)} years processed")
    print(f"{banner}\n")


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()