Sina1138 committed on
Commit ·
4cd1bc5
1
Parent(s): e5ab9ff
Enhance paper title handling: load titles from raw data CSVs for improved metadata accuracy in reviews
Browse files
- interface/Demo.py +36 -7
- pipeline/scored_reviews_builder.py +14 -1
interface/Demo.py
CHANGED
|
@@ -55,6 +55,21 @@ if df_new.empty:
|
|
| 55 |
|
| 56 |
# Use new data only
|
| 57 |
years, all_scored_reviews_df = years_new, df_new
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
year_range_str = f"{min(years)}–{max(years)}" if years else "N/A"
|
| 59 |
|
| 60 |
# -----------------------------------
|
|
@@ -536,10 +551,24 @@ with gr.Blocks(title="ReView", css=CUSTOM_CSS) as demo:
|
|
| 536 |
color_map = {} # Default to empty map
|
| 537 |
legend = False
|
| 538 |
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
|
| 544 |
number_of_displayed_reviews = len(current_review)
|
| 545 |
review_updates = []
|
|
@@ -573,12 +602,12 @@ with gr.Blocks(title="ReView", css=CUSTOM_CSS) as demo:
|
|
| 573 |
highlighted = []
|
| 574 |
for sentence, metadata in review_item:
|
| 575 |
polarity = metadata.get("polarity", None)
|
| 576 |
-
if polarity
|
| 577 |
label = "β" # positive
|
| 578 |
-
elif polarity
|
| 579 |
label = "β" # negative
|
| 580 |
else:
|
| 581 |
-
label = None #
|
| 582 |
highlighted.append((sentence, label))
|
| 583 |
elif show_consensuality:
|
| 584 |
highlighted = []
|
|
|
|
| 55 |
|
| 56 |
# Use new data only
|
| 57 |
years, all_scored_reviews_df = years_new, df_new
|
| 58 |
+
|
| 59 |
+
# Build a {forum_url: paper_title} lookup from raw data CSVs (processed CSVs lack paper_title)
|
| 60 |
+
def _load_paper_titles() -> dict:
|
| 61 |
+
titles = {}
|
| 62 |
+
for csv in sorted((BASE_DIR / "data").glob("all_reviews_*.csv")):
|
| 63 |
+
try:
|
| 64 |
+
df = pd.read_csv(csv, usecols=["id", "paper_title"])
|
| 65 |
+
for _, row in df.iterrows():
|
| 66 |
+
if row["id"] not in titles and pd.notna(row.get("paper_title", "")):
|
| 67 |
+
titles[row["id"]] = str(row["paper_title"])
|
| 68 |
+
except Exception:
|
| 69 |
+
pass
|
| 70 |
+
return titles
|
| 71 |
+
|
| 72 |
+
_paper_titles = _load_paper_titles()
|
| 73 |
year_range_str = f"{min(years)}–{max(years)}" if years else "N/A"
|
| 74 |
|
| 75 |
# -----------------------------------
|
|
|
|
| 551 |
color_map = {} # Default to empty map
|
| 552 |
legend = False
|
| 553 |
|
| 554 |
+
current_id = review_ids[current_index]
|
| 555 |
+
# Primary source: raw CSV lookup (processed CSVs lack paper_title)
|
| 556 |
+
paper_title = _paper_titles.get(current_id, "")
|
| 557 |
+
# Fallback: metadata column in preprocessed CSV
|
| 558 |
+
if not paper_title:
|
| 559 |
+
paper_meta = state.get("metadata_for_year", {}).get(current_id, {})
|
| 560 |
+
paper_title = paper_meta.get("paper_title", "") if isinstance(paper_meta, dict) else ""
|
| 561 |
+
if paper_title:
|
| 562 |
+
new_review_id = (
|
| 563 |
+
f"### {paper_title}\n\n"
|
| 564 |
+
f"[View on OpenReview]({current_id}) · "
|
| 565 |
+
f"({current_index + 1} of {len(state['review_ids'])} submissions)"
|
| 566 |
+
)
|
| 567 |
+
else:
|
| 568 |
+
new_review_id = (
|
| 569 |
+
f"### [View on OpenReview]({current_id})\n\n"
|
| 570 |
+
f"({current_index + 1} of {len(state['review_ids'])} submissions)"
|
| 571 |
+
)
|
| 572 |
|
| 573 |
number_of_displayed_reviews = len(current_review)
|
| 574 |
review_updates = []
|
|
|
|
| 602 |
highlighted = []
|
| 603 |
for sentence, metadata in review_item:
|
| 604 |
polarity = metadata.get("polarity", None)
|
| 605 |
+
if polarity == 2:
|
| 606 |
label = "β" # positive
|
| 607 |
+
elif polarity == 0:
|
| 608 |
label = "β" # negative
|
| 609 |
else:
|
| 610 |
+
label = None # neutral (1)
|
| 611 |
highlighted.append((sentence, label))
|
| 612 |
elif show_consensuality:
|
| 613 |
highlighted = []
|
pipeline/scored_reviews_builder.py
CHANGED
|
@@ -224,6 +224,19 @@ def build_dataset(
|
|
| 224 |
# Load original data to extract rebuttals
|
| 225 |
original_df = pd.read_csv(original_csv_path)
|
| 226 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
# Build metadata dict with rebuttals
|
| 228 |
review_metadata = {}
|
| 229 |
for _, row in original_df.iterrows():
|
|
@@ -236,7 +249,7 @@ def build_dataset(
|
|
| 236 |
|
| 237 |
review_metadata[review_id] = {
|
| 238 |
'rebuttal': rebuttal_str,
|
| 239 |
-
'paper_title':
|
| 240 |
'has_rebuttal': bool(rebuttal_str.strip()) if rebuttal_str else False,
|
| 241 |
}
|
| 242 |
|
|
|
|
| 224 |
# Load original data to extract rebuttals
|
| 225 |
original_df = pd.read_csv(original_csv_path)
|
| 226 |
|
| 227 |
+
# Load paper titles from raw data CSV (processed CSVs lack paper_title)
|
| 228 |
+
paper_titles = {}
|
| 229 |
+
if raw_data_csv_path.exists():
|
| 230 |
+
try:
|
| 231 |
+
raw_df = pd.read_csv(raw_data_csv_path, usecols=["id", "paper_title"])
|
| 232 |
+
paper_titles = {
|
| 233 |
+
row["id"]: str(row["paper_title"])
|
| 234 |
+
for _, row in raw_df.iterrows()
|
| 235 |
+
if pd.notna(row.get("paper_title", ""))
|
| 236 |
+
}
|
| 237 |
+
except Exception as e:
|
| 238 |
+
print(f"Warning: Could not load paper titles from {raw_data_csv_path}: {e}")
|
| 239 |
+
|
| 240 |
# Build metadata dict with rebuttals
|
| 241 |
review_metadata = {}
|
| 242 |
for _, row in original_df.iterrows():
|
|
|
|
| 249 |
|
| 250 |
review_metadata[review_id] = {
|
| 251 |
'rebuttal': rebuttal_str,
|
| 252 |
+
'paper_title': paper_titles.get(review_id, ''),
|
| 253 |
'has_rebuttal': bool(rebuttal_str.strip()) if rebuttal_str else False,
|
| 254 |
}
|
| 255 |
|