Sina1138 committed on
Commit
4cd1bc5
·
1 Parent(s): e5ab9ff

Enhance paper title handling: load titles from raw data CSVs for improved metadata accuracy in reviews

Browse files
interface/Demo.py CHANGED
@@ -55,6 +55,21 @@ if df_new.empty:
55
 
56
  # Use new data only
57
  years, all_scored_reviews_df = years_new, df_new
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  year_range_str = f"{min(years)}–{max(years)}" if years else "N/A"
59
 
60
  # -----------------------------------
@@ -536,10 +551,24 @@ with gr.Blocks(title="ReView", css=CUSTOM_CSS) as demo:
536
  color_map = {} # Default to empty map
537
  legend = False
538
 
539
- new_review_id = (
540
- f"### Submission Link:\n\n{review_ids[current_index]}<br>"
541
- f"(Showing {current_index + 1} of {len(state['review_ids'])} reviews)"
542
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
543
 
544
  number_of_displayed_reviews = len(current_review)
545
  review_updates = []
@@ -573,12 +602,12 @@ with gr.Blocks(title="ReView", css=CUSTOM_CSS) as demo:
573
  highlighted = []
574
  for sentence, metadata in review_item:
575
  polarity = metadata.get("polarity", None)
576
- if polarity >= 0.995:
577
  label = "➕" # positive
578
- elif polarity <= -0.99:
579
  label = "➖" # negative
580
  else:
581
- label = None # ignore neutral (1)
582
  highlighted.append((sentence, label))
583
  elif show_consensuality:
584
  highlighted = []
 
55
 
56
  # Use new data only
57
  years, all_scored_reviews_df = years_new, df_new
58
+
59
+ # Build a {forum_url: paper_title} lookup from raw data CSVs (processed CSVs lack paper_title)
60
+ def _load_paper_titles() -> dict:
61
+ titles = {}
62
+ for csv in sorted((BASE_DIR / "data").glob("all_reviews_*.csv")):
63
+ try:
64
+ df = pd.read_csv(csv, usecols=["id", "paper_title"])
65
+ for _, row in df.iterrows():
66
+ if row["id"] not in titles and pd.notna(row.get("paper_title", "")):
67
+ titles[row["id"]] = str(row["paper_title"])
68
+ except Exception:
69
+ pass
70
+ return titles
71
+
72
+ _paper_titles = _load_paper_titles()
73
  year_range_str = f"{min(years)}–{max(years)}" if years else "N/A"
74
 
75
  # -----------------------------------
 
551
  color_map = {} # Default to empty map
552
  legend = False
553
 
554
+ current_id = review_ids[current_index]
555
+ # Primary source: raw CSV lookup (processed CSVs lack paper_title)
556
+ paper_title = _paper_titles.get(current_id, "")
557
+ # Fallback: metadata column in preprocessed CSV
558
+ if not paper_title:
559
+ paper_meta = state.get("metadata_for_year", {}).get(current_id, {})
560
+ paper_title = paper_meta.get("paper_title", "") if isinstance(paper_meta, dict) else ""
561
+ if paper_title:
562
+ new_review_id = (
563
+ f"### {paper_title}\n\n"
564
+ f"[View on OpenReview]({current_id}) &nbsp;·&nbsp; "
565
+ f"({current_index + 1} of {len(state['review_ids'])} submissions)"
566
+ )
567
+ else:
568
+ new_review_id = (
569
+ f"### [View on OpenReview]({current_id})\n\n"
570
+ f"({current_index + 1} of {len(state['review_ids'])} submissions)"
571
+ )
572
 
573
  number_of_displayed_reviews = len(current_review)
574
  review_updates = []
 
602
  highlighted = []
603
  for sentence, metadata in review_item:
604
  polarity = metadata.get("polarity", None)
605
+ if polarity == 2:
606
  label = "➕" # positive
607
+ elif polarity == 0:
608
  label = "➖" # negative
609
  else:
610
+ label = None # neutral (1)
611
  highlighted.append((sentence, label))
612
  elif show_consensuality:
613
  highlighted = []
pipeline/scored_reviews_builder.py CHANGED
@@ -224,6 +224,19 @@ def build_dataset(
224
  # Load original data to extract rebuttals
225
  original_df = pd.read_csv(original_csv_path)
226
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  # Build metadata dict with rebuttals
228
  review_metadata = {}
229
  for _, row in original_df.iterrows():
@@ -236,7 +249,7 @@ def build_dataset(
236
 
237
  review_metadata[review_id] = {
238
  'rebuttal': rebuttal_str,
239
- 'paper_title': row.get('paper_title', '') if 'paper_title' in original_df.columns else '',
240
  'has_rebuttal': bool(rebuttal_str.strip()) if rebuttal_str else False,
241
  }
242
 
 
224
  # Load original data to extract rebuttals
225
  original_df = pd.read_csv(original_csv_path)
226
 
227
+ # Load paper titles from raw data CSV (processed CSVs lack paper_title)
228
+ paper_titles = {}
229
+ if raw_data_csv_path.exists():
230
+ try:
231
+ raw_df = pd.read_csv(raw_data_csv_path, usecols=["id", "paper_title"])
232
+ paper_titles = {
233
+ row["id"]: str(row["paper_title"])
234
+ for _, row in raw_df.iterrows()
235
+ if pd.notna(row.get("paper_title", ""))
236
+ }
237
+ except Exception as e:
238
+ print(f"Warning: Could not load paper titles from {raw_data_csv_path}: {e}")
239
+
240
  # Build metadata dict with rebuttals
241
  review_metadata = {}
242
  for _, row in original_df.iterrows():
 
249
 
250
  review_metadata[review_id] = {
251
  'rebuttal': rebuttal_str,
252
+ 'paper_title': paper_titles.get(review_id, ''),
253
  'has_rebuttal': bool(rebuttal_str.strip()) if rebuttal_str else False,
254
  }
255