Filip Makraduli committed on
Commit
e9012d2
·
1 Parent(s): ec8c22e

better nan handling

Browse files
Files changed (1) hide show
  1. research_ai_agent.py +48 -32
research_ai_agent.py CHANGED
@@ -476,12 +476,6 @@ class RetrievalTool(Tool):
476
  print("⚠️ No results returned from Superlinked!")
477
  return pd.DataFrame() # Return empty dataframe
478
 
479
- # Ensure summary is a string
480
- if 'summary' in df_result.columns:
481
- df_result['summary'] = df_result['summary'].astype(str)
482
- else:
483
- print("⚠️ Warning: 'summary' column not found in retrieved DataFrame.")
484
-
485
  # ALWAYS merge with original dataframe to ensure complete data
486
  if self.df is not None:
487
  # Handle both 'entry_id' and 'id' column names from Superlinked
@@ -509,10 +503,13 @@ class RetrievalTool(Tool):
509
  how='left',
510
  suffixes=('', '_orig')
511
  )
512
- # Use original data if current is missing
 
513
  for col in ['title', 'summary', 'published']:
514
  if f'{col}_orig' in df_result.columns:
515
- df_result[col] = df_result[col].fillna(df_result[f'{col}_orig'])
 
 
516
 
517
  # After merge, check if we have publication dates for debugging recency
518
  if 'published' in df_result.columns:
@@ -704,20 +701,29 @@ def create_paper_cards(df_result):
704
  if len(df_result) == 0:
705
  return "<p>No papers found for your query.</p>"
706
 
707
- print(f"Sample data from df_result:")
708
- for i, row in df_result.head(2).iterrows():
709
- print(f"Row {i}: {dict(row)}")
710
 
711
- # Data should already be merged in RetrievalTool
712
- print(f"πŸ“‹ Columns available: {list(df_result.columns)}")
 
 
 
 
 
 
713
 
714
  # MINIMAL filtering - only remove completely broken rows
715
  valid_results = []
716
  for i, row in df_result.iterrows():
717
  # Accept ALL rows that have any title data - don't be picky
718
- valid_results.append(row)
 
 
 
 
719
 
720
- print(f"Original results: {len(df_result)}, Valid results after filtering: {len(valid_results)}")
721
 
722
  if len(valid_results) == 0:
723
  return "<div style='padding: 20px; text-align: center; color: #666;'>No valid papers found for your query. Try a different search term.</div>"
@@ -891,7 +897,18 @@ def create_paper_cards(df_result):
891
  for i, row in enumerate(valid_results):
892
  # Extract data safely with better fallbacks
893
  title = str(row.get('title', 'Unknown Title'))
894
- summary = str(row.get('summary', ''))
 
 
 
 
 
 
 
 
 
 
 
895
 
896
  # Try multiple ways to get entry_id
897
  entry_id = None
@@ -903,24 +920,23 @@ def create_paper_cards(df_result):
903
  if not entry_id:
904
  entry_id = f"Paper_{i+1}" # Fallback ID
905
 
906
- # Handle missing summary gracefully
907
- if pd.isna(row.get('summary')) or summary.lower().strip() in ['nan', '', 'none', 'null']:
908
- summary_display = "Summary not available in search results"
909
  else:
910
- summary_display = summary[:200] + "..." if len(summary) > 200 else summary
911
 
912
- # Format publication date
913
- if 'published' in df_result.columns and pd.notna(row.get('published')):
914
- try:
915
- pub_date = pd.to_datetime(row['published'])
916
- formatted_date = pub_date.strftime('%B %Y') # e.g., "March 2023"
917
- year_only = pub_date.strftime('%Y')
918
- except:
919
- formatted_date = str(row['published'])[:10]
920
- year_only = formatted_date[:4] if len(str(row['published'])) >= 4 else "Unknown"
921
- else:
922
- formatted_date = "Date not available"
923
- year_only = "Unknown"
924
 
925
  # LONGER title display - don't truncate so aggressively
926
  title_display = title[:120] + "..." if len(title) > 120 else title
 
476
  print("⚠️ No results returned from Superlinked!")
477
  return pd.DataFrame() # Return empty dataframe
478
 
 
 
 
 
 
 
479
  # ALWAYS merge with original dataframe to ensure complete data
480
  if self.df is not None:
481
  # Handle both 'entry_id' and 'id' column names from Superlinked
 
503
  how='left',
504
  suffixes=('', '_orig')
505
  )
506
+
507
+ # CRITICAL FIX: Use original data when current data is missing or nan
508
  for col in ['title', 'summary', 'published']:
509
  if f'{col}_orig' in df_result.columns:
510
+ # Fill missing/nan values with original data
511
+ mask = df_result[col].isna() | (df_result[col].astype(str).str.lower() == 'nan')
512
+ df_result.loc[mask, col] = df_result.loc[mask, f'{col}_orig']
513
 
514
  # After merge, check if we have publication dates for debugging recency
515
  if 'published' in df_result.columns:
 
701
  if len(df_result) == 0:
702
  return "<p>No papers found for your query.</p>"
703
 
704
+ print(f"🎨 Creating cards for {len(df_result)} results...")
705
+ print(f"πŸ“‹ Available columns: {list(df_result.columns)}")
 
706
 
707
+ # Show sample data for debugging
708
+ if len(df_result) > 0:
709
+ print(f"πŸ” Sample card data:")
710
+ for i, row in df_result.head(2).iterrows():
711
+ title = row.get('title', 'MISSING')
712
+ summary = str(row.get('summary', 'MISSING'))[:50]
713
+ published = row.get('published', 'MISSING')
714
+ print(f" Row {i}: title='{title}', summary='{summary}...', published={published}")
715
 
716
  # MINIMAL filtering - only remove completely broken rows
717
  valid_results = []
718
  for i, row in df_result.iterrows():
719
  # Accept ALL rows that have any title data - don't be picky
720
+ title = str(row.get('title', ''))
721
+ if title and title.lower() not in ['nan', 'none', 'null', '']:
722
+ valid_results.append(row)
723
+ else:
724
+ print(f"⚠️ Skipping row {i} with invalid title: '{title}'")
725
 
726
+ print(f"πŸ“Š Original results: {len(df_result)}, Valid results after filtering: {len(valid_results)}")
727
 
728
  if len(valid_results) == 0:
729
  return "<div style='padding: 20px; text-align: center; color: #666;'>No valid papers found for your query. Try a different search term.</div>"
 
897
  for i, row in enumerate(valid_results):
898
  # Extract data safely with better fallbacks
899
  title = str(row.get('title', 'Unknown Title'))
900
+
901
+ # Try to get summary from multiple possible columns
902
+ summary = None
903
+ for summary_col in ['summary', 'summary_orig']:
904
+ if summary_col in row and pd.notna(row.get(summary_col)):
905
+ candidate_summary = str(row[summary_col])
906
+ if candidate_summary.lower() not in ['nan', 'none', 'null', '']:
907
+ summary = candidate_summary
908
+ break
909
+
910
+ if not summary:
911
+ summary = "Summary not available in search results"
912
 
913
  # Try multiple ways to get entry_id
914
  entry_id = None
 
920
  if not entry_id:
921
  entry_id = f"Paper_{i+1}" # Fallback ID
922
 
923
+ # Handle summary display
924
+ if len(summary) > 200:
925
+ summary_display = summary[:200] + "..."
926
  else:
927
+ summary_display = summary
928
 
929
+ # Format publication date - try multiple columns
930
+ formatted_date = "Date not available"
931
+ for date_col in ['published', 'published_orig']:
932
+ if date_col in row and pd.notna(row.get(date_col)):
933
+ try:
934
+ pub_date = pd.to_datetime(row[date_col])
935
+ formatted_date = pub_date.strftime('%B %Y') # e.g., "March 2023"
936
+ break
937
+ except:
938
+ formatted_date = str(row[date_col])[:10]
939
+ break
 
940
 
941
  # LONGER title display - don't truncate so aggressively
942
  title_display = title[:120] + "..." if len(title) > 120 else title