Filip Makraduli
committed on
Commit
·
e9012d2
1
Parent(s):
ec8c22e
better nan handling
Browse files- research_ai_agent.py +48 -32
research_ai_agent.py
CHANGED
|
@@ -476,12 +476,6 @@ class RetrievalTool(Tool):
|
|
| 476 |
print("⚠️ No results returned from Superlinked!")
|
| 477 |
return pd.DataFrame() # Return empty dataframe
|
| 478 |
|
| 479 |
-
# Ensure summary is a string
|
| 480 |
-
if 'summary' in df_result.columns:
|
| 481 |
-
df_result['summary'] = df_result['summary'].astype(str)
|
| 482 |
-
else:
|
| 483 |
-
print("⚠️ Warning: 'summary' column not found in retrieved DataFrame.")
|
| 484 |
-
|
| 485 |
# ALWAYS merge with original dataframe to ensure complete data
|
| 486 |
if self.df is not None:
|
| 487 |
# Handle both 'entry_id' and 'id' column names from Superlinked
|
|
@@ -509,10 +503,13 @@ class RetrievalTool(Tool):
|
|
| 509 |
how='left',
|
| 510 |
suffixes=('', '_orig')
|
| 511 |
)
|
| 512 |
-
|
|
|
|
| 513 |
for col in ['title', 'summary', 'published']:
|
| 514 |
if f'{col}_orig' in df_result.columns:
|
| 515 |
-
|
|
|
|
|
|
|
| 516 |
|
| 517 |
# After merge, check if we have publication dates for debugging recency
|
| 518 |
if 'published' in df_result.columns:
|
|
@@ -704,20 +701,29 @@ def create_paper_cards(df_result):
|
|
| 704 |
if len(df_result) == 0:
|
| 705 |
return "<p>No papers found for your query.</p>"
|
| 706 |
|
| 707 |
-
print(f"
|
| 708 |
-
|
| 709 |
-
print(f"Row {i}: {dict(row)}")
|
| 710 |
|
| 711 |
-
#
|
| 712 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 713 |
|
| 714 |
# MINIMAL filtering - only remove completely broken rows
|
| 715 |
valid_results = []
|
| 716 |
for i, row in df_result.iterrows():
|
| 717 |
# Accept ALL rows that have any title data - don't be picky
|
| 718 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 719 |
|
| 720 |
-
print(f"Original results: {len(df_result)}, Valid results after filtering: {len(valid_results)}")
|
| 721 |
|
| 722 |
if len(valid_results) == 0:
|
| 723 |
return "<div style='padding: 20px; text-align: center; color: #666;'>No valid papers found for your query. Try a different search term.</div>"
|
|
@@ -891,7 +897,18 @@ def create_paper_cards(df_result):
|
|
| 891 |
for i, row in enumerate(valid_results):
|
| 892 |
# Extract data safely with better fallbacks
|
| 893 |
title = str(row.get('title', 'Unknown Title'))
|
| 894 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 895 |
|
| 896 |
# Try multiple ways to get entry_id
|
| 897 |
entry_id = None
|
|
@@ -903,24 +920,23 @@ def create_paper_cards(df_result):
|
|
| 903 |
if not entry_id:
|
| 904 |
entry_id = f"Paper_{i+1}" # Fallback ID
|
| 905 |
|
| 906 |
-
# Handle
|
| 907 |
-
if
|
| 908 |
-
summary_display =
|
| 909 |
else:
|
| 910 |
-
summary_display = summary
|
| 911 |
|
| 912 |
-
# Format publication date
|
| 913 |
-
|
| 914 |
-
|
| 915 |
-
|
| 916 |
-
|
| 917 |
-
|
| 918 |
-
|
| 919 |
-
|
| 920 |
-
|
| 921 |
-
|
| 922 |
-
|
| 923 |
-
year_only = "Unknown"
|
| 924 |
|
| 925 |
# LONGER title display - don't truncate so aggressively
|
| 926 |
title_display = title[:120] + "..." if len(title) > 120 else title
|
|
|
|
| 476 |
print("⚠️ No results returned from Superlinked!")
|
| 477 |
return pd.DataFrame() # Return empty dataframe
|
| 478 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 479 |
# ALWAYS merge with original dataframe to ensure complete data
|
| 480 |
if self.df is not None:
|
| 481 |
# Handle both 'entry_id' and 'id' column names from Superlinked
|
|
|
|
| 503 |
how='left',
|
| 504 |
suffixes=('', '_orig')
|
| 505 |
)
|
| 506 |
+
|
| 507 |
+
# CRITICAL FIX: Use original data when current data is missing or nan
|
| 508 |
for col in ['title', 'summary', 'published']:
|
| 509 |
if f'{col}_orig' in df_result.columns:
|
| 510 |
+
# Fill missing/nan values with original data
|
| 511 |
+
mask = df_result[col].isna() | (df_result[col].astype(str).str.lower() == 'nan')
|
| 512 |
+
df_result.loc[mask, col] = df_result.loc[mask, f'{col}_orig']
|
| 513 |
|
| 514 |
# After merge, check if we have publication dates for debugging recency
|
| 515 |
if 'published' in df_result.columns:
|
|
|
|
| 701 |
if len(df_result) == 0:
|
| 702 |
return "<p>No papers found for your query.</p>"
|
| 703 |
|
| 704 |
+
print(f"π¨ Creating cards for {len(df_result)} results...")
|
| 705 |
+
print(f"π Available columns: {list(df_result.columns)}")
|
|
|
|
| 706 |
|
| 707 |
+
# Show sample data for debugging
|
| 708 |
+
if len(df_result) > 0:
|
| 709 |
+
print(f"π Sample card data:")
|
| 710 |
+
for i, row in df_result.head(2).iterrows():
|
| 711 |
+
title = row.get('title', 'MISSING')
|
| 712 |
+
summary = str(row.get('summary', 'MISSING'))[:50]
|
| 713 |
+
published = row.get('published', 'MISSING')
|
| 714 |
+
print(f" Row {i}: title='{title}', summary='{summary}...', published={published}")
|
| 715 |
|
| 716 |
# MINIMAL filtering - only remove completely broken rows
|
| 717 |
valid_results = []
|
| 718 |
for i, row in df_result.iterrows():
|
| 719 |
# Accept ALL rows that have any title data - don't be picky
|
| 720 |
+
title = str(row.get('title', ''))
|
| 721 |
+
if title and title.lower() not in ['nan', 'none', 'null', '']:
|
| 722 |
+
valid_results.append(row)
|
| 723 |
+
else:
|
| 724 |
+
print(f"⚠️ Skipping row {i} with invalid title: '{title}'")
|
| 725 |
|
| 726 |
+
print(f"π Original results: {len(df_result)}, Valid results after filtering: {len(valid_results)}")
|
| 727 |
|
| 728 |
if len(valid_results) == 0:
|
| 729 |
return "<div style='padding: 20px; text-align: center; color: #666;'>No valid papers found for your query. Try a different search term.</div>"
|
|
|
|
| 897 |
for i, row in enumerate(valid_results):
|
| 898 |
# Extract data safely with better fallbacks
|
| 899 |
title = str(row.get('title', 'Unknown Title'))
|
| 900 |
+
|
| 901 |
+
# Try to get summary from multiple possible columns
|
| 902 |
+
summary = None
|
| 903 |
+
for summary_col in ['summary', 'summary_orig']:
|
| 904 |
+
if summary_col in row and pd.notna(row.get(summary_col)):
|
| 905 |
+
candidate_summary = str(row[summary_col])
|
| 906 |
+
if candidate_summary.lower() not in ['nan', 'none', 'null', '']:
|
| 907 |
+
summary = candidate_summary
|
| 908 |
+
break
|
| 909 |
+
|
| 910 |
+
if not summary:
|
| 911 |
+
summary = "Summary not available in search results"
|
| 912 |
|
| 913 |
# Try multiple ways to get entry_id
|
| 914 |
entry_id = None
|
|
|
|
| 920 |
if not entry_id:
|
| 921 |
entry_id = f"Paper_{i+1}" # Fallback ID
|
| 922 |
|
| 923 |
+
# Handle summary display
|
| 924 |
+
if len(summary) > 200:
|
| 925 |
+
summary_display = summary[:200] + "..."
|
| 926 |
else:
|
| 927 |
+
summary_display = summary
|
| 928 |
|
| 929 |
+
# Format publication date - try multiple columns
|
| 930 |
+
formatted_date = "Date not available"
|
| 931 |
+
for date_col in ['published', 'published_orig']:
|
| 932 |
+
if date_col in row and pd.notna(row.get(date_col)):
|
| 933 |
+
try:
|
| 934 |
+
pub_date = pd.to_datetime(row[date_col])
|
| 935 |
+
formatted_date = pub_date.strftime('%B %Y') # e.g., "March 2023"
|
| 936 |
+
break
|
| 937 |
+
except:
|
| 938 |
+
formatted_date = str(row[date_col])[:10]
|
| 939 |
+
break
|
|
|
|
| 940 |
|
| 941 |
# LONGER title display - don't truncate so aggressively
|
| 942 |
title_display = title[:120] + "..." if len(title) > 120 else title
|