Spaces:

superlinked
/

research_paper_ai_agent

Sleeping

App Files Community

Filip Makraduli committed on Jun 3, 2025

Commit

e9012d2

1 Parent(s): ec8c22e

better nan handling

Browse files

Files changed (1) hide show

research_ai_agent.py +48 -32

research_ai_agent.py CHANGED Viewed

@@ -476,12 +476,6 @@ class RetrievalTool(Tool):
             print("⚠️  No results returned from Superlinked!")
             return pd.DataFrame()  # Return empty dataframe
-        # Ensure summary is a string
-        if 'summary' in df_result.columns:
-            df_result['summary'] = df_result['summary'].astype(str)
-        else:
-            print("⚠️  Warning: 'summary' column not found in retrieved DataFrame.")
         # ALWAYS merge with original dataframe to ensure complete data
         if self.df is not None:
             # Handle both 'entry_id' and 'id' column names from Superlinked
@@ -509,10 +503,13 @@ class RetrievalTool(Tool):
                     how='left',
                     suffixes=('', '_orig')
                 )
-                # Use original data if current is missing
                 for col in ['title', 'summary', 'published']:
                     if f'{col}_orig' in df_result.columns:
-                        df_result[col] = df_result[col].fillna(df_result[f'{col}_orig'])
                 # After merge, check if we have publication dates for debugging recency
                 if 'published' in df_result.columns:
@@ -704,20 +701,29 @@ def create_paper_cards(df_result):
     if len(df_result) == 0:
         return "<p>No papers found for your query.</p>"
-    print(f"Sample data from df_result:")
-    for i, row in df_result.head(2).iterrows():
-        print(f"Row {i}: {dict(row)}")
-    # Data should already be merged in RetrievalTool
-    print(f"📋 Columns available: {list(df_result.columns)}")
     # MINIMAL filtering - only remove completely broken rows
     valid_results = []
     for i, row in df_result.iterrows():
         # Accept ALL rows that have any title data - don't be picky
-        valid_results.append(row)
-    print(f"Original results: {len(df_result)}, Valid results after filtering: {len(valid_results)}")
     if len(valid_results) == 0:
         return "<div style='padding: 20px; text-align: center; color: #666;'>No valid papers found for your query. Try a different search term.</div>"
@@ -891,7 +897,18 @@ def create_paper_cards(df_result):
     for i, row in enumerate(valid_results):
         # Extract data safely with better fallbacks
         title = str(row.get('title', 'Unknown Title'))
-        summary = str(row.get('summary', ''))
         # Try multiple ways to get entry_id
         entry_id = None
@@ -903,24 +920,23 @@ def create_paper_cards(df_result):
         if not entry_id:
             entry_id = f"Paper_{i+1}"  # Fallback ID
-        # Handle missing summary gracefully
-        if pd.isna(row.get('summary')) or summary.lower().strip() in ['nan', '', 'none', 'null']:
-            summary_display = "Summary not available in search results"
         else:
-            summary_display = summary[:200] + "..." if len(summary) > 200 else summary
-        # Format publication date
-        if 'published' in df_result.columns and pd.notna(row.get('published')):
-            try:
-                pub_date = pd.to_datetime(row['published'])
-                formatted_date = pub_date.strftime('%B %Y')  # e.g., "March 2023"
-                year_only = pub_date.strftime('%Y')
-            except:
-                formatted_date = str(row['published'])[:10]
-                year_only = formatted_date[:4] if len(str(row['published'])) >= 4 else "Unknown"
-        else:
-            formatted_date = "Date not available"
-            year_only = "Unknown"
         # LONGER title display - don't truncate so aggressively
         title_display = title[:120] + "..." if len(title) > 120 else title

             print("⚠️  No results returned from Superlinked!")
             return pd.DataFrame()  # Return empty dataframe
         # ALWAYS merge with original dataframe to ensure complete data
         if self.df is not None:
             # Handle both 'entry_id' and 'id' column names from Superlinked
                     how='left',
                     suffixes=('', '_orig')
                 )
+                # CRITICAL FIX: Use original data when current data is missing or nan
                 for col in ['title', 'summary', 'published']:
                     if f'{col}_orig' in df_result.columns:
+                        # Fill missing/nan values with original data
+                        mask = df_result[col].isna() | (df_result[col].astype(str).str.lower() == 'nan')
+                        df_result.loc[mask, col] = df_result.loc[mask, f'{col}_orig']
                 # After merge, check if we have publication dates for debugging recency
                 if 'published' in df_result.columns:
     if len(df_result) == 0:
         return "<p>No papers found for your query.</p>"
+    print(f"🎨 Creating cards for {len(df_result)} results...")
+    print(f"📋 Available columns: {list(df_result.columns)}")
+    # Show sample data for debugging
+    if len(df_result) > 0:
+        print(f"🔍 Sample card data:")
+        for i, row in df_result.head(2).iterrows():
+            title = row.get('title', 'MISSING')
+            summary = str(row.get('summary', 'MISSING'))[:50]
+            published = row.get('published', 'MISSING')
+            print(f"  Row {i}: title='{title}', summary='{summary}...', published={published}")
     # MINIMAL filtering - only remove completely broken rows
     valid_results = []
     for i, row in df_result.iterrows():
         # Accept ALL rows that have any title data - don't be picky
+        title = str(row.get('title', ''))
+        if title and title.lower() not in ['nan', 'none', 'null', '']:
+            valid_results.append(row)
+        else:
+            print(f"⚠️  Skipping row {i} with invalid title: '{title}'")
+    print(f"📊 Original results: {len(df_result)}, Valid results after filtering: {len(valid_results)}")
     if len(valid_results) == 0:
         return "<div style='padding: 20px; text-align: center; color: #666;'>No valid papers found for your query. Try a different search term.</div>"
     for i, row in enumerate(valid_results):
         # Extract data safely with better fallbacks
         title = str(row.get('title', 'Unknown Title'))
+        # Try to get summary from multiple possible columns
+        summary = None
+        for summary_col in ['summary', 'summary_orig']:
+            if summary_col in row and pd.notna(row.get(summary_col)):
+                candidate_summary = str(row[summary_col])
+                if candidate_summary.lower() not in ['nan', 'none', 'null', '']:
+                    summary = candidate_summary
+                    break
+        if not summary:
+            summary = "Summary not available in search results"
         # Try multiple ways to get entry_id
         entry_id = None
         if not entry_id:
             entry_id = f"Paper_{i+1}"  # Fallback ID
+        # Handle summary display
+        if len(summary) > 200:
+            summary_display = summary[:200] + "..."
         else:
+            summary_display = summary
+        # Format publication date - try multiple columns
+        formatted_date = "Date not available"
+        for date_col in ['published', 'published_orig']:
+            if date_col in row and pd.notna(row.get(date_col)):
+                try:
+                    pub_date = pd.to_datetime(row[date_col])
+                    formatted_date = pub_date.strftime('%B %Y')  # e.g., "March 2023"
+                    break
+                except:
+                    formatted_date = str(row[date_col])[:10]
+                    break
         # LONGER title display - don't truncate so aggressively
         title_display = title[:120] + "..." if len(title) > 120 else title