Update search_utils.py
search_utils.py (CHANGED, +9 -81)
@@ -145,7 +145,7 @@ class MetadataManager:
         shard_path = self.shard_dir / shard
         if not shard_path.exists():
             logger.error(f"Shard file not found: {shard_path}")
-            return pd.DataFrame(columns=["title", "summary", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "similarity", "source"])
         file_size_mb = os.path.getsize(shard_path) / (1024 * 1024)
         logger.info(f"Loading shard file: {shard} (size: {file_size_mb:.2f} MB)")
         try:
@@ -158,7 +158,7 @@ class MetadataManager:
                 logger.info(f"Parquet schema: {schema}")
             except Exception:
                 pass
-            return pd.DataFrame(columns=["title", "summary", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "similarity", "source"])
         df = self.loaded_shards[shard]
         df_len = len(df)
         valid_local_indices = [idx for idx in local_indices if 0 <= idx < df_len]
@@ -170,13 +170,13 @@ class MetadataManager:
                 return chunk
         except Exception as e:
             logger.error(f"Error processing shard {shard}: {str(e)}", exc_info=True)
-            return pd.DataFrame(columns=["title", "summary", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "similarity", "source"])

     def get_metadata(self, global_indices):
         """Retrieve metadata for a batch of global indices using parallel shard processing."""
         if isinstance(global_indices, np.ndarray) and global_indices.size == 0:
             logger.warning("Empty indices array passed to get_metadata")
-            return pd.DataFrame(columns=["title", "summary", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "similarity", "source"])

         indices_list = global_indices.tolist() if isinstance(global_indices, np.ndarray) else global_indices
         logger.info(f"Retrieving metadata for {len(indices_list)} indices")
@@ -186,7 +186,7 @@ class MetadataManager:
             logger.warning(f"Filtered out {invalid_count} invalid indices")
         if not valid_indices:
             logger.warning("No valid indices remain after filtering")
-            return pd.DataFrame(columns=["title", "summary", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "similarity", "source"])

         # Group indices by shard
         shard_groups = {}
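Note: the trailing context of this hunk introduces the "# Group indices by shard" step. For readers unfamiliar with the pattern, here is a standalone sketch of the usual global-to-local index mapping behind such a step; the shard names, sizes, and the searchsorted approach are illustrative assumptions, not code from this repository:

    import numpy as np

    # Illustrative data: rows per shard and shard file names (assumed, not from the repo)
    shard_sizes = np.array([1000, 1000, 500])
    shard_names = ["shard_0.parquet", "shard_1.parquet", "shard_2.parquet"]
    offsets = np.concatenate([[0], np.cumsum(shard_sizes)])  # [0, 1000, 2000, 2500]

    # Map each global index to (shard, local index) and group by shard
    shard_groups: dict[str, list[int]] = {}
    for g in [5, 1001, 2400]:  # example global indices
        s = int(np.searchsorted(offsets, g, side="right")) - 1
        shard_groups.setdefault(shard_names[s], []).append(int(g - offsets[s]))
    print(shard_groups)
    # {'shard_0.parquet': [5], 'shard_1.parquet': [1], 'shard_2.parquet': [400]}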
@@ -216,69 +216,9 @@ class MetadataManager:
             return combined
         else:
             logger.warning("No metadata records retrieved")
-            return pd.DataFrame(columns=["title", "summary", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "similarity", "source"])

-    def _init_url_resolver(self):
-        """Initialize API session and cache."""
-        self.session = requests.Session()
-        adapter = requests.adapters.HTTPAdapter(
-            pool_connections=10,
-            pool_maxsize=10,
-            max_retries=3
-        )
-        self.session.mount("https://", adapter)
-
-    def resolve_url(self, title: str) -> str:
-        """Optimized URL resolution with fail-fast."""
-        if title in self.api_cache:
-            return self.api_cache[title]
-
-        links = {}
-        arxiv_url = self._get_arxiv_url(title)
-        if arxiv_url:
-            links["arxiv"] = arxiv_url
-        semantic_url = self._get_semantic_url(title)
-        if semantic_url:
-            links["semantic"] = semantic_url
-        scholar_url = f"https://scholar.google.com/scholar?q={quote(title)}"
-        links["google"] = scholar_url
-
-        self.api_cache[title] = links
-        return links
-
-    def _get_arxiv_url(self, title: str) -> str:
-        """Fast arXiv lookup with timeout."""
-        with self.session.get(
-            "http://export.arxiv.org/api/query",
-            params={"search_query": f'ti:"{title}"', "max_results": 1, "sortBy": "relevance"},
-            timeout=2
-        ) as response:
-            if response.ok:
-                return self._parse_arxiv_response(response.text)
-            return ""
-
-    def _parse_arxiv_response(self, xml: str) -> str:
-        """Fast XML parsing using string operations."""
-        if "<entry>" not in xml:
-            return ""
-        start = xml.find("<id>") + 4
-        end = xml.find("</id>", start)
-        return xml[start:end].replace("http:", "https:") if start > 3 else ""
-
-    def _get_semantic_url(self, title: str) -> str:
-        """Batch-friendly Semantic Scholar lookup."""
-        with self.session.get(
-            "https://api.semanticscholar.org/graph/v1/paper/search",
-            params={"query": title[:200], "limit": 1},
-            timeout=2
-        ) as response:
-            if response.ok:
-                data = response.json()
-                if data.get("data"):
-                    return data["data"][0].get("url", "")
-            return ""

-
 class SemanticSearch:
     def __init__(self):
         self.shard_dir = Path("compressed_shards")
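By the end of this hunk, all six early-return sites repeat the same four-column list. A possible follow-up (hypothetical, not part of this commit) would hoist that schema into a single helper:

    import pandas as pd

    # Hypothetical refactor; the constant and function names are illustrative
    EMPTY_RESULT_COLUMNS = ["title", "summary", "similarity", "source"]

    def empty_result() -> pd.DataFrame:
        """Empty frame with the schema every early-return path expects."""
        return pd.DataFrame(columns=EMPTY_RESULT_COLUMNS)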
@@ -429,9 +369,8 @@ class SemanticSearch:
             self.logger.debug(f"Similarity stats: min={results['similarity'].min():.3f}, " +
                               f"max={results['similarity'].max():.3f}, " +
                               f"mean={results['similarity'].mean():.3f}")
-            results['source'] = results[
-
-            )
+            results['source'] = results["source"]
+
             pre_dedup = len(results)
             results = results.drop_duplicates(subset=["title", "source"]).sort_values("similarity", ascending=False).head(top_k)
             post_dedup = len(results)
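The drop_duplicates call in the trailing context is the likely motivation for adding "source" to every empty frame: pandas raises KeyError when a subset column is missing, even on an empty DataFrame. A minimal standalone repro:

    import pandas as pd

    # Before this commit: the empty result lacks the dedup key column
    empty = pd.DataFrame(columns=["title", "summary", "similarity"])
    try:
        empty.drop_duplicates(subset=["title", "source"])
    except KeyError as exc:
        print("KeyError:", exc)

    # After: the extra column keeps the empty path safe
    ok = pd.DataFrame(columns=["title", "summary", "similarity", "source"])
    print(ok.drop_duplicates(subset=["title", "source"]).empty)  # True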
@@ -441,15 +380,4 @@ class SemanticSearch:
             return results.reset_index(drop=True)
         except Exception as e:
             self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
-            return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
-
-    def _format_source_links(self, links):
-        """Generate an HTML snippet for the available source links."""
-        html_parts = []
-        if "arxiv" in links:
-            html_parts.append(f"<a class='source-link' href='{links['arxiv']}' target='_blank' rel='noopener noreferrer'> π arXiv</a>")
-        if "semantic" in links:
-            html_parts.append(f"<a class='source-link' href='{links['semantic']}' target='_blank' rel='noopener noreferrer'> π Semantic Scholar</a>")
-        if "google" in links:
-            html_parts.append(f"<a class='source-link' href='{links['google']}' target='_blank' rel='noopener noreferrer'> π Google Scholar</a>")
-        return " | ".join(html_parts)
+            return pd.DataFrame(columns=["title", "summary", "source", "similarity"])