ScholarAgent

Running

App Files Files Community

pdx97 committed on Mar 9, 2025

Commit

8267210

verified ·

1 Parent(s): fcfda0a

Updated app.py

Browse files

Added TF-IDF method for better semantic search

Files changed (1) hide show

app.py +98 -25

app.py CHANGED Viewed

@@ -49,40 +49,108 @@ from smolagents import CodeAgent, HfApiModel, tool
 #         print(f"ERROR: {str(e)}")  # Debug errors
 #         return [f"Error fetching research papers: {str(e)}"]
-from rank_bm25 import BM25Okapi
-import nltk
-import os
-import shutil
-nltk_data_path = os.path.join(nltk.data.path[0], "tokenizers", "punkt")
-if os.path.exists(nltk_data_path):
-    shutil.rmtree(nltk_data_path)  # Remove corrupted version
-print("✅ Removed old NLTK 'punkt' data. Reinstalling...")
-# ✅ Step 2: Download the correct 'punkt' tokenizer
-nltk.download("punkt_tab")
-print("✅ Successfully installed 'punkt'!")
-@tool  # Register the function properly as a SmolAgents tool
 def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
-    """Fetches and ranks arXiv papers using BM25 keyword relevance.
     Args:
         keywords: List of keywords for search.
         num_results: Number of results to return.
     Returns:
-        List of the most relevant papers based on BM25 ranking.
     """
     try:
         print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
-        # Use a general keyword search (without `ti:` and `abs:`)
         query = "+AND+".join([f"all:{kw}" for kw in keywords])
         query_encoded = urllib.parse.quote(query)
         url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
@@ -105,17 +173,22 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
         if not papers:
             return [{"error": "No results found. Try different keywords."}]
-        # Apply BM25 ranking
-        tokenized_corpus = [nltk.word_tokenize(paper["title"].lower() + " " + paper["abstract"].lower()) for paper in papers]
-        bm25 = BM25Okapi(tokenized_corpus)
-        tokenized_query = nltk.word_tokenize(" ".join(keywords).lower())
-        scores = bm25.get_scores(tokenized_query)
-        # Sort papers based on BM25 score
-        ranked_papers = sorted(zip(papers, scores), key=lambda x: x[1], reverse=True)
-        # Return the most relevant ones
         return [paper[0] for paper in ranked_papers[:num_results]]
     except Exception as e:
@@ -188,11 +261,11 @@ def search_papers(user_input):
     results = fetch_latest_arxiv_papers(keywords, num_results=3)  # Fetch 3 results
     print(f"DEBUG: Results received - {results}")  # Debug function output
-    # ✅ Check if the API returned an error
     if isinstance(results, list) and len(results) > 0 and "error" in results[0]:
         return results[0]["error"]  # Return the error message directly
-    # ✅ Format results only if valid papers exist
     if isinstance(results, list) and results and isinstance(results[0], dict):
         formatted_results = "\n\n".join([
             f"---\n\n"

 #         print(f"ERROR: {str(e)}")  # Debug errors
 #         return [f"Error fetching research papers: {str(e)}"]
+#"""------Applied BM25 search for paper retrieval------"""
+# from rank_bm25 import BM25Okapi
+# import nltk
+# import os
+# import shutil
+# nltk_data_path = os.path.join(nltk.data.path[0], "tokenizers", "punkt")
+# if os.path.exists(nltk_data_path):
+#     shutil.rmtree(nltk_data_path)  # Remove corrupted version
+# print("Removed old NLTK 'punkt' data. Reinstalling...")
+# # Step 2: Download the correct 'punkt' tokenizer
+# nltk.download("punkt_tab")
+# print("Successfully installed 'punkt'!")
+# @tool  # Register the function properly as a SmolAgents tool
+# def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
+#     """Fetches and ranks arXiv papers using BM25 keyword relevance.
+#     Args:
+#         keywords: List of keywords for search.
+#         num_results: Number of results to return.
+#     Returns:
+#         List of the most relevant papers based on BM25 ranking.
+#     """
+#     try:
+#         print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
+#         # Use a general keyword search (without `ti:` and `abs:`)
+#         query = "+AND+".join([f"all:{kw}" for kw in keywords])
+#         query_encoded = urllib.parse.quote(query)
+#         url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
+#         print(f"DEBUG: Query URL - {url}")
+#         feed = feedparser.parse(url)
+#         papers = []
+#         # Extract papers from arXiv
+#         for entry in feed.entries:
+#             papers.append({
+#                 "title": entry.title,
+#                 "authors": ", ".join(author.name for author in entry.authors),
+#                 "year": entry.published[:4],
+#                 "abstract": entry.summary,
+#                 "link": entry.link
+#             })
+#         if not papers:
+#             return [{"error": "No results found. Try different keywords."}]
+#         # Apply BM25 ranking
+#         tokenized_corpus = [nltk.word_tokenize(paper["title"].lower() + " " + paper["abstract"].lower()) for paper in papers]
+#         bm25 = BM25Okapi(tokenized_corpus)
+#         tokenized_query = nltk.word_tokenize(" ".join(keywords).lower())
+#         scores = bm25.get_scores(tokenized_query)
+#         # Sort papers based on BM25 score
+#         ranked_papers = sorted(zip(papers, scores), key=lambda x: x[1], reverse=True)
+#         # Return the most relevant ones
+#         return [paper[0] for paper in ranked_papers[:num_results]]
+#     except Exception as e:
+#         print(f"ERROR: {str(e)}")
+#         return [{"error": f"Error fetching research papers: {str(e)}"}]
+"""------Applied TF-IDF for better semantic search------"""
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import gradio as gr
+from smolagents import CodeAgent, HfApiModel, tool
+import nltk
+nltk.download("stopwords")
+from nltk.corpus import stopwords
+@tool  # ✅ Register the function properly as a SmolAgents tool
 def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
+    """Fetches and ranks arXiv papers using TF-IDF and Cosine Similarity.
     Args:
         keywords: List of keywords for search.
         num_results: Number of results to return.
     Returns:
+        List of the most relevant papers based on TF-IDF ranking.
     """
     try:
         print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
+        # Use a general keyword search
         query = "+AND+".join([f"all:{kw}" for kw in keywords])
         query_encoded = urllib.parse.quote(query)
         url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
         if not papers:
             return [{"error": "No results found. Try different keywords."}]
+        # Prepare TF-IDF Vectorization
+        corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
+        vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))  # Remove stopwords
+        tfidf_matrix = vectorizer.fit_transform(corpus)
+        # Transform Query into TF-IDF Vector
+        query_str = " ".join(keywords)
+        query_vec = vectorizer.transform([query_str])
+        #Compute Cosine Similarity
+        similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
+        #Sort papers based on similarity score
+        ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)
+        # Return the most relevant papers
         return [paper[0] for paper in ranked_papers[:num_results]]
     except Exception as e:
     results = fetch_latest_arxiv_papers(keywords, num_results=3)  # Fetch 3 results
     print(f"DEBUG: Results received - {results}")  # Debug function output
+    # Check if the API returned an error
     if isinstance(results, list) and len(results) > 0 and "error" in results[0]:
         return results[0]["error"]  # Return the error message directly
+    # Format results only if valid papers exist
     if isinstance(results, list) and results and isinstance(results[0], dict):
         formatted_results = "\n\n".join([
             f"---\n\n"