First_agent_template

Build error

App Files Files Community

First_agent_template / app.py

ngockhoinguyenpy

Update app.py

b814551 verified 11 months ago

raw

history blame contribute delete

8.4 kB

	import feedparser
	import urllib.parse
	import yaml
	import gradio as gr
	from smolagents import CodeAgent, HfApiModel, tool

	# @tool
	# def fetch_latest_arxiv_papers(keywords: list, num_results: int = 3) -> list:
	# """Fetches the latest research papers from arXiv based on provided keywords.

	# Args:
	# keywords: A list of keywords to search for relevant papers.
	# num_results: The number of papers to fetch (default is 3).

	# Returns:
	# A list of dictionaries containing:
	# - "title": The title of the research paper.
	# - "authors": The authors of the paper.
	# - "year": The publication year.
	# - "abstract": A summary of the research paper.
	# - "link": A direct link to the paper on arXiv.
	# """
	# try:
	# print(f"DEBUG: Searching arXiv papers with keywords: {keywords}") # Debug input

	# #Properly format query with +AND+ for multiple keywords
	# query = "+AND+".join([f"all:{kw}" for kw in keywords])
	# query_encoded = urllib.parse.quote(query) # Encode spaces and special characters

	# url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results={num_results}&sortBy=submittedDate&sortOrder=descending"

	# print(f"DEBUG: Query URL - {url}") # Debug URL

	# feed = feedparser.parse(url)

	# papers = []
	# for entry in feed.entries:
	# papers.append({
	# "title": entry.title,
	# "authors": ", ".join(author.name for author in entry.authors),
	# "year": entry.published[:4], # Extract year
	# "abstract": entry.summary,
	# "link": entry.link
	# })

	# return papers

	# except Exception as e:
	# print(f"ERROR: {str(e)}") # Debug errors
	# return [f"Error fetching research papers: {str(e)}"]

	from rank_bm25 import BM25Okapi
	import nltk

	import os
	import shutil


	# Startup workaround: delete any legacy 'punkt' tokenizer directory found in
	# the first NLTK data path (the original author treats it as a possibly
	# corrupted copy) before fetching the tokenizer data used by this app.
	nltk_data_path = os.path.join(nltk.data.path[0], "tokenizers", "punkt")
	if os.path.exists(nltk_data_path):
	    shutil.rmtree(nltk_data_path)  # Remove corrupted version
	    # Announce the removal only when something was actually removed.
	    print("✅ Removed old NLTK 'punkt' data. Reinstalling...")

	# Download the 'punkt_tab' tokenizer resource. nltk.download() signals failure
	# through its boolean return value, so check it instead of always printing
	# a success message.
	if nltk.download("punkt_tab"):
	    print("✅ Successfully installed 'punkt_tab'!")
	else:
	    print("⚠️ Failed to download NLTK 'punkt_tab'; tokenization may not work.")


	@tool  # Register the function properly as a SmolAgents tool
	def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
	    """Fetches and ranks arXiv papers using BM25 keyword relevance.

	    Args:
	        keywords: List of keywords for search (a comma-separated string is also accepted).
	        num_results: Number of results to return.

	    Returns:
	        List of the most relevant papers based on BM25 ranking. Each paper is a
	        dict with the keys "title", "authors", "year", "abstract" and "link".
	        On failure, or when nothing matches, a one-element list holding a dict
	        with a single "error" key is returned instead.
	    """
	    try:
	        print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")

	        # A bare string would otherwise be iterated character by character.
	        if isinstance(keywords, str):
	            keywords = keywords.split(",")
	        keywords = [str(kw).strip() for kw in (keywords or []) if str(kw).strip()]
	        if not keywords:
	            return [{"error": "No results found. Try different keywords."}]

	        # A negative count would make the final slice drop items from the end.
	        limit = max(int(num_results), 0)

	        # Use a general keyword search (without `ti:` and `abs:`).
	        # Join with real spaces and let urlencode() produce the '+' separators:
	        # quoting a string that already contains '+' turns it into '%2B', which
	        # the server decodes as a literal plus sign instead of a space.
	        search_query = " AND ".join(f"all:{kw}" for kw in keywords)
	        params = urllib.parse.urlencode({
	            "search_query": search_query,
	            "start": 0,
	            # Fetch a pool of candidates to re-rank; never fewer than requested.
	            "max_results": max(50, limit),
	            "sortBy": "submittedDate",
	            "sortOrder": "descending",
	        })
	        url = f"http://export.arxiv.org/api/query?{params}"

	        print(f"DEBUG: Query URL - {url}")

	        feed = feedparser.parse(url)

	        # feedparser does not raise on fetch/parse problems; it sets `bozo`.
	        # Report that as an error rather than as "no results".
	        if feed.get("bozo") and not feed.entries:
	            reason = feed.get("bozo_exception", "unknown error")
	            return [{"error": f"Error fetching research papers: {reason}"}]

	        # Extract papers from arXiv, tolerating entries with missing fields.
	        papers = [
	            {
	                "title": entry.get("title", ""),
	                "authors": ", ".join(
	                    author.get("name", "") for author in entry.get("authors", [])
	                ),
	                "year": entry.get("published", "")[:4],
	                "abstract": entry.get("summary", ""),
	                "link": entry.get("link", ""),
	            }
	            for entry in feed.entries
	        ]

	        if not papers:
	            return [{"error": "No results found. Try different keywords."}]

	        # Apply BM25 ranking over title + abstract tokens.
	        tokenized_corpus = [
	            nltk.word_tokenize(f"{paper['title']} {paper['abstract']}".lower())
	            for paper in papers
	        ]
	        bm25 = BM25Okapi(tokenized_corpus)

	        tokenized_query = nltk.word_tokenize(" ".join(keywords).lower())
	        scores = bm25.get_scores(tokenized_query)

	        # Sort papers based on BM25 score (highest first; ties keep feed order).
	        ranked_papers = sorted(zip(papers, scores), key=lambda pair: pair[1], reverse=True)

	        # Return the most relevant ones
	        return [paper for paper, _score in ranked_papers[:limit]]

	    except Exception as e:
	        # Tool boundary: surface any failure to the caller as an error entry.
	        print(f"ERROR: {str(e)}")
	        return [{"error": f"Error fetching research papers: {str(e)}"}]


	# AI Model
	# Alternate Hugging Face inference endpoint URL. It is NOT passed to
	# HfApiModel below (which uses the hub model name); presumably kept as a
	# fallback to switch to when the hosted model is unavailable.
	model_id = 'https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud'

	model = HfApiModel(
	    max_tokens=2096,
	    temperature=0.5,
	    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
	    custom_role_conversions=None,
	)

	# Load prompt templates.
	# Read explicitly as UTF-8 so decoding does not depend on the host locale.
	with open("prompts.yaml", "r", encoding="utf-8") as stream:
	    prompt_templates = yaml.safe_load(stream)

	# Create the AI Agent
	agent = CodeAgent(
	    model=model,
	    tools=[fetch_latest_arxiv_papers],  # Properly registered tool
	    max_steps=6,
	    verbosity_level=1,
	    grammar=None,
	    planning_interval=None,
	    name="ScholarAgent",
	    description="An AI agent that fetches the latest research papers from arXiv based on user-defined keywords and filters.",
	    prompt_templates=prompt_templates,
	)

	# # Define Gradio Search Function
	# def search_papers(user_input):
	# keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()] # Ensure valid keywords
	# print(f"DEBUG: Received input keywords - {keywords}") # Debug user input

	# if not keywords:
	# print("DEBUG: No valid keywords provided.")
	# return "Error: Please enter at least one valid keyword."

	# results = fetch_latest_arxiv_papers(keywords, num_results=3) # Fetch 3 results
	# print(f"DEBUG: Results received - {results}") # Debug function output

	# if isinstance(results, list) and results and isinstance(results[0], dict):
	# #Format output with better readability and clarity
	# formatted_results = "\n\n".join([
	# f"---\n\n"
	# f"📌 Title:\n{paper['title']}\n\n"
	# f"👨‍🔬 Authors:\n{paper['authors']}\n\n"
	# f"📅 Year: {paper['year']}\n\n"
	# f"📖 Abstract:\n{paper['abstract'][:500]}... (truncated for readability)\n\n"
	# f"[🔗 Read Full Paper]({paper['link']})\n\n"
	# for paper in results
	# ])
	# return formatted_results

	# print("DEBUG: No results found.")
	# return "No results found. Try different keywords."

	#Search Papers
	def search_papers(user_input):
	    """Search arXiv for the given keywords and return Markdown for display.

	    Args:
	        user_input: Comma-separated keywords typed by the user. ``None`` or
	            blank input is treated as "no keywords".

	    Returns:
	        A Markdown string with one section per paper (title, authors, year,
	        abstract limited to 500 characters, link), or a plain message when
	        the input is empty, the lookup reports an error, or nothing is found.
	    """
	    # Ensure valid keywords; `or ""` guards against a None value.
	    keywords = [kw.strip() for kw in (user_input or "").split(",") if kw.strip()]
	    print(f"DEBUG: Received input keywords - {keywords}")  # Debug user input

	    if not keywords:
	        print("DEBUG: No valid keywords provided.")
	        return "Error: Please enter at least one valid keyword."

	    results = fetch_latest_arxiv_papers(keywords, num_results=3)  # Fetch 3 results
	    print(f"DEBUG: Results received - {results}")  # Debug function output

	    # Validate the shape first so the "error" lookup below is always a dict
	    # key test and never a substring test on a string item.
	    if not isinstance(results, list) or not results or not isinstance(results[0], dict):
	        print("DEBUG: No results found.")
	        return "No results found. Try different keywords."

	    # The lookup reports failures as a single {"error": ...} entry.
	    if "error" in results[0]:
	        return results[0]["error"]  # Return the error message directly

	    sections = []
	    for paper in results:
	        # Feed text can contain embedded newlines; collapse whitespace so the
	        # Markdown layout is not broken.
	        title = " ".join(str(paper["title"]).split())
	        abstract = " ".join(str(paper["abstract"]).split())
	        # Only claim truncation when the abstract was actually shortened.
	        if len(abstract) > 500:
	            abstract = f"{abstract[:500]}... (truncated for readability)"
	        sections.append(
	            f"---\n\n"
	            f"📌 Title: {title}\n\n"
	            f"👨‍🔬 Authors: {paper['authors']}\n\n"
	            f"📅 Year: {paper['year']}\n\n"
	            f"📖 Abstract: {abstract}\n\n"
	            f"[🔗 Read Full Paper]({paper['link']})\n\n"
	        )
	    return "\n\n".join(sections)


	# Build the Gradio interface: a keyword box, a Search button and a Markdown
	# area that shows whatever search_papers returns.
	with gr.Blocks() as demo:
	    gr.Markdown("# ScholarAgent")
	    keyword_input = gr.Textbox(
	        label="Enter keywords (comma-separated)",
	        placeholder="e.g., deep learning, reinforcement learning",
	    )
	    output_display = gr.Markdown()
	    search_button = gr.Button("Search")

	    # Wire the button: textbox content in, rendered Markdown out.
	    search_button.click(
	        fn=search_papers,
	        inputs=[keyword_input],
	        outputs=[output_display],
	    )

	print("DEBUG: Gradio UI is running. Waiting for user input...")

	# Start serving the app.
	demo.launch()