"""LangChain tool definitions for a GAIA-style agent.

Provides arithmetic tools, Wikipedia / DuckDuckGo / Semantic Scholar search
tools, and a task-attachment downloader.
"""

from typing import Dict, List
import io
import os
import re
from difflib import SequenceMatcher
from urllib.parse import urlparse

import fitz  # PyMuPDF
import pandas as pd
import requests
import wikipediaapi
from langchain_core.tools import tool

# Timeout (seconds) applied to every outbound HTTP request so a slow or
# unresponsive endpoint cannot hang the agent indefinitely.
REQUEST_TIMEOUT = 15

# Base URL of the task-file API used by `download_file`.
# NOTE(review): the original code referenced DEFAULT_API_URL without defining
# or importing it (a NameError at call time). It is now read from the
# environment; set AGENT_API_URL to the scoring-service base URL before use.
DEFAULT_API_URL = os.environ.get("AGENT_API_URL", "")


def clean(text: str) -> str:
    """Lower-case *text* and drop every character except letters, digits and spaces."""
    return re.sub(r'[^a-zA-Z0-9 ]', '', text.lower())


def extract_relevant_table_info(
    query: str,
    tables: List[pd.DataFrame],
    min_score: float = 0.2,
) -> Dict[str, str]:
    """Compact the columns of each table that look relevant to *query*.

    Each column name is fuzzily compared (``difflib.SequenceMatcher`` ratio)
    against the cleaned query; columns scoring below *min_score* are dropped,
    and tables with no surviving column are skipped entirely. For the kept
    columns, the first three non-null rows are flattened into a short
    ``col=value`` string.

    Args:
        query: The user question driving the relevance scoring.
        tables: DataFrames (e.g. from ``pd.read_html``) to filter.
        min_score: Minimum similarity ratio for a column to be kept.

    Returns:
        Mapping of ``"table_<i>"`` to that table's compacted text.
    """
    query_clean = clean(query)
    results: Dict[str, str] = {}
    for i, df in enumerate(tables):
        column_scores = []
        for col in df.columns:
            score = SequenceMatcher(None, query_clean, clean(str(col))).ratio()
            column_scores.append((col, score))

        # Keep columns above threshold.
        relevant_cols = [col for col, score in column_scores if score >= min_score]
        if not relevant_cols:
            continue  # skip irrelevant tables

        compact_str = ", ".join(
            f"{row[relevant_cols[0]]}="
            + ", ".join(f"{col}={row[col]}" for col in relevant_cols[1:])
            for _, row in df[relevant_cols].dropna().head(3).iterrows()
        )
        results[f"table_{i}"] = compact_str
    return results


@tool
def add(a: int, b: int) -> int:
    """
    Sums two values and returns the result of the sum

    Args:
        a: first number
        b: second number
    """
    return a + b


@tool
def subtract(a: int, b: int) -> int:
    """
    Subtracts the second value from the first and returns the difference

    Args:
        a: first number
        b: second number
    """
    return a - b


@tool
def multiply(a: int, b: int) -> int:
    """
    Multiplies two values and returns the product

    Args:
        a: first number
        b: second number
    """
    return a * b


@tool
def divide(a: int, b: int) -> float:
    """
    Divides one value by another and returns the quotient

    Args:
        a: numerator
        b: denominator

    Raises:
        ValueError: if the denominator is zero.
    """
    if b == 0:
        raise ValueError("Cannot divide by zero.")
    # True division: the result is a float, hence the -> float annotation.
    return a / b


@tool
def search_wikipedia(query: str, page_title: str, language: str) -> str:
    """
    This tool searches Wikipedia for a specific page and returns its text and any HTML tables it contains.
    The function is capable of retrieving the most relevant information given the original query.

    Args:
        query: The original question that prompted the use of the function.
        page_title: Title of the Wikipedia page.
        language: Language code (e.g., "en", "es", "fr").

    Returns:
        A string containing the page title, text, and any extracted tables in markdown format.
    """
    try:
        wiki_wiki = wikipediaapi.Wikipedia(
            user_agent='AIAgent (gabriel_abilleira@tutanota.com)',
            language=language,
            extract_format=wikipediaapi.ExtractFormat.HTML
        )
        page = wiki_wiki.page(page_title)
        if not page.exists():
            return f"Error: Page '{page_title}' not found in language '{language}'."

        # Fetch the rendered page by URL so pandas can parse its HTML tables,
        # then keep only the columns relevant to the query.
        tables = pd.read_html(page.fullurl)
        markdown_tables = extract_relevant_table_info(query, tables, min_score=0.2)

        table_output = (
            "\n".join(list(markdown_tables.values()))
            if markdown_tables
            else "No tables found on this page."
        )
        return f"Text: {page.summary[:500]}\n\n{table_output}"
    except Exception as e:
        # Best-effort tool: report the failure to the agent instead of raising.
        return f"Error retrieving Wikipedia content: {str(e)}"


@tool
def duckduckgo_search(query: str) -> str:
    """Use DuckDuckGo to search the web for up-to-date information.

    Args:
        query: The query to search for on the web. It may be a literal url (e.g. https://www.youtube.com/watch?v=7ybEg14CP1g)
    """
    url = "https://api.duckduckgo.com/"
    params = {
        "q": query,
        "format": "json",
        "no_redirect": 1,
        "no_html": 1,
        "skip_disambig": 1,
    }
    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
        data = response.json()

        # Prefer the most directly useful fields, in order.
        if data.get("AbstractText"):
            return data["AbstractText"]
        elif data.get("Answer"):
            return data["Answer"]
        elif data.get("RelatedTopics"):
            # Fall back to a few related results.
            results = data["RelatedTopics"][:3]
            return "\n".join(rt.get("Text", "") for rt in results if "Text" in rt)
        else:
            return "No good results found."
    except Exception as e:
        return f"Search failed: {e}"


@tool
def search_papers(query: str) -> str:
    """Search for academic papers and retrieve their content when possible."""
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {
        "query": query,
        "limit": 3,
        "fields": "title,abstract,authors,url,year"
    }
    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
        data = response.json()
        if not data.get("data"):
            return "No papers found."

        results = []
        for paper in data["data"]:
            title = paper.get("title", "No title")
            authors = ", ".join([a.get("name", "") for a in paper.get("authors", [])])
            year = paper.get("year", "n.d.")
            abstract = paper.get("abstract", "No abstract available.")
            link = paper.get("url", "")
            full_text = "Full text not available."

            # Attempt to download and parse the PDF (arXiv links only).
            if "arxiv.org" in link:
                pdf_url = link.replace("abs", "pdf") + ".pdf"
                try:
                    pdf_response = requests.get(pdf_url, timeout=REQUEST_TIMEOUT)
                    doc = fitz.open(stream=pdf_response.content, filetype="pdf")
                    # Pages 4-10 (0-indexed slice 3:10): skips title/abstract
                    # front matter already covered by the metadata above.
                    full_text = "\n".join(page.get_text() for page in doc[3:10])
                    doc.close()
                except Exception as pdf_err:
                    full_text = f"Failed to retrieve full text: {pdf_err}"

            result = f"""**{title}** ({year}) by {authors}
Abstract: {abstract}
Link: {link}
Full Text (first pages):\n{full_text}"""
            results.append(result)

        return "\n\n---\n\n".join(results)
    except Exception as e:
        return f"Error fetching papers: {e}"


@tool
def download_file(task_id: str) -> str:
    """
    Downloads a file associated with the given task ID.
    Returns the file path where the file is saved locally.

    Args:
        task_id: The task id to download attachment from.
    """
    file_url = f"{DEFAULT_API_URL}/files/{task_id}"
    local_file_path = f"downloads/{task_id}.file"
    print(f"Downloading file for task ID {task_id} from {file_url}...")
    try:
        response = requests.get(file_url, stream=True, timeout=15)
        response.raise_for_status()
        os.makedirs("downloads", exist_ok=True)
        # Stream to disk in chunks so large attachments don't sit in memory.
        with open(local_file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"File downloaded successfully: {local_file_path}")
        return local_file_path
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file for task {task_id}: {e}")
        raise