Spaces:
Sleeping
Sleeping
import io
import os
import re
from difflib import SequenceMatcher
from typing import Dict, List
from urllib.parse import urlparse

import fitz  # PyMuPDF
import pandas as pd
import requests
import wikipediaapi
from langchain_core.tools import tool
def clean(text):
    """Lowercase *text* and strip every character that is not
    alphanumeric or a space."""
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z0-9 ]', '', lowered)
def extract_relevant_table_info(query: str, tables: List[pd.DataFrame], min_score: float = 0.2) -> Dict[str, str]:
    """
    Select columns from each table that fuzzily match the query and
    summarize the first few rows as compact "col=value" strings.

    Args:
        query: The user question used to score column relevance.
        tables: DataFrames (e.g. from ``pd.read_html``) to summarize.
        min_score: Minimum SequenceMatcher ratio for a column to be kept.

    Returns:
        Mapping of ``"table_{i}"`` -> summary string. Tables with no
        relevant columns are omitted entirely.
    """
    def _normalize(text: str) -> str:
        # Lowercase and drop everything but alphanumerics and spaces so the
        # fuzzy match ignores punctuation and case.
        return re.sub(r'[^a-zA-Z0-9 ]', '', text.lower())

    query_clean = _normalize(query)
    results: Dict[str, str] = {}
    for i, df in enumerate(tables):
        # Keep only columns whose name is similar enough to the query.
        relevant_cols = [
            col for col in df.columns
            if SequenceMatcher(None, query_clean, _normalize(str(col))).ratio() >= min_score
        ]
        if not relevant_cols:
            continue  # skip irrelevant tables
        # One "col=value, ..." group per row, rows separated by "; ".
        # (Fixes the original formatting, which emitted the first column's
        # value with a dangling "=" and no column name.)
        compact_str = "; ".join(
            ", ".join(f"{col}={row[col]}" for col in relevant_cols)
            for _, row in df[relevant_cols].dropna().head(3).iterrows()
        )
        results[f"table_{i}"] = compact_str
    return results
def add(a: int, b: int) -> int:
    """
    Return the sum of the two given numbers.
    Args:
        a: first number
        b: second number
    """
    total = a + b
    return total
def subtract(a: int, b: int) -> int:
    """
    Subtracts one value from another and returns the result of the subtraction.
    Args:
        a: first number
        b: second number
    """
    # Docstring previously (incorrectly) said "result of the sum".
    return a - b
def multiply(a: int, b: int) -> int:
    """
    Multiplies two values and returns the result of the multiplication.
    Args:
        a: first number
        b: second number
    """
    # Docstring previously (incorrectly) said "result of the sum".
    return a * b
def divide(a: int, b: int) -> float:
    """
    Divides one value by another and returns the result of the division.

    Args:
        a: numerator
        b: denominator
    Raises:
        ValueError: If the denominator is zero.
    """
    # Return annotation fixed: true division always produces a float.
    if b == 0:
        raise ValueError("Cannot divide by zero.")
    return a / b
def search_wikipedia(query: str, page_title: str, language: str) -> str:
    """
    This tool searches Wikipedia for a specific page and returns its text and any HTML tables it contains.
    The function is capable of retrieving the most relevant information given the original query.
    Args:
        query: The original question that prompted the use of the function.
        page_title: Title of the Wikipedia page.
        language: Language code (e.g., "en", "es", "fr").
    Returns:
        A string containing the page title, text, and any extracted tables in markdown format.
    """
    try:
        wiki_wiki = wikipediaapi.Wikipedia(
            user_agent='AIAgent (gabriel_abilleira@tutanota.com)',
            language=language,
            extract_format=wikipediaapi.ExtractFormat.HTML
        )
        page = wiki_wiki.page(page_title)
        if not page.exists():
            return f"Error: Page '{page_title}' not found in language '{language}'."
        # Use the URL to read tables. pd.read_html raises ValueError when the
        # page contains no <table> elements; keep that failure local so the
        # page summary is still returned instead of an error string.
        try:
            tables = pd.read_html(page.fullurl)
        except ValueError:
            tables = []
        markdown_tables = extract_relevant_table_info(query, tables, min_score=0.2)
        table_output = "\n".join(markdown_tables.values()) if markdown_tables else "No tables found on this page."
        return f"Text: {page.summary[:500]}\n\n{table_output}"
    except Exception as e:
        return f"Error retrieving Wikipedia content: {str(e)}"
def duckduckgo_search(query: str) -> str:
    """Use DuckDuckGo to search the web for up-to-date information.
    Args:
        query: The query to search for on the web. It may be a literal url (e.g. https://www.youtube.com/watch?v=7ybEg14CP1g)
    Returns:
        The best available answer text, or an error/fallback message.
    """
    url = "https://api.duckduckgo.com/"
    params = {
        "q": query,
        "format": "json",
        "no_redirect": 1,
        "no_html": 1,
        "skip_disambig": 1,
    }
    try:
        # Timeout added: the original call could hang indefinitely.
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        # Try the most useful fields, in order of usefulness.
        if data.get("AbstractText"):
            return data["AbstractText"]
        elif data.get("Answer"):
            return data["Answer"]
        elif data.get("RelatedTopics"):
            # Return some related results
            results = data["RelatedTopics"][:3]
            return "\n".join(rt.get("Text", "") for rt in results if "Text" in rt)
        else:
            return "No good results found."
    except Exception as e:
        return f"Search failed: {e}"
def search_papers(query: str) -> str:
    """Search for academic papers and retrieve their content when possible.

    Queries the Semantic Scholar API for up to 3 papers and, for arXiv
    links, attempts to download the PDF and extract text from the first
    three pages.

    Args:
        query: Free-text search terms for the paper lookup.
    Returns:
        A markdown-ish summary of the papers found, or an error message.
    """
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {
        "query": query,
        "limit": 3,
        "fields": "title,abstract,authors,url,year"
    }
    try:
        # Timeout added: the original calls could hang indefinitely.
        response = requests.get(url, params=params, timeout=15)
        data = response.json()
        if not data.get("data"):
            return "No papers found."
        results = []
        for paper in data["data"]:
            title = paper.get("title", "No title")
            authors = ", ".join([a.get("name", "") for a in paper.get("authors", [])])
            year = paper.get("year", "n.d.")
            abstract = paper.get("abstract", "No abstract available.")
            link = paper.get("url", "")
            full_text = "Full text not available."
            # Attempt to download and parse PDF (for arXiv)
            if "arxiv.org" in link:
                pdf_url = link.replace("abs", "pdf") + ".pdf"
                try:
                    pdf_response = requests.get(pdf_url, timeout=30)
                    doc = fitz.open(stream=pdf_response.content, filetype="pdf")
                    # Only first 3 pages — the original sliced doc[3:10],
                    # which skipped the first three pages instead.
                    full_text = "\n".join(page.get_text() for page in doc[:3])
                    doc.close()
                except Exception as pdf_err:
                    full_text = f"Failed to retrieve full text: {pdf_err}"
            result = f"""**{title}** ({year}) by {authors}
Abstract: {abstract}
Link: {link}
Full Text (first pages):\n{full_text}"""
            results.append(result)
        return "\n\n---\n\n".join(results)
    except Exception as e:
        return f"Error fetching papers: {e}"
def download_file(task_id: str) -> str:
    """
    Downloads a file associated with the given task ID into ./downloads.
    Returns the file path where the file is saved locally.

    Args:
        task_id: The task id to download attachment from.
    Returns:
        Local path of the saved file (``downloads/{task_id}.file``).
    Raises:
        requests.exceptions.RequestException: If the HTTP download fails.
    """
    # NOTE(review): DEFAULT_API_URL is not defined in this chunk — it must be
    # a module-level constant defined elsewhere in the file; confirm.
    file_url = f"{DEFAULT_API_URL}/files/{task_id}"
    local_file_path = f"downloads/{task_id}.file"
    print(f"Downloading file for task ID {task_id} from {file_url}...")
    try:
        response = requests.get(file_url, stream=True, timeout=15)
        response.raise_for_status()
        # `os` was used here without ever being imported at module level;
        # the import block now provides it.
        os.makedirs("downloads", exist_ok=True)
        with open(local_file_path, "wb") as file:
            # Stream in 8 KiB chunks to avoid loading large files in memory.
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"File downloaded successfully: {local_file_path}")
        return local_file_path
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file for task {task_id}: {e}")
        raise