Spaces:
Sleeping
Sleeping
Update tools.py
Browse files
tools.py
CHANGED
|
@@ -5,6 +5,40 @@ import requests
|
|
| 5 |
import fitz # PyMuPDF
|
| 6 |
import io
|
| 7 |
from urllib.parse import urlparse
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
@tool
|
|
@@ -54,11 +88,13 @@ def divide(a: int, b: int) -> int:
|
|
| 54 |
return a / b
|
| 55 |
|
| 56 |
@tool
|
| 57 |
-
def search_wikipedia(page_title: str, language: str) -> str:
|
| 58 |
"""
|
| 59 |
This tool searches Wikipedia for a specific page and returns its text and any HTML tables it contains.
|
|
|
|
| 60 |
|
| 61 |
Args:
|
|
|
|
| 62 |
page_title: Title of the Wikipedia page.
|
| 63 |
language: Language code (e.g., "en", "es", "fr").
|
| 64 |
|
|
@@ -79,16 +115,11 @@ def search_wikipedia(page_title: str, language: str) -> str:
|
|
| 79 |
|
| 80 |
# Use the URL to read tables
|
| 81 |
tables = pd.read_html(page.fullurl)
|
| 82 |
-
markdown_tables =
|
| 83 |
-
|
| 84 |
-
for i, table in enumerate(tables):
|
| 85 |
-
if isinstance(table, pd.DataFrame):
|
| 86 |
-
markdown = table.iloc[:20, :3].to_markdown(index=False)
|
| 87 |
-
markdown_tables.append(f"\n---\n**Table {i + 1}:**\n{markdown}")
|
| 88 |
|
| 89 |
-
table_output = "\n".join(markdown_tables) if markdown_tables else "No tables found on this page."
|
| 90 |
|
| 91 |
-
return f"Text: {page.summary[:
|
| 92 |
|
| 93 |
except Exception as e:
|
| 94 |
return f"Error retrieving Wikipedia content: {str(e)}"
|
|
|
|
| 5 |
import fitz # PyMuPDF
|
| 6 |
import io
|
| 7 |
from urllib.parse import urlparse
|
| 8 |
+
from typing import List, Dict
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import re
|
| 11 |
+
from difflib import SequenceMatcher
|
| 12 |
+
|
| 13 |
+
def clean(text):
    """Normalize *text* for fuzzy comparison: lowercase it and drop every
    character that is not an ASCII letter, digit, or space."""
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z0-9 ]', '', lowered)
|
| 15 |
+
|
| 16 |
+
def extract_relevant_table_info(query: str, tables: List[pd.DataFrame], min_score: float = 0.2) -> Dict[str, str]:
    """Select table columns whose names fuzzily match *query* and compact the rows.

    Each DataFrame's column names are scored against the query with
    ``difflib.SequenceMatcher``; columns scoring below ``min_score`` are
    discarded, and tables with no surviving column are skipped entirely.

    Args:
        query: The free-text question used to score column relevance.
        tables: DataFrames to scan (e.g. the result of ``pd.read_html``).
        min_score: Minimum similarity ratio (0..1) for a column to be kept.

    Returns:
        Mapping of ``"table_<i>"`` (index into *tables*) to a compact string of
        ``col=value`` pairs taken from up to the first 3 fully-populated rows,
        with rows separated by "; ".
    """
    query_clean = clean(query)
    results: Dict[str, str] = {}

    for i, df in enumerate(tables):
        # Fuzzy-score each column header against the normalized query.
        relevant_cols = [
            col for col in df.columns
            if SequenceMatcher(None, query_clean, clean(str(col))).ratio() >= min_score
        ]
        if not relevant_cols:
            continue  # no column looks related to the query; skip this table

        # BUG FIX: the original emitted the first column's VALUE followed by a
        # dangling "=" (e.g. "Paris=, Pop=2M", or just "Paris=" when a single
        # column matched). Emit uniform "col=value" pairs for every relevant
        # column instead, and separate rows with "; " so row boundaries are
        # visible in the compacted string.
        rows = df[relevant_cols].dropna().head(3)
        compact = "; ".join(
            ", ".join(f"{col}={row[col]}" for col in relevant_cols)
            for _, row in rows.iterrows()
        )
        if compact:  # dropna() may leave no rows; don't record empty entries
            results[f"table_{i}"] = compact

    return results
|
| 41 |
+
|
| 42 |
|
| 43 |
|
| 44 |
@tool
|
|
|
|
| 88 |
return a / b
|
| 89 |
|
| 90 |
@tool
|
| 91 |
+
def search_wikipedia(query: str, page_title: str, language: str) -> str:
|
| 92 |
"""
|
| 93 |
This tool searches Wikipedia for a specific page and returns its text and any HTML tables it contains.
|
| 94 |
+
The function is capable of retrieving the most relevant information given the original query.
|
| 95 |
|
| 96 |
Args:
|
| 97 |
+
query: The original question that prompted the use of the function.
|
| 98 |
page_title: Title of the Wikipedia page.
|
| 99 |
language: Language code (e.g., "en", "es", "fr").
|
| 100 |
|
|
|
|
| 115 |
|
| 116 |
# Use the URL to read tables
|
| 117 |
tables = pd.read_html(page.fullurl)
|
| 118 |
+
markdown_tables = extract_relevant_table_info(query, tables, min_score = 0.2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
+
table_output = "\n".join(list(markdown_tables.values())) if markdown_tables else "No tables found on this page."
|
| 121 |
|
| 122 |
+
return f"Text: {page.summary[:500]}\n\n{table_output}"
|
| 123 |
|
| 124 |
except Exception as e:
|
| 125 |
return f"Error retrieving Wikipedia content: {str(e)}"
|