WeByT3 committed on
Commit
5532431
·
verified ·
1 Parent(s): 02acb02

Update tools.py

Browse files
Files changed (1) hide show
  1. tools.py +40 -9
tools.py CHANGED
@@ -5,6 +5,40 @@ import requests
5
  import fitz # PyMuPDF
6
  import io
7
  from urllib.parse import urlparse
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  @tool
@@ -54,11 +88,13 @@ def divide(a: int, b: int) -> int:
54
  return a / b
55
 
56
  @tool
57
- def search_wikipedia(page_title: str, language: str) -> str:
58
  """
59
  This tool searches Wikipedia for a specific page and returns its text and any HTML tables it contains.
 
60
 
61
  Args:
 
62
  page_title: Title of the Wikipedia page.
63
  language: Language code (e.g., "en", "es", "fr").
64
 
@@ -79,16 +115,11 @@ def search_wikipedia(page_title: str, language: str) -> str:
79
 
80
  # Use the URL to read tables
81
  tables = pd.read_html(page.fullurl)
82
- markdown_tables = []
83
-
84
- for i, table in enumerate(tables):
85
- if isinstance(table, pd.DataFrame):
86
- markdown = table.iloc[:20, :3].to_markdown(index=False)
87
- markdown_tables.append(f"\n---\n**Table {i + 1}:**\n{markdown}")
88
 
89
- table_output = "\n".join(markdown_tables) if markdown_tables else "No tables found on this page."
90
 
91
- return f"Text: {page.summary[:75]}\n\n{table_output}"
92
 
93
  except Exception as e:
94
  return f"Error retrieving Wikipedia content: {str(e)}"
 
5
  import fitz # PyMuPDF
6
  import io
7
  from urllib.parse import urlparse
8
+ from typing import List, Dict
9
+ import pandas as pd
10
+ import re
11
+ from difflib import SequenceMatcher
12
+
13
def clean(text: str) -> str:
    """Lowercase *text* and strip every character that is not an ASCII letter, digit, or space."""
    return re.sub(r'[^a-zA-Z0-9 ]', '', text.lower())


def extract_relevant_table_info(query: str, tables: List[pd.DataFrame], min_score: float = 0.2) -> Dict[str, str]:
    """Select columns relevant to *query* from each table and render a few sample rows.

    Args:
        query: Free-text question used to score column-name relevance.
        tables: DataFrames, e.g. the list returned by ``pd.read_html``.
        min_score: Minimum ``difflib.SequenceMatcher`` ratio between the cleaned
            query and a cleaned column name for that column to count as relevant.

    Returns:
        Mapping of ``"table_<i>"`` (index into *tables*) to a compact
        ``col=value`` string built from the first three fully non-null rows of
        that table's relevant columns. Tables with no relevant column, or whose
        relevant rows are all dropped by ``dropna()``, are omitted.
    """
    query_clean = clean(query)
    results: Dict[str, str] = {}

    for i, df in enumerate(tables):
        # Score every column name against the cleaned query; keep those above threshold.
        relevant_cols = [
            col for col in df.columns
            if SequenceMatcher(None, query_clean, clean(str(col))).ratio() >= min_score
        ]
        if not relevant_cols:
            continue  # nothing in this table relates to the query

        # BUG FIX: the original concatenated the first column's raw value with a
        # dangling "=" and no key (a single relevant column rendered each row as
        # "value=", two columns as "v1=col2=v2"). Render EVERY relevant column
        # uniformly as "col=value" pairs instead.
        compact_str = ", ".join(
            ", ".join(f"{col}={row[col]}" for col in relevant_cols)
            for _, row in df[relevant_cols].dropna().head(3).iterrows()
        )
        if compact_str:  # don't emit an empty entry when dropna() removed every row
            results[f"table_{i}"] = compact_str

    return results
41
+
42
 
43
 
44
  @tool
 
88
  return a / b
89
 
90
  @tool
91
+ def search_wikipedia(query: str, page_title: str, language: str) -> str:
92
  """
93
  This tool searches Wikipedia for a specific page and returns its text and any HTML tables it contains.
94
+ The function is capable of retrieving the most relevant information given the original query.
95
 
96
  Args:
97
+ query: The original question that prompted the use of the function.
98
  page_title: Title of the Wikipedia page.
99
  language: Language code (e.g., "en", "es", "fr").
100
 
 
115
 
116
  # Use the URL to read tables
117
  tables = pd.read_html(page.fullurl)
118
+ markdown_tables = extract_relevant_table_info(query, tables, min_score = 0.2)
 
 
 
 
 
119
 
120
+ table_output = "\n".join(list(markdown_tables.values())) if markdown_tables else "No tables found on this page."
121
 
122
+ return f"Text: {page.summary[:500]}\n\n{table_output}"
123
 
124
  except Exception as e:
125
  return f"Error retrieving Wikipedia content: {str(e)}"