WeByT3 committed on
Commit
5532431
·
verified ·
1 Parent(s): 02acb02

Update tools.py

Browse files
Files changed (1) hide show
  1. tools.py +40 -9
tools.py CHANGED
@@ -5,6 +5,40 @@ import requests
5
  import fitz # PyMuPDF
6
  import io
7
  from urllib.parse import urlparse
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  @tool
@@ -54,11 +88,13 @@ def divide(a: int, b: int) -> int:
54
  return a / b
55
 
56
  @tool
57
- def search_wikipedia(page_title: str, language: str) -> str:
58
  """
59
  This tool searches Wikipedia for a specific page and returns its text and any HTML tables it contains.
 
60
 
61
  Args:
 
62
  page_title: Title of the Wikipedia page.
63
  language: Language code (e.g., "en", "es", "fr").
64
 
@@ -79,16 +115,11 @@ def search_wikipedia(page_title: str, language: str) -> str:
79
 
80
  # Use the URL to read tables
81
  tables = pd.read_html(page.fullurl)
82
- markdown_tables = []
83
-
84
- for i, table in enumerate(tables):
85
- if isinstance(table, pd.DataFrame):
86
- markdown = table.iloc[:20, :3].to_markdown(index=False)
87
- markdown_tables.append(f"\n---\n**Table {i + 1}:**\n{markdown}")
88
 
89
- table_output = "\n".join(markdown_tables) if markdown_tables else "No tables found on this page."
90
 
91
- return f"Text: {page.summary[:75]}\n\n{table_output}"
92
 
93
  except Exception as e:
94
  return f"Error retrieving Wikipedia content: {str(e)}"
 
5
  import fitz # PyMuPDF
6
  import io
7
  from urllib.parse import urlparse
8
+ from typing import List, Dict
9
+ import pandas as pd
10
+ import re
11
+ from difflib import SequenceMatcher
12
+
13
def clean(text: str) -> str:
    """Lowercase *text* and strip every character that is not an ASCII letter, digit, or space."""
    return re.sub(r'[^a-zA-Z0-9 ]', '', text.lower())


def extract_relevant_table_info(query: str, tables: List[pd.DataFrame], min_score: float = 0.2) -> Dict[str, str]:
    """Select columns relevant to *query* from each table and render a few sample rows.

    Args:
        query: Free-text question used to score column-name relevance.
        tables: DataFrames, e.g. the list returned by ``pd.read_html``.
        min_score: Minimum ``difflib.SequenceMatcher`` ratio between the cleaned
            query and a cleaned column name for that column to count as relevant.

    Returns:
        Mapping of ``"table_<i>"`` (index into *tables*) to a compact
        ``col=value`` string built from the first three fully non-null rows of
        that table's relevant columns. Tables with no relevant column, or whose
        relevant rows are all dropped by ``dropna()``, are omitted.
    """
    query_clean = clean(query)
    results: Dict[str, str] = {}

    for i, df in enumerate(tables):
        # Score every column name against the cleaned query; keep those above threshold.
        relevant_cols = [
            col for col in df.columns
            if SequenceMatcher(None, query_clean, clean(str(col))).ratio() >= min_score
        ]
        if not relevant_cols:
            continue  # nothing in this table relates to the query

        # BUG FIX: the original concatenated the first column's raw value with a
        # dangling "=" and no key (a single relevant column rendered each row as
        # "value=", two columns as "v1=col2=v2"). Render EVERY relevant column
        # uniformly as "col=value" pairs instead.
        compact_str = ", ".join(
            ", ".join(f"{col}={row[col]}" for col in relevant_cols)
            for _, row in df[relevant_cols].dropna().head(3).iterrows()
        )
        if compact_str:  # don't emit an empty entry when dropna() removed every row
            results[f"table_{i}"] = compact_str

    return results
41
+
42
 
43
 
44
  @tool
 
88
  return a / b
89
 
90
  @tool
91
+ def search_wikipedia(query: str, page_title: str, language: str) -> str:
92
  """
93
  This tool searches Wikipedia for a specific page and returns its text and any HTML tables it contains.
94
+ The function is capable of retrieving the most relevant information given the original query.
95
 
96
  Args:
97
+ query: The original question that prompted the use of the function.
98
  page_title: Title of the Wikipedia page.
99
  language: Language code (e.g., "en", "es", "fr").
100
 
 
115
 
116
  # Use the URL to read tables
117
  tables = pd.read_html(page.fullurl)
118
+ markdown_tables = extract_relevant_table_info(query, tables, min_score = 0.2)
 
 
 
 
 
119
 
120
+ table_output = "\n".join(list(markdown_tables.values())) if markdown_tables else "No tables found on this page."
121
 
122
+ return f"Text: {page.summary[:500]}\n\n{table_output}"
123
 
124
  except Exception as e:
125
  return f"Error retrieving Wikipedia content: {str(e)}"