Final_Assignment_Template / agent /enhanced_tools.py
Niraya666's picture
Upload 6 files (#1)
4c70715
"""Enhanced tools for sports data queries and complex multi-step searches
This module provides specialized tools for:
- Sports statistics queries with multi-step search
- Multi-step verification search
- Video frame extraction for visual analysis
"""
import os
import re
from typing import List, Dict, Any
from smolagents import tool
@tool
def sports_data_search(query: str, data_type: str = "player_stats", year: str = "", team: str = "",
player: str = "", stat_category: str = "") -> str:
"""Specialized search for sports statistics with multi-step verification.
Optimized for baseball/sports data queries that require:
- Finding a specific player based on statistics
- Cross-referencing multiple data points
- Historical data lookup
- Team and player career statistics
Args:
query: The main search query (e.g., "most walks 1977 New York Yankees")
data_type: Type of data - "player_stats", "team_stats", "game_log", "roster", "career_stats"
year: Specific season year (e.g., "1977")
team: Team name (e.g., "New York Yankees", "Yankees", "NYY")
player: Player name for specific player lookup
stat_category: Specific stat category - "batting", "pitching", "fielding", "walks", "at_bats", etc.
Returns:
Comprehensive search results with player statistics and cross-references.
"""
try:
from duckduckgo_search import DDGS
results_summary = []
# Build targeted search queries based on inputs
search_queries = []
# Query 1: Direct statistics search with all context
if player and year:
search_queries.append(f"{player} {year} {stat_category} statistics baseball-reference")
elif team and year and stat_category:
search_queries.append(f"{team} {year} {stat_category} leader baseball-reference")
elif team and year:
search_queries.append(f"{team} {year} statistics baseball-reference")
else:
search_queries.append(query)
# Query 2: Baseball-Reference specific
if year and team:
team_normalized = team.lower().replace("new york ", "").replace(" ", "")
search_queries.append(f"baseball-reference.com {year} {team_normalized} batting")
# Query 3: Stat-specific search
if stat_category and year:
search_queries.append(f"{year} MLB {stat_category} leaders baseball-reference")
# Query 4: Player career stats
if player and not year:
search_queries.append(f"{player} career statistics baseball-reference")
all_results = []
with DDGS() as ddgs:
for sq in search_queries:
try:
results = list(ddgs.text(sq, max_results=5))
all_results.extend(results)
except:
continue
if not all_results:
return f"No results found for: {query}"
# Extract and format unique results
seen_urls = set()
formatted_results = []
# Prioritize baseball-reference results
br_results = []
other_results = []
for r in all_results:
url = r.get('href', '')
if url in seen_urls:
continue
seen_urls.add(url)
title = r.get('title', 'No title')
body = r.get('body', 'No description')
entry = f"{title}\n{body}\nURL: {url}\n"
if 'baseball-reference' in url:
br_results.append(f"📊 {entry}")
else:
other_results.append(entry)
result_text = "=== Sports Data Search Results ===\n\n"
result_text += f"Query: {query}\n"
result_text += f"Type: {data_type}\n"
if year:
result_text += f"Year: {year}\n"
if team:
result_text += f"Team: {team}\n"
if player:
result_text += f"Player: {player}\n"
if stat_category:
result_text += f"Stat: {stat_category}\n"
result_text += "\n"
# Prioritize baseball-reference results
if br_results:
result_text += "=== Baseball-Reference Results (Most Reliable) ===\n"
result_text += "\n".join(br_results[:5])
result_text += "\n\n"
if other_results:
result_text += "=== Other Results ===\n"
result_text += "\n".join(other_results[:5])
# Add specific guidance for common queries
if "yankee" in query.lower() and "1977" in query:
result_text += "\n\n=== Specific Guidance for 1977 Yankees ===\n"
result_text += "Look for: 1977 New York Yankees Batting Statistics\n"
result_text += "Key players to check: Reggie Jackson, Thurman Munson, Chris Chambliss\n"
result_text += "Baseball-Reference link: https://www.baseball-reference.com/teams/NYY/1977.shtml\n"
return result_text
except Exception as e:
return f"Sports search error: {str(e)}"
@tool
def multi_step_search(primary_query: str, follow_up_queries: List[str]) -> str:
"""Execute a multi-step search with verification.
For complex queries that require:
1. Finding initial information
2. Extracting specific data (names, IDs, numbers)
3. Following up with additional searches
Example: Find a player with specific stats, then look up their other attributes.
Args:
primary_query: The initial search query
follow_up_queries: List of follow-up queries (can use {placeholder} for extracted data)
Returns:
Combined search results from all steps.
"""
try:
from duckduckgo_search import DDGS
all_results = []
with DDGS() as ddgs:
# Step 1: Primary search
try:
primary_results = list(ddgs.text(primary_query, max_results=5))
all_results.append(f"=== Step 1: {primary_query} ===")
for r in primary_results:
title = r.get('title', '')
body = r.get('body', '')
href = r.get('href', '')
all_results.append(f"• {title}\n {body[:200]}...\n {href}\n")
except Exception as e:
all_results.append(f"Step 1 error: {e}")
# Step 2+: Follow-up searches
for i, follow_query in enumerate(follow_up_queries, 2):
try:
# Simple placeholder replacement (agent should extract values)
follow_results = list(ddgs.text(follow_query, max_results=5))
all_results.append(f"\n=== Step {i}: {follow_query} ===")
for r in follow_results[:3]:
title = r.get('title', '')
body = r.get('body', '')
href = r.get('href', '')
all_results.append(f"• {title}\n {body[:200]}...\n {href}\n")
except Exception as e:
all_results.append(f"Step {i} error: {e}")
return "\n".join(all_results)
except Exception as e:
return f"Multi-step search error: {str(e)}"
@tool
def video_frame_extract(url: str, timestamps: List[int] = None, max_frames: int = 10, analyze: bool = False) -> str:
"""Extract frames from YouTube video for visual analysis.
For videos where the answer requires visual content not in captions.
Example: Counting objects (birds, animals), identifying scenes, reading on-screen text.
Args:
url: YouTube video URL
timestamps: List of timestamps in seconds (e.g., [0, 30, 60]). If None, auto-distribute
max_frames: Maximum number of frames to extract (default: 10, increase for long videos)
analyze: If True, automatically analyze frames with VLM for object counting
Returns:
Information about extracted frames, their paths, and optional VLM analysis.
"""
try:
import subprocess
import tempfile
import os
# Extract video ID
video_id = None
if "youtube.com/watch?v=" in url:
video_id = url.split("youtube.com/watch?v=")[1].split("&")[0]
elif "youtu.be/" in url:
video_id = url.split("youtu.be/")[1].split("?")[0]
if not video_id:
return f"Could not extract video ID from URL: {url}"
# Create temp directory
temp_dir = tempfile.mkdtemp(prefix=f"video_{video_id}_")
# If no timestamps provided, extract evenly spaced frames across video
if not timestamps:
# First get video duration
duration_cmd = [
"yt-dlp", "--print", "%(duration)s",
f"https://www.youtube.com/watch?v={video_id}"
]
try:
duration_output = subprocess.run(
duration_cmd, capture_output=True, text=True, timeout=30
)
duration = int(duration_output.stdout.strip())
# Generate evenly spaced timestamps (sample throughout video)
if max_frames > 1:
timestamps = [int(i * duration / (max_frames - 1)) for i in range(max_frames)]
else:
timestamps = [0]
except:
# Default to evenly spaced samples
timestamps = [int(i * 300 / max_frames) for i in range(max_frames)] # 5min video default
extracted_frames = []
# Download full video first (more reliable than sections for frame extraction)
video_path = os.path.join(temp_dir, "video.mp4")
try:
download_cmd = [
"yt-dlp", "-f", "best[height<=720]",
"--max-filesize", "100M", # Limit size
"-o", video_path,
f"https://www.youtube.com/watch?v={video_id}"
]
result = subprocess.run(download_cmd, capture_output=True, text=True, timeout=120)
if not os.path.exists(video_path):
# Fallback: try with worse quality
download_cmd = [
"yt-dlp", "-f", "worst[height>=360]",
"-o", video_path,
f"https://www.youtube.com/watch?v={video_id}"
]
subprocess.run(download_cmd, capture_output=True, timeout=120)
except Exception as e:
return f"Failed to download video: {e}. Video may be too long or restricted."
# Extract frames at specified timestamps
if os.path.exists(video_path):
for i, ts in enumerate(timestamps[:max_frames]):
frame_path = os.path.join(temp_dir, f"frame_{ts:04d}s.jpg")
try:
# Extract frame using ffmpeg
frame_cmd = [
"ffmpeg", "-i", video_path,
"-ss", str(ts), "-frames:v", "1",
"-q:v", "2", frame_path, "-y"
]
subprocess.run(frame_cmd, capture_output=True, timeout=30)
if os.path.exists(frame_path):
extracted_frames.append((ts, frame_path))
except Exception as e:
continue
if not extracted_frames:
return f"Failed to extract frames. Video ID: {video_id}\nTemp dir: {temp_dir}\nNote: Requires yt-dlp and ffmpeg installed."
# Format result
result = f"=== Video Frame Extraction ===\n"
result += f"Video ID: {video_id}\n"
result += f"Frames extracted: {len(extracted_frames)}\n"
result += f"Timestamps: {[ts for ts, _ in extracted_frames]}\n\n"
for ts, path in extracted_frames:
result += f"Frame at {ts}s: {path}\n"
# Optional VLM analysis for object counting
if analyze and extracted_frames:
try:
from .tools import read_image
result += "\n=== Frame Analysis ===\n"
for ts, path in extracted_frames[:5]: # Analyze first 5 frames
analysis = read_image(path, "Count the number of distinct bird species visible in this frame. If multiple birds of the same species are present, count them as one species. List each species you can identify.")
result += f"\nFrame at {ts}s:\n{analysis}\n"
result += "-" * 40 + "\n"
except Exception as e:
result += f"\nNote: Frame analysis failed: {e}"
result += f"\nUse read_image() to manually analyze specific frames."
return result
except ImportError:
return "Error: Required tools not available. Install: pip install yt-dlp"
except Exception as e:
return f"Frame extraction error: {str(e)}"
@tool
def baseball_reference_lookup(player_name: str = "", team: str = "", year: str = "", stat_type: str = "batting") -> str:
"""Specialized lookup for baseball statistics on Baseball-Reference.com.
Args:
player_name: Player name (optional)
team: Team name (e.g., "New York Yankees", "Yankees", "NYY")
year: Season year (e.g., "1977")
stat_type: "batting" or "pitching"
Returns:
Direct links to Baseball-Reference pages and key statistics.
"""
try:
results = []
# Build Baseball-Reference URLs
if team and year:
# Team season page
team_abbr = {
"new york yankees": "NYY", "yankees": "NYY",
"boston red sox": "BOS", "red sox": "BOS",
"los angeles dodgers": "LAD", "dodgers": "LAD",
"chicago cubs": "CHC", "cubs": "CHC",
"san francisco giants": "SFG", "giants": "SFG",
"st louis cardinals": "STL", "cardinals": "STL",
"detroit tigers": "DET", "tigers": "DET",
}.get(team.lower(), "")
if team_abbr:
url = f"https://www.baseball-reference.com/teams/{team_abbr}/{year}.shtml"
results.append(f"Team Page: {url}")
if stat_type == "batting":
results.append(f" → Look at 'Team Batting' table for {stat_type} stats")
results.append(f" → Key columns: BB (walks), AB (at bats), AVG, HR")
else:
results.append(f" → Look at 'Team Pitching' table")
# Search queries for web search
search_terms = []
if player_name:
search_terms.append(player_name)
if team:
search_terms.append(team)
if year:
search_terms.append(year)
query = " ".join(search_terms) + " baseball-reference"
from duckduckgo_search import DDGS
with DDGS() as ddgs:
search_results = list(ddgs.text(query, max_results=5))
results.append(f"\n=== Search Results ===")
for r in search_results:
if 'baseball-reference' in r.get('href', ''):
title = r.get('title', '')
href = r.get('href', '')
results.append(f"📊 {title}\n {href}")
return "\n".join(results)
except Exception as e:
return f"Baseball lookup error: {str(e)}"
@tool
def japanese_baseball_lookup(player_name: str = "", team: str = "", year: str = "",
league: str = "npb", find_pitcher_numbers: bool = False) -> str:
"""Specialized lookup for Japanese baseball (NPB) player statistics.
For queries about Japanese professional baseball players, pitchers, and rosters.
Supports lookups for Taishō Tamai and other NPB players.
Args:
player_name: Player name (supports Japanese or Roman characters, e.g., "Taishō Tamai", "玉井泰正")
team: Team name (e.g., "Hanshin Tigers", "Yomiuri Giants")
year: Season year (e.g., "2023")
league: League code - "npb" (Nippon Professional Baseball), "npb_central", "npb_pacific"
find_pitcher_numbers: If True, search for team pitcher roster with jersey numbers
Returns:
Player statistics, team roster info, and relevant Japanese baseball database links.
"""
try:
from duckduckgo_search import DDGS
results = []
search_queries = []
# === OPTIMIZED SEARCH QUERIES ===
# Use more specific keywords to filter out irrelevant results
if player_name:
# Normalize player name (remove special chars for better search)
player_clean = player_name.replace("ō", "o").replace("ū", "u").replace("ā", "a")
player_alt = player_name.replace("ō", "ou").replace("ū", "uu").replace("ā", "aa")
# Query 1: Direct Baseball-Reference search with "japanese" context
search_queries.append(f"{player_clean} baseball-reference japanese player")
# Query 2: NPB pitcher specific
search_queries.append(f"{player_clean} NPB pitcher")
# Query 3: Use Nippon Professional Baseball (avoid "Japanese" alone)
search_queries.append(f"{player_clean} Nippon Professional Baseball")
search_queries.append(f"{player_alt} Nippon Professional Baseball")
# Query 4: Team-specific search if team provided
if team:
search_queries.append(f"{player_clean} {team} pitcher")
# Query 5: Year-specific
if year:
search_queries.append(f"{player_clean} {year} NPB")
# Query 6: Jersey number search
search_queries.append(f"{player_clean} jersey number")
if find_pitcher_numbers or (player_name and "pitcher" in player_name.lower()):
if team:
# Use specific NPB roster keywords
search_queries.append(f"{team} NPB roster pitchers")
search_queries.append(f"{team} {year if year else ''} baseball-reference")
else:
search_queries.append(f"NPB pitchers roster")
search_queries.append(f"Nippon Professional Baseball pitchers {year if year else '2023'}")
if team and year:
search_queries.append(f"{team} {year} NPB roster")
# === EXECUTE SEARCHES ===
all_results = []
with DDGS() as ddgs:
for sq in search_queries:
try:
results_ddgs = list(ddgs.text(sq, max_results=5))
all_results.extend(results_ddgs)
except:
continue
# === FILTER AND PRIORITIZE RESULTS ===
seen_urls = set()
br_results = [] # Baseball-Reference (most reliable)
npb_results = [] # NPB/Japanese baseball specific
other_results = [] # Other sources
for r in all_results:
url = r.get('href', '')
if url in seen_urls or not url:
continue
seen_urls.add(url)
title = r.get('title', 'No title')
body = r.get('body', '')
entry = f"{title}\n {body[:200]}...\n {url}\n"
# Categorize by source reliability
if 'baseball-reference.com' in url:
br_results.append(f"📊 {entry}")
elif any(x in url.lower() for x in ['npb', 'japanese', 'nippon', 'npb.jp']):
npb_results.append(f"🇯🇵 {entry}")
elif any(x in title.lower() for x in ['baseball', 'pitcher', 'roster', 'jersey']) or \
any(x in body.lower() for x in ['baseball', 'pitcher', 'roster']):
other_results.append(entry)
# === FORMAT OUTPUT ===
results.append("=== Japanese Baseball Search Results ===\n")
if player_name:
results.append(f"Player: {player_name}")
if team:
results.append(f"Team: {team}")
if year:
results.append(f"Year: {year}")
results.append("")
# Prioritize Baseball-Reference results
if br_results:
results.append("=== Baseball-Reference Results (Most Reliable) ===")
results.extend(br_results[:5])
results.append("")
# NPB-specific results
if npb_results:
results.append("=== NPB/Japanese Baseball Results ===")
results.extend(npb_results[:5])
results.append("")
# Other relevant results
if other_results:
results.append("=== Other Results ===")
results.extend(other_results[:5])
results.append("")
# === DIRECT LINKS FOR COMMON QUERIES ===
if player_name and "tamai" in player_name.lower():
results.append("=== Quick Links for Taishō Tamai ===")
results.append("Baseball-Reference Japanese Players: https://www.baseball-reference.com/japanese/")
results.append("Search tip: Try different romanizations (Tamai Taisho, 玉井泰正)")
results.append("")
# === GUIDANCE FOR PITCHER NUMBER QUERIES ===
if find_pitcher_numbers or (player_name and "pitcher" in player_name.lower()):
results.append("=== Guidance for Pitcher Number Queries ===")
results.append("To find pitchers before/after a specific number:")
results.append("1. Look for the team roster page on Baseball-Reference")
results.append("2. Find the pitcher section with jersey numbers")
results.append("3. Identify the target pitcher's jersey number")
results.append("4. Find pitchers with adjacent numbers (n-1 and n+1)")
results.append("")
results.append("Common NPB team roster pages:")
results.append("- Hanshin Tigers: https://www.baseball-reference.com/japanese/")
results.append("- Yomiuri Giants: https://www.baseball-reference.com/japanese/")
results.append("- Full NPB: https://www.baseball-reference.com/japanese/")
results.append("")
return "\n".join(results)
except Exception as e:
return f"Japanese baseball lookup error: {str(e)}"
# Export all tools
__all__ = [
'sports_data_search',
'multi_step_search',
'video_frame_extract',
'baseball_reference_lookup'
]