Spaces:

smolagents
/

ml-agent

Running

App Files Files Community

ml-agent / agent /tools /github_find_examples.py

akseljoonas HF Staff

Initial commit: ML Agent with Xet storage for binaries

8cfacd3 3 days ago

raw

history blame contribute delete

17.6 kB

	"""
	GitHub Find Examples Tool - Discover examples, tutorials, and guides for any library

	Lists all files in a repository, keeps those whose paths look like example
	material, and ranks them with deterministic fuzzy keyword matching.
	"""

	import os
	from typing import Any, Dict, List

	import requests
	from thefuzz import fuzz

	from agent.tools.types import ToolResult

	# Path-component names that mark a file as example material.
	#
	# Order matters: _get_pattern_priority uses a pattern's index in this list as
	# its sort priority (lower index = higher priority), and
	# _score_against_example_patterns fuzzy-matches every entry against each path.
	EXAMPLE_PATTERNS = [
	# Script directories (index 0, i.e. the highest pattern priority)
	"scripts",
	# General example patterns (catch-all, lower priority)
	"examples",
	"example",
	# Notebook patterns
	"notebooks",
	"notebook",
	# Tutorial/learning patterns
	"tutorials",
	"tutorial",
	"quickstart",
	"walkthroughs",
	"walkthrough",
	# Cookbook/recipe patterns
	"cookbook",
	"cookbooks",
	"recipes",
	"recipe",
	# Demo/sample patterns
	"demos",
	"demo",
	"samples",
	"sample",
	# Other patterns (spelling variants are listed separately because
	# _get_pattern_priority compares path components for exact equality)
	"guides",
	"guide",
	"getting-started",
	"getting_started",
	"playground",
	"howto",
	"how-to",
	"use-cases",
	"usecases",
	"use_cases",
	"sandbox",
	"showcase",
	]


	def _get_repo_tree(org: str, repo: str, token: str) -> tuple[List[Dict[str, Any]], str]:
	    """Fetch every file (blob) on a repository's default branch.

	    Two GitHub REST calls are made: one to look up the default branch and one
	    to fetch that branch's git tree with ``recursive=1``.

	    Args:
	        org: Owner (organization or user) of the repository.
	        repo: Repository name.
	        token: GitHub token, sent as a Bearer credential.

	    Returns:
	        ``(files, error_message)``. On success ``error_message`` is ``""`` and
	        each file is a dict with ``path``, ``ref`` (the blob SHA), ``size`` and
	        ``url`` (a github.com blob link). On failure ``files`` is empty and
	        ``error_message`` is ``"not_found"`` for a 404 on the repository
	        lookup, or a human-readable description otherwise. Exceptions are
	        never propagated; they are folded into ``error_message``.
	    """
	    # Local import: only this function needs it.
	    from urllib.parse import quote

	    headers = {
	        "Accept": "application/vnd.github+json",
	        "X-GitHub-Api-Version": "2022-11-28",
	        "Authorization": f"Bearer {token}",
	    }

	    full_repo = f"{org}/{repo}"

	    # Get default branch
	    try:
	        response = requests.get(
	            f"https://api.github.com/repos/{full_repo}", headers=headers, timeout=10
	        )
	        if response.status_code == 404:
	            return [], "not_found"
	        if response.status_code != 200:
	            return [], f"API error: {response.status_code}"

	        # A missing *or null* default_branch falls back to "main" so the
	        # tree URL below never contains the literal text "None".
	        default_branch = response.json().get("default_branch") or "main"
	    except Exception as e:
	        return [], f"Error fetching repo: {str(e)}"

	    # Get repository tree recursively
	    try:
	        response = requests.get(
	            f"https://api.github.com/repos/{full_repo}/git/trees/{default_branch}",
	            headers=headers,
	            params={"recursive": "1"},
	            timeout=30,
	        )
	        if response.status_code != 200:
	            return [], f"Error fetching tree: {response.status_code}"

	        # NOTE(review): for very large repositories GitHub may mark the
	        # response as truncated; that flag is not inspected here, so the
	        # listing can be incomplete -- confirm whether that matters.
	        tree = response.json().get("tree", [])

	        # Percent-encode branch and path (keeping "/") so that names with
	        # spaces, "#", "?" etc. still produce a usable link.
	        blob_base = f"https://github.com/{full_repo}/blob/{quote(default_branch)}"

	        # Filter to only include files (not directories)
	        files = [
	            {
	                "path": item["path"],
	                "ref": item["sha"],
	                "size": item.get("size", 0),
	                "url": f"{blob_base}/{quote(item['path'])}",
	            }
	            for item in tree
	            if item["type"] == "blob"
	        ]

	        return files, ""
	    except Exception as e:
	        return [], f"Error processing tree: {str(e)}"


	def _search_similar_repos(org: str, repo: str, token: str) -> List[Dict[str, Any]]:
	    """Look up repositories owned by ``org`` whose name resembles ``repo``.

	    Queries the GitHub repository search endpoint for ``org:<org> <repo>``,
	    asking for at most 10 hits ordered by stars (descending).

	    Returns:
	        A list of dicts with ``name``, ``full_name``, ``description``,
	        ``stars`` and ``url``. This is best-effort: any non-200 response or
	        exception yields an empty list.
	    """
	    request_headers = {
	        "Accept": "application/vnd.github+json",
	        "X-GitHub-Api-Version": "2022-11-28",
	        "Authorization": f"Bearer {token}",
	    }
	    search_params = {
	        "q": f"org:{org} {repo}",
	        "sort": "stars",
	        "order": "desc",
	        "per_page": 10,
	    }

	    try:
	        reply = requests.get(
	            "https://api.github.com/search/repositories",
	            headers=request_headers,
	            params=search_params,
	            timeout=30,
	        )
	        if reply.status_code != 200:
	            return []

	        suggestions = []
	        for hit in reply.json().get("items", []):
	            suggestions.append(
	                {
	                    "name": hit.get("name"),
	                    "full_name": hit.get("full_name"),
	                    "description": hit.get("description"),
	                    "stars": hit.get("stargazers_count", 0),
	                    "url": hit.get("html_url"),
	                }
	            )
	        return suggestions
	    except Exception:
	        # Suggestions are optional; a failed search simply means none are shown.
	        return []


	def _score_against_example_patterns(file_path: str) -> int:
	    """Return the best fuzzy score of ``file_path`` against any example pattern.

	    Every entry of EXAMPLE_PATTERNS is compared with the lowercased path via
	    ``fuzz.token_set_ratio``; the highest score wins, or 0 if the pattern
	    list is empty.
	    """
	    lowered_path = file_path.lower()
	    return max(
	        (
	            fuzz.token_set_ratio(candidate.lower(), lowered_path)
	            for candidate in EXAMPLE_PATTERNS
	        ),
	        default=0,
	    )


	def _score_against_keyword(file_path: str, keyword: str) -> int:
	    """Return how well ``keyword`` fuzzy-matches ``file_path``.

	    Both strings are lowercased, then scored two ways and the larger score is
	    returned:
	    - ``fuzz.partial_ratio`` for substring-style matches inside the path
	    - ``fuzz.token_set_ratio`` for word-level matches
	    """
	    needle = keyword.lower()
	    haystack = file_path.lower()
	    return max(
	        fuzz.partial_ratio(needle, haystack),
	        fuzz.token_set_ratio(needle, haystack),
	    )


	def _get_pattern_priority(file_path: str) -> tuple[int, int, int]:
	    """
	    Build the sort key used to rank an example file when no keyword is given.

	    Returns: (in_examples_dir, pattern_priority, path_depth)
	    - in_examples_dir: 0 when the first path segment is "examples" or
	      "example", 1 otherwise (lower is better)
	    - pattern_priority: index in EXAMPLE_PATTERNS of the deepest path segment
	      that equals a pattern (lower is better), or 999 if no segment matches
	    - path_depth: number of path segments (lower is better)

	    E.g., "examples/scripts/train.py" ranks ahead of "scripts/util.py" because
	    its first segment is "examples".
	    """
	    segments = file_path.lower().split("/")

	    top_level_rank = 1
	    if segments[0] in ("examples", "example"):
	        top_level_rank = 0

	    # Walk from the deepest segment upwards: the first segment that is a known
	    # pattern is the most specific match, and its list index is the priority.
	    pattern_rank = 999
	    for segment in reversed(segments):
	        if segment in EXAMPLE_PATTERNS:
	            pattern_rank = EXAMPLE_PATTERNS.index(segment)
	            break

	    return (top_level_rank, pattern_rank, len(segments))


	def _handle_repo_tree_errors(
	    all_files: List[Dict[str, Any]],
	    error: str,
	    org: str,
	    repo: str,
	    token: str,
	) -> ToolResult | None:
	    """Turn the outcome of a repository tree fetch into an early-exit result.

	    Args:
	        all_files: Files returned by the tree fetch (may be empty).
	        error: Error string from the tree fetch; ``""`` means success and
	            ``"not_found"`` means the repository lookup returned 404.
	        org: Repository owner, used in messages and the similar-repo search.
	        repo: Repository name, used in messages and the similar-repo search.
	        token: GitHub token forwarded to the similar-repo search.

	    Returns:
	        A ToolResult describing the problem, or None when the fetch succeeded
	        and produced at least one file. The empty-repository result is the
	        only one returned without ``isError``.
	    """
	    if error == "not_found":
	        similar_repos = _search_similar_repos(org, repo, token)

	        if not similar_repos:
	            return {
	                "formatted": f"Repository '{org}/{repo}' not found and no similar repositories found.",
	                "totalResults": 0,
	                "resultsShared": 0,
	                "isError": True,
	            }

	        # Format similar repos
	        lines = [f"Repository '{org}/{repo}' not found. Similar repositories:\n"]
	        for i, r in enumerate(similar_repos, 1):
	            lines.append(f"{i}. {r['full_name']} (⭐ {r['stars']:,} stars)")
	            description = r["description"]
	            if description:
	                # Keep suggestions compact: cap descriptions at 100 characters.
	                desc = (
	                    description[:100] + "..."
	                    if len(description) > 100
	                    else description
	                )
	                lines.append(f" {desc}")
	            lines.append(f" {r['url']}\n")

	        return {
	            "formatted": "\n".join(lines),
	            "totalResults": len(similar_repos),
	            "resultsShared": len(similar_repos),
	            "isError": True,
	        }

	    if error:
	        return {
	            "formatted": f"Error accessing repository '{org}/{repo}': {error}",
	            "totalResults": 0,
	            "resultsShared": 0,
	            "isError": True,
	        }

	    if not all_files:
	        return {
	            "formatted": f"No files found in repository '{org}/{repo}'",
	            "totalResults": 0,
	            "resultsShared": 0,
	        }

	    return None


	def find_examples(
	    keyword: str = "",
	    repo: str = "",
	    org: str = "huggingface",
	    max_results: int = 10,
	    min_score: int = 80,
	) -> ToolResult:
	    """
	    Find example files in a repository using fuzzy matching.

	    Files are first filtered to those whose path fuzzy-matches an example
	    pattern (score >= 60). With a keyword, the survivors are then filtered by
	    ``min_score`` and ordered by keyword score (best first). Without one, they
	    are ordered by: top-level examples/ directory, pattern priority, path
	    depth, then path.

	    Args:
	        keyword: Keyword to fuzzy match against file paths (e.g., "grpo")
	        repo: Repository name (e.g., "trl"); blank values are rejected
	        org: GitHub organization (default: "huggingface")
	        max_results: Maximum number of results (default 10; negative values
	            are treated as 0)
	        min_score: Minimum fuzzy match score (0-100, default 80)

	    Returns:
	        ToolResult with matching files, or similar repos if repo not found.
	        ``totalResults`` counts every matching file; ``resultsShared`` counts
	        the files actually listed in ``formatted``.
	    """
	    token = os.environ.get("GITHUB_TOKEN")
	    if not token:
	        return {
	            "formatted": "Error: GITHUB_TOKEN environment variable is required",
	            "totalResults": 0,
	            "resultsShared": 0,
	            "isError": True,
	        }

	    # Treat None / whitespace-only names as missing rather than sending them
	    # to the GitHub API.
	    repo = (repo or "").strip()
	    if not repo:
	        return {
	            "formatted": "Error: repo parameter is required",
	            "totalResults": 0,
	            "resultsShared": 0,
	            "isError": True,
	        }

	    # Get all files in the repository
	    all_files, error = _get_repo_tree(org, repo, token)

	    # Handle errors (not found, API errors, empty repo)
	    if error_result := _handle_repo_tree_errors(all_files, error, org, repo, token):
	        return error_result

	    # Step 1: Filter files by example patterns (score >= 60)
	    example_threshold = 60
	    example_files = [
	        {**file, "example_score": example_score}
	        for file in all_files
	        if (example_score := _score_against_example_patterns(file["path"]))
	        >= example_threshold
	    ]

	    if not example_files:
	        return {
	            "formatted": f"No example files found in {org}/{repo} (no files match example patterns with score >= {example_threshold}).",
	            "totalResults": 0,
	            "resultsShared": 0,
	        }

	    # Step 2: If keyword provided, score and filter by keyword
	    if keyword:
	        scored_files = [
	            {**file, "score": keyword_score}
	            for file in example_files
	            if (keyword_score := _score_against_keyword(file["path"], keyword))
	            >= min_score
	        ]

	        if not scored_files:
	            return {
	                "formatted": f"No files found in {org}/{repo} matching keyword '{keyword}' (min score: {min_score}) among {len(example_files)} example files.",
	                "totalResults": 0,
	                "resultsShared": 0,
	            }

	        # Sort by keyword score (descending) for best matches first
	        scored_files.sort(key=lambda x: x["score"], reverse=True)
	    else:
	        # No keyword: prioritize by pattern directory, then path depth.
	        # example_files is known to be non-empty here, so no emptiness
	        # check is needed after building the list.
	        scored_files = []
	        for file in example_files:
	            in_examples_dir, pattern_priority, path_depth = _get_pattern_priority(
	                file["path"]
	            )
	            scored_files.append(
	                {
	                    **file,
	                    "score": file["example_score"],
	                    "in_examples_dir": in_examples_dir,
	                    "pattern_priority": pattern_priority,
	                    "path_depth": path_depth,
	                }
	            )

	        # Sort by: 1) files in examples/ dir first, 2) pattern priority
	        # (position in EXAMPLE_PATTERNS), 3) path depth, 4) path name
	        scored_files.sort(
	            key=lambda x: (
	                x["in_examples_dir"],
	                x["pattern_priority"],
	                x["path_depth"],
	                x["path"],
	            )
	        )

	    # Limit results. A negative limit would otherwise slice from the end of
	    # the list and silently drop the lowest-ranked entries instead.
	    limit = max(max_results, 0)
	    results = scored_files[:limit]

	    # Format output
	    keyword_desc = f" matching '{keyword}'" if keyword else ""
	    lines = [f"Found {len(results)} example files in {org}/{repo}{keyword_desc}:"]
	    if len(scored_files) > limit:
	        lines[0] += f" (showing {limit} of {len(scored_files)})"
	    lines.append("")

	    for i, file in enumerate(results, 1):
	        lines.append(f"{i}. {file['path']}")
	        lines.append(f" Size: {file['size']:,} bytes | Ref: {file['ref'][:7]}")
	        lines.append(f" URL: {file['url']}")

	        # Copyable parameters for read_file tool
	        read_params = f"{{'repo': '{org}/{repo}', 'path': '{file['path']}'}}"
	        lines.append(f" To read, use: {read_params}")
	        lines.append("")

	    return {
	        "formatted": "\n".join(lines),
	        # Total matches vs. the subset actually listed above.
	        "totalResults": len(scored_files),
	        "resultsShared": len(results),
	    }


	# Tool specification
	#
	# Declares the tool's name, the description text, and a JSON-schema-style
	# "parameters" object; only "repo" is required.
	# The defaults quoted in the parameter descriptions (max_results 50,
	# min_score 60) are the values github_find_examples_handler passes when an
	# argument is omitted; find_examples' own signature defaults are 10 and 80.
	GITHUB_FIND_EXAMPLES_TOOL_SPEC = {
	"name": "github_find_examples",
	# One string assembled by implicit concatenation: usage guidance first, then
	# "How it works" and "Examples" sections.
	"description": (
	"Discover working code examples, tutorials, scripts, and demos in GitHub repositories. "
	"⚠️ CRITICAL: ALWAYS use this BEFORE implementing ML tasks - find working reference code first. "
	"Your training data may be outdated; real repository examples show current best practices. "
	"Use when: (1) Starting any ML implementation (training, inference, evaluation), "
	"(2) User asks 'how to' questions about libraries, (3) Need reference implementations, "
	"(4) Exploring library capabilities, (5) Before writing training/processing scripts. "
	"Pattern: github_find_examples (discover) → github_read_file (study code) → implement with researched approach. "
	"Returns: List of example files (scripts/notebooks/tutorials) with paths and URLs, sorted by relevance. "
	"Then: Use github_read_file to read the actual implementation code. "
	"Critical for reliability: Real examples prevent outdated API usage and show proven patterns. "
	"## How it works\n\n"
	"1. Fetches all example files (examples/, scripts/, tutorials/, demos/, notebooks/, etc.) from repository\n"
	"2. If keyword provided, scores files against keyword using fuzzy matching\n"
	"3. Returns best matches sorted by relevance and pattern priority\n"
	"4. Provides copyable parameters for github_read_file tool\n\n"
	"## Examples\n\n"
	"<example>\n"
	"// ML Workflow Step: Find GRPO training examples before implementation\n"
	"// Task: Starting GRPO fine-tuning project, need reference implementation\n"
	"{\n"
	" keyword: 'grpo',\n"
	" repo: 'trl',\n"
	" org: 'huggingface'\n"
	"}\n"
	"// Returns: examples/scripts/grpo_agent.py, examples/scripts/grpo_vlm.py\n"
	"// Next step: github_read_file to study working implementation\n"
	"</example>\n\n"
	"<example>\n"
	"// ML Workflow Step: Discover all available training methods\n"
	"// Task: Exploring TRL training options before choosing approach\n"
	"{\n"
	" repo: 'trl',\n"
	" org: 'huggingface',\n"
	" max_results: 20\n"
	"}\n"
	"// Lists: SFT, DPO, GRPO, PPO, reward modeling examples\n"
	"// Helps user choose appropriate method\n"
	"</example>\n\n"
	"<example>\n"
	"// ML Workflow Step: Find LoRA fine-tuning examples\n"
	"// Task: Learning parameter-efficient fine-tuning patterns\n"
	"{\n"
	" keyword: 'lora',\n"
	" repo: 'peft',\n"
	" org: 'huggingface'\n"
	"}\n"
	"// Discovers LoRA configuration and training examples\n"
	"// Shows current PEFT API usage patterns\n"
	"</example>"
	),
	# Argument schema; the keys mirror find_examples' keyword arguments.
	"parameters": {
	"type": "object",
	"properties": {
	"keyword": {
	"type": "string",
	"description": "Keyword to fuzzy match against file paths (e.g., 'grpo', 'sft').",
	},
	"repo": {
	"type": "string",
	"description": "Repository name (e.g., 'trl', 'transformers'). Required.",
	},
	"org": {
	"type": "string",
	"description": "GitHub organization or username. Default: 'huggingface'.",
	},
	"max_results": {
	"type": "integer",
	"description": "Maximum number of results to return. Default: 50.",
	},
	"min_score": {
	"type": "integer",
	"description": "Minimum fuzzy match score (0-100). Default: 60.",
	},
	},
	"required": ["repo"],
	},
	}


	async def github_find_examples_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
	    """Handler for agent tool router.

	    Maps a tool-call argument dict onto ``find_examples``. Arguments that are
	    absent or explicitly ``None`` fall back to the handler defaults
	    (keyword "", repo "", org "huggingface", max_results 50, min_score 60), so
	    a missing ``repo`` yields find_examples' own validation message instead of
	    a raw ``KeyError`` text.

	    Args:
	        arguments: Tool-call arguments keyed by parameter name.

	    Returns:
	        ``(text, ok)``: the formatted result text, and ``True`` unless the
	        result is flagged ``isError`` or an exception was raised.
	    """

	    def _arg(name: str, default: Any) -> Any:
	        # Use the default when the key is missing or its value is None.
	        value = arguments.get(name)
	        return default if value is None else value

	    try:
	        # NOTE(review): find_examples performs blocking HTTP requests; if the
	        # router shares one event loop across tools, consider running it via
	        # asyncio.to_thread -- confirm against the router's expectations.
	        result = find_examples(
	            keyword=_arg("keyword", ""),
	            repo=_arg("repo", ""),
	            org=_arg("org", "huggingface"),
	            max_results=_arg("max_results", 50),
	            min_score=_arg("min_score", 60),
	        )
	        return result["formatted"], not result.get("isError", False)
	    except Exception as e:
	        return f"Error finding examples: {str(e)}", False