# ml-agent/agent/tools/github_read_file.py
"""
GitHub Read File Tool - Read file contents from any GitHub repository with line range support
Fetch exact file contents with metadata, supporting line ranges for efficient reading.
"""
import base64
import json
import os
from typing import Any, Dict, Optional
import nbformat
import requests
from nbconvert import MarkdownExporter
from nbconvert.preprocessors import ClearOutputPreprocessor, TagRemovePreprocessor
from agent.tools.types import ToolResult
def _convert_ipynb_to_markdown(content: str) -> str:
"""
Convert Jupyter notebook JSON to LLM-friendly Markdown.
Args:
content: Raw notebook JSON string
Returns:
Converted Markdown string
"""
try:
# Parse notebook JSON
nb_dict = json.loads(content)
# Normalize cell sources (can be string or list of strings)
if "cells" in nb_dict:
for cell in nb_dict["cells"]:
if "source" in cell and isinstance(cell["source"], list):
cell["source"] = "".join(cell["source"])
# Read notebook with explicit version
nb = nbformat.reads(json.dumps(nb_dict), as_version=4)
# Strip outputs for LLM readability (outputs can be noisy/large)
clear = ClearOutputPreprocessor()
nb, _ = clear.preprocess(nb, {})
# Optionally remove cells tagged with "hide" or similar
remove = TagRemovePreprocessor(
remove_cell_tags={"hide", "hidden", "remove"},
remove_input_tags=set(),
remove_all_outputs_tags=set(),
)
nb, _ = remove.preprocess(nb, {})
# Convert to markdown
exporter = MarkdownExporter()
markdown, _ = exporter.from_notebook_node(nb)
return markdown
except json.JSONDecodeError:
return content
except Exception:
return content
def _error(message: str) -> ToolResult:
    """Build the standard error-shaped ToolResult payload."""
    return {
        "formatted": message,
        "totalResults": 0,
        "resultsShared": 0,
        "isError": True,
    }


def read_file(
    repo: str,
    path: str,
    ref: str = "HEAD",
    line_start: Optional[int] = None,
    line_end: Optional[int] = None,
) -> ToolResult:
    """
    Read file contents from a GitHub repository with line range support.

    Requires the GITHUB_TOKEN environment variable. Jupyter notebooks
    (*.ipynb) are converted to Markdown before line-range processing.
    Without an explicit range, output is capped at the first 300 lines.

    Args:
        repo: Repository in format "owner/repo" (e.g., "github/github-mcp-server")
        path: Path to file in repository (e.g., "pkg/github/search.go")
        ref: Git reference - branch name, tag, or commit SHA (default: "HEAD")
        line_start: Starting line number (1-indexed, inclusive)
        line_end: Ending line number (1-indexed, inclusive)

    Returns:
        ToolResult with file contents and metadata; on failure the dict has
        isError=True and a human-readable message in "formatted".
    """
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        return _error("Error: GITHUB_TOKEN environment variable is required")

    # Parse repo into owner / name.
    if "/" not in repo:
        return _error("Error: repo must be in format 'owner/repo'")
    owner, repo_name = repo.split("/", 1)

    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }
    # Fetch file contents via the repository contents API.
    url = f"https://api.github.com/repos/{owner}/{repo_name}/contents/{path}"
    params = {}
    if ref and ref != "HEAD":
        params["ref"] = ref

    try:
        response = requests.get(url, headers=headers, params=params, timeout=30)
        if response.status_code == 404:
            return _error(f"File not found: {path} in {repo} (ref: {ref})")
        if response.status_code != 200:
            error_msg = f"GitHub API error (status {response.status_code})"
            try:
                error_data = response.json()
                if "message" in error_data:
                    error_msg += f": {error_data['message']}"
            except Exception:
                pass  # best-effort: error body may not be JSON
            return _error(error_msg)

        data = response.json()
        # The contents API also serves directories/symlinks; only files are valid here.
        if data.get("type") != "file":
            return _error(f"Path {path} is not a file (type: {data.get('type')})")

        # Decode content. The API base64-encodes small files; for large
        # files "content" comes back empty and we must re-fetch raw bytes.
        content_b64 = data.get("content", "")
        if content_b64:
            # GitHub wraps the base64 payload with whitespace; strip it first.
            content_b64 = content_b64.replace("\n", "").replace(" ", "")
            content = base64.b64decode(content_b64).decode("utf-8", errors="replace")
        else:
            raw_headers = {
                "Accept": "application/vnd.github.raw",
                "X-GitHub-Api-Version": "2022-11-28",
                "Authorization": f"Bearer {token}",
            }
            raw_response = requests.get(
                url, headers=raw_headers, params=params, timeout=30
            )
            if raw_response.status_code != 200:
                return _error("Failed to fetch file content")
            content = raw_response.text

        # Notebooks are converted to Markdown before line-range slicing.
        if path.lower().endswith(".ipynb"):
            content = _convert_ipynb_to_markdown(content)

        # Resolve the requested line range (1-indexed, inclusive).
        lines = content.split("\n")
        total_lines = len(lines)
        truncated = False
        if line_start is None and line_end is None:
            # No range specified: cap large files at the first 300 lines.
            if total_lines > 300:
                line_start, line_end = 1, 300
                truncated = True
            else:
                line_start, line_end = 1, total_lines
        else:
            # Partial range specified: fill in the open end.
            if line_start is None:
                line_start = 1
            if line_end is None:
                line_end = total_lines

        # Clamp to the file's bounds, then validate ordering.
        line_start = max(1, line_start)
        line_end = min(total_lines, line_end)
        if line_start > line_end:
            return _error(
                f"Invalid range: line_start ({line_start}) > line_end ({line_end})"
            )

        # Extract the selected lines.
        selected_content = "\n".join(lines[line_start - 1 : line_end])

        # Format output for the LLM.
        lines_output = [f"**Reading file from repo: {repo}, path: {path}**"]
        if ref and ref != "HEAD":
            lines_output.append(f"Ref: {ref}")
        # Fix: original emitted unbalanced markdown bold ("**File content:").
        lines_output.append("\n**File content:**")
        lines_output.append("```")
        lines_output.append(selected_content)
        lines_output.append("```")
        if truncated:
            lines_output.append(
                f"Currently showing lines {line_start}-{line_end} out of {total_lines} total lines. Use line_start and line_end to view more lines."
            )
        return {
            "formatted": "\n".join(lines_output),
            "totalResults": 1,
            "resultsShared": 1,
        }
    except requests.exceptions.RequestException as e:
        return _error(f"Failed to connect to GitHub API: {str(e)}")
# Tool specification consumed by the agent's tool router.
# Shape follows the function-calling schema convention: a tool name, a
# model-facing "description" (usage guidance plus worked examples that are
# shown to the LLM as prompt text), and a JSON-Schema "parameters" object.
GITHUB_READ_FILE_TOOL_SPEC = {
    "name": "github_read_file",
    # NOTE: the description intentionally embeds when-to-use / when-NOT-to-use
    # guidance and <example> blocks — these steer the model's tool selection.
    "description": (
        "Read file contents from GitHub repositories with line range support (default 300 lines). "
        "⚠️ CRITICAL: Use AFTER github_find_examples to study working implementation code. "
        "**Use when:** (1) Found example file via github_find_examples and need full code, "
        "(2) Need to read trainer class implementation, (3) Study configuration patterns, "
        "(4) Read specific code sections with line ranges, (5) Review code from specific branches/commits. "
        "**Pattern:** github_find_examples (discover files) → github_read_file (read code) → implement using researched patterns. "
        "Returns: File contents with line numbers, formatted for LLM reading. Auto-converts Jupyter notebooks to markdown. "
        "**Then:** Implement using patterns and APIs from the example code. "
        "**Critical for reliability:** Reading working examples prevents API errors and shows current best practices. "
        "Use line_start/line_end for large files (>300 lines) to read specific sections.\n\n"
        "## When to use this tool\n\n"
        "- When reading example code, trainer implementations, or configuration files\n"
        "- After github_find_examples returns file paths you want to study\n"
        "- When investigating specific code sections with line ranges\n"
        "- When reading from specific branches, tags, or commits (use ref parameter)\n\n"
        "## When NOT to use this tool\n\n"
        "- When you don't know exact file path (use github_find_examples or github_search_code first)\n"
        "- When searching for code patterns across repos (use github_search_code instead)\n\n"
        "## Examples\n\n"
        "<example>\n"
        "// ML Workflow Step: Read GRPO trainer class after finding via github_find_examples\n"
        "// Use case: Understand GRPOTrainer API, parameters, and methods\n"
        "{\n"
        " repo: 'huggingface/trl',\n"
        " path: 'trl/trainer/grpo_trainer.py',\n"
        " line_start: 1,\n"
        " line_end: 200\n"
        "}\n"
        "// Read class definition and constructor to understand current API\n"
        "// Shows: __init__ parameters, configuration, required arguments\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Study complete training script from examples\n"
        "// Use case: Learn end-to-end VLM fine-tuning workflow\n"
        "{\n"
        " repo: 'huggingface/trl',\n"
        " path: 'examples/scripts/grpo_vlm.py'\n"
        "}\n"
        "// Returns first 300 lines - shows full training setup\n"
        "// Use line_start/line_end if need to read more\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Check TrainingArguments configuration patterns\n"
        "// Use case: Learn how to structure training configs correctly\n"
        "{\n"
        " repo: 'huggingface/transformers',\n"
        " path: 'examples/pytorch/language-modeling/run_clm.py',\n"
        " line_start: 50,\n"
        " line_end: 150\n"
        "}\n"
        "// Read argument parsing and config setup section\n"
        "// Shows: current parameter names, default values, best practices\n"
        "</example>"
    ),
    # JSON-Schema for the tool arguments; mirrors read_file()'s signature.
    "parameters": {
        "type": "object",
        "properties": {
            "repo": {
                "type": "string",
                "description": "Repository in format 'owner/repo' (e.g., 'github/github-mcp-server'). Required.",
            },
            "path": {
                "type": "string",
                "description": "Path to file in repository (e.g., 'src/index.js'). Required.",
            },
            "ref": {
                "type": "string",
                "description": "Git reference - branch name, tag, or commit SHA. Default: 'HEAD'.",
            },
            "line_start": {
                "type": "integer",
                "description": "Starting line number (1-indexed, inclusive). Optional.",
            },
            "line_end": {
                "type": "integer",
                "description": "Ending line number (1-indexed, inclusive). Optional.",
            },
        },
        "required": ["repo", "path"],
    },
}
async def github_read_file_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Adapter between the agent tool router and read_file().

    Unpacks the router's argument dict, invokes the synchronous
    read_file(), and returns (formatted_text, success_flag). Any
    exception is reported as a failed call rather than propagated.
    """
    try:
        result = read_file(
            repo=arguments["repo"],
            path=arguments["path"],
            ref=arguments.get("ref", "HEAD"),
            line_start=arguments.get("line_start"),
            line_end=arguments.get("line_end"),
        )
        succeeded = not result.get("isError", False)
        return result["formatted"], succeeded
    except Exception as exc:
        return f"Error reading file: {str(exc)}", False