Spaces:
Running
Running
main merge
Browse files- .gitignore +2 -1
- README.md +48 -26
- agent/config.py +3 -0
- agent/context_manager/manager.py +12 -0
- agent/core/agent_loop.py +60 -22
- agent/core/session.py +171 -1
- agent/core/session_uploader.py +194 -0
- agent/core/tools.py +63 -15
- agent/main.py +16 -12
- agent/prompts/system_prompt.yaml +1 -2
- agent/tools/__init__.py +24 -0
- agent/tools/docs_tools.py +0 -49
- agent/tools/github_find_examples.py +489 -0
- agent/tools/github_list_repos.py +281 -0
- agent/tools/github_read_file.py +336 -0
- agent/tools/github_search_code.py +453 -0
- agent/tools/jobs_tool.py +62 -6
- agent/tools/utilities.py +2 -2
- agent/tools/utils_tools.py +5 -8
- configs/main_agent_config.json +3 -1
- pyproject.toml +31 -12
- tests/unit/tools/test_jobs_tool.py +83 -0
- uv.lock +0 -0
.gitignore
CHANGED
|
@@ -15,4 +15,5 @@ wheels/
|
|
| 15 |
*.csv
|
| 16 |
/logs
|
| 17 |
hf-agent-leaderboard/
|
| 18 |
-
.cursor/
|
|
|
|
|
|
| 15 |
*.csv
|
| 16 |
/logs
|
| 17 |
hf-agent-leaderboard/
|
| 18 |
+
.cursor/
|
| 19 |
+
session_logs/
|
README.md
CHANGED
|
@@ -11,9 +11,11 @@ An MLE agent CLI with MCP (Model Context Protocol) integration and built-in tool
|
|
| 11 |
# Clone the repository
|
| 12 |
git clone git@github.com:huggingface/hf_agent.git
|
| 13 |
cd hf-agent
|
|
|
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
|
|
|
| 17 |
```
|
| 18 |
|
| 19 |
### Interactive CLI
|
|
@@ -21,11 +23,19 @@ uv sync
|
|
| 21 |
```bash
|
| 22 |
uv run python -m agent.main
|
| 23 |
```
|
| 24 |
-
|
| 25 |
This starts an interactive chat session with the agent. Type your messages and the agent will respond, using tools as needed.
|
| 26 |
|
| 27 |
The agent will automatically discover and register all tools from configured MCP servers.
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
## Architecture
|
| 30 |
|
| 31 |
### Component Overview
|
|
@@ -58,16 +68,20 @@ The agent will automatically discover and register all tools from configured MCP
|
|
| 58 |
│ │ │ │ │ ContextManager │ │ │ │ │ │
|
| 59 |
│ │ │ │ │ • Message history │ │ │ │ │ │
|
| 60 |
│ │ │ │ │ (litellm.Message[]) │ │ │ │ │ │
|
|
|
|
| 61 |
│ │ │ │ └────────────────────────────┘ │ │ │ │ │
|
| 62 |
│ │ │ │ │ │ │ │ │
|
| 63 |
│ │ │ │ ┌────────────────────────────┐ │ │ │ │ │
|
| 64 |
│ │ │ │ │ ToolRouter │ │ │ │ │ │
|
| 65 |
-
│ │ │ │ │ ├─
|
| 66 |
-
│ │ │ │ │ ├─
|
| 67 |
-
│ │ │ │ │ ├─
|
| 68 |
-
│ │ │ │ │
|
| 69 |
-
│ │ │ │ │
|
| 70 |
-
│ │ │ │ │
|
|
|
|
|
|
|
|
|
|
| 71 |
│ │ │ │ └────────────────────────────┘ │ │ │ │ │
|
| 72 |
│ │ │ └──────────────────────────────────┘ │ │ │ │
|
| 73 |
│ │ │ │ │ │ │
|
|
@@ -121,16 +135,20 @@ User Message
|
|
| 121 |
agent/
|
| 122 |
├── config.py # Configuration models
|
| 123 |
├── main.py # Interactive CLI entry point
|
|
|
|
|
|
|
| 124 |
├── context_manager/
|
| 125 |
-
│ └── manager.py # Message history
|
| 126 |
└── core/
|
| 127 |
├── agent_loop.py # Main agent loop and handlers
|
| 128 |
├── session.py # Session management
|
| 129 |
├── mcp_client.py # MCP SDK integration
|
| 130 |
└── tools.py # ToolRouter and built-in tools
|
| 131 |
|
| 132 |
-
|
| 133 |
-
|
|
|
|
|
|
|
| 134 |
eval/ # Evaluation suite (see eval/README.md)
|
| 135 |
```
|
| 136 |
|
|
@@ -143,6 +161,7 @@ The agent emits the following events via `event_queue`:
|
|
| 143 |
- `assistant_message` - LLM response text
|
| 144 |
- `tool_call` - Tool being called with arguments
|
| 145 |
- `tool_output` - Tool execution result
|
|
|
|
| 146 |
- `turn_complete` - Agent finished processing
|
| 147 |
- `error` - Error occurred during processing
|
| 148 |
- `interrupted` - Agent was interrupted
|
|
@@ -177,18 +196,21 @@ def create_builtin_tools() -> list[ToolSpec]:
|
|
| 177 |
|
| 178 |
### Adding MCP Servers
|
| 179 |
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
```
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
|
|
|
| 194 |
```
|
|
|
|
|
|
|
|
|
| 11 |
# Clone the repository
|
| 12 |
git clone git@github.com:huggingface/hf_agent.git
|
| 13 |
cd hf-agent
|
| 14 |
+
```
|
| 15 |
|
| 16 |
+
#### Install recommended dependencies
|
| 17 |
+
```bash
|
| 18 |
+
uv sync --extra agent # or uv sync --extra all
|
| 19 |
```
|
| 20 |
|
| 21 |
### Interactive CLI
|
|
|
|
| 23 |
```bash
|
| 24 |
uv run python -m agent.main
|
| 25 |
```
|
|
|
|
| 26 |
This starts an interactive chat session with the agent. Type your messages and the agent will respond, using tools as needed.
|
| 27 |
|
| 28 |
The agent will automatically discover and register all tools from configured MCP servers.
|
| 29 |
|
| 30 |
+
|
| 31 |
+
### Env Setup
|
| 32 |
+
```bash
|
| 33 |
+
ANTHROPIC_API_KEY=<one-key-to-rule-them-all>
|
| 34 |
+
HF_TOKEN=<hf-token-to-access-the-hub>
|
| 35 |
+
GITHUB_TOKEN=<gh-pat-key-for-not-reinventing-the-wheel>
|
| 36 |
+
HF_NAMESPACE=<hf-namespace-to-use>
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
## Architecture
|
| 40 |
|
| 41 |
### Component Overview
|
|
|
|
| 68 |
│ │ │ │ │ ContextManager │ │ │ │ │ │
|
| 69 |
│ │ │ │ │ • Message history │ │ │ │ │ │
|
| 70 |
│ │ │ │ │ (litellm.Message[]) │ │ │ │ │ │
|
| 71 |
+
│ │ │ │ │ • Auto-compaction (180k) │ │ │ │ │ │
|
| 72 |
│ │ │ │ └────────────────────────────┘ │ │ │ │ │
|
| 73 |
│ │ │ │ │ │ │ │ │
|
| 74 |
│ │ │ │ ┌────────────────────────────┐ │ │ │ │ │
|
| 75 |
│ │ │ │ │ ToolRouter │ │ │ │ │ │
|
| 76 |
+
│ │ │ │ │ ├─ explore_hf_docs │ │ │ │ │ │
|
| 77 |
+
│ │ │ │ │ ├─ fetch_hf_docs │ │ │ │ │ │
|
| 78 |
+
│ │ │ │ │ ├─ search_hf_api_endpoints│ │ │ │ │ │
|
| 79 |
+
│ │ │ │ │ ├─ plan_tool │ │ │ │ │ │
|
| 80 |
+
│ │ │ │ │ ├─ hf_jobs* │ │ │ │ │ │
|
| 81 |
+
│ │ │ │ │ ├─ hf_private_repos* │ │ │ │ │ │
|
| 82 |
+
│ │ │ │ │ ├─ github_* (3 tools) │ │ │ │ │ │
|
| 83 |
+
│ │ │ │ │ └─ MCP tools (e.g., │ │ │ │ │ │
|
| 84 |
+
│ │ │ │ │ model_search, etc.) │ │ │ │ │ │
|
| 85 |
│ │ │ │ └────────────────────────────┘ │ │ │ │ │
|
| 86 |
│ │ │ └──────────────────────────────────┘ │ │ │ │
|
| 87 |
│ │ │ │ │ │ │
|
|
|
|
| 135 |
agent/
|
| 136 |
├── config.py # Configuration models
|
| 137 |
├── main.py # Interactive CLI entry point
|
| 138 |
+
├── prompts/
|
| 139 |
+
│ └── system_prompt.yaml # Agent behavior and personality
|
| 140 |
├── context_manager/
|
| 141 |
+
│ └── manager.py # Message history & auto-compaction
|
| 142 |
└── core/
|
| 143 |
├── agent_loop.py # Main agent loop and handlers
|
| 144 |
├── session.py # Session management
|
| 145 |
├── mcp_client.py # MCP SDK integration
|
| 146 |
└── tools.py # ToolRouter and built-in tools
|
| 147 |
|
| 148 |
+
configs/
|
| 149 |
+
└── main_agent_config.json # Model and MCP server configuration
|
| 150 |
+
|
| 151 |
+
tests/ # Integration and unit tests
|
| 152 |
eval/ # Evaluation suite (see eval/README.md)
|
| 153 |
```
|
| 154 |
|
|
|
|
| 161 |
- `assistant_message` - LLM response text
|
| 162 |
- `tool_call` - Tool being called with arguments
|
| 163 |
- `tool_output` - Tool execution result
|
| 164 |
+
- `approval_request` - Requesting user approval for sensitive operations
|
| 165 |
- `turn_complete` - Agent finished processing
|
| 166 |
- `error` - Error occurred during processing
|
| 167 |
- `interrupted` - Agent was interrupted
|
|
|
|
| 196 |
|
| 197 |
### Adding MCP Servers
|
| 198 |
|
| 199 |
+
Edit `configs/main_agent_config.json`:
|
| 200 |
+
|
| 201 |
+
```json
|
| 202 |
+
{
|
| 203 |
+
"model_name": "anthropic/claude-sonnet-4-5-20250929",
|
| 204 |
+
"mcpServers": {
|
| 205 |
+
"your-server-name": {
|
| 206 |
+
"transport": "http",
|
| 207 |
+
"url": "https://example.com/mcp",
|
| 208 |
+
"headers": {
|
| 209 |
+
"Authorization": "Bearer ${YOUR_TOKEN}"
|
| 210 |
+
}
|
| 211 |
+
}
|
| 212 |
+
}
|
| 213 |
+
}
|
| 214 |
```
|
| 215 |
+
|
| 216 |
+
Note: Environment variables like `${YOUR_TOKEN}` are auto-substituted from `.env`.
|
agent/config.py
CHANGED
|
@@ -19,6 +19,9 @@ class Config(BaseModel):
|
|
| 19 |
|
| 20 |
model_name: str
|
| 21 |
mcpServers: dict[str, MCPServerConfig] = {}
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
def substitute_env_vars(obj: Any) -> Any:
|
|
|
|
| 19 |
|
| 20 |
model_name: str
|
| 21 |
mcpServers: dict[str, MCPServerConfig] = {}
|
| 22 |
+
save_sessions: bool = True
|
| 23 |
+
session_dataset_repo: str = "smolagents/hf-agent-sessions"
|
| 24 |
+
auto_save_interval: int = 3 # Save every N user turns (0 = disabled)
|
| 25 |
|
| 26 |
|
| 27 |
def substitute_env_vars(obj: Any) -> Any:
|
agent/context_manager/manager.py
CHANGED
|
@@ -2,6 +2,8 @@
|
|
| 2 |
Context management for conversation history
|
| 3 |
"""
|
| 4 |
|
|
|
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
from typing import Any
|
| 7 |
|
|
@@ -42,10 +44,20 @@ class ContextManager:
|
|
| 42 |
prompt_data = yaml.safe_load(f)
|
| 43 |
template_str = prompt_data.get("system_prompt", "")
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
template = Template(template_str)
|
| 46 |
return template.render(
|
| 47 |
tools=tool_specs,
|
| 48 |
num_tools=len(tool_specs),
|
|
|
|
|
|
|
|
|
|
| 49 |
)
|
| 50 |
|
| 51 |
def add_message(self, message: Message, token_count: int = None) -> None:
|
|
|
|
| 2 |
Context management for conversation history
|
| 3 |
"""
|
| 4 |
|
| 5 |
+
import zoneinfo
|
| 6 |
+
from datetime import datetime
|
| 7 |
from pathlib import Path
|
| 8 |
from typing import Any
|
| 9 |
|
|
|
|
| 44 |
prompt_data = yaml.safe_load(f)
|
| 45 |
template_str = prompt_data.get("system_prompt", "")
|
| 46 |
|
| 47 |
+
# Get current date and time
|
| 48 |
+
tz = zoneinfo.ZoneInfo("Europe/Paris")
|
| 49 |
+
now = datetime.now(tz)
|
| 50 |
+
current_date = now.strftime("%d-%m-%Y")
|
| 51 |
+
current_time = now.strftime("%H:%M:%S.%f")[:-3]
|
| 52 |
+
current_timezone = f"{now.strftime('%Z')} (UTC{now.strftime('%z')[:3]}:{now.strftime('%z')[3:]})"
|
| 53 |
+
|
| 54 |
template = Template(template_str)
|
| 55 |
return template.render(
|
| 56 |
tools=tool_specs,
|
| 57 |
num_tools=len(tool_specs),
|
| 58 |
+
current_date=current_date,
|
| 59 |
+
current_time=current_time,
|
| 60 |
+
current_timezone=current_timezone,
|
| 61 |
)
|
| 62 |
|
| 63 |
def add_message(self, message: Message, token_count: int = None) -> None:
|
agent/core/agent_loop.py
CHANGED
|
@@ -25,9 +25,15 @@ def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
|
|
| 25 |
args = tool_args.get("args", {})
|
| 26 |
# Sometimes LLM passes args as string instead of dict
|
| 27 |
if isinstance(args, str):
|
| 28 |
-
return
|
|
|
|
|
|
|
|
|
|
| 29 |
if not isinstance(args, dict) and args is not None:
|
| 30 |
-
return
|
|
|
|
|
|
|
|
|
|
| 31 |
return True, None
|
| 32 |
|
| 33 |
|
|
@@ -38,8 +44,6 @@ def _needs_approval(tool_name: str, tool_args: dict) -> bool:
|
|
| 38 |
if not args_valid:
|
| 39 |
return False
|
| 40 |
|
| 41 |
-
args = tool_args.get("args", {})
|
| 42 |
-
|
| 43 |
if tool_name == "hf_jobs":
|
| 44 |
# Check if it's a run or uv operation
|
| 45 |
operation = tool_args.get("operation", "")
|
|
@@ -251,6 +255,11 @@ class Handlers:
|
|
| 251 |
data={"history_size": len(session.context_manager.items)},
|
| 252 |
)
|
| 253 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
return final_response
|
| 255 |
|
| 256 |
@staticmethod
|
|
@@ -410,6 +419,14 @@ class Handlers:
|
|
| 410 |
@staticmethod
|
| 411 |
async def shutdown(session: Session) -> bool:
|
| 412 |
"""Handle shutdown (like shutdown in codex.rs:1329)"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
session.is_running = False
|
| 414 |
await session.send_event(Event(event_type="shutdown"))
|
| 415 |
return True
|
|
@@ -470,26 +487,47 @@ async def submission_loop(
|
|
| 470 |
session = Session(event_queue, config=config, tool_router=tool_router)
|
| 471 |
print("Agent loop started")
|
| 472 |
|
| 473 |
-
#
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
Event(event_type="ready", data={"message": "Agent initialized"})
|
| 478 |
)
|
| 479 |
|
| 480 |
-
|
| 481 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 486 |
break
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
|
| 495 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
args = tool_args.get("args", {})
|
| 26 |
# Sometimes LLM passes args as string instead of dict
|
| 27 |
if isinstance(args, str):
|
| 28 |
+
return (
|
| 29 |
+
False,
|
| 30 |
+
f"Tool call error: 'args' must be a JSON object, not a string. You passed: {repr(args)}",
|
| 31 |
+
)
|
| 32 |
if not isinstance(args, dict) and args is not None:
|
| 33 |
+
return (
|
| 34 |
+
False,
|
| 35 |
+
f"Tool call error: 'args' must be a JSON object. You passed type: {type(args).__name__}",
|
| 36 |
+
)
|
| 37 |
return True, None
|
| 38 |
|
| 39 |
|
|
|
|
| 44 |
if not args_valid:
|
| 45 |
return False
|
| 46 |
|
|
|
|
|
|
|
| 47 |
if tool_name == "hf_jobs":
|
| 48 |
# Check if it's a run or uv operation
|
| 49 |
operation = tool_args.get("operation", "")
|
|
|
|
| 255 |
data={"history_size": len(session.context_manager.items)},
|
| 256 |
)
|
| 257 |
)
|
| 258 |
+
|
| 259 |
+
# Increment turn counter and check for auto-save
|
| 260 |
+
session.increment_turn()
|
| 261 |
+
await session.auto_save_if_needed()
|
| 262 |
+
|
| 263 |
return final_response
|
| 264 |
|
| 265 |
@staticmethod
|
|
|
|
| 419 |
@staticmethod
|
| 420 |
async def shutdown(session: Session) -> bool:
|
| 421 |
"""Handle shutdown (like shutdown in codex.rs:1329)"""
|
| 422 |
+
# Save session trajectory if enabled (fire-and-forget, returns immediately)
|
| 423 |
+
if session.config.save_sessions:
|
| 424 |
+
print("💾 Saving session...")
|
| 425 |
+
repo_id = session.config.session_dataset_repo
|
| 426 |
+
local_path = session.save_and_upload_detached(repo_id)
|
| 427 |
+
if local_path:
|
| 428 |
+
print("✅ Session saved locally, upload in progress")
|
| 429 |
+
|
| 430 |
session.is_running = False
|
| 431 |
await session.send_event(Event(event_type="shutdown"))
|
| 432 |
return True
|
|
|
|
| 487 |
session = Session(event_queue, config=config, tool_router=tool_router)
|
| 488 |
print("Agent loop started")
|
| 489 |
|
| 490 |
+
# Retry any failed uploads from previous sessions (fire-and-forget)
|
| 491 |
+
if config and config.save_sessions:
|
| 492 |
+
Session.retry_failed_uploads_detached(
|
| 493 |
+
directory="session_logs", repo_id=config.session_dataset_repo
|
|
|
|
| 494 |
)
|
| 495 |
|
| 496 |
+
try:
|
| 497 |
+
# Main processing loop
|
| 498 |
+
async with tool_router:
|
| 499 |
+
# Emit ready event after initialization
|
| 500 |
+
await session.send_event(
|
| 501 |
+
Event(event_type="ready", data={"message": "Agent initialized"})
|
| 502 |
+
)
|
| 503 |
|
| 504 |
+
while session.is_running:
|
| 505 |
+
submission = await submission_queue.get()
|
| 506 |
+
|
| 507 |
+
try:
|
| 508 |
+
should_continue = await process_submission(session, submission)
|
| 509 |
+
if not should_continue:
|
| 510 |
+
break
|
| 511 |
+
except asyncio.CancelledError:
|
| 512 |
+
print("\n⚠️ Agent loop cancelled")
|
| 513 |
break
|
| 514 |
+
except Exception as e:
|
| 515 |
+
print(f"❌ Error in agent loop: {e}")
|
| 516 |
+
await session.send_event(
|
| 517 |
+
Event(event_type="error", data={"error": str(e)})
|
| 518 |
+
)
|
| 519 |
+
|
| 520 |
+
print("🛑 Agent loop exited")
|
| 521 |
|
| 522 |
+
finally:
|
| 523 |
+
# Emergency save if session saving is enabled and shutdown wasn't called properly
|
| 524 |
+
if session.config.save_sessions and session.is_running:
|
| 525 |
+
print("\n💾 Emergency save: preserving session before exit...")
|
| 526 |
+
try:
|
| 527 |
+
local_path = session.save_and_upload_detached(
|
| 528 |
+
session.config.session_dataset_repo
|
| 529 |
+
)
|
| 530 |
+
if local_path:
|
| 531 |
+
print("✅ Emergency save successful, upload in progress")
|
| 532 |
+
except Exception as e:
|
| 533 |
+
print(f"❌ Emergency save failed: {e}")
|
agent/core/session.py
CHANGED
|
@@ -1,7 +1,12 @@
|
|
| 1 |
import asyncio
|
|
|
|
|
|
|
|
|
|
| 2 |
import uuid
|
| 3 |
from dataclasses import dataclass
|
|
|
|
| 4 |
from enum import Enum
|
|
|
|
| 5 |
from typing import Any, Optional
|
| 6 |
|
| 7 |
from litellm import get_max_tokens
|
|
@@ -55,11 +60,176 @@ class Session:
|
|
| 55 |
self.current_task: asyncio.Task | None = None
|
| 56 |
self.pending_approval: Optional[dict[str, Any]] = None
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
async def send_event(self, event: Event) -> None:
|
| 59 |
-
"""Send event back to client"""
|
| 60 |
await self.event_queue.put(event)
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
def interrupt(self) -> None:
|
| 63 |
"""Interrupt current running task"""
|
| 64 |
if self.current_task and not self.current_task.done():
|
| 65 |
self.current_task.cancel()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import asyncio
|
| 2 |
+
import json
|
| 3 |
+
import subprocess
|
| 4 |
+
import sys
|
| 5 |
import uuid
|
| 6 |
from dataclasses import dataclass
|
| 7 |
+
from datetime import datetime
|
| 8 |
from enum import Enum
|
| 9 |
+
from pathlib import Path
|
| 10 |
from typing import Any, Optional
|
| 11 |
|
| 12 |
from litellm import get_max_tokens
|
|
|
|
| 60 |
self.current_task: asyncio.Task | None = None
|
| 61 |
self.pending_approval: Optional[dict[str, Any]] = None
|
| 62 |
|
| 63 |
+
# Session trajectory logging
|
| 64 |
+
self.logged_events: list[dict] = []
|
| 65 |
+
self.session_start_time = datetime.now().isoformat()
|
| 66 |
+
self.turn_count: int = 0
|
| 67 |
+
self.last_auto_save_turn: int = 0
|
| 68 |
+
|
| 69 |
async def send_event(self, event: Event) -> None:
|
| 70 |
+
"""Send event back to client and log to trajectory"""
|
| 71 |
await self.event_queue.put(event)
|
| 72 |
|
| 73 |
+
# Log event to trajectory
|
| 74 |
+
self.logged_events.append(
|
| 75 |
+
{
|
| 76 |
+
"timestamp": datetime.now().isoformat(),
|
| 77 |
+
"event_type": event.event_type,
|
| 78 |
+
"data": event.data,
|
| 79 |
+
}
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
def interrupt(self) -> None:
|
| 83 |
"""Interrupt current running task"""
|
| 84 |
if self.current_task and not self.current_task.done():
|
| 85 |
self.current_task.cancel()
|
| 86 |
+
|
| 87 |
+
def increment_turn(self) -> None:
|
| 88 |
+
"""Increment turn counter (called after each user interaction)"""
|
| 89 |
+
self.turn_count += 1
|
| 90 |
+
|
| 91 |
+
async def auto_save_if_needed(self) -> None:
|
| 92 |
+
"""Check if auto-save should trigger and save if so (completely non-blocking)"""
|
| 93 |
+
if not self.config.save_sessions:
|
| 94 |
+
return
|
| 95 |
+
|
| 96 |
+
interval = self.config.auto_save_interval
|
| 97 |
+
if interval <= 0:
|
| 98 |
+
return
|
| 99 |
+
|
| 100 |
+
turns_since_last_save = self.turn_count - self.last_auto_save_turn
|
| 101 |
+
if turns_since_last_save >= interval:
|
| 102 |
+
print(f"\n💾 Auto-saving session (turn {self.turn_count})...")
|
| 103 |
+
# Fire-and-forget save - returns immediately
|
| 104 |
+
self.save_and_upload_detached(self.config.session_dataset_repo)
|
| 105 |
+
self.last_auto_save_turn = self.turn_count
|
| 106 |
+
|
| 107 |
+
def get_trajectory(self) -> dict:
|
| 108 |
+
"""Serialize complete session trajectory for logging"""
|
| 109 |
+
return {
|
| 110 |
+
"session_id": self.session_id,
|
| 111 |
+
"session_start_time": self.session_start_time,
|
| 112 |
+
"session_end_time": datetime.now().isoformat(),
|
| 113 |
+
"model_name": self.config.model_name,
|
| 114 |
+
"messages": [msg.model_dump() for msg in self.context_manager.items],
|
| 115 |
+
"events": self.logged_events,
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
def save_trajectory_local(
|
| 119 |
+
self,
|
| 120 |
+
directory: str = "session_logs",
|
| 121 |
+
upload_status: str = "pending",
|
| 122 |
+
dataset_url: Optional[str] = None,
|
| 123 |
+
) -> Optional[str]:
|
| 124 |
+
"""
|
| 125 |
+
Save trajectory to local JSON file as backup with upload status
|
| 126 |
+
|
| 127 |
+
Args:
|
| 128 |
+
directory: Directory to save logs (default: "session_logs")
|
| 129 |
+
upload_status: Status of upload attempt ("pending", "success", "failed")
|
| 130 |
+
dataset_url: URL of dataset if upload succeeded
|
| 131 |
+
|
| 132 |
+
Returns:
|
| 133 |
+
Path to saved file if successful, None otherwise
|
| 134 |
+
"""
|
| 135 |
+
try:
|
| 136 |
+
log_dir = Path(directory)
|
| 137 |
+
log_dir.mkdir(parents=True, exist_ok=True)
|
| 138 |
+
|
| 139 |
+
trajectory = self.get_trajectory()
|
| 140 |
+
|
| 141 |
+
# Add upload metadata
|
| 142 |
+
trajectory["upload_status"] = upload_status
|
| 143 |
+
trajectory["upload_url"] = dataset_url
|
| 144 |
+
trajectory["last_save_time"] = datetime.now().isoformat()
|
| 145 |
+
|
| 146 |
+
filename = f"session_{self.session_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
|
| 147 |
+
filepath = log_dir / filename
|
| 148 |
+
|
| 149 |
+
with open(filepath, "w") as f:
|
| 150 |
+
json.dump(trajectory, f, indent=2)
|
| 151 |
+
|
| 152 |
+
return str(filepath)
|
| 153 |
+
except Exception as e:
|
| 154 |
+
print(f"Failed to save session locally: {e}")
|
| 155 |
+
return None
|
| 156 |
+
|
| 157 |
+
def update_local_save_status(
|
| 158 |
+
self, filepath: str, upload_status: str, dataset_url: Optional[str] = None
|
| 159 |
+
) -> bool:
|
| 160 |
+
"""Update the upload status of an existing local save file"""
|
| 161 |
+
try:
|
| 162 |
+
with open(filepath, "r") as f:
|
| 163 |
+
data = json.load(f)
|
| 164 |
+
|
| 165 |
+
data["upload_status"] = upload_status
|
| 166 |
+
data["upload_url"] = dataset_url
|
| 167 |
+
data["last_save_time"] = datetime.now().isoformat()
|
| 168 |
+
|
| 169 |
+
with open(filepath, "w") as f:
|
| 170 |
+
json.dump(data, f, indent=2)
|
| 171 |
+
|
| 172 |
+
return True
|
| 173 |
+
except Exception as e:
|
| 174 |
+
print(f"Failed to update local save status: {e}")
|
| 175 |
+
return False
|
| 176 |
+
|
| 177 |
+
def save_and_upload_detached(self, repo_id: str) -> Optional[str]:
|
| 178 |
+
"""
|
| 179 |
+
Save session locally and spawn detached subprocess for upload (fire-and-forget)
|
| 180 |
+
|
| 181 |
+
Args:
|
| 182 |
+
repo_id: HuggingFace dataset repo ID
|
| 183 |
+
|
| 184 |
+
Returns:
|
| 185 |
+
Path to local save file
|
| 186 |
+
"""
|
| 187 |
+
# Save locally first (fast, synchronous)
|
| 188 |
+
local_path = self.save_trajectory_local(upload_status="pending")
|
| 189 |
+
if not local_path:
|
| 190 |
+
return None
|
| 191 |
+
|
| 192 |
+
# Spawn detached subprocess for upload (fire-and-forget)
|
| 193 |
+
try:
|
| 194 |
+
uploader_script = Path(__file__).parent / "session_uploader.py"
|
| 195 |
+
|
| 196 |
+
# Use Popen with detached process
|
| 197 |
+
subprocess.Popen(
|
| 198 |
+
[sys.executable, str(uploader_script), "upload", local_path, repo_id],
|
| 199 |
+
stdin=subprocess.DEVNULL,
|
| 200 |
+
stdout=subprocess.DEVNULL,
|
| 201 |
+
stderr=subprocess.DEVNULL,
|
| 202 |
+
start_new_session=True, # Detach from parent
|
| 203 |
+
)
|
| 204 |
+
except Exception as e:
|
| 205 |
+
print(f"⚠️ Failed to spawn upload subprocess: {e}")
|
| 206 |
+
|
| 207 |
+
return local_path
|
| 208 |
+
|
| 209 |
+
@staticmethod
|
| 210 |
+
def retry_failed_uploads_detached(
|
| 211 |
+
directory: str = "session_logs", repo_id: Optional[str] = None
|
| 212 |
+
) -> None:
|
| 213 |
+
"""
|
| 214 |
+
Spawn detached subprocess to retry failed/pending uploads (fire-and-forget)
|
| 215 |
+
|
| 216 |
+
Args:
|
| 217 |
+
directory: Directory containing session logs
|
| 218 |
+
repo_id: Target dataset repo ID
|
| 219 |
+
"""
|
| 220 |
+
if not repo_id:
|
| 221 |
+
return
|
| 222 |
+
|
| 223 |
+
try:
|
| 224 |
+
uploader_script = Path(__file__).parent / "session_uploader.py"
|
| 225 |
+
|
| 226 |
+
# Spawn detached subprocess for retry
|
| 227 |
+
subprocess.Popen(
|
| 228 |
+
[sys.executable, str(uploader_script), "retry", directory, repo_id],
|
| 229 |
+
stdin=subprocess.DEVNULL,
|
| 230 |
+
stdout=subprocess.DEVNULL,
|
| 231 |
+
stderr=subprocess.DEVNULL,
|
| 232 |
+
start_new_session=True, # Detach from parent
|
| 233 |
+
)
|
| 234 |
+
except Exception as e:
|
| 235 |
+
print(f"⚠️ Failed to spawn retry subprocess: {e}")
|
agent/core/session_uploader.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Standalone script for uploading session trajectories to HuggingFace.
|
| 4 |
+
This runs as a separate process to avoid blocking the main agent.
|
| 5 |
+
Uses individual file uploads to avoid race conditions.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def upload_session_as_file(
|
| 16 |
+
session_file: str, repo_id: str, max_retries: int = 3
|
| 17 |
+
) -> bool:
|
| 18 |
+
"""
|
| 19 |
+
Upload a single session as an individual JSONL file (no race conditions)
|
| 20 |
+
|
| 21 |
+
Args:
|
| 22 |
+
session_file: Path to local session JSON file
|
| 23 |
+
repo_id: HuggingFace dataset repo ID
|
| 24 |
+
max_retries: Number of retry attempts
|
| 25 |
+
|
| 26 |
+
Returns:
|
| 27 |
+
True if successful, False otherwise
|
| 28 |
+
"""
|
| 29 |
+
try:
|
| 30 |
+
from huggingface_hub import HfApi
|
| 31 |
+
except ImportError:
|
| 32 |
+
print("Error: huggingface_hub library not available", file=sys.stderr)
|
| 33 |
+
return False
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
# Load session data
|
| 37 |
+
with open(session_file, "r") as f:
|
| 38 |
+
data = json.load(f)
|
| 39 |
+
|
| 40 |
+
# Check if already uploaded
|
| 41 |
+
upload_status = data.get("upload_status")
|
| 42 |
+
if upload_status == "success":
|
| 43 |
+
return True
|
| 44 |
+
|
| 45 |
+
hf_token = os.getenv("HF_TOKEN")
|
| 46 |
+
if not hf_token:
|
| 47 |
+
# Update status to failed
|
| 48 |
+
data["upload_status"] = "failed"
|
| 49 |
+
with open(session_file, "w") as f:
|
| 50 |
+
json.dump(data, f, indent=2)
|
| 51 |
+
return False
|
| 52 |
+
|
| 53 |
+
# Prepare JSONL content (single line)
|
| 54 |
+
# Store messages and events as JSON strings to avoid schema conflicts
|
| 55 |
+
session_row = {
|
| 56 |
+
"session_id": data["session_id"],
|
| 57 |
+
"session_start_time": data["session_start_time"],
|
| 58 |
+
"session_end_time": data["session_end_time"],
|
| 59 |
+
"model_name": data["model_name"],
|
| 60 |
+
"messages": json.dumps(data["messages"]),
|
| 61 |
+
"events": json.dumps(data["events"]),
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
# Create temporary JSONL file
|
| 65 |
+
import tempfile
|
| 66 |
+
|
| 67 |
+
with tempfile.NamedTemporaryFile(
|
| 68 |
+
mode="w", suffix=".jsonl", delete=False
|
| 69 |
+
) as tmp:
|
| 70 |
+
json.dump(session_row, tmp) # Single line JSON
|
| 71 |
+
tmp_path = tmp.name
|
| 72 |
+
|
| 73 |
+
try:
|
| 74 |
+
# Generate unique path in repo: sessions/YYYY-MM-DD/session_id.jsonl
|
| 75 |
+
session_id = data["session_id"]
|
| 76 |
+
date_str = datetime.fromisoformat(data["session_start_time"]).strftime(
|
| 77 |
+
"%Y-%m-%d"
|
| 78 |
+
)
|
| 79 |
+
repo_path = f"sessions/{date_str}/{session_id}.jsonl"
|
| 80 |
+
|
| 81 |
+
# Upload with retries
|
| 82 |
+
api = HfApi()
|
| 83 |
+
for attempt in range(max_retries):
|
| 84 |
+
try:
|
| 85 |
+
# Try to create repo if it doesn't exist (idempotent)
|
| 86 |
+
try:
|
| 87 |
+
api.create_repo(
|
| 88 |
+
repo_id=repo_id,
|
| 89 |
+
repo_type="dataset",
|
| 90 |
+
private=True,
|
| 91 |
+
token=hf_token,
|
| 92 |
+
exist_ok=True, # Don't fail if already exists
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
except Exception:
|
| 96 |
+
# Repo might already exist, continue
|
| 97 |
+
pass
|
| 98 |
+
|
| 99 |
+
# Upload the session file
|
| 100 |
+
api.upload_file(
|
| 101 |
+
path_or_fileobj=tmp_path,
|
| 102 |
+
path_in_repo=repo_path,
|
| 103 |
+
repo_id=repo_id,
|
| 104 |
+
repo_type="dataset",
|
| 105 |
+
token=hf_token,
|
| 106 |
+
commit_message=f"Add session {session_id}",
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
# Update local status to success
|
| 110 |
+
data["upload_status"] = "success"
|
| 111 |
+
data["upload_url"] = f"https://huggingface.co/datasets/{repo_id}"
|
| 112 |
+
with open(session_file, "w") as f:
|
| 113 |
+
json.dump(data, f, indent=2)
|
| 114 |
+
|
| 115 |
+
return True
|
| 116 |
+
|
| 117 |
+
except Exception:
|
| 118 |
+
if attempt < max_retries - 1:
|
| 119 |
+
import time
|
| 120 |
+
|
| 121 |
+
wait_time = 2**attempt
|
| 122 |
+
time.sleep(wait_time)
|
| 123 |
+
else:
|
| 124 |
+
# Final attempt failed
|
| 125 |
+
data["upload_status"] = "failed"
|
| 126 |
+
with open(session_file, "w") as f:
|
| 127 |
+
json.dump(data, f, indent=2)
|
| 128 |
+
return False
|
| 129 |
+
|
| 130 |
+
finally:
|
| 131 |
+
# Clean up temp file
|
| 132 |
+
try:
|
| 133 |
+
os.unlink(tmp_path)
|
| 134 |
+
except Exception:
|
| 135 |
+
pass
|
| 136 |
+
|
| 137 |
+
except Exception as e:
|
| 138 |
+
print(f"Error uploading session: {e}", file=sys.stderr)
|
| 139 |
+
return False
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def retry_failed_uploads(directory: str, repo_id: str):
|
| 143 |
+
"""Retry all failed/pending uploads in a directory"""
|
| 144 |
+
log_dir = Path(directory)
|
| 145 |
+
if not log_dir.exists():
|
| 146 |
+
return
|
| 147 |
+
|
| 148 |
+
session_files = list(log_dir.glob("session_*.json"))
|
| 149 |
+
|
| 150 |
+
for filepath in session_files:
|
| 151 |
+
try:
|
| 152 |
+
with open(filepath, "r") as f:
|
| 153 |
+
data = json.load(f)
|
| 154 |
+
|
| 155 |
+
upload_status = data.get("upload_status", "unknown")
|
| 156 |
+
|
| 157 |
+
# Only retry pending or failed uploads
|
| 158 |
+
if upload_status in ["pending", "failed"]:
|
| 159 |
+
upload_session_as_file(str(filepath), repo_id)
|
| 160 |
+
|
| 161 |
+
except Exception:
|
| 162 |
+
pass
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
if __name__ == "__main__":
|
| 166 |
+
if len(sys.argv) < 3:
|
| 167 |
+
print("Usage: session_uploader.py <command> <args...>")
|
| 168 |
+
sys.exit(1)
|
| 169 |
+
|
| 170 |
+
command = sys.argv[1]
|
| 171 |
+
|
| 172 |
+
if command == "upload":
|
| 173 |
+
# python session_uploader.py upload <session_file> <repo_id>
|
| 174 |
+
if len(sys.argv) < 4:
|
| 175 |
+
print("Usage: session_uploader.py upload <session_file> <repo_id>")
|
| 176 |
+
sys.exit(1)
|
| 177 |
+
session_file = sys.argv[2]
|
| 178 |
+
repo_id = sys.argv[3]
|
| 179 |
+
success = upload_session_as_file(session_file, repo_id)
|
| 180 |
+
sys.exit(0 if success else 1)
|
| 181 |
+
|
| 182 |
+
elif command == "retry":
|
| 183 |
+
# python session_uploader.py retry <directory> <repo_id>
|
| 184 |
+
if len(sys.argv) < 4:
|
| 185 |
+
print("Usage: session_uploader.py retry <directory> <repo_id>")
|
| 186 |
+
sys.exit(1)
|
| 187 |
+
directory = sys.argv[2]
|
| 188 |
+
repo_id = sys.argv[3]
|
| 189 |
+
retry_failed_uploads(directory, repo_id)
|
| 190 |
+
sys.exit(0)
|
| 191 |
+
|
| 192 |
+
else:
|
| 193 |
+
print(f"Unknown command: {command}")
|
| 194 |
+
sys.exit(1)
|
agent/core/tools.py
CHANGED
|
@@ -19,13 +19,27 @@ from agent.tools.docs_tools import (
|
|
| 19 |
explore_hf_docs_handler,
|
| 20 |
hf_docs_fetch_handler,
|
| 21 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
|
| 23 |
from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
|
| 24 |
from agent.tools.private_hf_repo_tools import (
|
| 25 |
PRIVATE_HF_REPO_TOOL_SPEC,
|
| 26 |
private_hf_repo_handler,
|
| 27 |
)
|
| 28 |
-
|
|
|
|
|
|
|
| 29 |
|
| 30 |
# Suppress aiohttp deprecation warning
|
| 31 |
warnings.filterwarnings(
|
|
@@ -118,11 +132,13 @@ class ToolRouter:
|
|
| 118 |
|
| 119 |
async def register_mcp_tools(self) -> None:
|
| 120 |
tools = await self.mcp_client.list_tools()
|
|
|
|
|
|
|
| 121 |
for tool in tools:
|
| 122 |
if tool.name in NOT_ALLOWED_TOOL_NAMES:
|
| 123 |
-
|
| 124 |
continue
|
| 125 |
-
|
| 126 |
self.register_tool(
|
| 127 |
ToolSpec(
|
| 128 |
name=tool.name,
|
|
@@ -131,6 +147,9 @@ class ToolRouter:
|
|
| 131 |
handler=None,
|
| 132 |
)
|
| 133 |
)
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
async def register_openapi_tool(self) -> None:
|
| 136 |
"""Register the OpenAPI search tool (requires async initialization)"""
|
|
@@ -139,8 +158,6 @@ class ToolRouter:
|
|
| 139 |
search_openapi_handler,
|
| 140 |
)
|
| 141 |
|
| 142 |
-
print("Registering OpenAPI search tool...")
|
| 143 |
-
|
| 144 |
# Register search_hf_api_endpoints with dynamic spec
|
| 145 |
openapi_spec = await _get_api_search_tool_spec()
|
| 146 |
self.register_tool(
|
|
@@ -151,7 +168,7 @@ class ToolRouter:
|
|
| 151 |
handler=search_openapi_handler,
|
| 152 |
)
|
| 153 |
)
|
| 154 |
-
print(f"
|
| 155 |
|
| 156 |
def get_tool_specs_for_llm(self) -> list[dict[str, Any]]:
|
| 157 |
"""Get tool specifications in OpenAI format"""
|
|
@@ -175,11 +192,13 @@ class ToolRouter:
|
|
| 175 |
await self.mcp_client.initialize()
|
| 176 |
await self.register_mcp_tools()
|
| 177 |
self._mcp_initialized = True
|
| 178 |
-
print(f"MCP initialized: {self._mcp_initialized}")
|
| 179 |
|
| 180 |
# Register OpenAPI tool (requires async initialization)
|
| 181 |
await self.register_openapi_tool()
|
| 182 |
|
|
|
|
|
|
|
|
|
|
| 183 |
return self
|
| 184 |
|
| 185 |
async def __aexit__(self, exc_type, exc, tb) -> None:
|
|
@@ -223,11 +242,8 @@ class ToolRouter:
|
|
| 223 |
|
| 224 |
def create_builtin_tools() -> list[ToolSpec]:
|
| 225 |
"""Create built-in tool specifications"""
|
| 226 |
-
print(
|
| 227 |
-
f"Creating built-in tools: {EXPLORE_HF_DOCS_TOOL_SPEC['name']}, {HF_DOCS_FETCH_TOOL_SPEC['name']}, {PLAN_TOOL_SPEC['name']}, {HF_JOBS_TOOL_SPEC['name']}, {PRIVATE_HF_REPO_TOOL_SPEC['name']}, {UTILS_TOOL_SPEC['name']}"
|
| 228 |
-
)
|
| 229 |
# in order of importance
|
| 230 |
-
|
| 231 |
# Documentation search tools
|
| 232 |
ToolSpec(
|
| 233 |
name=EXPLORE_HF_DOCS_TOOL_SPEC["name"],
|
|
@@ -260,10 +276,42 @@ def create_builtin_tools() -> list[ToolSpec]:
|
|
| 260 |
parameters=PRIVATE_HF_REPO_TOOL_SPEC["parameters"],
|
| 261 |
handler=private_hf_repo_handler,
|
| 262 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
ToolSpec(
|
| 264 |
-
name=
|
| 265 |
-
description=
|
| 266 |
-
parameters=
|
| 267 |
-
handler=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
),
|
| 269 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
explore_hf_docs_handler,
|
| 20 |
hf_docs_fetch_handler,
|
| 21 |
)
|
| 22 |
+
from agent.tools.github_find_examples import (
|
| 23 |
+
GITHUB_FIND_EXAMPLES_TOOL_SPEC,
|
| 24 |
+
github_find_examples_handler,
|
| 25 |
+
)
|
| 26 |
+
from agent.tools.github_list_repos import (
|
| 27 |
+
GITHUB_LIST_REPOS_TOOL_SPEC,
|
| 28 |
+
github_list_repos_handler,
|
| 29 |
+
)
|
| 30 |
+
from agent.tools.github_read_file import (
|
| 31 |
+
GITHUB_READ_FILE_TOOL_SPEC,
|
| 32 |
+
github_read_file_handler,
|
| 33 |
+
)
|
| 34 |
from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
|
| 35 |
from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
|
| 36 |
from agent.tools.private_hf_repo_tools import (
|
| 37 |
PRIVATE_HF_REPO_TOOL_SPEC,
|
| 38 |
private_hf_repo_handler,
|
| 39 |
)
|
| 40 |
+
|
| 41 |
+
# NOTE: Utils tool disabled - date/time now loaded into system prompt at initialization
|
| 42 |
+
# from agent.tools.utils_tools import UTILS_TOOL_SPEC, utils_handler
|
| 43 |
|
| 44 |
# Suppress aiohttp deprecation warning
|
| 45 |
warnings.filterwarnings(
|
|
|
|
| 132 |
|
| 133 |
async def register_mcp_tools(self) -> None:
|
| 134 |
tools = await self.mcp_client.list_tools()
|
| 135 |
+
registered_names = []
|
| 136 |
+
skipped_count = 0
|
| 137 |
for tool in tools:
|
| 138 |
if tool.name in NOT_ALLOWED_TOOL_NAMES:
|
| 139 |
+
skipped_count += 1
|
| 140 |
continue
|
| 141 |
+
registered_names.append(tool.name)
|
| 142 |
self.register_tool(
|
| 143 |
ToolSpec(
|
| 144 |
name=tool.name,
|
|
|
|
| 147 |
handler=None,
|
| 148 |
)
|
| 149 |
)
|
| 150 |
+
print(
|
| 151 |
+
f"Loaded {len(registered_names)} MCP tools: {', '.join(registered_names)} ({skipped_count} disabled)"
|
| 152 |
+
)
|
| 153 |
|
| 154 |
async def register_openapi_tool(self) -> None:
|
| 155 |
"""Register the OpenAPI search tool (requires async initialization)"""
|
|
|
|
| 158 |
search_openapi_handler,
|
| 159 |
)
|
| 160 |
|
|
|
|
|
|
|
| 161 |
# Register search_hf_api_endpoints with dynamic spec
|
| 162 |
openapi_spec = await _get_api_search_tool_spec()
|
| 163 |
self.register_tool(
|
|
|
|
| 168 |
handler=search_openapi_handler,
|
| 169 |
)
|
| 170 |
)
|
| 171 |
+
print(f"Loaded OpenAPI search tool: {openapi_spec['name']}")
|
| 172 |
|
| 173 |
def get_tool_specs_for_llm(self) -> list[dict[str, Any]]:
|
| 174 |
"""Get tool specifications in OpenAI format"""
|
|
|
|
| 192 |
await self.mcp_client.initialize()
|
| 193 |
await self.register_mcp_tools()
|
| 194 |
self._mcp_initialized = True
|
|
|
|
| 195 |
|
| 196 |
# Register OpenAPI tool (requires async initialization)
|
| 197 |
await self.register_openapi_tool()
|
| 198 |
|
| 199 |
+
total_tools = len(self.tools)
|
| 200 |
+
print(f"\nAgent ready with {total_tools} tools total\n")
|
| 201 |
+
|
| 202 |
return self
|
| 203 |
|
| 204 |
async def __aexit__(self, exc_type, exc, tb) -> None:
|
|
|
|
| 242 |
|
| 243 |
def create_builtin_tools() -> list[ToolSpec]:
|
| 244 |
"""Create built-in tool specifications"""
|
|
|
|
|
|
|
|
|
|
| 245 |
# in order of importance
|
| 246 |
+
tools = [
|
| 247 |
# Documentation search tools
|
| 248 |
ToolSpec(
|
| 249 |
name=EXPLORE_HF_DOCS_TOOL_SPEC["name"],
|
|
|
|
| 276 |
parameters=PRIVATE_HF_REPO_TOOL_SPEC["parameters"],
|
| 277 |
handler=private_hf_repo_handler,
|
| 278 |
),
|
| 279 |
+
# NOTE: Utils tool disabled - date/time now loaded into system prompt at initialization (fewer tool calls = more reliability)
|
| 280 |
+
# ToolSpec(
|
| 281 |
+
# name=UTILS_TOOL_SPEC["name"],
|
| 282 |
+
# description=UTILS_TOOL_SPEC["description"],
|
| 283 |
+
# parameters=UTILS_TOOL_SPEC["parameters"],
|
| 284 |
+
# handler=utils_handler,
|
| 285 |
+
# ),
|
| 286 |
+
# GitHub tools
|
| 287 |
+
# NOTE: Github search code tool disabled - a bit buggy
|
| 288 |
+
# ToolSpec(
|
| 289 |
+
# name=GITHUB_SEARCH_CODE_TOOL_SPEC["name"],
|
| 290 |
+
# description=GITHUB_SEARCH_CODE_TOOL_SPEC["description"],
|
| 291 |
+
# parameters=GITHUB_SEARCH_CODE_TOOL_SPEC["parameters"],
|
| 292 |
+
# handler=github_search_code_handler,
|
| 293 |
+
# ),
|
| 294 |
ToolSpec(
|
| 295 |
+
name=GITHUB_FIND_EXAMPLES_TOOL_SPEC["name"],
|
| 296 |
+
description=GITHUB_FIND_EXAMPLES_TOOL_SPEC["description"],
|
| 297 |
+
parameters=GITHUB_FIND_EXAMPLES_TOOL_SPEC["parameters"],
|
| 298 |
+
handler=github_find_examples_handler,
|
| 299 |
+
),
|
| 300 |
+
ToolSpec(
|
| 301 |
+
name=GITHUB_LIST_REPOS_TOOL_SPEC["name"],
|
| 302 |
+
description=GITHUB_LIST_REPOS_TOOL_SPEC["description"],
|
| 303 |
+
parameters=GITHUB_LIST_REPOS_TOOL_SPEC["parameters"],
|
| 304 |
+
handler=github_list_repos_handler,
|
| 305 |
+
),
|
| 306 |
+
ToolSpec(
|
| 307 |
+
name=GITHUB_READ_FILE_TOOL_SPEC["name"],
|
| 308 |
+
description=GITHUB_READ_FILE_TOOL_SPEC["description"],
|
| 309 |
+
parameters=GITHUB_READ_FILE_TOOL_SPEC["parameters"],
|
| 310 |
+
handler=github_read_file_handler,
|
| 311 |
),
|
| 312 |
]
|
| 313 |
+
|
| 314 |
+
tool_names = ", ".join([t.name for t in tools])
|
| 315 |
+
print(f"Loaded {len(tools)} built-in tools: {tool_names}")
|
| 316 |
+
|
| 317 |
+
return tools
|
agent/main.py
CHANGED
|
@@ -120,7 +120,6 @@ async def event_listener(
|
|
| 120 |
print(format_error(error))
|
| 121 |
turn_complete_event.set()
|
| 122 |
elif event.event_type == "shutdown":
|
| 123 |
-
print("Agent shutdown")
|
| 124 |
break
|
| 125 |
elif event.event_type == "processing":
|
| 126 |
print("Processing...", flush=True)
|
|
@@ -228,11 +227,15 @@ async def event_listener(
|
|
| 228 |
|
| 229 |
# Build repo URL
|
| 230 |
type_path = "" if repo_type == "model" else f"{repo_type}s"
|
| 231 |
-
repo_url =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
|
| 233 |
print(f"Repository: {repo_id}")
|
| 234 |
print(f"Type: {repo_type}")
|
| 235 |
-
print(
|
| 236 |
print(f"URL: {repo_url}")
|
| 237 |
|
| 238 |
# Show file preview for upload_file operation
|
|
@@ -243,9 +246,9 @@ async def event_listener(
|
|
| 243 |
|
| 244 |
if isinstance(file_content, str):
|
| 245 |
# Calculate metrics
|
| 246 |
-
all_lines = file_content.split(
|
| 247 |
line_count = len(all_lines)
|
| 248 |
-
size_bytes = len(file_content.encode(
|
| 249 |
size_kb = size_bytes / 1024
|
| 250 |
size_mb = size_kb / 1024
|
| 251 |
|
|
@@ -257,8 +260,10 @@ async def event_listener(
|
|
| 257 |
|
| 258 |
# Show preview
|
| 259 |
preview_lines = all_lines[:5]
|
| 260 |
-
preview =
|
| 261 |
-
print(
|
|
|
|
|
|
|
| 262 |
if len(all_lines) > 5:
|
| 263 |
print("...")
|
| 264 |
|
|
@@ -327,6 +332,8 @@ async def main():
|
|
| 327 |
print(f"{Colors.YELLOW} {banner}{Colors.RESET}")
|
| 328 |
print("Type your messages below. Type 'exit', 'quit', or '/quit' to end.\n")
|
| 329 |
print(format_separator())
|
|
|
|
|
|
|
| 330 |
|
| 331 |
# Create queues for communication
|
| 332 |
submission_queue = asyncio.Queue()
|
|
@@ -342,7 +349,7 @@ async def main():
|
|
| 342 |
config = load_config(config_path)
|
| 343 |
|
| 344 |
# Create tool router
|
| 345 |
-
print(f"
|
| 346 |
tool_router = ToolRouter(config.mcpServers)
|
| 347 |
|
| 348 |
# Create prompt session for input
|
|
@@ -368,8 +375,6 @@ async def main():
|
|
| 368 |
)
|
| 369 |
)
|
| 370 |
|
| 371 |
-
# Wait for agent to initialize
|
| 372 |
-
print("Initializing agent...")
|
| 373 |
await ready_event.wait()
|
| 374 |
|
| 375 |
submission_id = 0
|
|
@@ -416,8 +421,7 @@ async def main():
|
|
| 416 |
)
|
| 417 |
await submission_queue.put(shutdown_submission)
|
| 418 |
|
| 419 |
-
|
| 420 |
-
await asyncio.wait_for(agent_task, timeout=2.0)
|
| 421 |
listener_task.cancel()
|
| 422 |
|
| 423 |
print("✨ Goodbye!\n")
|
|
|
|
| 120 |
print(format_error(error))
|
| 121 |
turn_complete_event.set()
|
| 122 |
elif event.event_type == "shutdown":
|
|
|
|
| 123 |
break
|
| 124 |
elif event.event_type == "processing":
|
| 125 |
print("Processing...", flush=True)
|
|
|
|
| 227 |
|
| 228 |
# Build repo URL
|
| 229 |
type_path = "" if repo_type == "model" else f"{repo_type}s"
|
| 230 |
+
repo_url = (
|
| 231 |
+
f"https://huggingface.co/{type_path}/{repo_id}".replace(
|
| 232 |
+
"//", "/"
|
| 233 |
+
)
|
| 234 |
+
)
|
| 235 |
|
| 236 |
print(f"Repository: {repo_id}")
|
| 237 |
print(f"Type: {repo_type}")
|
| 238 |
+
print("Private: Yes")
|
| 239 |
print(f"URL: {repo_url}")
|
| 240 |
|
| 241 |
# Show file preview for upload_file operation
|
|
|
|
| 246 |
|
| 247 |
if isinstance(file_content, str):
|
| 248 |
# Calculate metrics
|
| 249 |
+
all_lines = file_content.split("\n")
|
| 250 |
line_count = len(all_lines)
|
| 251 |
+
size_bytes = len(file_content.encode("utf-8"))
|
| 252 |
size_kb = size_bytes / 1024
|
| 253 |
size_mb = size_kb / 1024
|
| 254 |
|
|
|
|
| 260 |
|
| 261 |
# Show preview
|
| 262 |
preview_lines = all_lines[:5]
|
| 263 |
+
preview = "\n".join(preview_lines)
|
| 264 |
+
print(
|
| 265 |
+
f"Content preview (first 5 lines):\n{preview}"
|
| 266 |
+
)
|
| 267 |
if len(all_lines) > 5:
|
| 268 |
print("...")
|
| 269 |
|
|
|
|
| 332 |
print(f"{Colors.YELLOW} {banner}{Colors.RESET}")
|
| 333 |
print("Type your messages below. Type 'exit', 'quit', or '/quit' to end.\n")
|
| 334 |
print(format_separator())
|
| 335 |
+
# Wait for agent to initialize
|
| 336 |
+
print("Initializing agent...")
|
| 337 |
|
| 338 |
# Create queues for communication
|
| 339 |
submission_queue = asyncio.Queue()
|
|
|
|
| 349 |
config = load_config(config_path)
|
| 350 |
|
| 351 |
# Create tool router
|
| 352 |
+
print(f"Loading MCP servers: {', '.join(config.mcpServers.keys())}")
|
| 353 |
tool_router = ToolRouter(config.mcpServers)
|
| 354 |
|
| 355 |
# Create prompt session for input
|
|
|
|
| 375 |
)
|
| 376 |
)
|
| 377 |
|
|
|
|
|
|
|
| 378 |
await ready_event.wait()
|
| 379 |
|
| 380 |
submission_id = 0
|
|
|
|
| 421 |
)
|
| 422 |
await submission_queue.put(shutdown_submission)
|
| 423 |
|
| 424 |
+
await asyncio.wait_for(agent_task, timeout=5.0)
|
|
|
|
| 425 |
listener_task.cancel()
|
| 426 |
|
| 427 |
print("✨ Goodbye!\n")
|
agent/prompts/system_prompt.yaml
CHANGED
|
@@ -26,7 +26,7 @@ system_prompt: |
|
|
| 26 |
- Invoke multiple independent tools simultaneously for efficiency
|
| 27 |
|
| 28 |
# Available Tools
|
| 29 |
-
|
| 30 |
You have access to the following main categories of tools. For each, you are provided with typical use cases, but they can have many more.
|
| 31 |
|
| 32 |
- Hugging Face Hub
|
|
@@ -168,4 +168,3 @@ system_prompt: |
|
|
| 168 |
3. Sort by trending or downloads.
|
| 169 |
4. Report top results with short descriptions and links.
|
| 170 |
</example>
|
| 171 |
-
|
|
|
|
| 26 |
- Invoke multiple independent tools simultaneously for efficiency
|
| 27 |
|
| 28 |
# Available Tools
|
| 29 |
+
|
| 30 |
You have access to the following main categories of tools. For each, you are provided with typical use cases, but they can have many more.
|
| 31 |
|
| 32 |
- Hugging Face Hub
|
|
|
|
| 168 |
3. Sort by trending or downloads.
|
| 169 |
4. Report top results with short descriptions and links.
|
| 170 |
</example>
|
|
|
agent/tools/__init__.py
CHANGED
|
@@ -2,6 +2,22 @@
|
|
| 2 |
Hugging Face tools for the agent
|
| 3 |
"""
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
|
| 6 |
from agent.tools.types import ToolResult
|
| 7 |
|
|
@@ -10,4 +26,12 @@ __all__ = [
|
|
| 10 |
"HF_JOBS_TOOL_SPEC",
|
| 11 |
"hf_jobs_handler",
|
| 12 |
"HfJobsTool",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
]
|
|
|
|
| 2 |
Hugging Face tools for the agent
|
| 3 |
"""
|
| 4 |
|
| 5 |
+
from agent.tools.github_find_examples import (
|
| 6 |
+
GITHUB_FIND_EXAMPLES_TOOL_SPEC,
|
| 7 |
+
github_find_examples_handler,
|
| 8 |
+
)
|
| 9 |
+
from agent.tools.github_list_repos import (
|
| 10 |
+
GITHUB_LIST_REPOS_TOOL_SPEC,
|
| 11 |
+
github_list_repos_handler,
|
| 12 |
+
)
|
| 13 |
+
from agent.tools.github_read_file import (
|
| 14 |
+
GITHUB_READ_FILE_TOOL_SPEC,
|
| 15 |
+
github_read_file_handler,
|
| 16 |
+
)
|
| 17 |
+
from agent.tools.github_search_code import (
|
| 18 |
+
GITHUB_SEARCH_CODE_TOOL_SPEC,
|
| 19 |
+
github_search_code_handler,
|
| 20 |
+
)
|
| 21 |
from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
|
| 22 |
from agent.tools.types import ToolResult
|
| 23 |
|
|
|
|
| 26 |
"HF_JOBS_TOOL_SPEC",
|
| 27 |
"hf_jobs_handler",
|
| 28 |
"HfJobsTool",
|
| 29 |
+
"GITHUB_FIND_EXAMPLES_TOOL_SPEC",
|
| 30 |
+
"github_find_examples_handler",
|
| 31 |
+
"GITHUB_LIST_REPOS_TOOL_SPEC",
|
| 32 |
+
"github_list_repos_handler",
|
| 33 |
+
"GITHUB_READ_FILE_TOOL_SPEC",
|
| 34 |
+
"github_read_file_handler",
|
| 35 |
+
"GITHUB_SEARCH_CODE_TOOL_SPEC",
|
| 36 |
+
"github_search_code_handler",
|
| 37 |
]
|
agent/tools/docs_tools.py
CHANGED
|
@@ -5,7 +5,6 @@ Tools for exploring and fetching HuggingFace documentation and API specification
|
|
| 5 |
|
| 6 |
import asyncio
|
| 7 |
import os
|
| 8 |
-
import time
|
| 9 |
from typing import Any
|
| 10 |
|
| 11 |
import httpx
|
|
@@ -21,21 +20,15 @@ async def _fetch_html_page(hf_token: str, endpoint: str) -> str:
|
|
| 21 |
url = f"{base_url}/{endpoint}"
|
| 22 |
headers = {"Authorization": f"Bearer {hf_token}"}
|
| 23 |
|
| 24 |
-
fetch_start = time.perf_counter()
|
| 25 |
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
|
| 26 |
response = await client.get(url, headers=headers)
|
| 27 |
response.raise_for_status()
|
| 28 |
|
| 29 |
-
fetch_time = time.perf_counter() - fetch_start
|
| 30 |
-
print(f"[DEBUG] _fetch_html_page: Fetched in {fetch_time:.2f}s")
|
| 31 |
-
|
| 32 |
return response.text
|
| 33 |
|
| 34 |
|
| 35 |
def _parse_sidebar_navigation(html_content: str) -> list[dict[str, str]]:
|
| 36 |
"""Parse the sidebar navigation and extract all links"""
|
| 37 |
-
parse_start = time.perf_counter()
|
| 38 |
-
|
| 39 |
soup = BeautifulSoup(html_content, "html.parser")
|
| 40 |
sidebar = soup.find("nav", class_=lambda x: x and "flex-auto" in x)
|
| 41 |
|
|
@@ -53,11 +46,6 @@ def _parse_sidebar_navigation(html_content: str) -> list[dict[str, str]]:
|
|
| 53 |
page_url = f"https://huggingface.co{href}" if href.startswith("/") else href
|
| 54 |
nav_data.append({"title": title, "url": page_url})
|
| 55 |
|
| 56 |
-
parse_time = time.perf_counter() - parse_start
|
| 57 |
-
print(
|
| 58 |
-
f"[DEBUG] _parse_sidebar_navigation: Parsed in {parse_time:.2f}s, found {len(nav_data)} links"
|
| 59 |
-
)
|
| 60 |
-
|
| 61 |
return nav_data
|
| 62 |
|
| 63 |
|
|
@@ -96,18 +84,11 @@ async def _fetch_all_glimpses(
|
|
| 96 |
hf_token: str, nav_data: list[dict[str, str]]
|
| 97 |
) -> list[dict[str, str]]:
|
| 98 |
"""Fetch glimpses for all pages in parallel"""
|
| 99 |
-
glimpse_start = time.perf_counter()
|
| 100 |
-
|
| 101 |
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
|
| 102 |
result_items = await asyncio.gather(
|
| 103 |
*[_fetch_single_glimpse(client, hf_token, item) for item in nav_data]
|
| 104 |
)
|
| 105 |
|
| 106 |
-
glimpse_time = time.perf_counter() - glimpse_start
|
| 107 |
-
print(
|
| 108 |
-
f"[DEBUG] _fetch_all_glimpses: Fetched {len(result_items)} glimpses in {glimpse_time:.2f}s"
|
| 109 |
-
)
|
| 110 |
-
|
| 111 |
return list(result_items)
|
| 112 |
|
| 113 |
|
|
@@ -130,9 +111,6 @@ def _format_exploration_results(
|
|
| 130 |
|
| 131 |
async def explore_hf_docs(hf_token: str, endpoint: str) -> str:
|
| 132 |
"""Main function to explore documentation structure"""
|
| 133 |
-
start_time = time.perf_counter()
|
| 134 |
-
print(f"[DEBUG] explore_hf_docs: Starting for endpoint '{endpoint}'")
|
| 135 |
-
|
| 136 |
# Fetch HTML page
|
| 137 |
html_content = await _fetch_html_page(hf_token, endpoint)
|
| 138 |
|
|
@@ -148,9 +126,6 @@ async def explore_hf_docs(hf_token: str, endpoint: str) -> str:
|
|
| 148 |
# Format results
|
| 149 |
result = _format_exploration_results(endpoint, result_items)
|
| 150 |
|
| 151 |
-
total_time = time.perf_counter() - start_time
|
| 152 |
-
print(f"[DEBUG] explore_hf_docs: Total time {total_time:.2f}s")
|
| 153 |
-
|
| 154 |
return result
|
| 155 |
|
| 156 |
|
|
@@ -199,12 +174,8 @@ async def _fetch_openapi_spec() -> dict[str, Any]:
|
|
| 199 |
global _openapi_spec_cache
|
| 200 |
|
| 201 |
if _openapi_spec_cache is not None:
|
| 202 |
-
print("[DEBUG] _fetch_openapi_spec: Using cached spec")
|
| 203 |
return _openapi_spec_cache
|
| 204 |
|
| 205 |
-
start_time = time.perf_counter()
|
| 206 |
-
print("[DEBUG] _fetch_openapi_spec: Fetching from API")
|
| 207 |
-
|
| 208 |
url = "https://huggingface.co/.well-known/openapi.json"
|
| 209 |
|
| 210 |
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
|
|
@@ -214,9 +185,6 @@ async def _fetch_openapi_spec() -> dict[str, Any]:
|
|
| 214 |
spec = response.json()
|
| 215 |
_openapi_spec_cache = spec
|
| 216 |
|
| 217 |
-
fetch_time = time.perf_counter() - start_time
|
| 218 |
-
print(f"[DEBUG] _fetch_openapi_spec: Fetched and cached in {fetch_time:.2f}s")
|
| 219 |
-
|
| 220 |
return spec
|
| 221 |
|
| 222 |
|
|
@@ -457,9 +425,7 @@ async def search_openapi_handler(arguments: dict[str, Any]) -> tuple[str, bool]:
|
|
| 457 |
Returns:
|
| 458 |
Tuple of (search_results, success)
|
| 459 |
"""
|
| 460 |
-
start_time = time.perf_counter()
|
| 461 |
tag = arguments.get("tag", "")
|
| 462 |
-
print(f"[DEBUG] search_openapi: Starting for tag '{tag}'")
|
| 463 |
|
| 464 |
if not tag:
|
| 465 |
return "Error: No tag provided", False
|
|
@@ -474,9 +440,6 @@ async def search_openapi_handler(arguments: dict[str, Any]) -> tuple[str, bool]:
|
|
| 474 |
# Format results
|
| 475 |
formatted = _format_openapi_results(results, tag)
|
| 476 |
|
| 477 |
-
total_time = time.perf_counter() - start_time
|
| 478 |
-
print(f"[DEBUG] search_openapi: Total time {total_time:.2f}s")
|
| 479 |
-
|
| 480 |
return formatted, True
|
| 481 |
|
| 482 |
except httpx.HTTPStatusError as e:
|
|
@@ -497,9 +460,7 @@ async def hf_docs_fetch_handler(arguments: dict[str, Any]) -> tuple[str, bool]:
|
|
| 497 |
Returns:
|
| 498 |
Tuple of (full_markdown_content, success)
|
| 499 |
"""
|
| 500 |
-
start_time = time.perf_counter()
|
| 501 |
url = arguments.get("url", "")
|
| 502 |
-
print(f"[DEBUG] fetch_hf_docs: Starting for URL '{url}'")
|
| 503 |
|
| 504 |
if not url:
|
| 505 |
return "Error: No URL provided", False
|
|
@@ -521,25 +482,15 @@ async def hf_docs_fetch_handler(arguments: dict[str, Any]) -> tuple[str, bool]:
|
|
| 521 |
# Make request with auth
|
| 522 |
headers = {"Authorization": f"Bearer {hf_token}"}
|
| 523 |
|
| 524 |
-
fetch_start = time.perf_counter()
|
| 525 |
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
|
| 526 |
response = await client.get(url, headers=headers)
|
| 527 |
response.raise_for_status()
|
| 528 |
|
| 529 |
-
fetch_time = time.perf_counter() - fetch_start
|
| 530 |
content = response.text
|
| 531 |
-
content_size_kb = len(content) / 1024
|
| 532 |
-
|
| 533 |
-
print(
|
| 534 |
-
f"[DEBUG] fetch_hf_docs: Fetched {content_size_kb:.1f}KB in {fetch_time:.2f}s"
|
| 535 |
-
)
|
| 536 |
|
| 537 |
# Return the markdown content directly
|
| 538 |
result = f"Documentation from: {url}\n\n{content}"
|
| 539 |
|
| 540 |
-
total_time = time.perf_counter() - start_time
|
| 541 |
-
print(f"[DEBUG] fetch_hf_docs: Total time {total_time:.2f}s")
|
| 542 |
-
|
| 543 |
return result, True
|
| 544 |
|
| 545 |
except httpx.HTTPStatusError as e:
|
|
|
|
| 5 |
|
| 6 |
import asyncio
|
| 7 |
import os
|
|
|
|
| 8 |
from typing import Any
|
| 9 |
|
| 10 |
import httpx
|
|
|
|
| 20 |
url = f"{base_url}/{endpoint}"
|
| 21 |
headers = {"Authorization": f"Bearer {hf_token}"}
|
| 22 |
|
|
|
|
| 23 |
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
|
| 24 |
response = await client.get(url, headers=headers)
|
| 25 |
response.raise_for_status()
|
| 26 |
|
|
|
|
|
|
|
|
|
|
| 27 |
return response.text
|
| 28 |
|
| 29 |
|
| 30 |
def _parse_sidebar_navigation(html_content: str) -> list[dict[str, str]]:
|
| 31 |
"""Parse the sidebar navigation and extract all links"""
|
|
|
|
|
|
|
| 32 |
soup = BeautifulSoup(html_content, "html.parser")
|
| 33 |
sidebar = soup.find("nav", class_=lambda x: x and "flex-auto" in x)
|
| 34 |
|
|
|
|
| 46 |
page_url = f"https://huggingface.co{href}" if href.startswith("/") else href
|
| 47 |
nav_data.append({"title": title, "url": page_url})
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
return nav_data
|
| 50 |
|
| 51 |
|
|
|
|
| 84 |
hf_token: str, nav_data: list[dict[str, str]]
|
| 85 |
) -> list[dict[str, str]]:
|
| 86 |
"""Fetch glimpses for all pages in parallel"""
|
|
|
|
|
|
|
| 87 |
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
|
| 88 |
result_items = await asyncio.gather(
|
| 89 |
*[_fetch_single_glimpse(client, hf_token, item) for item in nav_data]
|
| 90 |
)
|
| 91 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
return list(result_items)
|
| 93 |
|
| 94 |
|
|
|
|
| 111 |
|
| 112 |
async def explore_hf_docs(hf_token: str, endpoint: str) -> str:
|
| 113 |
"""Main function to explore documentation structure"""
|
|
|
|
|
|
|
|
|
|
| 114 |
# Fetch HTML page
|
| 115 |
html_content = await _fetch_html_page(hf_token, endpoint)
|
| 116 |
|
|
|
|
| 126 |
# Format results
|
| 127 |
result = _format_exploration_results(endpoint, result_items)
|
| 128 |
|
|
|
|
|
|
|
|
|
|
| 129 |
return result
|
| 130 |
|
| 131 |
|
|
|
|
| 174 |
global _openapi_spec_cache
|
| 175 |
|
| 176 |
if _openapi_spec_cache is not None:
|
|
|
|
| 177 |
return _openapi_spec_cache
|
| 178 |
|
|
|
|
|
|
|
|
|
|
| 179 |
url = "https://huggingface.co/.well-known/openapi.json"
|
| 180 |
|
| 181 |
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
|
|
|
|
| 185 |
spec = response.json()
|
| 186 |
_openapi_spec_cache = spec
|
| 187 |
|
|
|
|
|
|
|
|
|
|
| 188 |
return spec
|
| 189 |
|
| 190 |
|
|
|
|
| 425 |
Returns:
|
| 426 |
Tuple of (search_results, success)
|
| 427 |
"""
|
|
|
|
| 428 |
tag = arguments.get("tag", "")
|
|
|
|
| 429 |
|
| 430 |
if not tag:
|
| 431 |
return "Error: No tag provided", False
|
|
|
|
| 440 |
# Format results
|
| 441 |
formatted = _format_openapi_results(results, tag)
|
| 442 |
|
|
|
|
|
|
|
|
|
|
| 443 |
return formatted, True
|
| 444 |
|
| 445 |
except httpx.HTTPStatusError as e:
|
|
|
|
| 460 |
Returns:
|
| 461 |
Tuple of (full_markdown_content, success)
|
| 462 |
"""
|
|
|
|
| 463 |
url = arguments.get("url", "")
|
|
|
|
| 464 |
|
| 465 |
if not url:
|
| 466 |
return "Error: No URL provided", False
|
|
|
|
| 482 |
# Make request with auth
|
| 483 |
headers = {"Authorization": f"Bearer {hf_token}"}
|
| 484 |
|
|
|
|
| 485 |
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
|
| 486 |
response = await client.get(url, headers=headers)
|
| 487 |
response.raise_for_status()
|
| 488 |
|
|
|
|
| 489 |
content = response.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
|
| 491 |
# Return the markdown content directly
|
| 492 |
result = f"Documentation from: {url}\n\n{content}"
|
| 493 |
|
|
|
|
|
|
|
|
|
|
| 494 |
return result, True
|
| 495 |
|
| 496 |
except httpx.HTTPStatusError as e:
|
agent/tools/github_find_examples.py
ADDED
|
@@ -0,0 +1,489 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GitHub Find Examples Tool - Discover examples, tutorials, and guides for any library
|
| 3 |
+
|
| 4 |
+
Lists all files in a repository and performs deterministic keyword search.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
from typing import Any, Dict, List
|
| 9 |
+
|
| 10 |
+
import requests
|
| 11 |
+
from thefuzz import fuzz
|
| 12 |
+
|
| 13 |
+
from agent.tools.types import ToolResult
|
| 14 |
+
|
| 15 |
+
# In order of priority (lower index = higher priority for sorting)
|
| 16 |
+
# Directory-name keywords that signal example/tutorial content.
# Ordered by priority: a lower index ranks higher when results are sorted.
EXAMPLE_PATTERNS = [
    "scripts",
    # Generic example folders (broad catch-all, lower priority than scripts)
    "examples",
    "example",
    # Jupyter notebook folders
    "notebooks",
    "notebook",
    # Tutorials and learning material
    "tutorials",
    "tutorial",
    "quickstart",
    "walkthroughs",
    "walkthrough",
    # Cookbooks and recipes
    "cookbook",
    "cookbooks",
    "recipes",
    "recipe",
    # Demos and samples
    "demos",
    "demo",
    "samples",
    "sample",
    # Miscellaneous guide/how-to folders
    "guides",
    "guide",
    "getting-started",
    "getting_started",
    "playground",
    "howto",
    "how-to",
    "use-cases",
    "usecases",
    "use_cases",
    "sandbox",
    "showcase",
]
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _get_repo_tree(org: str, repo: str, token: str) -> tuple[List[Dict[str, Any]], str]:
    """Fetch every blob in a repository's default branch.

    Returns a ``(files, error_message)`` pair: ``error_message`` is "" on
    success, "not_found" when the repository does not exist, or a
    human-readable error string for any other failure.
    """
    auth_headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }
    full_repo = f"{org}/{repo}"

    # First resolve the default branch so we know which tree to walk.
    try:
        repo_resp = requests.get(
            f"https://api.github.com/repos/{full_repo}", headers=auth_headers, timeout=10
        )
        if repo_resp.status_code == 404:
            return [], "not_found"
        if repo_resp.status_code != 200:
            return [], f"API error: {repo_resp.status_code}"
        default_branch = repo_resp.json().get("default_branch", "main")
    except Exception as e:
        return [], f"Error fetching repo: {str(e)}"

    # Pull the whole tree in one recursive request.
    try:
        tree_resp = requests.get(
            f"https://api.github.com/repos/{full_repo}/git/trees/{default_branch}",
            headers=auth_headers,
            params={"recursive": "1"},
            timeout=30,
        )
        if tree_resp.status_code != 200:
            return [], f"Error fetching tree: {tree_resp.status_code}"

        entries = tree_resp.json().get("tree", [])

        # Keep blobs only; "tree" entries (directories) are skipped.
        files = []
        for entry in entries:
            if entry["type"] != "blob":
                continue
            files.append(
                {
                    "path": entry["path"],
                    "ref": entry["sha"],
                    "size": entry.get("size", 0),
                    "url": f"https://github.com/{full_repo}/blob/{default_branch}/{entry['path']}",
                }
            )
        return files, ""
    except Exception as e:
        return [], f"Error processing tree: {str(e)}"
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _search_similar_repos(org: str, repo: str, token: str) -> List[Dict[str, Any]]:
    """Return up to 10 repos in *org* whose names resemble *repo*, most-starred first.

    Best-effort helper for "did you mean" suggestions: any API failure
    simply yields an empty list.
    """
    request_headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    try:
        resp = requests.get(
            "https://api.github.com/search/repositories",
            headers=request_headers,
            params={"q": f"org:{org} {repo}", "sort": "stars", "order": "desc", "per_page": 10},
            timeout=30,
        )
        if resp.status_code != 200:
            return []

        suggestions = []
        for hit in resp.json().get("items", []):
            suggestions.append(
                {
                    "name": hit.get("name"),
                    "full_name": hit.get("full_name"),
                    "description": hit.get("description"),
                    "stars": hit.get("stargazers_count", 0),
                    "url": hit.get("html_url"),
                }
            )
        return suggestions
    except Exception:
        # Suggestions are optional; swallow failures deliberately.
        return []
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def _score_against_example_patterns(file_path: str) -> int:
    """Return the best token_set_ratio of *file_path* against EXAMPLE_PATTERNS (0-100)."""
    path_lc = file_path.lower()
    # max(..., default=0) covers the (theoretical) empty-pattern-list case.
    return max(
        (fuzz.token_set_ratio(pattern.lower(), path_lc) for pattern in EXAMPLE_PATTERNS),
        default=0,
    )
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def _score_against_keyword(file_path: str, keyword: str) -> int:
    """Fuzzy-match *keyword* against *file_path*; returns the best of two scorers.

    partial_ratio handles substring hits inside long paths, while
    token_set_ratio handles word-level matches; the higher score wins.
    """
    needle = keyword.lower()
    haystack = file_path.lower()
    return max(
        fuzz.partial_ratio(needle, haystack),
        fuzz.token_set_ratio(needle, haystack),
    )
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def _get_pattern_priority(file_path: str) -> tuple[int, int, int]:
    """
    Rank a file path by which example-pattern directory it lives in.

    Returns: (in_examples_dir, pattern_priority, path_depth)
    - in_examples_dir: 0 if the path's top segment is examples/example, else 1
      (lower sorts first)
    - pattern_priority: index into EXAMPLE_PATTERNS of the matched segment,
      or 999 when no segment matches (lower sorts first)
    - path_depth: number of "/"-separated segments (lower sorts first)

    Among multiple matching patterns, the one occurring deepest in the path
    wins; at equal depth the lower EXAMPLE_PATTERNS index wins. E.g.
    "examples/scripts/train.py" outranks "scripts/util.py".
    """
    segments = file_path.lower().split("/")

    # Top-level examples/ directories get the highest priority bucket.
    top_is_examples = 0 if segments[0] in ("examples", "example") else 1

    best_rank = 999
    deepest_match = -1

    for rank, pattern in enumerate(EXAMPLE_PATTERNS):
        if pattern not in segments:
            continue
        # Depth of the rightmost segment equal to this pattern.
        depth = max(pos for pos, seg in enumerate(segments) if seg == pattern)
        # Deeper (more specific) matches win; ties go to the better-ranked pattern.
        if depth > deepest_match or (depth == deepest_match and rank < best_rank):
            best_rank = rank
            deepest_match = depth

    return (top_is_examples, best_rank, len(segments))
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def _handle_repo_tree_errors(
    all_files: List[Dict[str, Any]],
    error: str,
    org: str,
    repo: str,
    token: str,
) -> ToolResult | None:
    """Translate a repo-tree fetch failure into a ToolResult; None means success."""
    if error == "not_found":
        # Missing repo: try to offer "did you mean" suggestions.
        suggestions = _search_similar_repos(org, repo, token)

        if not suggestions:
            return {
                "formatted": f"Repository '{org}/{repo}' not found and no similar repositories found.",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }

        out = [f"**Repository '{org}/{repo}' not found. Similar repositories:**\n"]
        for idx, candidate in enumerate(suggestions, 1):
            out.append(f"{idx}. **{candidate['full_name']}** (⭐ {candidate['stars']:,} stars)")
            if candidate["description"]:
                summary = (
                    candidate["description"][:100] + "..."
                    if len(candidate["description"]) > 100
                    else candidate["description"]
                )
                out.append(f" {summary}")
            out.append(f" {candidate['url']}\n")

        return {
            "formatted": "\n".join(out),
            "totalResults": len(suggestions),
            "resultsShared": len(suggestions),
            "isError": True,
        }

    if error:
        return {
            "formatted": f"Error accessing repository '{org}/{repo}': {error}",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    if not all_files:
        # Not flagged isError: an empty repository is a valid (if useless) answer.
        return {
            "formatted": f"No files found in repository '{org}/{repo}'",
            "totalResults": 0,
            "resultsShared": 0,
        }

    return None
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def find_examples(
    keyword: str = "",
    repo: str = "",
    org: str = "huggingface",
    max_results: int = 50,
    min_score: int = 60,
) -> ToolResult:
    """
    Find example files in a repository using fuzzy matching.

    Args:
        keyword: Keyword to fuzzy match against file paths (e.g., "grpo")
        repo: Repository name (e.g., "trl"). Required.
        org: GitHub organization (default: "huggingface")
        max_results: Maximum number of results (default 50)
        min_score: Minimum fuzzy match score (0-100, default 60)

    Returns:
        ToolResult with matching files, or similar repos if repo not found
    """
    # BUG FIX: defaults were max_results=10, min_score=80, contradicting this
    # docstring, GITHUB_FIND_EXAMPLES_TOOL_SPEC, and the handler, which all
    # document 50/60. Aligned to the documented contract.
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        return {
            "formatted": "Error: GITHUB_TOKEN environment variable is required",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    if not repo:
        return {
            "formatted": "Error: repo parameter is required",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    # Get all files in the repository
    all_files, error = _get_repo_tree(org, repo, token)

    # Handle errors (not found, API errors, empty repo)
    if error_result := _handle_repo_tree_errors(all_files, error, org, repo, token):
        return error_result

    # Step 1: keep only files that look like examples (pattern score >= 60).
    example_threshold = 60
    example_files = []
    for file in all_files:
        example_score = _score_against_example_patterns(file["path"])
        if example_score >= example_threshold:
            example_files.append({**file, "example_score": example_score})

    if not example_files:
        return {
            "formatted": f"No example files found in {org}/{repo} (no files match example patterns with score >= {example_threshold}).",
            "totalResults": 0,
            "resultsShared": 0,
        }

    # Step 2: If keyword provided, score and filter by keyword
    if keyword:
        scored_files = []
        for file in example_files:
            keyword_score = _score_against_keyword(file["path"], keyword)
            if keyword_score >= min_score:
                scored_files.append({**file, "score": keyword_score})

        if not scored_files:
            return {
                "formatted": f"No files found in {org}/{repo} matching keyword '{keyword}' (min score: {min_score}) among {len(example_files)} example files.",
                "totalResults": 0,
                "resultsShared": 0,
            }

        # Sort by keyword score (descending) for best matches first
        scored_files.sort(key=lambda x: x["score"], reverse=True)
    else:
        # No keyword: prioritize by pattern directory, then path depth
        scored_files = []
        for file in example_files:
            in_examples_dir, pattern_priority, path_depth = _get_pattern_priority(
                file["path"]
            )
            scored_files.append(
                {
                    **file,
                    "score": file["example_score"],
                    "in_examples_dir": in_examples_dir,
                    "pattern_priority": pattern_priority,
                    "path_depth": path_depth,
                }
            )

        if not scored_files:
            return {
                "formatted": f"No example files found in {org}/{repo}.",
                "totalResults": 0,
                "resultsShared": 0,
            }

        # Sort by: 1) files in examples/ dir first, 2) pattern priority
        # (scripts > datasets > etc), 3) path depth, 4) path name
        scored_files.sort(
            key=lambda x: (
                x["in_examples_dir"],
                x["pattern_priority"],
                x["path_depth"],
                x["path"],
            )
        )

    # Limit results
    results = scored_files[:max_results]

    # Format output
    keyword_desc = f" matching '{keyword}'" if keyword else ""
    lines = [f"**Found {len(results)} example files in {org}/{repo}{keyword_desc}:**"]
    if len(scored_files) > max_results:
        lines[0] += f" (showing {max_results} of {len(scored_files)})"
    lines.append("")

    for i, file in enumerate(results, 1):
        lines.append(f"{i}. **{file['path']}**")
        lines.append(f" Size: {file['size']:,} bytes | Ref: {file['ref'][:7]}")
        lines.append(f" URL: {file['url']}")

        # Copyable parameters for read_file tool
        read_params = f"{{'repo': '{org}/{repo}', 'path': '{file['path']}'}}"
        lines.append(f" To read, use: {read_params}")
        lines.append("")

    return {
        "formatted": "\n".join(lines),
        "totalResults": len(results),
        "resultsShared": len(results),
    }
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
# Tool specification
# JSON-Schema-style tool definition consumed by the agent's tool router.
# The "description" value is one long implicitly-concatenated string literal
# shown to the model verbatim; the stated defaults (50 / 60) are applied by
# github_find_examples_handler.
GITHUB_FIND_EXAMPLES_TOOL_SPEC = {
    "name": "github_find_examples",
    "description": "Discover best practices, reusable scripts, tutorials, and demos for using a specific library or framework. This is an important step before implementing anything ML related. "
    "Use together with github_read_file tool.\n\n"
    "## When to use this tool\n\n"
    "- ALWAYS before implementing any training/inference/benchmarking or other ML related code or answering how-to question.\n"
    "- When exploring a new repository and need to understand how to use it\n"
    "## How it works\n\n"
    "1. Fetches all (examples, tutorials, demos, notebooks, scripts, etc.) from the repository\n"
    "2. If keyword provided, scores found files against the keyword using fuzzy matching\n"
    "3. Returns best matches sorted by relevance score\n"
    "## Examples\n\n"
    "<example>\n"
    "// ML Workflow Step: Find GRPO/SFT/DPO/RLOO etc training examples\n"
    "// Task: Starting GRPO fine-tuning project, need reference implementations\n"
    "{\n"
    " keyword: 'grpo',\n"
    " repo: 'trl',\n"
    " org: 'huggingface'\n"
    "}\n"
    "// Returns: examples/scripts/grpo_agent.py, examples/scripts/grpo_vlm.py\n"
    "// Next step: Use github_read_file to study the implementation\n"
    "</example>\n\n"
    "<example>\n"
    "// ML Workflow Step: Discover all training examples in TRL\n"
    "// Task: Exploring available training methods before choosing approach\n"
    "{\n"
    " repo: 'trl',\n"
    " org: 'huggingface',\n"
    " max_results: 20\n"
    "}\n"
    "// Lists all example scripts: PPO, DPO, GRPO, reward modeling, etc.\n"
    "</example>\n\n"
    "<example>\n"
    "// ML Workflow Step: Find LoRA fine-tuning examples\n"
    "// Task: Learning parameter-efficient fine-tuning with PEFT\n"
    "{\n"
    " keyword: 'lora',\n"
    " repo: 'peft',\n"
    " org: 'huggingface'\n"
    "}\n"
    "// Discovers LoRA configuration and training examples\n"
    "</example>",
    # Argument schema; only "repo" is required.
    "parameters": {
        "type": "object",
        "properties": {
            "keyword": {
                "type": "string",
                "description": "Keyword to fuzzy match against file paths (e.g., 'grpo', 'sft').",
            },
            "repo": {
                "type": "string",
                "description": "Repository name (e.g., 'trl', 'transformers'). Required.",
            },
            "org": {
                "type": "string",
                "description": "GitHub organization or username. Default: 'huggingface'.",
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of results to return. Default: 50.",
            },
            "min_score": {
                "type": "integer",
                "description": "Minimum fuzzy match score (0-100). Default: 60.",
            },
        },
        "required": ["repo"],
    },
}
|
| 475 |
+
|
| 476 |
+
|
| 477 |
+
async def github_find_examples_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Adapter between the tool router's argument dict and find_examples().

    Returns (formatted_text, success_flag); any exception is converted into
    an error message with success=False.
    """
    try:
        outcome = find_examples(
            keyword=arguments.get("keyword", ""),
            repo=arguments["repo"],
            org=arguments.get("org", "huggingface"),
            max_results=arguments.get("max_results", 50),
            min_score=arguments.get("min_score", 60),
        )
        succeeded = not outcome.get("isError", False)
        return outcome["formatted"], succeeded
    except Exception as e:
        return f"Error finding examples: {str(e)}", False
|
agent/tools/github_list_repos.py
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GitHub List Repositories Tool - List and sort repositories for any user or organization
|
| 3 |
+
|
| 4 |
+
Efficiently discover repositories with flexible sorting options.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
from typing import Any, Dict, Literal, Optional
|
| 9 |
+
|
| 10 |
+
import requests
|
| 11 |
+
|
| 12 |
+
from agent.tools.types import ToolResult
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def list_repos(
    owner: str,
    owner_type: Literal["user", "org"] = "org",
    sort: Literal["stars", "forks", "updated", "created"] = "stars",
    order: Literal["asc", "desc"] = "desc",
    limit: Optional[int] = 30,
) -> ToolResult:
    """
    List repositories for a user or organization using GitHub REST API.

    Args:
        owner: GitHub username or organization name
        owner_type: Whether the owner is a "user" or "org" (default: "org")
        sort: Sort field - "stars", "forks", "updated", or "created"
        order: Sort order - "asc" or "desc" (default: "desc")
        limit: Maximum number of repositories to return (None = no limit)

    Returns:
        ToolResult with repository information
    """
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        return {
            "formatted": "Error: GITHUB_TOKEN environment variable is required",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    if owner_type == "org":
        url = f"https://api.github.com/orgs/{owner}/repos"
    else:
        url = f"https://api.github.com/users/{owner}/repos"

    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    all_repos = []
    page = 1
    per_page = 100  # Maximum allowed by GitHub

    # Map our sort values to GitHub API sort values.
    # Note: the GitHub list-repos API doesn't support sorting by stars/forks;
    # for those we must fetch everything and sort in memory.
    api_sort_map = {
        "created": "created",
        "updated": "updated",
        "stars": None,  # Not supported by list API
        "forks": None,  # Not supported by list API
    }

    api_sort = api_sort_map.get(sort)
    need_manual_sort = api_sort is None

    try:
        while True:
            params = {
                "page": page,
                "per_page": per_page,
            }

            # Only add sort/direction if API supports it
            if api_sort:
                params["sort"] = api_sort
                params["direction"] = order

            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=30,
            )

            if response.status_code == 403:
                error_data = response.json()
                return {
                    "formatted": f"GitHub API rate limit or permission error: {error_data.get('message', 'Unknown error')}",
                    "totalResults": 0,
                    "resultsShared": 0,
                    "isError": True,
                }

            if response.status_code != 200:
                error_msg = f"GitHub API error (status {response.status_code})"
                try:
                    error_data = response.json()
                    if "message" in error_data:
                        error_msg += f": {error_data['message']}"
                except Exception:
                    pass
                return {
                    "formatted": error_msg,
                    "totalResults": 0,
                    "resultsShared": 0,
                    "isError": True,
                }

            items = response.json()

            if not items:
                break

            for item in items:
                all_repos.append(
                    {
                        "name": item.get("name"),
                        "full_name": item.get("full_name"),
                        "description": item.get("description"),
                        "html_url": item.get("html_url"),
                        "language": item.get("language"),
                        "stars": item.get("stargazers_count", 0),
                        "forks": item.get("forks_count", 0),
                        "open_issues": item.get("open_issues_count", 0),
                        "topics": item.get("topics", []),
                        "updated_at": item.get("updated_at"),
                        "created_at": item.get("created_at"),
                    }
                )

            # Check if we got fewer results than requested (last page)
            if len(items) < per_page:
                break

            # BUG FIX: only stop early at `limit` when the API itself returned
            # repos in the requested order. When sorting by stars/forks we sort
            # in memory, so ALL pages must be fetched first — otherwise the
            # "top N" would be taken from the first pages in arbitrary order.
            if limit and not need_manual_sort and len(all_repos) >= limit:
                break

            page += 1

    except requests.exceptions.RequestException as e:
        return {
            "formatted": f"Failed to connect to GitHub API: {str(e)}",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    # Manual sorting if needed (for stars/forks)
    if need_manual_sort and all_repos:
        reverse = order == "desc"
        all_repos.sort(key=lambda x: x[sort], reverse=reverse)

    # Apply limit after sorting
    if limit:
        all_repos = all_repos[:limit]

    if not all_repos:
        return {
            "formatted": f"No repositories found for {owner_type} '{owner}'",
            "totalResults": 0,
            "resultsShared": 0,
        }

    # Format output
    lines = [f"**Found {len(all_repos)} repositories for {owner}:**\n"]

    for i, repo in enumerate(all_repos, 1):
        lines.append(f"{i}. **{repo['full_name']}**")
        lines.append(
            f" ⭐ {repo['stars']:,} stars | 🍴 {repo['forks']:,} forks | Language: {repo['language'] or 'N/A'}"
        )
        if repo["description"]:
            desc = (
                repo["description"][:100] + "..."
                if len(repo["description"]) > 100
                else repo["description"]
            )
            lines.append(f" {desc}")
        lines.append(f" URL: {repo['html_url']}")
        if repo["topics"]:
            lines.append(f" Topics: {', '.join(repo['topics'][:5])}")

        # Copyable parameters for other tools
        lines.append(f" Use in tools: {{'repo': '{repo['full_name']}'}}")
        lines.append("")

    return {
        "formatted": "\n".join(lines),
        "totalResults": len(all_repos),
        "resultsShared": len(all_repos),
    }
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
# Tool specification
# JSON-Schema-style tool definition consumed by the agent's tool router.
# FIXES: the "When debugging" bullet was missing its trailing "\n" (it merged
# with the next bullet when rendered) and was ungrammatical; the "limit"
# description contradicted itself ("No limit if not specified. Default: 30.").
GITHUB_LIST_REPOS_TOOL_SPEC = {
    "name": "github_list_repos",
    "description": (
        "List and discover repositories for any GitHub user or organization with flexible sorting.\n\n"
        "Returns comprehensive repository information including stars, forks, language, topics, and direct URLs. "
        "Sorts by stars, forks, update date, or creation date.\n\n"
        "## When to use this tool\n\n"
        "- When you need to find libraries to use in your implementation, or to explore what repositories exist for a task.\n"
        "- When debugging an error, to look up if others are having the same issues in repositories.\n"
        "- When finding the most popular or active projects for a user or org\n"
        "## Examples\n\n"
        "<example>\n"
        "// ML Workflow Step: Discover HF libraries for RLHF/alignment\n"
        "// Use case: Find the right library for training with human feedback\n"
        "{\n"
        " owner: 'huggingface',\n"
        " owner_type: 'org',\n"
        " sort: 'stars',\n"
        " limit: 10\n"
        "}\n"
        "// Returns: transformers, trl, peft, accelerate, diffusers...\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Check for recently updated HF repos\n"
        "// Use case: Find actively maintained libraries with latest features\n"
        "{\n"
        " owner: 'huggingface',\n"
        " owner_type: 'org',\n"
        " sort: 'updated',\n"
        " order: 'desc',\n"
        " limit: 15\n"
        "}\n"
        "// Helps identify which repos have recent improvements/fixes\n"
        "</example>"
    ),
    # Argument schema; only "owner" is required.
    "parameters": {
        "type": "object",
        "properties": {
            "owner": {
                "type": "string",
                "description": "GitHub username or organization name. Required.",
            },
            "owner_type": {
                "type": "string",
                "enum": ["user", "org"],
                "description": "Whether the owner is a 'user' or 'org'. Default: 'org'.",
            },
            "sort": {
                "type": "string",
                "enum": ["stars", "forks", "updated", "created"],
                "description": "Sort field. Options: 'stars', 'forks', 'updated', 'created'. Default: 'stars'.",
            },
            "order": {
                "type": "string",
                "enum": ["asc", "desc"],
                "description": "Sort order. Options: 'asc', 'desc'. Default: 'desc'.",
            },
            "limit": {
                "type": "integer",
                "description": "Maximum number of repositories to return. No limit if not specified.",
            },
        },
        "required": ["owner"],
    },
}
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
async def github_list_repos_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Adapter between the tool router's argument dict and list_repos().

    Returns (formatted_text, success_flag); any exception is converted into
    an error message with success=False.
    """
    try:
        # NOTE(review): arguments.get("limit") yields None when the caller
        # omits it, so the tool fetches everything even though list_repos'
        # own signature defaults to 30 — confirm which behavior is intended.
        outcome = list_repos(
            owner=arguments["owner"],
            owner_type=arguments.get("owner_type", "org"),
            sort=arguments.get("sort", "stars"),
            order=arguments.get("order", "desc"),
            limit=arguments.get("limit"),
        )
        succeeded = not outcome.get("isError", False)
        return outcome["formatted"], succeeded
    except Exception as e:
        return f"Error listing repositories: {str(e)}", False
|
agent/tools/github_read_file.py
ADDED
|
@@ -0,0 +1,336 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GitHub Read File Tool - Read file contents from any GitHub repository with line range support
|
| 3 |
+
|
| 4 |
+
Fetch exact file contents with metadata, supporting line ranges for efficient reading.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import base64
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
from typing import Any, Dict, Optional
|
| 11 |
+
|
| 12 |
+
import nbformat
|
| 13 |
+
import requests
|
| 14 |
+
from nbconvert import MarkdownExporter
|
| 15 |
+
from nbconvert.preprocessors import ClearOutputPreprocessor, TagRemovePreprocessor
|
| 16 |
+
|
| 17 |
+
from agent.tools.types import ToolResult
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _convert_ipynb_to_markdown(content: str) -> str:
    """Convert a Jupyter notebook JSON string into LLM-friendly Markdown.

    Cell outputs are stripped and cells tagged ``hide``/``hidden``/``remove``
    are dropped so the result stays compact. Conversion is best-effort: on
    any parse or conversion failure the original string is returned unchanged.

    Args:
        content: Raw notebook JSON string.

    Returns:
        Converted Markdown text, or the unmodified input on failure.
    """
    try:
        notebook = json.loads(content)

        # A notebook cell's "source" may be either a single string or a list
        # of line strings; normalize to a single string before nbformat sees it.
        for cell in notebook.get("cells", []):
            source = cell.get("source")
            if isinstance(source, list):
                cell["source"] = "".join(source)

        parsed = nbformat.reads(json.dumps(notebook), as_version=4)

        # Drop all cell outputs — they tend to be noisy/large for an LLM.
        parsed, _ = ClearOutputPreprocessor().preprocess(parsed, {})

        # Remove cells the author explicitly tagged as hidden.
        tag_remover = TagRemovePreprocessor(
            remove_cell_tags={"hide", "hidden", "remove"},
            remove_input_tags=set(),
            remove_all_outputs_tags=set(),
        )
        parsed, _ = tag_remover.preprocess(parsed, {})

        markdown, _ = MarkdownExporter().from_notebook_node(parsed)
        return markdown
    except Exception:
        # Includes json.JSONDecodeError — fall back to the raw content.
        return content
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def read_file(
    repo: str,
    path: str,
    ref: str = "HEAD",
    line_start: Optional[int] = None,
    line_end: Optional[int] = None,
) -> ToolResult:
    """
    Read file contents from a GitHub repository with line range support.

    Fetches the file via the GitHub contents API, decodes it (falling back to
    a raw fetch for large files where the API omits inline content), converts
    ``.ipynb`` notebooks to Markdown, and returns the requested line range
    (default: first 300 lines).

    Args:
        repo: Repository in format "owner/repo" (e.g., "github/github-mcp-server")
        path: Path to file in repository (e.g., "pkg/github/search.go")
        ref: Git reference - branch name, tag, or commit SHA (default: "HEAD")
        line_start: Starting line number (1-indexed, inclusive)
        line_end: Ending line number (1-indexed, inclusive)

    Returns:
        ToolResult with file contents and metadata; ``isError`` is set on failure.
    """
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        return {
            "formatted": "Error: GITHUB_TOKEN environment variable is required",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    # Parse repo into owner + name ("owner/repo"; repo names cannot contain "/").
    if "/" not in repo:
        return {
            "formatted": "Error: repo must be in format 'owner/repo'",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    owner, repo_name = repo.split("/", 1)

    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    # Fetch file contents. Omitting "ref" lets the API use the default branch.
    url = f"https://api.github.com/repos/{owner}/{repo_name}/contents/{path}"
    params = {}
    if ref and ref != "HEAD":
        params["ref"] = ref

    try:
        response = requests.get(url, headers=headers, params=params, timeout=30)

        if response.status_code == 404:
            return {
                "formatted": f"File not found: {path} in {repo} (ref: {ref})",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }

        if response.status_code != 200:
            error_msg = f"GitHub API error (status {response.status_code})"
            try:
                error_data = response.json()
                if "message" in error_data:
                    error_msg += f": {error_data['message']}"
            except Exception:
                pass
            return {
                "formatted": error_msg,
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }

        data = response.json()

        # Directories and submodules come back with a different "type".
        if data.get("type") != "file":
            return {
                "formatted": f"Path {path} is not a file (type: {data.get('type')})",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }

        # Decode base64 content (the API wraps it with newlines/spaces).
        content_b64 = data.get("content", "")
        if content_b64:
            content_b64 = content_b64.replace("\n", "").replace(" ", "")
            content = base64.b64decode(content_b64).decode("utf-8", errors="replace")
        else:
            # For large files the API omits inline content; fetch raw instead.
            raw_headers = {
                "Accept": "application/vnd.github.raw",
                "X-GitHub-Api-Version": "2022-11-28",
                "Authorization": f"Bearer {token}",
            }
            raw_response = requests.get(
                url, headers=raw_headers, params=params, timeout=30
            )
            if raw_response.status_code != 200:
                return {
                    "formatted": "Failed to fetch file content",
                    "totalResults": 0,
                    "resultsShared": 0,
                    "isError": True,
                }
            content = raw_response.text

        # Notebooks are JSON; convert to Markdown for readability.
        if path.lower().endswith(".ipynb"):
            content = _convert_ipynb_to_markdown(content)

        # Resolve the requested line range (1-indexed, inclusive).
        lines = content.split("\n")
        total_lines = len(lines)

        if line_start is None and line_end is None:
            # No range specified — default to the first 300 lines.
            line_start = 1
            line_end = min(total_lines, 300)
        else:
            if line_start is None:
                line_start = 1
            if line_end is None:
                line_end = total_lines

        # Clamp to valid bounds, then validate ordering.
        line_start = max(1, line_start)
        line_end = min(total_lines, line_end)
        if line_start > line_end:
            return {
                "formatted": f"Invalid range: line_start ({line_start}) > line_end ({line_end})",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }

        # FIX (generalized): report a partial view whenever the selected range
        # does not cover the whole file — previously the notice was shown only
        # for the implicit default truncation, so user-supplied partial ranges
        # gave no hint that more lines exist.
        truncated = line_start > 1 or line_end < total_lines

        # Extract the selected lines.
        selected_content = "\n".join(lines[line_start - 1 : line_end])

        # Format output for the agent.
        lines_output = [f"**Reading file from repo: {repo}, path: {path}**"]

        if ref and ref != "HEAD":
            lines_output.append(f"Ref: {ref}")

        # FIX: closing "**" was missing, producing unbalanced Markdown bold.
        lines_output.append("\n**File content:**")
        lines_output.append("```")
        lines_output.append(selected_content)
        lines_output.append("```")
        if truncated:
            lines_output.append(
                f"Currently showing lines {line_start}-{line_end} out of {total_lines} total lines. Use line_start and line_end to view more lines."
            )
        return {
            "formatted": "\n".join(lines_output),
            "totalResults": 1,
            "resultsShared": 1,
        }

    except requests.exceptions.RequestException as e:
        return {
            "formatted": f"Failed to connect to GitHub API: {str(e)}",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
# Tool specification
|
| 250 |
+
# Tool specification exposed to the agent's tool router. The "description"
# string is shown verbatim to the LLM, so it embeds usage guidance and worked
# examples; "parameters" is a JSON-Schema object matching read_file()'s
# signature (required: repo, path).
GITHUB_READ_FILE_TOOL_SPEC = {
    "name": "github_read_file",
    "description": (
        "Read file contents from any GitHub repository with line range support.\n\n"
        "Fetches exact file contents in the given line range (default 300 lines, use line_start/line_end adjust). \n\n"
        "## When to use this tool\n\n"
        "- When reading example code, implementations, or documentation on a specific github file\n"
        "- When you found a file via github_list_repos, or github_find_examples and need its contents\n"
        "- When investigating specific code sections with line ranges\n"
        "- When reading from specific branches, tags, or commits\n"
        "## When NOT to use this tool\n\n"
        "- When you don't know the exact file path beforehand (use github_search_code or github_find_examples first)\n\n"
        "## Examples\n\n"
        "<example>\n"
        "// ML Workflow Step: Reading example code from for GRPO training with TRL\n"
        "// Use case: Read trainer class to understand API and methods\n"
        "{\n"
        " repo: 'huggingface/trl',\n"
        " path: 'trl/trainer/grpo_trainer.py',\n"
        " line_start: 1,\n"
        " line_end: 200\n"
        "}\n"
        "// Read class definition and constructor to understand parameters\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Study complete training script\n"
        "// Use case: Learn end-to-end VLM fine-tuning with GRPO\n"
        "{\n"
        " repo: 'huggingface/trl',\n"
        " path: 'examples/scripts/grpo_vlm.py'\n"
        "}\n"
        "// Returns first 300 lines of the file\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Check configuration patterns\n"
        "// Use case: Learn how to structure training configs\n"
        "{\n"
        " repo: 'huggingface/transformers',\n"
        " path: 'examples/pytorch/language-modeling/run_clm.py',\n"
        " line_start: 50,\n"
        " line_end: 150\n"
        "}\n"
        "// Read argument parsing and config setup section\n"
        "</example>"
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "repo": {
                "type": "string",
                "description": "Repository in format 'owner/repo' (e.g., 'github/github-mcp-server'). Required.",
            },
            "path": {
                "type": "string",
                "description": "Path to file in repository (e.g., 'src/index.js'). Required.",
            },
            "ref": {
                "type": "string",
                "description": "Git reference - branch name, tag, or commit SHA. Default: 'HEAD'.",
            },
            "line_start": {
                "type": "integer",
                "description": "Starting line number (1-indexed, inclusive). Optional.",
            },
            "line_end": {
                "type": "integer",
                "description": "Ending line number (1-indexed, inclusive). Optional.",
            },
        },
        "required": ["repo", "path"],
    },
}
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
async def github_read_file_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Adapt a tool-router call into a read_file() invocation.

    Returns:
        (formatted_text, success_flag) — the flag is False on any error,
        whether signaled via the result's "isError" key or a raised exception.
    """
    try:
        outcome = read_file(
            repo=arguments["repo"],
            path=arguments["path"],
            ref=arguments.get("ref", "HEAD"),
            line_start=arguments.get("line_start"),
            line_end=arguments.get("line_end"),
        )
        return outcome["formatted"], not outcome.get("isError", False)
    except Exception as exc:
        return f"Error reading file: {str(exc)}", False
|
agent/tools/github_search_code.py
ADDED
|
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GitHub Code Search Tool - Search code across GitHub with intelligent filtering
|
| 3 |
+
|
| 4 |
+
Maps user-friendly patterns to GitHub's Code Search API capabilities.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import fnmatch
|
| 8 |
+
import os
|
| 9 |
+
import re
|
| 10 |
+
from typing import Any, Dict, Optional
|
| 11 |
+
|
| 12 |
+
import requests
|
| 13 |
+
|
| 14 |
+
from agent.tools.types import ToolResult
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _glob_match(text: str, pattern: str) -> bool:
|
| 18 |
+
"""Check if text matches glob pattern, supporting ** for multi-level paths"""
|
| 19 |
+
if "**" in pattern:
|
| 20 |
+
regex_pattern = pattern.replace("**", "<<<DOUBLESTAR>>>")
|
| 21 |
+
regex_pattern = fnmatch.translate(regex_pattern)
|
| 22 |
+
regex_pattern = regex_pattern.replace("<<<DOUBLESTAR>>>", ".*")
|
| 23 |
+
return re.match(regex_pattern, text) is not None
|
| 24 |
+
return fnmatch.fnmatch(text, pattern)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _parse_repo_filter(repo_pattern: str) -> tuple[Optional[str], Optional[str]]:
|
| 28 |
+
"""
|
| 29 |
+
Parse repository pattern into GitHub API filter and client-side glob pattern.
|
| 30 |
+
|
| 31 |
+
Returns: (api_filter, client_glob)
|
| 32 |
+
- api_filter: GitHub API filter string (e.g., "org:huggingface")
|
| 33 |
+
- client_glob: Pattern for client-side filtering (e.g., "huggingface/trl*")
|
| 34 |
+
|
| 35 |
+
Examples:
|
| 36 |
+
"huggingface/trl" → ("repo:huggingface/trl", None)
|
| 37 |
+
"huggingface/*" → ("org:huggingface", "huggingface/*")
|
| 38 |
+
"huggingface/trl*" → ("org:huggingface", "huggingface/trl*")
|
| 39 |
+
"huggingface" → ("org:huggingface", None)
|
| 40 |
+
"*/*" → (None, "*/*")
|
| 41 |
+
"""
|
| 42 |
+
if not repo_pattern:
|
| 43 |
+
return None, None
|
| 44 |
+
|
| 45 |
+
# Pattern: owner/repo (exact match)
|
| 46 |
+
if "/" in repo_pattern and "*" not in repo_pattern and "?" not in repo_pattern:
|
| 47 |
+
return f"repo:{repo_pattern}", None
|
| 48 |
+
|
| 49 |
+
# Pattern: owner/* or owner/prefix* (org + client filter)
|
| 50 |
+
if "/" in repo_pattern and ("*" in repo_pattern or "?" in repo_pattern):
|
| 51 |
+
org_name = repo_pattern.split("/")[0]
|
| 52 |
+
if "*" not in org_name and "?" not in org_name:
|
| 53 |
+
return f"org:{org_name}", repo_pattern
|
| 54 |
+
# Org name has wildcards - can't filter server-side
|
| 55 |
+
return None, repo_pattern
|
| 56 |
+
|
| 57 |
+
# Pattern: owner (just org name, no wildcards)
|
| 58 |
+
if "*" not in repo_pattern and "?" not in repo_pattern:
|
| 59 |
+
return f"org:{repo_pattern}", None
|
| 60 |
+
|
| 61 |
+
# Pattern: */* or other complex patterns (client-side only)
|
| 62 |
+
return None, repo_pattern
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _parse_path_filter(path_pattern: str) -> tuple[Optional[str], Optional[str]]:
|
| 66 |
+
"""
|
| 67 |
+
Parse path pattern into GitHub API filter and client-side glob pattern.
|
| 68 |
+
|
| 69 |
+
Returns: (api_filter, client_glob)
|
| 70 |
+
|
| 71 |
+
Examples:
|
| 72 |
+
"*.py" → ("extension:py", None)
|
| 73 |
+
"**/*.py" → ("extension:py", None)
|
| 74 |
+
"src/**/*.py" → ("extension:py", "src/**/*.py")
|
| 75 |
+
"test_*.py" → ("extension:py", "test_*.py")
|
| 76 |
+
"src/main.py" → ("path:src/main.py", None)
|
| 77 |
+
"""
|
| 78 |
+
if not path_pattern:
|
| 79 |
+
return None, None
|
| 80 |
+
|
| 81 |
+
# Exact path (no wildcards)
|
| 82 |
+
if "*" not in path_pattern and "?" not in path_pattern:
|
| 83 |
+
return f"path:{path_pattern}", None
|
| 84 |
+
|
| 85 |
+
# Extract extension if present
|
| 86 |
+
ext_match = re.search(r"\*\.(\w+)$", path_pattern)
|
| 87 |
+
if ext_match:
|
| 88 |
+
extension = ext_match.group(1)
|
| 89 |
+
api_filter = f"extension:{extension}"
|
| 90 |
+
|
| 91 |
+
# Check if there's a directory prefix that needs client-side filtering
|
| 92 |
+
# e.g., "src/**/*.py" needs client filter, "**/*.py" doesn't
|
| 93 |
+
if path_pattern in [f"*.{extension}", f"**/*.{extension}"]:
|
| 94 |
+
# Simple patterns - API filter is enough
|
| 95 |
+
return api_filter, None
|
| 96 |
+
else:
|
| 97 |
+
# Complex pattern - need client-side filter too
|
| 98 |
+
return api_filter, path_pattern
|
| 99 |
+
|
| 100 |
+
# Pattern like "test_*.py" or "README*" - use filename with client filter
|
| 101 |
+
# GitHub's filename: doesn't support wildcards, so we rely on client-side
|
| 102 |
+
if "/" not in path_pattern:
|
| 103 |
+
# Try to extract extension for API filtering
|
| 104 |
+
if "." in path_pattern:
|
| 105 |
+
parts = path_pattern.rsplit(".", 1)
|
| 106 |
+
if "*" not in parts[-1] and "?" not in parts[-1]:
|
| 107 |
+
# Extension is clean
|
| 108 |
+
return f"extension:{parts[-1]}", path_pattern
|
| 109 |
+
# No extension or complex - client-side only
|
| 110 |
+
return None, path_pattern
|
| 111 |
+
|
| 112 |
+
# Complex path pattern - client-side only
|
| 113 |
+
return None, path_pattern
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def search_code(
    query: str,
    repo_pattern: Optional[str] = None,
    path_pattern: Optional[str] = None,
    regex: bool = False,
    max_results: int = 20,
) -> ToolResult:
    """
    Search for code across GitHub with intelligent pattern matching.

    This tool intelligently maps user patterns to GitHub's Code Search API capabilities:

    Repository Patterns:
    - "owner/repo" → Searches exact repository
    - "owner/*" or "owner" → Searches all repos in organization
    - "*/*" → Searches all GitHub (no repo filter)
    - Wildcards trigger client-side filtering when needed

    Path Patterns:
    - "*.py" → Searches all Python files
    - "**/*.js" → Searches all JavaScript files (any directory)
    - "src/**/*.py" → Python files in src/ (uses client-side filtering)
    - "test_*.py" → Files matching pattern (client-side filtering)
    - "path/to/file.py" → Exact file path

    Args:
        query: Search term or pattern to find in code
        repo_pattern: Repository pattern (e.g., "huggingface/trl", "huggingface/*", "huggingface")
        path_pattern: File path pattern (e.g., "*.py", "src/**/*.js")
        regex: If True, treat query as regular expression
        max_results: Maximum number of results to return (default 20)

    Returns:
        ToolResult with code matches and snippets
    """
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        return {
            "formatted": "Error: GITHUB_TOKEN environment variable is required",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    # Build GitHub API query
    query_parts = []

    # Add search term. "/pattern/" is GitHub code-search regex syntax; plain
    # terms are quoted only when they contain a space.
    if regex:
        query_parts.append(f"/{query}/")
    else:
        query_parts.append(f'"{query}"' if " " in query else query)

    # Parse repository filter (server-side qualifier + optional client glob).
    repo_api_filter, repo_client_glob = _parse_repo_filter(repo_pattern)
    if repo_api_filter:
        query_parts.append(repo_api_filter)

    # Parse path filter (same split: API qualifier vs. client-side glob).
    path_api_filter, path_client_glob = _parse_path_filter(path_pattern)
    if path_api_filter:
        query_parts.append(path_api_filter)

    github_query = " ".join(query_parts)

    # The text-match media type makes the API include matching fragments.
    headers = {
        "Accept": "application/vnd.github.text-match+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    all_matches = []
    page = 1
    per_page = min(100, max_results)  # API caps per_page at 100

    try:
        # Paginate until we have enough matches or the API runs dry.
        # NOTE(review): an error response on page 2+ discards matches already
        # collected; also, GitHub code search only serves the first ~1000
        # results, after which responses are non-200 — confirm desired behavior.
        while len(all_matches) < max_results:
            params = {
                "q": github_query,
                "page": page,
                "per_page": per_page,
            }

            response = requests.get(
                "https://api.github.com/search/code",
                headers=headers,
                params=params,
                timeout=30,
            )

            # 403 is used by GitHub for both rate limiting and permissions.
            if response.status_code == 403:
                error_data = response.json()
                return {
                    "formatted": f"GitHub API rate limit or permission error: {error_data.get('message', 'Unknown error')}",
                    "totalResults": 0,
                    "resultsShared": 0,
                    "isError": True,
                }

            if response.status_code != 200:
                error_msg = f"GitHub API error (status {response.status_code})"
                try:
                    error_data = response.json()
                    if "message" in error_data:
                        error_msg += f": {error_data['message']}"
                except Exception:
                    pass
                return {
                    "formatted": error_msg,
                    "totalResults": 0,
                    "resultsShared": 0,
                    "isError": True,
                }

            data = response.json()
            items = data.get("items", [])

            if not items:
                break

            for item in items:
                repo_name = item.get("repository", {}).get("full_name", "unknown")
                file_path = item.get("path", "")
                sha = item.get("sha", "")

                # Apply client-side filtering for globs the API can't express.
                if repo_client_glob and not _glob_match(repo_name, repo_client_glob):
                    continue
                if path_client_glob and not _glob_match(file_path, path_client_glob):
                    continue

                # Extract text matches — one result entry per matched fragment.
                text_matches = item.get("text_matches", [])
                if text_matches:
                    for text_match in text_matches:
                        fragment = text_match.get("fragment", "")
                        lines = fragment.split("\n")
                        # line_end here is the count of non-blank fragment
                        # lines, not a real file position.
                        line_count = len([line for line in lines if line.strip()])

                        all_matches.append(
                            {
                                "repo": repo_name,
                                "path": file_path,
                                "ref": sha,
                                "line_start": 1,
                                "line_end": line_count,
                                "snippet": fragment.strip(),
                                "url": item.get("html_url", ""),
                            }
                        )
                else:
                    # No fragment provided — record the hit with a placeholder.
                    all_matches.append(
                        {
                            "repo": repo_name,
                            "path": file_path,
                            "ref": sha,
                            "line_start": 1,
                            "line_end": 1,
                            "snippet": "(snippet not available)",
                            "url": item.get("html_url", ""),
                        }
                    )

            # Stop once we've seen everything the API reports as available.
            if len(all_matches) >= data.get("total_count", 0):
                break

            page += 1

    except requests.exceptions.RequestException as e:
        return {
            "formatted": f"Failed to connect to GitHub API: {str(e)}",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    results = all_matches[:max_results]

    if not results:
        return {
            "formatted": f"No code matches found for query: {query}",
            "totalResults": 0,
            "resultsShared": 0,
        }

    # Format output
    lines_output = [f"**Found {len(results)} code matches:**\n"]

    for i, match in enumerate(results, 1):
        lines_output.append(f"{i}. **{match['repo']}:{match['path']}**")
        lines_output.append(
            f" Lines: {match['line_start']}-{match['line_end']} | Ref: {match['ref'][:7]}"
        )
        lines_output.append(f" URL: {match['url']}")

        # Copyable parameters for read_file tool
        read_params = f"{{'repo': '{match['repo']}', 'path': '{match['path']}', 'ref': '{match['ref'][:7]}'}}"
        lines_output.append(f" To read, use: {read_params}")

        # Show snippet (first 5 lines)
        snippet_lines = match["snippet"].split("\n")[:5]
        if snippet_lines:
            lines_output.append(" ```")
            for line in snippet_lines:
                lines_output.append(f" {line}")
            if len(match["snippet"].split("\n")) > 5:
                lines_output.append(" ...")
            lines_output.append(" ```")
        lines_output.append("")

    return {
        "formatted": "\n".join(lines_output),
        "totalResults": len(results),
        "resultsShared": len(results),
    }
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
# Tool specification
|
| 334 |
+
# Tool specification exposed to the agent's tool router. The "description" is
# shown verbatim to the LLM (usage guidance + worked examples); "parameters"
# is a JSON-Schema object matching search_code()'s signature (required: query).
GITHUB_SEARCH_CODE_TOOL_SPEC = {
    "name": "github_search_code",
    "description": (
        "Search for code patterns across GitHub repositories with intelligent pattern matching.\n\n"
        "Searches for specific code patterns, functions, classes, or implementations across GitHub. "
        "Intelligently maps patterns to GitHub's Code Search API for efficient server-side filtering, "
        "with automatic client-side filtering for complex patterns. Returns code snippets with context.\n\n"
        "## When to use this tool\n\n"
        "- When searching for specific code patterns, functions, or classes across repositories\n"
        "- When looking for implementation examples of specific methods or APIs\n"
        "- When you need to find where specific code exists across multiple files or repos\n"
        "- When investigating how a feature is implemented in different repositories\n"
        "- When searching for TODO comments, specific patterns, or code structures\n"
        "- Use this for searching actual implementation code (not examples - use github_find_examples for those)\n\n"
        "## When NOT to use this tool\n\n"
        "- When looking for example files or tutorials (use github_find_examples instead)\n"
        "- When you already know the exact file path (use github_read_file directly)\n"
        "- When you need to list repositories (use github_list_repos instead)\n\n"
        "## Repository Patterns\n\n"
        "- **Exact repo**: `'huggingface/trl'` → Searches only that repository\n"
        "- **Organization**: `'huggingface'` or `'huggingface/*'` → All repos in organization\n"
        "- **All GitHub**: `'*/*'` or omit repo_pattern → Searches across all GitHub\n"
        "- **Wildcards**: `'huggingface/trl*'` → Automatic client-side filtering for complex patterns\n\n"
        "## Path Patterns\n\n"
        "- **Extension**: `'*.py'` or `'**/*.py'` → All Python files\n"
        "- **Directory**: `'src/**/*.js'` → JavaScript files in src/ directory (client-filtered)\n"
        "- **Pattern**: `'test_*.py'` → Files matching pattern (client-filtered)\n"
        "- **Exact path**: `'README.md'` → Specific file\n\n"
        "## How it works\n\n"
        "1. Parses repository and path patterns\n"
        "2. Converts to GitHub API filters when possible (server-side, fast)\n"
        "3. Falls back to client-side filtering for complex patterns\n"
        "4. Returns code snippets with line numbers, URLs, and file refs\n"
        "5. Results can be used directly with github_read_file tool\n\n"
        "## Examples\n\n"
        "<example>\n"
        "// ML Workflow Step: Find how AutoModelForCausalLM is used\n"
        "// Use case: Learning best practices for loading LLMs in TRL\n"
        "{\n"
        " query: 'AutoModelForCausalLM.from_pretrained',\n"
        " repo_pattern: 'huggingface/trl',\n"
        " path_pattern: '*.py'\n"
        "}\n"
        "// Finds all model loading patterns with quantization, device_map, etc.\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Discover TrainingArguments configurations\n"
        "// Use case: Setting up training hyperparameters correctly\n"
        "{\n"
        " query: 'TrainingArguments',\n"
        " repo_pattern: 'huggingface/transformers',\n"
        " path_pattern: 'examples/**/*.py',\n"
        " max_results: 10\n"
        "}\n"
        "// Shows various TrainingArguments setups across different tasks\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Find dataset preprocessing patterns\n"
        "// Use case: Learning how to prepare data for instruction tuning\n"
        "{\n"
        " query: 'map(tokenize',\n"
        " repo_pattern: 'huggingface',\n"
        " path_pattern: '*.py'\n"
        "}\n"
        "// Discovers tokenization and dataset mapping patterns\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Find all Trainer class implementations\n"
        "// Use case: Understanding available trainer variants for different tasks\n"
        "{\n"
        " query: 'class \\\\w+Trainer\\\\(',\n"
        " repo_pattern: 'huggingface/trl',\n"
        " path_pattern: 'trl/trainer/**/*.py',\n"
        " regex: true\n"
        "}\n"
        "// Lists: GRPOTrainer, DPOTrainer, PPOTrainer, RewardTrainer, etc.\n"
        "</example>"
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search term or pattern to find in code. Required.",
            },
            "repo_pattern": {
                "type": "string",
                "description": "Repository pattern: 'owner/repo' (exact), 'owner' (org), 'owner/*' (org with filter), '*/*' (all). Optional.",
            },
            "path_pattern": {
                "type": "string",
                "description": "File path pattern: '*.ext' (extension), 'dir/**/*.ext' (directory), 'pattern*.ext' (name pattern). Optional.",
            },
            "regex": {
                "type": "boolean",
                "description": "If true, treat query as regular expression. Default: false.",
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of results to return. Default: 20.",
            },
        },
        "required": ["query"],
    },
}
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
async def github_search_code_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
|
| 442 |
+
"""Handler for agent tool router"""
|
| 443 |
+
try:
|
| 444 |
+
result = search_code(
|
| 445 |
+
query=arguments["query"],
|
| 446 |
+
repo_pattern=arguments.get("repo_pattern"),
|
| 447 |
+
path_pattern=arguments.get("path_pattern"),
|
| 448 |
+
regex=arguments.get("regex", False),
|
| 449 |
+
max_results=arguments.get("max_results", 20),
|
| 450 |
+
)
|
| 451 |
+
return result["formatted"], not result.get("isError", False)
|
| 452 |
+
except Exception as e:
|
| 453 |
+
return f"Error searching code: {str(e)}", False
|
agent/tools/jobs_tool.py
CHANGED
|
@@ -7,6 +7,7 @@ Refactored to use official huggingface-hub library instead of custom HTTP client
|
|
| 7 |
import asyncio
|
| 8 |
import base64
|
| 9 |
import os
|
|
|
|
| 10 |
from typing import Any, Dict, Literal, Optional
|
| 11 |
|
| 12 |
from huggingface_hub import HfApi
|
|
@@ -40,6 +41,20 @@ GPU_FLAVORS = [
|
|
| 40 |
"h100",
|
| 41 |
"h100x8",
|
| 42 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
SPECIALIZED_FLAVORS = ["inf2x6"]
|
| 44 |
ALL_FLAVORS = CPU_FLAVORS + GPU_FLAVORS + SPECIALIZED_FLAVORS
|
| 45 |
|
|
@@ -62,6 +77,44 @@ OperationType = Literal[
|
|
| 62 |
UV_DEFAULT_IMAGE = "ghcr.io/astral-sh/uv:python3.12-bookworm"
|
| 63 |
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
def _add_environment_variables(params: Dict[str, Any] | None) -> Dict[str, Any]:
|
| 66 |
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN") or ""
|
| 67 |
|
|
@@ -375,8 +428,11 @@ class HfJobsTool:
|
|
| 375 |
namespace=self.namespace,
|
| 376 |
)
|
| 377 |
|
|
|
|
|
|
|
|
|
|
| 378 |
# Format all logs for the agent
|
| 379 |
-
log_text = "\n".join(
|
| 380 |
|
| 381 |
response = f"""{job_type} job completed!
|
| 382 |
|
|
@@ -741,12 +797,12 @@ HF_JOBS_TOOL_SPEC = {
|
|
| 741 |
"1. **Python mode:** Provide 'script' + 'dependencies' → auto-handles pip install\n"
|
| 742 |
"2. **Docker mode:** Provide 'image' + 'command' → full control\n"
|
| 743 |
"(script and command are mutually exclusive)\n\n"
|
| 744 |
-
"## Hardware:\n"
|
| 745 |
-
"CPU:
|
| 746 |
-
"GPU:
|
| 747 |
"## Examples:\n\n"
|
| 748 |
"**Fine-tune LLM and push to Hub:**\n"
|
| 749 |
-
"{'operation': 'run', 'script': 'from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer\\nmodel = AutoModelForCausalLM.from_pretrained(\"
|
| 750 |
"**Generate dataset daily and upload:**\n"
|
| 751 |
"{'operation': 'scheduled run', 'script': 'from datasets import Dataset\\nimport pandas as pd\\n# scrape/generate data\\ndf = pd.DataFrame(data)\\nds = Dataset.from_pandas(df)\\nds.push_to_hub(\"user-name/daily-dataset\")', 'dependencies': ['datasets', 'pandas'], 'schedule': '@daily'}\n\n"
|
| 752 |
"**Run custom training with Docker:**\n"
|
|
@@ -807,7 +863,7 @@ HF_JOBS_TOOL_SPEC = {
|
|
| 807 |
# Hardware and environment
|
| 808 |
"hardware_flavor": {
|
| 809 |
"type": "string",
|
| 810 |
-
"description": "Hardware type. CPU:
|
| 811 |
},
|
| 812 |
"timeout": {
|
| 813 |
"type": "string",
|
|
|
|
| 7 |
import asyncio
|
| 8 |
import base64
|
| 9 |
import os
|
| 10 |
+
import re
|
| 11 |
from typing import Any, Dict, Literal, Optional
|
| 12 |
|
| 13 |
from huggingface_hub import HfApi
|
|
|
|
| 41 |
"h100",
|
| 42 |
"h100x8",
|
| 43 |
]
|
| 44 |
+
|
| 45 |
+
# Detailed specs for display (vCPU/RAM/GPU VRAM)
|
| 46 |
+
CPU_FLAVORS_DESC = (
|
| 47 |
+
"cpu-basic(2vCPU/16GB), cpu-upgrade(8vCPU/32GB), cpu-performance, cpu-xl"
|
| 48 |
+
)
|
| 49 |
+
GPU_FLAVORS_DESC = (
|
| 50 |
+
"t4-small(4vCPU/15GB/GPU 16GB), t4-medium(8vCPU/30GB/GPU 16GB), "
|
| 51 |
+
"l4x1(8vCPU/30GB/GPU 24GB), l4x4(48vCPU/186GB/GPU 96GB), "
|
| 52 |
+
"l40sx1(8vCPU/62GB/GPU 48GB), l40sx4(48vCPU/382GB/GPU 192GB), l40sx8(192vCPU/1534GB/GPU 384GB), "
|
| 53 |
+
"a10g-small(4vCPU/14GB/GPU 24GB), a10g-large(12vCPU/46GB/GPU 24GB), "
|
| 54 |
+
"a10g-largex2(24vCPU/92GB/GPU 48GB), a10g-largex4(48vCPU/184GB/GPU 96GB), "
|
| 55 |
+
"a100-large(12vCPU/142GB/GPU 80GB), h100(23vCPU/240GB/GPU 80GB), h100x8(184vCPU/1920GB/GPU 640GB), "
|
| 56 |
+
"zero-a10g(dynamic alloc)"
|
| 57 |
+
)
|
| 58 |
SPECIALIZED_FLAVORS = ["inf2x6"]
|
| 59 |
ALL_FLAVORS = CPU_FLAVORS + GPU_FLAVORS + SPECIALIZED_FLAVORS
|
| 60 |
|
|
|
|
| 77 |
UV_DEFAULT_IMAGE = "ghcr.io/astral-sh/uv:python3.12-bookworm"
|
| 78 |
|
| 79 |
|
| 80 |
+
def _filter_uv_install_output(logs: list[str]) -> list[str]:
|
| 81 |
+
"""
|
| 82 |
+
Filter out UV package installation output from logs.
|
| 83 |
+
|
| 84 |
+
Replaces installation details with "[installs truncated]" and keeps
|
| 85 |
+
the "Installed X packages in Y ms/s" summary line.
|
| 86 |
+
|
| 87 |
+
Args:
|
| 88 |
+
logs: List of log lines
|
| 89 |
+
|
| 90 |
+
Returns:
|
| 91 |
+
Filtered list of log lines
|
| 92 |
+
"""
|
| 93 |
+
if not logs:
|
| 94 |
+
return logs
|
| 95 |
+
|
| 96 |
+
# Regex pattern to match: "Installed X packages in Y ms" or "Installed X package in Y s"
|
| 97 |
+
install_pattern = re.compile(
|
| 98 |
+
r"^Installed\s+\d+\s+packages?\s+in\s+\d+(?:\.\d+)?\s*(?:ms|s)$"
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
# Find the index of the "Installed X packages" line
|
| 102 |
+
install_line_idx = None
|
| 103 |
+
for idx, line in enumerate(logs):
|
| 104 |
+
if install_pattern.match(line.strip()):
|
| 105 |
+
install_line_idx = idx
|
| 106 |
+
break
|
| 107 |
+
|
| 108 |
+
# If pattern found, replace installation details with truncation message
|
| 109 |
+
if install_line_idx is not None and install_line_idx > 0:
|
| 110 |
+
# Keep logs from the "Installed X packages" line onward
|
| 111 |
+
# Add truncation message before the "Installed" line
|
| 112 |
+
return ["[installs truncated]"] + logs[install_line_idx:]
|
| 113 |
+
|
| 114 |
+
# If pattern not found, return original logs
|
| 115 |
+
return logs
|
| 116 |
+
|
| 117 |
+
|
| 118 |
def _add_environment_variables(params: Dict[str, Any] | None) -> Dict[str, Any]:
|
| 119 |
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN") or ""
|
| 120 |
|
|
|
|
| 428 |
namespace=self.namespace,
|
| 429 |
)
|
| 430 |
|
| 431 |
+
# Filter out UV package installation output
|
| 432 |
+
filtered_logs = _filter_uv_install_output(all_logs)
|
| 433 |
+
|
| 434 |
# Format all logs for the agent
|
| 435 |
+
log_text = "\n".join(filtered_logs) if filtered_logs else "(no logs)"
|
| 436 |
|
| 437 |
response = f"""{job_type} job completed!
|
| 438 |
|
|
|
|
| 797 |
"1. **Python mode:** Provide 'script' + 'dependencies' → auto-handles pip install\n"
|
| 798 |
"2. **Docker mode:** Provide 'image' + 'command' → full control\n"
|
| 799 |
"(script and command are mutually exclusive)\n\n"
|
| 800 |
+
"## Available Hardware (vCPU/RAM/GPU):\n"
|
| 801 |
+
f"CPU: {CPU_FLAVORS_DESC}\n"
|
| 802 |
+
f"GPU: {GPU_FLAVORS_DESC}\n"
|
| 803 |
"## Examples:\n\n"
|
| 804 |
"**Fine-tune LLM and push to Hub:**\n"
|
| 805 |
+
"{'operation': 'run', 'script': 'from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer\\nmodel = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen3-4B-Thinking-2507\")\\n# ... training code ...\\nmodel.push_to_hub(\"user-name/my-finetuned-model\")', 'dependencies': ['transformers', 'torch', 'datasets'], 'hardware_flavor': 'a10g-large', 'timeout': '4h', 'env': {'CUSTOM_VAR': 'value'}}\n\n"
|
| 806 |
"**Generate dataset daily and upload:**\n"
|
| 807 |
"{'operation': 'scheduled run', 'script': 'from datasets import Dataset\\nimport pandas as pd\\n# scrape/generate data\\ndf = pd.DataFrame(data)\\nds = Dataset.from_pandas(df)\\nds.push_to_hub(\"user-name/daily-dataset\")', 'dependencies': ['datasets', 'pandas'], 'schedule': '@daily'}\n\n"
|
| 808 |
"**Run custom training with Docker:**\n"
|
|
|
|
| 863 |
# Hardware and environment
|
| 864 |
"hardware_flavor": {
|
| 865 |
"type": "string",
|
| 866 |
+
"description": f"Hardware type. Available CPU flavors: {CPU_FLAVORS}. Available GPU flavors: {GPU_FLAVORS}. Use with 'run'/'scheduled run'.",
|
| 867 |
},
|
| 868 |
"timeout": {
|
| 869 |
"type": "string",
|
agent/tools/utilities.py
CHANGED
|
@@ -2,8 +2,10 @@
|
|
| 2 |
Utility functions for Hugging Face tools
|
| 3 |
|
| 4 |
Ported from: hf-mcp-server/packages/mcp/src/jobs/formatters.ts
|
|
|
|
| 5 |
"""
|
| 6 |
|
|
|
|
| 7 |
from datetime import datetime
|
| 8 |
from typing import Any, Dict, List, Optional
|
| 9 |
|
|
@@ -126,7 +128,6 @@ def format_scheduled_jobs_table(jobs: List[Dict[str, Any]]) -> str:
|
|
| 126 |
|
| 127 |
def format_job_details(jobs: Any) -> str:
|
| 128 |
"""Format job details as JSON in a markdown code block"""
|
| 129 |
-
import json
|
| 130 |
|
| 131 |
job_array = jobs if isinstance(jobs, list) else [jobs]
|
| 132 |
json_str = json.dumps(job_array, indent=2)
|
|
@@ -135,7 +136,6 @@ def format_job_details(jobs: Any) -> str:
|
|
| 135 |
|
| 136 |
def format_scheduled_job_details(jobs: Any) -> str:
|
| 137 |
"""Format scheduled job details as JSON in a markdown code block"""
|
| 138 |
-
import json
|
| 139 |
|
| 140 |
job_array = jobs if isinstance(jobs, list) else [jobs]
|
| 141 |
json_str = json.dumps(job_array, indent=2)
|
|
|
|
| 2 |
Utility functions for Hugging Face tools
|
| 3 |
|
| 4 |
Ported from: hf-mcp-server/packages/mcp/src/jobs/formatters.ts
|
| 5 |
+
Includes GPU memory validation for job submissions
|
| 6 |
"""
|
| 7 |
|
| 8 |
+
import json
|
| 9 |
from datetime import datetime
|
| 10 |
from typing import Any, Dict, List, Optional
|
| 11 |
|
|
|
|
| 128 |
|
| 129 |
def format_job_details(jobs: Any) -> str:
|
| 130 |
"""Format job details as JSON in a markdown code block"""
|
|
|
|
| 131 |
|
| 132 |
job_array = jobs if isinstance(jobs, list) else [jobs]
|
| 133 |
json_str = json.dumps(job_array, indent=2)
|
|
|
|
| 136 |
|
| 137 |
def format_scheduled_job_details(jobs: Any) -> str:
|
| 138 |
"""Format scheduled job details as JSON in a markdown code block"""
|
|
|
|
| 139 |
|
| 140 |
job_array = jobs if isinstance(jobs, list) else [jobs]
|
| 141 |
json_str = json.dumps(job_array, indent=2)
|
agent/tools/utils_tools.py
CHANGED
|
@@ -4,14 +4,9 @@ Utils Tools - General utility operations
|
|
| 4 |
Provides system information like current date/time with timezone support.
|
| 5 |
"""
|
| 6 |
|
| 7 |
-
import
|
| 8 |
from datetime import datetime
|
| 9 |
-
from typing import Any, Dict, Literal
|
| 10 |
-
|
| 11 |
-
try:
|
| 12 |
-
import zoneinfo
|
| 13 |
-
except ImportError:
|
| 14 |
-
from backports import zoneinfo
|
| 15 |
|
| 16 |
from agent.tools.types import ToolResult
|
| 17 |
|
|
@@ -123,7 +118,9 @@ Common timezones: Europe/Paris, America/New_York, America/Los_Angeles, Asia/Toky
|
|
| 123 |
date_str = now.strftime("%d-%m-%Y")
|
| 124 |
|
| 125 |
# Format time as HH:MM:SS.mmm
|
| 126 |
-
time_str = now.strftime("%H:%M:%S.%f")[
|
|
|
|
|
|
|
| 127 |
|
| 128 |
# Get timezone abbreviation/offset
|
| 129 |
tz_offset = now.strftime("%z")
|
|
|
|
| 4 |
Provides system information like current date/time with timezone support.
|
| 5 |
"""
|
| 6 |
|
| 7 |
+
import zoneinfo
|
| 8 |
from datetime import datetime
|
| 9 |
+
from typing import Any, Dict, Literal
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
from agent.tools.types import ToolResult
|
| 12 |
|
|
|
|
| 118 |
date_str = now.strftime("%d-%m-%Y")
|
| 119 |
|
| 120 |
# Format time as HH:MM:SS.mmm
|
| 121 |
+
time_str = now.strftime("%H:%M:%S.%f")[
|
| 122 |
+
:-3
|
| 123 |
+
] # Remove last 3 digits to keep only milliseconds
|
| 124 |
|
| 125 |
# Get timezone abbreviation/offset
|
| 126 |
tz_offset = now.strftime("%z")
|
configs/main_agent_config.json
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
{
|
| 2 |
-
"model_name": "anthropic/claude-
|
|
|
|
|
|
|
| 3 |
"mcpServers": {
|
| 4 |
"hf-mcp-server": {
|
| 5 |
"transport": "http",
|
|
|
|
| 1 |
{
|
| 2 |
+
"model_name": "anthropic/claude-opus-4-5-20251101",
|
| 3 |
+
"save_sessions": true,
|
| 4 |
+
"session_dataset_repo": "smolagents/hf-agent-sessions",
|
| 5 |
"mcpServers": {
|
| 6 |
"hf-mcp-server": {
|
| 7 |
"transport": "http",
|
pyproject.toml
CHANGED
|
@@ -5,22 +5,41 @@ description = "Add your description here"
|
|
| 5 |
readme = "README.md"
|
| 6 |
requires-python = ">=3.12"
|
| 7 |
dependencies = [
|
| 8 |
-
"
|
| 9 |
-
|
| 10 |
"pydantic>=2.12.3",
|
| 11 |
-
"litellm>=1.0.0",
|
| 12 |
-
"tenacity>=8.0.0",
|
| 13 |
-
"pandas>=2.3.3",
|
| 14 |
"python-dotenv>=1.2.1",
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
"huggingface-hub>=1.0.1",
|
| 17 |
"fastmcp>=2.4.0",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
"inspect-ai>=0.3.149",
|
| 19 |
-
"
|
| 20 |
-
"
|
| 21 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
"pytest>=9.0.2",
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
| 26 |
]
|
|
|
|
| 5 |
readme = "README.md"
|
| 6 |
requires-python = ">=3.12"
|
| 7 |
dependencies = [
|
| 8 |
+
"datasets>=4.4.1",
|
| 9 |
+
# Core dependencies (always required)
|
| 10 |
"pydantic>=2.12.3",
|
|
|
|
|
|
|
|
|
|
| 11 |
"python-dotenv>=1.2.1",
|
| 12 |
+
]
|
| 13 |
+
|
| 14 |
+
[project.optional-dependencies]
|
| 15 |
+
# Agent runtime dependencies
|
| 16 |
+
agent = [
|
| 17 |
+
"requests>=2.32.5",
|
| 18 |
+
"litellm>=1.0.0",
|
| 19 |
"huggingface-hub>=1.0.1",
|
| 20 |
"fastmcp>=2.4.0",
|
| 21 |
+
"lmnr>=0.7.23", # Note: Using base package to avoid torch/transformers from [all] extra
|
| 22 |
+
"prompt-toolkit>=3.0.0",
|
| 23 |
+
"thefuzz>=0.22.1",
|
| 24 |
+
"nbconvert>=7.16.6",
|
| 25 |
+
"nbformat>=5.10.4",
|
| 26 |
+
"datasets>=4.3.0", # For session logging to HF datasets
|
| 27 |
+
]
|
| 28 |
+
|
| 29 |
+
# Evaluation/benchmarking dependencies
|
| 30 |
+
eval = [
|
| 31 |
"inspect-ai>=0.3.149",
|
| 32 |
+
"pandas>=2.3.3",
|
| 33 |
+
"datasets>=4.3.0",
|
| 34 |
+
"tenacity>=8.0.0",
|
| 35 |
+
]
|
| 36 |
+
|
| 37 |
+
# Development and testing dependencies
|
| 38 |
+
dev = [
|
| 39 |
"pytest>=9.0.2",
|
| 40 |
+
]
|
| 41 |
+
|
| 42 |
+
# All dependencies (agent + eval + dev)
|
| 43 |
+
all = [
|
| 44 |
+
"hf-agent[agent,eval,dev]",
|
| 45 |
]
|
tests/unit/tools/test_jobs_tool.py
CHANGED
|
@@ -452,3 +452,86 @@ async def test_list_jobs_with_status_filter():
|
|
| 452 |
assert "job-3" in result["formatted"]
|
| 453 |
assert "job-1" not in result["formatted"]
|
| 454 |
assert result["resultsShared"] == 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
assert "job-3" in result["formatted"]
|
| 453 |
assert "job-1" not in result["formatted"]
|
| 454 |
assert result["resultsShared"] == 1
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
def test_filter_uv_install_output():
|
| 458 |
+
"""Test filtering of UV package installation output"""
|
| 459 |
+
from agent.tools.jobs_tool import _filter_uv_install_output
|
| 460 |
+
|
| 461 |
+
# Test case 1: Logs with UV installation output
|
| 462 |
+
logs_with_install = [
|
| 463 |
+
"Resolved 68 packages in 1.01s",
|
| 464 |
+
"Installed 68 packages in 251ms",
|
| 465 |
+
"Hello from the script!",
|
| 466 |
+
"Script execution completed",
|
| 467 |
+
]
|
| 468 |
+
|
| 469 |
+
filtered = _filter_uv_install_output(logs_with_install)
|
| 470 |
+
assert len(filtered) == 4
|
| 471 |
+
assert filtered[0] == "[installs truncated]"
|
| 472 |
+
assert filtered[1] == "Installed 68 packages in 251ms"
|
| 473 |
+
assert filtered[2] == "Hello from the script!"
|
| 474 |
+
assert filtered[3] == "Script execution completed"
|
| 475 |
+
|
| 476 |
+
# Test case 2: Logs without UV installation output
|
| 477 |
+
logs_without_install = [
|
| 478 |
+
"Script started",
|
| 479 |
+
"Processing data...",
|
| 480 |
+
"Done!",
|
| 481 |
+
]
|
| 482 |
+
|
| 483 |
+
filtered = _filter_uv_install_output(logs_without_install)
|
| 484 |
+
assert len(filtered) == 3
|
| 485 |
+
assert filtered == logs_without_install
|
| 486 |
+
|
| 487 |
+
# Test case 3: Empty logs
|
| 488 |
+
assert _filter_uv_install_output([]) == []
|
| 489 |
+
|
| 490 |
+
# Test case 4: Different time formats (ms vs s)
|
| 491 |
+
logs_with_seconds = [
|
| 492 |
+
"Downloading packages...",
|
| 493 |
+
"Installed 10 packages in 2s",
|
| 494 |
+
"Running main.py",
|
| 495 |
+
]
|
| 496 |
+
|
| 497 |
+
filtered = _filter_uv_install_output(logs_with_seconds)
|
| 498 |
+
assert len(filtered) == 3
|
| 499 |
+
assert filtered[0] == "[installs truncated]"
|
| 500 |
+
assert filtered[1] == "Installed 10 packages in 2s"
|
| 501 |
+
assert filtered[2] == "Running main.py"
|
| 502 |
+
|
| 503 |
+
# Test case 5: Single package
|
| 504 |
+
logs_single_package = [
|
| 505 |
+
"Resolving dependencies",
|
| 506 |
+
"Installed 1 package in 50ms",
|
| 507 |
+
"Import successful",
|
| 508 |
+
]
|
| 509 |
+
|
| 510 |
+
filtered = _filter_uv_install_output(logs_single_package)
|
| 511 |
+
assert len(filtered) == 3
|
| 512 |
+
assert filtered[0] == "[installs truncated]"
|
| 513 |
+
assert filtered[1] == "Installed 1 package in 50ms"
|
| 514 |
+
assert filtered[2] == "Import successful"
|
| 515 |
+
|
| 516 |
+
# Test case 6: Decimal time values
|
| 517 |
+
logs_decimal_time = [
|
| 518 |
+
"Starting installation",
|
| 519 |
+
"Installed 25 packages in 125.5ms",
|
| 520 |
+
"All dependencies ready",
|
| 521 |
+
]
|
| 522 |
+
|
| 523 |
+
filtered = _filter_uv_install_output(logs_decimal_time)
|
| 524 |
+
assert len(filtered) == 3
|
| 525 |
+
assert filtered[0] == "[installs truncated]"
|
| 526 |
+
assert filtered[1] == "Installed 25 packages in 125.5ms"
|
| 527 |
+
assert filtered[2] == "All dependencies ready"
|
| 528 |
+
|
| 529 |
+
# Test case 7: "Installed" line is first (no truncation needed)
|
| 530 |
+
logs_install_first = [
|
| 531 |
+
"Installed 5 packages in 100ms",
|
| 532 |
+
"Running script...",
|
| 533 |
+
]
|
| 534 |
+
|
| 535 |
+
filtered = _filter_uv_install_output(logs_install_first)
|
| 536 |
+
# No truncation message if "Installed" is the first line
|
| 537 |
+
assert filtered == logs_install_first
|
uv.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|