Upload folder using huggingface_hub

#1
by chmielvu - opened
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Modules/AI_Web_Search.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AI Web Search Module.
3
+
4
+ AI-powered web search using Pollinations API with Perplexity and Gemini models.
5
+ Query optimization is ALWAYS enabled for best results.
6
+
7
+ Depth levels:
8
+ - fast: Gemini with Google Search - Quick, reliable answers
9
+ - normal: Perplexity Sonar - Balanced speed and quality
10
+ - deep: Perplexity Sonar Reasoning - Deep analysis with reasoning chain
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from typing import Annotated, Literal
16
+
17
+ import gradio as gr
18
+
19
+ from app import _log_call_end, _log_call_start, _truncate_for_log
20
+ from ._docstrings import autodoc
21
+ from ._pollinations_client import PollinationsClient
22
+ from ._query_optimizer import get_optimizer
23
+
24
+
25
# Single source of truth for the LLM-facing tool description.
# The same text is handed to @autodoc as the tool summary and to
# gr.Interface as api_description, so the two cannot drift apart.
TOOL_SUMMARY = (
    "AI-powered web search using Perplexity or Gemini with built-in web search. "
    "Returns synthesized answers with source citations. "
    "Use for complex questions requiring current information and analysis. "
    "Query optimization is automatically applied for best results."
)
32
+
33
+
34
def _render_ai_search_result(
    result_data: dict,
    depth: str,
    query: str,
    original_query: str,
    optimization_metadata: dict | None,
) -> str:
    """Render a search response as the plain-text report returned to the caller."""
    lines: list[str] = []

    # Only mention the rewrite when the optimizer reports that it changed the query.
    if optimization_metadata and optimization_metadata.get("original_query") != optimization_metadata.get("optimized_query"):
        lines.append(f"Optimized query: {query}")
        lines.append(f"Original query: {original_query}")
        lines.append(f"Optimizer: {optimization_metadata.get('provider', 'unknown')}")
        lines.append("")

    lines.append(f"Query: {result_data['query']}")
    lines.append(f"Model: {result_data['model']}")
    lines.append(f"Depth: {depth}")
    lines.append("")
    lines.append("Answer:")
    lines.append(result_data["answer"] or "No answer generated.")

    lines.append("")
    if result_data["sources"]:
        lines.append("Sources:")
        lines.extend(f" {i}. {source}" for i, source in enumerate(result_data["sources"], 1))
    else:
        lines.append("(No sources provided)")

    return "\n".join(lines)


@autodoc(
    summary=TOOL_SUMMARY,
)
def AI_Web_Search(
    query: Annotated[str, "The search query or question."],
    depth: Annotated[
        Literal["fast", "normal", "deep"],
        "Search depth: 'fast' (Gemini + Google Search), 'normal' (Perplexity Sonar), 'deep' (Perplexity Sonar Reasoning).",
    ] = "normal",
    detailed: Annotated[bool, "Request a comprehensive answer with more detail."] = False,
) -> str:
    """
    AI-powered web search with automatic query optimization.

    Uses Pollinations API to access AI search models with built-in web search:
    - fast: Gemini with Google Search - Best for quick facts
    - normal: Perplexity Sonar - Balanced for general research
    - deep: Perplexity Sonar Reasoning - Best for complex analysis

    Query optimization is ALWAYS ON - queries are automatically optimized
    for AI search using SC-CoT (Mistral → HF fallback chain).

    Returns a synthesized answer with numbered citations and source URLs.
    """
    _log_call_start("AI_Web_Search", query=query, depth=depth, detailed=detailed)

    # Reject blank input before spending any remote calls on it.
    if not query or not query.strip():
        result = "No search query provided. Please enter a question or search term."
        _log_call_end("AI_Web_Search", _truncate_for_log(result))
        return result

    # ALWAYS optimize the query for AI search. This step is best-effort: on any
    # failure the original query is kept and the search still runs.
    original_query = query
    optimization_metadata = None
    try:
        optimizer = get_optimizer()
        query, optimization_metadata = optimizer.optimize_for_ai_search(query)
    except Exception as exc:
        print(f"[AI_Web_Search] Query optimization failed: {exc}", flush=True)

    # The tool never raises: any failure while searching or rendering is
    # returned to the caller as a "Search failed: ..." message.
    try:
        client = PollinationsClient()
        result_data = client.web_search_sync(query, depth, detailed)
        result = _render_ai_search_result(
            result_data, depth, query, original_query, optimization_metadata
        )
    except Exception as exc:
        result = f"Search failed: {exc}"

    # Every exit path truncates what it logs; the failure path used to log the
    # raw (potentially very long) error text.
    _log_call_end("AI_Web_Search", _truncate_for_log(result))
    return result
113
+
114
+
115
def build_interface() -> gr.Interface:
    """Assemble the Gradio interface that exposes AI_Web_Search.

    Three inputs (query text, depth selector, detail toggle) feed the tool
    function; the answer is shown in a read-only text area.
    """
    query_input = gr.Textbox(
        label="Query",
        placeholder="Ask a question or enter a search topic...",
        max_lines=3,
        info="Your question or search query (will be optimized automatically)",
    )
    depth_input = gr.Radio(
        label="Search Depth",
        choices=["fast", "normal", "deep"],
        value="normal",
        info="fast: Gemini + Google Search | normal: Perplexity Sonar | deep: Perplexity Reasoning",
    )
    detailed_input = gr.Checkbox(
        label="Detailed Answer",
        value=False,
        info="Request a comprehensive answer with more detail",
    )
    results_output = gr.Textbox(
        label="AI Search Results",
        interactive=False,
        lines=20,
        max_lines=30,
    )
    description_html = (
        "<div style='text-align:center'>"
        "AI-powered web search with automatic query optimization. "
        "Uses Perplexity Sonar or Gemini with built-in web search to provide "
        "direct answers with source citations. Query optimization is always enabled."
        "</div>"
    )

    return gr.Interface(
        fn=AI_Web_Search,
        inputs=[query_input, depth_input, detailed_input],
        outputs=results_output,
        title="AI Web Search",
        description=description_html,
        api_description=TOOL_SUMMARY,
        flagging_mode="never",
        submit_btn="Search",
    )
156
+
157
+
158
+ __all__ = ["AI_Web_Search", "build_interface"]
Modules/Agent_Skills.py CHANGED
@@ -1,788 +1,788 @@
1
"""
Agent Skills Module for Nymbo-Tools MCP Server.

Provides structured skill discovery, activation, validation, and resource access
following the Agent Skills specification (https://agentskills.io).

Skills are directories containing a SKILL.md file with YAML frontmatter (name, description)
and Markdown instructions. This tool enables agents to efficiently discover and use skills
through progressive disclosure: low-token metadata discovery, on-demand full activation,
and targeted resource access.
"""

# The docstring must be the first statement of the module to become __doc__;
# a __future__ import is still valid directly after it.
from __future__ import annotations
14
-
15
- import json
16
- import os
17
- import re
18
- import unicodedata
19
- from pathlib import Path
20
- from typing import Annotated, Optional
21
-
22
- import gradio as gr
23
-
24
- from app import _log_call_end, _log_call_start, _truncate_for_log
25
- from ._docstrings import autodoc
26
- from .File_System import ROOT_DIR, _display_path
27
-
28
-
29
- # ---------------------------------------------------------------------------
30
- # Constants
31
- # ---------------------------------------------------------------------------
32
-
33
SKILLS_SUBDIR = "Skills"  # Subdirectory under ROOT_DIR containing skills

# Length limits enforced by the _validate_* helpers; exceeding one is reported
# as a validation error, the value is never truncated.
MAX_SKILL_NAME_LENGTH = 64
MAX_DESCRIPTION_LENGTH = 1024
MAX_COMPATIBILITY_LENGTH = 500

# Frontmatter keys accepted in SKILL.md; _validate_skill reports any other key
# as an unexpected field.
ALLOWED_FRONTMATTER_FIELDS = {
    "name",
    "description",
    "license",
    "allowed-tools",
    "metadata",
    "compatibility",
}

# Tool description handed to @autodoc on the Agent_Skills entry point.
TOOL_SUMMARY = (
    "Discover, inspect, validate, and access Agent Skills. "
    "Actions: discover (list all skills), info (get SKILL.md contents), "
    "resources (list/read bundled files), validate (check format), search (find by keyword). "
    "Skills provide structured instructions for specialized tasks. "
    "Use in combination with the `Shell_Command` and `File_System` tools."
)

# Text returned verbatim for action="help".
HELP_TEXT = """\
Agent Skills — actions and usage

Skills are directories containing a SKILL.md file with YAML frontmatter (name, description)
and Markdown instructions. They live under /Skills/ in the filesystem root.

Actions:
- discover: List all available skills with their metadata (name, description, location)
- info: Get the full contents of a specific skill's SKILL.md file
- resources: List or read files within a skill's bundled directories (scripts/, references/, assets/)
- validate: Check if a skill conforms to the Agent Skills specification
- search: Find skills by keyword in name or description
- help: Show this guide

Examples:
- Discover all skills: action="discover"
- Get skill info: action="info", skill_name="pdf"
- List skill resources: action="resources", skill_name="mcp-builder"
- Read a resource: action="resources", skill_name="pdf", resource_path="references/forms.md"
- Validate a skill: action="validate", skill_name="pdf"
- Search for skills: action="search", query="MCP"
"""
77
-
78
-
79
- # ---------------------------------------------------------------------------
80
- # Skills Root Resolution
81
- # ---------------------------------------------------------------------------
82
-
83
def _get_skills_root() -> Path:
    """Return the directory that holds all skills.

    A non-blank ``NYMBO_SKILLS_ROOT`` environment variable takes precedence and
    is returned as a resolved path; otherwise ``ROOT_DIR / SKILLS_SUBDIR`` is used.
    """
    override = os.getenv("NYMBO_SKILLS_ROOT")
    cleaned = override.strip() if override else ""
    if not cleaned:
        return Path(ROOT_DIR) / SKILLS_SUBDIR
    return Path(cleaned).resolve()
89
-
90
- # Import _fmt_size from shared utility instead of duplicating
91
- from ._core import _fmt_size
92
-
93
-
94
- # ---------------------------------------------------------------------------
95
- # YAML Frontmatter Parsing (adapted from skills_ref/parser.py)
96
- # ---------------------------------------------------------------------------
97
-
98
class ParseError(Exception):
    """Signals that a SKILL.md file could not be parsed."""
101
-
102
-
103
class ValidationError(Exception):
    """Signals that a skill failed validation.

    ``errors`` carries the individual problems; when the caller supplies none,
    it defaults to a one-element list holding ``message``.
    """

    def __init__(self, message: str, errors: list[str] | None = None):
        super().__init__(message)
        if errors is None:
            errors = [message]
        self.errors = errors
108
-
109
-
110
def _parse_frontmatter(content: str) -> tuple[dict, str]:
    """
    Parse YAML frontmatter from SKILL.md content.

    Only a small YAML subset is understood: top-level ``key: value`` pairs
    (surrounding quotes removed) and one level of indented ``key: value``
    pairs under a ``metadata:`` line.

    Returns (metadata dict, markdown body).
    Raises ParseError if frontmatter is missing or invalid.
    """
    if not content.startswith("---"):
        raise ParseError("SKILL.md must start with YAML frontmatter (---)")

    remainder = content[3:]

    # The closing delimiter is a line consisting solely of '---'. Splitting on
    # the first '---' substring would cut the frontmatter short whenever a
    # value (e.g. a description) happens to contain three dashes.
    closing = re.search(r"^---[ \t]*\r?$", remainder, flags=re.MULTILINE)
    if closing is not None:
        frontmatter_str = remainder[: closing.start()]
        body = remainder[closing.end():].strip()
    else:
        # Backward-compatible fallback: accept an inline '---' terminator.
        frontmatter_str, delimiter, tail = remainder.partition("---")
        if not delimiter:
            raise ParseError("SKILL.md frontmatter not properly closed with ---")
        body = tail.strip()

    # Simple YAML parsing without external dependency
    metadata: dict = {}
    in_metadata_block = False
    metadata_dict: dict = {}

    for line in frontmatter_str.strip().split("\n"):
        if not line.strip():
            continue

        if line.strip() == "metadata:":
            in_metadata_block = True
            continue

        if in_metadata_block:
            if line.startswith(" "):
                # Indented line: a nested entry of the metadata mapping.
                match = re.match(r"^\s+(\w+):\s*(.*)$", line)
                if match:
                    key = match.group(1).strip()
                    value = match.group(2).strip().strip('"').strip("'")
                    metadata_dict[key] = value
                continue
            # First unindented line ends the block; it is then parsed below
            # as an ordinary top-level key.
            in_metadata_block = False
            if metadata_dict:
                metadata["metadata"] = metadata_dict
                metadata_dict = {}

        match = re.match(r"^(\S+):\s*(.*)$", line)
        if match:
            key = match.group(1).strip()
            value = match.group(2).strip()
            if (value.startswith('"') and value.endswith('"')) or \
               (value.startswith("'") and value.endswith("'")):
                value = value[1:-1]
            metadata[key] = value

    # Frontmatter ended while still inside the metadata block.
    if in_metadata_block and metadata_dict:
        metadata["metadata"] = metadata_dict

    return metadata, body
167
-
168
-
169
def _find_skill_md(skill_dir: Path) -> Optional[Path]:
    """Locate the skill manifest inside ``skill_dir``.

    ``SKILL.md`` is tried before ``skill.md``; ``None`` is returned when
    neither file exists.
    """
    candidates = (skill_dir / filename for filename in ("SKILL.md", "skill.md"))
    return next((candidate for candidate in candidates if candidate.exists()), None)
176
-
177
-
178
- # ---------------------------------------------------------------------------
179
- # Skill Validation (adapted from skills_ref/validator.py)
180
- # ---------------------------------------------------------------------------
181
-
182
def _validate_name(name: str, skill_dir: Path) -> list[str]:
    """Check a skill name and, when ``skill_dir`` is given, that the directory matches.

    The name is stripped and NFKC-normalized before the rules run. Returns the
    list of problems found (empty when the name is acceptable); a missing or
    blank name short-circuits with a single error.
    """
    if not (name and isinstance(name, str) and name.strip()):
        return ["Field 'name' must be a non-empty string"]

    normalized = unicodedata.normalize("NFKC", name.strip())

    # (rule violated?, message) pairs, listed in reporting order.
    rules = (
        (
            len(normalized) > MAX_SKILL_NAME_LENGTH,
            f"Skill name '{normalized}' exceeds {MAX_SKILL_NAME_LENGTH} character limit ({len(normalized)} chars)",
        ),
        (
            normalized != normalized.lower(),
            f"Skill name '{normalized}' must be lowercase",
        ),
        (
            normalized.startswith("-") or normalized.endswith("-"),
            "Skill name cannot start or end with a hyphen",
        ),
        (
            "--" in normalized,
            "Skill name cannot contain consecutive hyphens",
        ),
        (
            any(not (ch.isalnum() or ch == "-") for ch in normalized),
            f"Skill name '{normalized}' contains invalid characters. Only letters, digits, and hyphens allowed.",
        ),
    )
    problems = [message for violated, message in rules if violated]

    if skill_dir:
        folder = unicodedata.normalize("NFKC", skill_dir.name)
        if folder != normalized:
            problems.append(f"Directory name '{skill_dir.name}' must match skill name '{normalized}'")

    return problems
213
-
214
-
215
def _validate_description(description: str) -> list[str]:
    """Check the description field; returns a list of problems (empty when fine).

    A missing, non-string or blank description yields one error; otherwise the
    only rule is the MAX_DESCRIPTION_LENGTH limit.
    """
    if not (description and isinstance(description, str) and description.strip()):
        return ["Field 'description' must be a non-empty string"]

    length = len(description)
    if length > MAX_DESCRIPTION_LENGTH:
        return [f"Description exceeds {MAX_DESCRIPTION_LENGTH} character limit ({length} chars)"]

    return []
227
-
228
-
229
def _validate_compatibility(compatibility: str) -> list[str]:
    """Check the optional compatibility field; returns a list of problems.

    The value must be a string no longer than MAX_COMPATIBILITY_LENGTH.
    """
    if not isinstance(compatibility, str):
        return ["Field 'compatibility' must be a string"]

    length = len(compatibility)
    if length > MAX_COMPATIBILITY_LENGTH:
        return [f"Compatibility exceeds {MAX_COMPATIBILITY_LENGTH} character limit ({length} chars)"]

    return []
241
-
242
-
243
def _validate_skill(skill_dir: Path) -> list[str]:
    """Validate one skill directory and return every problem found.

    An empty list means the skill is valid. Structural failures (missing path,
    not a directory, no SKILL.md, unreadable or unparsable file) are reported
    alone; field-level problems are accumulated.
    """
    if not skill_dir.exists():
        return [f"Path does not exist: {skill_dir}"]
    if not skill_dir.is_dir():
        return [f"Not a directory: {skill_dir}"]

    manifest = _find_skill_md(skill_dir)
    if manifest is None:
        return ["Missing required file: SKILL.md"]

    try:
        text = manifest.read_text(encoding="utf-8")
        metadata, _ = _parse_frontmatter(text)
    except ParseError as parse_problem:
        return [str(parse_problem)]
    except Exception as read_problem:
        return [f"Failed to read SKILL.md: {read_problem}"]

    problems = []

    unknown = set(metadata.keys()) - ALLOWED_FRONTMATTER_FIELDS
    if unknown:
        problems.append(f"Unexpected fields in frontmatter: {', '.join(sorted(unknown))}")

    # Required fields, each paired with the checker applied when present.
    required = (
        ("name", lambda value: _validate_name(value, skill_dir)),
        ("description", _validate_description),
    )
    for field, checker in required:
        if field in metadata:
            problems.extend(checker(metadata[field]))
        else:
            problems.append(f"Missing required field: {field}")

    if "compatibility" in metadata:
        problems.extend(_validate_compatibility(metadata["compatibility"]))

    return problems
283
-
284
-
285
- # ---------------------------------------------------------------------------
286
- # Skill Discovery and Info
287
- # ---------------------------------------------------------------------------
288
-
289
def _read_skill_properties(skill_dir: Path) -> dict:
    """Load a skill's frontmatter and body from its SKILL.md.

    Raises ParseError when SKILL.md is absent or malformed, and ValidationError
    when the ``name`` or ``description`` field is missing.
    """
    manifest = _find_skill_md(skill_dir)
    if manifest is None:
        raise ParseError(f"SKILL.md not found in {skill_dir}")

    metadata, body = _parse_frontmatter(manifest.read_text(encoding="utf-8"))

    for required in ("name", "description"):
        if required not in metadata:
            raise ValidationError(f"Missing required field: {required}")

    properties = {
        "name": metadata.get("name", "").strip(),
        "description": metadata.get("description", "").strip(),
    }
    properties["license"] = metadata.get("license")
    properties["compatibility"] = metadata.get("compatibility")
    properties["allowed_tools"] = metadata.get("allowed-tools")
    properties["metadata"] = metadata.get("metadata", {})
    properties["location"] = str(manifest)
    properties["body"] = body
    return properties
313
-
314
-
315
def _discover_skills() -> list[dict]:
    """List the name, description and location of every readable skill.

    Only direct subdirectories of the skills root that contain a SKILL.md are
    considered; a skill whose properties cannot be read is skipped silently.
    """
    root = _get_skills_root()
    if not root.exists():
        return []

    found = []
    for entry in sorted(root.iterdir()):
        manifest = _find_skill_md(entry) if entry.is_dir() else None
        if manifest is None:
            continue

        # Discovery is best-effort: one broken skill must not hide the rest.
        try:
            props = _read_skill_properties(entry)
            record = {
                "name": props["name"],
                "description": props["description"],
                "location": _display_path(str(manifest)),
            }
        except Exception:
            continue
        found.append(record)

    return found
342
-
343
-
344
def _get_skill_info(skill_name: str, offset: int = 0, max_chars: int = 0) -> dict:
    """Get full information for a specific skill.

    Args:
        skill_name: Directory name of the skill, relative to the skills root.
        offset: Character offset into the SKILL.md body at which to start.
        max_chars: Maximum characters of body to return; 0 or less means no limit.

    Returns:
        Dict with the frontmatter fields, the display location, the requested
        body slice and paging data (offset, total_chars, truncated, next_cursor).

    Raises:
        FileNotFoundError: The skill or its SKILL.md does not exist, or
            ``skill_name`` points outside the skills root.
    """
    skills_root = _get_skills_root()
    skill_dir = skills_root / skill_name

    # skill_name comes from the tool caller. Reject names such as "../x" or an
    # absolute path that would leave the skills root. The comparison is lexical
    # (abspath, no symlink resolution) so skills symlinked into the root keep
    # working.
    root_abs = Path(os.path.abspath(skills_root))
    if root_abs not in Path(os.path.abspath(skill_dir)).parents:
        raise FileNotFoundError(f"Skill not found: {skill_name}")

    if not skill_dir.exists():
        raise FileNotFoundError(f"Skill not found: {skill_name}")

    skill_md = _find_skill_md(skill_dir)
    if skill_md is None:
        raise FileNotFoundError(f"SKILL.md not found in skill: {skill_name}")

    content = skill_md.read_text(encoding="utf-8")
    metadata, body = _parse_frontmatter(content)

    # Clamp the requested window to the body's bounds.
    total_chars = len(body)
    start = max(0, min(offset, total_chars))
    end = min(total_chars, start + max_chars) if max_chars > 0 else total_chars

    truncated = end < total_chars

    return {
        "name": metadata.get("name", "").strip(),
        "description": metadata.get("description", "").strip(),
        "license": metadata.get("license"),
        "compatibility": metadata.get("compatibility"),
        "allowed_tools": metadata.get("allowed-tools"),
        "metadata": metadata.get("metadata", {}),
        "location": _display_path(str(skill_md)),
        "body": body[start:end],
        "offset": start,
        "total_chars": total_chars,
        "truncated": truncated,
        # Cursor to pass as the next offset; None once the end is reached.
        "next_cursor": end if truncated else None,
    }
384
-
385
-
386
def _list_skill_resources(skill_name: str) -> dict:
    """List all resources within a skill directory.

    Dynamically discovers all subdirectories, not just predefined ones. The
    SKILL.md manifest itself is excluded from the listing.

    Returns:
        Dict with ``skill``, ``directories`` (subdirectory name -> list of
        ``{"path", "size"}`` entries, paths relative to that subdirectory) and
        ``other_files`` (files directly inside the skill directory).

    Raises:
        FileNotFoundError: The skill does not exist, or ``skill_name`` points
            outside the skills root.
    """
    skills_root = _get_skills_root()
    skill_dir = skills_root / skill_name

    # skill_name comes from the tool caller. Without this check "../.." would
    # produce a recursive listing of directories outside the skills root. The
    # comparison is lexical so skills symlinked into the root keep working.
    root_abs = Path(os.path.abspath(skills_root))
    if root_abs not in Path(os.path.abspath(skill_dir)).parents:
        raise FileNotFoundError(f"Skill not found: {skill_name}")

    if not skill_dir.exists():
        raise FileNotFoundError(f"Skill not found: {skill_name}")

    resources = {
        "skill": skill_name,
        "directories": {},  # Dynamic: dirname -> file list
        "other_files": [],
    }

    for item in sorted(skill_dir.iterdir()):
        if item.name.lower() == "skill.md":
            continue

        if item.is_dir():
            resources["directories"][item.name] = [
                {
                    "path": f.relative_to(item).as_posix(),
                    "size": f.stat().st_size,
                }
                for f in sorted(item.rglob("*"))
                if f.is_file()
            ]
        elif item.is_file():
            resources["other_files"].append({
                "path": item.name,
                "size": item.stat().st_size,
            })

    return resources
423
-
424
-
425
def _read_skill_resource(skill_name: str, resource_path: str, offset: int = 0, max_chars: int = 3000) -> dict:
    """Read a specific resource file from a skill.

    Args:
        skill_name: Directory name of the skill, relative to the skills root.
        resource_path: Path of the file, relative to the skill directory.
        offset: Character offset into the decoded text at which to start.
        max_chars: Maximum characters to return; 0 or less means no limit.

    Returns:
        Dict with the requested text slice, the file size in bytes and paging
        data (offset, total_chars, truncated, next_cursor).

    Raises:
        PermissionError: ``skill_name`` leaves the skills root or
            ``resource_path`` leaves the skill directory.
        FileNotFoundError: The skill or the resource does not exist.
        IsADirectoryError: ``resource_path`` names a directory.
    """
    skills_root = _get_skills_root()
    skill_dir = skills_root / skill_name

    # The resource check below is relative to skill_dir, so skill_dir itself
    # must first be pinned inside the skills root; otherwise "../.." as a skill
    # name turns this function into an arbitrary file reader. The comparison is
    # lexical so skills symlinked into the root keep working.
    root_abs = Path(os.path.abspath(skills_root))
    if root_abs not in Path(os.path.abspath(skill_dir)).parents:
        raise PermissionError(f"Skill name escapes skills directory: {skill_name}")

    if not skill_dir.exists():
        raise FileNotFoundError(f"Skill not found: {skill_name}")

    resource_file = skill_dir / resource_path

    try:
        resource_file.resolve().relative_to(skill_dir.resolve())
    except ValueError:
        raise PermissionError(f"Resource path escapes skill directory: {resource_path}") from None

    if not resource_file.exists():
        raise FileNotFoundError(f"Resource not found: {resource_path}")

    if resource_file.is_dir():
        raise IsADirectoryError(f"Path is a directory: {resource_path}")

    # Undecodable bytes are replaced rather than raising.
    content = resource_file.read_text(encoding="utf-8", errors="replace")
    total_chars = len(content)

    # Clamp the requested window to the content's bounds.
    start = max(0, min(offset, total_chars))
    end = min(total_chars, start + max_chars) if max_chars > 0 else total_chars

    truncated = end < total_chars

    return {
        "skill": skill_name,
        "resource": resource_path,
        "content": content[start:end],
        "size": resource_file.stat().st_size,
        "offset": start,
        "total_chars": total_chars,
        "truncated": truncated,
        # Cursor to pass as the next offset; None once the end is reached.
        "next_cursor": end if truncated else None,
    }
469
-
470
-
471
def _search_skills(query: str) -> list[dict]:
    """Return the discovered skills whose name or description contains ``query``.

    Matching is a case-insensitive substring test. Each hit is the discovery
    record plus ``match_in``, which is "name" whenever the name matched and
    "description" otherwise.
    """
    needle = query.lower()

    hits = []
    for skill in _discover_skills():
        in_name = needle in skill["name"].lower()
        in_description = needle in skill["description"].lower()
        if not (in_name or in_description):
            continue
        hit = dict(skill)
        hit["match_in"] = "name" if in_name else "description"
        hits.append(hit)

    return hits
488
-
489
-
490
- # ---------------------------------------------------------------------------
491
- # Human-Readable Output Formatters
492
- # ---------------------------------------------------------------------------
493
-
494
def _format_discover(skills: list[dict]) -> str:
    """Render the discovery listing: a header block followed by numbered skills.

    Descriptions longer than 100 characters are cut to 97 plus an ellipsis.
    """
    root_label = _display_path(str(_get_skills_root()))
    header = [
        f"Available Skills",
        f"Root: {root_label}",
        f"Total: {len(skills)} skills",
        "",
    ]

    if not skills:
        return "\n".join(header + ["No skills found."]).strip()

    body = []
    for index, skill in enumerate(skills, 1):
        name = skill["name"]
        summary = skill["description"]
        if len(summary) > 100:
            summary = summary[:97] + "..."
        body += [f"{index}. {name}", f" {summary}", ""]

    return "\n".join(header + body).strip()
518
-
519
-
520
def _format_skill_info(info: dict) -> str:
    """Render the dict produced by the skill-info lookup as plain text.

    Optional fields are printed only when truthy; a paging footer is added when
    the body was truncated.
    """
    out = [
        f"Skill: {info['name']}",
        f"Location: {info['location']}",
        "",
        f"Description: {info['description']}",
    ]

    optional_rows = (
        ("license", "License"),
        ("compatibility", "Compatibility"),
        ("allowed_tools", "Allowed Tools"),
    )
    for key, label in optional_rows:
        if info.get(key):
            out.append(f"{label}: {info[key]}")

    if info.get("metadata"):
        pairs = ", ".join(f"{k}={v}" for k, v in info["metadata"].items())
        out.append(f"Metadata: {pairs}")

    out += ["", "--- SKILL.md Body ---"]
    if info.get("offset", 0) > 0:
        out.append(f"(Showing content from offset {info['offset']})")
    out += ["", info["body"]]

    if info.get("truncated"):
        out += [
            "",
            f"… Truncated. Showing {len(info['body'])} chars (offset {info['offset']}). Total: {info['total_chars']}.",
            f"Next cursor: {info['next_cursor']}",
        ]

    return "\n".join(out)
552
-
553
-
554
def _format_resources_list(resources: dict) -> str:
    """Render a skill's resource listing as a tree rooted at the skill name.

    Files inside subdirectories are listed first (as ``dirname/path``), then
    the files directly in the skill directory, followed by a file count.
    """
    from ._core import build_tree, render_tree

    skill = resources["skill"]

    # One (path, info) pair per file: nested files first, then root files.
    entries: list[tuple[str, dict]] = [
        (f"{dirname}/{item['path']}", {"size": item["size"]})
        for dirname, files in resources.get("directories", {}).items()
        for item in files
    ]
    entries.extend(
        (item["path"], {"size": item["size"]})
        for item in resources.get("other_files", [])
    )

    tree = build_tree(entries)
    file_count = len(entries)

    out = [f"Resources for skill: {skill}", "", f"└── {skill}/"]
    out.extend(render_tree(tree, " "))
    out.append("")
    out.append("No resource files found." if file_count == 0 else f"Total: {file_count} files")

    return "\n".join(out).strip()
596
-
597
-
598
def _format_resource_content(data: dict) -> str:
    """Render one resource read as plain text: header, content, paging footer."""
    out = [
        f"Resource: {data['resource']}",
        f"Skill: {data['skill']}",
        f"Size: {_fmt_size(data['size'])}",
    ]

    start = data.get("offset", 0)
    out.append(f"Showing: {len(data['content'])} of {data['total_chars']} chars (offset {start})")
    out += ["", "--- Content ---", "", data["content"]]

    if data.get("truncated"):
        out += ["", f"… Truncated. Next cursor: {data['next_cursor']}"]

    return "\n".join(out)
619
-
620
-
621
def _format_validation(skill_name: str, errors: list[str]) -> str:
    """Render a validation outcome: a one-line success or a numbered error list."""
    if not errors:
        return f"✓ Skill '{skill_name}' is valid."

    header = [
        f"✗ Validation failed for skill '{skill_name}'",
        f"Errors: {len(errors)}",
        "",
    ]
    numbered = [f" {position}. {problem}" for position, problem in enumerate(errors, 1)]
    return "\n".join(header + numbered)
636
-
637
-
638
def _format_search(query: str, matches: list[dict]) -> str:
    """Render search hits as a numbered list under a short header.

    Descriptions longer than 80 characters are cut to 77 plus an ellipsis.
    """
    out = [
        f"Search results for: {query}",
        f"Matches: {len(matches)}",
        "",
    ]

    if not matches:
        out.append("No matching skills found.")
        return "\n".join(out).strip()

    for position, hit in enumerate(matches, 1):
        name = hit["name"]
        summary = hit["description"]
        where = hit.get("match_in", "")
        if len(summary) > 80:
            summary = summary[:77] + "..."
        out += [f"{position}. {name} (matched in {where})", f" {summary}", ""]

    return "\n".join(out).strip()
660
-
661
-
662
def _format_error(message: str, hint: str = "") -> str:
    """Render an error line, followed by a hint line when ``hint`` is non-empty."""
    text = f"Error: {message}"
    if not hint:
        return text
    return "\n".join([text, f"Hint: {hint}"])
668
-
669
-
670
- # ---------------------------------------------------------------------------
671
- # Main Tool Function
672
- # ---------------------------------------------------------------------------
673
-
674
@autodoc(summary=TOOL_SUMMARY)
def Agent_Skills(
    action: Annotated[str, "Operation: 'discover', 'info', 'resources', 'validate', 'search', 'help'."],
    skill_name: Annotated[Optional[str], "Name of skill (required for info/resources/validate)."] = None,
    resource_path: Annotated[Optional[str], "Path to resource file within skill (for resources action)."] = None,
    query: Annotated[Optional[str], "Search query (for search action)."] = None,
    max_chars: Annotated[int, "Max characters to return for skill body or resource content (0 = no limit)."] = 3000,
    offset: Annotated[int, "Start offset for reading content (for info/resources)."] = 0,
) -> str:
    # Single entry point of the tool: validates the requested action, dispatches
    # to the matching helper, and always returns human-readable text (failures
    # are rendered through _format_error instead of being raised).
    # NOTE(review): no docstring is added on purpose -- @autodoc presumably
    # derives it from the summary and the Annotated hints; confirm before adding one.
    _log_call_start("Agent_Skills", action=action, skill_name=skill_name, resource_path=resource_path, query=query, max_chars=max_chars, offset=offset)

    action = (action or "").strip().lower()
    # Normalize the optional text inputs once so that None, "" and
    # whitespace-only values are all treated as "not provided", and so the
    # same cleaned value is used for both the lookup and the report.
    skill_name = (skill_name or "").strip()
    resource_path = (resource_path or "").strip()
    query = (query or "").strip()

    if action not in {"discover", "info", "resources", "validate", "search", "help"}:
        result = _format_error(
            f"Invalid action: {action}",
            "Choose from: discover, info, resources, validate, search, help."
        )
        _log_call_end("Agent_Skills", _truncate_for_log(result))
        return result

    try:
        if action == "help":
            result = HELP_TEXT

        elif action == "discover":
            result = _format_discover(_discover_skills())

        elif action == "search":
            if not query:
                result = _format_error("query is required for 'search' action.")
            else:
                result = _format_search(query, _search_skills(query))

        # Every remaining action (info / resources / validate) needs a skill.
        elif not skill_name:
            result = _format_error(f"skill_name is required for '{action}' action.")

        elif action == "info":
            info = _get_skill_info(skill_name, offset=offset, max_chars=max_chars)
            result = _format_skill_info(info)

        elif action == "resources":
            if resource_path:
                resource_data = _read_skill_resource(skill_name, resource_path, offset=offset, max_chars=max_chars)
                result = _format_resource_content(resource_data)
            else:
                result = _format_resources_list(_list_skill_resources(skill_name))

        else:  # action == "validate"
            skills_root = _get_skills_root()
            skill_dir = skills_root / skill_name
            # skill_name is caller-controlled: refuse "..", absolute paths and
            # anything else that does not stay strictly inside the skills root.
            if Path(os.path.normpath(skills_root)) not in Path(os.path.normpath(skill_dir)).parents:
                raise PermissionError(f"Skill name escapes skills directory: {skill_name}")
            result = _format_validation(skill_name, _validate_skill(skill_dir))

    except (FileNotFoundError, PermissionError, IsADirectoryError, ParseError, ValidationError) as e:
        # Expected failures already carry a user-facing message.
        result = _format_error(str(e))
    except Exception as e:
        # Tool boundary: never let an exception escape to the caller.
        result = _format_error(f"Unexpected error: {e}")

    _log_call_end("Agent_Skills", _truncate_for_log(result))
    return result
752
-
753
-
754
- # ---------------------------------------------------------------------------
755
- # Gradio Interface
756
- # ---------------------------------------------------------------------------
757
-
758
def build_interface() -> gr.Interface:
    """Assemble the Gradio interface that exposes ``Agent_Skills``.

    Returns:
        A ``gr.Interface`` whose six inputs map, in order, to the parameters of
        ``Agent_Skills`` (action, skill name, resource path, query, max chars,
        offset) and whose output is a single text box.
    """
    action_input = gr.Radio(
        label="Action",
        choices=["discover", "info", "resources", "validate", "search", "help"],
        value="help",
        info="Operation to perform",
    )
    skill_input = gr.Textbox(label="Skill Name", placeholder="pdf", max_lines=1, info="Name of the skill")
    resource_input = gr.Textbox(label="Resource Path", placeholder="references/forms.md", max_lines=1, info="Path to resource within skill")
    query_input = gr.Textbox(label="Search Query", placeholder="MCP", max_lines=1, info="Keyword to search for")
    max_chars_input = gr.Slider(minimum=0, maximum=100000, step=500, value=3000, label="Max Chars", info="Max characters for content (0 = no limit)")
    offset_input = gr.Slider(minimum=0, maximum=1_000_000, step=100, value=0, label="Offset", info="Start offset (Info/Resources)")

    banner = (
        "<div style=\"text-align:center; overflow:hidden;\">"
        "Discover, inspect, and access Agent Skills. "
        "Skills provide structured instructions and resources for specialized tasks."
        "</div>"
    )

    return gr.Interface(
        fn=Agent_Skills,
        inputs=[action_input, skill_input, resource_input, query_input, max_chars_input, offset_input],
        outputs=gr.Textbox(label="Result", lines=20),
        title="Agent Skills",
        description=banner,
        api_description=TOOL_SUMMARY,
        flagging_mode="never",
        submit_btn="Run",
    )
786
-
787
-
788
# Public API of this module: the tool function and its UI factory.
__all__ = [
    "Agent_Skills",
    "build_interface",
]
 
1
"""
Agent Skills Module for Nymbo-Tools MCP Server.

Provides structured skill discovery, activation, validation, and resource access
following the Agent Skills specification (https://agentskills.io).

Skills are directories containing a SKILL.md file with YAML frontmatter (name, description)
and Markdown instructions. This tool enables agents to efficiently discover and use skills
through progressive disclosure: low-token metadata discovery, on-demand full activation,
and targeted resource access.
"""

# The docstring has to be the first statement of the module to become __doc__;
# a __future__ import is allowed to follow it.
from __future__ import annotations
14
+
15
+ import json
16
+ import os
17
+ import re
18
+ import unicodedata
19
+ from pathlib import Path
20
+ from typing import Annotated, Optional
21
+
22
+ import gradio as gr
23
+
24
+ from app import _log_call_end, _log_call_start, _truncate_for_log
25
+ from ._docstrings import autodoc
26
+ from .File_System import ROOT_DIR, _display_path
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Constants
31
+ # ---------------------------------------------------------------------------
32
+
33
# Name of the folder under ROOT_DIR that holds the skills.
SKILLS_SUBDIR = "Skills"

# Length limits (in characters) enforced by the validators below.
MAX_SKILL_NAME_LENGTH = 64
MAX_DESCRIPTION_LENGTH = 1024
MAX_COMPATIBILITY_LENGTH = 500

# Frontmatter keys a SKILL.md may contain; anything else is reported by validation.
ALLOWED_FRONTMATTER_FIELDS = {
    "allowed-tools",
    "compatibility",
    "description",
    "license",
    "metadata",
    "name",
}

# One-paragraph summary used for the tool's documentation and API description.
TOOL_SUMMARY = " ".join(
    [
        "Discover, inspect, validate, and access Agent Skills.",
        "Actions: discover (list all skills), info (get SKILL.md contents),",
        "resources (list/read bundled files), validate (check format), search (find by keyword).",
        "Skills provide structured instructions for specialized tasks.",
        "Use in combination with the `Shell_Command` and `File_System` tools.",
    ]
)

# Text returned verbatim by the "help" action.
HELP_TEXT = """Agent Skills — actions and usage

Skills are directories containing a SKILL.md file with YAML frontmatter (name, description)
and Markdown instructions. They live under /Skills/ in the filesystem root.

Actions:
- discover: List all available skills with their metadata (name, description, location)
- info: Get the full contents of a specific skill's SKILL.md file
- resources: List or read files within a skill's bundled directories (scripts/, references/, assets/)
- validate: Check if a skill conforms to the Agent Skills specification
- search: Find skills by keyword in name or description
- help: Show this guide

Examples:
- Discover all skills: action="discover"
- Get skill info: action="info", skill_name="pdf"
- List skill resources: action="resources", skill_name="mcp-builder"
- Read a resource: action="resources", skill_name="pdf", resource_path="references/forms.md"
- Validate a skill: action="validate", skill_name="pdf"
- Search for skills: action="search", query="MCP"
"""
77
+
78
+
79
+ # ---------------------------------------------------------------------------
80
+ # Skills Root Resolution
81
+ # ---------------------------------------------------------------------------
82
+
83
def _get_skills_root() -> Path:
    """Return the directory that holds all skills.

    A non-blank ``NYMBO_SKILLS_ROOT`` environment variable takes precedence and
    is resolved to an absolute path; otherwise the ``SKILLS_SUBDIR`` folder
    under ``ROOT_DIR`` is returned as-is.
    """
    override = (os.getenv("NYMBO_SKILLS_ROOT") or "").strip()
    if not override:
        return Path(ROOT_DIR) / SKILLS_SUBDIR
    return Path(override).resolve()
89
+
90
+ # Import _fmt_size from shared utility instead of duplicating
91
+ from ._core import _fmt_size
92
+
93
+
94
+ # ---------------------------------------------------------------------------
95
+ # YAML Frontmatter Parsing (adapted from skills_ref/parser.py)
96
+ # ---------------------------------------------------------------------------
97
+
98
class ParseError(Exception):
    """Signals that a SKILL.md file is missing or its frontmatter cannot be parsed."""
101
+
102
+
103
class ValidationError(Exception):
    """Signals that a skill does not satisfy the specification.

    Attributes:
        errors: The individual problems found. When the caller supplies no
            list, it holds just the exception message.
    """

    def __init__(self, message: str, errors: list[str] | None = None):
        super().__init__(message)
        if errors is None:
            errors = [message]
        self.errors = errors
108
+
109
+
110
def _strip_matching_quotes(value: str) -> str:
    """Drop one pair of matching single or double quotes wrapped around *value*."""
    if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
        return value[1:-1]
    return value


def _parse_frontmatter(content: str) -> tuple[dict, str]:
    """
    Parse YAML frontmatter from SKILL.md content.

    Only the small YAML subset used by skills is understood: top-level
    ``key: value`` pairs plus one nested ``metadata:`` mapping whose entries
    are indented with spaces. All values are returned as strings.

    Args:
        content: Full text of a SKILL.md file.

    Returns:
        ``(metadata dict, markdown body)``; the body is stripped of surrounding
        whitespace.

    Raises:
        ParseError: If the first line is not ``---`` or no later line closes
            the frontmatter with ``---``.
    """
    # Some editors prepend a UTF-8 BOM; it must not hide the opening fence.
    raw_lines = content.lstrip("\ufeff").split("\n")

    if raw_lines[0].rstrip() != "---":
        raise ParseError("SKILL.md must start with YAML frontmatter (---)")

    # The closing fence is the first later line consisting solely of '---'.
    # Searching line by line (rather than splitting on the substring) keeps
    # values such as "description: a --- b" intact.
    closing_index = next(
        (i for i, line in enumerate(raw_lines[1:], start=1) if line.rstrip() == "---"),
        None,
    )
    if closing_index is None:
        raise ParseError("SKILL.md frontmatter not properly closed with ---")

    body = "\n".join(raw_lines[closing_index + 1:]).strip()

    # Simple YAML parsing without external dependency
    metadata: dict = {}
    nested: dict = {}
    in_metadata_block = False

    for line in raw_lines[1:closing_index]:
        stripped = line.strip()
        if not stripped:
            continue

        if stripped == "metadata:":
            in_metadata_block = True
            continue

        if in_metadata_block:
            if line.startswith(" "):
                # Nested keys may contain hyphens or dots (e.g. "short-name").
                match = re.match(r"^\s+([\w.-]+):\s*(.*)$", line)
                if match:
                    nested[match.group(1)] = _strip_matching_quotes(match.group(2).strip())
                continue
            # First non-indented line ends the nested block.
            in_metadata_block = False
            if nested:
                metadata["metadata"] = nested
                nested = {}

        match = re.match(r"^(\S+):\s*(.*)$", line)
        if match:
            metadata[match.group(1)] = _strip_matching_quotes(match.group(2).strip())

    # The metadata block may run to the end of the frontmatter.
    if in_metadata_block and nested:
        metadata["metadata"] = nested

    return metadata, body
167
+
168
+
169
def _find_skill_md(skill_dir: Path) -> Optional[Path]:
    """Locate the skill definition file inside *skill_dir*.

    ``SKILL.md`` is preferred over ``skill.md``. Only regular files count, so
    a directory that happens to carry one of those names is ignored.

    Returns:
        The path of the file, or ``None`` when neither candidate is a file.
    """
    for candidate in ("SKILL.md", "skill.md"):
        path = skill_dir / candidate
        if path.is_file():
            return path
    return None
176
+
177
+
178
+ # ---------------------------------------------------------------------------
179
+ # Skill Validation (adapted from skills_ref/validator.py)
180
+ # ---------------------------------------------------------------------------
181
+
182
def _validate_name(name: str, skill_dir: Path) -> list[str]:
    """Check a skill name against the naming rules and its directory.

    The name is NFKC-normalized before checking. It must fit the length limit,
    be lowercase, consist of alphanumerics and hyphens only, must not start or
    end with a hyphen nor contain ``--``, and must equal the (normalized)
    directory name when *skill_dir* is given.

    Returns:
        All violations found; an empty list means the name is acceptable.
    """
    if not name or not isinstance(name, str) or not name.strip():
        return ["Field 'name' must be a non-empty string"]

    normalized = unicodedata.normalize("NFKC", name.strip())

    # (rule violated?, message) pairs, evaluated in reporting order.
    checks = [
        (
            len(normalized) > MAX_SKILL_NAME_LENGTH,
            f"Skill name '{normalized}' exceeds {MAX_SKILL_NAME_LENGTH} character limit ({len(normalized)} chars)",
        ),
        (normalized != normalized.lower(), f"Skill name '{normalized}' must be lowercase"),
        (
            normalized.startswith("-") or normalized.endswith("-"),
            "Skill name cannot start or end with a hyphen",
        ),
        ("--" in normalized, "Skill name cannot contain consecutive hyphens"),
        (
            any(not (ch.isalnum() or ch == "-") for ch in normalized),
            f"Skill name '{normalized}' contains invalid characters. Only letters, digits, and hyphens allowed.",
        ),
    ]
    problems = [message for violated, message in checks if violated]

    if skill_dir:
        folder = unicodedata.normalize("NFKC", skill_dir.name)
        if folder != normalized:
            problems.append(f"Directory name '{skill_dir.name}' must match skill name '{normalized}'")

    return problems
213
+
214
+
215
def _validate_description(description: str) -> list[str]:
    """Check that a description is a non-blank string within the length limit.

    Returns:
        A list with at most one error message; empty when the value is fine.
    """
    if not (description and isinstance(description, str) and description.strip()):
        return ["Field 'description' must be a non-empty string"]

    if len(description) > MAX_DESCRIPTION_LENGTH:
        return [f"Description exceeds {MAX_DESCRIPTION_LENGTH} character limit ({len(description)} chars)"]

    return []
227
+
228
+
229
def _validate_compatibility(compatibility: str) -> list[str]:
    """Check that the compatibility field is a string within the length limit.

    Returns:
        A list with at most one error message; empty when the value is fine.
    """
    if not isinstance(compatibility, str):
        return ["Field 'compatibility' must be a string"]

    size = len(compatibility)
    if size > MAX_COMPATIBILITY_LENGTH:
        return [f"Compatibility exceeds {MAX_COMPATIBILITY_LENGTH} character limit ({size} chars)"]

    return []
241
+
242
+
243
def _validate_skill(skill_dir: Path) -> list[str]:
    """Validate the skill stored in *skill_dir*.

    Structural problems (missing path, not a directory, no SKILL.md, unreadable
    or unparsable file) are reported on their own. Otherwise the frontmatter is
    checked for unexpected fields, for the required ``name`` and
    ``description`` fields, and for the optional ``compatibility`` field.

    Returns:
        Error messages in the order found; an empty list means the skill is valid.
    """
    if not skill_dir.exists():
        return [f"Path does not exist: {skill_dir}"]
    if not skill_dir.is_dir():
        return [f"Not a directory: {skill_dir}"]

    skill_md = _find_skill_md(skill_dir)
    if skill_md is None:
        return ["Missing required file: SKILL.md"]

    try:
        frontmatter, _body = _parse_frontmatter(skill_md.read_text(encoding="utf-8"))
    except ParseError as exc:
        return [str(exc)]
    except Exception as exc:
        return [f"Failed to read SKILL.md: {exc}"]

    problems: list[str] = []

    unexpected = sorted(set(frontmatter) - ALLOWED_FRONTMATTER_FIELDS)
    if unexpected:
        problems.append(f"Unexpected fields in frontmatter: {', '.join(unexpected)}")

    if "name" in frontmatter:
        problems += _validate_name(frontmatter["name"], skill_dir)
    else:
        problems.append("Missing required field: name")

    if "description" in frontmatter:
        problems += _validate_description(frontmatter["description"])
    else:
        problems.append("Missing required field: description")

    if "compatibility" in frontmatter:
        problems += _validate_compatibility(frontmatter["compatibility"])

    return problems
283
+
284
+
285
+ # ---------------------------------------------------------------------------
286
+ # Skill Discovery and Info
287
+ # ---------------------------------------------------------------------------
288
+
289
def _read_skill_properties(skill_dir: Path) -> dict:
    """Load a skill's frontmatter and body from its SKILL.md.

    Returns:
        A dict with ``name``, ``description``, ``license``, ``compatibility``,
        ``allowed_tools``, ``metadata``, ``location`` (path of the SKILL.md as
        a string) and ``body``.

    Raises:
        ParseError: If no SKILL.md exists or its frontmatter cannot be parsed.
        ValidationError: If ``name`` or ``description`` is missing.
    """
    skill_md = _find_skill_md(skill_dir)
    if skill_md is None:
        raise ParseError(f"SKILL.md not found in {skill_dir}")

    frontmatter, body = _parse_frontmatter(skill_md.read_text(encoding="utf-8"))

    for required in ("name", "description"):
        if required not in frontmatter:
            raise ValidationError(f"Missing required field: {required}")

    properties = {
        "name": frontmatter["name"].strip(),
        "description": frontmatter["description"].strip(),
    }
    properties["license"] = frontmatter.get("license")
    properties["compatibility"] = frontmatter.get("compatibility")
    properties["allowed_tools"] = frontmatter.get("allowed-tools")
    properties["metadata"] = frontmatter.get("metadata", {})
    properties["location"] = str(skill_md)
    properties["body"] = body
    return properties
313
+
314
+
315
def _discover_skills() -> list[dict]:
    """Discover all valid skills in the skills directory.

    Every direct sub-directory of the skills root that contains a readable
    SKILL.md with ``name`` and ``description`` is reported; anything else is
    skipped silently.

    Returns:
        Dicts with ``name``, ``description`` and ``location`` (display path of
        the SKILL.md), ordered by directory path. Empty when the skills root
        is missing or is not a directory.
    """
    skills_root = _get_skills_root()

    # is_dir() rather than exists(): a stray file at the root path would make
    # iterdir() raise instead of yielding "no skills".
    if not skills_root.is_dir():
        return []

    skills = []
    for item in sorted(skills_root.iterdir()):
        if not item.is_dir():
            continue

        skill_md = _find_skill_md(item)
        if skill_md is None:
            continue

        try:
            props = _read_skill_properties(item)
            skills.append({
                "name": props["name"],
                "description": props["description"],
                "location": _display_path(str(skill_md)),
            })
        except Exception:
            # Deliberately best-effort: one broken skill must not hide the others.
            continue

    return skills
342
+
343
+
344
def _get_skill_info(skill_name: str, offset: int = 0, max_chars: int = 0) -> dict:
    """Get full information for a specific skill.

    Args:
        skill_name: Directory name of the skill, relative to the skills root.
        offset: Character offset into the SKILL.md body to start from; clamped
            to ``[0, len(body)]``.
        max_chars: Maximum number of body characters to return; values <= 0
            mean "no limit".

    Returns:
        A dict with the frontmatter fields (``name``, ``description``,
        ``license``, ``compatibility``, ``allowed_tools``, ``metadata``), the
        display ``location``, the requested ``body`` slice, and the paging
        fields ``offset``, ``total_chars``, ``truncated`` and ``next_cursor``
        (``None`` when nothing is left).

    Raises:
        PermissionError: If *skill_name* points outside the skills root.
        FileNotFoundError: If the skill or its SKILL.md does not exist.
        ParseError: If the frontmatter cannot be parsed.
    """
    skills_root = _get_skills_root()
    skill_dir = skills_root / skill_name

    # skill_name is caller-controlled: refuse "..", absolute paths and anything
    # else that does not stay strictly inside the skills root. The check is
    # lexical so that skills symlinked into the root keep working.
    if Path(os.path.normpath(skills_root)) not in Path(os.path.normpath(skill_dir)).parents:
        raise PermissionError(f"Skill name escapes skills directory: {skill_name}")

    if not skill_dir.exists():
        raise FileNotFoundError(f"Skill not found: {skill_name}")

    skill_md = _find_skill_md(skill_dir)
    if skill_md is None:
        raise FileNotFoundError(f"SKILL.md not found in skill: {skill_name}")

    content = skill_md.read_text(encoding="utf-8")
    metadata, body = _parse_frontmatter(content)

    total_chars = len(body)
    start = max(0, min(offset, total_chars))
    end = min(total_chars, start + max_chars) if max_chars > 0 else total_chars

    truncated = end < total_chars

    return {
        "name": metadata.get("name", "").strip(),
        "description": metadata.get("description", "").strip(),
        "license": metadata.get("license"),
        "compatibility": metadata.get("compatibility"),
        "allowed_tools": metadata.get("allowed-tools"),
        "metadata": metadata.get("metadata", {}),
        "location": _display_path(str(skill_md)),
        "body": body[start:end],
        "offset": start,
        "total_chars": total_chars,
        "truncated": truncated,
        # Where the next call should resume; None once the end was reached.
        "next_cursor": end if truncated else None,
    }
384
+
385
+
386
def _list_skill_resources(skill_name: str) -> dict:
    """List all resources within a skill directory.

    Dynamically discovers all subdirectories, not just predefined ones. The
    SKILL.md file itself is not listed.

    Args:
        skill_name: Directory name of the skill, relative to the skills root.

    Returns:
        ``{"skill": name, "directories": {dirname: [file, ...]}, "other_files": [file, ...]}``
        where each file is ``{"path": ..., "size": ...}``; paths inside a
        sub-directory are POSIX-style and relative to that sub-directory.

    Raises:
        PermissionError: If *skill_name* points outside the skills root.
        FileNotFoundError: If the skill directory does not exist.
    """
    skills_root = _get_skills_root()
    skill_dir = skills_root / skill_name

    # skill_name is caller-controlled: refuse "..", absolute paths and anything
    # else that does not stay strictly inside the skills root. The check is
    # lexical so that skills symlinked into the root keep working.
    if Path(os.path.normpath(skills_root)) not in Path(os.path.normpath(skill_dir)).parents:
        raise PermissionError(f"Skill name escapes skills directory: {skill_name}")

    # is_dir() rather than exists(): a plain file with the skill's name would
    # make iterdir() fail below.
    if not skill_dir.is_dir():
        raise FileNotFoundError(f"Skill not found: {skill_name}")

    resources = {
        "skill": skill_name,
        "directories": {},  # Dynamic: dirname -> file list
        "other_files": [],
    }

    for item in sorted(skill_dir.iterdir()):
        if item.name.lower() == "skill.md":
            continue

        if item.is_dir():
            resources["directories"][item.name] = [
                {"path": f.relative_to(item).as_posix(), "size": f.stat().st_size}
                for f in sorted(item.rglob("*"))
                if f.is_file()
            ]
        elif item.is_file():
            resources["other_files"].append({
                "path": item.name,
                "size": item.stat().st_size,
            })

    return resources
423
+
424
+
425
def _read_skill_resource(skill_name: str, resource_path: str, offset: int = 0, max_chars: int = 3000) -> dict:
    """Read a specific resource file from a skill.

    Args:
        skill_name: Directory name of the skill, relative to the skills root.
        resource_path: Path of the file, relative to the skill directory.
        offset: Character offset to start from; clamped to ``[0, len(content)]``.
        max_chars: Maximum number of characters to return; values <= 0 mean
            "no limit".

    Returns:
        A dict with ``skill``, ``resource``, the requested ``content`` slice,
        the file ``size`` in bytes, and the paging fields ``offset``,
        ``total_chars``, ``truncated`` and ``next_cursor`` (``None`` when
        nothing is left).

    Raises:
        PermissionError: If *skill_name* points outside the skills root or
            *resource_path* points outside the skill directory.
        FileNotFoundError: If the skill or the resource does not exist.
        IsADirectoryError: If the resource is a directory.
    """
    skills_root = _get_skills_root()
    skill_dir = skills_root / skill_name

    # skill_name is caller-controlled: refuse "..", absolute paths and anything
    # else that does not stay strictly inside the skills root. The check is
    # lexical so that skills symlinked into the root keep working.
    if Path(os.path.normpath(skills_root)) not in Path(os.path.normpath(skill_dir)).parents:
        raise PermissionError(f"Skill name escapes skills directory: {skill_name}")

    if not skill_dir.exists():
        raise FileNotFoundError(f"Skill not found: {skill_name}")

    resource_file = skill_dir / resource_path

    # Resolve symlinks and ".." before comparing, so the file that is actually
    # read is guaranteed to live inside the skill directory.
    try:
        resource_file.resolve().relative_to(skill_dir.resolve())
    except ValueError:
        raise PermissionError(f"Resource path escapes skill directory: {resource_path}") from None

    if not resource_file.exists():
        raise FileNotFoundError(f"Resource not found: {resource_path}")

    if resource_file.is_dir():
        raise IsADirectoryError(f"Path is a directory: {resource_path}")

    # errors="replace" keeps non-UTF-8 files readable instead of failing.
    content = resource_file.read_text(encoding="utf-8", errors="replace")
    total_chars = len(content)

    start = max(0, min(offset, total_chars))
    end = min(total_chars, start + max_chars) if max_chars > 0 else total_chars

    truncated = end < total_chars

    return {
        "skill": skill_name,
        "resource": resource_path,
        "content": content[start:end],
        "size": resource_file.stat().st_size,
        "offset": start,
        "total_chars": total_chars,
        "truncated": truncated,
        # Where the next call should resume; None once the end was reached.
        "next_cursor": end if truncated else None,
    }
469
+
470
+
471
def _search_skills(query: str) -> list[dict]:
    """Find discovered skills whose name or description contains *query*.

    Matching is a case-insensitive substring test.

    Returns:
        The matching skill dicts, each extended with ``match_in`` set to
        ``"name"`` (also when both fields match) or ``"description"``.
    """
    needle = query.lower()

    hits = []
    for skill in _discover_skills():
        in_name = needle in skill["name"].lower()
        in_description = needle in skill["description"].lower()
        if not (in_name or in_description):
            continue
        where = "name" if in_name else "description"
        hits.append({**skill, "match_in": where})

    return hits
488
+
489
+
490
+ # ---------------------------------------------------------------------------
491
+ # Human-Readable Output Formatters
492
+ # ---------------------------------------------------------------------------
493
+
494
def _format_discover(skills: list[dict]) -> str:
    """Render the list of discovered skills as plain text.

    Args:
        skills: Skill dicts with ``name`` and ``description`` keys.

    Returns:
        A header (title, skills root, count) followed by one numbered entry
        per skill -- descriptions longer than 100 characters are shortened --
        or a "no skills" line.
    """
    root_display = _display_path(str(_get_skills_root()))
    out = [
        "Available Skills",
        f"Root: {root_display}",
        f"Total: {len(skills)} skills",
        "",
    ]

    if skills:
        for position, skill in enumerate(skills, start=1):
            name = skill["name"]
            summary = skill["description"]
            # Keep entries compact: 97 characters plus an ellipsis.
            if len(summary) > 100:
                summary = summary[:97] + "..."
            out += [f"{position}. {name}", f" {summary}", ""]
    else:
        out.append("No skills found.")

    return "\n".join(out).strip()
518
+
519
+
520
def _format_skill_info(info: dict) -> str:
    """Render a skill's metadata and SKILL.md body as plain text.

    Args:
        info: Dict with at least ``name``, ``location``, ``description`` and
            ``body``; the optional fields and the paging fields (``offset``,
            ``truncated``, ``total_chars``, ``next_cursor``) are shown when
            present and truthy.

    Returns:
        The metadata header, the body section and, for truncated bodies, a
        note telling where to continue.
    """
    out = [
        f"Skill: {info['name']}",
        f"Location: {info['location']}",
        "",
        f"Description: {info['description']}",
    ]

    # Optional single-value fields, shown only when set.
    for label, key in (
        ("License", "license"),
        ("Compatibility", "compatibility"),
        ("Allowed Tools", "allowed_tools"),
    ):
        if info.get(key):
            out.append(f"{label}: {info[key]}")

    if info.get("metadata"):
        pairs = ", ".join(f"{k}={v}" for k, v in info["metadata"].items())
        out.append(f"Metadata: {pairs}")

    out += ["", "--- SKILL.md Body ---"]
    if info.get("offset", 0) > 0:
        out.append(f"(Showing content from offset {info['offset']})")
    out += ["", info["body"]]

    if info.get("truncated"):
        out += [
            "",
            f"… Truncated. Showing {len(info['body'])} chars (offset {info['offset']}). Total: {info['total_chars']}.",
            f"Next cursor: {info['next_cursor']}",
        ]

    return "\n".join(out)
552
+
553
+
554
def _format_resources_list(resources: dict) -> str:
    """Render a skill's resource listing as a filesystem tree.

    Args:
        resources: Dict with ``skill``, and optionally ``directories``
            (dirname -> list of ``{"path", "size"}``) and ``other_files``
            (list of ``{"path", "size"}``).

    Returns:
        A header, the tree rooted at the skill name, and a file count (or a
        "no resource files" line).
    """
    from ._core import build_tree, render_tree

    skill = resources["skill"]

    # Files inside sub-directories come first, prefixed with their directory...
    nested_entries = [
        (f"{dirname}/{item['path']}", {"size": item["size"]})
        for dirname, files in resources.get("directories", {}).items()
        for item in files
    ]
    # ...followed by the files that sit directly in the skill directory.
    root_entries = [
        (item["path"], {"size": item["size"]})
        for item in resources.get("other_files", [])
    ]
    entries: list[tuple[str, dict]] = nested_entries + root_entries

    tree = build_tree(entries)

    out = [f"Resources for skill: {skill}", "", f"└── {skill}/"]
    out.extend(render_tree(tree, " "))
    out.append("")
    if entries:
        out.append(f"Total: {len(entries)} files")
    else:
        out.append("No resource files found.")

    return "\n".join(out).strip()
596
+
597
+
598
def _format_resource_content(data: dict) -> str:
    """Render the content of a resource file as plain text.

    Args:
        data: Dict with ``resource``, ``skill``, ``size``, ``content`` and
            ``total_chars``; ``offset`` defaults to 0, and ``next_cursor`` is
            required when ``truncated`` is truthy.

    Returns:
        A header (resource, skill, size, shown range), the content section
        and, for truncated content, the cursor to continue from.
    """
    out = [
        f"Resource: {data['resource']}",
        f"Skill: {data['skill']}",
        f"Size: {_fmt_size(data['size'])}",
        f"Showing: {len(data['content'])} of {data['total_chars']} chars (offset {data.get('offset', 0)})",
        "",
        "--- Content ---",
        "",
        data["content"],
    ]

    if data.get("truncated"):
        out += ["", f"… Truncated. Next cursor: {data['next_cursor']}"]

    return "\n".join(out)
619
+
620
+
621
def _format_validation(skill_name: str, errors: list[str]) -> str:
    """Render the outcome of a skill validation run as plain text.

    Args:
        skill_name: Name shown in the report.
        errors: Validation messages; an empty list means the skill is valid.

    Returns:
        A one-line success message, or a header followed by a numbered list
        of the errors.
    """
    if not errors:
        return f"✓ Skill '{skill_name}' is valid."

    header = [
        f"✗ Validation failed for skill '{skill_name}'",
        f"Errors: {len(errors)}",
        "",
    ]
    numbered = [f" {position}. {problem}" for position, problem in enumerate(errors, start=1)]
    return "\n".join(header + numbered)
636
+
637
+
638
def _format_search(query: str, matches: list[dict]) -> str:
    """Render keyword-search results as plain text.

    Args:
        query: The search text, echoed in the header.
        matches: Skill dicts with ``name`` and ``description`` keys and an
            optional ``match_in`` key naming the field that matched.

    Returns:
        A header followed by one numbered entry per match (descriptions longer
        than 80 characters are shortened), or a "no matches" line.
    """
    out = [f"Search results for: {query}", f"Matches: {len(matches)}", ""]

    if matches:
        for position, hit in enumerate(matches, start=1):
            name = hit["name"]
            summary = hit["description"]
            where = hit.get("match_in", "")
            # Keep entries compact: 77 characters plus an ellipsis.
            if len(summary) > 80:
                summary = summary[:77] + "..."
            out += [f"{position}. {name} (matched in {where})", f" {summary}", ""]
    else:
        out.append("No matching skills found.")

    return "\n".join(out).strip()
660
+
661
+
662
def _format_error(message: str, hint: str = "") -> str:
    """Build the text shown to the caller when an operation fails.

    Args:
        message: What went wrong.
        hint: Optional advice; appended on its own line when non-empty.

    Returns:
        ``"Error: <message>"``, followed by ``"Hint: <hint>"`` if a hint was given.
    """
    text = f"Error: {message}"
    if not hint:
        return text
    return f"{text}\nHint: {hint}"
668
+
669
+
670
+ # ---------------------------------------------------------------------------
671
+ # Main Tool Function
672
+ # ---------------------------------------------------------------------------
673
+
674
@autodoc(summary=TOOL_SUMMARY)
def Agent_Skills(
    action: Annotated[str, "Operation: 'discover', 'info', 'resources', 'validate', 'search', 'help'."],
    skill_name: Annotated[Optional[str], "Name of skill (required for info/resources/validate)."] = None,
    resource_path: Annotated[Optional[str], "Path to resource file within skill (for resources action)."] = None,
    query: Annotated[Optional[str], "Search query (for search action)."] = None,
    max_chars: Annotated[int, "Max characters to return for skill body or resource content (0 = no limit)."] = 3000,
    offset: Annotated[int, "Start offset for reading content (for info/resources)."] = 0,
) -> str:
    # Single entry point of the tool: validates the requested action, dispatches
    # to the matching helper, and always returns human-readable text (failures
    # are rendered through _format_error instead of being raised).
    # NOTE(review): no docstring is added on purpose -- @autodoc presumably
    # derives it from the summary and the Annotated hints; confirm before adding one.
    _log_call_start("Agent_Skills", action=action, skill_name=skill_name, resource_path=resource_path, query=query, max_chars=max_chars, offset=offset)

    action = (action or "").strip().lower()
    # Normalize the optional text inputs once so that None, "" and
    # whitespace-only values are all treated as "not provided", and so the
    # same cleaned value is used for both the lookup and the report.
    skill_name = (skill_name or "").strip()
    resource_path = (resource_path or "").strip()
    query = (query or "").strip()

    if action not in {"discover", "info", "resources", "validate", "search", "help"}:
        result = _format_error(
            f"Invalid action: {action}",
            "Choose from: discover, info, resources, validate, search, help."
        )
        _log_call_end("Agent_Skills", _truncate_for_log(result))
        return result

    try:
        if action == "help":
            result = HELP_TEXT

        elif action == "discover":
            result = _format_discover(_discover_skills())

        elif action == "search":
            if not query:
                result = _format_error("query is required for 'search' action.")
            else:
                result = _format_search(query, _search_skills(query))

        # Every remaining action (info / resources / validate) needs a skill.
        elif not skill_name:
            result = _format_error(f"skill_name is required for '{action}' action.")

        elif action == "info":
            info = _get_skill_info(skill_name, offset=offset, max_chars=max_chars)
            result = _format_skill_info(info)

        elif action == "resources":
            if resource_path:
                resource_data = _read_skill_resource(skill_name, resource_path, offset=offset, max_chars=max_chars)
                result = _format_resource_content(resource_data)
            else:
                result = _format_resources_list(_list_skill_resources(skill_name))

        else:  # action == "validate"
            skills_root = _get_skills_root()
            skill_dir = skills_root / skill_name
            # skill_name is caller-controlled: refuse "..", absolute paths and
            # anything else that does not stay strictly inside the skills root.
            if Path(os.path.normpath(skills_root)) not in Path(os.path.normpath(skill_dir)).parents:
                raise PermissionError(f"Skill name escapes skills directory: {skill_name}")
            result = _format_validation(skill_name, _validate_skill(skill_dir))

    except (FileNotFoundError, PermissionError, IsADirectoryError, ParseError, ValidationError) as e:
        # Expected failures already carry a user-facing message.
        result = _format_error(str(e))
    except Exception as e:
        # Tool boundary: never let an exception escape to the caller.
        result = _format_error(f"Unexpected error: {e}")

    _log_call_end("Agent_Skills", _truncate_for_log(result))
    return result
752
+
753
+
754
+ # ---------------------------------------------------------------------------
755
+ # Gradio Interface
756
+ # ---------------------------------------------------------------------------
757
+
758
def build_interface() -> gr.Interface:
    """Assemble the Gradio interface that exposes ``Agent_Skills``.

    Returns:
        A ``gr.Interface`` whose six inputs map, in order, to the parameters of
        ``Agent_Skills`` (action, skill name, resource path, query, max chars,
        offset) and whose output is a single text box.
    """
    action_input = gr.Radio(
        label="Action",
        choices=["discover", "info", "resources", "validate", "search", "help"],
        value="help",
        info="Operation to perform",
    )
    skill_input = gr.Textbox(label="Skill Name", placeholder="pdf", max_lines=1, info="Name of the skill")
    resource_input = gr.Textbox(label="Resource Path", placeholder="references/forms.md", max_lines=1, info="Path to resource within skill")
    query_input = gr.Textbox(label="Search Query", placeholder="MCP", max_lines=1, info="Keyword to search for")
    max_chars_input = gr.Slider(minimum=0, maximum=100000, step=500, value=3000, label="Max Chars", info="Max characters for content (0 = no limit)")
    offset_input = gr.Slider(minimum=0, maximum=1_000_000, step=100, value=0, label="Offset", info="Start offset (Info/Resources)")

    banner = (
        "<div style=\"text-align:center; overflow:hidden;\">"
        "Discover, inspect, and access Agent Skills. "
        "Skills provide structured instructions and resources for specialized tasks."
        "</div>"
    )

    return gr.Interface(
        fn=Agent_Skills,
        inputs=[action_input, skill_input, resource_input, query_input, max_chars_input, offset_input],
        outputs=gr.Textbox(label="Result", lines=20),
        title="Agent Skills",
        description=banner,
        api_description=TOOL_SUMMARY,
        flagging_mode="never",
        submit_btn="Run",
    )
786
+
787
+
788
# Public API of this module: the tool function and its UI factory.
__all__ = [
    "Agent_Skills",
    "build_interface",
]
Modules/Agent_Terminal.py CHANGED
@@ -1,159 +1,135 @@
1
- from __future__ import annotations
2
-
3
  import sys
4
  import types
5
  import inspect
6
  import functools
7
  from typing import Annotated, get_type_hints, get_origin, get_args
8
-
9
- import gradio as gr
10
- from ._docstrings import autodoc
11
- from ._core import sandboxed_exec
12
-
13
- from app import _log_call_end, _log_call_start, _truncate_for_log
14
-
15
  # NOTE: Tool imports are deferred to _get_tools_map() to avoid circular imports
16
- # (app.py imports Agent_Terminal, which would import File_System before it's fully loaded)
17
-
18
-
19
- # Example usages for each tool - simple and advanced
20
  _TOOL_EXAMPLES = {
21
- "Web_Fetch": (
22
- 'Web_Fetch(url="https://example.com")',
23
- 'Web_Fetch(url="https://example.com", max_chars=5000, mode="url_scraper")',
24
- ),
25
  "Web_Search": (
26
  'Web_Search(query="Python tutorials")',
27
  'Web_Search(query="AI news", max_results=10, search_type="news", date_filter="week")',
28
  ),
 
 
 
 
29
  "Code_Interpreter": (
30
  'Code_Interpreter(code="print(2 + 2)")',
31
  'Code_Interpreter(code="import math; print(math.pi)", timeout=60)',
32
- ),
33
  "Shell_Command": (
34
  'Shell_Command(command="echo Hello")',
35
  'Shell_Command(command="ls -la", timeout=30)',
36
  ),
37
- "File_System": (
38
- 'File_System(action="list", path="/")',
39
- 'File_System(action="edit", path="/script.py", content="<<<<<<< SEARCH\\nold_text\\n=======\\nnew_text\\n>>>>>>> REPLACE")',
40
- ),
41
- "Obsidian_Vault": (
42
- 'Obsidian_Vault(action="list", path="/")',
43
- 'Obsidian_Vault(action="search", query="meeting notes", recursive=True)',
44
- ),
45
  "Memory_Manager": (
46
  'Memory_Manager(action="list")',
47
  'Memory_Manager(action="save", text="Remember this fact", tags="important, facts")',
48
  ),
49
- "Generate_Speech": (
50
- 'Generate_Speech(text="Hello, world!")',
51
- 'Generate_Speech(text="Welcome to the demo", voice="af_heart", speed=1.2)',
52
- ),
53
  "Generate_Image": (
54
  'Generate_Image(prompt="A sunset over mountains")',
55
  'Generate_Image(prompt="A cyberpunk city", steps=50, cfg_scale=9.0, width=1024, height=768)',
56
  ),
57
- "Generate_Video": (
58
- 'Generate_Video(prompt="A cat playing piano")',
59
- 'Generate_Video(prompt="Ocean waves", duration=5, aspect_ratio="16:9")',
60
- ),
61
- "Deep_Research": (
62
- 'Deep_Research(query="Climate change effects")',
63
- 'Deep_Research(query="Quantum computing advances", max_sources=10, search_type="news")',
64
- ),
65
- "Agent_Skills": (
66
- 'Agent_Skills(action="discover")',
67
- 'Agent_Skills(action="info", skill_name="pdf")',
68
- ),
69
  }
70
-
71
-
72
- def _format_tool_usage(func) -> str:
73
- """Generate detailed usage information for a tool function."""
74
- name = func.__name__
75
- doc = func.__doc__ or "No description available."
76
-
77
- # Extract just the summary (first paragraph) - skip Args/Returns sections
78
- # since we generate our own detailed parameter list
79
- doc_lines = doc.strip().split('\n')
80
- summary_lines = []
81
- for line in doc_lines:
82
- stripped = line.strip().lower()
83
- # Stop at Args:, Returns:, Parameters:, etc.
84
- if stripped.startswith(('args:', 'returns:', 'parameters:', 'raises:', 'example:', 'note:', 'notes:')):
85
- break
86
- summary_lines.append(line)
87
- summary = '\n'.join(summary_lines).strip()
88
-
89
- # Get the signature
90
- sig = inspect.signature(func)
91
-
92
- # Try to get type hints
93
- try:
94
- hints = get_type_hints(func, include_extras=True)
95
- except Exception:
96
- hints = {}
97
-
98
- lines = [f"=== {name} ===", "", summary, "", "Parameters:"]
99
-
100
- for param_name, param in sig.parameters.items():
101
- if param_name in ("self", "cls"):
102
- continue
103
-
104
- # Get type and description from Annotated if available
105
- hint = hints.get(param_name)
106
- type_str = "any"
107
- desc = ""
108
-
109
- if hint is not None:
110
- if get_origin(hint) is Annotated:
111
- args = get_args(hint)
112
- if args:
113
- type_str = getattr(args[0], "__name__", str(args[0]))
114
- if len(args) > 1 and isinstance(args[1], str):
115
- desc = args[1]
116
- else:
117
- type_str = getattr(hint, "__name__", str(hint))
118
-
119
- # Check for default
120
- if param.default is not inspect.Parameter.empty:
121
- default_repr = repr(param.default)
122
- if len(default_repr) > 50:
123
- default_repr = default_repr[:47] + "..."
124
- default_str = f" = {default_repr}"
125
- else:
126
- default_str = " (required)"
127
-
128
- lines.append(f" - {param_name}: {type_str}{default_str}")
129
- if desc:
130
- lines.append(f" {desc}")
131
-
132
- # Add examples
133
- lines.append("")
134
- lines.append("Examples:")
135
- if name in _TOOL_EXAMPLES:
136
- simple, advanced = _TOOL_EXAMPLES[name]
137
- lines.append(f" {simple}")
138
- lines.append(f" {advanced}")
139
- else:
140
- lines.append(f" {name}(...)")
141
-
142
- return "\n".join(lines)
143
-
144
-
145
  def _wrap_tool_for_no_arg_usage(func):
146
- """
147
- Wrap a tool function so that calling it with no arguments
148
- returns usage information instead of raising an error.
149
- """
150
- @functools.wraps(func)
151
- def wrapper(*args, **kwargs):
152
- # If called with no arguments, return usage info
153
- if not args and not kwargs:
154
- return _format_tool_usage(func)
155
- return func(*args, **kwargs)
156
-
157
  # Preserve the original function for introspection
158
  wrapper._original_func = func
159
  return wrapper
@@ -164,140 +140,126 @@ def _get_tools_map():
164
  Imports are done here (lazily) to avoid circular imports when app.py loads Agent_Terminal.
165
  """
166
  # Lazy imports to avoid circular import during app startup
167
- from .File_System import File_System
168
- from .Web_Fetch import Web_Fetch
169
  from .Web_Search import Web_Search
 
170
  from .Memory_Manager import Memory_Manager
171
- from .Generate_Speech import Generate_Speech, List_Kokoro_Voices
172
  from .Generate_Image import Generate_Image
173
- from .Generate_Video import Generate_Video
174
- from .Deep_Research import Deep_Research
175
- from .Obsidian_Vault import Obsidian_Vault
176
  from .Shell_Command import Shell_Command
177
  from .Code_Interpreter import Code_Interpreter
178
- from .Agent_Skills import Agent_Skills
179
-
180
  raw_tools = {
181
- "Web_Fetch": Web_Fetch,
182
  "Web_Search": Web_Search,
 
183
  "Memory_Manager": Memory_Manager,
184
- "Generate_Speech": Generate_Speech,
185
- "List_Kokoro_Voices": List_Kokoro_Voices,
186
  "Generate_Image": Generate_Image,
187
- "Generate_Video": Generate_Video,
188
- "Deep_Research": Deep_Research,
189
- "File_System": File_System,
190
- "Obsidian_Vault": Obsidian_Vault,
191
  "Shell_Command": Shell_Command,
192
  "Code_Interpreter": Code_Interpreter,
193
- "Agent_Skills": Agent_Skills,
194
  }
195
  return {name: _wrap_tool_for_no_arg_usage(func) for name, func in raw_tools.items()}
196
-
197
-
198
-
199
- def search_tools(query: str) -> str:
200
- """Search for tools by name or description. Returns usage info for matches."""
201
- query = query.lower()
202
- matches = []
203
- tools = _get_tools_map()
204
- for name, func in tools.items():
205
- # Get original function for docstring if wrapped
206
- original = getattr(func, '_original_func', func)
207
- doc = (original.__doc__ or "").lower()
208
- if query in name.lower() or query in doc:
209
- matches.append((name, func))
210
-
211
- if not matches:
212
- return f"No tools found matching '{query}'."
213
-
214
- output = []
215
- for name, func in matches:
216
- output.append(_format_tool_usage(getattr(func, '_original_func', func)))
217
- output.append("")
218
- return "\n".join(output)
219
-
220
- def _initialize_mock_modules():
221
- """
222
- Registers a mock 'functions' module in sys.modules so that LLMs
223
- can do 'from functions import ...' without error.
224
- Uses wrapped tools that return usage info when called with no args.
225
- """
226
- mock_module = types.ModuleType("functions")
227
-
228
- # Add wrapped tools (return usage when called with no args)
229
- for name, tool in _get_tools_map().items():
230
- setattr(mock_module, name, tool)
231
-
232
- # Add helpers
233
- helpers = {
234
- "search_tools": search_tools,
235
- }
236
- for name, func in helpers.items():
237
- setattr(mock_module, name, func)
238
-
239
- sys.modules["functions"] = mock_module
240
-
241
- # Defer initialization until first use to avoid circular imports during app startup
242
- _mock_modules_initialized = False
243
-
244
- def _ensure_mock_modules():
245
- """Initialize mock modules on first use (deferred to avoid circular imports)."""
246
- global _mock_modules_initialized
247
- if not _mock_modules_initialized:
248
- _initialize_mock_modules()
249
- _mock_modules_initialized = True
250
-
251
  TOOL_SUMMARY = (
252
  "Executes Python code as the unified interface for the entire tools ecosystem. "
253
  "Use Agent Terminal repeatedly whenever you need to chain or combine tool operations. Input must be JSON that will be executed in Python. "
254
- "Available tools: `Web_Fetch`, `Web_Search`, `Code_Interpreter`, `Shell_Command`, `File_System`, `Obsidian_Vault`, `Memory_Manager`, `Generate_Speech`, `Generate_Image`, `Generate_Video`, `Deep_Research`, `Agent_Skills`."
255
  )
256
-
257
-
258
-
259
- @autodoc(
260
- summary=TOOL_SUMMARY,
261
- )
262
- def Agent_Terminal(input: Annotated[str, (
263
- "Python source code to run; stdout is captured and returned. "
264
- "Use `search_tools(`query`)` to search tools by name or capability, returns tool definitions and examples. "
265
- "Call any tool with no arguments to get its full usage info (e.g., `Generate_Image()`)."
266
- )]) -> str:
267
- # Initialize mock modules on first call (deferred to avoid circular imports)
268
- _ensure_mock_modules()
269
-
270
- _log_call_start("Agent_Terminal", input=_truncate_for_log(input or "", 300))
271
- if input is None:
272
- result = "No code provided."
273
- _log_call_end("Agent_Terminal", result)
274
- return result
275
-
276
- # Get wrapped tools that return usage info when called with no args
277
- wrapped_tools = _get_tools_map()
278
-
279
- # Build tools environment to inject
280
- tools_env = {
281
- **wrapped_tools,
282
- "search_tools": search_tools,
283
- }
284
-
285
- # Execute with AST mode to print all expression results
286
- result = sandboxed_exec(input, extra_globals=tools_env, ast_mode=True)
287
- _log_call_end("Agent_Terminal", _truncate_for_log(result))
288
- return result
289
-
290
-
291
- def build_interface() -> gr.Interface:
292
- return gr.Interface(
293
- fn=Agent_Terminal,
294
- inputs=gr.Code(label="Python Code", language="python"),
295
- outputs=gr.Textbox(label="Output", lines=5, max_lines=20),
296
- title="Agent Terminal",
297
- description="<div style=\"text-align:center\">Interact with all other tools via a Python API. Reduces token usage by 90%.</div>",
298
- api_description=TOOL_SUMMARY,
299
- flagging_mode="never",
300
- )
301
-
302
-
303
- __all__ = ["Agent_Terminal", "build_interface"]
 
1
+ from __future__ import annotations
2
+
3
  import sys
4
  import types
5
  import inspect
6
  import functools
7
  from typing import Annotated, get_type_hints, get_origin, get_args
8
+
9
+ import gradio as gr
10
+ from ._docstrings import autodoc
11
+ from ._core import sandboxed_exec
12
+
13
+ from app import _log_call_end, _log_call_start, _truncate_for_log
14
+
15
  # NOTE: Tool imports are deferred to _get_tools_map() to avoid circular imports
16
+ # during app startup.
17
+
18
+
19
+ # Example usages for each tool - simple and advanced
20
  _TOOL_EXAMPLES = {
 
 
 
 
21
  "Web_Search": (
22
  'Web_Search(query="Python tutorials")',
23
  'Web_Search(query="AI news", max_results=10, search_type="news", date_filter="week")',
24
  ),
25
+ "ScrapeGraphAI": (
26
+ 'ScrapeGraphAI(action="extract", url="https://example.com", prompt="Extract the main offer")',
27
+ 'ScrapeGraphAI(action="multi_extract", urls=["https://example.com/a", "https://example.com/b"], prompt="Compare pricing", schema_json={"type":"object","properties":{"plans":{"type":"array","items":{"type":"string"}}}})',
28
+ ),
29
  "Code_Interpreter": (
30
  'Code_Interpreter(code="print(2 + 2)")',
31
  'Code_Interpreter(code="import math; print(math.pi)", timeout=60)',
32
+ ),
33
  "Shell_Command": (
34
  'Shell_Command(command="echo Hello")',
35
  'Shell_Command(command="ls -la", timeout=30)',
36
  ),
 
 
 
 
 
 
 
 
37
  "Memory_Manager": (
38
  'Memory_Manager(action="list")',
39
  'Memory_Manager(action="save", text="Remember this fact", tags="important, facts")',
40
  ),
 
 
 
 
41
  "Generate_Image": (
42
  'Generate_Image(prompt="A sunset over mountains")',
43
  'Generate_Image(prompt="A cyberpunk city", steps=50, cfg_scale=9.0, width=1024, height=768)',
44
  ),
 
 
 
 
 
 
 
 
 
 
 
 
45
  }
46
+
47
+
48
+ def _format_tool_usage(func) -> str:
49
+ """Generate detailed usage information for a tool function."""
50
+ name = func.__name__
51
+ doc = func.__doc__ or "No description available."
52
+
53
+ # Extract just the summary (first paragraph) - skip Args/Returns sections
54
+ # since we generate our own detailed parameter list
55
+ doc_lines = doc.strip().split('\n')
56
+ summary_lines = []
57
+ for line in doc_lines:
58
+ stripped = line.strip().lower()
59
+ # Stop at Args:, Returns:, Parameters:, etc.
60
+ if stripped.startswith(('args:', 'returns:', 'parameters:', 'raises:', 'example:', 'note:', 'notes:')):
61
+ break
62
+ summary_lines.append(line)
63
+ summary = '\n'.join(summary_lines).strip()
64
+
65
+ # Get the signature
66
+ sig = inspect.signature(func)
67
+
68
+ # Try to get type hints
69
+ try:
70
+ hints = get_type_hints(func, include_extras=True)
71
+ except Exception:
72
+ hints = {}
73
+
74
+ lines = [f"=== {name} ===", "", summary, "", "Parameters:"]
75
+
76
+ for param_name, param in sig.parameters.items():
77
+ if param_name in ("self", "cls"):
78
+ continue
79
+
80
+ # Get type and description from Annotated if available
81
+ hint = hints.get(param_name)
82
+ type_str = "any"
83
+ desc = ""
84
+
85
+ if hint is not None:
86
+ if get_origin(hint) is Annotated:
87
+ args = get_args(hint)
88
+ if args:
89
+ type_str = getattr(args[0], "__name__", str(args[0]))
90
+ if len(args) > 1 and isinstance(args[1], str):
91
+ desc = args[1]
92
+ else:
93
+ type_str = getattr(hint, "__name__", str(hint))
94
+
95
+ # Check for default
96
+ if param.default is not inspect.Parameter.empty:
97
+ default_repr = repr(param.default)
98
+ if len(default_repr) > 50:
99
+ default_repr = default_repr[:47] + "..."
100
+ default_str = f" = {default_repr}"
101
+ else:
102
+ default_str = " (required)"
103
+
104
+ lines.append(f" - {param_name}: {type_str}{default_str}")
105
+ if desc:
106
+ lines.append(f" {desc}")
107
+
108
+ # Add examples
109
+ lines.append("")
110
+ lines.append("Examples:")
111
+ if name in _TOOL_EXAMPLES:
112
+ simple, advanced = _TOOL_EXAMPLES[name]
113
+ lines.append(f" {simple}")
114
+ lines.append(f" {advanced}")
115
+ else:
116
+ lines.append(f" {name}(...)")
117
+
118
+ return "\n".join(lines)
119
+
120
+
121
  def _wrap_tool_for_no_arg_usage(func):
122
+ """
123
+ Wrap a tool function so that calling it with no arguments
124
+ returns usage information instead of raising an error.
125
+ """
126
+ @functools.wraps(func)
127
+ def wrapper(*args, **kwargs):
128
+ # If called with no arguments, return usage info
129
+ if not args and not kwargs:
130
+ return _format_tool_usage(func)
131
+ return func(*args, **kwargs)
132
+
133
  # Preserve the original function for introspection
134
  wrapper._original_func = func
135
  return wrapper
 
140
  Imports are done here (lazily) to avoid circular imports when app.py loads Agent_Terminal.
141
  """
142
  # Lazy imports to avoid circular import during app startup
 
 
143
  from .Web_Search import Web_Search
144
+ from .ScrapeGraphAI import ScrapeGraphAI
145
  from .Memory_Manager import Memory_Manager
 
146
  from .Generate_Image import Generate_Image
 
 
 
147
  from .Shell_Command import Shell_Command
148
  from .Code_Interpreter import Code_Interpreter
 
 
149
  raw_tools = {
 
150
  "Web_Search": Web_Search,
151
+ "ScrapeGraphAI": ScrapeGraphAI,
152
  "Memory_Manager": Memory_Manager,
 
 
153
  "Generate_Image": Generate_Image,
 
 
 
 
154
  "Shell_Command": Shell_Command,
155
  "Code_Interpreter": Code_Interpreter,
 
156
  }
157
  return {name: _wrap_tool_for_no_arg_usage(func) for name, func in raw_tools.items()}
158
+
159
+
160
+
161
+ def search_tools(query: str) -> str:
162
+ """Search for tools by name or description. Returns usage info for matches."""
163
+ query = query.lower()
164
+ matches = []
165
+ tools = _get_tools_map()
166
+ for name, func in tools.items():
167
+ # Get original function for docstring if wrapped
168
+ original = getattr(func, '_original_func', func)
169
+ doc = (original.__doc__ or "").lower()
170
+ if query in name.lower() or query in doc:
171
+ matches.append((name, func))
172
+
173
+ if not matches:
174
+ return f"No tools found matching '{query}'."
175
+
176
+ output = []
177
+ for name, func in matches:
178
+ output.append(_format_tool_usage(getattr(func, '_original_func', func)))
179
+ output.append("")
180
+ return "\n".join(output)
181
+
182
+ def _initialize_mock_modules():
183
+ """
184
+ Registers a mock 'functions' module in sys.modules so that LLMs
185
+ can do 'from functions import ...' without error.
186
+ Uses wrapped tools that return usage info when called with no args.
187
+ """
188
+ mock_module = types.ModuleType("functions")
189
+
190
+ # Add wrapped tools (return usage when called with no args)
191
+ for name, tool in _get_tools_map().items():
192
+ setattr(mock_module, name, tool)
193
+
194
+ # Add helpers
195
+ helpers = {
196
+ "search_tools": search_tools,
197
+ }
198
+ for name, func in helpers.items():
199
+ setattr(mock_module, name, func)
200
+
201
+ sys.modules["functions"] = mock_module
202
+
203
+ # Defer initialization until first use to avoid circular imports during app startup
204
+ _mock_modules_initialized = False
205
+
206
+ def _ensure_mock_modules():
207
+ """Initialize mock modules on first use (deferred to avoid circular imports)."""
208
+ global _mock_modules_initialized
209
+ if not _mock_modules_initialized:
210
+ _initialize_mock_modules()
211
+ _mock_modules_initialized = True
212
+
213
  TOOL_SUMMARY = (
214
  "Executes Python code as the unified interface for the entire tools ecosystem. "
215
  "Use Agent Terminal repeatedly whenever you need to chain or combine tool operations. Input must be JSON that will be executed in Python. "
216
+ "Available tools: `Web_Search`, `ScrapeGraphAI`, `Code_Interpreter`, `Shell_Command`, `Memory_Manager`, `Generate_Image`."
217
  )
218
+
219
+
220
+
221
+ @autodoc(
222
+ summary=TOOL_SUMMARY,
223
+ )
224
+ def Agent_Terminal(input: Annotated[str, (
225
+ "Python source code to run; stdout is captured and returned. "
226
+ "Use `search_tools(`query`)` to search tools by name or capability, returns tool definitions and examples. "
227
+ "Call any tool with no arguments to get its full usage info (e.g., `Generate_Image()`)."
228
+ )]) -> str:
229
+ # Initialize mock modules on first call (deferred to avoid circular imports)
230
+ _ensure_mock_modules()
231
+
232
+ _log_call_start("Agent_Terminal", input=_truncate_for_log(input or "", 300))
233
+ if input is None:
234
+ result = "No code provided."
235
+ _log_call_end("Agent_Terminal", result)
236
+ return result
237
+
238
+ # Get wrapped tools that return usage info when called with no args
239
+ wrapped_tools = _get_tools_map()
240
+
241
+ # Build tools environment to inject
242
+ tools_env = {
243
+ **wrapped_tools,
244
+ "search_tools": search_tools,
245
+ }
246
+
247
+ # Execute with AST mode to print all expression results
248
+ result = sandboxed_exec(input, extra_globals=tools_env, ast_mode=True)
249
+ _log_call_end("Agent_Terminal", _truncate_for_log(result))
250
+ return result
251
+
252
+
253
+ def build_interface() -> gr.Interface:
254
+ return gr.Interface(
255
+ fn=Agent_Terminal,
256
+ inputs=gr.Code(label="Python Code", language="python"),
257
+ outputs=gr.Textbox(label="Output", lines=5, max_lines=20),
258
+ title="Agent Terminal",
259
+ description="<div style=\"text-align:center\">Interact with all other tools via a Python API. Reduces token usage by 90%.</div>",
260
+ api_description=TOOL_SUMMARY,
261
+ flagging_mode="never",
262
+ )
263
+
264
+
265
+ __all__ = ["Agent_Terminal", "build_interface"]
Modules/Code_Interpreter.py CHANGED
@@ -1,40 +1,40 @@
1
- from __future__ import annotations
2
-
3
- from typing import Annotated
4
-
5
- import gradio as gr
6
- from ._docstrings import autodoc
7
- from ._core import sandboxed_exec
8
-
9
- from app import _log_call_end, _log_call_start, _truncate_for_log
10
-
11
-
12
- # Single source of truth for the LLM-facing tool description
13
- TOOL_SUMMARY = (
14
- "Execute Python code from the tool root; returns captured stdout or the exception text."
15
- )
16
-
17
-
18
- @autodoc(
19
- summary=TOOL_SUMMARY,
20
- )
21
- def Code_Interpreter(code: Annotated[str, "Python source code to run; stdout is captured and returned."]) -> str:
22
- _log_call_start("Code_Interpreter", code=_truncate_for_log(code or "", 300))
23
- result = sandboxed_exec(code, ast_mode=False)
24
- _log_call_end("Code_Interpreter", _truncate_for_log(result))
25
- return result
26
-
27
-
28
- def build_interface() -> gr.Interface:
29
- return gr.Interface(
30
- fn=Code_Interpreter,
31
- inputs=gr.Code(label="Python Code", language="python"),
32
- outputs=gr.Textbox(label="Output", lines=5, max_lines=20),
33
- title="Code Interpreter",
34
- description="<div style=\"text-align:center\">Execute Python code and see the output.</div>",
35
- api_description=TOOL_SUMMARY,
36
- flagging_mode="never",
37
- )
38
-
39
-
40
  __all__ = ["Code_Interpreter", "build_interface"]
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Annotated
4
+
5
+ import gradio as gr
6
+ from ._docstrings import autodoc
7
+ from ._core import sandboxed_exec
8
+
9
+ from app import _log_call_end, _log_call_start, _truncate_for_log
10
+
11
+
12
+ # Single source of truth for the LLM-facing tool description
13
+ TOOL_SUMMARY = (
14
+ "Execute Python code from the tool root; returns captured stdout or the exception text."
15
+ )
16
+
17
+
18
+ @autodoc(
19
+ summary=TOOL_SUMMARY,
20
+ )
21
+ def Code_Interpreter(code: Annotated[str, "Python source code to run; stdout is captured and returned."]) -> str:
22
+ _log_call_start("Code_Interpreter", code=_truncate_for_log(code or "", 300))
23
+ result = sandboxed_exec(code, ast_mode=False)
24
+ _log_call_end("Code_Interpreter", _truncate_for_log(result))
25
+ return result
26
+
27
+
28
+ def build_interface() -> gr.Interface:
29
+ return gr.Interface(
30
+ fn=Code_Interpreter,
31
+ inputs=gr.Code(label="Python Code", language="python"),
32
+ outputs=gr.Textbox(label="Output", lines=5, max_lines=20),
33
+ title="Code Interpreter",
34
+ description="<div style=\"text-align:center\">Execute Python code and see the output.</div>",
35
+ api_description=TOOL_SUMMARY,
36
+ flagging_mode="never",
37
+ )
38
+
39
+
40
  __all__ = ["Code_Interpreter", "build_interface"]
Modules/Deep_Research.py CHANGED
@@ -1,596 +1,596 @@
1
- from __future__ import annotations
2
-
3
- import os
4
- import re
5
- import tempfile
6
- import time
7
- import uuid
8
- from collections import OrderedDict, deque
9
- from concurrent.futures import Future, ThreadPoolExecutor, as_completed
10
- from datetime import datetime
11
- from typing import Annotated, Callable, Dict, List, Tuple
12
- from urllib.parse import urlparse
13
-
14
- import gradio as gr
15
- import requests
16
- from bs4 import BeautifulSoup
17
- from ddgs import DDGS
18
- from huggingface_hub import InferenceClient
19
-
20
- from .Web_Fetch import _fullpage_markdown_from_soup, _http_get_enhanced
21
- from app import _log_call_end, _log_call_start, _search_rate_limiter, _truncate_for_log
22
- from ._docstrings import autodoc
23
- from .File_System import ROOT_DIR
24
- from ._core import get_hf_token
25
-
26
- HF_TEXTGEN_TOKEN = get_hf_token()
27
-
28
-
29
- # Single source of truth for the LLM-facing tool description
30
- TOOL_SUMMARY = (
31
- "Write a summary of what the user wants to research, and "
32
- "run multiple DuckDuckGo searches (up to 50 max results between all queries), fetch pages, and a Research agent will produce a comprehensive research report with sources; "
33
- "returns (Markdown report, newline-separated source links, downloadable report path). "
34
- "Provide the user with one-paragraph summary of the research report and the txt file in this format `![research_report](URL)`."
35
- )
36
-
37
- RESEARCHER_SYSTEM_PROMPT = (
38
- "You are Nymbot, a helpful deep research assistant. You will be asked a Query from a user and you will create a long, comprehensive, well-structured research report in response to the user's Query.\n\n"
39
- "You will receive a summary of the user question, the search queries used, and the fetched webpages. Follow the guidance below when writing the report.\n\n"
40
- "<report_format>\n"
41
- "Write a well-formatted report in the structure of a scientific report to a broad audience. The report must be readable and have a nice flow of Markdown headers and paragraphs of text. Do NOT use bullet points or lists which break up the natural flow. The report must be exhaustive for comprehensive topics.\n"
42
- "For any given user query, first determine the major themes or areas that need investigation, then structure these as main sections, and develop detailed subsections that explore various facets of each theme. Each section and subsection requires paragraphs of texts that need to all connect into one narrative flow.\n"
43
- "</report_format>\n\n"
44
- "<document_structure>\n"
45
- "- Always begin with a clear title using a single # header\n"
46
- "- Organize content into major sections using ## headers\n"
47
- "- Further divide into subsections using ### headers\n"
48
- "- Use #### headers sparingly for special subsections\n"
49
- "- Never skip header levels\n"
50
- "- Write multiple paragraphs per section or subsection\n"
51
- "- Each paragraph must contain at least 4-5 sentences, present novel insights and analysis grounded in source material, connect ideas to original query, and build upon previous paragraphs to create a narrative flow\n"
52
- "- Never use lists, instead always use text or tables\n\n"
53
- "Mandatory Section Flow:\n"
54
- "1. Title (# level)\n - Before writing the main report, start with one detailed paragraph summarizing key findings\n"
55
- "2. Main Body Sections (## level)\n - Each major topic gets its own section (## level). There MUST BE at least 5 sections.\n - Use ### subsections for detailed analysis\n - Every section or subsection needs at least one paragraph of narrative before moving to the next section\n - Do NOT have a section titled \"Main Body Sections\" and instead pick informative section names that convey the theme of the section\n"
56
- "3. Conclusion (## level)\n - Synthesis of findings\n - Potential recommendations or next steps\n"
57
- "</document_structure>\n\n"
58
- "<planning_rules>\n"
59
- "- Always break it down into multiple steps\n"
60
- "- Assess the different sources and whether they are useful for any steps needed to answer the query\n"
61
- "- Create the best report that weighs all the evidence from the sources\n"
62
- "- Use the current date supplied in the first user message to contextualize findings\n"
63
- "- Make sure that your final report addresses all parts of the query\n"
64
- "- Communicate a brief high-level plan in the introduction; do not reveal chain-of-thought.\n"
65
- "- When referencing sources during analysis, you should still refer to them by index with brackets and follow <citations>\n"
66
- "- As a final step, review your planned report structure and ensure it completely answers the query.\n"
67
- "</planning_rules>\n\n"
68
- )
69
-
70
- FILTERER_SYSTEM_PROMPT = (
71
- "You are Nymbot Filterer, an analyst who selects the most relevant sources for a research task. "
72
- "You will be given a summary of the research topic (and optional search queries) followed by multiple fetched documents. "
73
- "Each document includes its URL and a truncated excerpt. Evaluate how well each source helps answer the research topic. "
74
- "Return only the URLs that should be used for the final research step. Output plain text with exactly one URL per line and no additional commentary, bullets, numbering, or explanations. "
75
- "If no sources are relevant, return an empty string."
76
- )
77
-
78
-
79
- class SlowHost(Exception):
80
- pass
81
-
82
-
83
- def _normalize_query(q: str) -> str:
84
- if not q:
85
- return ""
86
- repl = {"“": '"', "”": '"', "‘": "'", "’": "'", "`": "'"}
87
- for key, value in repl.items():
88
- q = q.replace(key, value)
89
- q = re.sub(r"\s+", " ", q)
90
- q = re.sub(r'"\s+"', " ", q)
91
- q = q.strip().strip('"').strip()
92
- return q
93
-
94
-
95
- def _search_urls_only(query: str, max_results: int) -> list[str]:
96
- if not query or not query.strip() or max_results <= 0:
97
- return []
98
- urls: list[str] = []
99
- try:
100
- _search_rate_limiter.acquire()
101
- with DDGS() as ddgs:
102
- for item in ddgs.text(query, region="wt-wt", safesearch="moderate", max_results=max_results):
103
- url = (item.get("href") or item.get("url") or "").strip()
104
- if url:
105
- urls.append(url)
106
- except Exception:
107
- pass
108
- seen = set()
109
- deduped = []
110
- for url in urls:
111
- if url not in seen:
112
- seen.add(url)
113
- deduped.append(url)
114
- return deduped
115
-
116
-
117
- def _fetch_page_markdown_fast(url: str, max_chars: int = 3000, timeout: float = 10.0) -> str:
118
- try:
119
- resp = _http_get_enhanced(url, timeout=timeout, skip_rate_limit=True)
120
- resp.raise_for_status()
121
- except requests.exceptions.RequestException as exc:
122
- msg = str(exc)
123
- if "timed out" in msg.lower():
124
- raise SlowHost(msg) from exc
125
- return ""
126
- final_url = str(resp.url)
127
- ctype = resp.headers.get("Content-Type", "")
128
- if "html" not in ctype.lower():
129
- return ""
130
- resp.encoding = resp.encoding or resp.apparent_encoding
131
- html = resp.text
132
- soup = BeautifulSoup(html, "lxml")
133
- md_text = _fullpage_markdown_from_soup(soup, final_url, "")
134
- if max_chars > 0 and len(md_text) > max_chars:
135
- md_text = md_text[:max_chars]
136
- return md_text
137
-
138
-
139
- def _truncate_join(parts: List[str], max_chars: int) -> Tuple[str, bool]:
140
- out = []
141
- total = 0
142
- truncated = False
143
- for part in parts:
144
- if not part:
145
- continue
146
- if total + len(part) > max_chars:
147
- out.append(part[: max(0, max_chars - total)])
148
- truncated = True
149
- break
150
- out.append(part)
151
- total += len(part)
152
- return ("\n\n".join(out), truncated)
153
-
154
-
155
- def _build_research_prompt(summary: str, queries: List[str], url_list: List[str], pages_map: Dict[str, str]) -> str:
156
- sources_blocks: List[str] = []
157
- indexed_urls: List[str] = []
158
- for idx, url in enumerate(url_list, start=1):
159
- text = pages_map.get(url, "").strip()
160
- if not text:
161
- continue
162
- indexed_urls.append(f"[{idx}] {url}")
163
- sources_blocks.append(f"[Source {idx}] URL: {url}\n\n{text}")
164
- sources_joined, truncated = _truncate_join(sources_blocks, max_chars=100_000)
165
- prompt_parts: List[str] = []
166
- prompt_parts.append("<user_query_summary>\n" + (summary or "") + "\n</user_query_summary>\n")
167
- populated = [q for q in queries if q and q.strip()]
168
- if populated:
169
- prompt_parts.append("<search_queries>\n" + "\n".join(f"- {q.strip()}" for q in populated) + "\n</search_queries>\n")
170
- if indexed_urls:
171
- prompt_parts.append("<sources_list>\n" + "\n".join(indexed_urls) + "\n</sources_list>\n")
172
- prompt_parts.append("<fetched_documents>\n" + sources_joined + ("\n\n[NOTE] Sources truncated due to context limits." if truncated else "") + "\n</fetched_documents>")
173
- return "\n\n".join(prompt_parts)
174
-
175
-
176
- def _build_filter_prompt(summary: str, queries: List[str], pages_map: Dict[str, str]) -> str:
177
- populated = [q for q in queries if q and q.strip()]
178
- summary_text = summary or ""
179
- prompt_sections: List[str] = []
180
- prompt_sections.append("<research_topic_summary>\n" + summary_text + "\n</research_topic_summary>")
181
- if populated:
182
- prompt_sections.append("<search_queries>\n" + "\n".join(populated) + "\n</search_queries>")
183
- sources: List[str] = []
184
- for idx, (url, text) in enumerate(pages_map.items(), start=1):
185
- content = text.strip()
186
- if not content:
187
- continue
188
- sources.append(f"[Source {idx}] URL: {url}\n\n{content}")
189
- sources_joined, truncated = _truncate_join(sources, max_chars=60_000)
190
- prompt_sections.append("<candidate_sources>\n" + sources_joined + ("\n\n[NOTE] Sources truncated due to context limits." if truncated else "") + "\n</candidate_sources>")
191
- prompt_sections.append(
192
- "<task>\nIdentify which of the provided URLs should be retained for the final research synthesis. "
193
- "Consider coverage, credibility, and relevance to the research topic. "
194
- "Return ONLY the URLs you choose, with one URL per line and no additional text.\n</task>"
195
- )
196
- return "\n\n".join(prompt_sections)
197
-
198
-
199
- def _parse_filterer_output(raw: str, allowed_urls: List[str]) -> List[str]:
200
- if not raw:
201
- return []
202
- allowed_set = {url.strip(): idx for idx, url in enumerate(allowed_urls)}
203
- found_indices: set[int] = set()
204
- for line in raw.splitlines():
205
- candidate = line.strip()
206
- if not candidate:
207
- continue
208
- if candidate in allowed_set:
209
- found_indices.add(allowed_set[candidate])
210
- continue
211
- match = re.search(r"https?://[^\s]+", candidate)
212
- if not match:
213
- continue
214
- url = match.group(0).rstrip(".,);]")
215
- if url in allowed_set:
216
- found_indices.add(allowed_set[url])
217
- selected = [allowed_urls[idx] for idx in sorted(found_indices)]
218
- return selected
219
-
220
-
221
- def _write_report_tmp(text: str) -> str:
222
- filename = f"research_report_{uuid.uuid4().hex}.txt"
223
- path = os.path.join(ROOT_DIR, filename)
224
- with open(path, "w", encoding="utf-8") as file:
225
- file.write(text)
226
- return path
227
-
228
-
229
- def _fetch_pages_within_budget(urls: List[str], char_limit: int, time_left_fn: Callable[[], float]) -> OrderedDict:
230
- pages: dict[str, str] = {}
231
- if not urls:
232
- return OrderedDict()
233
- queue = deque(urls)
234
- attempts: dict[str, int] = {url: 0 for url in urls}
235
- max_attempts = 2
236
- max_workers = min(12, max(4, len(urls)))
237
- in_flight: dict[Future, str] = {}
238
- delayed: list[tuple[float, str]] = []
239
-
240
- def schedule_next(executor: ThreadPoolExecutor) -> None:
241
- while queue and len(in_flight) < max_workers:
242
- url = queue.popleft()
243
- if url in pages:
244
- continue
245
- attempts.setdefault(url, 0)
246
- if attempts[url] >= max_attempts:
247
- continue
248
- attempts[url] += 1
249
- tl = time_left_fn()
250
- if tl <= 0.1:
251
- return
252
- per_timeout = 10.0 if tl > 15 else (5.0 if tl > 8 else 2.0)
253
- future = executor.submit(_fetch_page_markdown_fast, url, char_limit, per_timeout)
254
- in_flight[future] = url
255
-
256
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
257
- schedule_next(executor)
258
- while (in_flight or queue or delayed) and time_left_fn() > 0.2:
259
- now = time.time()
260
- if delayed:
261
- ready: list[tuple[float, str]] = []
262
- not_ready: list[tuple[float, str]] = []
263
- for ready_time, delayed_url in delayed:
264
- (ready if ready_time <= now else not_ready).append((ready_time, delayed_url))
265
- delayed = not_ready
266
- for _, delayed_url in ready:
267
- queue.append(delayed_url)
268
- if ready:
269
- schedule_next(executor)
270
- done = [future for future in list(in_flight.keys()) if future.done()]
271
- if not done:
272
- if not queue and delayed:
273
- next_ready = min((t for t, _ in delayed), default=time.time())
274
- sleep_for = max(0.0, next_ready - time.time())
275
- time.sleep(max(0.02, min(0.25, sleep_for)))
276
- else:
277
- time.sleep(0.05)
278
- continue
279
- for future in done:
280
- url = in_flight.pop(future)
281
- try:
282
- md = future.result()
283
- if md and not md.startswith("Unsupported content type") and not md.startswith("An error occurred"):
284
- pages[url] = md
285
- try:
286
- print(f"[FETCH OK] {url} (chars={len(md)})", flush=True)
287
- except Exception:
288
- pass
289
- except SlowHost:
290
- if time_left_fn() > 5.0:
291
- delayed.append((time.time() + 3.0, url))
292
- except Exception:
293
- pass
294
- schedule_next(executor)
295
- ordered = OrderedDict((url, pages[url]) for url in urls if url in pages)
296
- return ordered
297
-
298
-
299
- @autodoc(
300
- summary=TOOL_SUMMARY,
301
- )
302
- def Deep_Research(
303
- summary: Annotated[str, "Summarization of research topic (one or more sentences)."],
304
- query1: Annotated[str, "DDG Search Query 1"],
305
- max1: Annotated[int, "Max results for Query 1 (1-50)"] = 10,
306
- query2: Annotated[str, "DDG Search Query 2"] = "",
307
- max2: Annotated[int, "Max results for Query 2 (1-50)"] = 10,
308
- query3: Annotated[str, "DDG Search Query 3"] = "",
309
- max3: Annotated[int, "Max results for Query 3 (1-50)"] = 10,
310
- query4: Annotated[str, "DDG Search Query 4"] = "",
311
- max4: Annotated[int, "Max results for Query 4 (1-50)"] = 10,
312
- query5: Annotated[str, "DDG Search Query 5"] = "",
313
- max5: Annotated[int, "Max results for Query 5 (1-50)"] = 10,
314
- ) -> tuple[str, str, str]:
315
- _log_call_start(
316
- "Deep_Research",
317
- summary=_truncate_for_log(summary or "", 200),
318
- queries=[q for q in [query1, query2, query3, query4, query5] if q],
319
- )
320
- if not HF_TEXTGEN_TOKEN:
321
- _log_call_end("Deep_Research", "error=missing HF token")
322
- raise gr.Error("Please provide a `HF_READ_TOKEN` to enable Deep Research.")
323
- queries = [
324
- _normalize_query(query1 or ""),
325
- _normalize_query(query2 or ""),
326
- _normalize_query(query3 or ""),
327
- _normalize_query(query4 or ""),
328
- _normalize_query(query5 or ""),
329
- ]
330
- reqs = [
331
- max(1, min(50, int(max1))),
332
- max(1, min(50, int(max2))),
333
- max(1, min(50, int(max3))),
334
- max(1, min(50, int(max4))),
335
- max(1, min(50, int(max5))),
336
- ]
337
- total_requested = sum(reqs)
338
- if total_requested > 50:
339
- reqs = [10, 10, 10, 10, 10]
340
- start_ts = time.time()
341
- budget_seconds = 55.0
342
- deadline = start_ts + budget_seconds
343
-
344
- def time_left() -> float:
345
- return max(0.0, deadline - time.time())
346
-
347
- now_dt = datetime.now().astimezone()
348
- date_str = now_dt.strftime("%A, %B %d, %Y %I:%M %p %Z").strip()
349
- if not date_str:
350
- date_str = now_dt.isoformat()
351
-
352
- all_urls: list[str] = []
353
- tasks = []
354
- with ThreadPoolExecutor(max_workers=min(5, sum(1 for q in queries if q.strip())) or 1) as executor:
355
- for query, count in zip(queries, reqs):
356
- if not query.strip():
357
- continue
358
- tasks.append(executor.submit(_search_urls_only, query.strip(), count))
359
- for future in as_completed(tasks):
360
- try:
361
- urls = future.result() or []
362
- except Exception:
363
- urls = []
364
- for url in urls:
365
- if url not in all_urls:
366
- all_urls.append(url)
367
- if len(all_urls) >= 50:
368
- break
369
- if time_left() <= 0.5:
370
- break
371
- if len(all_urls) > 50:
372
- all_urls = all_urls[:50]
373
- blacklist = {
374
- "homedepot.com",
375
- "tractorsupply.com",
376
- "mcmaster.com",
377
- "mrchain.com",
378
- "answers.com",
379
- "city-data.com",
380
- "dictionary.cambridge.org",
381
- }
382
-
383
- def _domain(url: str) -> str:
384
- try:
385
- return urlparse(url).netloc.lower()
386
- except Exception:
387
- return ""
388
-
389
- all_urls = [url for url in all_urls if _domain(url) not in blacklist]
390
- skip_exts = (
391
- ".pdf",
392
- ".ppt",
393
- ".pptx",
394
- ".doc",
395
- ".docx",
396
- ".xls",
397
- ".xlsx",
398
- ".zip",
399
- ".gz",
400
- ".tgz",
401
- ".bz2",
402
- ".7z",
403
- ".rar",
404
- )
405
-
406
- def _skip_url(url: str) -> bool:
407
- try:
408
- path = urlparse(url).path.lower()
409
- except Exception:
410
- return False
411
- return any(path.endswith(ext) for ext in skip_exts)
412
-
413
- all_urls = [url for url in all_urls if not _skip_url(url)]
414
- truncated_pages = OrderedDict()
415
- if all_urls and time_left() > 0.2:
416
- truncated_pages = _fetch_pages_within_budget(all_urls, 3000, time_left)
417
- print(
418
- f"[PIPELINE] Initial fetch complete: candidates={len(all_urls)}, truncated_documents={len(truncated_pages)}, time_left={time_left():.2f}s",
419
- flush=True,
420
- )
421
-
422
- def _invoke_chat(messages, provider: str, max_tokens: int, temp: float, top_p: float):
423
- client = InferenceClient(provider=provider, api_key=HF_TEXTGEN_TOKEN)
424
- return client.chat.completions.create(
425
- model="zai-org/GLM-4.7",
426
- messages=messages,
427
- max_tokens=max_tokens,
428
- temperature=temp,
429
- top_p=top_p,
430
- )
431
-
432
- filtered_urls: List[str] = list(truncated_pages.keys())
433
- filter_output = ""
434
- filter_used_fallback = False
435
- filter_success = False
436
- if truncated_pages and time_left() > 3.0:
437
- filter_prompt = _build_filter_prompt(summary or "", [q for q in queries if q.strip()], truncated_pages)
438
- filter_messages = [
439
- {"role": "system", "content": FILTERER_SYSTEM_PROMPT},
440
- {"role": "user", "content": f"The current date is {date_str}. Consider how recent each source is when deciding relevance."},
441
- {"role": "user", "content": filter_prompt},
442
- ]
443
- filter_completion = None
444
- try:
445
- print("[FILTER] Attempt 1: provider=cerebras, max_tokens=2048", flush=True)
446
- filter_completion = _invoke_chat(filter_messages, "cerebras", 2048, 0.2, 0.9)
447
- except Exception as exc1:
448
- print(f"[FILTER] Attempt 1 failed: {str(exc1)[:200]}", flush=True)
449
- try:
450
- print("[FILTER] Attempt 2: provider=auto, max_tokens=2048", flush=True)
451
- filter_completion = _invoke_chat(filter_messages, "auto", 2048, 0.2, 0.9)
452
- except Exception as exc2:
453
- print(f"[FILTER] Attempt 2 failed: {str(exc2)[:200]}", flush=True)
454
- if filter_completion and filter_completion.choices:
455
- filter_output = filter_completion.choices[0].message.content or ""
456
- filtered_urls = _parse_filterer_output(filter_output, list(truncated_pages.keys()))
457
- filter_success = bool(filter_output.strip()) and bool(filtered_urls)
458
- if not filtered_urls:
459
- filter_used_fallback = True
460
- fallback_count = min(8, len(truncated_pages))
461
- filtered_urls = list(truncated_pages.keys())[:fallback_count]
462
- max_final_urls = 20
463
- if len(filtered_urls) > max_final_urls:
464
- filter_used_fallback = True
465
- filtered_urls = filtered_urls[:max_final_urls]
466
- if not filter_success:
467
- filter_used_fallback = True
468
- print(
469
- f"[FILTER] Selected URLs={len(filtered_urls)}, fallback={filter_used_fallback}, time_left={time_left():.2f}s",
470
- flush=True,
471
- )
472
-
473
- final_pages_fetched = OrderedDict()
474
- if filtered_urls and time_left() > 0.2:
475
- final_pages_fetched = _fetch_pages_within_budget(filtered_urls, 8000, time_left)
476
- merged_pages = OrderedDict()
477
- for url in filtered_urls:
478
- content = final_pages_fetched.get(url) or truncated_pages.get(url) or ""
479
- if content:
480
- merged_pages[url] = content
481
- pages = merged_pages
482
- print(
483
- f"[PIPELINE] Final fetch complete: retained_documents={len(pages)}, time_left={time_left():.2f}s",
484
- flush=True,
485
- )
486
- prompt = _build_research_prompt(summary=summary or "", queries=[q for q in queries if q.strip()], url_list=list(pages.keys()), pages_map=pages)
487
- system_message = {"role": "system", "content": RESEARCHER_SYSTEM_PROMPT}
488
- date_message = {"role": "user", "content": f"The current date is {date_str}. Return only the research report."}
489
- messages = [
490
- system_message,
491
- date_message,
492
- {"role": "user", "content": prompt},
493
- ]
494
- try:
495
- prompt_chars = len(prompt)
496
- except Exception:
497
- prompt_chars = -1
498
- print(f"[PIPELINE] Fetch complete: pages={len(pages)}, unique_urls={len(pages.keys())}, prompt_chars={prompt_chars}", flush=True)
499
- print("[PIPELINE] Starting inference (provider=cerebras, model=zai-org/GLM-4.7)", flush=True)
500
-
501
- try:
502
- print("[LLM] Attempt 1: provider=cerebras, max_tokens=32768", flush=True)
503
- completion = _invoke_chat(messages, "cerebras", max_tokens=32768, temp=0.3, top_p=0.95)
504
- except Exception as exc1:
505
- print(f"[LLM] Attempt 1 failed: {str(exc1)[:200]}", flush=True)
506
- try:
507
- prompt2 = _build_research_prompt(
508
- summary=summary or "",
509
- queries=[q for q in queries if q.strip()],
510
- url_list=list(pages.keys())[:30],
511
- pages_map={key: pages[key] for key in list(pages.keys())[:30]},
512
- )
513
- messages = [
514
- system_message,
515
- date_message,
516
- {"role": "user", "content": prompt2},
517
- ]
518
- print("[LLM] Attempt 2: provider=cerebras (trimmed), max_tokens=16384", flush=True)
519
- completion = _invoke_chat(messages, "cerebras", max_tokens=16384, temp=0.7, top_p=0.95)
520
- except Exception as exc2:
521
- print(f"[LLM] Attempt 2 failed: {str(exc2)[:200]}", flush=True)
522
- try:
523
- print("[LLM] Attempt 3: provider=auto, max_tokens=8192", flush=True)
524
- completion = _invoke_chat(messages, "auto", max_tokens=8192, temp=0.7, top_p=0.95)
525
- except Exception as exc3:
526
- _log_call_end("Deep_Research", f"error={_truncate_for_log(str(exc3), 260)}")
527
- raise gr.Error(f"Researcher model call failed: {exc3}")
528
- raw = completion.choices[0].message.content or ""
529
- try:
530
- no_think = re.sub(r"<think>[\s\S]*?<\\/think>", "", raw, flags=re.IGNORECASE)
531
- no_think = re.sub(r"<\\/?think>", "", no_think, flags=re.IGNORECASE)
532
- except Exception:
533
- no_think = raw
534
- try:
535
- paragraphs = [p for p in re.split(r"\n\s*\n", no_think) if p.strip()]
536
- keep: List[str] = []
537
- removed = 0
538
- planning_re = re.compile(r"\b(let me|now i(?:'ll| will)?|first,|i will now|i will|i'll|let's|now let me|i need to|now i'll|now i will)\b", re.IGNORECASE)
539
- for paragraph in paragraphs:
540
- if planning_re.search(paragraph):
541
- removed += 1
542
- continue
543
- keep.append(paragraph)
544
- report = "\n\n".join(keep).strip()
545
- if not report:
546
- report = no_think.strip()
547
- except Exception:
548
- report = no_think
549
- removed = 0
550
- report = re.sub(r"\n\s*\n\s*\n+", "\n\n", report)
551
- try:
552
- print(f"[POSTPROCESS] removed_planning_paragraphs={removed}, raw_chars={len(raw)}, final_chars={len(report)}", flush=True)
553
- except Exception:
554
- pass
555
- links_text = "\n".join([f"[{i+1}] {url}" for i, url in enumerate(pages.keys())])
556
- if links_text:
557
- sources_section = "\n\n## Sources\n" + "\n".join([f"[{i+1}] {url}" for i, url in enumerate(pages.keys())])
558
- report = report.rstrip() + sources_section
559
- file_path = _write_report_tmp(report)
560
- elapsed = time.time() - start_ts
561
- print(f"[TIMING] Deep_Research elapsed: {elapsed:.2f}s", flush=True)
562
- _log_call_end("Deep_Research", f"urls={len(pages)} file={os.path.basename(file_path)} duration={elapsed:.2f}s")
563
- return report, links_text, file_path
564
-
565
-
566
- def build_interface() -> gr.Interface:
567
- return gr.Interface(
568
- fn=Deep_Research,
569
- inputs=[
570
- gr.Textbox(label="Summarization of research topic", lines=3, placeholder="Briefly summarize the research topic or user question", info="Summarization of research topic (one or more sentences)"),
571
- gr.Textbox(label="DDG Search Query 1", max_lines=1, info="DDG Search Query 1"),
572
- gr.Slider(1, 50, value=10, step=1, label="Max results (Q1)", info="Max results for Query 1 (1-50)"),
573
- gr.Textbox(label="DDG Search Query 2", value="", max_lines=1, info="DDG Search Query 2"),
574
- gr.Slider(1, 50, value=10, step=1, label="Max results (Q2)", info="Max results for Query 2 (1-50)"),
575
- gr.Textbox(label="DDG Search Query 3", value="", max_lines=1, info="DDG Search Query 3"),
576
- gr.Slider(1, 50, value=10, step=1, label="Max results (Q3)", info="Max results for Query 3 (1-50)"),
577
- gr.Textbox(label="DDG Search Query 4", value="", max_lines=1, info="DDG Search Query 4"),
578
- gr.Slider(1, 50, value=10, step=1, label="Max results (Q4)", info="Max results for Query 4 (1-50)"),
579
- gr.Textbox(label="DDG Search Query 5", value="", max_lines=1, info="DDG Search Query 5"),
580
- gr.Slider(1, 50, value=10, step=1, label="Max results (Q5)", info="Max results for Query 5 (1-50)"),
581
- ],
582
- outputs=[
583
- gr.Markdown(label="Research Report"),
584
- gr.Textbox(label="Fetched Links", lines=8),
585
- gr.File(label="Download Research Report", file_count="single"),
586
- ],
587
- title="Deep Research",
588
- description=(
589
- "<div style=\"text-align:center\">Generate a research report based on dozens of sources. Default model is GLM-4.7</div>"
590
- ),
591
- api_description=TOOL_SUMMARY,
592
- flagging_mode="never",
593
- )
594
-
595
-
596
- __all__ = ["Deep_Research", "build_interface"]
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import re
5
+ import tempfile
6
+ import time
7
+ import uuid
8
+ from collections import OrderedDict, deque
9
+ from concurrent.futures import Future, ThreadPoolExecutor, as_completed
10
+ from datetime import datetime
11
+ from typing import Annotated, Callable, Dict, List, Tuple
12
+ from urllib.parse import urlparse
13
+
14
+ import gradio as gr
15
+ import requests
16
+ from bs4 import BeautifulSoup
17
+ from ddgs import DDGS
18
+ from huggingface_hub import InferenceClient
19
+
20
+ from .Web_Fetch import _fullpage_markdown_from_soup, _http_get_enhanced
21
+ from app import _log_call_end, _log_call_start, _search_rate_limiter, _truncate_for_log
22
+ from ._docstrings import autodoc
23
+ from .File_System import ROOT_DIR
24
+ from ._core import get_hf_token
25
+
26
+ HF_TEXTGEN_TOKEN = get_hf_token()
27
+
28
+
29
+ # Single source of truth for the LLM-facing tool description
30
+ TOOL_SUMMARY = (
31
+ "Write a summary of what the user wants to research, and "
32
+ "run multiple DuckDuckGo searches (up to 50 max results between all queries), fetch pages, and a Research agent will produce a comprehensive research report with sources; "
33
+ "returns (Markdown report, newline-separated source links, downloadable report path). "
34
+ "Provide the user with one-paragraph summary of the research report and the txt file in this format `![research_report](URL)`."
35
+ )
36
+
37
+ RESEARCHER_SYSTEM_PROMPT = (
38
+ "You are Nymbot, a helpful deep research assistant. You will be asked a Query from a user and you will create a long, comprehensive, well-structured research report in response to the user's Query.\n\n"
39
+ "You will receive a summary of the user question, the search queries used, and the fetched webpages. Follow the guidance below when writing the report.\n\n"
40
+ "<report_format>\n"
41
+ "Write a well-formatted report in the structure of a scientific report to a broad audience. The report must be readable and have a nice flow of Markdown headers and paragraphs of text. Do NOT use bullet points or lists which break up the natural flow. The report must be exhaustive for comprehensive topics.\n"
42
+ "For any given user query, first determine the major themes or areas that need investigation, then structure these as main sections, and develop detailed subsections that explore various facets of each theme. Each section and subsection requires paragraphs of texts that need to all connect into one narrative flow.\n"
43
+ "</report_format>\n\n"
44
+ "<document_structure>\n"
45
+ "- Always begin with a clear title using a single # header\n"
46
+ "- Organize content into major sections using ## headers\n"
47
+ "- Further divide into subsections using ### headers\n"
48
+ "- Use #### headers sparingly for special subsections\n"
49
+ "- Never skip header levels\n"
50
+ "- Write multiple paragraphs per section or subsection\n"
51
+ "- Each paragraph must contain at least 4-5 sentences, present novel insights and analysis grounded in source material, connect ideas to original query, and build upon previous paragraphs to create a narrative flow\n"
52
+ "- Never use lists, instead always use text or tables\n\n"
53
+ "Mandatory Section Flow:\n"
54
+ "1. Title (# level)\n - Before writing the main report, start with one detailed paragraph summarizing key findings\n"
55
+ "2. Main Body Sections (## level)\n - Each major topic gets its own section (## level). There MUST BE at least 5 sections.\n - Use ### subsections for detailed analysis\n - Every section or subsection needs at least one paragraph of narrative before moving to the next section\n - Do NOT have a section titled \"Main Body Sections\" and instead pick informative section names that convey the theme of the section\n"
56
+ "3. Conclusion (## level)\n - Synthesis of findings\n - Potential recommendations or next steps\n"
57
+ "</document_structure>\n\n"
58
+ "<planning_rules>\n"
59
+ "- Always break it down into multiple steps\n"
60
+ "- Assess the different sources and whether they are useful for any steps needed to answer the query\n"
61
+ "- Create the best report that weighs all the evidence from the sources\n"
62
+ "- Use the current date supplied in the first user message to contextualize findings\n"
63
+ "- Make sure that your final report addresses all parts of the query\n"
64
+ "- Communicate a brief high-level plan in the introduction; do not reveal chain-of-thought.\n"
65
+ "- When referencing sources during analysis, you should still refer to them by index with brackets and follow <citations>\n"
66
+ "- As a final step, review your planned report structure and ensure it completely answers the query.\n"
67
+ "</planning_rules>\n\n"
68
+ )
69
+
70
+ FILTERER_SYSTEM_PROMPT = (
71
+ "You are Nymbot Filterer, an analyst who selects the most relevant sources for a research task. "
72
+ "You will be given a summary of the research topic (and optional search queries) followed by multiple fetched documents. "
73
+ "Each document includes its URL and a truncated excerpt. Evaluate how well each source helps answer the research topic. "
74
+ "Return only the URLs that should be used for the final research step. Output plain text with exactly one URL per line and no additional commentary, bullets, numbering, or explanations. "
75
+ "If no sources are relevant, return an empty string."
76
+ )
77
+
78
+
79
+ class SlowHost(Exception):
80
+ pass
81
+
82
+
83
+ def _normalize_query(q: str) -> str:
84
+ if not q:
85
+ return ""
86
+ repl = {"“": '"', "”": '"', "‘": "'", "’": "'", "`": "'"}
87
+ for key, value in repl.items():
88
+ q = q.replace(key, value)
89
+ q = re.sub(r"\s+", " ", q)
90
+ q = re.sub(r'"\s+"', " ", q)
91
+ q = q.strip().strip('"').strip()
92
+ return q
93
+
94
+
95
+ def _search_urls_only(query: str, max_results: int) -> list[str]:
96
+ if not query or not query.strip() or max_results <= 0:
97
+ return []
98
+ urls: list[str] = []
99
+ try:
100
+ _search_rate_limiter.acquire()
101
+ with DDGS() as ddgs:
102
+ for item in ddgs.text(query, region="wt-wt", safesearch="moderate", max_results=max_results):
103
+ url = (item.get("href") or item.get("url") or "").strip()
104
+ if url:
105
+ urls.append(url)
106
+ except Exception:
107
+ pass
108
+ seen = set()
109
+ deduped = []
110
+ for url in urls:
111
+ if url not in seen:
112
+ seen.add(url)
113
+ deduped.append(url)
114
+ return deduped
115
+
116
+
117
def _fetch_page_markdown_fast(url: str, max_chars: int = 3000, timeout: float = 10.0) -> str:
    """Fetch an HTML page and return it converted to Markdown, truncated.

    Args:
        url: Page to fetch.
        max_chars: Maximum length of the returned text; values <= 0 disable
            truncation.
        timeout: Timeout forwarded to the HTTP helper.

    Returns:
        The Markdown text, or ``""`` when the request fails for a reason other
        than a timeout or the response is not HTML.

    Raises:
        SlowHost: If the request timed out, so the caller can decide whether
            to retry later.
    """
    try:
        resp = _http_get_enhanced(url, timeout=timeout, skip_rate_limit=True)
        resp.raise_for_status()
    except requests.exceptions.RequestException as exc:
        msg = str(exc)
        # Prefer the exception type (covers connect and read timeouts); keep the
        # message check for timeouts that surface wrapped in another error.
        if isinstance(exc, requests.exceptions.Timeout) or "timed out" in msg.lower():
            raise SlowHost(msg) from exc
        return ""
    final_url = str(resp.url)
    ctype = resp.headers.get("Content-Type", "")
    if "html" not in ctype.lower():
        return ""
    # Fall back to the detected encoding when the server declared none.
    resp.encoding = resp.encoding or resp.apparent_encoding
    html = resp.text
    soup = BeautifulSoup(html, "lxml")
    md_text = _fullpage_markdown_from_soup(soup, final_url, "")
    if max_chars > 0 and len(md_text) > max_chars:
        md_text = md_text[:max_chars]
    return md_text
137
+
138
+
139
+ def _truncate_join(parts: List[str], max_chars: int) -> Tuple[str, bool]:
140
+ out = []
141
+ total = 0
142
+ truncated = False
143
+ for part in parts:
144
+ if not part:
145
+ continue
146
+ if total + len(part) > max_chars:
147
+ out.append(part[: max(0, max_chars - total)])
148
+ truncated = True
149
+ break
150
+ out.append(part)
151
+ total += len(part)
152
+ return ("\n\n".join(out), truncated)
153
+
154
+
155
def _build_research_prompt(summary: str, queries: List[str], url_list: List[str], pages_map: Dict[str, str]) -> str:
    """Assemble the user prompt for the final research step.

    The prompt contains, in order: the topic summary, the non-blank search
    queries as a bullet list, a numbered list of source URLs, and the fetched
    documents (joined with a 100,000 character budget).

    Args:
        summary: Research topic summary; falsy values become an empty string.
        queries: Search queries; blank entries are ignored.
        url_list: URLs in citation order; numbering starts at 1.
        pages_map: Mapping of URL to fetched page text.

    Returns:
        The prompt sections joined by blank lines.
    """
    # URLs without fetched text are dropped, but they still consume their
    # position number, so indices follow url_list rather than being compacted.
    usable = [
        (position, url, body)
        for position, url, body in (
            (position, url, pages_map.get(url, "").strip())
            for position, url in enumerate(url_list, start=1)
        )
        if body
    ]
    index_lines = [f"[{position}] {url}" for position, url, _ in usable]
    documents = [f"[Source {position}] URL: {url}\n\n{body}" for position, url, body in usable]
    documents_text, was_cut = _truncate_join(documents, max_chars=100_000)

    sections: List[str] = ["<user_query_summary>\n" + (summary or "") + "\n</user_query_summary>\n"]
    active_queries = [q for q in queries if q and q.strip()]
    if active_queries:
        bullet_lines = "\n".join(f"- {q.strip()}" for q in active_queries)
        sections.append("<search_queries>\n" + bullet_lines + "\n</search_queries>\n")
    if index_lines:
        sections.append("<sources_list>\n" + "\n".join(index_lines) + "\n</sources_list>\n")
    truncation_note = "\n\n[NOTE] Sources truncated due to context limits." if was_cut else ""
    sections.append("<fetched_documents>\n" + documents_text + truncation_note + "\n</fetched_documents>")
    return "\n\n".join(sections)
174
+
175
+
176
def _build_filter_prompt(summary: str, queries: List[str], pages_map: Dict[str, str]) -> str:
    """Assemble the user prompt for the source-filtering step.

    The prompt contains the topic summary, the non-blank search queries (one
    per line), the candidate sources (joined with a 60,000 character budget)
    and the task instructions.

    Args:
        summary: Research topic summary; falsy values become an empty string.
        queries: Search queries; blank entries are ignored.
        pages_map: Mapping of URL to excerpt text, in candidate order.

    Returns:
        The prompt sections joined by blank lines.
    """
    active_queries = [q for q in queries if q and q.strip()]
    sections: List[str] = ["<research_topic_summary>\n" + (summary or "") + "\n</research_topic_summary>"]
    if active_queries:
        sections.append("<search_queries>\n" + "\n".join(active_queries) + "\n</search_queries>")

    # Entries with blank text are skipped but keep their position number.
    candidates: List[str] = [
        f"[Source {position}] URL: {url}\n\n{excerpt}"
        for position, url, excerpt in (
            (position, url, text.strip())
            for position, (url, text) in enumerate(pages_map.items(), start=1)
        )
        if excerpt
    ]
    candidates_text, was_cut = _truncate_join(candidates, max_chars=60_000)
    truncation_note = "\n\n[NOTE] Sources truncated due to context limits." if was_cut else ""
    sections.append("<candidate_sources>\n" + candidates_text + truncation_note + "\n</candidate_sources>")

    task_text = (
        "<task>\nIdentify which of the provided URLs should be retained for the final research synthesis. "
        "Consider coverage, credibility, and relevance to the research topic. "
        "Return ONLY the URLs you choose, with one URL per line and no additional text.\n</task>"
    )
    sections.append(task_text)
    return "\n\n".join(sections)
197
+
198
+
199
+ def _parse_filterer_output(raw: str, allowed_urls: List[str]) -> List[str]:
200
+ if not raw:
201
+ return []
202
+ allowed_set = {url.strip(): idx for idx, url in enumerate(allowed_urls)}
203
+ found_indices: set[int] = set()
204
+ for line in raw.splitlines():
205
+ candidate = line.strip()
206
+ if not candidate:
207
+ continue
208
+ if candidate in allowed_set:
209
+ found_indices.add(allowed_set[candidate])
210
+ continue
211
+ match = re.search(r"https?://[^\s]+", candidate)
212
+ if not match:
213
+ continue
214
+ url = match.group(0).rstrip(".,);]")
215
+ if url in allowed_set:
216
+ found_indices.add(allowed_set[url])
217
+ selected = [allowed_urls[idx] for idx in sorted(found_indices)]
218
+ return selected
219
+
220
+
221
def _write_report_tmp(text: str) -> str:
    """Write ``text`` to a uniquely named UTF-8 file under ``ROOT_DIR``.

    Returns:
        The path of the file that was written.
    """
    unique_name = "research_report_" + uuid.uuid4().hex + ".txt"
    report_path = os.path.join(ROOT_DIR, unique_name)
    with open(report_path, mode="w", encoding="utf-8") as handle:
        handle.write(text)
    return report_path
227
+
228
+
229
+ def _fetch_pages_within_budget(urls: List[str], char_limit: int, time_left_fn: Callable[[], float]) -> OrderedDict:
230
+ pages: dict[str, str] = {}
231
+ if not urls:
232
+ return OrderedDict()
233
+ queue = deque(urls)
234
+ attempts: dict[str, int] = {url: 0 for url in urls}
235
+ max_attempts = 2
236
+ max_workers = min(12, max(4, len(urls)))
237
+ in_flight: dict[Future, str] = {}
238
+ delayed: list[tuple[float, str]] = []
239
+
240
+ def schedule_next(executor: ThreadPoolExecutor) -> None:
241
+ while queue and len(in_flight) < max_workers:
242
+ url = queue.popleft()
243
+ if url in pages:
244
+ continue
245
+ attempts.setdefault(url, 0)
246
+ if attempts[url] >= max_attempts:
247
+ continue
248
+ attempts[url] += 1
249
+ tl = time_left_fn()
250
+ if tl <= 0.1:
251
+ return
252
+ per_timeout = 10.0 if tl > 15 else (5.0 if tl > 8 else 2.0)
253
+ future = executor.submit(_fetch_page_markdown_fast, url, char_limit, per_timeout)
254
+ in_flight[future] = url
255
+
256
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
257
+ schedule_next(executor)
258
+ while (in_flight or queue or delayed) and time_left_fn() > 0.2:
259
+ now = time.time()
260
+ if delayed:
261
+ ready: list[tuple[float, str]] = []
262
+ not_ready: list[tuple[float, str]] = []
263
+ for ready_time, delayed_url in delayed:
264
+ (ready if ready_time <= now else not_ready).append((ready_time, delayed_url))
265
+ delayed = not_ready
266
+ for _, delayed_url in ready:
267
+ queue.append(delayed_url)
268
+ if ready:
269
+ schedule_next(executor)
270
+ done = [future for future in list(in_flight.keys()) if future.done()]
271
+ if not done:
272
+ if not queue and delayed:
273
+ next_ready = min((t for t, _ in delayed), default=time.time())
274
+ sleep_for = max(0.0, next_ready - time.time())
275
+ time.sleep(max(0.02, min(0.25, sleep_for)))
276
+ else:
277
+ time.sleep(0.05)
278
+ continue
279
+ for future in done:
280
+ url = in_flight.pop(future)
281
+ try:
282
+ md = future.result()
283
+ if md and not md.startswith("Unsupported content type") and not md.startswith("An error occurred"):
284
+ pages[url] = md
285
+ try:
286
+ print(f"[FETCH OK] {url} (chars={len(md)})", flush=True)
287
+ except Exception:
288
+ pass
289
+ except SlowHost:
290
+ if time_left_fn() > 5.0:
291
+ delayed.append((time.time() + 3.0, url))
292
+ except Exception:
293
+ pass
294
+ schedule_next(executor)
295
+ ordered = OrderedDict((url, pages[url]) for url in urls if url in pages)
296
+ return ordered
297
+
298
+
299
@autodoc(
    summary=TOOL_SUMMARY,
)
def Deep_Research(
    summary: Annotated[str, "Summarization of research topic (one or more sentences)."],
    query1: Annotated[str, "DDG Search Query 1"],
    max1: Annotated[int, "Max results for Query 1 (1-50)"] = 10,
    query2: Annotated[str, "DDG Search Query 2"] = "",
    max2: Annotated[int, "Max results for Query 2 (1-50)"] = 10,
    query3: Annotated[str, "DDG Search Query 3"] = "",
    max3: Annotated[int, "Max results for Query 3 (1-50)"] = 10,
    query4: Annotated[str, "DDG Search Query 4"] = "",
    max4: Annotated[int, "Max results for Query 4 (1-50)"] = 10,
    query5: Annotated[str, "DDG Search Query 5"] = "",
    max5: Annotated[int, "Max results for Query 5 (1-50)"] = 10,
) -> tuple[str, str, str]:
    # NOTE(review): no docstring is written here because @autodoc presumably
    # generates it from TOOL_SUMMARY and the Annotated metadata — confirm.
    #
    # Pipeline: search -> fetch short excerpts -> LLM filter -> fetch longer
    # text for the retained URLs -> LLM report -> post-process -> write file.
    # Returns (report markdown, numbered list of source links, report path).
    _log_call_start(
        "Deep_Research",
        summary=_truncate_for_log(summary or "", 200),
        queries=[q for q in [query1, query2, query3, query4, query5] if q],
    )
    if not HF_TEXTGEN_TOKEN:
        _log_call_end("Deep_Research", "error=missing HF token")
        raise gr.Error("Please provide a `HF_READ_TOKEN` to enable Deep Research.")
    queries = [
        _normalize_query(query1 or ""),
        _normalize_query(query2 or ""),
        _normalize_query(query3 or ""),
        _normalize_query(query4 or ""),
        _normalize_query(query5 or ""),
    ]
    reqs = [
        max(1, min(50, int(max1))),
        max(1, min(50, int(max2))),
        max(1, min(50, int(max3))),
        max(1, min(50, int(max4))),
        max(1, min(50, int(max5))),
    ]
    # When the combined request exceeds 50 results, fall back to 10 per query.
    total_requested = sum(reqs)
    if total_requested > 50:
        reqs = [10, 10, 10, 10, 10]
    start_ts = time.time()
    budget_seconds = 55.0
    deadline = start_ts + budget_seconds

    def time_left() -> float:
        return max(0.0, deadline - time.time())

    now_dt = datetime.now().astimezone()
    date_str = now_dt.strftime("%A, %B %d, %Y %I:%M %p %Z").strip()
    if not date_str:
        date_str = now_dt.isoformat()

    # --- Stage 1: run the searches in parallel and collect up to 50 URLs. ---
    all_urls: list[str] = []
    tasks = []
    with ThreadPoolExecutor(max_workers=min(5, sum(1 for q in queries if q.strip())) or 1) as executor:
        for query, count in zip(queries, reqs):
            if not query.strip():
                continue
            tasks.append(executor.submit(_search_urls_only, query.strip(), count))
        for future in as_completed(tasks):
            try:
                urls = future.result() or []
            except Exception:
                urls = []
            for url in urls:
                if url not in all_urls:
                    all_urls.append(url)
                if len(all_urls) >= 50:
                    break
            if time_left() <= 0.5:
                break
    if len(all_urls) > 50:
        all_urls = all_urls[:50]
    blacklist = {
        "homedepot.com",
        "tractorsupply.com",
        "mcmaster.com",
        "mrchain.com",
        "answers.com",
        "city-data.com",
        "dictionary.cambridge.org",
    }

    def _domain(url: str) -> str:
        # hostname (unlike netloc) excludes credentials and the port.
        try:
            return (urlparse(url).hostname or "").lower()
        except Exception:
            return ""

    def _is_blacklisted(url: str) -> bool:
        # Match the listed domain itself and any subdomain (e.g. "www.").
        host = _domain(url)
        return any(host == blocked or host.endswith("." + blocked) for blocked in blacklist)

    all_urls = [url for url in all_urls if not _is_blacklisted(url)]
    skip_exts = (
        ".pdf",
        ".ppt",
        ".pptx",
        ".doc",
        ".docx",
        ".xls",
        ".xlsx",
        ".zip",
        ".gz",
        ".tgz",
        ".bz2",
        ".7z",
        ".rar",
    )

    def _skip_url(url: str) -> bool:
        try:
            path = urlparse(url).path.lower()
        except Exception:
            return False
        return path.endswith(skip_exts)

    all_urls = [url for url in all_urls if not _skip_url(url)]

    # --- Stage 2: fetch short excerpts of every candidate. ---
    truncated_pages = OrderedDict()
    if all_urls and time_left() > 0.2:
        truncated_pages = _fetch_pages_within_budget(all_urls, 3000, time_left)
    print(
        f"[PIPELINE] Initial fetch complete: candidates={len(all_urls)}, truncated_documents={len(truncated_pages)}, time_left={time_left():.2f}s",
        flush=True,
    )

    def _invoke_chat(messages, provider: str, max_tokens: int, temp: float, top_p: float):
        client = InferenceClient(provider=provider, api_key=HF_TEXTGEN_TOKEN)
        return client.chat.completions.create(
            model="zai-org/GLM-4.7",
            messages=messages,
            max_tokens=max_tokens,
            temperature=temp,
            top_p=top_p,
        )

    # --- Stage 3: ask the filter model which sources to keep. ---
    filtered_urls: List[str] = list(truncated_pages.keys())
    filter_output = ""
    filter_used_fallback = False
    filter_success = False
    if truncated_pages and time_left() > 3.0:
        filter_prompt = _build_filter_prompt(summary or "", [q for q in queries if q.strip()], truncated_pages)
        filter_messages = [
            {"role": "system", "content": FILTERER_SYSTEM_PROMPT},
            {"role": "user", "content": f"The current date is {date_str}. Consider how recent each source is when deciding relevance."},
            {"role": "user", "content": filter_prompt},
        ]
        filter_completion = None
        try:
            print("[FILTER] Attempt 1: provider=cerebras, max_tokens=2048", flush=True)
            filter_completion = _invoke_chat(filter_messages, "cerebras", 2048, 0.2, 0.9)
        except Exception as exc1:
            print(f"[FILTER] Attempt 1 failed: {str(exc1)[:200]}", flush=True)
            try:
                print("[FILTER] Attempt 2: provider=auto, max_tokens=2048", flush=True)
                filter_completion = _invoke_chat(filter_messages, "auto", 2048, 0.2, 0.9)
            except Exception as exc2:
                print(f"[FILTER] Attempt 2 failed: {str(exc2)[:200]}", flush=True)
        if filter_completion and filter_completion.choices:
            filter_output = filter_completion.choices[0].message.content or ""
            filtered_urls = _parse_filterer_output(filter_output, list(truncated_pages.keys()))
            filter_success = bool(filter_output.strip()) and bool(filtered_urls)
    if not filtered_urls:
        # The filter kept nothing usable: fall back to the first few candidates.
        filter_used_fallback = True
        fallback_count = min(8, len(truncated_pages))
        filtered_urls = list(truncated_pages.keys())[:fallback_count]
    max_final_urls = 20
    if len(filtered_urls) > max_final_urls:
        filter_used_fallback = True
        filtered_urls = filtered_urls[:max_final_urls]
    if not filter_success:
        filter_used_fallback = True
    print(
        f"[FILTER] Selected URLs={len(filtered_urls)}, fallback={filter_used_fallback}, time_left={time_left():.2f}s",
        flush=True,
    )

    # --- Stage 4: fetch longer text for the retained URLs. ---
    final_pages_fetched = OrderedDict()
    if filtered_urls and time_left() > 0.2:
        final_pages_fetched = _fetch_pages_within_budget(filtered_urls, 8000, time_left)
    merged_pages = OrderedDict()
    for url in filtered_urls:
        # Prefer the longer re-fetch; fall back to the excerpt from stage 2.
        content = final_pages_fetched.get(url) or truncated_pages.get(url) or ""
        if content:
            merged_pages[url] = content
    pages = merged_pages
    print(
        f"[PIPELINE] Final fetch complete: retained_documents={len(pages)}, time_left={time_left():.2f}s",
        flush=True,
    )

    # --- Stage 5: generate the report, degrading gracefully across attempts. ---
    prompt = _build_research_prompt(summary=summary or "", queries=[q for q in queries if q.strip()], url_list=list(pages.keys()), pages_map=pages)
    system_message = {"role": "system", "content": RESEARCHER_SYSTEM_PROMPT}
    date_message = {"role": "user", "content": f"The current date is {date_str}. Return only the research report."}
    messages = [
        system_message,
        date_message,
        {"role": "user", "content": prompt},
    ]
    prompt_chars = len(prompt)
    print(f"[PIPELINE] Fetch complete: pages={len(pages)}, unique_urls={len(pages.keys())}, prompt_chars={prompt_chars}", flush=True)
    print("[PIPELINE] Starting inference (provider=cerebras, model=zai-org/GLM-4.7)", flush=True)

    try:
        print("[LLM] Attempt 1: provider=cerebras, max_tokens=32768", flush=True)
        completion = _invoke_chat(messages, "cerebras", max_tokens=32768, temp=0.3, top_p=0.95)
    except Exception as exc1:
        print(f"[LLM] Attempt 1 failed: {str(exc1)[:200]}", flush=True)
        try:
            prompt2 = _build_research_prompt(
                summary=summary or "",
                queries=[q for q in queries if q.strip()],
                url_list=list(pages.keys())[:30],
                pages_map={key: pages[key] for key in list(pages.keys())[:30]},
            )
            messages = [
                system_message,
                date_message,
                {"role": "user", "content": prompt2},
            ]
            print("[LLM] Attempt 2: provider=cerebras (trimmed), max_tokens=16384", flush=True)
            completion = _invoke_chat(messages, "cerebras", max_tokens=16384, temp=0.7, top_p=0.95)
        except Exception as exc2:
            print(f"[LLM] Attempt 2 failed: {str(exc2)[:200]}", flush=True)
            try:
                print("[LLM] Attempt 3: provider=auto, max_tokens=8192", flush=True)
                completion = _invoke_chat(messages, "auto", max_tokens=8192, temp=0.7, top_p=0.95)
            except Exception as exc3:
                _log_call_end("Deep_Research", f"error={_truncate_for_log(str(exc3), 260)}")
                raise gr.Error(f"Researcher model call failed: {exc3}") from exc3
    raw = completion.choices[0].message.content or ""

    # --- Stage 6: strip reasoning blocks and planning chatter. ---
    try:
        # The closing tag is "</think>"; escaping the slash with a doubled
        # backslash in a raw string would require a literal backslash instead.
        no_think = re.sub(r"<think>[\s\S]*?</think>", "", raw, flags=re.IGNORECASE)
        no_think = re.sub(r"</?think>", "", no_think, flags=re.IGNORECASE)
    except Exception:
        no_think = raw
    try:
        paragraphs = [p for p in re.split(r"\n\s*\n", no_think) if p.strip()]
        keep: List[str] = []
        removed = 0
        planning_re = re.compile(r"\b(let me|now i(?:'ll| will)?|first,|i will now|i will|i'll|let's|now let me|i need to|now i'll|now i will)\b", re.IGNORECASE)
        for paragraph in paragraphs:
            if planning_re.search(paragraph):
                removed += 1
                continue
            keep.append(paragraph)
        report = "\n\n".join(keep).strip()
        if not report:
            # Every paragraph looked like planning: keep the text as it was.
            report = no_think.strip()
    except Exception:
        report = no_think
        removed = 0
    report = re.sub(r"\n\s*\n\s*\n+", "\n\n", report)
    try:
        print(f"[POSTPROCESS] removed_planning_paragraphs={removed}, raw_chars={len(raw)}, final_chars={len(report)}", flush=True)
    except Exception:
        pass

    # --- Stage 7: append the sources list and persist the report. ---
    links_text = "\n".join([f"[{i+1}] {url}" for i, url in enumerate(pages.keys())])
    if links_text:
        sources_section = "\n\n## Sources\n" + links_text
        report = report.rstrip() + sources_section
    file_path = _write_report_tmp(report)
    elapsed = time.time() - start_ts
    print(f"[TIMING] Deep_Research elapsed: {elapsed:.2f}s", flush=True)
    _log_call_end("Deep_Research", f"urls={len(pages)} file={os.path.basename(file_path)} duration={elapsed:.2f}s")
    return report, links_text, file_path
564
+
565
+
566
def build_interface() -> gr.Interface:
    """Build the Gradio interface that exposes ``Deep_Research``."""

    def _max_results_slider(position):
        # One slider per query; only the query number differs between them.
        return gr.Slider(1, 50, value=10, step=1, label=f"Max results (Q{position})", info=f"Max results for Query {position} (1-50)")

    input_components = [
        gr.Textbox(label="Summarization of research topic", lines=3, placeholder="Briefly summarize the research topic or user question", info="Summarization of research topic (one or more sentences)"),
        # Query 1 is the only query textbox without an explicit default value.
        gr.Textbox(label="DDG Search Query 1", max_lines=1, info="DDG Search Query 1"),
        _max_results_slider(1),
    ]
    for position in range(2, 6):
        input_components.append(
            gr.Textbox(label=f"DDG Search Query {position}", value="", max_lines=1, info=f"DDG Search Query {position}")
        )
        input_components.append(_max_results_slider(position))

    output_components = [
        gr.Markdown(label="Research Report"),
        gr.Textbox(label="Fetched Links", lines=8),
        gr.File(label="Download Research Report", file_count="single"),
    ]
    banner = "<div style=\"text-align:center\">Generate a research report based on dozens of sources. Default model is GLM-4.7</div>"
    return gr.Interface(
        fn=Deep_Research,
        inputs=input_components,
        outputs=output_components,
        title="Deep Research",
        description=banner,
        api_description=TOOL_SUMMARY,
        flagging_mode="never",
    )
594
+
595
+
596
+ __all__ = ["Deep_Research", "build_interface"]
Modules/Generate_Image.py CHANGED
@@ -1,132 +1,132 @@
1
- from __future__ import annotations
2
-
3
- import os
4
- import uuid
5
- import random
6
- from typing import Annotated
7
-
8
- import gradio as gr
9
- from PIL import Image
10
- from huggingface_hub import InferenceClient
11
- from ._core import ROOT_DIR, get_hf_token, DEFAULT_PROVIDERS, handle_hf_error
12
-
13
- from app import _log_call_end, _log_call_start, _truncate_for_log
14
- from ._docstrings import autodoc
15
-
16
- HF_API_TOKEN = get_hf_token()
17
-
18
- # Single source of truth for the LLM-facing tool description
19
- TOOL_SUMMARY = (
20
- "Generate an image from a text prompt via Hugging Face serverless inference; "
21
- "tunable model/steps/guidance/size, supports negative prompt and seed; returns a PIL.Image. "
22
- "Return the generated media to the user in this format `![Alt text](URL)`."
23
- )
24
-
25
-
26
- @autodoc(
27
- summary=TOOL_SUMMARY,
28
- )
29
- def Generate_Image(
30
- prompt: Annotated[str, "Text description of the image to generate."],
31
- model_id: Annotated[str, "Hugging Face model id in the form 'creator/model-name' (e.g., Tongyi-MAI/Z-Image-Turbo)."] = "Tongyi-MAI/Z-Image-Turbo",
32
- negative_prompt: Annotated[str, "What should NOT appear in the image."] = (
33
- "(deformed, distorted, disfigured), poorly drawn, bad anatomy, wrong anatomy, extra limb, "
34
- "missing limb, floating limbs, (mutated hands and fingers), disconnected limbs, mutation, "
35
- "mutated, ugly, disgusting, blurry, amputation, misspellings, typos"
36
- ),
37
- steps: Annotated[int, "Number of denoising steps (1–100). Higher = slower, potentially higher quality."] = 35,
38
- cfg_scale: Annotated[float, "Classifier-free guidance scale (1–20). Higher = follow the prompt more closely."] = 7.0,
39
- seed: Annotated[int, "Random seed for reproducibility. Use -1 for a random seed per call."] = -1,
40
- width: Annotated[int, "Output width in pixels (64–1216, multiple of 32 recommended)."] = 1024,
41
- height: Annotated[int, "Output height in pixels (64–1216, multiple of 32 recommended)."] = 1024,
42
- sampler: Annotated[str, "Sampling method label (UI only). Common options: 'DPM++ 2M Karras', 'DPM++ SDE Karras', 'Euler', 'Euler a', 'Heun', 'DDIM'."] = "DPM++ 2M Karras",
43
- ) -> str:
44
- _log_call_start(
45
- "Generate_Image",
46
- prompt=_truncate_for_log(prompt, 200),
47
- model_id=model_id,
48
- steps=steps,
49
- cfg_scale=cfg_scale,
50
- seed=seed,
51
- size=f"{width}x{height}",
52
- )
53
- if not prompt or not prompt.strip():
54
- _log_call_end("Generate_Image", "error=empty prompt")
55
- raise gr.Error("Please provide a non-empty prompt.")
56
- enhanced_prompt = f"{prompt} | ultra detail, ultra elaboration, ultra quality, perfect."
57
- last_error: Exception | None = None
58
- for provider in DEFAULT_PROVIDERS:
59
- try:
60
- client = InferenceClient(api_key=HF_API_TOKEN, provider=provider)
61
- image = client.text_to_image(
62
- prompt=enhanced_prompt,
63
- negative_prompt=negative_prompt,
64
- model=model_id,
65
- width=width,
66
- height=height,
67
- num_inference_steps=steps,
68
- guidance_scale=cfg_scale,
69
- seed=seed if seed != -1 else random.randint(1, 1_000_000_000),
70
- )
71
-
72
- filename = f"image_{uuid.uuid4().hex[:8]}.png"
73
- output_path = os.path.join(ROOT_DIR, filename)
74
- image.save(output_path)
75
-
76
- _log_call_end("Generate_Image", f"provider={provider} size={image.size} saved_to={filename}")
77
- return output_path
78
- except Exception as exc:
79
- last_error = exc
80
- continue
81
-
82
- msg = str(last_error) if last_error else "Unknown error"
83
- _log_call_end("Generate_Image", f"error={_truncate_for_log(msg, 200)}")
84
- handle_hf_error(msg, model_id, context="Image generation")
85
-
86
-
87
- def build_interface() -> gr.Interface:
88
- return gr.Interface(
89
- fn=Generate_Image,
90
- inputs=[
91
- gr.Textbox(label="Prompt", placeholder="Enter a prompt", lines=2, info="Text description of the image to generate"),
92
- gr.Textbox(
93
- label="Model",
94
- value="Tongyi-MAI/Z-Image-Turbo",
95
- placeholder="creator/model-name",
96
- max_lines=1,
97
- info="<a href=\"https://huggingface.co/models?pipeline_tag=text-to-image&inference_provider=nebius,cerebras,novita,fireworks-ai,together,fal-ai,groq,featherless-ai,nscale,hyperbolic,sambanova,cohere,replicate,scaleway,publicai,hf-inference&sort=trending\" target=\"_blank\" rel=\"noopener noreferrer\">Browse models</a>",
98
- ),
99
- gr.Textbox(
100
- label="Negative Prompt",
101
- value=(
102
- "(deformed, distorted, disfigured), poorly drawn, bad anatomy, wrong anatomy, extra limb, "
103
- "missing limb, floating limbs, (mutated hands and fingers), disconnected limbs, mutation, "
104
- "mutated, ugly, disgusting, blurry, amputation, misspellings, typos"
105
- ),
106
- lines=2,
107
- info="What should NOT appear in the image",
108
- ),
109
- gr.Slider(minimum=1, maximum=100, value=35, step=1, label="Steps", info="Number of denoising steps (1–100)"),
110
- gr.Slider(minimum=1.0, maximum=20.0, value=7.0, step=0.1, label="CFG Scale", info="Classifier-free guidance scale (1–20)"),
111
- gr.Slider(minimum=-1, maximum=1_000_000_000, value=-1, step=1, label="Seed (-1 = random)", info="Random seed for reproducibility"),
112
- gr.Slider(minimum=64, maximum=1216, value=1024, step=32, label="Width", info="Output width in pixels"),
113
- gr.Slider(minimum=64, maximum=1216, value=1024, step=32, label="Height", info="Output height in pixels"),
114
- gr.Radio(
115
- label="Sampler",
116
- value="DPM++ 2M Karras",
117
- choices=["DPM++ 2M Karras", "DPM++ SDE Karras", "Euler", "Euler a", "Heun", "DDIM"],
118
- info="Sampling method",
119
- ),
120
- ],
121
- outputs=gr.Image(label="Generated Image"),
122
- title="Generate Image",
123
- description=(
124
- "<div style=\"text-align:center\">Generate images via Hugging Face serverless inference. "
125
- "Default model is Z-Image-Turbo.</div>"
126
- ),
127
- api_description=TOOL_SUMMARY,
128
- flagging_mode="never",
129
- )
130
-
131
-
132
- __all__ = ["Generate_Image", "build_interface"]
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import uuid
5
+ import random
6
+ from typing import Annotated
7
+
8
+ import gradio as gr
9
+ from PIL import Image
10
+ from huggingface_hub import InferenceClient
11
+ from ._core import ROOT_DIR, get_hf_token, DEFAULT_PROVIDERS, handle_hf_error
12
+
13
+ from app import _log_call_end, _log_call_start, _truncate_for_log
14
+ from ._docstrings import autodoc
15
+
16
+ HF_API_TOKEN = get_hf_token()
17
+
18
+ # Single source of truth for the LLM-facing tool description
19
+ TOOL_SUMMARY = (
20
+ "Generate an image from a text prompt via Hugging Face serverless inference; "
21
+ "tunable model/steps/guidance/size, supports negative prompt and seed; returns a PIL.Image. "
22
+ "Return the generated media to the user in this format `![Alt text](URL)`."
23
+ )
24
+
25
+
26
@autodoc(
    summary=TOOL_SUMMARY,
)
def Generate_Image(
    prompt: Annotated[str, "Text description of the image to generate."],
    model_id: Annotated[str, "Hugging Face model id in the form 'creator/model-name' (e.g., Tongyi-MAI/Z-Image-Turbo)."] = "Tongyi-MAI/Z-Image-Turbo",
    negative_prompt: Annotated[str, "What should NOT appear in the image."] = (
        "(deformed, distorted, disfigured), poorly drawn, bad anatomy, wrong anatomy, extra limb, "
        "missing limb, floating limbs, (mutated hands and fingers), disconnected limbs, mutation, "
        "mutated, ugly, disgusting, blurry, amputation, misspellings, typos"
    ),
    steps: Annotated[int, "Number of denoising steps (1–100). Higher = slower, potentially higher quality."] = 35,
    cfg_scale: Annotated[float, "Classifier-free guidance scale (1–20). Higher = follow the prompt more closely."] = 7.0,
    seed: Annotated[int, "Random seed for reproducibility. Use -1 for a random seed per call."] = -1,
    width: Annotated[int, "Output width in pixels (64–1216, multiple of 32 recommended)."] = 1024,
    height: Annotated[int, "Output height in pixels (64–1216, multiple of 32 recommended)."] = 1024,
    sampler: Annotated[str, "Sampling method label (UI only). Common options: 'DPM++ 2M Karras', 'DPM++ SDE Karras', 'Euler', 'Euler a', 'Heun', 'DDIM'."] = "DPM++ 2M Karras",
) -> str:
    # NOTE(review): no docstring is written here because @autodoc presumably
    # generates it from TOOL_SUMMARY and the Annotated metadata — confirm.
    #
    # Tries each provider in DEFAULT_PROVIDERS in turn and returns the path of
    # the first image saved under ROOT_DIR. `sampler` is not sent to the API.
    _log_call_start(
        "Generate_Image",
        prompt=_truncate_for_log(prompt, 200),
        model_id=model_id,
        steps=steps,
        cfg_scale=cfg_scale,
        seed=seed,
        size=f"{width}x{height}",
    )
    if not (prompt and prompt.strip()):
        _log_call_end("Generate_Image", "error=empty prompt")
        raise gr.Error("Please provide a non-empty prompt.")

    enhanced_prompt = f"{prompt} | ultra detail, ultra elaboration, ultra quality, perfect."
    last_error: Exception | None = None
    for provider in DEFAULT_PROVIDERS:
        try:
            client = InferenceClient(api_key=HF_API_TOKEN, provider=provider)
            # -1 means "pick a fresh random seed for this attempt".
            effective_seed = random.randint(1, 1_000_000_000) if seed == -1 else seed
            image = client.text_to_image(
                prompt=enhanced_prompt,
                negative_prompt=negative_prompt,
                model=model_id,
                width=width,
                height=height,
                num_inference_steps=steps,
                guidance_scale=cfg_scale,
                seed=effective_seed,
            )

            filename = f"image_{uuid.uuid4().hex[:8]}.png"
            output_path = os.path.join(ROOT_DIR, filename)
            image.save(output_path)

            _log_call_end("Generate_Image", f"provider={provider} size={image.size} saved_to={filename}")
            return output_path
        except Exception as exc:
            # Remember the failure and move on to the next provider.
            last_error = exc

    # Every provider failed: report the last error seen.
    msg = "Unknown error" if not last_error else str(last_error)
    _log_call_end("Generate_Image", f"error={_truncate_for_log(msg, 200)}")
    handle_hf_error(msg, model_id, context="Image generation")
85
+
86
+
87
def build_interface() -> gr.Interface:
    """Build the Gradio interface that exposes ``Generate_Image``."""
    default_negative_prompt = (
        "(deformed, distorted, disfigured), poorly drawn, bad anatomy, wrong anatomy, extra limb, "
        "missing limb, floating limbs, (mutated hands and fingers), disconnected limbs, mutation, "
        "mutated, ugly, disgusting, blurry, amputation, misspellings, typos"
    )
    browse_models_link = "<a href=\"https://huggingface.co/models?pipeline_tag=text-to-image&inference_provider=nebius,cerebras,novita,fireworks-ai,together,fal-ai,groq,featherless-ai,nscale,hyperbolic,sambanova,cohere,replicate,scaleway,publicai,hf-inference&sort=trending\" target=\"_blank\" rel=\"noopener noreferrer\">Browse models</a>"
    sampler_choices = ["DPM++ 2M Karras", "DPM++ SDE Karras", "Euler", "Euler a", "Heun", "DDIM"]

    # Components are listed in the same order as Generate_Image's parameters.
    input_components = [
        gr.Textbox(label="Prompt", placeholder="Enter a prompt", lines=2, info="Text description of the image to generate"),
        gr.Textbox(
            label="Model",
            value="Tongyi-MAI/Z-Image-Turbo",
            placeholder="creator/model-name",
            max_lines=1,
            info=browse_models_link,
        ),
        gr.Textbox(
            label="Negative Prompt",
            value=default_negative_prompt,
            lines=2,
            info="What should NOT appear in the image",
        ),
        gr.Slider(minimum=1, maximum=100, value=35, step=1, label="Steps", info="Number of denoising steps (1–100)"),
        gr.Slider(minimum=1.0, maximum=20.0, value=7.0, step=0.1, label="CFG Scale", info="Classifier-free guidance scale (1–20)"),
        gr.Slider(minimum=-1, maximum=1_000_000_000, value=-1, step=1, label="Seed (-1 = random)", info="Random seed for reproducibility"),
        gr.Slider(minimum=64, maximum=1216, value=1024, step=32, label="Width", info="Output width in pixels"),
        gr.Slider(minimum=64, maximum=1216, value=1024, step=32, label="Height", info="Output height in pixels"),
        gr.Radio(
            label="Sampler",
            value="DPM++ 2M Karras",
            choices=sampler_choices,
            info="Sampling method",
        ),
    ]
    banner = (
        "<div style=\"text-align:center\">Generate images via Hugging Face serverless inference. "
        "Default model is Z-Image-Turbo.</div>"
    )
    return gr.Interface(
        fn=Generate_Image,
        inputs=input_components,
        outputs=gr.Image(label="Generated Image"),
        title="Generate Image",
        description=banner,
        api_description=TOOL_SUMMARY,
        flagging_mode="never",
    )
130
+
131
+
132
+ __all__ = ["Generate_Image", "build_interface"]
Modules/Memory_Manager.py CHANGED
@@ -1,253 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
- import json
4
  import os
5
- import threading
6
- import uuid
7
- from datetime import datetime
8
- from typing import Annotated, Dict, List, Literal, Optional
9
 
10
  import gradio as gr
11
  from ._docstrings import autodoc
12
 
13
- _MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
14
- MEMORY_FILE = os.path.join(os.path.dirname(_MODULE_DIR), "memories.json")
15
- _MEMORY_LOCK = threading.RLock()
16
- _MAX_MEMORIES = 10_000
 
17
 
 
 
18
 
19
- def _now_iso() -> str:
20
- return datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
21
 
22
 
23
- def _load_memories() -> List[Dict[str, str]]:
24
- if not os.path.exists(MEMORY_FILE):
25
- return []
26
- try:
27
- with open(MEMORY_FILE, "r", encoding="utf-8") as file:
28
- data = json.load(file)
29
- if isinstance(data, list):
30
- cleaned: List[Dict[str, str]] = []
31
- for item in data:
32
- if isinstance(item, dict) and "id" in item and "text" in item:
33
- cleaned.append(item)
34
- return cleaned
35
- return []
36
- except Exception:
37
  try:
38
- backup = MEMORY_FILE + ".corrupt"
39
- if not os.path.exists(backup):
40
- os.replace(MEMORY_FILE, backup)
41
- except Exception:
42
- pass
43
- return []
44
-
45
-
46
- def _save_memories(memories: List[Dict[str, str]]) -> None:
47
- tmp_path = MEMORY_FILE + ".tmp"
48
- with open(tmp_path, "w", encoding="utf-8") as file:
49
- json.dump(memories, file, ensure_ascii=False, indent=2)
50
- os.replace(tmp_path, MEMORY_FILE)
51
-
52
-
53
- def _mem_save(text: str, tags: str) -> str:
54
- text_clean = (text or "").strip()
55
- if not text_clean:
56
- return "Error: memory text is empty."
57
- with _MEMORY_LOCK:
58
- memories = _load_memories()
59
- if memories and memories[-1].get("text") == text_clean:
60
- return "Skipped: identical to last stored memory."
61
- mem_id = str(uuid.uuid4())
62
- entry = {
63
- "id": mem_id,
64
- "text": text_clean,
65
- "timestamp": _now_iso(),
66
- "tags": tags.strip(),
67
- }
68
- memories.append(entry)
69
- if len(memories) > _MAX_MEMORIES:
70
- overflow = len(memories) - _MAX_MEMORIES
71
- memories = memories[overflow:]
72
- _save_memories(memories)
73
- return f"Memory saved: {mem_id}"
74
-
75
-
76
- def _mem_list(limit: int, include_tags: bool) -> str:
77
- limit = max(1, min(200, limit))
78
- with _MEMORY_LOCK:
79
- memories = _load_memories()
80
- if not memories:
81
- return "No memories stored yet."
82
- chosen = memories[-limit:][::-1]
83
- lines: List[str] = []
84
- for memory in chosen:
85
- base = f"{memory['id'][:8]} [{memory.get('timestamp','?')}] {memory.get('text','')}"
86
- if include_tags and memory.get("tags"):
87
- base += f" | tags: {memory['tags']}"
88
- lines.append(base)
89
- omitted = len(memories) - len(chosen)
90
- if omitted > 0:
91
- lines.append(f"… ({omitted} older memorie{'s' if omitted!=1 else ''} omitted; total={len(memories)})")
92
- return "\n".join(lines)
93
-
94
-
95
- def _parse_search_query(query: str) -> Dict[str, List[str]]:
96
- import re
97
-
98
- result = {"tag_terms": [], "text_terms": [], "operator": "and"}
99
- if not query or not query.strip():
100
- return result
101
- query = re.sub(r"\s+", " ", query.strip())
102
- if re.search(r"\bOR\b", query, re.IGNORECASE):
103
- result["operator"] = "or"
104
- parts = re.split(r"\s+OR\s+", query, flags=re.IGNORECASE)
105
- else:
106
- parts = re.split(r"\s+(?:AND\s+)?", query, flags=re.IGNORECASE)
107
- parts = [p for p in parts if p.strip() and p.strip().upper() != "AND"]
108
- for part in parts:
109
- part = part.strip()
110
- if not part:
111
- continue
112
- tag_match = re.match(r"^tag:(.+)$", part, re.IGNORECASE)
113
- if tag_match:
114
- tag_name = tag_match.group(1).strip()
115
- if tag_name:
116
- result["tag_terms"].append(tag_name.lower())
117
- else:
118
- result["text_terms"].append(part.lower())
119
- return result
120
-
121
-
122
- def _match_memory_with_query(memory: Dict[str, str], parsed_query: Dict[str, List[str]]) -> bool:
123
- tag_terms = parsed_query["tag_terms"]
124
- text_terms = parsed_query["text_terms"]
125
- operator = parsed_query["operator"]
126
- if not tag_terms and not text_terms:
127
- return False
128
- memory_text = memory.get("text", "").lower()
129
- memory_tags = memory.get("tags", "").lower()
130
- memory_tag_list = [tag.strip() for tag in memory_tags.split(",") if tag.strip()]
131
- tag_matches = [any(tag_term in tag for tag in memory_tag_list) for tag_term in tag_terms]
132
- combined_text = memory_text + " " + memory_tags
133
- text_matches = [text_term in combined_text for text_term in text_terms]
134
- all_matches = tag_matches + text_matches
135
- if not all_matches:
136
- return False
137
- if operator == "or":
138
- return any(all_matches)
139
- return all(all_matches)
140
-
141
-
142
- def _mem_search(query: str, limit: int) -> str:
143
- q = (query or "").strip()
144
- if not q:
145
- return "Error: empty query."
146
- parsed_query = _parse_search_query(q)
147
- if not parsed_query["tag_terms"] and not parsed_query["text_terms"]:
148
- return "Error: no valid search terms found."
149
- limit = max(1, min(200, limit))
150
- with _MEMORY_LOCK:
151
- memories = _load_memories()
152
- matches: List[Dict[str, str]] = []
153
- total_matches = 0
154
- for memory in reversed(memories):
155
- if _match_memory_with_query(memory, parsed_query):
156
- total_matches += 1
157
- if len(matches) < limit:
158
- matches.append(memory)
159
- if not matches:
160
- return f"No matches for: {query}"
161
- lines = [
162
- f"{memory['id'][:8]} [{memory.get('timestamp','?')}] {memory.get('text','')}" + (f" | tags: {memory['tags']}" if memory.get('tags') else "")
163
- for memory in matches
164
- ]
165
- omitted = total_matches - len(matches)
166
- if omitted > 0:
167
- lines.append(f"… ({omitted} additional match{'es' if omitted!=1 else ''} omitted; total_matches={total_matches})")
168
- return "\n".join(lines)
169
-
170
-
171
- def _mem_delete(memory_id: str) -> str:
172
- key = (memory_id or "").strip().lower()
173
- if len(key) < 4:
174
- return "Error: supply at least 4 characters of the id."
175
- with _MEMORY_LOCK:
176
- memories = _load_memories()
177
- matched = [memory for memory in memories if memory["id"].lower().startswith(key)]
178
- if not matched:
179
- return "Memory not found."
180
- if len(matched) > 1 and key != matched[0]["id"].lower():
181
- sample = ", ".join(memory["id"][:8] for memory in matched[:5])
182
- more = "…" if len(matched) > 5 else ""
183
- return f"Ambiguous prefix (matches {len(matched)} ids: {sample}{more}). Provide more characters."
184
- target_id = matched[0]["id"]
185
- memories = [memory for memory in memories if memory["id"] != target_id]
186
- _save_memories(memories)
187
- return f"Deleted memory: {target_id}"
188
-
189
-
190
- # Single source of truth for the LLM-facing tool description
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  TOOL_SUMMARY = (
192
- "Manage short text memories (save, list, search, delete) in a local JSON store with tags and simple query language; "
193
- "returns a result string (confirmation, listing, matches, or error)."
 
194
  )
195
 
196
 
197
- @autodoc(
198
- summary=TOOL_SUMMARY,
199
- )
200
  def Memory_Manager(
201
- action: Annotated[Literal["save", "list", "search", "delete"], "Action to perform: save | list | search | delete"],
202
- text: Annotated[Optional[str], "Text content (Save only)"] = None,
203
  tags: Annotated[Optional[str], "Comma-separated tags (Save only)"] = None,
204
- query: Annotated[Optional[str], "Enhanced search with tag:name syntax, AND/OR operators (Search only)"] = None,
205
  limit: Annotated[int, "Max results (List/Search only)"] = 20,
206
- memory_id: Annotated[Optional[str], "Full UUID or unique prefix (Delete only)"] = None,
207
- include_tags: Annotated[bool, "Include tags (List/Search only)"] = True,
208
  ) -> str:
209
- act = (action or "").lower().strip()
210
- text = text or ""
211
- tags = tags or ""
212
- query = query or ""
213
- memory_id = memory_id or ""
 
 
 
 
 
 
 
214
  if act == "save":
215
- if not text.strip():
 
216
  return "Error: 'text' is required when action=save."
217
- return _mem_save(text=text, tags=tags)
 
218
  if act == "list":
219
- return _mem_list(limit=limit, include_tags=include_tags)
 
220
  if act == "search":
221
- if not query.strip():
 
222
  return "Error: 'query' is required when action=search."
223
- return _mem_search(query=query, limit=limit)
224
- if act == "delete":
225
- if not memory_id.strip():
226
- return "Error: 'memory_id' is required when action=delete."
227
- return _mem_delete(memory_id=memory_id)
228
- return "Error: invalid action (use save|list|search|delete)."
229
 
230
 
231
  def build_interface() -> gr.Interface:
 
 
 
232
  return gr.Interface(
233
  fn=Memory_Manager,
234
  inputs=[
235
- gr.Radio(label="Action", choices=["save", "list", "search", "delete"], value="list", info="Action to perform"),
 
 
 
 
 
236
  gr.Textbox(label="Text", lines=3, info="Memory text (Save only)"),
237
  gr.Textbox(label="Tags", placeholder="tag1, tag2", max_lines=1, info="Comma-separated tags (Save only)"),
238
- gr.Textbox(label="Query", placeholder="tag:work AND tag:project OR meeting", max_lines=1, info="Search query (Search only)"),
239
  gr.Slider(1, 200, value=20, step=1, label="Limit", info="Max results (List/Search only)"),
240
- gr.Textbox(label="Memory ID / Prefix", max_lines=1, info="UUID or prefix (Delete only)"),
241
- gr.Checkbox(value=True, label="Include Tags", info="Include tags in output (List/Search only)"),
242
  ],
243
  outputs=gr.Textbox(label="Result", lines=14),
244
- title="Memory Manager",
245
- description=(
246
- "<div style=\"text-align:center\">Lightweight local JSON memory store (no external DB). Choose an Action, fill only the relevant fields, and run.</div>"
247
- ),
248
  api_description=TOOL_SUMMARY,
249
  flagging_mode="never",
250
  )
251
 
252
 
253
- __all__ = ["Memory_Manager", "build_interface", "_load_memories", "_save_memories"]
 
 
 
 
 
 
1
+ """
2
+ Memory Manager - Graphiti Knowledge Graph Interface.
3
+
4
+ Provides unified memory operations using the same Graphiti instance
5
+ configured for Claude Code's Graphiti MCP server.
6
+
7
+ Configuration (must match Graphiti MCP):
8
+ - FALKORDB_URI: redis://localhost:6379 (default)
9
+ - FALKORDB_DATABASE: graphiti (default)
10
+ - MISTRAL_API_KEY: Required for entity extraction
11
+ - GRAPHITI_GROUP_ID: main (default)
12
+ """
13
+
14
  from __future__ import annotations
15
 
 
16
  import os
17
+ from datetime import datetime, timezone
18
+ from typing import Annotated, Literal, Optional
 
 
19
 
20
  import gradio as gr
21
  from ._docstrings import autodoc
22
 
23
# Graphiti configuration, read once at import time from the environment.
# The defaults are the ones listed in the module docstring.
FALKORDB_URI = os.getenv("FALKORDB_URI", "redis://localhost:6379")
FALKORDB_DATABASE = os.getenv("FALKORDB_DATABASE", "graphiti")
# Stripped so that a whitespace-only or newline-terminated value is not
# mistaken for a usable key.
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY", "").strip()
GRAPHITI_GROUP_ID = os.getenv("GRAPHITI_GROUP_ID", "main")

# Graphiti is treated as usable only when an API key is configured.
GRAPHITI_AVAILABLE = bool(MISTRAL_API_KEY)

# Lazily created client; populated by _get_graphiti_client() on first use.
_graphiti_client = None
34
 
35
 
36
def _get_graphiti_client():
    """Return the shared Graphiti client, creating it on first use.

    ``graphiti_core`` is imported inside the function so that this module can
    still be imported when the library is not installed.

    Returns:
        The cached client, or ``None`` when ``GRAPHITI_AVAILABLE`` is false,
        the library cannot be imported, or construction fails. Failures are
        printed and not cached, so the next call tries again.
    """
    global _graphiti_client
    if _graphiti_client is not None or not GRAPHITI_AVAILABLE:
        return _graphiti_client

    try:
        from urllib.parse import urlparse

        from graphiti_core import Graphiti
        from graphiti_core.driver.falkordb_driver import FalkorDriver
        from graphiti_core.llm_client import OpenAIClient
        from graphiti_core.llm_client.config import LLMConfig

        # FalkorDriver is configured with host/port rather than a URI, so the
        # redis:// URI is split into its components first.
        parsed = urlparse(FALKORDB_URI)
        driver = FalkorDriver(
            host=parsed.hostname or "localhost",
            port=parsed.port or 6379,
            username=parsed.username or None,
            password=parsed.password or None,
            database=FALKORDB_DATABASE,
        )

        # Mistral exposes an OpenAI-compatible endpoint; the client takes its
        # settings through an LLMConfig object.
        llm_client = OpenAIClient(
            config=LLMConfig(
                api_key=MISTRAL_API_KEY,
                base_url="https://api.mistral.ai/v1",
                model="mistral-large-2411",
            )
        )

        # A pre-built driver is passed via ``graph_driver``.
        # NOTE(review): no embedder is supplied, so Graphiti uses its default
        # embedder - confirm that default works with only MISTRAL_API_KEY set.
        _graphiti_client = Graphiti(graph_driver=driver, llm_client=llm_client)
    except ImportError as e:
        print(f"[Memory_Manager] Graphiti not available: {e}")
        return None
    except Exception as e:
        # Best effort: report the failure and let callers show an error string.
        print(f"[Memory_Manager] Failed to initialize Graphiti: {e}")
        return None
    return _graphiti_client
71
+
72
+
73
+ def _format_timestamp() -> str:
74
+ """Return current UTC timestamp in ISO format."""
75
+ return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
76
+
77
+
78
+ # ============================================================================
79
+ # Graphiti Memory Operations
80
+ # ============================================================================
81
+
82
+
83
def _graphiti_save(text: str, tags: str) -> str:
    """Save one memory as a Graphiti episode.

    Args:
        text: Memory text; surrounding whitespace is removed.
        tags: Comma-separated tags. When non-empty they are appended to the
            episode body as a trailing ``Tags: ...`` paragraph.

    Returns:
        A confirmation string, or a string starting with ``Error`` when the
        API key is missing, the client cannot be created, or the save raises.
    """
    if not GRAPHITI_AVAILABLE:
        return "Error: MISTRAL_API_KEY not set. Cannot save to Graphiti."

    client = _get_graphiti_client()
    if not client:
        return "Error: Failed to initialize Graphiti client."

    try:
        import asyncio

        episode_body = (text or "").strip()
        tag_text = (tags or "").strip()
        if tag_text:
            episode_body = f"{episode_body}\n\nTags: {tag_text}"

        async def _save():
            return await client.add_episode(
                name=f"Memory {_format_timestamp()}",
                episode_body=episode_body,
                source_description="Memory_Manager tool",
                # add_episode needs a reference time; use the moment of saving.
                reference_time=datetime.now(timezone.utc),
                group_id=GRAPHITI_GROUP_ID,
            )

        # NOTE: asyncio.run() starts a fresh event loop and raises RuntimeError
        # when the calling thread already has one running.
        asyncio.run(_save())
        return f"Memory saved to Graphiti knowledge graph (group: {GRAPHITI_GROUP_ID})"
    except Exception as e:
        return f"Error saving to Graphiti: {e}"
113
+
114
+
115
def _graphiti_list(limit: int, include_tags: bool) -> str:
    """List recent episodes stored for ``GRAPHITI_GROUP_ID``.

    Args:
        limit: Maximum number of episodes to request.
        include_tags: When true, tags saved as a trailing ``Tags: ...``
            paragraph are shown as a `` | tags: ...`` suffix; when false they
            are left out of the output entirely.

    Returns:
        A header followed by one line per episode (content cut to 100
        characters), a "no memories" message, or a string starting with
        ``Error``.
    """
    if not GRAPHITI_AVAILABLE:
        return "Error: MISTRAL_API_KEY not set. Cannot access Graphiti."

    client = _get_graphiti_client()
    if not client:
        return "Error: Failed to initialize Graphiti client."

    try:
        import asyncio

        async def _list():
            # The most recent episodes up to "now" for this group.
            return await client.retrieve_episodes(
                reference_time=datetime.now(timezone.utc),
                last_n=limit,
                group_ids=[GRAPHITI_GROUP_ID],
            )

        episodes = asyncio.run(_list())

        if not episodes:
            return f"No memories found in Graphiti (group: {GRAPHITI_GROUP_ID})"

        lines = [f"Graphiti Memories (group: {GRAPHITI_GROUP_ID})", "-" * 50]
        for ep in episodes:
            created = getattr(ep, "created_at", "?")
            content = ep.content if hasattr(ep, "content") else str(ep)

            # _graphiti_save appends tags as a final "\n\nTags: ..." paragraph,
            # so only the last such marker is treated as the tag suffix.
            tags_str = ""
            body, marker, tag_text = content.rpartition("\n\nTags:")
            if marker:
                content = body.strip()
                if include_tags and tag_text.strip():
                    tags_str = f" | tags: {tag_text.strip()}"

            ellipsis = "..." if len(content) > 100 else ""
            lines.append(f"[{created}] {content[:100]}{ellipsis}{tags_str}")

        return "\n".join(lines)
    except Exception as e:
        return f"Error listing from Graphiti: {e}"
158
+
159
+
160
def _graphiti_search(query: str, limit: int) -> str:
    """Search the knowledge graph and format the hits as numbered lines.

    Args:
        query: Search text passed to ``client.search``.
        limit: Maximum number of results to request.

    Returns:
        A header plus one line per result, a "no matches" message, or a string
        starting with ``Error``.
    """
    if not GRAPHITI_AVAILABLE:
        return "Error: MISTRAL_API_KEY not set. Cannot search Graphiti."

    client = _get_graphiti_client()
    if not client:
        return "Error: Failed to initialize Graphiti client."

    try:
        import asyncio

        async def _search():
            return await client.search(
                query=query,
                group_ids=[GRAPHITI_GROUP_ID],
                num_results=limit,
            )

        results = asyncio.run(_search())

        if not results:
            return f"No matches found for: {query}"

        lines = [f"Graphiti Search Results for: {query}", "-" * 50]

        for i, result in enumerate(results, 1):
            if hasattr(result, "fact"):
                # Edge/fact result.
                # NOTE(review): graphiti_core edges appear to expose
                # source_node_uuid / target_node_uuid rather than
                # source_node / target_node - confirm; both are tried.
                source = getattr(result, "source_node", None) or getattr(result, "source_node_uuid", "?")
                target = getattr(result, "target_node", None) or getattr(result, "target_node_uuid", "?")
                lines.append(f"{i}. {source} -> {target}: {result.fact}")
            elif hasattr(result, "name"):
                # Node result; a missing or None summary is shown as empty.
                summary = getattr(result, "summary", "") or ""
                ellipsis = "..." if len(summary) > 150 else ""
                lines.append(f"{i}. [{result.name}] {summary[:150]}{ellipsis}")
            else:
                lines.append(f"{i}. {str(result)[:150]}")

        return "\n".join(lines)
    except Exception as e:
        return f"Error searching Graphiti: {e}"
205
+
206
+
207
def _graphiti_delete(memory_id: str) -> str:
    """Report that deletion is not available through this interface.

    Nothing is deleted and ``memory_id`` is not used; the function only
    returns an explanatory message (or an error when no API key is set).
    """
    if GRAPHITI_AVAILABLE:
        # Deletion needs the full episode UUID and is left to the Graphiti MCP.
        return (
            "Note: To delete from Graphiti, use the Graphiti MCP directly with the episode UUID. "
            "Memory deletion is not fully implemented in this interface."
        )
    return "Error: MISTRAL_API_KEY not set. Cannot access Graphiti."
215
+
216
+
217
+ # ============================================================================
218
+ # Status Check
219
+ # ============================================================================
220
+
221
+
222
def _get_status() -> str:
    """Describe whether a Graphiti client could be obtained.

    Calls ``_get_graphiti_client()``, so the first call may trigger client
    construction. Returns a single line on failure and three newline-separated
    lines (status, database, group) on success.
    """
    if not GRAPHITI_AVAILABLE:
        return "Status: MISTRAL_API_KEY not configured"

    client = _get_graphiti_client()
    if not client:
        return "Status: Failed to initialize Graphiti client"

    details = (
        "Status: Connected to Graphiti",
        f"Database: {FALKORDB_DATABASE}",
        f"Group: {GRAPHITI_GROUP_ID}",
    )
    return "\n".join(details)
231
+
232
+
233
+ # ============================================================================
234
+ # Main Tool Function
235
+ # ============================================================================
236
+
237
+
238
# Tool description shared by the @autodoc summary on Memory_Manager and the
# Gradio ``api_description`` in build_interface().
TOOL_SUMMARY = (
    "Manage memories in Graphiti knowledge graph (save, list, search, status). "
    "Connects to the same Graphiti instance as the Graphiti MCP server. "
    "Requires MISTRAL_API_KEY for entity extraction and knowledge graph operations."
)
243
 
244
 
245
def _clamp_limit(limit, default: int = 20) -> int:
    """Coerce ``limit`` to an int in 1..200, using ``default`` if it is not numeric."""
    try:
        value = int(limit)
    except (TypeError, ValueError):
        value = default
    return max(1, min(200, value))


@autodoc(summary=TOOL_SUMMARY)
def Memory_Manager(
    action: Annotated[Literal["save", "list", "search", "status"], "Action: save | list | search | status"] = "list",
    text: Annotated[Optional[str], "Memory text (Save only)"] = None,
    tags: Annotated[Optional[str], "Comma-separated tags (Save only)"] = None,
    query: Annotated[Optional[str], "Search query (Search only)"] = None,
    limit: Annotated[int, "Max results (List/Search only)"] = 20,
    include_tags: Annotated[bool, "Include tags in output"] = True,
) -> str:
    """
    Memory Manager - Graphiti Knowledge Graph Interface.

    Connects to the same Graphiti instance used by Claude Code's Graphiti MCP.
    All memories are stored in the knowledge graph with automatic entity extraction
    and relationship detection.
    """
    # Dispatch on the normalized action. Every branch returns a plain string:
    # either the result text or a message starting with "Error".
    act = (action or "list").lower().strip()

    if act == "status":
        return _get_status()

    if act == "save":
        text = (text or "").strip()
        if not text:
            return "Error: 'text' is required when action=save."
        return _graphiti_save(text=text, tags=tags or "")

    if act == "list":
        # The UI slider may hand over a float (or None); the graph calls
        # expect an integer count.
        return _graphiti_list(limit=_clamp_limit(limit), include_tags=include_tags)

    if act == "search":
        query = (query or "").strip()
        if not query:
            return "Error: 'query' is required when action=search."
        return _graphiti_search(query=query, limit=_clamp_limit(limit))

    return "Error: invalid action (use save|list|search|status)."
 
 
 
282
 
283
 
284
def build_interface() -> gr.Interface:
    """Build the Gradio interface for Memory Manager.

    The current status text is embedded in the page description. Obtaining it
    calls ``_get_status()``, which may construct the Graphiti client while the
    interface is being built.
    """
    import html

    # The status text can span several "\n"-separated lines; HTML collapses
    # raw newlines, so each line is escaped and joined with <br/>.
    status_html = "<br/>".join(html.escape(line) for line in _get_status().splitlines())

    return gr.Interface(
        fn=Memory_Manager,
        inputs=[
            gr.Radio(
                label="Action",
                choices=["save", "list", "search", "status"],
                value="status",
                info="Action to perform",
            ),
            gr.Textbox(label="Text", lines=3, info="Memory text (Save only)"),
            gr.Textbox(label="Tags", placeholder="tag1, tag2", max_lines=1, info="Comma-separated tags (Save only)"),
            gr.Textbox(label="Query", placeholder="search terms...", max_lines=1, info="Search query (Search only)"),
            gr.Slider(1, 200, value=20, step=1, label="Limit", info="Max results (List/Search only)"),
            gr.Checkbox(value=True, label="Include Tags", info="Include tags in output"),
        ],
        outputs=gr.Textbox(label="Result", lines=14),
        title="Memory Manager - Graphiti",
        description=f"<div style='text-align:center'><strong>{status_html}</strong><br/>Knowledge graph memory with entity extraction</div>",
        api_description=TOOL_SUMMARY,
        flagging_mode="never",
    )
309
 
310
 
311
# Public names of this module: the tool entry point, the UI builder, and the
# two configuration values derived from the environment.
__all__ = [
    "Memory_Manager",
    "build_interface",
    "GRAPHITI_AVAILABLE",
    "GRAPHITI_GROUP_ID",
]
Modules/ScrapeGraphAI.py ADDED
@@ -0,0 +1,779 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from typing import Annotated, Any, Literal
6
+
7
+ import gradio as gr
8
+
9
+ from app import _log_call_end, _log_call_start, _truncate_for_log
10
+ from ._core import _resolve_path
11
+ from ._docstrings import autodoc
12
+
13
# Human/LLM-facing description of the tool.
TOOL_SUMMARY = (
    "Scrape and extract structured data from known URLs using ScrapeGraphAI with "
    "Mistral-only models. Supports single-page extraction, bounded crawl extraction, "
    "multi-URL extraction, rendered markdown, and image-aware extraction."
)

# Action names offered by the tool.
ACTION_CHOICES = [
    "extract",
    "crawl_extract",
    "multi_extract",
    "render_markdown",
    "vision_extract",
]

# Rendering mode names offered by the tool.
RENDER_CHOICES = ["auto", "browser", "http"]

# Names of the environment variables that can override the model choice, and
# the model identifiers used as defaults.
# NOTE(review): the code that reads these overrides is outside this excerpt.
TEXT_MODEL_ENV = "SCRAPEGRAPH_TEXT_MODEL"
VISION_MODEL_ENV = "SCRAPEGRAPH_VISION_MODEL"
DEFAULT_TEXT_MODEL = "mistral-small-latest"
DEFAULT_VISION_MODEL = "pixtral-12b-latest"

# Set to the exception raised by the optional imports below, if any;
# _require_scrapegraph() checks it before the dependencies are used.
_IMPORT_ERROR: Exception | None = None
35
+
36
# Optional third-party dependencies. If any import fails, the exception is kept
# in _IMPORT_ERROR and every imported name is bound to None so that importing
# this module still succeeds; the graph classes are only defined when all
# imports worked.
try:
    from langchain.chat_models import init_chat_model
    from pydantic import BaseModel, Field, create_model
    from scrapegraphai.graphs import SmartScraperGraph, SmartScraperMultiGraph
    from scrapegraphai.graphs.abstract_graph import AbstractGraph
    from scrapegraphai.graphs.base_graph import BaseGraph
    from scrapegraphai.nodes import (
        DescriptionNode,
        FetchNode,
        FetchNodeLevelK,
        GenerateAnswerNodeKLevel,
        GenerateAnswerOmniNode,
        ImageToTextNode,
        ParseNode,
        ParseNodeDepthK,
        RAGNode,
    )
    from scrapegraphai.utils.convert_to_md import convert_to_md
except Exception as exc:  # pragma: no cover - import error path is runtime-only
    _IMPORT_ERROR = exc
    init_chat_model = None
    BaseModel = None
    Field = None
    create_model = None
    SmartScraperGraph = None
    SmartScraperMultiGraph = None
    AbstractGraph = None
    BaseGraph = None
    DescriptionNode = None
    FetchNode = None
    FetchNodeLevelK = None
    GenerateAnswerNodeKLevel = None
    GenerateAnswerOmniNode = None
    ImageToTextNode = None
    ParseNode = None
    ParseNodeDepthK = None
    RAGNode = None
    convert_to_md = None
else:
    class _LimitedFetchNodeLevelK(FetchNodeLevelK):
        """FetchNodeLevelK variant that caps the number of returned documents.

        The cap is read from ``node_config["max_pages"]``; a missing config or
        a falsy value means no cap.
        """

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.max_pages = None if self.node_config is None else self.node_config.get("max_pages")

        def obtain_content(self, documents, loader_kwargs):
            # The parent result is truncated after the fact.
            # NOTE(review): this bounds what is passed on, not necessarily how
            # many pages the parent fetches - confirm against FetchNodeLevelK.
            documents = super().obtain_content(documents, loader_kwargs)
            if self.max_pages and len(documents) > self.max_pages:
                return documents[: self.max_pages]
            return documents

    class _BoundedDepthSearchGraph(AbstractGraph):
        """Graph chaining fetch -> parse -> describe -> RAG -> answer nodes.

        Uses ``_LimitedFetchNodeLevelK`` as the fetch node and forwards
        ``depth``, ``only_inside_links`` and ``max_pages`` from the config.
        """

        def __init__(self, prompt: str, source: str, config: dict, schema: type[BaseModel] | None = None):
            super().__init__(prompt, config, source, schema)
            # Sources starting with "http" are passed as "url", anything else
            # as "local_dir".
            self.input_key = "url" if source.startswith("http") else "local_dir"

        def _create_graph(self):
            """Build the five-node BaseGraph from ``self.config``."""
            fetch_node_k = _LimitedFetchNodeLevelK(
                input="url| local_dir",
                output=["docs"],
                node_config={
                    "loader_kwargs": self.config.get("loader_kwargs", {}),
                    "force": self.config.get("force", False),
                    "cut": self.config.get("cut", True),
                    "browser_base": self.config.get("browser_base"),
                    "storage_state": self.config.get("storage_state"),
                    "depth": self.config.get("depth", 1),
                    "only_inside_links": self.config.get("only_inside_links", False),
                    "max_pages": self.config.get("max_pages"),
                },
            )
            parse_node_k = ParseNodeDepthK(
                input="docs",
                output=["docs"],
                node_config={"verbose": self.config.get("verbose", False)},
            )
            description_node = DescriptionNode(
                input="docs",
                output=["docs"],
                node_config={
                    "llm_model": self.llm_model,
                    "verbose": self.config.get("verbose", False),
                    "cache_path": self.config.get("cache_path", False),
                },
            )
            rag_node = RAGNode(
                input="docs",
                output=["vectorial_db"],
                node_config={
                    "llm_model": self.llm_model,
                    "embedder_model": self.config.get("embedder_model", False),
                    "verbose": self.config.get("verbose", False),
                },
            )
            generate_answer_k = GenerateAnswerNodeKLevel(
                input="vectorial_db",
                output=["answer"],
                node_config={
                    "llm_model": self.llm_model,
                    "embedder_model": self.config.get("embedder_model", False),
                    "verbose": self.config.get("verbose", False),
                    "schema": self.schema,
                },
            )
            return BaseGraph(
                nodes=[fetch_node_k, parse_node_k, description_node, rag_node, generate_answer_k],
                edges=[
                    (fetch_node_k, parse_node_k),
                    (parse_node_k, description_node),
                    (description_node, rag_node),
                    (rag_node, generate_answer_k),
                ],
                entry_point=fetch_node_k,
                graph_name=self.__class__.__name__,
            )

        def run(self):
            """Execute the graph and return its "answer" (or a fallback string)."""
            inputs = {"user_prompt": self.prompt, self.input_key: self.source}
            self.final_state, self.execution_info = self.graph.execute(inputs)
            return self.final_state.get("answer", "No answer found.")

    class _MistralOmniScraperGraph(AbstractGraph):
        """Graph chaining fetch -> parse -> image-to-text -> answer nodes.

        Image descriptions come from a separate chat model created with the
        "mistralai" provider; the text nodes use ``self.llm_model``.
        """

        def __init__(self, prompt: str, source: str, config: dict, schema: type[BaseModel] | None = None):
            # Set before super().__init__().
            # NOTE(review): presumably because the base initializer builds the
            # graph, and _create_graph reads self.max_images - confirm.
            self.max_images = config.get("max_images", 5)
            super().__init__(prompt, config, source, schema)
            self.input_key = "url" if source.startswith("http") else "local_dir"

        def _create_graph(self):
            """Build the four-node BaseGraph, including the vision model."""
            vision_model = init_chat_model(
                model=self.config.get("vision_model", DEFAULT_VISION_MODEL),
                model_provider="mistralai",
                api_key=self.config["llm"]["api_key"],
                temperature=0,
            )
            fetch_node = FetchNode(
                input="url | local_dir",
                output=["doc"],
                node_config={
                    "loader_kwargs": self.config.get("loader_kwargs", {}),
                    "storage_state": self.config.get("storage_state"),
                    "use_soup": self.config.get("use_soup", False),
                    "timeout": self.config.get("timeout", 30),
                },
            )
            parse_node = ParseNode(
                input="doc & (url | local_dir)",
                output=["parsed_doc", "link_urls", "img_urls"],
                node_config={
                    "chunk_size": self.model_token,
                    "parse_urls": True,
                    "llm_model": self.llm_model,
                },
            )
            image_to_text_node = ImageToTextNode(
                input="img_urls",
                output=["img_desc"],
                node_config={
                    "llm_model": vision_model,
                    "max_images": self.max_images,
                },
            )
            generate_answer_omni_node = GenerateAnswerOmniNode(
                input="user_prompt & (relevant_chunks | parsed_doc | doc) & img_desc",
                output=["answer"],
                node_config={
                    "llm_model": self.llm_model,
                    "additional_info": self.config.get("additional_info"),
                    "schema": self.schema,
                },
            )
            return BaseGraph(
                nodes=[fetch_node, parse_node, image_to_text_node, generate_answer_omni_node],
                edges=[
                    (fetch_node, parse_node),
                    (parse_node, image_to_text_node),
                    (image_to_text_node, generate_answer_omni_node),
                ],
                entry_point=fetch_node,
                graph_name=self.__class__.__name__,
            )

        def run(self):
            """Execute the graph and return its "answer" (or a fallback string)."""
            inputs = {"user_prompt": self.prompt, self.input_key: self.source}
            self.final_state, self.execution_info = self.graph.execute(inputs)
            return self.final_state.get("answer", "No answer found.")
220
+
221
+
222
class ScrapeGraphToolError(RuntimeError):
    """Error carrying a machine-readable code, a message, and an optional hint.

    ``str(error)`` is the message; ``code``, ``message`` and ``hint`` are
    available as attributes.
    """

    def __init__(self, code: str, message: str, hint: str | None = None):
        super().__init__(message)
        self.code, self.message, self.hint = code, message, hint
228
+
229
+
230
+ def _json_response(payload: dict[str, Any]) -> str:
231
+ return json.dumps(payload, ensure_ascii=False, indent=2, default=str)
232
+
233
+
234
def _error_response(action: str, code: str, message: str, hint: str | None = None) -> str:
    """Return the JSON text for a failed ``action``.

    The payload has the shape ``{"action": ..., "error": {"code", "message"}}``;
    a ``"hint"`` entry is added to the error object only when ``hint`` is truthy.
    """
    error_body: dict[str, Any] = {"code": code, "message": message}
    if hint:
        error_body["hint"] = hint
    return _json_response({"action": action, "error": error_body})
241
+
242
+
243
def _require_scrapegraph() -> None:
    """Raise if the optional ScrapeGraphAI imports failed at module load.

    Raises:
        ScrapeGraphToolError: with code ``missing_scrapegraph_dependencies``
            when ``_IMPORT_ERROR`` is set. The original import exception is
            attached as the cause so its traceback is not lost.
    """
    if _IMPORT_ERROR is None:
        return
    raise ScrapeGraphToolError(
        "missing_scrapegraph_dependencies",
        f"ScrapeGraphAI dependencies are unavailable: {_IMPORT_ERROR}",
        "Install `scrapegraphai>=1.75.1` and its runtime dependencies.",
    ) from _IMPORT_ERROR
250
+
251
+
252
+ def _require_mistral_key() -> str:
253
+ api_key = os.getenv("MISTRAL_API_KEY", "").strip()
254
+ if not api_key:
255
+ raise ScrapeGraphToolError(
256
+ "missing_mistral_api_key",
257
+ "MISTRAL_API_KEY is not configured.",
258
+ "Set MISTRAL_API_KEY in the environment before using ScrapeGraphAI extraction actions.",
259
+ )
260
+ return api_key
261
+
262
+
263
+ def _coerce_urls(urls: Any) -> list[str]:
264
+ if urls is None or urls == "":
265
+ return []
266
+ if isinstance(urls, list):
267
+ return [str(url).strip() for url in urls if str(url).strip()]
268
+ if isinstance(urls, str):
269
+ text = urls.strip()
270
+ if not text:
271
+ return []
272
+ if text.startswith("["):
273
+ parsed = json.loads(text)
274
+ if not isinstance(parsed, list):
275
+ raise ScrapeGraphToolError("invalid_urls", "urls must be a JSON array of URL strings.")
276
+ return [str(url).strip() for url in parsed if str(url).strip()]
277
+ return [part.strip() for part in text.replace("\r", "\n").replace(",", "\n").split("\n") if part.strip()]
278
+ raise ScrapeGraphToolError("invalid_urls", "urls must be provided as a list or JSON array string.")
279
+
280
+
281
+ def _coerce_schema(schema_json: Any) -> dict[str, Any] | None:
282
+ if schema_json in (None, "", {}):
283
+ return None
284
+ if isinstance(schema_json, dict):
285
+ return schema_json
286
+ if isinstance(schema_json, str):
287
+ try:
288
+ parsed = json.loads(schema_json)
289
+ except json.JSONDecodeError as exc:
290
+ raise ScrapeGraphToolError("invalid_schema_json", f"schema_json is not valid JSON: {exc}") from exc
291
+ if not isinstance(parsed, dict):
292
+ raise ScrapeGraphToolError("invalid_schema_json", "schema_json must decode to a JSON object.")
293
+ return parsed
294
+ raise ScrapeGraphToolError("invalid_schema_json", "schema_json must be a JSON object or JSON string.")
295
+
296
+
297
+ def _schema_to_type(name: str, schema: dict[str, Any]) -> Any:
298
+ schema_type = schema.get("type")
299
+ if schema_type == "string":
300
+ return str
301
+ if schema_type == "integer":
302
+ return int
303
+ if schema_type == "number":
304
+ return float
305
+ if schema_type == "boolean":
306
+ return bool
307
+ if schema_type == "array":
308
+ item_schema = schema.get("items", {})
309
+ return list[_schema_to_type(f"{name}Item", item_schema)]
310
+ if schema_type == "object" or "properties" in schema:
311
+ properties = schema.get("properties", {})
312
+ required = set(schema.get("required", []))
313
+ fields: dict[str, tuple[Any, Any]] = {}
314
+ for prop_name, prop_schema in properties.items():
315
+ prop_type = _schema_to_type(f"{name}{prop_name.title()}", prop_schema)
316
+ description = prop_schema.get("description")
317
+ is_required = prop_name in required
318
+ annotation = prop_type if is_required else (prop_type | None)
319
+ default = Field(... if is_required else None, description=description)
320
+ fields[prop_name] = (annotation, default)
321
+ return create_model(name, **fields)
322
+ return Any
323
+
324
+
325
+ def _schema_to_model(schema: dict[str, Any] | None) -> type[BaseModel] | None:
326
+ if not schema:
327
+ return None
328
+ if schema.get("type") not in (None, "object") and "properties" not in schema:
329
+ raise ScrapeGraphToolError(
330
+ "invalid_schema_json",
331
+ "Only object-shaped JSON schemas are supported for schema_json.",
332
+ )
333
+ model_type = _schema_to_type("ScrapeGraphResult", schema)
334
+ if not isinstance(model_type, type) or not issubclass(model_type, BaseModel):
335
+ raise ScrapeGraphToolError(
336
+ "invalid_schema_json",
337
+ "schema_json must define an object with properties for structured extraction.",
338
+ )
339
+ return model_type
340
+
341
+
342
+ def _resolve_storage_state(storage_state_path: str | None) -> str | None:
343
+ if not storage_state_path:
344
+ return None
345
+ candidate = storage_state_path.strip()
346
+ if not candidate:
347
+ return None
348
+ if os.path.isabs(candidate):
349
+ resolved = candidate
350
+ else:
351
+ resolved, _ = _resolve_path(candidate)
352
+ if not os.path.exists(resolved):
353
+ raise ScrapeGraphToolError(
354
+ "invalid_storage_state_path",
355
+ f"Storage state file not found: {candidate}",
356
+ )
357
+ return resolved
358
+
359
+
360
def _build_config(
    *,
    api_key: str | None,
    text_model: str | None = None,
    render_mode: str = "auto",
    timeout_s: int = 30,
    storage_state_path: str | None = None,
    depth: int | None = None,
    max_pages: int | None = None,
    same_domain_only: bool | None = None,
    max_images: int | None = None,
    vision_model: str | None = None,
) -> dict[str, Any]:
    """Assemble the config dict passed to ScrapeGraph graphs and nodes.

    Always sets ``headless``, ``verbose``, ``timeout`` (at least 5) and
    ``use_soup`` (true only for ``render_mode == "http"``). The ``llm`` section
    is added only when an API key is given; every other key is added only when
    its argument was supplied. Numeric limits are clamped to a minimum of 1.

    Raises:
        ScrapeGraphToolError: code ``invalid_render_mode`` when ``render_mode``
            is not in ``RENDER_CHOICES``.
    """
    if render_mode not in RENDER_CHOICES:
        raise ScrapeGraphToolError("invalid_render_mode", f"Unsupported render_mode: {render_mode}")

    config: dict[str, Any] = {
        "headless": True,
        "verbose": False,
        "timeout": max(5, int(timeout_s)),
        "use_soup": render_mode == "http",
    }
    if api_key:
        # An explicit model wins; otherwise fall back to the environment or the default.
        model_name = text_model or os.getenv(TEXT_MODEL_ENV, DEFAULT_TEXT_MODEL)
        config["llm"] = {
            "api_key": api_key,
            "model": f"mistralai/{model_name}",
            "temperature": 0,
        }
    if storage_state_path:
        config["storage_state"] = storage_state_path
    for limit_key, limit_value in (("depth", depth), ("max_pages", max_pages)):
        if limit_value is not None:
            config[limit_key] = max(1, int(limit_value))
    if same_domain_only is not None:
        config["only_inside_links"] = bool(same_domain_only)
    if max_images is not None:
        config["max_images"] = max(1, int(max_images))
    if vision_model:
        config["vision_model"] = vision_model
    return config
400
+
401
+
402
def _json_safe(value: Any) -> Any:
    """Recursively convert ``value`` into JSON-friendly data.

    * ``BaseModel`` instances are dumped with ``model_dump(mode="json")``.
    * Dict values and list/tuple items are converted recursively; tuples
      become lists.
    * Objects exposing both ``metadata`` and ``page_content`` become a dict
      with those two keys.
    * Strings that start with ``{`` or ``[`` after stripping are parsed as
      JSON; if parsing fails the original string is returned.
    * Anything else is returned unchanged.
    """
    if BaseModel is not None and isinstance(value, BaseModel):
        return value.model_dump(mode="json")
    if isinstance(value, dict):
        return {name: _json_safe(member) for name, member in value.items()}
    if isinstance(value, (list, tuple)):
        return [_json_safe(member) for member in value]
    if hasattr(value, "metadata") and hasattr(value, "page_content"):
        return {
            "page_content": getattr(value, "page_content", ""),
            "metadata": _json_safe(getattr(value, "metadata", {})),
        }
    if not isinstance(value, str):
        return value
    text = value.strip()
    if text.startswith(("{", "[")):
        try:
            return json.loads(text)
        except Exception:
            # Best effort only: text that merely looks like JSON stays a string.
            return value
    return value
424
+
425
+
426
+ def _extract_sources(state: dict[str, Any], fallback: list[str] | None = None) -> list[str]:
427
+ sources: list[str] = []
428
+ for item in state.get("docs", []) or []:
429
+ source = item.get("source") if isinstance(item, dict) else None
430
+ if source and source not in sources:
431
+ sources.append(source)
432
+ for doc in state.get("doc", []) or []:
433
+ metadata = getattr(doc, "metadata", {}) or {}
434
+ source = metadata.get("source")
435
+ if source and source not in sources:
436
+ sources.append(source)
437
+ if not sources and fallback:
438
+ sources.extend([source for source in fallback if source])
439
+ return sources
440
+
441
+
442
def _extract_links_and_images(doc_state: dict[str, Any], url: str) -> tuple[list[str], list[str]]:
    """Run a ``ParseNode`` over fetched documents and return ``(links, images)``.

    Documents come from ``doc_state["doc"]``, falling back to
    ``doc_state["html_content"]``. Two empty lists are returned when neither
    holds anything.
    """
    link_parser = ParseNode(
        input="doc & url",
        output=["parsed_doc", "link_urls", "img_urls"],
        node_config={
            "parse_urls": True,
            "parse_html": True,
            "chunk_size": 8192,
            "llm_model": None,
        },
    )
    documents = doc_state.get("doc") or doc_state.get("html_content", [])
    if not documents:
        return [], []
    parse_state = {"doc": documents, "url": url}
    # The node's return value is ignored; results are read back from the
    # same dict that was handed to execute().
    link_parser.execute(parse_state)
    found_links = parse_state.get("link_urls", []) or []
    found_images = parse_state.get("img_urls", []) or []
    return found_links, found_images
461
+
462
+
463
def _render_markdown_with_fetch(url: str, config: dict[str, Any]) -> tuple[dict[str, Any], list[dict[str, Any]]]:
    """Fetch ``url`` with a ``FetchNode`` and convert the first document to markdown.

    Returns:
        ``(state, [])``: the node's state with a ``"markdown"`` entry added,
        plus an always-empty execution-info list.

    Raises:
        ScrapeGraphToolError: code ``fetch_failed`` when no document comes
            back or the first document's content is blank.
    """
    fetcher = FetchNode(input="url", output=["doc"], node_config=config)
    state = fetcher.execute({"url": url})
    fetched = state.get("doc", []) or []
    if not fetched:
        raise ScrapeGraphToolError("fetch_failed", "ScrapeGraph fetch returned no documents for render_markdown.")
    page_html = getattr(fetched[0], "page_content", None) or ""
    if not page_html.strip():
        raise ScrapeGraphToolError("fetch_failed", "Fetched document for render_markdown had empty content.")
    state["markdown"] = convert_to_md(page_html)
    return state, []
479
+
480
+
481
def _assemble_response(
    action: str,
    result: Any,
    sources: list[str],
    *,
    render_mode: str,
    text_model: str | None = None,
    vision_model: str | None = None,
    markdown: str | None = None,
    links: list[str] | None = None,
    images: list[str] | None = None,
    per_url_results: Any = None,
) -> dict[str, Any]:
    # Build the success envelope shared by every action so all branches emit
    # the same keys in the same order; missing list artifacts become [].
    return {
        "action": action,
        "result": result,
        "sources": sources,
        "artifacts": {
            "markdown": markdown,
            "links": links or [],
            "images": images or [],
            "per_url_results": [] if per_url_results is None else per_url_results,
        },
        "meta": {
            "render_mode_used": render_mode,
            "text_model": text_model,
            "vision_model": vision_model,
        },
        "warnings": [],
    }


def _finish_call(result: str) -> str:
    # Log the truncated payload and hand it back, so every exit path can
    # simply `return _finish_call(...)`.
    _log_call_end("ScrapeGraphAI", _truncate_for_log(result))
    return result


# Tool entry point. Dispatches on `action`, runs the matching ScrapeGraph
# fetch or graph, and always returns a JSON string: a success envelope with
# action/result/sources/artifacts/meta/warnings (plus "debug" when
# return_debug is set), or an {"action", "error"} envelope on failure.
# No docstring is written here because the @autodoc decorator is applied to
# the function — presumably it derives the documentation; confirm before
# adding one.
@autodoc(summary=TOOL_SUMMARY)
def ScrapeGraphAI(
    action: Annotated[
        Literal["extract", "crawl_extract", "multi_extract", "render_markdown", "vision_extract"],
        "Action to run: extract, crawl_extract, multi_extract, render_markdown, or vision_extract.",
    ] = "extract",
    url: Annotated[str, "Single URL for extract, crawl_extract, render_markdown, or vision_extract."] = "",
    urls: Annotated[list[str] | str | None, "Explicit list of URLs for multi_extract. Accepts a list or JSON array string."] = None,
    prompt: Annotated[str, "Natural-language extraction prompt. Required for extraction actions."] = "",
    schema_json: Annotated[dict[str, Any] | str | None, "Optional object-shaped JSON schema for structured extraction."] = None,
    render_mode: Annotated[Literal["auto", "browser", "http"], "Fetch mode. `browser` uses ScrapeGraph browser loading, `http` uses requests + soup, `auto` currently follows ScrapeGraph's browser-first path."] = "auto",
    include_images: Annotated[bool, "For `extract`, include page images in the extraction context."] = False,
    depth: Annotated[int, "For `crawl_extract`, crawl depth from the starting URL."] = 1,
    max_pages: Annotated[int, "For `crawl_extract`, soft cap on fetched pages."] = 4,
    same_domain_only: Annotated[bool, "For `crawl_extract`, stay within the starting site's links only."] = True,
    max_urls: Annotated[int, "For `multi_extract`, maximum URLs allowed in one call."] = 8,
    max_images: Annotated[int, "For `vision_extract` and image-aware extraction, maximum images to describe."] = 5,
    max_chars: Annotated[int, "For `render_markdown`, trim returned markdown to this many characters."] = 12000,
    include_links: Annotated[bool, "For `render_markdown`, include discovered page links."] = True,
    timeout_s: Annotated[int, "Timeout in seconds passed to ScrapeGraph fetch and generation nodes."] = 30,
    storage_state_path: Annotated[str, "Optional Playwright storage state JSON path for authenticated pages."] = "",
    return_debug: Annotated[bool, "Include execution metadata and graph execution info in the response."] = False,
) -> str:
    _log_call_start(
        "ScrapeGraphAI",
        action=action,
        url=url,
        urls=urls,
        prompt=_truncate_for_log(prompt or "", 180),
        render_mode=render_mode,
        include_images=include_images,
        depth=depth,
        max_pages=max_pages,
        max_urls=max_urls,
        max_images=max_images,
        timeout_s=timeout_s,
        storage_state_path=storage_state_path,
        return_debug=return_debug,
    )

    try:
        _require_scrapegraph()
        # Normalize once; a None url/prompt then takes the regular
        # "missing argument" path instead of raising AttributeError.
        target_url = (url or "").strip()
        task_prompt = (prompt or "").strip()
        storage_state = _resolve_storage_state(storage_state_path)
        schema_model = _schema_to_model(_coerce_schema(schema_json))
        text_model_name = os.getenv(TEXT_MODEL_ENV, DEFAULT_TEXT_MODEL)
        vision_model_name = os.getenv(VISION_MODEL_ENV, DEFAULT_VISION_MODEL)
        fetch_options = {
            "render_mode": render_mode,
            "timeout_s": timeout_s,
            "storage_state_path": storage_state,
        }

        # render_markdown is the only action that needs no LLM, so it is
        # handled before the API key is required.
        if action == "render_markdown":
            if not target_url:
                raise ScrapeGraphToolError("missing_url", "url is required for render_markdown.")
            final_state, exec_info = _render_markdown_with_fetch(
                target_url,
                _build_config(api_key=None, **fetch_options),
            )
            # The cut-off never goes below 1000 characters.
            markdown = (final_state.get("markdown") or "")[: max(1000, int(max_chars))]
            links, images = _extract_links_and_images(final_state, target_url)
            response = _assemble_response(
                action,
                {"markdown": markdown},
                [target_url],
                render_mode=render_mode,
                markdown=markdown,
                links=links if include_links else [],
                images=images if include_images else [],
            )
            if return_debug:
                response["debug"] = {"final_state": _json_safe(final_state), "execution_info": _json_safe(exec_info)}
            return _finish_call(_json_response(response))

        api_key = _require_mistral_key()
        llm_options = {"api_key": api_key, "text_model": text_model_name, **fetch_options}

        if action in ("extract", "vision_extract"):
            if not target_url or not task_prompt:
                raise ScrapeGraphToolError("missing_arguments", f"url and prompt are required for {action}.")
            # vision_extract always uses the image-aware graph; extract only on request.
            use_vision = action == "vision_extract" or include_images
            graph_cls = _MistralOmniScraperGraph if use_vision else SmartScraperGraph
            graph = graph_cls(
                prompt=task_prompt,
                source=target_url,
                config=_build_config(**llm_options, max_images=max_images, vision_model=vision_model_name),
                schema=schema_model,
            )
            result_data = _json_safe(graph.run())
            final_state = graph.get_state()
            img_urls = final_state.get("img_urls", []) or []
            if action == "vision_extract" and not img_urls:
                raise ScrapeGraphToolError("no_images_found", "No images were found on the page for vision_extract.")
            response = _assemble_response(
                action,
                result_data,
                _extract_sources(final_state, [target_url]),
                render_mode=render_mode,
                text_model=text_model_name,
                vision_model=vision_model_name if use_vision else None,
                links=final_state.get("link_urls"),
                images=img_urls,
            )
        elif action == "multi_extract":
            normalized_urls = _coerce_urls(urls)
            if not normalized_urls or not task_prompt:
                raise ScrapeGraphToolError("missing_arguments", "urls and prompt are required for multi_extract.")
            # Report the limit that is actually enforced, not the raw argument.
            url_limit = max(1, int(max_urls))
            if len(normalized_urls) > url_limit:
                raise ScrapeGraphToolError("too_many_urls", f"multi_extract supports at most {url_limit} URLs per call.")
            graph = SmartScraperMultiGraph(
                prompt=task_prompt,
                source=normalized_urls,
                config=_build_config(**llm_options),
                schema=schema_model,
            )
            result_data = _json_safe(graph.run())
            final_state = graph.get_state()
            response = _assemble_response(
                action,
                result_data,
                normalized_urls,
                render_mode=render_mode,
                text_model=text_model_name,
                per_url_results=_json_safe(final_state.get("results", [])),
            )
        elif action == "crawl_extract":
            if not target_url or not task_prompt:
                raise ScrapeGraphToolError("missing_arguments", "url and prompt are required for crawl_extract.")
            graph = _BoundedDepthSearchGraph(
                prompt=task_prompt,
                source=target_url,
                config=_build_config(
                    **llm_options,
                    depth=depth,
                    max_pages=max_pages,
                    same_domain_only=same_domain_only,
                ),
                schema=schema_model,
            )
            result_data = _json_safe(graph.run())
            final_state = graph.get_state()
            response = _assemble_response(
                action,
                result_data,
                _extract_sources(final_state, [target_url]),
                render_mode=render_mode,
                text_model=text_model_name,
            )
        else:
            raise ScrapeGraphToolError("unsupported_action", f"Unsupported action: {action}")

        if return_debug:
            response["debug"] = {"final_state": _json_safe(final_state), "execution_info": _json_safe(graph.get_execution_info())}
        return _finish_call(_json_response(response))
    except ScrapeGraphToolError as exc:
        return _finish_call(_error_response(action, exc.code, exc.message, exc.hint))
    except Exception as exc:  # pragma: no cover - runtime integration path
        # Anything unexpected is still returned as a JSON error; failures whose
        # text mentions the browser stack get a dedicated code.
        failure_text = str(exc).lower()
        code = "browser_unavailable" if "playwright" in failure_text or "chromium" in failure_text else "fetch_failed"
        return _finish_call(_error_response(action, code, f"ScrapeGraphAI action failed: {exc}"))
747
+
748
+
749
def build_interface() -> gr.Interface:
    """Build the Gradio interface that exposes ``ScrapeGraphAI``.

    The input components are listed in the same order as the function's
    parameters.
    """
    input_components = [
        gr.Dropdown(choices=ACTION_CHOICES, value="extract", label="Action"),
        gr.Textbox(label="URL", placeholder="https://example.com"),
        gr.JSON(label="URLs", value=[]),
        gr.Textbox(label="Prompt", lines=4, placeholder="Extract pricing tiers and main limits."),
        gr.JSON(label="Schema JSON", value={}),
        gr.Dropdown(choices=RENDER_CHOICES, value="auto", label="Render Mode"),
        gr.Checkbox(label="Include Images", value=False),
        gr.Number(label="Depth", value=1, precision=0),
        gr.Number(label="Max Pages", value=4, precision=0),
        gr.Checkbox(label="Same Domain Only", value=True),
        gr.Number(label="Max URLs", value=8, precision=0),
        gr.Number(label="Max Images", value=5, precision=0),
        gr.Number(label="Max Chars", value=12000, precision=0),
        gr.Checkbox(label="Include Links", value=True),
        gr.Number(label="Timeout (seconds)", value=30, precision=0),
        gr.Textbox(label="Storage State Path", placeholder="Optional Playwright storage_state JSON path"),
        gr.Checkbox(label="Return Debug", value=False),
    ]
    result_box = gr.Textbox(label="Result", lines=20, max_lines=40)
    return gr.Interface(
        fn=ScrapeGraphAI,
        inputs=input_components,
        outputs=result_box,
        title="ScrapeGraphAI",
        description="<div style=\"text-align:center\">Mistral-only structured scraping using ScrapeGraphAI graphs.</div>",
        api_description=TOOL_SUMMARY,
        flagging_mode="never",
    )
777
+
778
+
779
+ __all__ = ["ScrapeGraphAI", "build_interface"]
Modules/Shell_Command.py CHANGED
@@ -1,194 +1,194 @@
1
- from __future__ import annotations
2
-
3
- import os
4
- import platform
5
- import shlex
6
- import subprocess
7
- from typing import Annotated
8
-
9
- import gradio as gr
10
-
11
- from app import _log_call_end, _log_call_start, _truncate_for_log
12
- from ._docstrings import autodoc
13
- from ._core import _resolve_path, ROOT_DIR, _display_path, ALLOW_ABS
14
- import shutil
15
-
16
-
17
-
18
- def _detect_shell(prefer_powershell: bool = True) -> tuple[list[str], str]:
19
- """
20
- Pick an appropriate shell for the host OS.
21
- - Windows: use PowerShell by default, fall back to cmd.exe.
22
- - POSIX: use /bin/bash if available, else /bin/sh.
23
- Returns (shell_cmd_prefix, shell_name) where shell_cmd_prefix is the command list to launch the shell.
24
- """
25
- system = platform.system().lower()
26
- if system == "windows":
27
- if prefer_powershell:
28
- pwsh = shutil.which("pwsh")
29
- candidates = [pwsh, shutil.which("powershell"), shutil.which("powershell.exe")]
30
- for cand in candidates:
31
- if cand:
32
- return [cand, "-NoLogo", "-NoProfile", "-Command"], "powershell"
33
- # Fallback to cmd
34
- comspec = os.environ.get("ComSpec", r"C:\\Windows\\System32\\cmd.exe")
35
- return [comspec, "/C"], "cmd"
36
- # POSIX
37
- bash = shutil.which("bash")
38
- if bash:
39
- return [bash, "-lc"], "bash"
40
- sh = os.environ.get("SHELL", "/bin/sh")
41
- return [sh, "-lc"], "sh"
42
-
43
-
44
- # Detect shell at import time for docs/UI purposes
45
- _DETECTED_SHELL_PREFIX, _DETECTED_SHELL_NAME = _detect_shell()
46
-
47
-
48
- # Clarify path semantics and expose detected shell in summary
49
- TOOL_SUMMARY = (
50
- "Execute a shell command within a safe working directory under the tool root ('/'). "
51
- "Paths must be relative to '/'. "
52
- "Set workdir to '.' to use the root. "
53
- "Absolute paths are disabled."
54
- f"Detected shell: {_DETECTED_SHELL_NAME}."
55
- )
56
-
57
-
58
- def _run_command(command: str, cwd: str, timeout: int) -> tuple[str, str, int]:
59
- shell_prefix, shell_name = _detect_shell()
60
- full_cmd = shell_prefix + [command]
61
- try:
62
- proc = subprocess.run(
63
- full_cmd,
64
- cwd=cwd,
65
- stdout=subprocess.PIPE,
66
- stderr=subprocess.PIPE,
67
- text=True,
68
- encoding="utf-8",
69
- errors="replace",
70
- timeout=timeout if timeout and timeout > 0 else None,
71
- )
72
- return proc.stdout, proc.stderr, proc.returncode
73
- except subprocess.TimeoutExpired as exc:
74
- return exc.stdout or "", (exc.stderr or "") + "\n[timeout]", 124
75
- except Exception as exc:
76
- return "", f"Execution failed: {exc}", 1
77
-
78
-
79
- @autodoc(summary=TOOL_SUMMARY)
80
- def Shell_Command(
81
- command: Annotated[str, "Shell command to execute. Accepts multi-part pipelines as a single string."],
82
- workdir: Annotated[str, "Working directory (relative to root unless UNSAFE_ALLOW_ABS_PATHS=1)."] = ".",
83
- timeout: Annotated[int, "Timeout in seconds (0 = no timeout, be careful on public hosting)."] = 60,
84
- ) -> str:
85
- _log_call_start("Shell_Command", command=command, workdir=workdir, timeout=timeout)
86
- if not command or not command.strip():
87
- result = "No command provided."
88
- _log_call_end("Shell_Command", _truncate_for_log(result))
89
- return result
90
-
91
- abs_cwd, err = _resolve_path(workdir)
92
- if err:
93
- _log_call_end("Shell_Command", _truncate_for_log(err))
94
- return err
95
- if not os.path.exists(abs_cwd):
96
- result = f"Working directory not found: {abs_cwd}"
97
- _log_call_end("Shell_Command", _truncate_for_log(result))
98
- return result
99
-
100
- # Heuristic check for absolute paths in arguments if sandboxing is strictly enforced
101
- # We look for typical absolute path patterns: "/..." or "C:\..."
102
- # This is not perfect (e.g., inside strings) but helps enforce "Impossible" rule.
103
- import re
104
- if not ALLOW_ABS:
105
-
106
- # Regex for Unix-style absolute path (start with /)
107
- # or Windows-style absolute path (start with drive letter)
108
- # We look for these patterns preceded by space or start of string
109
- # to avoid matching arguments like --flag=/value (though those might be paths too!)
110
- # Actually, matching ANY absolute path substring is safer for "Impossible".
111
- # Patterns:
112
- # Unix: / followed by non-space
113
- # Win: X:\ followed by non-space
114
-
115
- # Simple heuristic: if command contains potential absolute path
116
- unix_abs = r"(?:\s|^)/[a-zA-Z0-9_.]"
117
- win_abs = r"(?:\s|^)[a-zA-Z]:\\"
118
-
119
- if re.search(unix_abs, command) or re.search(win_abs, command):
120
- # We allow a few exceptions if needed, but for "Impossible" we block.
121
- # Note: This might block flags like /C, but we run powershell/cmd separately.
122
- # Wait, Windows flags start with /. 'dir /s'. This heuristic is dangerous for Windows flags.
123
- # We should refine it.
124
- pass
125
-
126
- # Refined check:
127
- # On Windows, flags start with /, so checking for / is bad.
128
- # But paths in Windows usually use \ or /.
129
- # Let's focus on Unix roots and Windows Drive roots.
130
-
131
- has_abs_path = False
132
- if platform.system().lower() == "windows":
133
- # Look for Drive:\ - anchored to start of string, space, or quote to avoid matching URLs like https://
134
- if re.search(r"(?:\s|^|['\"])[a-zA-Z]:[\\/]", command):
135
- has_abs_path = True
136
- # On Windows with PowerShell, /path is valid too, but confusing with flags.
137
- # We'll trust that Drive:\ is the main vector to save OUTSIDE tool root (which is likely C: or P:).
138
- # If tool root is P:/Code..., writing to C:/... requires Drive arg.
139
- else:
140
- # Unix: Look for / at start of token, but exclude common flags?
141
- # Actually, just looking for " /" or start "/" is decent.
142
- # But flags like /dev/null are common.
143
- # Maybe we just warn or block known dangerous patterns?
144
- # User said "Make it impossible". a broad block is better than a leak.
145
- if re.search(r"(?:\s|^)/", command):
146
- # This blocks flags like /bin/bash or paths.
147
- has_abs_path = True
148
-
149
- if has_abs_path:
150
- result = "Error: Absolute paths are not allowed in commands to ensure sandbox safety. Use relative paths."
151
- _log_call_end("Shell_Command", _truncate_for_log(result))
152
- return result
153
-
154
- # Capture shell used for transparency
155
- _, shell_name = _detect_shell()
156
- stdout, stderr, code = _run_command(command, cwd=abs_cwd, timeout=timeout)
157
- display_cwd = _display_path(abs_cwd)
158
- header = (
159
- f"Command: {command}\n"
160
- f"CWD: {display_cwd}\n"
161
- f"Root: /\n"
162
- f"Shell: {shell_name}\n"
163
- f"Exit code: {code}\n"
164
- f"--- STDOUT ---\n"
165
- )
166
- output = header + (stdout or "<empty>") + "\n--- STDERR ---\n" + (stderr or "<empty>")
167
- _log_call_end("Shell_Command", _truncate_for_log(f"exit={code} stdout={len(stdout)} stderr={len(stderr)}"))
168
- return output
169
-
170
-
171
- def build_interface() -> gr.Interface:
172
- return gr.Interface(
173
- fn=Shell_Command,
174
- inputs=[
175
- gr.Textbox(label="Command", placeholder="echo hello || dir", lines=2, info="Shell command to execute"),
176
- gr.Textbox(label="Workdir", value=".", max_lines=1, info="Working directory (relative to root)"),
177
- gr.Slider(minimum=0, maximum=600, step=5, value=60, label="Timeout (seconds)", info="Timeout in seconds (0 = no timeout)"),
178
- ],
179
- outputs=gr.Textbox(label="Output", lines=20),
180
- title="Shell Command",
181
- description=(
182
- "<div style=\"text-align:center; overflow:hidden;\">"
183
- "Run a shell command under the same safe root as File System. "
184
- "Absolute paths are disabled, use relative paths. "
185
- f"Detected shell: {_DETECTED_SHELL_NAME}. "
186
- "</div>"
187
- ),
188
- api_description=TOOL_SUMMARY,
189
- flagging_mode="never",
190
- submit_btn="Run",
191
- )
192
-
193
-
194
- __all__ = ["Shell_Command", "build_interface"]
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import platform
5
+ import shlex
6
+ import subprocess
7
+ from typing import Annotated
8
+
9
+ import gradio as gr
10
+
11
+ from app import _log_call_end, _log_call_start, _truncate_for_log
12
+ from ._docstrings import autodoc
13
+ from ._core import _resolve_path, ROOT_DIR, _display_path, ALLOW_ABS
14
+ import shutil
15
+
16
+
17
+
18
+ def _detect_shell(prefer_powershell: bool = True) -> tuple[list[str], str]:
19
+ """
20
+ Pick an appropriate shell for the host OS.
21
+ - Windows: use PowerShell by default, fall back to cmd.exe.
22
+ - POSIX: use /bin/bash if available, else /bin/sh.
23
+ Returns (shell_cmd_prefix, shell_name) where shell_cmd_prefix is the command list to launch the shell.
24
+ """
25
+ system = platform.system().lower()
26
+ if system == "windows":
27
+ if prefer_powershell:
28
+ pwsh = shutil.which("pwsh")
29
+ candidates = [pwsh, shutil.which("powershell"), shutil.which("powershell.exe")]
30
+ for cand in candidates:
31
+ if cand:
32
+ return [cand, "-NoLogo", "-NoProfile", "-Command"], "powershell"
33
+ # Fallback to cmd
34
+ comspec = os.environ.get("ComSpec", r"C:\\Windows\\System32\\cmd.exe")
35
+ return [comspec, "/C"], "cmd"
36
+ # POSIX
37
+ bash = shutil.which("bash")
38
+ if bash:
39
+ return [bash, "-lc"], "bash"
40
+ sh = os.environ.get("SHELL", "/bin/sh")
41
+ return [sh, "-lc"], "sh"
42
+
43
+
44
+ # Detect shell at import time for docs/UI purposes
45
+ _DETECTED_SHELL_PREFIX, _DETECTED_SHELL_NAME = _detect_shell()
46
+
47
+
48
# Clarify path semantics and expose detected shell in summary.
# Adjacent literals are concatenated verbatim, so every sentence except the
# last carries its own trailing space.
TOOL_SUMMARY = (
    "Execute a shell command within a safe working directory under the tool root ('/'). "
    "Paths must be relative to '/'. "
    "Set workdir to '.' to use the root. "
    "Absolute paths are disabled. "
    f"Detected shell: {_DETECTED_SHELL_NAME}."
)
56
+
57
+
58
def _run_command(command: str, cwd: str, timeout: int) -> tuple[str, str, int]:
    """Run ``command`` through the detected shell and capture its output.

    Args:
        command: Command string appended as the final argument to the shell
            prefix from ``_detect_shell``.
        cwd: Working directory for the child process.
        timeout: Seconds to wait; zero, negative or falsy disables the timeout.

    Returns:
        ``(stdout, stderr, exit_code)`` as text. A timeout yields exit code
        124 with a ``[timeout]`` marker appended to stderr. Any other failure
        yields exit code 1 with the error description in stderr.
    """
    shell_prefix, _ = _detect_shell()

    def _as_text(captured: str | bytes | None) -> str:
        # TimeoutExpired carries the partial output as raw bytes even when
        # the process was started in text mode, so decode it here.
        if isinstance(captured, bytes):
            return captured.decode("utf-8", errors="replace")
        return captured or ""

    try:
        proc = subprocess.run(
            shell_prefix + [command],
            cwd=cwd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=timeout if timeout and timeout > 0 else None,
        )
    except subprocess.TimeoutExpired as exc:
        return _as_text(exc.stdout), _as_text(exc.stderr) + "\n[timeout]", 124
    except Exception as exc:
        # Launch failures (missing shell, bad cwd, ...) are reported to the
        # caller as a result rather than raised.
        return "", f"Execution failed: {exc}", 1
    return proc.stdout, proc.stderr, proc.returncode
77
+
78
+
79
def _command_has_absolute_path(command: str) -> bool:
    """Heuristically report whether *command* mentions an absolute path.

    This is a textual check, not a parser: it can reject harmless commands and
    it does not see paths built by the shell at run time.
    """
    import re

    if platform.system().lower() == "windows":
        # A leading "/" is normally a flag on Windows (e.g. "dir /s"), so only
        # drive roots are matched. Anchoring on start-of-string, whitespace or
        # a quote keeps URLs such as "https://" from matching.
        return re.search(r"(?:\s|^|['\"])[a-zA-Z]:[\\/]", command) is not None
    # Elsewhere any token that begins with "/" counts. This is deliberately
    # broad (it also rejects e.g. /dev/null): a false block is preferred over
    # a path escaping the tool root.
    return re.search(r"(?:\s|^)/", command) is not None


# NOTE(review): no literal docstring is added to Shell_Command because
# @autodoc presumably builds one from TOOL_SUMMARY and the Annotated hints;
# confirm before adding one.
@autodoc(summary=TOOL_SUMMARY)
def Shell_Command(
    command: Annotated[str, "Shell command to execute. Accepts multi-part pipelines as a single string."],
    workdir: Annotated[str, "Working directory (relative to root unless UNSAFE_ALLOW_ABS_PATHS=1)."] = ".",
    timeout: Annotated[int, "Timeout in seconds (0 = no timeout, be careful on public hosting)."] = 60,
) -> str:
    # Runs `command` in `workdir` (resolved via _resolve_path) and returns a
    # text report with the command, working directory, shell, exit code,
    # stdout and stderr. Validation failures are returned as plain messages.
    _log_call_start("Shell_Command", command=command, workdir=workdir, timeout=timeout)

    def _finish(message: str) -> str:
        # Every early exit logs the message it returns.
        _log_call_end("Shell_Command", _truncate_for_log(message))
        return message

    if not command or not command.strip():
        return _finish("No command provided.")

    abs_cwd, err = _resolve_path(workdir)
    if err:
        return _finish(err)
    # isdir rather than exists: a regular file is not a usable working
    # directory and would otherwise only fail later inside subprocess.
    if not os.path.isdir(abs_cwd):
        return _finish(f"Working directory not found: {abs_cwd}")

    # Absolute paths in the command text are refused unless ALLOW_ABS is set.
    if not ALLOW_ABS and _command_has_absolute_path(command):
        return _finish(
            "Error: Absolute paths are not allowed in commands to ensure sandbox safety. Use relative paths."
        )

    # The shell name is reported in the output for transparency.
    _, shell_name = _detect_shell()
    stdout, stderr, code = _run_command(command, cwd=abs_cwd, timeout=timeout)
    display_cwd = _display_path(abs_cwd)
    header = (
        f"Command: {command}\n"
        f"CWD: {display_cwd}\n"
        f"Root: /\n"
        f"Shell: {shell_name}\n"
        f"Exit code: {code}\n"
        f"--- STDOUT ---\n"
    )
    output = header + (stdout or "<empty>") + "\n--- STDERR ---\n" + (stderr or "<empty>")
    _log_call_end("Shell_Command", _truncate_for_log(f"exit={code} stdout={len(stdout)} stderr={len(stderr)}"))
    return output
169
+
170
+
171
def build_interface() -> gr.Interface:
    """Build the Gradio interface that exposes ``Shell_Command``."""
    command_box = gr.Textbox(label="Command", placeholder="echo hello || dir", lines=2, info="Shell command to execute")
    workdir_box = gr.Textbox(label="Workdir", value=".", max_lines=1, info="Working directory (relative to root)")
    timeout_slider = gr.Slider(minimum=0, maximum=600, step=5, value=60, label="Timeout (seconds)", info="Timeout in seconds (0 = no timeout)")
    output_box = gr.Textbox(label="Output", lines=20)

    # Centered HTML blurb shown under the title; includes the detected shell.
    blurb = (
        "<div style=\"text-align:center; overflow:hidden;\">"
        "Run a shell command under the same safe root as File System. "
        "Absolute paths are disabled, use relative paths. "
        f"Detected shell: {_DETECTED_SHELL_NAME}. "
        "</div>"
    )

    return gr.Interface(
        fn=Shell_Command,
        inputs=[command_box, workdir_box, timeout_slider],
        outputs=output_box,
        title="Shell Command",
        description=blurb,
        api_description=TOOL_SUMMARY,
        flagging_mode="never",
        submit_btn="Run",
    )
192
+
193
+
194
+ __all__ = ["Shell_Command", "build_interface"]
Modules/Web_Search.py CHANGED
@@ -1,499 +1,517 @@
1
- from __future__ import annotations
2
-
3
- from typing import Annotated, List
4
- from datetime import datetime
5
-
6
- import gradio as gr
7
- from ddgs import DDGS
8
-
9
- from app import _log_call_end, _log_call_start, _search_rate_limiter, _truncate_for_log
10
- from ._docstrings import autodoc
11
-
12
-
13
- # Single source of truth for the LLM-facing tool description
14
- TOOL_SUMMARY = (
15
- "Run a DuckDuckGo-backed search across text, news, images, videos, or books. "
16
- "Readable results include pagination hints and next_offset when more results are available; "
17
- "Use in combination with `Web_Fetch` to navigate the web."
18
- )
19
-
20
-
21
- _SAFESEARCH_LEVEL = "off"
22
-
23
- # Defaults and choices for newly added parameters
24
- BACKEND_CHOICES = [
25
- "auto",
26
- "duckduckgo",
27
- "bing",
28
- "brave",
29
- "yahoo",
30
- "wikipedia",
31
- ]
32
-
33
- # Allowed backends per type (explicit selection set)
34
- _ALLOWED_BACKENDS = {
35
- "text": ["duckduckgo", "bing", "brave", "yahoo", "wikipedia"],
36
- "news": ["duckduckgo", "bing", "yahoo"],
37
- "images": ["duckduckgo"],
38
- "videos": ["duckduckgo"],
39
- "books": ["annasarchive"],
40
- }
41
-
42
- # Auto order per type (used when backend == "auto"); wikipedia excluded for text
43
- _AUTO_ORDER = {
44
- "text": ["duckduckgo", "bing", "brave", "yahoo"],
45
- "news": ["duckduckgo", "bing", "yahoo"],
46
- "images": ["duckduckgo"],
47
- "videos": ["duckduckgo"],
48
- "books": ["annasarchive"],
49
- }
50
-
51
- # Date filter choices: canonical values used by resolver
52
- DATE_FILTER_CHOICES = ["any", "day", "week", "month", "year"]
53
-
54
-
55
- def _resolve_backend(search_type: str, backend_choice: str) -> str:
56
- """Resolve backend string for DDGS based on search type and user choice.
57
-
58
- - If backend_choice is "auto", return a comma-separated fallback order for that type.
59
- - If backend_choice is not supported by the type, fall back to the first allowed backend.
60
- - Books endpoint uses only 'annasarchive'.
61
- """
62
- stype = search_type if search_type in _ALLOWED_BACKENDS else "text"
63
- allowed = _ALLOWED_BACKENDS[stype]
64
- if backend_choice == "auto":
65
- return ", ".join(_AUTO_ORDER[stype])
66
- if stype == "books":
67
- return "annasarchive"
68
- # Validate backend against allowed set for this type
69
- if backend_choice in allowed:
70
- return backend_choice
71
- # Fallback to first allowed backend
72
- return allowed[0]
73
-
74
-
75
- def _resolve_timelimit(date_filter: str, search_type: str) -> str | None:
76
- """Map UI date filter to DDGS timelimit code per endpoint.
77
-
78
- Returns one of: None, 'd', 'w', 'm', 'y'. For news/videos (which support d/w/m),
79
- selecting 'year' will coerce to 'm' to stay within supported range.
80
- """
81
- normalized = (date_filter or "any").strip().lower()
82
- if normalized in ("any", "none", ""):
83
- return None
84
- mapping = {
85
- "day": "d",
86
- "week": "w",
87
- "month": "m",
88
- "year": "y",
89
- }
90
- code = mapping.get(normalized)
91
- if not code:
92
- return None
93
- if search_type in ("news", "videos") and code == "y":
94
- return "m"
95
- return code
96
-
97
-
98
- def _extract_date_from_snippet(snippet: str) -> str:
99
- if not snippet:
100
- return ""
101
- import re
102
-
103
- date_patterns = [
104
- r"\b(\d{4}[-/]\d{1,2}[-/]\d{1,2})\b",
105
- r"\b([A-Za-z]{3,9}\s+\d{1,2},?\s+\d{4})\b",
106
- r"\b(\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4})\b",
107
- r"\b(\d+\s+(?:day|week|month|year)s?\s+ago)\b",
108
- r"(?:Published|Updated|Posted):\s*([^,\n]+?)(?:[,\n]|$)",
109
- ]
110
- for pattern in date_patterns:
111
- matches = re.findall(pattern, snippet, re.IGNORECASE)
112
- if matches:
113
- return matches[0].strip()
114
- return ""
115
-
116
-
117
- def _format_search_result(result: dict, search_type: str, index: int) -> List[str]:
118
- lines: List[str] = []
119
- if search_type == "text":
120
- title = result.get("title", "").strip()
121
- url = result.get("href", "").strip()
122
- snippet = result.get("body", "").strip()
123
- date = _extract_date_from_snippet(snippet)
124
- lines.append(f"{index}. {title}")
125
- lines.append(f" URL: {url}")
126
- if snippet:
127
- lines.append(f" Summary: {snippet}")
128
- if date:
129
- lines.append(f" Date: {date}")
130
- elif search_type == "news":
131
- title = result.get("title", "").strip()
132
- url = result.get("url", "").strip()
133
- body = result.get("body", "").strip()
134
- date = result.get("date", "").strip()
135
- source = result.get("source", "").strip()
136
- lines.append(f"{index}. {title}")
137
- lines.append(f" URL: {url}")
138
- if source:
139
- lines.append(f" Source: {source}")
140
- if date:
141
- lines.append(f" Date: {date}")
142
- if body:
143
- lines.append(f" Summary: {body}")
144
- elif search_type == "images":
145
- title = result.get("title", "").strip()
146
- image_url = result.get("image", "").strip()
147
- source_url = result.get("url", "").strip()
148
- source = result.get("source", "").strip()
149
- width = result.get("width", "")
150
- height = result.get("height", "")
151
- lines.append(f"{index}. {title}")
152
- lines.append(f" Image: {image_url}")
153
- lines.append(f" Source: {source_url}")
154
- if source:
155
- lines.append(f" Publisher: {source}")
156
- if width and height:
157
- lines.append(f" Dimensions: {width}x{height}")
158
- elif search_type == "videos":
159
- title = result.get("title", "").strip()
160
- description = result.get("description", "").strip()
161
- duration = result.get("duration", "").strip()
162
- published = result.get("published", "").strip()
163
- uploader = result.get("uploader", "").strip()
164
- embed_url = result.get("embed_url", "").strip()
165
- lines.append(f"{index}. {title}")
166
- if embed_url:
167
- lines.append(f" Video: {embed_url}")
168
- if uploader:
169
- lines.append(f" Uploader: {uploader}")
170
- if duration:
171
- lines.append(f" Duration: {duration}")
172
- if published:
173
- lines.append(f" Published: {published}")
174
- if description:
175
- lines.append(f" Description: {description}")
176
- elif search_type == "books":
177
- title = result.get("title", "").strip()
178
- url = result.get("url", "").strip()
179
- body = result.get("body", "").strip()
180
- lines.append(f"{index}. {title}")
181
- lines.append(f" URL: {url}")
182
- if body:
183
- lines.append(f" Description: {body}")
184
- return lines
185
-
186
-
187
- @autodoc(
188
- summary=TOOL_SUMMARY,
189
- )
190
- def Web_Search(
191
- query: Annotated[str, "The search query (supports operators like site:, quotes, OR)."],
192
- max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
193
- page: Annotated[int, "Page number for pagination (1-based, each page contains max_results items)."] = 1,
194
- offset: Annotated[int, "Result offset to start from (overrides page if > 0, for precise continuation)."] = 0,
195
- search_type: Annotated[str, "Type of search: 'text' (web pages), 'news', 'images', 'videos', or 'books'."] = "text",
196
- backend: Annotated[str, "Search backend or ordered fallbacks. Use 'auto' for recommended order."] = "auto",
197
- date_filter: Annotated[str, "Time filter: any, day, week, month, year."] = "any",
198
- ) -> str:
199
- _log_call_start(
200
- "Web_Search",
201
- query=query,
202
- max_results=max_results,
203
- page=page,
204
- search_type=search_type,
205
- offset=offset,
206
- backend=backend,
207
- date_filter=date_filter,
208
- )
209
- if not query or not query.strip():
210
- result = "No search query provided. Please enter a search term."
211
- _log_call_end("Web_Search", _truncate_for_log(result))
212
- return result
213
- max_results = max(1, min(20, max_results))
214
- page = max(1, page)
215
- offset = max(0, offset)
216
- valid_types = ["text", "news", "images", "videos", "books"]
217
- if search_type not in valid_types:
218
- search_type = "text"
219
- if offset > 0:
220
- actual_offset = offset
221
- calculated_page = (offset // max_results) + 1
222
- else:
223
- actual_offset = (page - 1) * max_results
224
- calculated_page = page
225
- total_needed = actual_offset + max_results
226
- used_fallback = False
227
- original_search_type = search_type
228
- # Prepare cross-cutting parameters
229
- resolved_backend = _resolve_backend(search_type, (backend or "auto").lower())
230
- timelimit = _resolve_timelimit(date_filter, search_type)
231
-
232
- def _perform_search(stype: str) -> list[dict]:
233
- try:
234
- _search_rate_limiter.acquire()
235
- with DDGS() as ddgs:
236
- if stype == "text":
237
- user_backend_choice = (backend or "auto").lower()
238
- if user_backend_choice == "auto":
239
- # Custom auto: DDG first, then append other engines
240
- results: list[dict] = []
241
- seen: set[str] = set()
242
-
243
- def add_unique(items: list[dict], key_field: str) -> None:
244
- for it in items or []:
245
- url = (it.get(key_field, "") or "").strip()
246
- if url and url not in seen:
247
- seen.add(url)
248
- results.append(it)
249
-
250
- # First: duckduckgo
251
- try:
252
- ddg_items = list(
253
- ddgs.text(
254
- query,
255
- max_results=total_needed + 10,
256
- safesearch=_SAFESEARCH_LEVEL,
257
- timelimit=timelimit,
258
- backend="duckduckgo",
259
- )
260
- )
261
- except Exception:
262
- ddg_items = []
263
- add_unique(ddg_items, "href")
264
-
265
- # Then: other engines appended (excluding duckduckgo)
266
- for eng in [b for b in _AUTO_ORDER["text"] if b != "duckduckgo"]:
267
- try:
268
- extra = list(
269
- ddgs.text(
270
- query,
271
- max_results=total_needed + 10,
272
- safesearch=_SAFESEARCH_LEVEL,
273
- timelimit=timelimit,
274
- backend=eng,
275
- )
276
- )
277
- except Exception:
278
- extra = []
279
- add_unique(extra, "href")
280
-
281
- return results
282
- else:
283
- raw_gen = ddgs.text(
284
- query,
285
- max_results=total_needed + 10,
286
- safesearch=_SAFESEARCH_LEVEL,
287
- timelimit=timelimit,
288
- backend=resolved_backend,
289
- )
290
- elif stype == "news":
291
- user_backend_choice = (backend or "auto").lower()
292
- if user_backend_choice == "auto":
293
- # Custom auto: DDG first, then append other engines
294
- results: list[dict] = []
295
- seen: set[str] = set()
296
-
297
- def add_unique(items: list[dict], key_field: str) -> None:
298
- for it in items or []:
299
- url = (it.get(key_field, "") or "").strip()
300
- if url and url not in seen:
301
- seen.add(url)
302
- results.append(it)
303
-
304
- # First: duckduckgo news
305
- try:
306
- ddg_news = list(
307
- ddgs.news(
308
- query,
309
- max_results=total_needed + 10,
310
- safesearch=_SAFESEARCH_LEVEL,
311
- timelimit=timelimit,
312
- backend="duckduckgo",
313
- )
314
- )
315
- except Exception:
316
- ddg_news = []
317
- add_unique(ddg_news, "url")
318
-
319
- # Then: other news engines appended
320
- for eng in [b for b in _AUTO_ORDER["news"] if b != "duckduckgo"]:
321
- try:
322
- extra = list(
323
- ddgs.news(
324
- query,
325
- max_results=total_needed + 10,
326
- safesearch=_SAFESEARCH_LEVEL,
327
- timelimit=timelimit,
328
- backend=eng,
329
- )
330
- )
331
- except Exception:
332
- extra = []
333
- add_unique(extra, "url")
334
-
335
- return results
336
- else:
337
- raw_gen = ddgs.news(
338
- query,
339
- max_results=total_needed + 10,
340
- safesearch=_SAFESEARCH_LEVEL,
341
- timelimit=timelimit,
342
- backend=_resolve_backend("news", (backend or "auto").lower()),
343
- )
344
- elif stype == "images":
345
- raw_gen = ddgs.images(
346
- query,
347
- max_results=total_needed + 10,
348
- safesearch=_SAFESEARCH_LEVEL,
349
- timelimit=timelimit,
350
- backend=_resolve_backend("images", (backend or "auto").lower()),
351
- )
352
- elif stype == "videos":
353
- raw_gen = ddgs.videos(
354
- query,
355
- max_results=total_needed + 10,
356
- safesearch=_SAFESEARCH_LEVEL,
357
- timelimit=timelimit,
358
- backend=_resolve_backend("videos", (backend or "auto").lower()),
359
- )
360
- else:
361
- raw_gen = ddgs.books(
362
- query,
363
- max_results=total_needed + 10,
364
- backend=_resolve_backend("books", (backend or "auto").lower()),
365
- )
366
- try:
367
- return list(raw_gen)
368
- except Exception as inner_exc:
369
- if "no results" in str(inner_exc).lower() or "not found" in str(inner_exc).lower():
370
- return []
371
- raise inner_exc
372
- except Exception as exc:
373
- error_msg = f"Search failed: {str(exc)[:200]}"
374
- lowered = str(exc).lower()
375
- if "blocked" in lowered or "rate" in lowered:
376
- error_msg = "Search temporarily blocked due to rate limiting. Please try again in a few minutes."
377
- elif "timeout" in lowered:
378
- error_msg = "Search timed out. Please try again with a simpler query."
379
- elif "network" in lowered or "connection" in lowered:
380
- error_msg = "Network connection error. Please check your internet connection and try again."
381
- elif "no results" in lowered or "not found" in lowered:
382
- return []
383
- raise Exception(error_msg)
384
-
385
- try:
386
- raw = _perform_search(search_type)
387
- except Exception as exc:
388
- result = f"Error: {exc}"
389
- _log_call_end("Web_Search", _truncate_for_log(result))
390
- return result
391
-
392
- if not raw and search_type == "news":
393
- try:
394
- raw = _perform_search("text")
395
- if raw:
396
- used_fallback = True
397
- search_type = "text"
398
- except Exception:
399
- pass
400
-
401
- if not raw:
402
- fallback_note = " (also tried 'text' search as fallback)" if original_search_type == "news" and used_fallback else ""
403
- result = f"No {original_search_type} results found for query: {query}{fallback_note}"
404
- _log_call_end("Web_Search", _truncate_for_log(result))
405
- return result
406
-
407
- paginated_results = raw[actual_offset: actual_offset + max_results]
408
- if not paginated_results:
409
- if actual_offset >= len(raw):
410
- result = f"Offset {actual_offset} exceeds available results ({len(raw)} total). Try offset=0 to start from beginning."
411
- else:
412
- result = f"No {original_search_type} results found on page {calculated_page} for query: {query}. Try page 1 or reduce page number."
413
- _log_call_end("Web_Search", _truncate_for_log(result))
414
- return result
415
-
416
- total_available = len(raw)
417
- start_num = actual_offset + 1
418
- end_num = actual_offset + len(paginated_results)
419
- next_offset = actual_offset + len(paginated_results)
420
- search_label = original_search_type.title()
421
- if used_fallback:
422
- search_label += " → Text (Smart Fallback)"
423
-
424
- now_dt = datetime.now().astimezone()
425
- date_str = now_dt.strftime("%A, %B %d, %Y %I:%M %p %Z").strip()
426
- if not date_str:
427
- date_str = now_dt.isoformat()
428
-
429
- pagination_info = f"Page {calculated_page}"
430
- if offset > 0:
431
- pagination_info = f"Offset {actual_offset} (≈ {pagination_info})"
432
- lines = [f"Current Date: {date_str}", f"{search_label} search results for: {query}"]
433
- if used_fallback:
434
- lines.append("📍 Note: News search returned no results, automatically searched general web content instead")
435
- lines.append(f"{pagination_info} (results {start_num}-{end_num} of ~{total_available}+ available)\n")
436
- for i, result in enumerate(paginated_results, start_num):
437
- result_lines = _format_search_result(result, search_type, i)
438
- lines.extend(result_lines)
439
- lines.append("")
440
- if total_available > end_num:
441
- lines.append("💡 More results available:")
442
- lines.append(f" Next page: page={calculated_page + 1}")
443
- lines.append(f" • Next offset: offset={next_offset}")
444
- lines.append(f" Use offset={next_offset} to continue exactly from result {next_offset + 1}")
445
- result = "\n".join(lines)
446
- search_info = f"type={original_search_type}"
447
- if used_fallback:
448
- search_info += "→text"
449
- _log_call_end("Web_Search", f"{search_info} page={calculated_page} offset={actual_offset} results={len(paginated_results)} chars={len(result)}")
450
- return result
451
-
452
-
453
- def build_interface() -> gr.Interface:
454
- return gr.Interface(
455
- fn=Web_Search,
456
- inputs=[
457
- gr.Textbox(label="Query", placeholder="topic OR site:example.com", max_lines=1, info="The search query"),
458
- gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results", info="Number of results to return (1–20)"),
459
- gr.Slider(minimum=1, maximum=10, value=1, step=1, label="Page", info="Page number for pagination (ignored if offset > 0)"),
460
- gr.Slider(
461
- minimum=0,
462
- maximum=1000,
463
- value=0,
464
- step=1,
465
- label="Offset",
466
- info="Result offset to start from (overrides page if > 0, use next_offset from previous search)",
467
- ),
468
- gr.Radio(
469
- label="Search Type",
470
- choices=["text", "news", "images", "videos", "books"],
471
- value="text",
472
- info="Type of content to search for",
473
- ),
474
- gr.Radio(
475
- label="Backend",
476
- choices=BACKEND_CHOICES,
477
- value="auto",
478
- info="Search engine backend or fallback order (auto applies recommended order)",
479
- ),
480
- gr.Radio(
481
- label="Date filter",
482
- choices=DATE_FILTER_CHOICES,
483
- value="any",
484
- info="Limit results to: day, week, month, or year (varies by type)",
485
- ),
486
- ],
487
- outputs=gr.Textbox(label="Search Results", interactive=False, lines=20, max_lines=20),
488
- title="Web Search",
489
- description=(
490
- "<div style=\"text-align:center\">Multi-type web search with readable output format, date detection, and flexible pagination. "
491
- "Supports text, news, images, videos, and books. Features smart fallback for news searches and precise offset control.</div>"
492
- ),
493
- api_description=TOOL_SUMMARY,
494
- flagging_mode="never",
495
- submit_btn="Search",
496
- )
497
-
498
-
499
- __all__ = ["Web_Search", "build_interface"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Annotated, List, Literal
4
+ from datetime import datetime
5
+
6
+ import gradio as gr
7
+ from ddgs import DDGS
8
+
9
+ from app import _log_call_end, _log_call_start, _search_rate_limiter, _truncate_for_log
10
+ from ._docstrings import autodoc
11
+ from ._searxng_client import SearXNGClient, TimeRange
12
+ from ._query_optimizer import get_optimizer
13
+
14
+
15
# LLM-facing description of the tool; passed to @autodoc on Web_Search.
TOOL_SUMMARY = (
    "Run a web search across text, news, images, videos, or books. "
    "Supports multiple backends (DuckDuckGo, SearXNG) with optional AI query optimization. "
    "Readable results include pagination hints and next_offset when more results are available."
)


# Safe-search level forwarded to every DDGS call.
_SAFESEARCH_LEVEL = "off"

# Backend names a caller may pick.
BACKEND_CHOICES = ["auto", "duckduckgo", "searxng", "bing", "brave", "yahoo", "wikipedia"]

# Backends that may be selected explicitly, keyed by search type.
_ALLOWED_BACKENDS = {
    "text": ["duckduckgo", "searxng", "bing", "brave", "yahoo", "wikipedia"],
    "news": ["duckduckgo", "searxng", "bing", "yahoo"],
    "images": ["duckduckgo", "searxng"],
    "videos": ["duckduckgo"],
    "books": ["annasarchive"],
}

# Order tried when the backend is "auto", keyed by search type.
# "wikipedia" is intentionally left out of the text order.
_AUTO_ORDER = {
    "text": ["searxng", "duckduckgo", "bing", "brave", "yahoo"],
    "news": ["searxng", "duckduckgo", "bing", "yahoo"],
    "images": ["searxng", "duckduckgo"],
    "videos": ["duckduckgo"],
    "books": ["annasarchive"],
}

# Canonical date-filter values understood by _resolve_timelimit.
DATE_FILTER_CHOICES = ["any", "day", "week", "month", "year"]
56
+
57
+
58
def _resolve_backend(search_type: str, backend_choice: str) -> str:
    """Resolve the ``backend`` string handed to DDGS.

    Args:
        search_type: Search type; anything not in ``_ALLOWED_BACKENDS`` is
            treated as ``"text"``.
        backend_choice: The caller's (already lower-cased) backend choice.

    Returns:
        - For ``"auto"``: the type's ``_AUTO_ORDER`` joined with ``", "``,
          without ``"searxng"``.
        - For books: always ``"annasarchive"``.
        - Otherwise: ``backend_choice`` when the type allows it, else the
          first allowed backend for the type.
    """
    stype = search_type if search_type in _ALLOWED_BACKENDS else "text"
    if backend_choice == "auto":
        # Web_Search serves "searxng" through SearXNGClient, so it has no
        # place in a DDGS backend list.
        # NOTE(review): DDGS is not expected to recognise "searxng"; confirm
        # against the ddgs backend list.
        ddgs_order = [name for name in _AUTO_ORDER[stype] if name != "searxng"]
        # Fall back to the unfiltered order if a type ever lists only searxng.
        return ", ".join(ddgs_order or _AUTO_ORDER[stype])
    if stype == "books":
        return "annasarchive"
    allowed = _ALLOWED_BACKENDS[stype]
    return backend_choice if backend_choice in allowed else allowed[0]
76
+
77
+
78
+ def _resolve_timelimit(date_filter: str, search_type: str) -> str | None:
79
+ """Map UI date filter to DDGS timelimit code per endpoint.
80
+
81
+ Returns one of: None, 'd', 'w', 'm', 'y'. For news/videos (which support d/w/m),
82
+ selecting 'year' will coerce to 'm' to stay within supported range.
83
+ """
84
+ normalized = (date_filter or "any").strip().lower()
85
+ if normalized in ("any", "none", ""):
86
+ return None
87
+ mapping = {
88
+ "day": "d",
89
+ "week": "w",
90
+ "month": "m",
91
+ "year": "y",
92
+ }
93
+ code = mapping.get(normalized)
94
+ if not code:
95
+ return None
96
+ if search_type in ("news", "videos") and code == "y":
97
+ return "m"
98
+ return code
99
+
100
+
101
+ def _extract_date_from_snippet(snippet: str) -> str:
102
+ if not snippet:
103
+ return ""
104
+ import re
105
+
106
+ date_patterns = [
107
+ r"\b(\d{4}[-/]\d{1,2}[-/]\d{1,2})\b",
108
+ r"\b([A-Za-z]{3,9}\s+\d{1,2},?\s+\d{4})\b",
109
+ r"\b(\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4})\b",
110
+ r"\b(\d+\s+(?:day|week|month|year)s?\s+ago)\b",
111
+ r"(?:Published|Updated|Posted):\s*([^,\n]+?)(?:[,\n]|$)",
112
+ ]
113
+ for pattern in date_patterns:
114
+ matches = re.findall(pattern, snippet, re.IGNORECASE)
115
+ if matches:
116
+ return matches[0].strip()
117
+ return ""
118
+
119
+
120
+ def _format_search_result(result: dict, search_type: str, index: int) -> List[str]:
121
+ lines: List[str] = []
122
+ if search_type == "text":
123
+ title = result.get("title", "").strip()
124
+ url = result.get("href", "").strip()
125
+ snippet = result.get("body", "").strip()
126
+ date = _extract_date_from_snippet(snippet)
127
+ lines.append(f"{index}. {title}")
128
+ lines.append(f" URL: {url}")
129
+ if snippet:
130
+ lines.append(f" Summary: {snippet}")
131
+ if date:
132
+ lines.append(f" Date: {date}")
133
+ elif search_type == "news":
134
+ title = result.get("title", "").strip()
135
+ url = result.get("url", "").strip()
136
+ body = result.get("body", "").strip()
137
+ date = result.get("date", "").strip()
138
+ source = result.get("source", "").strip()
139
+ lines.append(f"{index}. {title}")
140
+ lines.append(f" URL: {url}")
141
+ if source:
142
+ lines.append(f" Source: {source}")
143
+ if date:
144
+ lines.append(f" Date: {date}")
145
+ if body:
146
+ lines.append(f" Summary: {body}")
147
+ elif search_type == "images":
148
+ title = result.get("title", "").strip()
149
+ image_url = result.get("image", "").strip()
150
+ source_url = result.get("url", "").strip()
151
+ source = result.get("source", "").strip()
152
+ width = result.get("width", "")
153
+ height = result.get("height", "")
154
+ lines.append(f"{index}. {title}")
155
+ lines.append(f" Image: {image_url}")
156
+ lines.append(f" Source: {source_url}")
157
+ if source:
158
+ lines.append(f" Publisher: {source}")
159
+ if width and height:
160
+ lines.append(f" Dimensions: {width}x{height}")
161
+ elif search_type == "videos":
162
+ title = result.get("title", "").strip()
163
+ description = result.get("description", "").strip()
164
+ duration = result.get("duration", "").strip()
165
+ published = result.get("published", "").strip()
166
+ uploader = result.get("uploader", "").strip()
167
+ embed_url = result.get("embed_url", "").strip()
168
+ lines.append(f"{index}. {title}")
169
+ if embed_url:
170
+ lines.append(f" Video: {embed_url}")
171
+ if uploader:
172
+ lines.append(f" Uploader: {uploader}")
173
+ if duration:
174
+ lines.append(f" Duration: {duration}")
175
+ if published:
176
+ lines.append(f" Published: {published}")
177
+ if description:
178
+ lines.append(f" Description: {description}")
179
+ elif search_type == "books":
180
+ title = result.get("title", "").strip()
181
+ url = result.get("url", "").strip()
182
+ body = result.get("body", "").strip()
183
+ lines.append(f"{index}. {title}")
184
+ lines.append(f" URL: {url}")
185
+ if body:
186
+ lines.append(f" Description: {body}")
187
+ return lines
188
+
189
+
190
+ @autodoc(
191
+ summary=TOOL_SUMMARY,
192
+ )
193
+ def Web_Search(
194
+ query: Annotated[str, "The search query (supports operators like site:, quotes, OR)."],
195
+ max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
196
+ page: Annotated[int, "Page number for pagination (1-based, each page contains max_results items)."] = 1,
197
+ offset: Annotated[int, "Result offset to start from (overrides page if > 0, for precise continuation)."] = 0,
198
+ search_type: Annotated[str, "Type of search: 'text' (web pages), 'news', 'images', 'videos', or 'books'."] = "text",
199
+ backend: Annotated[str, "Search backend: 'duckduckgo', 'searxng', or 'auto' (SearXNG first, then DDG)."] = "auto",
200
+ date_filter: Annotated[str, "Time filter: any, day, week, month, year."] = "any",
201
+ optimize_query: Annotated[bool, "Use AI to optimize the query for better results (adds ~2s latency)."] = False,
202
+ ) -> str:
203
+ _log_call_start(
204
+ "Web_Search",
205
+ query=query,
206
+ max_results=max_results,
207
+ page=page,
208
+ search_type=search_type,
209
+ offset=offset,
210
+ backend=backend,
211
+ date_filter=date_filter,
212
+ optimize_query=optimize_query,
213
+ )
214
+
215
+ # Query optimization (optional)
216
+ optimization_metadata = None
217
+ if optimize_query:
218
+ try:
219
+ optimizer = get_optimizer()
220
+ query, optimization_metadata = optimizer.optimize_for_search_engine(query)
221
+ except Exception as exc:
222
+ print(f"[Web_Search] Query optimization failed: {exc}", flush=True)
223
+ # Continue with original query
224
+
225
+ if not query or not query.strip():
226
+ result = "No search query provided. Please enter a search term."
227
+ _log_call_end("Web_Search", _truncate_for_log(result))
228
+ return result
229
+ max_results = max(1, min(20, max_results))
230
+ page = max(1, page)
231
+ offset = max(0, offset)
232
+ valid_types = ["text", "news", "images", "videos", "books"]
233
+ if search_type not in valid_types:
234
+ search_type = "text"
235
+ if offset > 0:
236
+ actual_offset = offset
237
+ calculated_page = (offset // max_results) + 1
238
+ else:
239
+ actual_offset = (page - 1) * max_results
240
+ calculated_page = page
241
+ total_needed = actual_offset + max_results
242
+ used_fallback = False
243
+ original_search_type = search_type
244
+ # Prepare cross-cutting parameters
245
+ resolved_backend = _resolve_backend(search_type, (backend or "auto").lower())
246
+ timelimit = _resolve_timelimit(date_filter, search_type)
247
+
248
+ # Map date_filter to SearXNG TimeRange
249
+ _TIME_RANGE_MAP = {
250
+ "day": TimeRange.DAY,
251
+ "week": TimeRange.WEEK,
252
+ "month": TimeRange.MONTH,
253
+ "year": TimeRange.YEAR,
254
+ }
255
+ searxng_time_range = _TIME_RANGE_MAP.get(date_filter.lower()) if date_filter else None
256
+
257
+ def _perform_searxng_search(stype: str) -> list[dict]:
258
+ """Perform search using SearXNG backend."""
259
+ try:
260
+ _search_rate_limiter.acquire()
261
+ with SearXNGClient() as client:
262
+ if stype == "text":
263
+ results = client.text(query, max_results=total_needed, time_range=searxng_time_range)
264
+ return [
265
+ {
266
+ "title": r.title,
267
+ "href": r.url,
268
+ "body": r.content,
269
+ "engine": r.engine,
270
+ }
271
+ for r in results
272
+ ]
273
+ elif stype == "news":
274
+ results = client.news(query, max_results=total_needed, time_range=searxng_time_range)
275
+ return [
276
+ {
277
+ "title": r.title,
278
+ "url": r.url,
279
+ "body": r.content,
280
+ "date": r.published_date or "",
281
+ "source": r.engine or "",
282
+ }
283
+ for r in results
284
+ ]
285
+ elif stype == "images":
286
+ results = client.images(query, max_results=total_needed)
287
+ return [
288
+ {
289
+ "title": r.title,
290
+ "image": r.img_src,
291
+ "url": r.url,
292
+ "source": r.source or r.engine or "",
293
+ "thumbnail": r.thumbnail_src,
294
+ }
295
+ for r in results
296
+ ]
297
+ return []
298
+ except Exception as exc:
299
+ print(f"[Web_Search] SearXNG error: {exc}", flush=True)
300
+ return []
301
+
302
+ def _perform_search(stype: str) -> list[dict]:
303
+ user_backend_choice = (backend or "auto").lower()
304
+
305
+ # Handle SearXNG backend explicitly
306
+ if user_backend_choice == "searxng":
307
+ return _perform_searxng_search(stype)
308
+
309
+ # Handle auto: SearXNG first, then DDG fallback
310
+ if user_backend_choice == "auto":
311
+ # Try SearXNG first
312
+ searxng_results = _perform_searxng_search(stype)
313
+ if searxng_results:
314
+ return searxng_results
315
+ # Fallback to DDG
316
+ print(f"[Web_Search] SearXNG returned no results, falling back to DuckDuckGo", flush=True)
317
+
318
+ try:
319
+ _search_rate_limiter.acquire()
320
+ with DDGS() as ddgs:
321
+ if stype == "text":
322
+ if user_backend_choice == "auto":
323
+ # Auto fallback to DDG after SearXNG failed
324
+ raw_gen = ddgs.text(
325
+ query,
326
+ max_results=total_needed + 10,
327
+ safesearch=_SAFESEARCH_LEVEL,
328
+ timelimit=timelimit,
329
+ backend="duckduckgo",
330
+ )
331
+ else:
332
+ raw_gen = ddgs.text(
333
+ query,
334
+ max_results=total_needed + 10,
335
+ safesearch=_SAFESEARCH_LEVEL,
336
+ timelimit=timelimit,
337
+ backend=resolved_backend,
338
+ )
339
+ elif stype == "news":
340
+ if user_backend_choice == "auto":
341
+ # Auto fallback to DDG after SearXNG failed
342
+ raw_gen = ddgs.news(
343
+ query,
344
+ max_results=total_needed + 10,
345
+ safesearch=_SAFESEARCH_LEVEL,
346
+ timelimit=timelimit,
347
+ backend="duckduckgo",
348
+ )
349
+ else:
350
+ raw_gen = ddgs.news(
351
+ query,
352
+ max_results=total_needed + 10,
353
+ safesearch=_SAFESEARCH_LEVEL,
354
+ timelimit=timelimit,
355
+ backend=_resolve_backend("news", user_backend_choice),
356
+ )
357
+ elif stype == "images":
358
+ raw_gen = ddgs.images(
359
+ query,
360
+ max_results=total_needed + 10,
361
+ safesearch=_SAFESEARCH_LEVEL,
362
+ timelimit=timelimit,
363
+ backend=_resolve_backend("images", (backend or "auto").lower()),
364
+ )
365
+ elif stype == "videos":
366
+ raw_gen = ddgs.videos(
367
+ query,
368
+ max_results=total_needed + 10,
369
+ safesearch=_SAFESEARCH_LEVEL,
370
+ timelimit=timelimit,
371
+ backend=_resolve_backend("videos", (backend or "auto").lower()),
372
+ )
373
+ else:
374
+ raw_gen = ddgs.books(
375
+ query,
376
+ max_results=total_needed + 10,
377
+ backend=_resolve_backend("books", (backend or "auto").lower()),
378
+ )
379
+ try:
380
+ return list(raw_gen)
381
+ except Exception as inner_exc:
382
+ if "no results" in str(inner_exc).lower() or "not found" in str(inner_exc).lower():
383
+ return []
384
+ raise inner_exc
385
+ except Exception as exc:
386
+ error_msg = f"Search failed: {str(exc)[:200]}"
387
+ lowered = str(exc).lower()
388
+ if "blocked" in lowered or "rate" in lowered:
389
+ error_msg = "Search temporarily blocked due to rate limiting. Please try again in a few minutes."
390
+ elif "timeout" in lowered:
391
+ error_msg = "Search timed out. Please try again with a simpler query."
392
+ elif "network" in lowered or "connection" in lowered:
393
+ error_msg = "Network connection error. Please check your internet connection and try again."
394
+ elif "no results" in lowered or "not found" in lowered:
395
+ return []
396
+ raise Exception(error_msg)
397
+
398
+ try:
399
+ raw = _perform_search(search_type)
400
+ except Exception as exc:
401
+ result = f"Error: {exc}"
402
+ _log_call_end("Web_Search", _truncate_for_log(result))
403
+ return result
404
+
405
+ if not raw and search_type == "news":
406
+ try:
407
+ raw = _perform_search("text")
408
+ if raw:
409
+ used_fallback = True
410
+ search_type = "text"
411
+ except Exception:
412
+ pass
413
+
414
+ if not raw:
415
+ fallback_note = " (also tried 'text' search as fallback)" if original_search_type == "news" and used_fallback else ""
416
+ result = f"No {original_search_type} results found for query: {query}{fallback_note}"
417
+ _log_call_end("Web_Search", _truncate_for_log(result))
418
+ return result
419
+
420
+ paginated_results = raw[actual_offset: actual_offset + max_results]
421
+ if not paginated_results:
422
+ if actual_offset >= len(raw):
423
+ result = f"Offset {actual_offset} exceeds available results ({len(raw)} total). Try offset=0 to start from beginning."
424
+ else:
425
+ result = f"No {original_search_type} results found on page {calculated_page} for query: {query}. Try page 1 or reduce page number."
426
+ _log_call_end("Web_Search", _truncate_for_log(result))
427
+ return result
428
+
429
+ total_available = len(raw)
430
+ start_num = actual_offset + 1
431
+ end_num = actual_offset + len(paginated_results)
432
+ next_offset = actual_offset + len(paginated_results)
433
+ search_label = original_search_type.title()
434
+ if used_fallback:
435
+ search_label += " Text (Smart Fallback)"
436
+
437
+ now_dt = datetime.now().astimezone()
438
+ date_str = now_dt.strftime("%A, %B %d, %Y %I:%M %p %Z").strip()
439
+ if not date_str:
440
+ date_str = now_dt.isoformat()
441
+
442
+ pagination_info = f"Page {calculated_page}"
443
+ if offset > 0:
444
+ pagination_info = f"Offset {actual_offset} (≈ {pagination_info})"
445
+ lines = [f"Current Date: {date_str}", f"{search_label} search results for: {query}"]
446
+ if used_fallback:
447
+ lines.append("📍 Note: News search returned no results, automatically searched general web content instead")
448
+ lines.append(f"{pagination_info} (results {start_num}-{end_num} of ~{total_available}+ available)\n")
449
+ for i, result in enumerate(paginated_results, start_num):
450
+ result_lines = _format_search_result(result, search_type, i)
451
+ lines.extend(result_lines)
452
+ lines.append("")
453
+ if total_available > end_num:
454
+ lines.append("💡 More results available:")
455
+ lines.append(f" • Next page: page={calculated_page + 1}")
456
+ lines.append(f" • Next offset: offset={next_offset}")
457
+ lines.append(f" Use offset={next_offset} to continue exactly from result {next_offset + 1}")
458
+ result = "\n".join(lines)
459
+ search_info = f"type={original_search_type}"
460
+ if used_fallback:
461
+ search_info += "→text"
462
+ _log_call_end("Web_Search", f"{search_info} page={calculated_page} offset={actual_offset} results={len(paginated_results)} chars={len(result)}")
463
+ return result
464
+
465
+
466
def build_interface() -> gr.Interface:
    """Assemble the Gradio interface that exposes ``Web_Search``.

    The input components are created in the order they are passed to
    ``gr.Interface``; presumably that order mirrors the positional
    parameters of ``Web_Search`` (signature not visible here — confirm
    before reordering).
    """
    query_box = gr.Textbox(
        label="Query",
        placeholder="topic OR site:example.com",
        max_lines=1,
        info="The search query",
    )
    max_results_slider = gr.Slider(
        minimum=1,
        maximum=20,
        value=5,
        step=1,
        label="Max results",
        info="Number of results to return (1–20)",
    )
    page_slider = gr.Slider(
        minimum=1,
        maximum=10,
        value=1,
        step=1,
        label="Page",
        info="Page number for pagination (ignored if offset > 0)",
    )
    offset_slider = gr.Slider(
        minimum=0,
        maximum=1000,
        value=0,
        step=1,
        label="Offset",
        info="Result offset to start from (overrides page if > 0, use next_offset from previous search)",
    )
    search_type_radio = gr.Radio(
        label="Search Type",
        choices=["text", "news", "images", "videos", "books"],
        value="text",
        info="Type of content to search for",
    )
    backend_radio = gr.Radio(
        label="Backend",
        choices=BACKEND_CHOICES,
        value="auto",
        info="Search backend: auto (SearXNG → DDG), searxng, or duckduckgo",
    )
    date_filter_radio = gr.Radio(
        label="Date filter",
        choices=DATE_FILTER_CHOICES,
        value="any",
        info="Limit results to: day, week, month, or year",
    )
    optimize_checkbox = gr.Checkbox(
        label="Optimize Query",
        value=False,
        info="Use AI to optimize the query for better results (adds ~2s latency)",
    )
    results_box = gr.Textbox(label="Search Results", interactive=False, lines=20, max_lines=20)

    blurb = (
        "<div style=\"text-align:center\">Multi-backend web search (SearXNG + DuckDuckGo) with optional AI query optimization. "
        "Supports text, news, images, videos, and books. Auto backend tries SearXNG first, then DDG fallback.</div>"
    )

    return gr.Interface(
        fn=Web_Search,
        inputs=[
            query_box,
            max_results_slider,
            page_slider,
            offset_slider,
            search_type_radio,
            backend_radio,
            date_filter_radio,
            optimize_checkbox,
        ],
        outputs=results_box,
        title="Web Search",
        description=blurb,
        api_description=TOOL_SUMMARY,
        flagging_mode="never",
        submit_btn="Search",
    )
515
+
516
+
517
# Public names exported by a star-import of this module: the search tool
# function and the factory for its Gradio interface.
__all__ = ["Web_Search", "build_interface"]
Modules/_core.py CHANGED
@@ -1,861 +1,861 @@
1
- """
2
- Core shared utilities for the Nymbo-Tools MCP server.
3
-
4
- Consolidates three key areas:
5
- 1. Sandboxed filesystem operations (path resolution, reading, writing, safe_open)
6
- 2. Sandboxed Python execution (code interpreter, agent terminal)
7
- 3. Hugging Face inference utilities (token, providers, error handling)
8
- """
9
-
10
- from __future__ import annotations
11
-
12
- import ast
13
- import json
14
- import os
15
- import re
16
- import stat
17
- import sys
18
- from datetime import datetime
19
- from io import StringIO
20
- from typing import Any, Callable, Optional, TypeVar
21
-
22
- import gradio as gr
23
-
24
-
25
- # ===========================================================================
26
- # Part 0: Tree Rendering Utilities
27
- # ===========================================================================
28
-
29
-
30
- def _fmt_size(num_bytes: int) -> str:
31
- """Format byte size as human-readable string."""
32
- units = ["B", "KB", "MB", "GB"]
33
- size = float(num_bytes)
34
- for unit in units:
35
- if size < 1024.0:
36
- return f"{size:.1f} {unit}"
37
- size /= 1024.0
38
- return f"{size:.1f} TB"
39
-
40
-
41
def build_tree(entries: list[tuple[str, dict]]) -> dict:
    """
    Build a nested tree structure from flat path entries.

    Args:
        entries: List of (path, metadata) tuples where path uses forward slashes.
                 Paths ending with '/' are treated as directories.

    Returns:
        Nested dict. Every directory node is a dict holding a "__files__"
        list of (name, metadata) tuples, one key per sub-directory, and an
        optional "__meta__" entry carrying the directory's own metadata
        (only stored when that metadata is truthy).
    """
    root: dict = {"__files__": []}

    for path, metadata in entries:
        is_dir = path.endswith("/")
        # Drop empty segments so inputs such as "", "/", "/a" or "a//b"
        # never create a directory or file whose name is the empty string.
        parts = [part for part in path.split("/") if part]
        if not parts:
            continue

        # Walk (and create on demand) every intermediate directory.
        node = root
        for part in parts[:-1]:
            node = node.setdefault(part, {"__files__": []})

        final = parts[-1]
        if is_dir:
            target = node.setdefault(final, {"__files__": []})
            if metadata:
                target["__meta__"] = metadata
        else:
            node["__files__"].append((final, metadata))

    return root
74
-
75
-
76
def render_tree(
    node: dict,
    prefix: str = "",
    format_entry: Optional[Callable[[str, dict, bool], str]] = None,
) -> list[str]:
    """
    Render a tree with line connectors.

    Directories are listed first (sorted by name), followed by files
    (sorted by name). Missing or ``None`` metadata is passed to the
    formatter as an empty dict so formatters can always call ``meta.get``.

    Args:
        node: Nested dict from build_tree()
        prefix: Current line prefix for indentation
        format_entry: Optional callback ``(name, meta, is_dir) -> str`` used
                      to format each entry.

    Returns:
        List of formatted lines.
    """

    def default_format(name: str, meta: dict, is_dir: bool) -> str:
        if is_dir:
            return f"{name}/"
        size = meta.get("size")
        if size is not None:
            return f"{name} ({_fmt_size(size)})"
        return name

    fmt = format_entry or default_format

    subdirs = sorted(k for k in node if k not in ("__files__", "__meta__"))
    files_here = sorted(node.get("__files__", []), key=lambda item: item[0])

    # Normalise metadata here: build_tree() stores whatever the caller
    # supplied, which may be None.
    entries = [("dir", name, node[name], node[name].get("__meta__") or {}) for name in subdirs]
    entries.extend(("file", name, None, meta or {}) for name, meta in files_here)

    result: list[str] = []
    last_index = len(entries) - 1
    for index, (etype, name, subtree, meta) in enumerate(entries):
        is_last = index == last_index
        connector = "└── " if is_last else "├── "

        if etype == "dir":
            result.append(f"{prefix}{connector}{fmt(name, meta, True)}")
            # Children hang below a continuation bar unless this was the
            # last sibling at this level.
            child_prefix = prefix + (" " if is_last else "│ ")
            result.extend(render_tree(subtree, child_prefix, format_entry))
        else:
            result.append(f"{prefix}{connector}{fmt(name, meta, False)}")

    return result
128
-
129
-
130
def walk_and_build_tree(
    abs_path: str,
    *,
    show_hidden: bool = False,
    recursive: bool = False,
    max_entries: int = 100,
) -> tuple[dict, int, bool]:
    """
    Walk a directory and build a tree structure.

    Args:
        abs_path: Directory to walk.
        show_hidden: When False, names starting with '.' are skipped and
                     hidden directories are not descended into.
        recursive: When False, only the top level of ``abs_path`` is listed.
        max_entries: Maximum number of entries (directories plus files) to
                     collect.

    Returns:
        (tree, total_entries, truncated) where ``truncated`` is True only
        when at least one further entry existed but did not fit within
        ``max_entries``.
    """
    entries: list[tuple[str, dict]] = []
    truncated = False

    def _stamp(path: str) -> str:
        return datetime.fromtimestamp(os.path.getmtime(path)).strftime("%Y-%m-%d %H:%M")

    for root, dirs, files in os.walk(abs_path):
        if not show_hidden:
            # In-place assignment also stops os.walk from descending into
            # the hidden directories.
            dirs[:] = [d for d in dirs if not d.startswith(".")]
            files = [f for f in files if not f.startswith(".")]

        dirs.sort()
        files.sort()

        try:
            rel_root = os.path.relpath(root, abs_path)
        except Exception:
            # Fall back to the top level rather than emitting a "/" prefix,
            # which would create an unnamed directory node.
            rel_root = "."
        prefix = "" if rel_root == "." else rel_root.replace("\\", "/") + "/"

        # Directories are listed before files at every level.
        for d in dirs:
            # Check capacity BEFORE adding so that a listing which fits
            # exactly is not reported as truncated.
            if len(entries) >= max_entries:
                truncated = True
                break
            try:
                mtime = _stamp(os.path.join(root, d))
            except (OSError, OverflowError, ValueError):
                mtime = "?"
            entries.append((f"{prefix}{d}/", {"mtime": mtime}))

        if truncated:
            break

        for f in files:
            if len(entries) >= max_entries:
                truncated = True
                break
            p = os.path.join(root, f)
            try:
                size = os.path.getsize(p)
                mtime = _stamp(p)
            except (OSError, OverflowError, ValueError):
                size, mtime = 0, "?"
            entries.append((f"{prefix}{f}", {"size": size, "mtime": mtime}))

        if truncated or not recursive:
            break

    return build_tree(entries), len(entries), truncated
196
-
197
-
198
def format_dir_listing(
    abs_path: str,
    display_path: str,
    *,
    show_hidden: bool = False,
    recursive: bool = False,
    max_entries: int = 100,
    fmt_size_fn: Optional[Callable[[int], str]] = None,
) -> str:
    """Return a textual tree listing of ``abs_path``.

    The output starts with a header (listing path, root, entry count and an
    optional truncation notice), followed by a blank line and the rendered
    tree rooted at "/". ``fmt_size_fn`` overrides the default size formatter.
    """
    size_label = fmt_size_fn or _fmt_size

    def describe(name: str, meta: dict, is_dir: bool) -> str:
        stamp = meta.get("mtime", "")
        if is_dir:
            return f"{name}/ ({stamp})"
        readable = size_label(meta.get("size", 0))
        return f"{name} ({readable}, {stamp})"

    tree, total, truncated = walk_and_build_tree(
        abs_path,
        show_hidden=show_hidden,
        recursive=recursive,
        max_entries=max_entries,
    )

    header_parts = [f"Listing of {display_path}", "Root: /", f"Entries: {total}"]
    if truncated:
        header_parts.append(f"… Truncated at {max_entries} entries.")

    output = ["\n".join(header_parts), "", "└── /"]
    output.extend(render_tree(tree, " ", describe))
    return "\n".join(output).strip()
234
-
235
-
236
- # ===========================================================================
237
- # Part 1: Sandboxed Filesystem Operations
238
- # ===========================================================================
239
-
240
-
241
class SandboxedRoot:
    """
    A configurable sandboxed root directory with path resolution and safety checks.

    User-facing paths are expressed relative to the root and written with a
    leading "/"; error messages are scrubbed so the host location of the root
    is not shown.

    Args:
        root_dir: Absolute path to the sandbox root.
        allow_abs: If True, allow absolute paths outside the sandbox.
    """

    def __init__(self, root_dir: str, allow_abs: bool = False):
        self.root_dir = os.path.abspath(root_dir)
        self.allow_abs = allow_abs
        # Best effort: make sure the root exists. A failure here is ignored
        # on purpose; later operations report their own errors.
        try:
            os.makedirs(self.root_dir, exist_ok=True)
        except Exception:
            pass

    def safe_err(self, exc: Exception | str) -> str:
        """Return an error string with any absolute root replaced by '/' and slashes normalized.

        The root is only replaced where it appears as a whole path prefix,
        i.e. when it is not immediately followed by a word character or '-'.
        This keeps sibling paths that merely share the prefix (root
        ``/srv/box`` vs. ``/srv/boxes``) intact.
        """
        s_norm = str(exc).replace("\\", "/")
        root_fwd = self.root_dir.replace("\\", "/")
        root_variants = {self.root_dir, root_fwd, re.sub(r"/+", "/", root_fwd)}
        # Longest first so the most specific spelling of the root wins and
        # the result does not depend on set iteration order.
        for variant in sorted(root_variants, key=len, reverse=True):
            if variant:
                s_norm = re.sub(re.escape(variant) + r"(?![\w-])", "/", s_norm)
        s_norm = re.sub(r"/+", "/", s_norm)
        return s_norm

    def err(
        self,
        code: str,
        message: str,
        *,
        path: Optional[str] = None,
        hint: Optional[str] = None,
        data: Optional[dict] = None,
    ) -> str:
        """Return a structured error JSON string.

        The payload always contains "status", "code", "message" and "root";
        "path", "hint" and "data" are added only when non-empty.
        """
        payload = {
            "status": "error",
            "code": code,
            "message": message,
            "root": "/",
        }
        if path is not None and path != "":
            payload["path"] = path
        if hint:
            payload["hint"] = hint
        if data:
            payload["data"] = data
        return json.dumps(payload, ensure_ascii=False)

    def display_path(self, abs_path: str) -> str:
        """Return a user-friendly path relative to root using forward slashes.

        Paths that are not under the root (or cannot be compared with it)
        are returned unchanged apart from slash normalization.
        """
        try:
            norm_root = os.path.normpath(self.root_dir)
            norm_abs = os.path.normpath(abs_path)
            common = os.path.commonpath([norm_root, norm_abs])
            if os.path.normcase(common) == os.path.normcase(norm_root):
                rel = os.path.relpath(norm_abs, norm_root)
                if rel == ".":
                    return "/"
                return "/" + rel.replace("\\", "/")
        except Exception:
            pass
        return abs_path.replace("\\", "/")

    def resolve_path(self, path: str) -> tuple[str, str]:
        """
        Resolve a user-provided path to an absolute, normalized path constrained to root.
        Returns (abs_path, error_message). error_message is empty when ok.
        """
        try:
            user_input = (path or "/").strip() or "/"
            if user_input.startswith("/"):
                # A leading "/" means "relative to the sandbox root".
                rel_part = user_input.lstrip("/") or "."
                raw = os.path.expanduser(rel_part)
                treat_as_relative = True
            else:
                raw = os.path.expanduser(user_input)
                treat_as_relative = False

            if not treat_as_relative and os.path.isabs(raw):
                if not self.allow_abs:
                    return "", self.err(
                        "absolute_path_disabled",
                        "Absolute paths are disabled in safe mode.",
                        path=raw.replace("\\", "/"),
                        hint="Use a path relative to / (e.g., /notes/todo.txt).",
                    )
                abs_path = os.path.abspath(raw)
            else:
                abs_path = os.path.abspath(os.path.join(self.root_dir, raw))

            # Constrain to root when not allowing absolute paths
            if not self.allow_abs:
                try:
                    common = os.path.commonpath(
                        [os.path.normpath(self.root_dir), os.path.normpath(abs_path)]
                    )
                    if common != os.path.normpath(self.root_dir):
                        return "", self.err(
                            "path_outside_root",
                            "Path is outside the sandbox root.",
                            path=abs_path,
                        )
                except Exception:
                    # commonpath() failing is treated the same as an escape.
                    return "", self.err(
                        "path_outside_root",
                        "Path is outside the sandbox root.",
                        path=abs_path,
                    )

            return abs_path, ""
        except Exception as exc:
            return "", self.err(
                "resolve_path_failed",
                "Failed to resolve path.",
                path=(path or ""),
                data={"error": self.safe_err(exc)},
            )

    def safe_open(self, file, *args, **kwargs):
        """A drop-in replacement for open() that enforces sandbox constraints.

        Integer file descriptors are passed straight through to ``open``;
        anything else is resolved with ``resolve_path`` first.

        Raises:
            PermissionError: If the path cannot be resolved inside the sandbox.
        """
        if isinstance(file, int):
            return open(file, *args, **kwargs)

        path_str = os.fspath(file)
        abs_path, err = self.resolve_path(path_str)
        if err:
            # resolve_path() reports errors as JSON; surface only the message.
            try:
                msg = json.loads(err)["message"]
            except Exception:
                msg = err
            raise PermissionError(f"Sandboxed open() failed: {msg}")

        return open(abs_path, *args, **kwargs)

    def list_dir(
        self,
        abs_path: str,
        *,
        show_hidden: bool = False,
        recursive: bool = False,
        max_entries: int = 100,
    ) -> str:
        """List directory contents as a visual tree."""
        return format_dir_listing(
            abs_path,
            self.display_path(abs_path),
            show_hidden=show_hidden,
            recursive=recursive,
            max_entries=max_entries,
            fmt_size_fn=_fmt_size,
        )

    def search_text(
        self,
        abs_path: str,
        query: str,
        *,
        recursive: bool = False,
        show_hidden: bool = False,
        max_results: int = 20,
        case_sensitive: bool = False,
        start_index: int = 0,
    ) -> str:
        """Search for text within files.

        Scans ``abs_path`` (a single file, or the files of a directory) line
        by line for a substring match. The first ``start_index`` matches are
        skipped and at most ``max_results`` matches are returned. The report
        states whether more matches exist and which offset continues the
        search. Returns an error JSON string when the path does not exist or
        the query is empty.
        """
        if not os.path.exists(abs_path):
            return self.err(
                "path_not_found",
                f"Path not found: {self.display_path(abs_path)}",
                path=self.display_path(abs_path),
            )

        query = query or ""
        normalized_query = query if case_sensitive else query.lower()
        if normalized_query == "":
            return self.err(
                "missing_search_query",
                "Search query is required for the search action.",
                hint="Provide text in the Content field to search for.",
            )

        max_results = max(1, int(max_results) if max_results is not None else 20)
        start_index = max(0, int(start_index) if start_index is not None else 0)
        matches: list[tuple[str, int, str]] = []
        errors: list[str] = []
        files_scanned = 0
        truncated = False
        total_matches = 0

        def _should_skip(name: str) -> bool:
            return not show_hidden and name.startswith(".")

        def _handle_match(file_path: str, line_no: int, line_text: str) -> bool:
            """Record one match; return True once the result page is full."""
            nonlocal truncated, total_matches
            total_matches += 1
            if total_matches <= start_index:
                return False
            if len(matches) < max_results:
                snippet = line_text.strip()
                if len(snippet) > 200:
                    snippet = snippet[:197] + "…"
                matches.append((self.display_path(file_path), line_no, snippet))
                return False
            # One match beyond the page proves that more results exist.
            truncated = True
            return True

        def _search_file(file_path: str) -> bool:
            """Scan one file; return True when scanning should stop."""
            nonlocal files_scanned
            files_scanned += 1
            try:
                with open(file_path, "r", encoding="utf-8", errors="replace") as handle:
                    for line_no, line in enumerate(handle, start=1):
                        haystack = line if case_sensitive else line.lower()
                        if normalized_query in haystack:
                            if _handle_match(file_path, line_no, line):
                                return True
            except Exception as exc:
                errors.append(f"{self.display_path(file_path)} ({self.safe_err(exc)})")
            return truncated

        if os.path.isfile(abs_path):
            _search_file(abs_path)
        else:
            for root, dirs, files in os.walk(abs_path):
                dirs[:] = [d for d in dirs if not _should_skip(d)]
                visible_files = [f for f in files if show_hidden or not f.startswith(".")]
                for name in visible_files:
                    file_path = os.path.join(root, name)
                    if _search_file(file_path):
                        break
                if truncated:
                    break
                if not recursive:
                    break

        header_lines = [
            f"Search results for {query!r}",
            f"Scope: {self.display_path(abs_path)}",
            f"Recursive: {'yes' if recursive else 'no'}, Hidden: {'yes' if show_hidden else 'no'}, Case-sensitive: {'yes' if case_sensitive else 'no'}",
            f"Start offset: {start_index}",
            f"Matches returned: {len(matches)}" + (" (truncated)" if truncated else ""),
            f"Files scanned: {files_scanned}",
        ]

        next_cursor = start_index + len(matches) if truncated else None

        if truncated:
            header_lines.append(f"Matches encountered before truncation: {total_matches}")
            header_lines.append(f"Truncated: yes — re-run with offset={next_cursor} to continue.")
            header_lines.append(f"Next cursor: {next_cursor}")
        else:
            header_lines.append(f"Total matches found: {total_matches}")
            header_lines.append("Truncated: no — end of results.")
            header_lines.append("Next cursor: None")

        if not matches:
            if total_matches > 0 and start_index >= total_matches:
                # The offset skipped past every match that exists.
                hint_limit = max(total_matches - 1, 0)
                body_lines = [
                    f"No matches found at or after offset {start_index}. Total matches available: {total_matches}.",
                    f"Try a smaller offset (≤ {hint_limit}).",
                ]
            else:
                body_lines = ["No matches found."]
                if total_matches:
                    body_lines.append(f"Total matches encountered: {total_matches}.")
        else:
            body_lines = [
                f"{idx}. {path}:{line_no}: {text}"
                for idx, (path, line_no, text) in enumerate(matches, start=1)
            ]

        if errors:
            shown = errors[:5]
            body_lines.extend(["", "Warnings:"])
            body_lines.extend(shown)
            if len(errors) > len(shown):
                body_lines.append(f"… {len(errors) - len(shown)} additional files could not be read.")

        return "\n".join(header_lines) + "\n\n" + "\n".join(body_lines)

    def read_file(self, abs_path: str, *, offset: int = 0, max_chars: int = 4000) -> str:
        """Read file contents with optional offset and character limit.

        ``offset`` is clamped to the file length; ``max_chars <= 0`` means
        "read to the end". The returned text is a short header (including
        the next cursor when more content remains), a separator and the
        requested chunk. Errors are returned as JSON strings.
        """
        if not os.path.exists(abs_path):
            return self.err(
                "file_not_found",
                f"File not found: {self.display_path(abs_path)}",
                path=self.display_path(abs_path),
            )
        if os.path.isdir(abs_path):
            return self.err(
                "is_directory",
                f"Path is a directory, not a file: {self.display_path(abs_path)}",
                path=self.display_path(abs_path),
                hint="Provide a file path.",
            )
        try:
            with open(abs_path, "r", encoding="utf-8", errors="replace") as f:
                data = f.read()
        except Exception as exc:
            return self.err(
                "read_failed",
                "Failed to read file.",
                path=self.display_path(abs_path),
                data={"error": self.safe_err(exc)},
            )
        total = len(data)
        start = max(0, min(offset, total))
        if max_chars > 0:
            end = min(total, start + max_chars)
        else:
            end = total
        chunk = data[start:end]
        next_cursor = end if end < total else None
        header = (
            f"Reading {self.display_path(abs_path)}\n"
            f"Offset {start}, returned {len(chunk)} of {total}."
            + (f"\nNext cursor: {next_cursor}" if next_cursor is not None else "")
        )
        sep = "\n\n---\n\n"
        return header + sep + chunk

    def info(self, abs_path: str) -> str:
        """Get file/directory metadata as JSON."""
        try:
            st = os.stat(abs_path)
        except Exception as exc:
            return self.err(
                "stat_failed",
                "Failed to stat path.",
                path=self.display_path(abs_path),
                data={"error": self.safe_err(exc)},
            )
        info_dict = {
            "path": self.display_path(abs_path),
            "type": "directory" if stat.S_ISDIR(st.st_mode) else "file",
            "size": st.st_size,
            "modified": datetime.fromtimestamp(st.st_mtime).isoformat(sep=" ", timespec="seconds"),
            # NOTE(review): st_ctime is the creation time on Windows but the
            # last metadata change on most Unix systems.
            "created": datetime.fromtimestamp(st.st_ctime).isoformat(sep=" ", timespec="seconds"),
            "mode": oct(st.st_mode),
            "root": "/",
        }
        return json.dumps(info_dict, indent=2)
592
-
593
-
594
- # ---------------------------------------------------------------------------
595
- # Default roots (can be overridden by environment variables)
596
- # ---------------------------------------------------------------------------
597
-
598
- def _get_filesystem_root() -> str:
599
- """Get the default filesystem root directory."""
600
- root = os.getenv("NYMBO_TOOLS_ROOT")
601
- if root and root.strip():
602
- return os.path.abspath(os.path.expanduser(root.strip()))
603
- try:
604
- here = os.path.abspath(__file__)
605
- tools_dir = os.path.dirname(os.path.dirname(here))
606
- return os.path.abspath(os.path.join(tools_dir, "Filesystem"))
607
- except Exception:
608
- return os.path.abspath(os.getcwd())
609
-
610
-
611
- def _get_obsidian_root() -> str:
612
- """Get the default Obsidian vault root directory."""
613
- env_root = os.getenv("OBSIDIAN_VAULT_ROOT")
614
- if env_root and env_root.strip():
615
- return os.path.abspath(os.path.expanduser(env_root.strip()))
616
- try:
617
- here = os.path.abspath(__file__)
618
- tools_dir = os.path.dirname(os.path.dirname(here))
619
- return os.path.abspath(os.path.join(tools_dir, "Obsidian"))
620
- except Exception:
621
- return os.path.abspath(os.getcwd())
622
-
623
-
624
# Pre-configured sandbox instances
# UNSAFE_ALLOW_ABS_PATHS is read as an integer flag (non-zero enables
# absolute paths). A blank or non-numeric value keeps the safe default
# instead of aborting the module import with ValueError.
try:
    ALLOW_ABS = bool(int(os.getenv("UNSAFE_ALLOW_ABS_PATHS", "0")))
except ValueError:
    ALLOW_ABS = False
626
-
627
# Root directories, resolved once at import time (each helper checks its
# environment variable before falling back to a directory next to the package).
FILESYSTEM_ROOT = _get_filesystem_root()
OBSIDIAN_ROOT = _get_obsidian_root()

# Default sandbox for /Filesystem (used by most tools)
filesystem_sandbox = SandboxedRoot(FILESYSTEM_ROOT, allow_abs=ALLOW_ABS)

# Sandbox for /Obsidian vault
obsidian_sandbox = SandboxedRoot(OBSIDIAN_ROOT, allow_abs=ALLOW_ABS)


# Convenience exports (for backward compatibility)
# ROOT_DIR is an alias of FILESYSTEM_ROOT.
ROOT_DIR = FILESYSTEM_ROOT
639
-
640
def _resolve_path(path: str) -> tuple[str, str]:
    """Resolve path using the default filesystem sandbox.

    Delegates to ``filesystem_sandbox.resolve_path`` and returns its
    ``(abs_path, error_message)`` pair unchanged.
    """
    return filesystem_sandbox.resolve_path(path)
643
-
644
def _display_path(abs_path: str) -> str:
    """Display path using the default filesystem sandbox.

    Delegates to ``filesystem_sandbox.display_path``.
    """
    return filesystem_sandbox.display_path(abs_path)
647
-
648
def safe_open(file, *args, **kwargs):
    """Open file using the default filesystem sandbox.

    Delegates to ``filesystem_sandbox.safe_open``; extra positional and
    keyword arguments are forwarded unchanged.
    """
    return filesystem_sandbox.safe_open(file, *args, **kwargs)
651
-
652
-
653
- # ===========================================================================
654
- # Part 2: Sandboxed Python Execution
655
- # ===========================================================================
656
-
657
-
658
def create_safe_builtins() -> dict:
    """Return a copy of the builtins namespace whose ``open`` is ``safe_open``.

    ``__builtins__`` may be either a dict or the builtins module depending
    on how this module was loaded, so both shapes are handled. All other
    builtins are copied unchanged.
    """
    source = __builtins__ if isinstance(__builtins__, dict) else vars(__builtins__)
    sandbox_builtins = source.copy()
    sandbox_builtins["open"] = safe_open
    return sandbox_builtins
666
-
667
-
668
def sandboxed_exec(
    code: str,
    *,
    extra_globals: dict[str, Any] | None = None,
    ast_mode: bool = False,
) -> str:
    """
    Execute Python code in a sandboxed environment.

    While the code runs, ``sys.stdout`` is redirected to a buffer and the
    working directory is switched to ``ROOT_DIR``; both are restored
    afterwards, even on failure.

    SECURITY NOTE(review): the only restriction applied here is that
    ``open`` is replaced by ``safe_open``. Every other builtin, including
    ``__import__``, is still available to the executed code, so this must
    not be relied on to contain untrusted input.

    Args:
        code: Python source code to execute
        extra_globals: Additional globals to inject (e.g., tools)
        ast_mode: If True, parse and print results of all expression statements
                  (like Agent_Terminal). If False, simple exec (like Code_Interpreter).

    Returns:
        Captured stdout output, or exception text on error. When the code
        raises, only ``str(exc)`` is returned; output printed before the
        failure is not included.
    """
    if not code:
        return "No code provided."

    # Build the execution environment BEFORE touching process-global state,
    # so a failure here (e.g. extra_globals is not a mapping) cannot leave
    # sys.stdout redirected.
    safe_builtins = create_safe_builtins()
    env: dict[str, Any] = {
        "open": safe_open,
        "__builtins__": safe_builtins,
        "print": print,
    }
    if extra_globals:
        env.update(extra_globals)

    old_stdout = sys.stdout
    old_cwd = os.getcwd()
    redirected_output = StringIO()
    sys.stdout = redirected_output

    try:
        os.chdir(ROOT_DIR)

        if ast_mode:
            # Parse and evaluate each statement, printing expression results
            tree = ast.parse(code)
            for node in tree.body:
                if isinstance(node, ast.Expr):
                    # Standalone expression - evaluate and print result
                    expr = compile(ast.Expression(node.value), filename="<string>", mode="eval")
                    result_val = eval(expr, env)
                    if result_val is not None:
                        print(result_val)
                else:
                    # Statement - execute it
                    mod = ast.Module(body=[node], type_ignores=[])
                    exec(compile(mod, filename="<string>", mode="exec"), env)
        else:
            # Simple exec mode
            exec(code, env)

        result = redirected_output.getvalue()
    except Exception as exc:
        result = str(exc)
    finally:
        sys.stdout = old_stdout
        # Restoring the cwd is best effort: the old directory may be gone.
        try:
            os.chdir(old_cwd)
        except Exception:
            pass

    return result
735
-
736
-
737
- # ===========================================================================
738
- # Part 3: Hugging Face Inference Utilities
739
- # ===========================================================================
740
-
741
-
742
- def get_hf_token() -> str | None:
743
- """Get the HF API token from environment variables.
744
-
745
- Checks HF_READ_TOKEN first, then falls back to HF_TOKEN.
746
- """
747
- return os.getenv("HF_READ_TOKEN") or os.getenv("HF_TOKEN")
748
-
749
-
750
- # Pre-instantiated token for modules that prefer this pattern
751
- HF_TOKEN = get_hf_token()
752
-
753
- # Standard provider list for image/video generation
754
- DEFAULT_PROVIDERS = ["auto", "replicate", "fal-ai"]
755
-
756
- # Provider list for text generation (Deep Research)
757
- TEXTGEN_PROVIDERS = ["cerebras", "auto"]
758
-
759
-
760
- T = TypeVar("T")
761
-
762
-
763
- def handle_hf_error(msg: str, model_id: str, *, context: str = "generation") -> None:
764
- """
765
- Raise appropriate gr.Error for common HF API error codes.
766
-
767
- Args:
768
- msg: Error message string to analyze
769
- model_id: The model ID being used (for error messages)
770
- context: Description of operation for error messages
771
-
772
- Raises:
773
- gr.Error: With user-friendly message based on error type
774
- """
775
- lowered = msg.lower()
776
-
777
- if "404" in msg:
778
- raise gr.Error(f"Model not found or unavailable: {model_id}. Check the id and your HF token access.")
779
-
780
- if "503" in msg:
781
- raise gr.Error("The model is warming up. Please try again shortly.")
782
-
783
- if "401" in msg or "403" in msg:
784
- raise gr.Error("Please duplicate the space and provide a `HF_READ_TOKEN` to enable Image and Video Generation.")
785
-
786
- if any(pattern in lowered for pattern in ("api_key", "hf auth login", "unauthorized", "forbidden")):
787
- raise gr.Error("Please duplicate the space and provide a `HF_READ_TOKEN` to enable Image and Video Generation.")
788
-
789
- # If none of the known patterns match, raise generic error
790
- raise gr.Error(f"{context.capitalize()} failed: {msg}")
791
-
792
-
793
- def invoke_with_fallback(
794
- fn: Callable[[str], T],
795
- providers: list[str] | None = None,
796
- ) -> T:
797
- """
798
- Try calling fn(provider) for each provider until one succeeds.
799
-
800
- Args:
801
- fn: Function that takes a provider string and returns a result.
802
- Should raise an exception on failure.
803
- providers: List of provider strings to try. Defaults to DEFAULT_PROVIDERS.
804
-
805
- Returns:
806
- The result from the first successful fn() call.
807
-
808
- Raises:
809
- The last exception if all providers fail.
810
- """
811
- if providers is None:
812
- providers = DEFAULT_PROVIDERS
813
-
814
- last_error: Exception | None = None
815
-
816
- for provider in providers:
817
- try:
818
- return fn(provider)
819
- except Exception as exc:
820
- last_error = exc
821
- continue
822
-
823
- # All providers failed
824
- if last_error:
825
- raise last_error
826
- raise RuntimeError("No providers available")
827
-
828
-
829
- # ===========================================================================
830
- # Public API
831
- # ===========================================================================
832
-
833
- __all__ = [
834
- # Tree Utils
835
- "_fmt_size",
836
- "build_tree",
837
- "render_tree",
838
- "walk_and_build_tree",
839
- "format_dir_listing",
840
- # Filesystem
841
- "SandboxedRoot",
842
- "filesystem_sandbox",
843
- "obsidian_sandbox",
844
- "ROOT_DIR",
845
- "FILESYSTEM_ROOT",
846
- "OBSIDIAN_ROOT",
847
- "ALLOW_ABS",
848
- "_resolve_path",
849
- "_display_path",
850
- "safe_open",
851
- # Execution
852
- "sandboxed_exec",
853
- "create_safe_builtins",
854
- # HF Inference
855
- "get_hf_token",
856
- "HF_TOKEN",
857
- "DEFAULT_PROVIDERS",
858
- "TEXTGEN_PROVIDERS",
859
- "handle_hf_error",
860
- "invoke_with_fallback",
861
- ]
 
1
+ """
2
+ Core shared utilities for the Nymbo-Tools MCP server.
3
+
4
+ Consolidates three key areas:
5
+ 1. Sandboxed filesystem operations (path resolution, reading, writing, safe_open)
6
+ 2. Sandboxed Python execution (code interpreter, agent terminal)
7
+ 3. Hugging Face inference utilities (token, providers, error handling)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import ast
13
+ import json
14
+ import os
15
+ import re
16
+ import stat
17
+ import sys
18
+ from datetime import datetime
19
+ from io import StringIO
20
+ from typing import Any, Callable, Optional, TypeVar
21
+
22
+ import gradio as gr
23
+
24
+
25
+ # ===========================================================================
26
+ # Part 0: Tree Rendering Utilities
27
+ # ===========================================================================
28
+
29
+
30
+ def _fmt_size(num_bytes: int) -> str:
31
+ """Format byte size as human-readable string."""
32
+ units = ["B", "KB", "MB", "GB"]
33
+ size = float(num_bytes)
34
+ for unit in units:
35
+ if size < 1024.0:
36
+ return f"{size:.1f} {unit}"
37
+ size /= 1024.0
38
+ return f"{size:.1f} TB"
39
+
40
+
41
def build_tree(entries: list[tuple[str, dict]]) -> dict:
    """
    Turn flat ``(path, metadata)`` entries into a nested dict tree.

    Args:
        entries: ``(path, metadata)`` tuples. Paths use forward slashes; a
            trailing ``/`` marks the entry as a directory.

    Returns:
        A nested dict. Each level maps sub-directory names to their own level
        dict and keeps its files as ``(name, metadata)`` tuples under the
        ``"__files__"`` key. A directory entry with non-empty metadata stores
        it under ``"__meta__"`` in that directory's dict.
    """
    tree: dict = {"__files__": []}

    for path, metadata in entries:
        *parents, leaf = path.rstrip("/").split("/")

        # Descend to the parent level, creating intermediate levels on demand.
        level = tree
        for segment in parents:
            level = level.setdefault(segment, {"__files__": []})

        if path.endswith("/"):
            directory = level.setdefault(leaf, {"__files__": []})
            if metadata:
                directory["__meta__"] = metadata
        else:
            level["__files__"].append((leaf, metadata))

    return tree
74
+
75
+
76
def render_tree(
    node: dict,
    prefix: str = "",
    format_entry: Optional[Callable[[str, dict, bool], str]] = None,
) -> list[str]:
    """
    Render a tree with box-drawing line connectors.

    Directories are listed first (sorted by name), then files (sorted by
    name). Each nesting level indents by exactly four columns so that child
    connectors line up under their parent's name.

    Args:
        node: Nested dict from build_tree().
        prefix: Line prefix carried down from the parent levels.
        format_entry: Optional callback ``(name, meta, is_dir) -> str`` used to
            format each entry. The default shows ``name/`` for directories and
            ``name (size)`` for files whose metadata has a ``size``.

    Returns:
        List of formatted lines, one per entry, depth-first.
    """

    def default_format(name: str, meta: dict, is_dir: bool) -> str:
        if is_dir:
            return f"{name}/"
        size = meta.get("size")
        if size is not None:
            return f"{name} ({_fmt_size(size)})"
        return name

    fmt = format_entry or default_format

    subdirs = sorted(k for k in node if k not in ("__files__", "__meta__"))
    files_here = sorted(node.get("__files__", []), key=lambda item: item[0])

    # (name, subtree-or-None, metadata); a None subtree marks a file.
    entries = [(name, node[name], node[name].get("__meta__", {})) for name in subdirs]
    entries.extend((fname, None, fmeta) for fname, fmeta in files_here)

    lines: list[str] = []
    last_index = len(entries) - 1
    for index, (name, subtree, meta) in enumerate(entries):
        is_last = index == last_index
        connector = "└── " if is_last else "├── "
        is_dir = subtree is not None
        lines.append(f"{prefix}{connector}{fmt(name, meta, is_dir)}")
        if is_dir:
            # The continuation must be as wide as the 4-column connector,
            # otherwise nested levels drift out of alignment.
            child_prefix = prefix + ("    " if is_last else "│   ")
            lines.extend(render_tree(subtree, child_prefix, format_entry))

    return lines
128
+
129
+
130
def walk_and_build_tree(
    abs_path: str,
    *,
    show_hidden: bool = False,
    recursive: bool = False,
    max_entries: int = 100,
) -> tuple[dict, int, bool]:
    """
    Walk a directory and build a tree structure.

    Args:
        abs_path: Directory to walk.
        show_hidden: Include names starting with ``.`` when True.
        recursive: Descend into sub-directories when True; otherwise only the
            top level is listed.
        max_entries: Maximum number of entries (directories + files) collected.

    Returns:
        ``(tree, total_entries, truncated)`` where ``tree`` comes from
        build_tree(), ``total_entries`` is the number of collected entries and
        ``truncated`` is True only when at least one entry was left out
        because ``max_entries`` had been reached.
    """

    def _mtime_label(path: str) -> str:
        # Naive local time, minute resolution; "?" when the path cannot be read.
        try:
            stamp = datetime.fromtimestamp(os.path.getmtime(path))
        except (OSError, OverflowError, ValueError):
            return "?"
        return stamp.strftime("%Y-%m-%d %H:%M")

    entries: list[tuple[str, dict]] = []
    truncated = False

    for root, dirs, files in os.walk(abs_path):
        if not show_hidden:
            # In-place assignment so os.walk does not descend into hidden dirs.
            dirs[:] = [d for d in dirs if not d.startswith(".")]
            files = [f for f in files if not f.startswith(".")]

        dirs.sort()
        files.sort()

        try:
            rel_root = os.path.relpath(root, abs_path)
        except ValueError:
            rel_root = "."
        # Top-level entries get no prefix; never emit a bare "/" prefix, which
        # would create an empty path component in the tree.
        prefix = "" if rel_root in (".", "") else rel_root.replace("\\", "/") + "/"

        for name in dirs:
            if len(entries) >= max_entries:
                truncated = True
                break
            full = os.path.join(root, name)
            entries.append((f"{prefix}{name}/", {"mtime": _mtime_label(full)}))

        if not truncated:
            for name in files:
                if len(entries) >= max_entries:
                    truncated = True
                    break
                full = os.path.join(root, name)
                try:
                    size = os.path.getsize(full)
                    mtime = _mtime_label(full)
                except OSError:
                    size, mtime = 0, "?"
                entries.append((f"{prefix}{name}", {"size": size, "mtime": mtime}))

        if truncated or not recursive:
            break

    return build_tree(entries), len(entries), truncated
196
+
197
+
198
def format_dir_listing(
    abs_path: str,
    display_path: str,
    *,
    show_hidden: bool = False,
    recursive: bool = False,
    max_entries: int = 100,
    fmt_size_fn: Optional[Callable[[int], str]] = None,
) -> str:
    """
    Format a directory listing as a visual tree.

    Args:
        abs_path: Directory to list.
        display_path: Path shown in the header instead of ``abs_path``.
        show_hidden: Include names starting with ``.`` when True.
        recursive: List sub-directories recursively when True.
        max_entries: Maximum number of entries to show.
        fmt_size_fn: Optional size formatter; defaults to ``_fmt_size``.

    Returns:
        A header (path, root, entry count, optional truncation notice)
        followed by the tree, rooted at a ``└── /`` line.
    """
    fmt_size = fmt_size_fn or _fmt_size

    tree, total, truncated = walk_and_build_tree(
        abs_path,
        show_hidden=show_hidden,
        recursive=recursive,
        max_entries=max_entries,
    )

    def format_entry(name: str, meta: dict, is_dir: bool) -> str:
        mtime = meta.get("mtime", "")
        if is_dir:
            return f"{name}/ ({mtime})"
        size = meta.get("size", 0)
        return f"{name} ({fmt_size(size)}, {mtime})"

    # Entries hang below the "└── /" root line, so they start one full
    # 4-column level in; a narrower prefix misaligns the connectors.
    tree_lines = render_tree(tree, "    ", format_entry)

    header = f"Listing of {display_path}\nRoot: /\nEntries: {total}"
    if truncated:
        header += f"\n… Truncated at {max_entries} entries."

    lines = [header, "", "└── /"]
    lines.extend(tree_lines)

    return "\n".join(lines).strip()
234
+
235
+
236
+ # ===========================================================================
237
+ # Part 1: Sandboxed Filesystem Operations
238
+ # ===========================================================================
239
+
240
+
241
class SandboxedRoot:
    """
    A configurable sandboxed root directory with path resolution and safety checks.

    User-facing output never shows the host location of the root: paths are
    displayed relative to ``/`` and error payloads report ``"root": "/"``.

    Args:
        root_dir: Path to the sandbox root (made absolute on construction).
        allow_abs: If True, allow absolute paths outside the sandbox.
    """

    def __init__(self, root_dir: str, allow_abs: bool = False):
        self.root_dir = os.path.abspath(root_dir)
        self.allow_abs = allow_abs
        # Best effort: a root that cannot be created is reported later by the
        # individual file operations rather than at construction time.
        try:
            os.makedirs(self.root_dir, exist_ok=True)
        except (OSError, ValueError):
            pass

    def safe_err(self, exc: Exception | str) -> str:
        """Return an error string with any absolute root replaced by '/' and slashes normalized."""
        text = str(exc).replace("\\", "/")
        root_fwd = self.root_dir.replace("\\", "/")
        for variant in {self.root_dir, root_fwd, re.sub(r"/+", "/", root_fwd)}:
            if variant:
                text = text.replace(variant, "/")
                # Collapse the "//" left behind where the root was substituted.
                text = re.sub(r"/+", "/", text)
        return text

    def err(
        self,
        code: str,
        message: str,
        *,
        path: Optional[str] = None,
        hint: Optional[str] = None,
        data: Optional[dict] = None,
    ) -> str:
        """Return a structured error JSON string.

        ``path``, ``hint`` and ``data`` are included only when non-empty.
        """
        payload = {
            "status": "error",
            "code": code,
            "message": message,
            "root": "/",
        }
        if path is not None and path != "":
            payload["path"] = path
        if hint:
            payload["hint"] = hint
        if data:
            payload["data"] = data
        return json.dumps(payload, ensure_ascii=False)

    def display_path(self, abs_path: str) -> str:
        """Return a user-friendly path relative to root using forward slashes.

        Paths outside the root (or that cannot be compared with it) are
        returned unchanged apart from slash normalization.
        """
        try:
            norm_root = os.path.normpath(self.root_dir)
            norm_abs = os.path.normpath(abs_path)
            common = os.path.commonpath([norm_root, norm_abs])
            if os.path.normcase(common) == os.path.normcase(norm_root):
                rel = os.path.relpath(norm_abs, norm_root)
                if rel == ".":
                    return "/"
                return "/" + rel.replace("\\", "/")
        except Exception:
            pass
        return abs_path.replace("\\", "/")

    def resolve_path(self, path: str) -> tuple[str, str]:
        """
        Resolve a user-provided path to an absolute, normalized path constrained to root.

        A leading ``/`` means "relative to the sandbox root". Returns
        ``(abs_path, error_message)``; ``error_message`` is an ``err()`` JSON
        string on failure and empty when ok.
        """
        try:
            user_input = (path or "/").strip() or "/"
            if user_input.startswith("/"):
                raw = os.path.expanduser(user_input.lstrip("/") or ".")
                treat_as_relative = True
            else:
                raw = os.path.expanduser(user_input)
                treat_as_relative = False

            if not treat_as_relative and os.path.isabs(raw):
                if not self.allow_abs:
                    return "", self.err(
                        "absolute_path_disabled",
                        "Absolute paths are disabled in safe mode.",
                        path=raw.replace("\\", "/"),
                        hint="Use a path relative to / (e.g., /notes/todo.txt).",
                    )
                abs_path = os.path.abspath(raw)
            else:
                abs_path = os.path.abspath(os.path.join(self.root_dir, raw))

            # Constrain to root when not allowing absolute paths.
            # NOTE(review): abspath() does not resolve symlinks, so a symlink
            # inside the root that points elsewhere passes this check.
            if not self.allow_abs:
                # Report the path as the caller wrote it; echoing abs_path
                # would disclose the host directory layout.
                outside = self.err(
                    "path_outside_root",
                    "Path is outside the sandbox root.",
                    path=user_input.replace("\\", "/"),
                )
                try:
                    norm_root = os.path.normpath(self.root_dir)
                    common = os.path.commonpath([norm_root, os.path.normpath(abs_path)])
                except ValueError:
                    # e.g. different drives on Windows: cannot be inside root.
                    return "", outside
                if os.path.normcase(common) != os.path.normcase(norm_root):
                    return "", outside

            return abs_path, ""
        except Exception as exc:
            return "", self.err(
                "resolve_path_failed",
                "Failed to resolve path.",
                path=(path or ""),
                data={"error": self.safe_err(exc)},
            )

    def safe_open(self, file, *args, **kwargs):
        """A drop-in replacement for open() that enforces sandbox constraints.

        Integer file descriptors are passed straight to ``open()``.

        Raises:
            PermissionError: If the path cannot be resolved inside the sandbox.
        """
        if isinstance(file, int):
            return open(file, *args, **kwargs)

        abs_path, err = self.resolve_path(os.fspath(file))
        if err:
            try:
                msg = json.loads(err)["message"]
            except Exception:
                msg = err
            raise PermissionError(f"Sandboxed open() failed: {msg}")

        return open(abs_path, *args, **kwargs)

    def list_dir(
        self,
        abs_path: str,
        *,
        show_hidden: bool = False,
        recursive: bool = False,
        max_entries: int = 100,
    ) -> str:
        """List directory contents as a visual tree."""
        return format_dir_listing(
            abs_path,
            self.display_path(abs_path),
            show_hidden=show_hidden,
            recursive=recursive,
            max_entries=max_entries,
            fmt_size_fn=_fmt_size,
        )

    def search_text(
        self,
        abs_path: str,
        query: str,
        *,
        recursive: bool = False,
        show_hidden: bool = False,
        max_results: int = 20,
        case_sensitive: bool = False,
        start_index: int = 0,
    ) -> str:
        """Search for text within files.

        Files are visited in sorted order so that ``start_index`` refers to
        the same match on every call. Returns a plain-text report (header,
        numbered matches, optional warnings) or an ``err()`` JSON string when
        the path does not exist or the query is empty.
        """
        if not os.path.exists(abs_path):
            return self.err(
                "path_not_found",
                f"Path not found: {self.display_path(abs_path)}",
                path=self.display_path(abs_path),
            )

        query = query or ""
        normalized_query = query if case_sensitive else query.lower()
        if normalized_query == "":
            return self.err(
                "missing_search_query",
                "Search query is required for the search action.",
                hint="Provide text in the Content field to search for.",
            )

        max_results = max(1, int(max_results) if max_results is not None else 20)
        start_index = max(0, int(start_index) if start_index is not None else 0)

        def _iter_files():
            # Yield candidate files in a deterministic order.
            if os.path.isfile(abs_path):
                yield abs_path
                return
            for root, dirs, files in os.walk(abs_path):
                dirs[:] = sorted(d for d in dirs if show_hidden or not d.startswith("."))
                for name in sorted(files):
                    if show_hidden or not name.startswith("."):
                        yield os.path.join(root, name)
                if not recursive:
                    return

        matches: list[tuple[str, int, str]] = []
        errors: list[str] = []
        files_scanned = 0
        total_matches = 0
        truncated = False

        for file_path in _iter_files():
            files_scanned += 1
            try:
                with open(file_path, "r", encoding="utf-8", errors="replace") as handle:
                    for line_no, line in enumerate(handle, start=1):
                        haystack = line if case_sensitive else line.lower()
                        if normalized_query not in haystack:
                            continue
                        total_matches += 1
                        if total_matches <= start_index:
                            continue  # still before the requested offset
                        if len(matches) >= max_results:
                            # One match beyond the page proves there is more.
                            truncated = True
                            break
                        snippet = line.strip()
                        if len(snippet) > 200:
                            snippet = snippet[:197] + "…"
                        matches.append((self.display_path(file_path), line_no, snippet))
            except OSError as exc:
                errors.append(f"{self.display_path(file_path)} ({self.safe_err(exc)})")
            if truncated:
                break

        header_lines = [
            f"Search results for {query!r}",
            f"Scope: {self.display_path(abs_path)}",
            f"Recursive: {'yes' if recursive else 'no'}, Hidden: {'yes' if show_hidden else 'no'}, Case-sensitive: {'yes' if case_sensitive else 'no'}",
            f"Start offset: {start_index}",
            f"Matches returned: {len(matches)}" + (" (truncated)" if truncated else ""),
            f"Files scanned: {files_scanned}",
        ]

        if truncated:
            next_cursor = start_index + len(matches)
            header_lines.append(f"Matches encountered before truncation: {total_matches}")
            header_lines.append(f"Truncated: yes — re-run with offset={next_cursor} to continue.")
            header_lines.append(f"Next cursor: {next_cursor}")
        else:
            header_lines.append(f"Total matches found: {total_matches}")
            header_lines.append("Truncated: no — end of results.")
            header_lines.append("Next cursor: None")

        if matches:
            body_lines = [
                f"{idx}. {path}:{line_no}: {text}"
                for idx, (path, line_no, text) in enumerate(matches, start=1)
            ]
        elif total_matches > 0 and start_index >= total_matches:
            hint_limit = max(total_matches - 1, 0)
            body_lines = [
                f"No matches found at or after offset {start_index}. Total matches available: {total_matches}.",
                f"Try a smaller offset (≤ {hint_limit}).",
            ]
        else:
            body_lines = ["No matches found."]
            if total_matches:
                body_lines.append(f"Total matches encountered: {total_matches}.")

        if errors:
            shown = errors[:5]
            body_lines.extend(["", "Warnings:"])
            body_lines.extend(shown)
            if len(errors) > len(shown):
                body_lines.append(f"… {len(errors) - len(shown)} additional files could not be read.")

        return "\n".join(header_lines) + "\n\n" + "\n".join(body_lines)

    def read_file(self, abs_path: str, *, offset: int = 0, max_chars: int = 4000) -> str:
        """Read file contents with optional offset and character limit.

        The file is decoded as UTF-8 with undecodable bytes replaced.
        ``max_chars <= 0`` means "read to the end". The result is a short
        header (including ``Next cursor`` when more text remains), a ``---``
        separator, and the requested slice; failures return an ``err()`` JSON
        string.
        """
        shown = self.display_path(abs_path)
        if not os.path.exists(abs_path):
            return self.err("file_not_found", f"File not found: {shown}", path=shown)
        if os.path.isdir(abs_path):
            return self.err(
                "is_directory",
                f"Path is a directory, not a file: {shown}",
                path=shown,
                hint="Provide a file path.",
            )
        try:
            with open(abs_path, "r", encoding="utf-8", errors="replace") as f:
                data = f.read()
        except Exception as exc:
            return self.err(
                "read_failed",
                "Failed to read file.",
                path=shown,
                data={"error": self.safe_err(exc)},
            )

        total = len(data)
        start = max(0, min(offset, total))
        end = min(total, start + max_chars) if max_chars > 0 else total
        chunk = data[start:end]
        next_cursor = end if end < total else None

        header = f"Reading {shown}\nOffset {start}, returned {len(chunk)} of {total}."
        if next_cursor is not None:
            header += f"\nNext cursor: {next_cursor}"
        return header + "\n\n---\n\n" + chunk

    def info(self, abs_path: str) -> str:
        """Get file/directory metadata as JSON."""
        try:
            st = os.stat(abs_path)
        except Exception as exc:
            return self.err(
                "stat_failed",
                "Failed to stat path.",
                path=self.display_path(abs_path),
                data={"error": self.safe_err(exc)},
            )
        info_dict = {
            "path": self.display_path(abs_path),
            "type": "directory" if stat.S_ISDIR(st.st_mode) else "file",
            "size": st.st_size,
            "modified": datetime.fromtimestamp(st.st_mtime).isoformat(sep=" ", timespec="seconds"),
            # NOTE: st_ctime is the last metadata change on Unix, not the
            # creation time; it is the creation time only on Windows.
            "created": datetime.fromtimestamp(st.st_ctime).isoformat(sep=" ", timespec="seconds"),
            "mode": oct(st.st_mode),
            "root": "/",
        }
        return json.dumps(info_dict, indent=2)
592
+
593
+
594
+ # ---------------------------------------------------------------------------
595
+ # Default roots (can be overridden by environment variables)
596
+ # ---------------------------------------------------------------------------
597
+
598
+ def _get_filesystem_root() -> str:
599
+ """Get the default filesystem root directory."""
600
+ root = os.getenv("NYMBO_TOOLS_ROOT")
601
+ if root and root.strip():
602
+ return os.path.abspath(os.path.expanduser(root.strip()))
603
+ try:
604
+ here = os.path.abspath(__file__)
605
+ tools_dir = os.path.dirname(os.path.dirname(here))
606
+ return os.path.abspath(os.path.join(tools_dir, "Filesystem"))
607
+ except Exception:
608
+ return os.path.abspath(os.getcwd())
609
+
610
+
611
+ def _get_obsidian_root() -> str:
612
+ """Get the default Obsidian vault root directory."""
613
+ env_root = os.getenv("OBSIDIAN_VAULT_ROOT")
614
+ if env_root and env_root.strip():
615
+ return os.path.abspath(os.path.expanduser(env_root.strip()))
616
+ try:
617
+ here = os.path.abspath(__file__)
618
+ tools_dir = os.path.dirname(os.path.dirname(here))
619
+ return os.path.abspath(os.path.join(tools_dir, "Obsidian"))
620
+ except Exception:
621
+ return os.path.abspath(os.getcwd())
622
+
623
+
624
# Pre-configured sandbox instances
def _env_flag(name: str, default: bool = False) -> bool:
    """Read a boolean flag from the environment without raising at import time.

    Integers keep their previous meaning (``0`` is False, any other integer is
    True). Common words (true/false, yes/no, on/off) are accepted as well.
    Unset, blank or unrecognized values yield ``default``.
    """
    text = (os.getenv(name) or "").strip().lower()
    if not text:
        return default
    if text in ("true", "yes", "on"):
        return True
    if text in ("false", "no", "off"):
        return False
    try:
        return bool(int(text))
    except ValueError:
        return default


# Opt-in escape hatch; anything unparseable keeps the safe default (False).
ALLOW_ABS = _env_flag("UNSAFE_ALLOW_ABS_PATHS")

FILESYSTEM_ROOT = _get_filesystem_root()
OBSIDIAN_ROOT = _get_obsidian_root()

# Default sandbox for /Filesystem (used by most tools)
filesystem_sandbox = SandboxedRoot(FILESYSTEM_ROOT, allow_abs=ALLOW_ABS)

# Sandbox for /Obsidian vault
obsidian_sandbox = SandboxedRoot(OBSIDIAN_ROOT, allow_abs=ALLOW_ABS)


# Convenience exports (for backward compatibility)
ROOT_DIR = FILESYSTEM_ROOT
639
+
640
def _resolve_path(path: str) -> tuple[str, str]:
    """Module-level shortcut: resolve *path* inside the default filesystem sandbox.

    Returns the ``(abs_path, error_message)`` pair produced by
    ``filesystem_sandbox.resolve_path``.
    """
    sandbox = filesystem_sandbox
    return sandbox.resolve_path(path)
643
+
644
def _display_path(abs_path: str) -> str:
    """Module-level shortcut: show *abs_path* relative to the default filesystem sandbox."""
    sandbox = filesystem_sandbox
    return sandbox.display_path(abs_path)
647
+
648
def safe_open(file, *args, **kwargs):
    """Module-level shortcut: ``open()`` routed through the default filesystem sandbox.

    All positional and keyword arguments are forwarded unchanged to
    ``filesystem_sandbox.safe_open``.
    """
    sandbox = filesystem_sandbox
    return sandbox.safe_open(file, *args, **kwargs)
651
+
652
+
653
+ # ===========================================================================
654
+ # Part 2: Sandboxed Python Execution
655
+ # ===========================================================================
656
+
657
+
658
def create_safe_builtins() -> dict:
    """Return a copy of the builtins namespace whose ``open`` is ``safe_open``.

    ``__builtins__`` is a dict in some execution contexts and a module in
    others, so both shapes are handled. Every other builtin is copied
    unchanged.
    """
    source = __builtins__ if isinstance(__builtins__, dict) else vars(__builtins__)
    namespace = dict(source)
    namespace["open"] = safe_open
    return namespace
666
+
667
+
668
def sandboxed_exec(
    code: str,
    *,
    extra_globals: dict[str, Any] | None = None,
    ast_mode: bool = False,
) -> str:
    """
    Execute Python code with stdout captured and the working directory set to ROOT_DIR.

    NOTE(review): the code runs through ``exec``/``eval`` in this process.
    Only ``open`` is replaced here; treat this as a convenience wrapper, not
    as a security boundary for untrusted code.

    Args:
        code: Python source code to execute.
        extra_globals: Additional globals to inject (e.g., tools).
        ast_mode: If True, run statement by statement and print the value of
            every bare expression statement that is not None (like
            Agent_Terminal). If False, simple exec (like Code_Interpreter).

    Returns:
        Captured stdout output, or exception text on error. An exception with
        an empty message is reported by its class name, and ``exit()`` /
        ``sys.exit()`` inside the code ends execution and returns the output
        captured so far.
    """
    if not code:
        return "No code provided."

    # Build the environment before touching global state, so a failure here
    # cannot leave sys.stdout redirected.
    env: dict[str, Any] = {
        "open": safe_open,
        "__builtins__": create_safe_builtins(),
        "print": print,
    }
    if extra_globals:
        env.update(extra_globals)

    old_stdout = sys.stdout
    old_cwd = os.getcwd()
    captured = StringIO()
    sys.stdout = captured

    try:
        os.chdir(ROOT_DIR)

        if ast_mode:
            for node in ast.parse(code).body:
                if isinstance(node, ast.Expr):
                    # Standalone expression: evaluate it and echo the result.
                    expr = compile(ast.Expression(node.value), filename="<string>", mode="eval")
                    value = eval(expr, env)
                    if value is not None:
                        print(value)
                else:
                    mod = ast.Module(body=[node], type_ignores=[])
                    exec(compile(mod, filename="<string>", mode="exec"), env)
        else:
            exec(code, env)

        result = captured.getvalue()
    except SystemExit:
        # exit() in user code must not take the host process down with it.
        result = captured.getvalue()
    except Exception as exc:
        # str(exc) is empty for e.g. a bare AssertionError; fall back to the
        # class name so the caller never receives a silent failure.
        result = str(exc) or type(exc).__name__
    finally:
        sys.stdout = old_stdout
        try:
            os.chdir(old_cwd)
        except OSError:
            pass

    return result
735
+
736
+
737
+ # ===========================================================================
738
+ # Part 3: Hugging Face Inference Utilities
739
+ # ===========================================================================
740
+
741
+
742
def get_hf_token() -> str | None:
    """Look up the Hugging Face API token in the environment.

    ``HF_READ_TOKEN`` takes precedence when it is set to a non-empty value;
    otherwise whatever ``HF_TOKEN`` holds (possibly ``None``) is returned.
    """
    preferred = os.getenv("HF_READ_TOKEN")
    if preferred:
        return preferred
    return os.getenv("HF_TOKEN")
748
+
749
+
750
# Pre-instantiated token for modules that prefer this pattern.
# Read once at import time: later changes to the environment are not reflected
# here; call get_hf_token() for a fresh lookup.
HF_TOKEN = get_hf_token()

# Standard provider list for image/video generation, in the order tried.
DEFAULT_PROVIDERS = ["auto", "replicate", "fal-ai"]

# Provider list for text generation (Deep Research), in the order tried.
TEXTGEN_PROVIDERS = ["cerebras", "auto"]


# Generic result type of the callable passed to invoke_with_fallback().
T = TypeVar("T")
761
+
762
+
763
def handle_hf_error(msg: str, model_id: str, *, context: str = "generation") -> None:
    """
    Raise appropriate gr.Error for common HF API error codes.

    This function never returns normally: every path raises.

    Args:
        msg: Error message string to analyze.
        model_id: The model ID being used (for error messages).
        context: Description of operation for error messages.

    Raises:
        gr.Error: With user-friendly message based on error type.
    """

    def mentions_status(*codes: str) -> bool:
        # Match the code only as a standalone number, so that longer numbers
        # such as "4040" or "15030" are not mistaken for an HTTP status.
        return any(re.search(rf"(?<!\d){code}(?!\d)", msg) for code in codes)

    lowered = msg.lower()
    auth_message = (
        "Please duplicate the space and provide a `HF_READ_TOKEN` to enable Image and Video Generation."
    )

    if mentions_status("404"):
        raise gr.Error(f"Model not found or unavailable: {model_id}. Check the id and your HF token access.")

    if mentions_status("503"):
        raise gr.Error("The model is warming up. Please try again shortly.")

    if mentions_status("401", "403"):
        raise gr.Error(auth_message)

    if any(pattern in lowered for pattern in ("api_key", "hf auth login", "unauthorized", "forbidden")):
        raise gr.Error(auth_message)

    # If none of the known patterns match, raise generic error
    raise gr.Error(f"{context.capitalize()} failed: {msg}")
791
+
792
+
793
def invoke_with_fallback(
    fn: Callable[[str], T],
    providers: list[str] | None = None,
) -> T:
    """
    Try calling fn(provider) for each provider until one succeeds.

    Args:
        fn: Function that takes a provider string and returns a result.
            Should raise an exception on failure.
        providers: List of provider strings to try, in order. Defaults to
            DEFAULT_PROVIDERS.

    Returns:
        The result from the first successful fn() call.

    Raises:
        Exception: The last exception raised by fn if all providers fail.
        RuntimeError: If the provider list is empty.
    """
    if providers is None:
        providers = DEFAULT_PROVIDERS

    last_error: Exception | None = None

    for provider in providers:
        try:
            return fn(provider)
        except Exception as exc:
            last_error = exc

    # Identity check, not truthiness: an exception type may define
    # __bool__/__len__ and evaluate as false.
    if last_error is not None:
        raise last_error
    raise RuntimeError("No providers available")
827
+
828
+
829
+ # ===========================================================================
830
+ # Public API
831
+ # ===========================================================================
832
+
833
# Names re-exported by ``from <module> import *``. A few underscore-prefixed
# helpers are listed explicitly; without an entry here a star import would
# skip them.
__all__ = [
    # Tree Utils
    "_fmt_size",
    "build_tree",
    "render_tree",
    "walk_and_build_tree",
    "format_dir_listing",
    # Filesystem
    "SandboxedRoot",
    "filesystem_sandbox",
    "obsidian_sandbox",
    "ROOT_DIR",
    "FILESYSTEM_ROOT",
    "OBSIDIAN_ROOT",
    "ALLOW_ABS",
    "_resolve_path",
    "_display_path",
    "safe_open",
    # Execution
    "sandboxed_exec",
    "create_safe_builtins",
    # HF Inference
    "get_hf_token",
    "HF_TOKEN",
    "DEFAULT_PROVIDERS",
    "TEXTGEN_PROVIDERS",
    "handle_hf_error",
    "invoke_with_fallback",
]
Modules/_docstrings.py CHANGED
@@ -1,149 +1,149 @@
1
- from __future__ import annotations
2
-
3
- import inspect
4
- import re
5
- from typing import Any, Annotated, get_args, get_origin, get_type_hints
6
-
7
-
8
- def _typename(tp: Any) -> str:
9
- """Return a readable type name from a type or annotation."""
10
- try:
11
- if hasattr(tp, "__name__"):
12
- return tp.__name__ # e.g. int, str
13
- if getattr(tp, "__module__", None) and getattr(tp, "__qualname__", None):
14
- return f"{tp.__module__}.{tp.__qualname__}"
15
- return str(tp).replace("typing.", "")
16
- except Exception:
17
- return str(tp)
18
-
19
-
20
- def _parse_string_annotation(annot_str: str) -> tuple[str | None, str | None]:
21
- """
22
- Parse a string annotation like "Annotated[Optional[str], 'description']"
23
- and extract the base type name and the description metadata.
24
-
25
- Returns (base_type_name, description) or (None, None) if parsing fails.
26
- """
27
- if not isinstance(annot_str, str):
28
- return None, None
29
-
30
- # Match Annotated[..., 'description'] or Annotated[..., "description"]
31
- # Pattern: Annotated[<base_type>, '<description>'] or with double quotes
32
- match = re.match(
33
- r"^Annotated\[(.+?),\s*['\"](.+?)['\"]\s*\]$",
34
- annot_str.strip(),
35
- re.DOTALL,
36
- )
37
- if match:
38
- base_type_str = match.group(1).strip()
39
- description = match.group(2)
40
- # Simplify Optional[X] -> just the base type for display
41
- opt_match = re.match(r"^Optional\[(.+)\]$", base_type_str)
42
- if opt_match:
43
- base_type_str = opt_match.group(1).strip()
44
- return base_type_str, description
45
-
46
- return None, None
47
-
48
-
49
- def _extract_base_and_meta(annotation: Any) -> tuple[Any, str | None]:
50
- """Given an annotation, return (base_type, first string metadata) if Annotated, else (annotation, None)."""
51
- try:
52
- # Handle string annotations from PEP 563 (__future__.annotations)
53
- if isinstance(annotation, str):
54
- base_str, meta = _parse_string_annotation(annotation)
55
- if meta:
56
- return base_str or annotation, meta
57
- return annotation, None
58
-
59
- if get_origin(annotation) is Annotated:
60
- args = get_args(annotation)
61
- base = args[0] if args else annotation
62
- # Grab the first string metadata if present
63
- for meta in args[1:]:
64
- if isinstance(meta, str):
65
- return base, meta
66
- return base, None
67
- return annotation, None
68
- except Exception:
69
- return annotation, None
70
-
71
-
72
- def autodoc(summary: str | None = None, returns: str | None = None, *, force: bool = False):
73
- """
74
- Decorator that auto-generates a concise Google-style docstring from a function's
75
- type hints and Annotated metadata. Useful for Gradio MCP where docstrings are
76
- used for tool descriptions and parameter docs.
77
-
78
- Args:
79
- summary: Optional one-line summary for the function. If not provided,
80
- will generate a simple sentence from the function name.
81
- returns: Optional return value description. If not provided, only the
82
- return type will be listed (if available).
83
- force: When True, overwrite an existing docstring. Default False.
84
-
85
- Returns:
86
- The original function with its __doc__ populated (unless skipped).
87
- """
88
-
89
- def decorator(func):
90
- # Skip if docstring already present and not forcing
91
- if not force and func.__doc__ and func.__doc__.strip():
92
- return func
93
-
94
- try:
95
- # include_extras=True to retain Annotated metadata
96
- hints = get_type_hints(func, include_extras=True, globalns=getattr(func, "__globals__", None))
97
- except Exception:
98
- hints = {}
99
-
100
- sig = inspect.signature(func)
101
-
102
- lines: list[str] = []
103
- # Summary line
104
- if summary and summary.strip():
105
- lines.append(summary.strip())
106
- else:
107
- pretty = func.__name__.replace("_", " ").strip().capitalize()
108
- if not pretty.endswith("."):
109
- pretty += "."
110
- lines.append(pretty)
111
-
112
- # Args section
113
- if sig.parameters:
114
- lines.append("")
115
- lines.append("Args:")
116
- for name, param in sig.parameters.items():
117
- if name == "self":
118
- continue
119
- annot = hints.get(name, param.annotation)
120
- base, meta = _extract_base_and_meta(annot)
121
- tname = _typename(base) if base is not inspect._empty else None
122
- desc = meta or ""
123
- if tname and tname != str(inspect._empty):
124
- lines.append(f" {name} ({tname}): {desc}".rstrip())
125
- else:
126
- lines.append(f" {name}: {desc}".rstrip())
127
-
128
- # Returns section
129
- ret_hint = hints.get("return", sig.return_annotation)
130
- if returns or (ret_hint and ret_hint is not inspect.Signature.empty):
131
- lines.append("")
132
- lines.append("Returns:")
133
- if returns:
134
- lines.append(f" {returns}")
135
- else:
136
- base, meta = _extract_base_and_meta(ret_hint)
137
- rtype = _typename(base)
138
- if meta:
139
- lines.append(f" {rtype}: {meta}")
140
- else:
141
- lines.append(f" {rtype}")
142
-
143
- func.__doc__ = "\n".join(lines).strip() + "\n"
144
- return func
145
-
146
- return decorator
147
-
148
-
149
- __all__ = ["autodoc"]
 
1
+ from __future__ import annotations
2
+
3
+ import inspect
4
+ import re
5
+ from typing import Any, Annotated, get_args, get_origin, get_type_hints
6
+
7
+
8
+ def _typename(tp: Any) -> str:
9
+ """Return a readable type name from a type or annotation."""
10
+ try:
11
+ if hasattr(tp, "__name__"):
12
+ return tp.__name__ # e.g. int, str
13
+ if getattr(tp, "__module__", None) and getattr(tp, "__qualname__", None):
14
+ return f"{tp.__module__}.{tp.__qualname__}"
15
+ return str(tp).replace("typing.", "")
16
+ except Exception:
17
+ return str(tp)
18
+
19
+
20
+ def _parse_string_annotation(annot_str: str) -> tuple[str | None, str | None]:
21
+ """
22
+ Parse a string annotation like "Annotated[Optional[str], 'description']"
23
+ and extract the base type name and the description metadata.
24
+
25
+ Returns (base_type_name, description) or (None, None) if parsing fails.
26
+ """
27
+ if not isinstance(annot_str, str):
28
+ return None, None
29
+
30
+ # Match Annotated[..., 'description'] or Annotated[..., "description"]
31
+ # Pattern: Annotated[<base_type>, '<description>'] or with double quotes
32
+ match = re.match(
33
+ r"^Annotated\[(.+?),\s*['\"](.+?)['\"]\s*\]$",
34
+ annot_str.strip(),
35
+ re.DOTALL,
36
+ )
37
+ if match:
38
+ base_type_str = match.group(1).strip()
39
+ description = match.group(2)
40
+ # Simplify Optional[X] -> just the base type for display
41
+ opt_match = re.match(r"^Optional\[(.+)\]$", base_type_str)
42
+ if opt_match:
43
+ base_type_str = opt_match.group(1).strip()
44
+ return base_type_str, description
45
+
46
+ return None, None
47
+
48
+
49
+ def _extract_base_and_meta(annotation: Any) -> tuple[Any, str | None]:
50
+ """Given an annotation, return (base_type, first string metadata) if Annotated, else (annotation, None)."""
51
+ try:
52
+ # Handle string annotations from PEP 563 (__future__.annotations)
53
+ if isinstance(annotation, str):
54
+ base_str, meta = _parse_string_annotation(annotation)
55
+ if meta:
56
+ return base_str or annotation, meta
57
+ return annotation, None
58
+
59
+ if get_origin(annotation) is Annotated:
60
+ args = get_args(annotation)
61
+ base = args[0] if args else annotation
62
+ # Grab the first string metadata if present
63
+ for meta in args[1:]:
64
+ if isinstance(meta, str):
65
+ return base, meta
66
+ return base, None
67
+ return annotation, None
68
+ except Exception:
69
+ return annotation, None
70
+
71
+
72
def autodoc(summary: str | None = None, returns: str | None = None, *, force: bool = False):
    """
    Decorator that auto-generates a concise Google-style docstring from a function's
    type hints and Annotated metadata. Useful for Gradio MCP where docstrings are
    used for tool descriptions and parameter docs.

    Args:
        summary: Optional one-line summary for the function. If not provided,
            will generate a simple sentence from the function name.
        returns: Optional return value description. If not provided, only the
            return type will be listed (if available).
        force: When True, overwrite an existing docstring. Default False.

    Returns:
        The original function with its __doc__ populated (unless skipped).
    """

    def decorator(func):
        # Skip if docstring already present and not forcing
        if not force and func.__doc__ and func.__doc__.strip():
            return func

        try:
            # include_extras=True to retain Annotated metadata
            hints = get_type_hints(func, include_extras=True, globalns=getattr(func, "__globals__", None))
        except Exception:
            # Unresolvable annotations: fall back to the raw values on the signature.
            hints = {}

        sig = inspect.signature(func)
        empty = inspect.Parameter.empty  # public alias of the "no annotation" sentinel

        lines: list[str] = []
        # Summary line
        if summary and summary.strip():
            lines.append(summary.strip())
        else:
            pretty = func.__name__.replace("_", " ").strip().capitalize()
            if not pretty.endswith("."):
                pretty += "."
            lines.append(pretty)

        # Args section. `self` is never documented, so decide whether to emit
        # the header from the remaining parameters; testing sig.parameters
        # itself would leave a dangling "Args:" on methods that only take self.
        documented = [(name, param) for name, param in sig.parameters.items() if name != "self"]
        if documented:
            lines.append("")
            lines.append("Args:")
            for name, param in documented:
                annot = hints.get(name, param.annotation)
                base, meta = _extract_base_and_meta(annot)
                tname = _typename(base) if base is not empty else None
                desc = meta or ""
                if tname and tname != str(empty):
                    lines.append(f" {name} ({tname}): {desc}".rstrip())
                else:
                    lines.append(f" {name}: {desc}".rstrip())

        # Returns section
        ret_hint = hints.get("return", sig.return_annotation)
        if returns or (ret_hint and ret_hint is not inspect.Signature.empty):
            lines.append("")
            lines.append("Returns:")
            if returns:
                lines.append(f" {returns}")
            else:
                base, meta = _extract_base_and_meta(ret_hint)
                rtype = _typename(base)
                if meta:
                    lines.append(f" {rtype}: {meta}")
                else:
                    lines.append(f" {rtype}")

        func.__doc__ = "\n".join(lines).strip() + "\n"
        return func

    return decorator
147
+
148
+
149
# Only the decorator is exported; the underscore-prefixed helpers are internal.
__all__ = ["autodoc"]
Modules/_pollinations_client.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from typing import Annotated, Any, Literal
5
+
6
+ import httpx
7
+ import gradio as gr
8
+
9
+ from app import _log_call_end, _log_call_start, _truncate_for_log
10
+ from ._docstrings import autodoc
11
+
12
+
13
+ # ===========================================================================
14
+ # Constants
15
+ # ===========================================================================
16
+
17
# Root of the Pollinations text API; the client appends "/v1/chat/completions".
BASE_URL = "https://text.pollinations.ai"

# Depth level -> Pollinations model identifier. The keys also define the set
# of accepted `depth` values: web_search rejects anything not in this mapping.
MODEL_MAPPING = {
    "fast": "gemini-search",
    "normal": "perplexity-fast",
    "deep": "perplexity-reasoning",
}

# System prompt keyed by the boolean `detailed` flag:
# True asks for a comprehensive answer, False for a concise one.
SYSTEM_PROMPTS = {
    True: "Search the web and provide a comprehensive answer with sources. Include relevant details and cite your sources.",
    False: "Search the web and provide a concise, accurate answer. Include source URLs.",
}

# Default per-request timeout (seconds) used by PollinationsClient
REQUEST_TIMEOUT = 30.0

# Single source of truth for the LLM-facing tool description
# (passed to both @autodoc and gr.Interface(api_description=...)).
TOOL_SUMMARY = (
    "Search the web using AI-powered search models with source citations. "
    "Supports different depth levels: fast (Gemini with Google Search), normal (Perplexity Sonar), "
    "and deep (Perplexity Sonar Reasoning). Returns answers with source URLs."
)
41
+
42
+
43
+ # ===========================================================================
44
+ # Core Client
45
+ # ===========================================================================
46
+
47
+
48
class PollinationsClient:
    """
    HTTP client for Pollinations AI web search API.

    Provides web search functionality with different depth levels and citation support.
    """

    def __init__(
        self,
        base_url: str = BASE_URL,
        timeout: float = REQUEST_TIMEOUT,
        api_key: str | None = None,
    ) -> None:
        """
        Initialize the Pollinations client.

        Args:
            base_url: Base URL for the Pollinations API (default: https://text.pollinations.ai)
            timeout: Request timeout in seconds (default: 30)
            api_key: Optional API key (reads from POLLINATIONS_API_KEY env var if not provided)
        """
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.api_key = api_key or os.getenv("POLLINATIONS_API_KEY")

    def _get_headers(self) -> dict[str, str]:
        """Get request headers including API key if available."""
        headers = {
            "Content-Type": "application/json",
        }
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        return headers

    def _resolve_model(self, depth: str) -> str:
        """
        Resolve depth level to actual model name.

        Args:
            depth: Depth level ('fast', 'normal', or 'deep')

        Returns:
            The model identifier for the Pollinations API
        """
        return MODEL_MAPPING.get(depth, "perplexity-fast")

    @staticmethod
    def _extract_sources(data: dict[str, Any], answer: Any) -> list[Any]:
        """Return the payload's citations, or URLs scraped from the answer text."""
        citations = data.get("citations")
        # Only trust a real, non-empty list; anything else falls back to scraping.
        if isinstance(citations, list) and citations:
            return citations
        if not isinstance(answer, str):
            return []

        import re

        url_pattern = r'https?://[^\s<>"\'\)]+'
        # The pattern also swallows sentence punctuation that directly follows
        # a URL ("see https://example.com."), so trim it before de-duplicating.
        trimmed = (found.rstrip(".,;:!?]*") for found in re.findall(url_pattern, answer))
        return list(dict.fromkeys(link for link in trimmed if link))  # Unique URLs, order kept

    async def web_search(
        self,
        query: str,
        depth: str = "normal",
        detailed: bool = False,
    ) -> dict[str, Any]:
        """
        Perform web search using Pollinations AI.

        Args:
            query: The search query
            depth: Search depth level ('fast', 'normal', or 'deep')
            detailed: Whether to request a comprehensive answer

        Returns:
            Dictionary with keys:
            - answer: The generated answer
            - sources: List of source URLs (citations)
            - model: The model used
            - query: The original query

        Raises:
            httpx.HTTPError: For network/HTTP errors, timeouts, rate limiting,
                and response bodies that are not valid JSON
            ValueError: For invalid parameters
        """
        if not query or not query.strip():
            raise ValueError("Query cannot be empty")

        if depth not in MODEL_MAPPING:
            raise ValueError(f"Invalid depth: {depth}. Must be one of {list(MODEL_MAPPING.keys())}")

        model = self._resolve_model(depth)
        system_prompt = SYSTEM_PROMPTS.get(detailed, SYSTEM_PROMPTS[False])

        # Prepare OpenAI-compatible request
        payload = {
            "model": model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": query},
            ],
        }

        url = f"{self.base_url}/v1/chat/completions"

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            try:
                response = await client.post(
                    url,
                    json=payload,
                    headers=self._get_headers(),
                )
                response.raise_for_status()
            except httpx.TimeoutException as exc:
                raise httpx.HTTPError(f"Request timed out after {self.timeout}s") from exc
            except httpx.HTTPStatusError as exc:
                # Handle rate limiting specifically
                if exc.response.status_code == 429:
                    raise httpx.HTTPError("Rate limited. Please try again later.") from exc
                raise

            try:
                data = response.json()
            except ValueError as exc:
                # JSON decode errors are ValueError subclasses; left alone they
                # would be indistinguishable from the parameter-validation
                # errors raised above.
                raise httpx.HTTPError("Pollinations returned a response that is not valid JSON") from exc

        # A non-object payload carries neither choices nor citations.
        if not isinstance(data, dict):
            data = {}

        # OpenAI-compatible response format
        answer = ""
        choices = data.get("choices")
        if choices:
            # `message` may be present but null; treat that like a missing message.
            message = choices[0].get("message") or {}
            answer = message.get("content", "")

        # Citations are a Pollinations-specific extension; fall back to URLs
        # embedded in the answer text when none are supplied.
        sources = self._extract_sources(data, answer)

        return {
            "answer": answer,
            "sources": sources,
            "model": model,
            "query": query,
        }

    def web_search_sync(
        self,
        query: str,
        depth: str = "normal",
        detailed: bool = False,
    ) -> dict[str, Any]:
        """
        Synchronous version of web_search.

        Runs the coroutine with asyncio.run, so it must be called from a thread
        that is not already running an event loop (asyncio.run raises
        RuntimeError otherwise).

        Args:
            query: The search query
            depth: Search depth level ('fast', 'normal', or 'deep')
            detailed: Whether to request a comprehensive answer

        Returns:
            Dictionary with answer, sources, model, and query
        """
        import asyncio

        return asyncio.run(self.web_search(query, depth, detailed))
203
+
204
+
205
+ # ===========================================================================
206
+ # Gradio Tool Function
207
+ # ===========================================================================
208
+
209
+
210
# force=True is required here: autodoc leaves any existing docstring alone, so
# without it the hand-written docstring below would win and neither
# TOOL_SUMMARY nor the Annotated parameter descriptions would reach __doc__.
@autodoc(
    summary=TOOL_SUMMARY,
    force=True,
)
def Pollinations_Web_Search(
    query: Annotated[str, "The search query string"],
    depth: Annotated[
        Literal["fast", "normal", "deep"],
        "Search depth: 'fast' (Gemini with Google Search), 'normal' (Perplexity Sonar), or 'deep' (Perplexity Sonar Reasoning).",
    ] = "normal",
    detailed: Annotated[bool, "Request a comprehensive answer instead of concise summary"] = False,
) -> str:
    """
    Search the web using Pollinations AI with source citations.

    Uses AI-powered search models that provide direct answers with source citations.
    Supports three depth levels for different search capabilities.

    Failures are returned as a message string rather than raised, so the
    caller always receives text.
    """
    _log_call_start("Pollinations_Web_Search", query=query, depth=depth, detailed=detailed)

    try:
        client = PollinationsClient()
        result = client.web_search_sync(query, depth, detailed)

        # Format the result for display
        lines = [
            f"Query: {result['query']}",
            f"Model: {result['model']}",
            f"Depth: {depth}",
            "",
            "Answer:",
            result["answer"] or "No answer generated.",
        ]

        if result["sources"]:
            lines.append("")
            lines.append("Sources:")
            for i, source in enumerate(result["sources"], 1):
                lines.append(f" {i}. {source}")
        else:
            lines.append("")
            lines.append("(No sources provided)")

        formatted_result = "\n".join(lines)
        _log_call_end("Pollinations_Web_Search", _truncate_for_log(formatted_result))
        return formatted_result

    except ValueError as exc:
        # Raised by the client for an empty query or an unknown depth.
        error_msg = f"Invalid input: {exc}"
        _log_call_end("Pollinations_Web_Search", error_msg)
        return error_msg
    except httpx.HTTPError as exc:
        error_msg = f"Search failed: {exc}"
        _log_call_end("Pollinations_Web_Search", error_msg)
        return error_msg
    except Exception as exc:
        # Top-level tool boundary: report anything else instead of crashing the UI.
        error_msg = f"Unexpected error: {exc}"
        _log_call_end("Pollinations_Web_Search", error_msg)
        return error_msg
268
+
269
+
270
+ # ===========================================================================
271
+ # Gradio Interface
272
+ # ===========================================================================
273
+
274
+
275
def build_interface() -> gr.Interface:
    """Assemble the Gradio UI that fronts ``Pollinations_Web_Search``.

    Three inputs (query text, depth radio, detailed checkbox) feed the tool
    function; its string result is shown in a read-only textbox.
    """
    query_box = gr.Textbox(
        label="Query",
        placeholder="Enter your search query here...",
        max_lines=2,
        info="The search query",
    )
    depth_picker = gr.Radio(
        label="Search Depth",
        choices=["fast", "normal", "deep"],
        value="normal",
        info="Search depth level: fast (Gemini), normal (Perplexity), deep (Reasoning)",
    )
    detail_toggle = gr.Checkbox(
        label="Detailed Answer",
        value=False,
        info="Request a comprehensive answer instead of concise summary",
    )
    results_box = gr.Textbox(
        label="Search Results",
        interactive=False,
        lines=15,
        max_lines=20,
    )
    blurb = (
        "<div style=\"text-align:center\">AI-powered web search with source citations. "
        "Uses Google Search, Perplexity Sonar, and Perplexity Sonar Reasoning models "
        "to provide direct answers with reliable source URLs.</div>"
    )

    return gr.Interface(
        fn=Pollinations_Web_Search,
        inputs=[query_box, depth_picker, detail_toggle],
        outputs=results_box,
        title="Pollinations Web Search",
        description=blurb,
        api_description=TOOL_SUMMARY,
        flagging_mode="never",
        submit_btn="Search",
    )
314
+
315
+
316
+ # ===========================================================================
317
+ # Public API
318
+ # ===========================================================================
319
+
320
# Public surface of this module: the client class, the tool function, and the UI builder.
__all__ = [
    "PollinationsClient",
    "Pollinations_Web_Search",
    "build_interface",
]
Modules/_query_optimizer.py ADDED
@@ -0,0 +1,781 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Query Optimizer Module with Self-Consistency Chain-of-Thought (SC-CoT).
3
+
4
+ Optimizes search queries using AI-generated candidate scoring with a fallback chain:
5
+ 1. Mistral API (magistral-medium-2509) - Primary
6
+ 2. HuggingFace Inference (openai/gpt-oss-20b:cheapest) - Fallback
7
+ 3. Bypass (return raw query) - Final fallback
8
+
9
+ Two optimization modes:
10
+ - optimize_for_search_engine(): Boolean operators, site:, filetype:, exact phrases
11
+ - optimize_for_ai_search(): Clear intent, context, specific questions
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import os
18
+ import re
19
+ from typing import Annotated, Any, Literal
20
+
21
+ import gradio as gr
22
+ from pydantic import BaseModel, Field
23
+
24
+ from app import _log_call_end, _log_call_start, _truncate_for_log
25
+ from ._docstrings import autodoc
26
+
27
+
28
+ # ===========================================================================
29
+ # Pydantic Schemas for Structured Output
30
+ # ===========================================================================
31
+
32
+
33
class OptimizedCandidate(BaseModel):
    """A single optimized query candidate with reasoning."""

    # 1-based identifier of this candidate.
    version: int = Field(description="Candidate version number (1-based)")
    # The rewritten query text.
    optimized_query: str = Field(description="The optimized query string")
    # One entry per reasoning step explaining the rewrite.
    reasoning: list[str] = Field(description="List of reasoning steps explaining optimizations")
39
+
40
+
41
class GenerationOutput(BaseModel):
    """Output from candidate generation phase."""

    # The query as the user supplied it.
    original_query: str = Field(description="The original user query")
    # The generated candidates.
    candidates: list[OptimizedCandidate] = Field(description="List of generated candidates")
46
+
47
+
48
class ScoringOutput(BaseModel):
    """Output from candidate selection phase."""

    # Version number of the chosen candidate.
    selected_version: int = Field(description="Version number of the best candidate")
52
+
53
+
54
+ # ===========================================================================
55
+ # Core Query Optimizer Class
56
+ # ===========================================================================
57
+
58
+
59
+ class QueryOptimizer:
60
+ """
61
+ Self-Consistency Chain-of-Thought query optimizer.
62
+
63
+ Generates multiple optimized candidates and selects the best one through
64
+ self-consistency scoring. Implements a fallback chain for reliability.
65
+ """
66
+
67
+ # Few-shot examples for search engine optimization
68
+ _SEARCH_ENGINE_EXAMPLES = """
69
+ Example 1:
70
+ Input: python fastapi performance
71
+ Candidates:
72
+ 1. ("python fastapi performance", "Direct query covers main concepts")
73
+ 2. ("fastapi performance optimization python", "Added 'optimization' for more specific results")
74
+ 3. ("site:stackoverflow.com fastapi performance python", "Targeted technical Q&A for performance issues")
75
+ 4. ("fastapi async performance benchmark", "Added 'async' and 'benchmark' for technical depth")
76
+ 5. "fastapi OR flask performance python", "Added comparison with Flask for broader context")
77
+
78
+ Example 2:
79
+ Input: climate change effects on agriculture
80
+ Candidates:
81
+ 1. ("climate change effects on agriculture", "Clear and comprehensive query")
82
+ 2. ("site:nature.com OR site:science.org climate change agriculture", "Targeted reputable scientific sources")
83
+ 3. "\"climate change\" AND agriculture filetype:pdf", "Using exact phrase match and PDF filter for research papers")
84
+ 4. ("climate change impact crop yield 2023..2024", "Added temporal filter and specific terminology")
85
+ 5. ("agricultural adaptation climate change strategies", "Rephrased to focus on solutions")
86
+
87
+ Example 3:
88
+ Input: machine learning tutorial python
89
+ Candidates:
90
+ 1. ("python machine learning tutorial", "Reordered for better SEO")
91
+ 2. ("site:youtube.com python machine learning tutorial", "Targeted video tutorials")
92
+ 3. ("python machine learning tutorial filetype:pdf", "Focus on PDF documentation")
93
+ 4. ("machine learning python sklearn tutorial", "Added popular library 'sklearn' for relevance")
94
+ 5. "\"machine learning\" AND python AND tutorial", "Using boolean operators for precision")
95
+
96
+ Example 4:
97
+ Input: react native vs flutter
98
+ Candidates:
99
+ 1. ("react native vs flutter comparison", "Added 'comparison' for explicit intent")
100
+ 2. ("site:reddit.com \"react native\" flutter", "Targeted community discussions")
101
+ 3. "\"react native\" OR flutter mobile development", "Broader search for mobile frameworks")
102
+ 4. ("react native flutter performance benchmark", "Focus on technical comparison")
103
+ 5. ("flutter vs react native 2024", "Added year for current information")
104
+
105
+ Example 5:
106
+ Input: best restaurants in tokyo
107
+ Candidates:
108
+ 1. ("best restaurants tokyo", "Simplified for broad search")
109
+ 2. ("site:michelin.com Tokyo restaurants", "Targeted Michelin guide sources")
110
+ 3. ("Tokyo restaurant guide 2024", "Added temporal context")
111
+ 4. "\"best restaurants\" AND tokyo AND review", "Boolean operators for precision")
112
+ 5. ("tokyo food guide michelin OR local", "Added 'local' for authentic recommendations")
113
+ """
114
+
115
+ # Few-shot examples for AI search optimization
116
+ _AI_SEARCH_EXAMPLES = """
117
+ Example 1:
118
+ Input: python fastapi performance
119
+ Candidates:
120
+ 1. ("What are the performance characteristics of FastAPI in Python, and how does it compare to other web frameworks?", "Added comparison context and framework focus")
121
+ 2. ("Explain the key performance optimization techniques for FastAPI applications in Python.", "Focused on actionable optimization strategies")
122
+ 3. ("How does FastAPI's async/await model impact performance compared to synchronous frameworks?", "Targeted technical architectural question")
123
+ 4. ("What are the benchmarks and real-world performance metrics for FastAPI in production environments?", "Asked for empirical data")
124
+ 5. ("How can I identify and resolve performance bottlenecks in FastAPI applications?", "Problem-solving focused")
125
+
126
+ Example 2:
127
+ Input: climate change effects on agriculture
128
+ Candidates:
129
+ 1. ("What are the primary impacts of climate change on global agricultural productivity and crop yields?", "Comprehensive question covering direct effects")
130
+ 2. ("How is climate change affecting different agricultural regions around the world?", "Geographic focus")
131
+ 3. ("What adaptation strategies are farmers using to cope with climate change impacts?", "Solution-oriented focus")
132
+ 4. ("What scientific evidence exists linking climate change to agricultural changes?", "Evidence-based inquiry")
133
+ 5. ("How will climate change affect food security and agricultural sustainability by 2050?", "Temporal and sustainability focus")
134
+
135
+ Example 3:
136
+ Input: react native vs flutter
137
+ Candidates:
138
+ 1. ("What are the key differences between React Native and Flutter in terms of performance, development experience, and ecosystem?", "Comprehensive comparison framework")
139
+ 2. ("Which cross-platform mobile framework is better suited for startup applications: React Native or Flutter?", "Use-case specific question")
140
+ 3. ("How do React Native and Flutter compare in terms of learning curve, community support, and hiring availability?", "Practical development considerations")
141
+ 4. ("What are the long-term maintenance implications of choosing React Native vs Flutter?", "Strategic business question")
142
+ 5. ("Which framework provides better native performance and access to device features: React Native or Flutter?", "Technical performance focus")
143
+
144
+ Example 4:
145
+ Input: machine learning tutorial python
146
+ Candidates:
147
+ 1. ("What is the best learning path for getting started with machine learning using Python?", "Learning path focused question")
148
+ 2. ("Can you recommend a comprehensive Python machine learning tutorial for beginners?", "Resource-seeking question")
149
+ 3. ("What are the essential Python libraries and tools for implementing machine learning algorithms?", "Tool ecosystem question")
150
+ 4. ("How can I build my first machine learning model in Python from scratch?", hands-on implementation focus")
151
+ 5. ("What are the common pitfalls and best practices for learning machine learning with Python?", "Learning guidance question")
152
+
153
+ Example 5:
154
+ Input: quantum computing explained
155
+ Candidates:
156
+ 1. ("Can you explain quantum computing in simple terms for someone without a physics background?", "Accessible explanation request")
157
+ 2. ("What are the fundamental principles of quantum computing and how do they differ from classical computing?", "Conceptual comparison question")
158
+ 3. ("What are the practical applications of quantum computing and when might they become viable?", "Real-world impact question")
159
+ 4. ("How do qubits work and why do they enable quantum computational advantages?", "Technical explanation question")
160
+ 5. ("What are the current limitations and challenges in developing practical quantum computers?", "Critical analysis question")
161
+ """
162
+
163
+ _SELECTOR_PROMPT = """
164
+ Given the original query and multiple optimized candidates, select the best one.
165
+
166
+ Criteria for selection:
167
+ - Relevance: Most accurately captures the user's intent
168
+ - Precision: Will return the most relevant results
169
+ - Completeness: Covers all important aspects of the query
170
+ - Clarity: Easy to understand and well-structured
171
+
172
+ Return only the version number of the best candidate (1-indexed).
173
+ """
174
+
175
+ def __init__(self) -> None:
176
+ """Initialize the query optimizer with API clients."""
177
+ self._mistral_api_key: str | None = os.getenv("MISTRAL_API_KEY")
178
+ self._hf_token: str | None = os.getenv("HF_TOKEN")
179
+ self._mistral_model: str = "magistral-medium-2509"
180
+ self._hf_model: str = "openai/gpt-oss-20b:cheapest"
181
+ self._hf_endpoint: str = "https://router.huggingface.co/v1"
182
+
183
+ def _mistral_generate(
184
+ self, prompt: str, response_format: dict[str, Any]
185
+ ) -> str:
186
+ """Generate structured output using Mistral API with response_format."""
187
+ if not self._mistral_api_key:
188
+ raise ValueError("MISTRAL_API_KEY not set")
189
+
190
+ import httpx
191
+
192
+ messages = [
193
+ {
194
+ "role": "user",
195
+ "content": prompt,
196
+ }
197
+ ]
198
+
199
+ payload = {
200
+ "model": self._mistral_model,
201
+ "messages": messages,
202
+ "response_format": response_format,
203
+ "max_tokens": 2000,
204
+ "temperature": 0.3,
205
+ }
206
+
207
+ headers = {
208
+ "Authorization": f"Bearer {self._mistral_api_key}",
209
+ "Content-Type": "application/json",
210
+ }
211
+
212
+ response = httpx.post(
213
+ "https://api.mistral.ai/v1/chat/completions",
214
+ json=payload,
215
+ headers=headers,
216
+ timeout=30.0,
217
+ )
218
+ response.raise_for_status()
219
+ result = response.json()
220
+
221
+ if "choices" not in result or not result["choices"]:
222
+ raise ValueError("Invalid Mistral API response: no choices")
223
+
224
+ return result["choices"][0]["message"]["content"]
225
+
226
+ def _hf_generate(self, prompt: str) -> str:
227
+ """Generate output using HuggingFace Inference API."""
228
+ if not self._hf_token:
229
+ raise ValueError("HF_TOKEN not set")
230
+
231
+ import httpx
232
+
233
+ payload = {
234
+ "model": self._hf_model,
235
+ "messages": [
236
+ {
237
+ "role": "user",
238
+ "content": prompt,
239
+ }
240
+ ],
241
+ "max_tokens": 2000,
242
+ "temperature": 0.3,
243
+ }
244
+
245
+ headers = {
246
+ "Authorization": f"Bearer {self._hf_token}",
247
+ "Content-Type": "application/json",
248
+ }
249
+
250
+ response = httpx.post(
251
+ f"{self._hf_endpoint}/chat/completions",
252
+ json=payload,
253
+ headers=headers,
254
+ timeout=30.0,
255
+ )
256
+ response.raise_for_status()
257
+ result = response.json()
258
+
259
+ if "choices" not in result or not result["choices"]:
260
+ raise ValueError("Invalid HF API response: no choices")
261
+
262
+ return result["choices"][0]["message"]["content"]
263
+
264
+ def _extract_json_from_response(self, response: str) -> str:
265
+ """Extract JSON from a response that may have markdown formatting."""
266
+ # Try to find JSON between ```json and ``` or between ``` and ```
267
+ patterns = [
268
+ r"```json\s*([\s\S]*?)\s*```",
269
+ r"```\s*([\s\S]*?)\s*```",
270
+ r"(\{[\s\S]*\})",
271
+ ]
272
+
273
+ for pattern in patterns:
274
+ match = re.search(pattern, response.strip())
275
+ if match:
276
+ return match.group(1).strip()
277
+
278
+ return response.strip()
279
+
280
def _optimize_search_engine_mistral(self, query: str) -> str:
    """Ask Mistral for five search-engine-style rewrites of ``query``.

    Builds a prompt from the optimization checklist, the class-level
    search-engine examples and the query, then requests output constrained
    by the ``GenerationOutput`` JSON schema.

    Returns:
        Whatever ``_mistral_generate`` returns for that prompt.
    """
    prompt = f"""Generate 5 optimized versions of the following search query for traditional search engines (DuckDuckGo, Google, etc.).

Optimization techniques to use:
- Add boolean operators (AND, OR, NOT)
- Use site: to target specific domains
- Use filetype: to filter by document type
- Use exact phrases with quotes
- Add relevant keywords for precision
- Include temporal filters when appropriate
- Target reputable sources (Wikipedia, StackOverflow, GitHub, etc.)

{self._SEARCH_ENGINE_EXAMPLES}

Original query: {query}

Generate candidates in the following JSON format:
{{
    "original_query": "{query}",
    "candidates": [
        {{
            "version": 1,
            "optimized_query": "...",
            "reasoning": ["...", "..."]
        }},
        ...
    ]
}}

Return ONLY valid JSON, no markdown formatting."""

    structured_output = {
        "type": "json_schema",
        "json_schema": GenerationOutput.model_json_schema(),
    }
    return self._mistral_generate(prompt, response_format=structured_output)
319
+
320
def _optimize_ai_search_mistral(self, query: str) -> str:
    """Ask Mistral for five AI-search-style rewrites of ``query``.

    Builds a prompt from the optimization checklist, the class-level
    AI-search examples and the query, then requests output constrained by
    the ``GenerationOutput`` JSON schema.

    Returns:
        Whatever ``_mistral_generate`` returns for that prompt.
    """
    prompt = f"""Generate 5 optimized versions of the following query for AI-powered search engines (Perplexity, Gemini Search, etc.).

Optimization techniques to use:
- Reframe as clear, specific questions
- Add context about what information is needed
- Include comparative or evaluative language when relevant
- Ask for explanations, examples, or step-by-step guides
- Include temporal context (current state, recent developments)
- Focus on actionable information or insights

{self._AI_SEARCH_EXAMPLES}

Original query: {query}

Generate candidates in the following JSON format:
{{
    "original_query": "{query}",
    "candidates": [
        {{
            "version": 1,
            "optimized_query": "...",
            "reasoning": ["...", "..."]
        }},
        ...
    ]
}}

Return ONLY valid JSON, no markdown formatting."""

    structured_output = {
        "type": "json_schema",
        "json_schema": GenerationOutput.model_json_schema(),
    }
    return self._mistral_generate(prompt, response_format=structured_output)
358
+
359
def _select_best_mistral(self, candidates_json: str) -> int:
    """Have Mistral pick the best candidate from a candidates JSON document.

    Args:
        candidates_json: Text describing the candidates; it is embedded
            verbatim after the selector instructions.

    Returns:
        The ``selected_version`` value from the model's JSON reply.
    """
    prompt = f"""{self._SELECTOR_PROMPT}

{candidates_json}

Return the version number (1-5) of the best candidate."""

    scoring_format = {
        "type": "json_schema",
        "json_schema": ScoringOutput.model_json_schema(),
    }
    raw_reply = self._mistral_generate(prompt, response_format=scoring_format)

    # The reply may still be fenced in markdown; unwrap it before decoding.
    decoded = json.loads(self._extract_json_from_response(raw_reply))
    return decoded["selected_version"]
379
+
380
+ def _optimize_search_engine_hf(self, query: str) -> str:
381
+ """Optimize for search engines using HF Inference (fallback)."""
382
+ prompt = f"""Generate 5 optimized search query candidates. Return as JSON with format:
383
+ {{
384
+ "original_query": "...",
385
+ "candidates": [
386
+ {{"version": 1, "optimized_query": "...", "reasoning": ["..."]}},
387
+ ...
388
+ ]
389
+ }}
390
+
391
+ Query: {query}
392
+
393
+ Optimize with boolean operators, site:, filetype:, quotes for phrases, and relevant keywords."""
394
+
395
+ response = self._hf_generate(prompt)
396
+ return self._extract_json_from_response(response)
397
+
398
+ def _optimize_ai_search_hf(self, query: str) -> str:
399
+ """Optimize for AI search using HF Inference (fallback)."""
400
+ prompt = f"""Generate 5 optimized query candidates for AI search. Return as JSON with format:
401
+ {{
402
+ "original_query": "...",
403
+ "candidates": [
404
+ {{"version": 1, "optimized_query": "...", "reasoning": ["..."]}},
405
+ ...
406
+ ]
407
+ }}
408
+
409
+ Query: {query}
410
+
411
+ Optimize as clear, specific questions with context and intent."""
412
+
413
+ response = self._hf_generate(prompt)
414
+ return self._extract_json_from_response(response)
415
+
416
+ def _select_best_hf(self, candidates_json: str) -> int:
417
+ """Select best candidate using HF Inference (fallback)."""
418
+ prompt = f"""{self._SELECTOR_PROMPT}
419
+
420
+ {candidates_json}
421
+
422
+ Return only the number (1-5)."""
423
+
424
+ response = self._hf_generate(prompt)
425
+ # Try to extract number from response
426
+ match = re.search(r"\b([1-5])\b", response)
427
+ if match:
428
+ return int(match.group(1))
429
+ return 1 # Default to first candidate
430
+
431
def _parse_candidates(self, json_str: str, original_query: str) -> GenerationOutput:
    """Parse a model reply into a ``GenerationOutput``, never failing.

    Args:
        json_str: Raw model reply, possibly wrapped in markdown fences.
        original_query: Query to fall back to when the reply is unusable.

    Returns:
        The validated output when it contains at least one candidate.
        Otherwise a single-candidate output that repeats ``original_query``,
        so callers can always index ``candidates[0]``.
    """
    try:
        json_clean = self._extract_json_from_response(json_str)
        parsed = GenerationOutput.model_validate_json(json_clean)
    except Exception:
        # Deliberate best-effort: any extraction/validation problem is
        # treated the same as "no usable candidates".
        parsed = None

    # A schema-valid reply with an empty candidate list is just as unusable
    # as an invalid one; callers index the first candidate unconditionally.
    if parsed is not None and parsed.candidates:
        return parsed

    return GenerationOutput(
        original_query=original_query,
        candidates=[
            OptimizedCandidate(
                version=1,
                optimized_query=original_query,
                reasoning=["Fallback: using original query"],
            )
        ],
    )
448
+
449
+ def _run_optimization_chain(
450
+ self,
451
+ query: str,
452
+ mode: Literal["search_engine", "ai_search"],
453
+ ) -> tuple[GenerationOutput, int, str]:
454
+ """
455
+ Run optimization with fallback chain.
456
+
457
+ Returns:
458
+ (candidates, best_version, provider_used)
459
+ """
460
+ provider = "bypass"
461
+
462
+ # Try Mistral API first
463
+ try:
464
+ if mode == "search_engine":
465
+ response = self._optimize_search_engine_mistral(query)
466
+ else:
467
+ response = self._optimize_ai_search_mistral(query)
468
+
469
+ candidates = self._parse_candidates(response, query)
470
+ best_version = self._select_best_mistral(response)
471
+ provider = "mistral"
472
+ return candidates, best_version, provider
473
+ except Exception as exc:
474
+ print(f"[QueryOptimizer] Mistral failed: {exc}", flush=True)
475
+
476
+ # Fallback to HF Inference
477
+ try:
478
+ if mode == "search_engine":
479
+ response = self._optimize_search_engine_hf(query)
480
+ else:
481
+ response = self._optimize_ai_search_hf(query)
482
+
483
+ candidates = self._parse_candidates(response, query)
484
+ best_version = self._select_best_hf(response)
485
+ provider = "hf"
486
+ return candidates, best_version, provider
487
+ except Exception as exc:
488
+ print(f"[QueryOptimizer] HF failed: {exc}", flush=True)
489
+
490
+ # Final bypass: return original query
491
+ candidates = GenerationOutput(
492
+ original_query=query,
493
+ candidates=[
494
+ OptimizedCandidate(
495
+ version=1,
496
+ optimized_query=query,
497
+ reasoning=["Bypass: using original query due to optimization failure"],
498
+ )
499
+ ],
500
+ )
501
+ return candidates, 1, provider
502
+
503
def optimize_for_search_engine(self, query: str) -> tuple[str, dict[str, Any]]:
    """
    Optimize a query for traditional search engines.

    Runs the provider chain in "search_engine" mode and returns the
    candidate whose version matches the chain's pick, or the first
    candidate when no version matches.

    Args:
        query: The original search query

    Returns:
        (optimized_query, metadata) tuple with metadata including:
        - original_query: The input query
        - all_candidates: List of all generated candidates
        - reasoning: Reasoning for selected candidate
        - provider: Which provider was used (mistral/hf/bypass)
    """
    generated, chosen_version, provider = self._run_optimization_chain(
        query, "search_engine"
    )

    pool = generated.candidates
    chosen = pool[0]  # default when no candidate carries the chosen version
    for entry in pool:
        if entry.version == chosen_version:
            chosen = entry
            break

    summary = [
        {"version": entry.version, "query": entry.optimized_query}
        for entry in pool
    ]
    metadata = {
        "original_query": generated.original_query,
        "all_candidates": summary,
        "reasoning": chosen.reasoning,
        "provider": provider,
    }
    return chosen.optimized_query, metadata
540
+
541
def optimize_for_ai_search(self, query: str) -> tuple[str, dict[str, Any]]:
    """
    Optimize a query for AI-powered search engines.

    Runs the provider chain in "ai_search" mode and returns the candidate
    whose version matches the chain's pick, or the first candidate when no
    version matches.

    Args:
        query: The original search query

    Returns:
        (optimized_query, metadata) tuple with metadata including:
        - original_query: The input query
        - all_candidates: List of all generated candidates
        - reasoning: Reasoning for selected candidate
        - provider: Which provider was used (mistral/hf/bypass)
    """
    generated, chosen_version, provider = self._run_optimization_chain(query, "ai_search")

    pool = generated.candidates
    chosen = pool[0]  # default when no candidate carries the chosen version
    for entry in pool:
        if entry.version == chosen_version:
            chosen = entry
            break

    summary = [
        {"version": entry.version, "query": entry.optimized_query}
        for entry in pool
    ]
    metadata = {
        "original_query": generated.original_query,
        "all_candidates": summary,
        "reasoning": chosen.reasoning,
        "provider": provider,
    }
    return chosen.optimized_query, metadata
576
+
577
+
578
+ # Singleton instance for module-level caching
579
+ _optimizer_instance: QueryOptimizer | None = None
580
+
581
+
582
def get_optimizer() -> QueryOptimizer:
    """Return the module-wide ``QueryOptimizer``, creating it on first use."""
    global _optimizer_instance
    if _optimizer_instance is not None:
        return _optimizer_instance
    _optimizer_instance = QueryOptimizer()
    return _optimizer_instance
588
+
589
+
590
+ # ===========================================================================
591
+ # Gradio Tool Functions
592
+ # ===========================================================================
593
+
594
+
595
@autodoc(
    summary="Optimize a search query for traditional search engines using SC-CoT with fallback chain (Mistral → HF → bypass).",
)
def Optimize_for_Search_Engine(
    query: Annotated[str, "The search query to optimize."],
) -> str:
    """
    Optimize a query for traditional search engines (DuckDuckGo, Google, etc.).

    Uses Self-Consistency Chain-of-Thought with fallback chain:
    1. Mistral API (magistral-medium-2509) - Primary
    2. HuggingFace Inference - Fallback
    3. Bypass (return raw query) - Final fallback

    Optimization techniques:
    - Boolean operators (AND, OR, NOT)
    - site: for domain targeting
    - filetype: for document type filtering
    - Exact phrases with quotes
    - Relevant keywords for precision
    - Temporal filters when appropriate
    """
    _log_call_start("Optimize_for_Search_Engine", query=query)

    # Reject empty/whitespace-only input before touching any provider.
    if not query or not query.strip():
        result = "No query provided. Please enter a search query to optimize."
        _log_call_end("Optimize_for_Search_Engine", _truncate_for_log(result))
        return result

    optimizer = get_optimizer()

    try:
        optimized, metadata = optimizer.optimize_for_search_engine(query)

        lines = [
            f"Original: {metadata['original_query']}",
            f"Optimized: {optimized}",
            f"Provider: {metadata['provider']}",
            "",
            "All candidates:",
        ]

        # The arrow marks the candidate that was actually selected. The
        # metadata carries no selected version, so it is identified by its
        # query text; only the first match is marked.
        marked = False
        for candidate in metadata["all_candidates"]:
            is_selected = not marked and candidate["query"] == optimized
            marked = marked or is_selected
            prefix = "→" if is_selected else " "
            lines.append(f"{prefix} {candidate['version']}. {candidate['query']}")

        lines.append("")
        lines.append("Reasoning:")
        lines.extend(f" • {step}" for step in metadata["reasoning"])

        result = "\n".join(lines)
        _log_call_end("Optimize_for_Search_Engine", _truncate_for_log(result))
        return result
    except Exception as exc:
        # Tool boundary: report the failure as text instead of raising.
        result = f"Optimization failed: {exc}"
        _log_call_end("Optimize_for_Search_Engine", _truncate_for_log(result))
        return result
652
+
653
+
654
@autodoc(
    summary="Optimize a search query for AI-powered search engines using SC-CoT with fallback chain (Mistral → HF → bypass).",
)
def Optimize_for_AI_Search(
    query: Annotated[str, "The search query to optimize."],
) -> str:
    """
    Optimize a query for AI-powered search engines (Perplexity, Gemini, etc.).

    Uses Self-Consistency Chain-of-Thought with fallback chain:
    1. Mistral API (magistral-medium-2509) - Primary
    2. HuggingFace Inference - Fallback
    3. Bypass (return raw query) - Final fallback

    Optimization techniques:
    - Clear, specific questions
    - Context about what information is needed
    - Comparative or evaluative language
    - Requests for explanations or examples
    - Temporal context (current state, recent developments)
    - Focus on actionable information
    """
    _log_call_start("Optimize_for_AI_Search", query=query)

    # Reject empty/whitespace-only input before touching any provider.
    if not query or not query.strip():
        result = "No query provided. Please enter a search query to optimize."
        _log_call_end("Optimize_for_AI_Search", _truncate_for_log(result))
        return result

    optimizer = get_optimizer()

    try:
        optimized, metadata = optimizer.optimize_for_ai_search(query)

        lines = [
            f"Original: {metadata['original_query']}",
            f"Optimized: {optimized}",
            f"Provider: {metadata['provider']}",
            "",
            "All candidates:",
        ]

        # The arrow marks the candidate that was actually selected. The
        # metadata carries no selected version, so it is identified by its
        # query text; only the first match is marked.
        marked = False
        for candidate in metadata["all_candidates"]:
            is_selected = not marked and candidate["query"] == optimized
            marked = marked or is_selected
            prefix = "→" if is_selected else " "
            lines.append(f"{prefix} {candidate['version']}. {candidate['query']}")

        lines.append("")
        lines.append("Reasoning:")
        lines.extend(f" • {step}" for step in metadata["reasoning"])

        result = "\n".join(lines)
        _log_call_end("Optimize_for_AI_Search", _truncate_for_log(result))
        return result
    except Exception as exc:
        # Tool boundary: report the failure as text instead of raising.
        result = f"Optimization failed: {exc}"
        _log_call_end("Optimize_for_AI_Search", _truncate_for_log(result))
        return result
711
+
712
+
713
def build_interfaces() -> list[gr.Interface]:
    """Build the two Gradio interfaces (search-engine and AI-search optimizers)."""

    def _query_tool(fn, *, info, title, description, api_name):
        # Both tools share the same layout: one single-line query box in,
        # one read-only multi-line results box out.
        return gr.Interface(
            fn=fn,
            inputs=[
                gr.Textbox(
                    label="Query",
                    placeholder="Enter your search query",
                    max_lines=1,
                    info=info,
                ),
            ],
            outputs=gr.Textbox(
                label="Optimization Results",
                interactive=False,
                lines=15,
                max_lines=20,
            ),
            title=title,
            description=description,
            api_name=api_name,
            flagging_mode="never",
            submit_btn="Optimize",
        )

    search_engine_tool = _query_tool(
        Optimize_for_Search_Engine,
        info="The search query to optimize for traditional search engines",
        title="Query Optimizer (Search Engine)",
        description=(
            "<div style='text-align:center'>"
            "Optimize queries for traditional search engines using AI. "
            "Generates multiple candidates and selects the best one. "
            "Optimizes with boolean operators, site:, filetype:, and precise keywords."
            "</div>"
        ),
        api_name="optimize_for_search_engine",
    )
    ai_search_tool = _query_tool(
        Optimize_for_AI_Search,
        info="The search query to optimize for AI-powered search engines",
        title="Query Optimizer (AI Search)",
        description=(
            "<div style='text-align:center'>"
            "Optimize queries for AI-powered search engines using AI. "
            "Generates multiple candidates and selects the best one. "
            "Optimizes with clear questions, context, and specific intent."
            "</div>"
        ),
        api_name="optimize_for_ai_search",
    )
    return [search_engine_tool, ai_search_tool]
773
+
774
+
775
+ __all__ = [
776
+ "QueryOptimizer",
777
+ "get_optimizer",
778
+ "Optimize_for_Search_Engine",
779
+ "Optimize_for_AI_Search",
780
+ "build_interfaces",
781
+ ]
Modules/_searxng_client.py ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SearXNG Client Module
3
+
4
+ HTTP client for SearXNG metasearch engine with auto-fallback to multiple instances.
5
+ Supports text, image, and news search with rate limiting and error handling.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import time
11
+ from dataclasses import dataclass
12
+ from enum import Enum
13
+ from typing import Any, Optional
14
+
15
+ import httpx
16
+
17
+
18
class TimeRange(str, Enum):
    """Look-back windows that can be sent as the ``time_range`` query parameter."""

    # Each member's value is the literal string placed in the request.
    DAY = "day"
    WEEK = "week"
    MONTH = "month"
    YEAR = "year"
24
+
25
+
26
@dataclass
class TextResult:
    """One hit from a text/web search."""

    # Required fields, filled from the result's "url", "title" and "content".
    url: str
    title: str
    content: str
    # Optional metadata; left at the defaults when not supplied.
    engine: Optional[str] = None
    category: Optional[str] = None
    score: float = 0.0
35
+
36
+
37
@dataclass
class ImageResult:
    """One hit from an image search."""

    # Required fields.
    url: str
    title: str
    img_src: str
    thumbnail_src: str
    # Optional metadata; ``None`` when not supplied.
    engine: Optional[str] = None
    source: Optional[str] = None
    resolution: Optional[str] = None
47
+
48
+
49
@dataclass
class NewsResult:
    """One hit from a news search."""

    # Required fields.
    url: str
    title: str
    content: str
    # Optional metadata; left at the defaults when not supplied.
    engine: Optional[str] = None
    published_date: Optional[str] = None
    score: float = 0.0
58
+
59
+
60
class SearXNGError(Exception):
    """Root of the SearXNG client exception hierarchy."""
63
+
64
+
65
class RateLimitError(SearXNGError):
    """Signals that a SearXNG instance rejected the request for rate limiting."""
68
+
69
+
70
class InstanceUnavailableError(SearXNGError):
    """Signals that a single SearXNG instance could not serve the request."""
73
+
74
+
75
class AllInstancesFailedError(SearXNGError):
    """Signals that every configured SearXNG instance failed."""
78
+
79
+
80
class SearXNGClient:
    """
    HTTP client for SearXNG metasearch engine.

    Features:
    - Multiple instance support with auto-fallback
    - Connection pooling for performance
    - Automatic retries with exponential backoff
    - Type-safe result parsing
    - Configurable timeouts
    """

    def __init__(
        self,
        instances: Optional[list[str]] = None,
        timeout: float = 10.0,
        max_retries: int = 3,
        retry_delay: float = 1.0,
        pool_connections: int = 10,
        pool_maxsize: int = 10,
    ) -> None:
        """
        Initialize SearXNG client.

        Args:
            instances: List of SearXNG instance URLs (primary first)
            timeout: Request timeout in seconds
            max_retries: Maximum retry attempts per instance
            retry_delay: Initial retry delay in seconds (exponential backoff)
            pool_connections: Passed to httpx as ``max_connections``
            pool_maxsize: Passed to httpx as ``max_keepalive_connections``
        """
        self.instances = instances or [
            "https://searx.be",
            "https://search.sapti.me",
            "https://searx.fmac.xyz",
        ]
        self.timeout = timeout
        self.max_retries = max_retries
        self.retry_delay = retry_delay

        # One shared httpx client so connections are pooled across requests.
        limits = httpx.Limits(
            max_connections=pool_connections,
            max_keepalive_connections=pool_maxsize,
        )
        self._client = httpx.Client(
            timeout=timeout,
            limits=limits,
            follow_redirects=True,
            verify=True,
        )

    def _build_params(
        self,
        query: str,
        categories: Optional[list[str]] = None,
        engines: Optional[list[str]] = None,
        pageno: int = 1,
        time_range: Optional[TimeRange] = None,
    ) -> dict[str, str | int]:
        """Build query parameters for SearXNG API.

        ``categories`` and ``engines`` are sent comma-joined; optional
        filters are omitted entirely when not given.
        """
        params: dict[str, str | int] = {
            "q": query,
            "format": "json",
            "pageno": pageno,
        }

        if categories:
            params["categories"] = ",".join(categories)
        if engines:
            params["engines"] = ",".join(engines)
        if time_range:
            params["time_range"] = time_range.value

        return params

    def _make_request(
        self,
        instance: str,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """
        Make HTTP request to SearXNG instance.

        Args:
            instance: SearXNG instance URL
            params: Query parameters

        Returns:
            JSON response data

        Raises:
            InstanceUnavailableError: If the request fails at the HTTP/transport
                level or the instance answers with a 5xx status
            RateLimitError: If rate limit is exceeded (HTTP 429)
            SearXNGError: For other 4xx statuses and for bodies that are not
                a JSON object
        """
        url = f"{instance.rstrip('/')}/search"

        try:
            response = self._client.get(url, params=params)
        except httpx.TimeoutException as e:
            raise InstanceUnavailableError(f"Timeout for {instance}: {e}") from e
        except httpx.ConnectError as e:
            raise InstanceUnavailableError(f"Connection failed to {instance}: {e}") from e
        except httpx.HTTPError as e:
            # Any other httpx failure (read errors, protocol errors, too many
            # redirects, ...) must also map into this module's hierarchy,
            # otherwise it escapes the instance-fallback loop.
            raise InstanceUnavailableError(f"HTTP error from {instance}: {e}") from e

        status = response.status_code

        # Handle rate limiting
        if status == 429:
            raise RateLimitError(f"Rate limit exceeded for {instance}")

        # Handle server errors
        if status >= 500:
            raise InstanceUnavailableError(
                f"Server error {status} for {instance}"
            )

        # Handle client errors
        if status >= 400:
            raise SearXNGError(
                f"Client error {status}: {response.text}"
            )

        try:
            data = response.json()
        except ValueError as e:
            # Body was not JSON. Retrying the same instance is unlikely to
            # help, so use the non-retryable error type.
            raise SearXNGError(f"Invalid JSON from {instance}: {e}") from e

        if not isinstance(data, dict):
            raise SearXNGError(f"Unexpected response shape from {instance}")
        return data

    def _search_with_retry(
        self,
        params: dict[str, Any],
    ) -> tuple[list[dict[str, Any]], str]:
        """
        Search with automatic instance fallback and retry logic.

        Instances are tried in order. ``InstanceUnavailableError`` is retried
        on the same instance with exponential backoff; rate limiting and any
        other ``SearXNGError`` move straight on to the next instance.

        Args:
            params: Query parameters

        Returns:
            Tuple of (results list, instance URL that succeeded)

        Raises:
            AllInstancesFailedError: If all instances fail
        """
        last_error: Optional[Exception] = None

        for instance in self.instances:
            for attempt in range(self.max_retries):
                try:
                    response = self._make_request(instance, params)
                    # A successful request with no results is still a success.
                    return response.get("results") or [], instance

                except RateLimitError as e:
                    last_error = e
                    # Don't retry rate limit errors, move to next instance
                    break

                except InstanceUnavailableError as e:
                    last_error = e
                    if attempt < self.max_retries - 1:
                        # Exponential backoff; no sleep after the final attempt
                        delay = self.retry_delay * (2**attempt)
                        time.sleep(delay)
                    continue

                except SearXNGError as e:
                    last_error = e
                    # Non-retryable error, move to next instance
                    break

        # All instances failed
        raise AllInstancesFailedError(
            f"All SearXNG instances failed. Last error: {last_error}"
        ) from last_error

    @staticmethod
    def _take_matching(
        raw_results: list[dict[str, Any]],
        accept: Any,
        build: Any,
        max_results: int,
    ) -> list[Any]:
        """Return up to ``max_results`` built objects from accepted items.

        Filtering happens before the cap is applied, so rejected items do not
        use up the ``max_results`` budget. A non-positive cap yields ``[]``.

        Args:
            raw_results: Raw result dictionaries
            accept: Callable ``item -> bool`` choosing which items to keep
            build: Callable ``item -> result object``
            max_results: Maximum number of objects to return
        """
        picked: list[Any] = []
        if max_results <= 0:
            return picked
        for item in raw_results:
            if not accept(item):
                continue
            picked.append(build(item))
            if len(picked) >= max_results:
                break
        return picked

    def search(
        self,
        query: str,
        categories: Optional[list[str]] = None,
        engines: Optional[list[str]] = None,
        pageno: int = 1,
        time_range: Optional[TimeRange] = None,
    ) -> list[dict[str, Any]]:
        """
        Generic search method returning raw results.

        Args:
            query: Search query string
            categories: List of result categories (e.g., ['general', 'images'])
            engines: List of search engines to use
            pageno: Page number (1-indexed)
            time_range: Time filter for results

        Returns:
            List of raw result dictionaries

        Raises:
            AllInstancesFailedError: If all instances fail
        """
        params = self._build_params(
            query=query,
            categories=categories,
            engines=engines,
            pageno=pageno,
            time_range=time_range,
        )

        results, _instance = self._search_with_retry(params)
        return results

    def text(
        self,
        query: str,
        max_results: int = 10,
        time_range: Optional[TimeRange] = None,
    ) -> list[TextResult]:
        """
        Perform text/web search.

        Args:
            query: Search query string
            max_results: Maximum number of results to return
            time_range: Time filter for results

        Returns:
            List of TextResult objects (items whose category is "general" or "")

        Raises:
            AllInstancesFailedError: If all instances fail
        """
        params = self._build_params(
            query=query,
            categories=["general"],
            pageno=1,
            time_range=time_range,
        )

        raw_results, _instance = self._search_with_retry(params)

        return self._take_matching(
            raw_results,
            lambda item: item.get("category") in ("general", ""),
            lambda item: TextResult(
                url=item.get("url", ""),
                title=item.get("title", ""),
                content=item.get("content", ""),
                engine=item.get("engine"),
                category=item.get("category"),
                # A missing or null score becomes 0.0.
                score=item.get("score") or 0.0,
            ),
            max_results,
        )

    def images(
        self,
        query: str,
        max_results: int = 10,
    ) -> list[ImageResult]:
        """
        Perform image search.

        Args:
            query: Search query string
            max_results: Maximum number of results to return

        Returns:
            List of ImageResult objects (items whose category is "images")

        Raises:
            AllInstancesFailedError: If all instances fail
        """
        params = self._build_params(
            query=query,
            categories=["images"],
            pageno=1,
        )

        raw_results, _instance = self._search_with_retry(params)

        return self._take_matching(
            raw_results,
            lambda item: item.get("category") == "images",
            lambda item: ImageResult(
                url=item.get("url", ""),
                title=item.get("title", ""),
                img_src=item.get("img_src", ""),
                thumbnail_src=item.get("thumbnail_src", ""),
                engine=item.get("engine"),
                source=item.get("source"),
                resolution=item.get("resolution"),
            ),
            max_results,
        )

    def news(
        self,
        query: str,
        max_results: int = 10,
        time_range: Optional[TimeRange] = None,
    ) -> list[NewsResult]:
        """
        Perform news search.

        Args:
            query: Search query string
            max_results: Maximum number of results to return
            time_range: Time filter for results

        Returns:
            List of NewsResult objects (items whose category is "news")

        Raises:
            AllInstancesFailedError: If all instances fail
        """
        params = self._build_params(
            query=query,
            categories=["news"],
            pageno=1,
            time_range=time_range,
        )

        raw_results, _instance = self._search_with_retry(params)

        return self._take_matching(
            raw_results,
            lambda item: item.get("category") == "news",
            lambda item: NewsResult(
                url=item.get("url", ""),
                title=item.get("title", ""),
                content=item.get("content", ""),
                engine=item.get("engine"),
                published_date=item.get("publishedDate"),
                # A missing or null score becomes 0.0.
                score=item.get("score") or 0.0,
            ),
            max_results,
        )

    def close(self) -> None:
        """Close the HTTP client and release resources."""
        self._client.close()

    def __enter__(self) -> "SearXNGClient":
        """Context manager entry."""
        return self

    def __exit__(self, _exc_type: Any, _exc_val: Any, _exc_tb: Any) -> None:
        """Context manager exit."""
        self.close()
438
+
439
+
440
+ # Convenience function for quick searches
441
def search_text(
    query: str,
    max_results: int = 10,
    time_range: Optional[TimeRange] = None,
    instances: Optional[list[str]] = None,
) -> list[TextResult]:
    """
    Run a one-off text search with a throwaway client.

    A client is created for this call only and closed before returning,
    whether or not the search succeeds.

    Args:
        query: Search query string
        max_results: Maximum number of results to return
        time_range: Time filter for results
        instances: Optional custom instance list

    Returns:
        List of TextResult objects
    """
    client = SearXNGClient(instances=instances)
    try:
        return client.text(query, max_results, time_range)
    finally:
        client.close()
README.md CHANGED
@@ -1,267 +1,267 @@
1
- ---
2
- title: Nymbo Tools MCP
3
- emoji: ⚙️
4
- colorFrom: green
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 6.2.0
8
- python_version: 3.12
9
- app_file: app.py
10
- pinned: true
11
- license: apache-2.0
12
- short_description: All-in-one hub of general purpose tools useful for any agent
13
- ---
14
-
15
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
16
-
17
- ## Nymbo-Tools MCP Server
18
-
19
- All-in-one hub of general-purpose tools useful for any agent. Run it as a Gradio web app, or connect to it remotely as a Model Context Protocol (MCP) server to call its tools programmatically.
20
-
21
- Live Space: https://huggingface.co/spaces/Nymbo/Tools
22
-
23
- ### What’s inside
24
-
25
- - Web Fetch: Turn any webpage into clean Markdown with optional link-only scraping, CSS selector stripping, length limits, and pagination via cursor offset.
26
- - Web Search: DuckDuckGo-backed search across text, news, images, videos, and books with readable, paginated output.
27
- - Code Interpreter: Execute small Python snippets and capture stdout.
28
- - Memory Manager: Lightweight JSON-based memory store with save/list/search/delete and tag filters.
29
- - Generate Speech: Kokoro‑82M TTS with 54 voices and adjustable speed (CPU or CUDA if available).
30
- - Generate Image: Text-to-image via Hugging Face serverless inference (choose model, steps, CFG, size, seed).
31
- - Generate Video: Text-to-video via Hugging Face serverless inference (model, steps, guidance, size, fps, duration, seed).
32
- - Deep Research: Multi-query web research pipeline (DDG search + parallel fetch + LLM report synthesis) with downloadable report.
33
- - Agent Skills: Discover, inspect, and access specialized agent skills and resources.
34
- - Agent Terminal: Execute Python code to chain multiple tools together (e.g., fetch -> summarize -> save) efficiently.
35
- - Obsidian Vault: Read-only access to an Obsidian vault (list, read, search notes) with safelisted paths.
36
- - File System: Safe, sandboxed filesystem operations under a tool root.
37
- - Shell Command: Run shell commands inside the same safe root as File System.
38
-
39
- ## Quick start
40
-
41
- Run the following commands in sequence to run the server locally:
42
-
43
- ```shell
44
- git clone https://huggingface.co/spaces/Nymbo/Tools
45
- cd Tools
46
- python -m venv env
47
- source env/bin/activate
48
- pip install -r requirements.txt
49
- python app.py
50
- ```
51
-
52
- Defaults:
53
- - The Gradio UI typically serves on http://127.0.0.1:7860
54
- - The MCP endpoint is available at http://127.0.0.1:7860/gradio_api/mcp/
55
-
56
- ## Using it as an MCP server
57
-
58
- Remote MCP (hosted):
59
- - Base URL: https://mcp.nymbo.net/gradio_api/mcp/
60
- - SSE endpoint (for clients that need it): https://mcp.nymbo.net/gradio_api/mcp/sse
61
-
62
- Local MCP (when you run app.py):
63
- - Base URL: http://127.0.0.1:7860/gradio_api/mcp/
64
- - SSE endpoint: http://127.0.0.1:7860/gradio_api/mcp/sse
65
-
66
- Example client config (JSON):
67
-
68
- ```json
69
- {
70
- "mcpServers": {
71
- "nymbo-tools": {
72
- "url": "https://mcp.nymbo.net/gradio_api/mcp/"
73
- }
74
- }
75
- }
76
- ```
77
-
78
- ## Environment variables (optional but recommended)
79
-
80
- - HF_READ_TOKEN: Enables Image Generation, Video Generation, and Deep Research (Hugging Face serverless inference). These tools stay visible to MCP clients but calls require a valid token to succeed.
81
- - HF_TOKEN: Alternative token fallback used by some providers (also enables Deep Research/Video).
82
- - NYMBO_TOOLS_ROOT: Overrides the File System/Shell working root. Defaults to Nymbo-Tools/Filesystem.
83
- - UNSAFE_ALLOW_ABS_PATHS=1: Allow absolute paths in File System and Shell Command (off by default for safety).
84
-
85
- Notes:
86
- - Without a HF API key, you can still use Web Fetch, Web Search, Code Interpreter, Memory Manager, File System, Shell Command, and Generate Speech.
87
- - Generate Speech requires the kokoro package and its dependencies; it works on CPU and uses CUDA if available. Doesn't require an API key because it's computed on the server itself.
88
-
89
- ## Persistence and privacy
90
-
91
- - Memory Manager stores entries in `memories.json` at the Nymbo-Tools folder root when running locally.
92
- - File System defaults to the `Filesystem/` directory under Nymbo-Tools.
93
- - In the public demo Space, storage is ephemeral and visible to anyone using the Space; avoid personal or sensitive data.
94
-
95
- ## Tool reference (signatures and behavior)
96
-
97
- Below are the MCP tool parameters summarized by inputs, outputs, and notable behaviors.
98
-
99
- ### Web_Fetch (Webpages, converted to Markdown)
100
- Inputs:
101
- - url (str): Absolute URL to fetch (must return HTML).
102
- - max_chars (int, default 3000): 0 = full page; otherwise truncates with a next_cursor notice.
103
- - strip_selectors (str): Comma-separated CSS selectors to remove (e.g., .header, .footer, nav).
104
- - mode (str): "markdown" (default), "html", or "url_scraper" (returns list of links).
105
- - offset (int): Character offset for pagination; pass the previous next_cursor to continue.
106
-
107
- Output: Markdown string, raw HTML, or link list. If truncated, includes a next_cursor to continue.
108
-
109
- ### Web_Search (DuckDuckGo backend)
110
- Inputs:
111
- - query (str): DuckDuckGo query (supports site:, quotes, OR).
112
- - max_results (int 1–20, default 5)
113
- - page (int, default 1) or offset (int) for precise continuation
114
- - search_type (str): "text" | "news" | "images" | "videos" | "books"
115
-
116
- Output: Readable text with pagination hints and next_offset.
117
-
118
- ### Code_Interpreter (Python)
119
- Inputs:
120
- - code (str): Python source; stdout is captured.
121
-
122
- Output: Captured stdout or the exception text.
123
-
124
- ### Memory_Manager (Simple JSON store)
125
- Inputs:
126
- - action: "save" | "list" | "search" | "delete"
127
- - text (save only), tags (save only)
128
- - query (search only): supports tag:name terms and AND/OR
129
- - limit (list/search): default 20
130
- - memory_id (delete): full UUID or unique prefix
131
- - include_tags (bool): include tags when listing/searching
132
-
133
- Output: Confirmation string, listing, search matches, or structured error text.
134
-
135
- ### Generate_Speech (Kokoro-82M)
136
- Inputs:
137
- - text (str)
138
- - speed (float 0.5–2.0, default 1.25)
139
- - voice (str): One of 54 voices (e.g., af_heart, am_liam, bf_alice, zf_xiaoyi…)
140
-
141
- Output: (sample_rate:int, waveform:np.ndarray) – rendered as downloadable WAV in the UI.
142
-
143
- ### Generate_Image (HF inference)
144
- Requires: HF_READ_TOKEN
145
-
146
- Inputs:
147
- - prompt (str)
148
- - model_id (str): e.g., black-forest-labs/FLUX.1-Krea-dev
149
- - negative_prompt (str)
150
- - steps (1–100), cfg_scale (1–20), sampler (UI label), seed (-1=random), width/height
151
-
152
- Output: PIL.Image. In UI, displayed and downloadable. Errors guide you to provide a token or fix model id.
153
-
154
- ### Generate_Video (HF inference)
155
- Requires: HF_READ_TOKEN or HF_TOKEN
156
-
157
- Inputs:
158
- - prompt (str)
159
- - model_id (str): default Wan-AI/Wan2.2-T2V-A14B
160
- - negative_prompt (str)
161
- - steps (1–100), cfg_scale, seed, width/height, fps, duration (s)
162
-
163
- Output: Temporary MP4 file path; UI shows a playable/downloadable video.
164
-
165
- ### Deep_Research (HF inference)
166
- Requires: HF_READ_TOKEN or HF_TOKEN
167
-
168
- Inputs:
169
- - summary (str): One or more sentences describing the research task.
170
- - query1..query5 (str) with max1..max5 (1–50). Total requested results across queries are capped at 50.
171
-
172
- Behavior:
173
- - Parallel DDG searches → fetch pages in budget → filter candidate sources with an LLM → synthesize a long, well-structured Markdown report and list of sources.
174
-
175
- Output: (report_md, fetched_links_text, report_file_path)
176
-
177
- ### File_System (safe root)
178
- Root:
179
- - Defaults to `Nymbo-Tools/Filesystem` (or NYMBO_TOOLS_ROOT). Absolute paths disabled unless UNSAFE_ALLOW_ABS_PATHS=1.
180
-
181
- Actions:
182
- - list, read, write, append, edit, mkdir, move, copy, delete, info, help
183
-
184
- Key fields:
185
- - path, content (write/append/edit), dest_path (move/copy), recursive, show_hidden, max_entries, offset, max_chars, create_dirs, overwrite
186
-
187
- Edit format (SEARCH/REPLACE blocks):
188
- ```
189
- <<<<<<< SEARCH
190
- [exact content to find]
191
- =======
192
- [new content to replace with]
193
- >>>>>>> REPLACE
194
- ```
195
- - Multiple blocks can be included; each is applied in order
196
- - Search text must match exactly (whitespace, indentation)
197
- - Only the first occurrence of each search text is replaced
198
-
199
- Output:
200
- - Human-readable listings or JSON-like error strings with code/message/hint.
201
-
202
- ### Shell_Command (same safe root)
203
- Inputs:
204
- - command (str): Single-string shell command (pipelines supported by the host shell).
205
- - workdir (str): Relative to the root.
206
- - timeout (s)
207
-
208
- Output:
209
- - Combined header + STDOUT/STDERR. Absolute paths disabled by default. Shell is detected automatically (PowerShell on Windows when available; bash/sh on POSIX).
210
-
211
- ### Agent_Skills (Skill Discovery)
212
- Inputs:
213
- - action: "discover" | "info" | "resources" | "validate" | "search" | "help"
214
- - skill_name (str): Required for info/resources/validate.
215
- - resource_path (str): Specific file to read within a skill.
216
- - query (str): Search term for "search" action.
217
- - max_chars (int), offset (int)
218
-
219
- Output:
220
- - Detailed skill metadata, SKILL.md content, resource file content, or validation reports.
221
-
222
- ### Agent_Terminal (Tool Chaining)
223
- Inputs:
224
- - input (str): Python source code to execute.
225
- - Can call any other tool (e.g., `Web_Fetch(...)`, `File_System(...)`).
226
- - Use `search_tools("query")` to find tools.
227
- - Call a tool with no args to get its usage guide.
228
-
229
- Output:
230
- - Captured STDOUT validation of the script.
231
-
232
- ### Obsidian_Vault (Read-only Note Access)
233
- Root:
234
- - Defaults to `Tools/Obsidian` (or OBSIDIAN_VAULT_ROOT).
235
-
236
- Inputs:
237
- - action: "list" | "read" | "info" | "search" | "help"
238
- - path (str): Relative to vault root (start with /).
239
- - query (str): For search action.
240
- - recursive (bool), show_hidden (bool), max_entries (int)
241
- - offset (int), max_chars (int)
242
-
243
- Output:
244
- - File listings, note content, or search results (with context).
245
-
246
- ## Running on Hugging Face Spaces
247
-
248
- 1) Duplicate the Space at https://huggingface.co/spaces/Nymbo/Tools.
249
- 2) In Space Settings → Secrets, add HF_READ_TOKEN (and/or HF_TOKEN) for model access.
250
- 3) Both the UI and MCP clients will list every tool. Image/Video/Deep Research still need a valid token when invoked.
251
-
252
- ## Troubleshooting
253
-
254
- - Image/Video/Deep Research calls fail immediately:
255
- - Provide HF_READ_TOKEN (and optionally HF_TOKEN). Restart the app/Space.
256
- - 401/403 when calling generation tools:
257
- - Token missing/insufficient permissions. Ensure your token can read the chosen model.
258
- - Kokoro not found:
259
- - Install kokoro>=0.9.4. CPU works; CUDA used if available. Torch may be skipped on Apple Silicon by design.
260
- - Windows PowerShell activation policy blocks venv activation:
261
- - Run PowerShell as Admin and set a suitable execution policy for the current user (e.g., RemoteSigned), or manually run `python app.py` after installing dependencies.
262
- - File System or Shell path errors:
263
- - Paths are relative to the tool root. Set NYMBO_TOOLS_ROOT to customize. Set UNSAFE_ALLOW_ABS_PATHS=1 only if you fully trust the environment.
264
-
265
- ## License
266
-
267
  Apache-2.0 (see Space metadata). If you duplicate the Space or use these tools, ensure your usage complies with the licenses and terms of the underlying models and providers.
 
1
+ ---
2
+ title: Nymbo Tools MCP
3
+ emoji: ⚙️
4
+ colorFrom: green
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 6.2.0
8
+ python_version: 3.12
9
+ app_file: app.py
10
+ pinned: true
11
+ license: apache-2.0
12
+ short_description: All-in-one hub of general purpose tools useful for any agent
13
+ ---
14
+
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
16
+
17
+ ## Nymbo-Tools MCP Server
18
+
19
+ All-in-one hub of general-purpose tools useful for any agent. Run it as a Gradio web app, or connect to it remotely as a Model Context Protocol (MCP) server to call its tools programmatically.
20
+
21
+ Live Space: https://huggingface.co/spaces/Nymbo/Tools
22
+
23
+ ### What’s inside
24
+
25
+ - Web Fetch: Turn any webpage into clean Markdown with optional link-only scraping, CSS selector stripping, length limits, and pagination via cursor offset.
26
+ - Web Search: DuckDuckGo-backed search across text, news, images, videos, and books with readable, paginated output.
27
+ - Code Interpreter: Execute small Python snippets and capture stdout.
28
+ - Memory Manager: Lightweight JSON-based memory store with save/list/search/delete and tag filters.
29
+ - Generate Speech: Kokoro‑82M TTS with 54 voices and adjustable speed (CPU or CUDA if available).
30
+ - Generate Image: Text-to-image via Hugging Face serverless inference (choose model, steps, CFG, size, seed).
31
+ - Generate Video: Text-to-video via Hugging Face serverless inference (model, steps, guidance, size, fps, duration, seed).
32
+ - Deep Research: Multi-query web research pipeline (DDG search + parallel fetch + LLM report synthesis) with downloadable report.
33
+ - Agent Skills: Discover, inspect, and access specialized agent skills and resources.
34
+ - Agent Terminal: Execute Python code to chain multiple tools together (e.g., fetch -> summarize -> save) efficiently.
35
+ - Obsidian Vault: Read-only access to an Obsidian vault (list, read, search notes) with safelisted paths.
36
+ - File System: Safe, sandboxed filesystem operations under a tool root.
37
+ - Shell Command: Run shell commands inside the same safe root as File System.
38
+
39
+ ## Quick start
40
+
41
+ Run the following commands in sequence to run the server locally:
42
+
43
+ ```shell
44
+ git clone https://huggingface.co/spaces/Nymbo/Tools
45
+ cd Tools
46
+ python -m venv env
47
+ source env/bin/activate
48
+ pip install -r requirements.txt
49
+ python app.py
50
+ ```
51
+
52
+ Defaults:
53
+ - The Gradio UI typically serves on http://127.0.0.1:7860
54
+ - The MCP endpoint is available at http://127.0.0.1:7860/gradio_api/mcp/
55
+
56
+ ## Using it as an MCP server
57
+
58
+ Remote MCP (hosted):
59
+ - Base URL: https://mcp.nymbo.net/gradio_api/mcp/
60
+ - SSE endpoint (for clients that need it): https://mcp.nymbo.net/gradio_api/mcp/sse
61
+
62
+ Local MCP (when you run app.py):
63
+ - Base URL: http://127.0.0.1:7860/gradio_api/mcp/
64
+ - SSE endpoint: http://127.0.0.1:7860/gradio_api/mcp/sse
65
+
66
+ Example client config (JSON):
67
+
68
+ ```json
69
+ {
70
+ "mcpServers": {
71
+ "nymbo-tools": {
72
+ "url": "https://mcp.nymbo.net/gradio_api/mcp/"
73
+ }
74
+ }
75
+ }
76
+ ```
77
+
78
+ ## Environment variables (optional but recommended)
79
+
80
+ - HF_READ_TOKEN: Enables Image Generation, Video Generation, and Deep Research (Hugging Face serverless inference). These tools stay visible to MCP clients but calls require a valid token to succeed.
81
+ - HF_TOKEN: Alternative token fallback used by some providers (also enables Deep Research/Video).
82
+ - NYMBO_TOOLS_ROOT: Overrides the File System/Shell working root. Defaults to Nymbo-Tools/Filesystem.
83
+ - UNSAFE_ALLOW_ABS_PATHS=1: Allow absolute paths in File System and Shell Command (off by default for safety).
84
+
85
+ Notes:
86
+ - Without an HF API key, you can still use Web Fetch, Web Search, Code Interpreter, Memory Manager, File System, Shell Command, and Generate Speech.
87
+ - Generate Speech requires the kokoro package and its dependencies; it works on CPU and uses CUDA if available. Doesn't require an API key because it's computed on the server itself.
88
+
89
+ ## Persistence and privacy
90
+
91
+ - Memory Manager stores entries in `memories.json` at the Nymbo-Tools folder root when running locally.
92
+ - File System defaults to the `Filesystem/` directory under Nymbo-Tools.
93
+ - In the public demo Space, storage is ephemeral and visible to anyone using the Space; avoid personal or sensitive data.
94
+
95
+ ## Tool reference (signatures and behavior)
96
+
97
+ Below are the MCP tool parameters summarized by inputs, outputs, and notable behaviors.
98
+
99
+ ### Web_Fetch (Webpages, converted to Markdown)
100
+ Inputs:
101
+ - url (str): Absolute URL to fetch (must return HTML).
102
+ - max_chars (int, default 3000): 0 = full page; otherwise truncates with a next_cursor notice.
103
+ - strip_selectors (str): Comma-separated CSS selectors to remove (e.g., .header, .footer, nav).
104
+ - mode (str): "markdown" (default), "html", or "url_scraper" (returns list of links).
105
+ - offset (int): Character offset for pagination; pass the previous next_cursor to continue.
106
+
107
+ Output: Markdown string, raw HTML, or link list. If truncated, includes a next_cursor to continue.
108
+
109
+ ### Web_Search (DuckDuckGo backend)
110
+ Inputs:
111
+ - query (str): DuckDuckGo query (supports site:, quotes, OR).
112
+ - max_results (int 1–20, default 5)
113
+ - page (int, default 1) or offset (int) for precise continuation
114
+ - search_type (str): "text" | "news" | "images" | "videos" | "books"
115
+
116
+ Output: Readable text with pagination hints and next_offset.
117
+
118
+ ### Code_Interpreter (Python)
119
+ Inputs:
120
+ - code (str): Python source; stdout is captured.
121
+
122
+ Output: Captured stdout or the exception text.
123
+
124
+ ### Memory_Manager (Simple JSON store)
125
+ Inputs:
126
+ - action: "save" | "list" | "search" | "delete"
127
+ - text (save only), tags (save only)
128
+ - query (search only): supports tag:name terms and AND/OR
129
+ - limit (list/search): default 20
130
+ - memory_id (delete): full UUID or unique prefix
131
+ - include_tags (bool): include tags when listing/searching
132
+
133
+ Output: Confirmation string, listing, search matches, or structured error text.
134
+
135
+ ### Generate_Speech (Kokoro-82M)
136
+ Inputs:
137
+ - text (str)
138
+ - speed (float 0.5–2.0, default 1.25)
139
+ - voice (str): One of 54 voices (e.g., af_heart, am_liam, bf_alice, zf_xiaoyi…)
140
+
141
+ Output: (sample_rate:int, waveform:np.ndarray) – rendered as downloadable WAV in the UI.
142
+
143
+ ### Generate_Image (HF inference)
144
+ Requires: HF_READ_TOKEN
145
+
146
+ Inputs:
147
+ - prompt (str)
148
+ - model_id (str): e.g., black-forest-labs/FLUX.1-Krea-dev
149
+ - negative_prompt (str)
150
+ - steps (1–100), cfg_scale (1–20), sampler (UI label), seed (-1=random), width/height
151
+
152
+ Output: PIL.Image. In UI, displayed and downloadable. Errors guide you to provide a token or fix model id.
153
+
154
+ ### Generate_Video (HF inference)
155
+ Requires: HF_READ_TOKEN or HF_TOKEN
156
+
157
+ Inputs:
158
+ - prompt (str)
159
+ - model_id (str): default Wan-AI/Wan2.2-T2V-A14B
160
+ - negative_prompt (str)
161
+ - steps (1–100), cfg_scale, seed, width/height, fps, duration (s)
162
+
163
+ Output: Temporary MP4 file path; UI shows a playable/downloadable video.
164
+
165
+ ### Deep_Research (HF inference)
166
+ Requires: HF_READ_TOKEN or HF_TOKEN
167
+
168
+ Inputs:
169
+ - summary (str): One or more sentences describing the research task.
170
+ - query1..query5 (str) with max1..max5 (1–50). Total requested results across queries are capped at 50.
171
+
172
+ Behavior:
173
+ - Parallel DDG searches → fetch pages in budget → filter candidate sources with an LLM → synthesize a long, well-structured Markdown report and list of sources.
174
+
175
+ Output: (report_md, fetched_links_text, report_file_path)
176
+
177
+ ### File_System (safe root)
178
+ Root:
179
+ - Defaults to `Nymbo-Tools/Filesystem` (or NYMBO_TOOLS_ROOT). Absolute paths disabled unless UNSAFE_ALLOW_ABS_PATHS=1.
180
+
181
+ Actions:
182
+ - list, read, write, append, edit, mkdir, move, copy, delete, info, help
183
+
184
+ Key fields:
185
+ - path, content (write/append/edit), dest_path (move/copy), recursive, show_hidden, max_entries, offset, max_chars, create_dirs, overwrite
186
+
187
+ Edit format (SEARCH/REPLACE blocks):
188
+ ```
189
+ <<<<<<< SEARCH
190
+ [exact content to find]
191
+ =======
192
+ [new content to replace with]
193
+ >>>>>>> REPLACE
194
+ ```
195
+ - Multiple blocks can be included; each is applied in order
196
+ - Search text must match exactly (whitespace, indentation)
197
+ - Only the first occurrence of each search text is replaced
198
+
199
+ Output:
200
+ - Human-readable listings or JSON-like error strings with code/message/hint.
201
+
202
+ ### Shell_Command (same safe root)
203
+ Inputs:
204
+ - command (str): Single-string shell command (pipelines supported by the host shell).
205
+ - workdir (str): Relative to the root.
206
+ - timeout (s)
207
+
208
+ Output:
209
+ - Combined header + STDOUT/STDERR. Absolute paths disabled by default. Shell is detected automatically (PowerShell on Windows when available; bash/sh on POSIX).
210
+
211
+ ### Agent_Skills (Skill Discovery)
212
+ Inputs:
213
+ - action: "discover" | "info" | "resources" | "validate" | "search" | "help"
214
+ - skill_name (str): Required for info/resources/validate.
215
+ - resource_path (str): Specific file to read within a skill.
216
+ - query (str): Search term for "search" action.
217
+ - max_chars (int), offset (int)
218
+
219
+ Output:
220
+ - Detailed skill metadata, SKILL.md content, resource file content, or validation reports.
221
+
222
+ ### Agent_Terminal (Tool Chaining)
223
+ Inputs:
224
+ - input (str): Python source code to execute.
225
+ - Can call any other tool (e.g., `Web_Fetch(...)`, `File_System(...)`).
226
+ - Use `search_tools("query")` to find tools.
227
+ - Call a tool with no args to get its usage guide.
228
+
229
+ Output:
230
+ - Captured STDOUT from the executed script.
231
+
232
+ ### Obsidian_Vault (Read-only Note Access)
233
+ Root:
234
+ - Defaults to `Tools/Obsidian` (or OBSIDIAN_VAULT_ROOT).
235
+
236
+ Inputs:
237
+ - action: "list" | "read" | "info" | "search" | "help"
238
+ - path (str): Relative to vault root (start with /).
239
+ - query (str): For search action.
240
+ - recursive (bool), show_hidden (bool), max_entries (int)
241
+ - offset (int), max_chars (int)
242
+
243
+ Output:
244
+ - File listings, note content, or search results (with context).
245
+
246
+ ## Running on Hugging Face Spaces
247
+
248
+ 1) Duplicate the Space at https://huggingface.co/spaces/Nymbo/Tools.
249
+ 2) In Space Settings → Secrets, add HF_READ_TOKEN (and/or HF_TOKEN) for model access.
250
+ 3) Both the UI and MCP clients will list every tool. Image/Video/Deep Research still need a valid token when invoked.
251
+
252
+ ## Troubleshooting
253
+
254
+ - Image/Video/Deep Research calls fail immediately:
255
+ - Provide HF_READ_TOKEN (and optionally HF_TOKEN). Restart the app/Space.
256
+ - 401/403 when calling generation tools:
257
+ - Token missing/insufficient permissions. Ensure your token can read the chosen model.
258
+ - Kokoro not found:
259
+ - Install kokoro>=0.9.4. CPU works; CUDA used if available. Torch may be skipped on Apple Silicon by design.
260
+ - Windows PowerShell activation policy blocks venv activation:
261
+ - Run PowerShell as Admin and set a suitable execution policy for the current user (e.g., RemoteSigned), or manually run `python app.py` after installing dependencies.
262
+ - File System or Shell path errors:
263
+ - Paths are relative to the tool root. Set NYMBO_TOOLS_ROOT to customize. Set UNSAFE_ALLOW_ABS_PATHS=1 only if you fully trust the environment.
264
+
265
+ ## License
266
+
267
  Apache-2.0 (see Space metadata). If you duplicate the Space or use these tools, ensure your usage complies with the licenses and terms of the underlying models and providers.
app.py CHANGED
@@ -1,124 +1,118 @@
1
- from __future__ import annotations
2
-
3
- # Project by Nymbo
4
-
5
  import json
6
  import os
7
  import sys
8
- import threading
9
- import time
10
- import warnings
11
- from datetime import datetime, timedelta
12
- from typing import Any
13
-
14
- # Suppress asyncio event loop cleanup errors (Python 3.10 issue on HF Spaces)
15
- # These occur when event loops are garbage collected after file descriptors close
16
def _patch_asyncio_event_loop_del():
    """Wrap ``BaseEventLoop.__del__`` so one known cleanup error is ignored.

    The wrapper calls the original finalizer and swallows a ``ValueError``
    only when its message contains "Invalid file descriptor"; any other
    ``ValueError`` is re-raised. If the class has no ``__del__`` nothing is
    changed. The patch is best-effort: a failure while installing it is
    silently ignored.
    """
    try:
        import asyncio.base_events as base_events

        loop_cls = base_events.BaseEventLoop
        original_del = getattr(loop_cls, "__del__", None)
        if original_del is not None:

            def patched_del(self):
                try:
                    original_del(self)
                except ValueError as err:
                    if "Invalid file descriptor" in str(err):
                        return
                    raise

            loop_cls.__del__ = patched_del
    except Exception:
        # Deliberately non-fatal: the patch only hides noisy shutdown errors.
        pass
32
-
33
- _patch_asyncio_event_loop_del()
34
-
35
- import gradio as gr
36
-
37
-
38
class RateLimiter:
    """Best-effort in-process rate limiter for HTTP-heavy tools.

    Remembers when calls were admitted during the last 60 seconds and makes
    ``acquire`` block once ``requests_per_minute`` of them are on record.
    The bookkeeping lives on the instance, so each limiter throttles only
    the callers that share it.
    """

    # Length of the sliding window, in seconds.
    _WINDOW_SECONDS = 60.0

    def __init__(self, requests_per_minute: int = 30) -> None:
        """Create a limiter admitting ``requests_per_minute`` calls per window."""
        self.requests_per_minute = requests_per_minute
        # time.monotonic() readings of admitted calls, oldest first.
        self._requests: list[float] = []
        self._lock = threading.Lock()

    def _drop_expired(self, now: float) -> None:
        """Forget admissions that are a full window (or more) older than ``now``."""
        cutoff = now - self._WINDOW_SECONDS
        self._requests = [stamp for stamp in self._requests if stamp > cutoff]

    def acquire(self) -> None:
        """Block until another request is allowed, then record it.

        The wait happens while the lock is held, so concurrent callers are
        admitted one at a time.
        """
        with self._lock:
            # Read the clock only after the lock is ours: time spent queueing
            # behind other callers must count towards the window.  A monotonic
            # clock keeps the arithmetic immune to wall-clock adjustments.
            now = time.monotonic()
            self._drop_expired(now)
            # The emptiness check keeps a non-positive limit from indexing an
            # empty window.
            if self._requests and len(self._requests) >= self.requests_per_minute:
                wait_time = self._WINDOW_SECONDS - (now - self._requests[0])
                if wait_time > 0:
                    time.sleep(max(1, wait_time))
                # Record the moment the call is really admitted, not the
                # moment it started waiting; otherwise the entry would expire
                # early and allow bursts above the limit.
                now = time.monotonic()
                self._drop_expired(now)
            self._requests.append(now)
55
-
56
-
57
- _search_rate_limiter = RateLimiter(requests_per_minute=20)
58
- _fetch_rate_limiter = RateLimiter(requests_per_minute=25)
59
-
60
-
61
def _truncate_for_log(value: Any, limit: int = 500) -> str:
    """Return ``value`` as text no longer than ``limit`` characters.

    Args:
        value: Object to render; anything that is not a ``str`` goes through
            ``str()`` first.
        limit: Maximum length of the returned text.

    Returns:
        The text unchanged when it already fits; otherwise its first
        ``limit - 1`` characters followed by a single "…". A ``limit`` of
        zero or less yields an empty string.
    """
    if not isinstance(value, str):
        value = str(value)
    if len(value) <= limit:
        return value
    if limit <= 0:
        # Without this guard the slice end below turns negative and would
        # return almost the entire string instead of shortening it.
        return ""
    return value[: limit - 1] + "…"
67
-
68
-
69
def _serialize_input(val: Any) -> Any:
    """Reduce an arbitrary argument to a small, log-friendly value.

    * ``None``, strings, numbers and booleans are returned unchanged.
    * Lists and tuples become a list of their first 10 items (converted
      recursively), followed by "…" when items were left out.
    * Dicts keep their first 12 entries with keys turned into strings and
      values converted recursively; an extra "…" entry marks omitted ones.
    * Everything else is replaced by the first 120 characters of its repr.

    Any exception raised while converting yields "<unserializable>".
    """
    try:
        if val is None or isinstance(val, (str, int, float, bool)):
            return val
        if isinstance(val, (list, tuple)):
            shortened = [_serialize_input(item) for item in list(val)[:10]]
            if len(val) > 10:
                shortened.append("…")
            return shortened
        if isinstance(val, dict):
            compact: dict[str, Any] = {}
            for position, (key, item) in enumerate(val.items()):
                if position < 12:
                    compact[str(key)] = _serialize_input(item)
                    continue
                # Thirteenth entry reached: flag the omission and stop.
                compact["…"] = "…"
                break
            return compact
        return repr(val)[:120]
    except Exception:
        return "<unserializable>"
86
-
87
-
88
def _log_call_start(func_name: str, **kwargs: Any) -> None:
    """Print a one-line record of a tool invocation and its arguments.

    Each keyword argument is compacted with ``_serialize_input``; the JSON
    rendering of the result is capped at 800 characters. If building or
    printing that line fails, a shorter line naming the error is printed
    instead.

    Args:
        func_name: Name of the tool being called.
        **kwargs: The arguments the tool was called with.
    """
    try:
        compact = {}
        for name, value in kwargs.items():
            compact[name] = _serialize_input(value)
        payload = json.dumps(compact, ensure_ascii=False)[:800]
        # Write to sys.__stdout__ so the log line is not captured when
        # sys.stdout has been redirected.
        print(f"[TOOL CALL] {func_name} inputs: {payload}", flush=True, file=sys.__stdout__)
    except Exception as exc:
        print(f"[TOOL CALL] {func_name} (failed to log inputs: {exc})", flush=True, file=sys.__stdout__)
95
-
96
-
97
def _log_call_end(func_name: str, output_desc: str) -> None:
    """Print a one-line record of what a tool call produced.

    Args:
        func_name: Name of the tool that finished.
        output_desc: Short description of its output.

    If printing the line fails, a fallback line naming the error is printed
    instead.
    """
    # sys.__stdout__ is used so the line is not captured when sys.stdout has
    # been redirected.
    target = sys.__stdout__
    try:
        print(f"[TOOL RESULT] {func_name} output: {output_desc}", flush=True, file=target)
    except Exception as exc:
        print(f"[TOOL RESULT] {func_name} (failed to log output: {exc})", flush=True, file=sys.__stdout__)
103
-
104
- # Ensure Tools modules can import 'app' when this file is executed as a script
105
- # (their code does `from app import ...`).
106
- sys.modules.setdefault("app", sys.modules[__name__])
107
-
108
  # Import per-tool interface builders from the Tools package
109
- from Modules.Web_Fetch import build_interface as build_fetch_interface
110
  from Modules.Web_Search import build_interface as build_search_interface
 
111
  from Modules.Agent_Terminal import build_interface as build_agent_terminal_interface
112
  from Modules.Code_Interpreter import build_interface as build_code_interface
113
  from Modules.Memory_Manager import build_interface as build_memory_interface
114
- from Modules.Generate_Speech import build_interface as build_speech_interface
115
  from Modules.Generate_Image import build_interface as build_image_interface
116
- from Modules.Generate_Video import build_interface as build_video_interface
117
- from Modules.Deep_Research import build_interface as build_research_interface
118
- from Modules.File_System import build_interface as build_fs_interface
119
- from Modules.Obsidian_Vault import build_interface as build_obsidian_interface
120
  from Modules.Shell_Command import build_interface as build_shell_interface
121
- from Modules.Agent_Skills import build_interface as build_skills_interface
122
 
123
  # Optional environment flags used to conditionally show API schemas (unchanged behavior)
124
  HF_IMAGE_TOKEN = bool(os.getenv("HF_READ_TOKEN"))
@@ -131,96 +125,78 @@ with open(_css_path, "r", encoding="utf-8") as _css_file:
131
  CSS_STYLES = _css_file.read()
132
 
133
  # Build each tab interface using modular builders
134
- fetch_interface = build_fetch_interface()
135
  web_search_interface = build_search_interface()
 
136
  agent_terminal_interface = build_agent_terminal_interface()
137
  code_interface = build_code_interface()
138
  memory_interface = build_memory_interface()
139
- kokoro_interface = build_speech_interface()
140
  image_generation_interface = build_image_interface()
141
- video_generation_interface = build_video_interface()
142
- deep_research_interface = build_research_interface()
143
- fs_interface = build_fs_interface()
144
  shell_interface = build_shell_interface()
145
- obsidian_interface = build_obsidian_interface()
146
- skills_interface = build_skills_interface()
147
 
148
  _interfaces = [
149
  agent_terminal_interface,
150
- skills_interface,
151
- fetch_interface,
152
  web_search_interface,
 
153
  code_interface,
154
  shell_interface,
155
- fs_interface,
156
- obsidian_interface,
157
  memory_interface,
158
- kokoro_interface,
159
  image_generation_interface,
160
- video_generation_interface,
161
- deep_research_interface,
162
  ]
163
  _tab_names = [
164
  "Agent Terminal",
165
- "Agent Skills",
166
- "Web Fetch",
167
  "Web Search",
 
168
  "Code Interpreter",
169
  "Shell Command",
170
- "File System",
171
- "Obsidian Vault",
172
  "Memory Manager",
173
- "Generate Speech",
174
  "Generate Image",
175
- "Generate Video",
176
- "Deep Research",
177
  ]
178
-
179
- with gr.Blocks(title="Nymbo/Tools MCP") as demo:
180
-
181
- with gr.Sidebar(width=300, elem_classes="app-sidebar"):
182
- gr.Markdown(
183
- "## Nymbo/Tools MCP\n"
184
- "<p style='font-size: 0.7rem; opacity: 0.85; margin-top: 2px; margin-bottom: 6px;'>General purpose tools useful for any agent.</p>\n"
185
- "<a href='https://www.nymbo.net/nymbot' target='_blank' style='font-size: 0.7rem; display: block;'>Test with Nymbot</a>"
186
- )
187
-
188
- with gr.Accordion("Information", open=False):
189
- gr.HTML(
190
- """
191
- <div class="info-accordion">
192
- <div class="info-grid" style="grid-template-columns: 1fr;">
193
- <section class="info-card">
194
- <div class="info-card__body">
195
- <h3>Connecting from an MCP Client</h3>
196
- <p>
197
- This Space also runs as a Model Context Protocol (MCP) server. Point your client to:
198
- <br/>
199
- <code>https://nymbo-tools.hf.space/gradio_api/mcp/</code>
200
- </p>
201
- <p>Example client configuration:</p>
202
- <pre><code class="language-json">{
203
- "mcpServers": {
204
- "nymbo-tools": {
205
- "url": "https://nymbo-tools.hf.space/gradio_api/mcp/"
206
- }
207
- }
208
- }</code></pre>
209
- <p>Run the following commands in sequence to run the server locally:</p>
210
- <pre><code>git clone https://huggingface.co/spaces/Nymbo/Tools
211
- cd Tools
212
- python -m venv env
213
- source env/bin/activate
214
- pip install -r requirements.txt
215
- python app.py</code></pre>
216
- </div>
217
- </section>
218
-
219
  <section class="info-card">
220
  <div class="info-card__body">
221
- <h3>Enable Image Gen, Video Gen, and Deep Research</h3>
222
  <p>
223
- The <code>Generate_Image</code>, <code>Generate_Video</code>, and <code>Deep_Research</code> tools require a
224
  <code>HF_READ_TOKEN</code> set as a secret or environment variable.
225
  </p>
226
  <ul class="info-list">
@@ -228,124 +204,66 @@ python app.py</code></pre>
228
  <li>Or run locally with <code>HF_READ_TOKEN</code> in your environment.</li>
229
  </ul>
230
  <div class="info-hint">
231
- MCP clients can see these tools even without tokens, but calls will fail until a valid token is provided.
232
  </div>
233
  </div>
234
  </section>
235
-
236
  <section class="info-card">
237
  <div class="info-card__body">
238
- <h3>Persistent Memories and Files</h3>
239
  <p>
240
- In this public demo, memories and files created with the <code>Memory_Manager</code> and <code>File_System</code> are stored in the Space's running container and are cleared when the Space restarts. Content is visible to everyone—avoid personal data.
241
  </p>
242
  <p>
243
- When running locally, memories are saved to <code>memories.json</code> at the repo root for privacy, and files are saved to the <code>Tools/Filesystem</code> directory on disk.
244
  </p>
245
  </div>
246
  </section>
247
-
248
  <section class="info-card">
249
  <div class="info-card__body">
250
- <h3>Tool Notes &amp; Kokoro Voice Legend</h3>
251
  <p><strong>No authentication required for:</strong></p>
252
  <ul class="info-list">
253
- <li><code>Web_Fetch</code></li>
254
  <li><code>Web_Search</code></li>
255
  <li><code>Agent_Terminal</code></li>
256
  <li><code>Code_Interpreter</code></li>
257
  <li><code>Memory_Manager</code></li>
258
- <li><code>Generate_Speech</code></li>
259
- <li><code>File_System</code></li>
260
  <li><code>Shell_Command</code></li>
261
- <li><code>Agent_Skills</code></li>
262
  </ul>
263
- <p><strong>Kokoro voice prefixes</strong></p>
264
- <table style="width:100%; border-collapse:collapse; font-size:0.9em; margin-top:8px;">
265
- <thead>
266
- <tr style="border-bottom:1px solid rgba(255,255,255,0.15);">
267
- <th style="padding:6px 8px; text-align:left;">Accent</th>
268
- <th style="padding:6px 8px; text-align:center;">Female</th>
269
- <th style="padding:6px 8px; text-align:center;">Male</th>
270
- </tr>
271
- </thead>
272
- <tbody>
273
- <tr style="border-bottom:1px solid rgba(255,255,255,0.08);">
274
- <td style="padding:6px 8px; font-weight:600;">American</td>
275
- <td style="padding:6px 8px; text-align:center;"><code>af</code></td>
276
- <td style="padding:6px 8px; text-align:center;"><code>am</code></td>
277
- </tr>
278
- <tr style="border-bottom:1px solid rgba(255,255,255,0.08);">
279
- <td style="padding:6px 8px; font-weight:600;">British</td>
280
- <td style="padding:6px 8px; text-align:center;"><code>bf</code></td>
281
- <td style="padding:6px 8px; text-align:center;"><code>bm</code></td>
282
- </tr>
283
- <tr style="border-bottom:1px solid rgba(255,255,255,0.08);">
284
- <td style="padding:6px 8px; font-weight:600;">European</td>
285
- <td style="padding:6px 8px; text-align:center;"><code>ef</code></td>
286
- <td style="padding:6px 8px; text-align:center;"><code>em</code></td>
287
- </tr>
288
- <tr style="border-bottom:1px solid rgba(255,255,255,0.08);">
289
- <td style="padding:6px 8px; font-weight:600;">French</td>
290
- <td style="padding:6px 8px; text-align:center;"><code>ff</code></td>
291
- <td style="padding:6px 8px; text-align:center;">—</td>
292
- </tr>
293
- <tr style="border-bottom:1px solid rgba(255,255,255,0.08);">
294
- <td style="padding:6px 8px; font-weight:600;">Hindi</td>
295
- <td style="padding:6px 8px; text-align:center;"><code>hf</code></td>
296
- <td style="padding:6px 8px; text-align:center;"><code>hm</code></td>
297
- </tr>
298
- <tr style="border-bottom:1px solid rgba(255,255,255,0.08);">
299
- <td style="padding:6px 8px; font-weight:600;">Italian</td>
300
- <td style="padding:6px 8px; text-align:center;"><code>if</code></td>
301
- <td style="padding:6px 8px; text-align:center;"><code>im</code></td>
302
- </tr>
303
- <tr style="border-bottom:1px solid rgba(255,255,255,0.08);">
304
- <td style="padding:6px 8px; font-weight:600;">Japanese</td>
305
- <td style="padding:6px 8px; text-align:center;"><code>jf</code></td>
306
- <td style="padding:6px 8px; text-align:center;"><code>jm</code></td>
307
- </tr>
308
- <tr style="border-bottom:1px solid rgba(255,255,255,0.08);">
309
- <td style="padding:6px 8px; font-weight:600;">Portuguese</td>
310
- <td style="padding:6px 8px; text-align:center;"><code>pf</code></td>
311
- <td style="padding:6px 8px; text-align:center;"><code>pm</code></td>
312
- </tr>
313
- <tr>
314
- <td style="padding:6px 8px; font-weight:600;">Chinese</td>
315
- <td style="padding:6px 8px; text-align:center;"><code>zf</code></td>
316
- <td style="padding:6px 8px; text-align:center;"><code>zm</code></td>
317
- </tr>
318
- </tbody>
319
- </table>
320
  </div>
321
  </section>
322
- </div>
323
- </div>
324
- """
325
- )
326
-
327
- gr.Markdown("### Tools")
328
- tool_selector = gr.Radio(
329
- choices=_tab_names,
330
- value=_tab_names[0],
331
- label="Select Tool",
332
- show_label=False,
333
- container=False,
334
- elem_classes="sidebar-nav"
335
- )
336
-
337
- with gr.Tabs(elem_classes="hidden-tabs", selected=_tab_names[0]) as tool_tabs:
338
- for name, interface in zip(_tab_names, _interfaces):
339
- with gr.TabItem(label=name, id=name, elem_id=f"tab-{name}"):
340
- interface.render()
341
-
342
- # Use JavaScript to click the hidden tab button when the radio selection changes
343
- tool_selector.change(
344
- fn=None,
345
- inputs=tool_selector,
346
- outputs=None,
347
- js="(selected_tool) => { const buttons = document.querySelectorAll('.hidden-tabs button'); buttons.forEach(btn => { if (btn.innerText.trim() === selected_tool) { btn.click(); } }); }"
348
- )
349
-
350
- if __name__ == "__main__":
351
- demo.launch(mcp_server=True, theme="Nymbo/Nymbo_Theme", css=CSS_STYLES, ssr_mode=False)
 
1
+ from __future__ import annotations
2
+
3
+ # Project by Nymbo
4
+
5
  import json
6
  import os
7
  import sys
8
+ import threading
9
+ import time
10
+ import warnings
11
+ from datetime import datetime, timedelta
12
+ from typing import Any
13
+
14
+ # Suppress asyncio event loop cleanup errors (Python 3.10 issue on HF Spaces)
15
+ # These occur when event loops are garbage collected after file descriptors close
16
def _patch_asyncio_event_loop_del():
    """Patch BaseEventLoop.__del__ to suppress 'Invalid file descriptor: -1' errors.

    The existing finalizer is wrapped so that a ``ValueError`` whose message
    contains "Invalid file descriptor" is dropped; every other exception is
    re-raised unchanged. If the class has no ``__del__``, nothing is patched.
    Any failure while installing the wrapper is ignored (best effort).
    """
    try:
        from asyncio import base_events

        loop_cls = base_events.BaseEventLoop
        finalizer = getattr(loop_cls, "__del__", None)
        if finalizer is not None:

            def patched_del(self):
                try:
                    finalizer(self)
                except ValueError as err:
                    # Only the known-noisy cleanup error is silenced.
                    if "Invalid file descriptor" in str(err):
                        return
                    raise

            loop_cls.__del__ = patched_del
    except Exception:
        pass
32
+
33
+ _patch_asyncio_event_loop_del()
34
+
35
+ import gradio as gr
36
+
37
+
38
class RateLimiter:
    """Best-effort in-process rate limiter for HTTP-heavy tools.

    Remembers the timestamps of the calls admitted during the last 60 seconds
    and makes ``acquire`` block until one more call fits under
    ``requests_per_minute``. State is kept in this object only.
    """

    # Length of the sliding window the limit applies to.
    _WINDOW = timedelta(minutes=1)

    def __init__(self, requests_per_minute: int = 30) -> None:
        """Create a limiter.

        Args:
            requests_per_minute: Maximum number of ``acquire`` calls admitted
                within any 60-second window.

        Raises:
            ValueError: If ``requests_per_minute`` is less than 1 (such a
                limiter could never admit a call).
        """
        if requests_per_minute < 1:
            raise ValueError("requests_per_minute must be at least 1")
        self.requests_per_minute = requests_per_minute
        self._requests: list[datetime] = []
        self._lock = threading.Lock()

    def acquire(self) -> None:
        """Block until a request slot is free, then record the request.

        The clock is re-read on every attempt so the stored timestamp is the
        moment the call was actually admitted, not the moment it started
        waiting. The sleep happens outside the lock so other callers are not
        stalled behind a sleeping thread.
        """
        while True:
            with self._lock:
                now = datetime.now()
                # Drop entries that have left the 60-second window.
                self._requests = [
                    stamp for stamp in self._requests if now - stamp < self._WINDOW
                ]
                if len(self._requests) < self.requests_per_minute:
                    self._requests.append(now)
                    return
                # Time until the oldest recorded request leaves the window.
                wait_time = (self._WINDOW - (now - self._requests[0])).total_seconds()
            # Small floor avoids a tight spin when the remaining wait is ~0.
            time.sleep(max(wait_time, 0.01))
55
+
56
+
57
+ _search_rate_limiter = RateLimiter(requests_per_minute=20)
58
+ _fetch_rate_limiter = RateLimiter(requests_per_minute=25)
59
+
60
+
61
def _truncate_for_log(value: Any, limit: int = 500) -> str:
    """Return ``value`` as text, shortened for log output.

    Non-string values are converted with ``str``. Text longer than ``limit``
    is cut to its first ``limit - 1`` characters followed by "…"; shorter text
    is returned unchanged.
    """
    text = value if isinstance(value, str) else str(value)
    if len(text) > limit:
        return text[: limit - 1] + "…"
    return text
67
+
68
+
69
def _serialize_input(val: Any) -> Any:
    """Reduce ``val`` to a compact, JSON-friendly form.

    * ``None``, ``str``, ``int``, ``float`` and ``bool`` pass through as-is.
    * Lists/tuples become a list of their first 10 items (serialized
      recursively), plus a trailing "…" when more items exist.
    * Dicts keep their first 12 entries with keys converted via ``str`` and
      values serialized recursively; a "…" entry marks omitted items.
    * Anything else becomes the first 120 characters of its ``repr``.

    Any exception raised along the way yields "<unserializable>".
    """
    try:
        if val is None or isinstance(val, (str, int, float, bool)):
            return val

        if isinstance(val, (list, tuple)):
            head = [_serialize_input(item) for item in list(val)[:10]]
            if len(val) > 10:
                head.append("…")
            return head

        if isinstance(val, dict):
            pairs = list(val.items())
            compact: dict[str, Any] = {
                str(key): _serialize_input(item) for key, item in pairs[:12]
            }
            if len(pairs) > 12:
                compact["…"] = "…"
            return compact

        return repr(val)[:120]
    except Exception:
        return "<unserializable>"
86
+
87
+
88
def _log_call_start(func_name: str, **kwargs: Any) -> None:
    """Print a one-line "[TOOL CALL]" record of a tool invocation's inputs.

    Each keyword argument is compacted with ``_serialize_input``, the result is
    dumped as JSON (non-ASCII kept as-is) and cut to 800 characters. If
    building or printing that line fails, a fallback line naming the error is
    printed instead.
    """
    # Use sys.__stdout__ to avoid capturing logs in redirected output
    try:
        compact = {}
        for key, value in kwargs.items():
            compact[key] = _serialize_input(value)
        payload = json.dumps(compact, ensure_ascii=False)[:800]
        print(f"[TOOL CALL] {func_name} inputs: {payload}", flush=True, file=sys.__stdout__)
    except Exception as exc:
        print(f"[TOOL CALL] {func_name} (failed to log inputs: {exc})", flush=True, file=sys.__stdout__)
95
+
96
+
97
def _log_call_end(func_name: str, output_desc: str) -> None:
    """Print a one-line "[TOOL RESULT]" record describing a tool's output.

    If formatting or printing the line fails, a fallback line naming the error
    is printed instead.
    """
    # Use sys.__stdout__ to avoid capturing logs in redirected output
    try:
        line = f"[TOOL RESULT] {func_name} output: {output_desc}"
        print(line, flush=True, file=sys.__stdout__)
    except Exception as exc:
        fallback = f"[TOOL RESULT] {func_name} (failed to log output: {exc})"
        print(fallback, flush=True, file=sys.__stdout__)
103
+
104
+ # Ensure Tools modules can import 'app' when this file is executed as a script
105
+ # (their code does `from app import ...`).
106
+ sys.modules.setdefault("app", sys.modules[__name__])
107
+
108
  # Import per-tool interface builders from the Tools package
 
109
  from Modules.Web_Search import build_interface as build_search_interface
110
+ from Modules.ScrapeGraphAI import build_interface as build_scrapegraph_interface
111
  from Modules.Agent_Terminal import build_interface as build_agent_terminal_interface
112
  from Modules.Code_Interpreter import build_interface as build_code_interface
113
  from Modules.Memory_Manager import build_interface as build_memory_interface
 
114
  from Modules.Generate_Image import build_interface as build_image_interface
 
 
 
 
115
  from Modules.Shell_Command import build_interface as build_shell_interface
 
116
 
117
  # Optional environment flags used to conditionally show API schemas (unchanged behavior)
118
  HF_IMAGE_TOKEN = bool(os.getenv("HF_READ_TOKEN"))
 
125
  CSS_STYLES = _css_file.read()
126
 
127
  # Build each tab interface using modular builders
 
128
  web_search_interface = build_search_interface()
129
+ scrapegraph_interface = build_scrapegraph_interface()
130
  agent_terminal_interface = build_agent_terminal_interface()
131
  code_interface = build_code_interface()
132
  memory_interface = build_memory_interface()
 
133
  image_generation_interface = build_image_interface()
 
 
 
134
  shell_interface = build_shell_interface()
 
 
135
 
136
  _interfaces = [
137
  agent_terminal_interface,
 
 
138
  web_search_interface,
139
+ scrapegraph_interface,
140
  code_interface,
141
  shell_interface,
 
 
142
  memory_interface,
 
143
  image_generation_interface,
 
 
144
  ]
145
  _tab_names = [
146
  "Agent Terminal",
 
 
147
  "Web Search",
148
+ "ScrapeGraphAI",
149
  "Code Interpreter",
150
  "Shell Command",
 
 
151
  "Memory Manager",
 
152
  "Generate Image",
 
 
153
  ]
154
+
155
+ with gr.Blocks(title="Nymbo/Tools MCP") as demo:
156
+
157
+ with gr.Sidebar(width=300, elem_classes="app-sidebar"):
158
+ gr.Markdown(
159
+ "## Nymbo/Tools MCP\n"
160
+ "<p style='font-size: 0.7rem; opacity: 0.85; margin-top: 2px; margin-bottom: 6px;'>General purpose tools useful for any agent.</p>\n"
161
+ "<a href='https://www.nymbo.net/nymbot' target='_blank' style='font-size: 0.7rem; display: block;'>Test with Nymbot</a>"
162
+ )
163
+
164
+ with gr.Accordion("Information", open=False):
165
+ gr.HTML(
166
+ """
167
+ <div class="info-accordion">
168
+ <div class="info-grid" style="grid-template-columns: 1fr;">
169
+ <section class="info-card">
170
+ <div class="info-card__body">
171
+ <h3>Connecting from an MCP Client</h3>
172
+ <p>
173
+ This Space also runs as a Model Context Protocol (MCP) server. Point your client to:
174
+ <br/>
175
+ <code>https://nymbo-tools.hf.space/gradio_api/mcp/</code>
176
+ </p>
177
+ <p>Example client configuration:</p>
178
+ <pre><code class="language-json">{
179
+ "mcpServers": {
180
+ "nymbo-tools": {
181
+ "url": "https://nymbo-tools.hf.space/gradio_api/mcp/"
182
+ }
183
+ }
184
+ }</code></pre>
185
+ <p>Run the following commands in sequence to run the server locally:</p>
186
+ <pre><code>git clone https://huggingface.co/spaces/Nymbo/Tools
187
+ cd Tools
188
+ python -m venv env
189
+ source env/bin/activate
190
+ pip install -r requirements.txt
191
+ python app.py</code></pre>
192
+ </div>
193
+ </section>
194
+
195
  <section class="info-card">
196
  <div class="info-card__body">
197
+ <h3>Enable Image Gen</h3>
198
  <p>
199
+ The <code>Generate_Image</code> tool requires a
200
  <code>HF_READ_TOKEN</code> set as a secret or environment variable.
201
  </p>
202
  <ul class="info-list">
 
204
  <li>Or run locally with <code>HF_READ_TOKEN</code> in your environment.</li>
205
  </ul>
206
  <div class="info-hint">
207
+ The <code>ScrapeGraphAI</code> tool also requires <code>MISTRAL_API_KEY</code> for extraction actions.
208
  </div>
209
  </div>
210
  </section>
211
+
212
  <section class="info-card">
213
  <div class="info-card__body">
214
+ <h3>Persistent Memories</h3>
215
  <p>
216
+ In this public demo, memories created with the <code>Memory_Manager</code> tool are stored in the Space's running container and are cleared when the Space restarts. Content is visible to everyone—avoid personal data.
217
  </p>
218
  <p>
219
+ When running locally, memories are saved to <code>memories.json</code> at the repo root for privacy.
220
  </p>
221
  </div>
222
  </section>
223
+
224
  <section class="info-card">
225
  <div class="info-card__body">
226
+ <h3>Tool Notes</h3>
227
  <p><strong>No authentication required for:</strong></p>
228
  <ul class="info-list">
 
229
  <li><code>Web_Search</code></li>
230
  <li><code>Agent_Terminal</code></li>
231
  <li><code>Code_Interpreter</code></li>
232
  <li><code>Memory_Manager</code></li>
 
 
233
  <li><code>Shell_Command</code></li>
 
234
  </ul>
235
+ <p>
236
+ <code>ScrapeGraphAI</code> is available in this Space, but extraction actions require <code>MISTRAL_API_KEY</code>. The <code>render_markdown</code> action does not require Mistral.
237
+ </p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  </div>
239
  </section>
240
+ </div>
241
+ </div>
242
+ """
243
+ )
244
+
245
+ gr.Markdown("### Tools")
246
+ tool_selector = gr.Radio(
247
+ choices=_tab_names,
248
+ value=_tab_names[0],
249
+ label="Select Tool",
250
+ show_label=False,
251
+ container=False,
252
+ elem_classes="sidebar-nav"
253
+ )
254
+
255
+ with gr.Tabs(elem_classes="hidden-tabs", selected=_tab_names[0]) as tool_tabs:
256
+ for name, interface in zip(_tab_names, _interfaces):
257
+ with gr.TabItem(label=name, id=name, elem_id=f"tab-{name}"):
258
+ interface.render()
259
+
260
+ # Use JavaScript to click the hidden tab button when the radio selection changes
261
+ tool_selector.change(
262
+ fn=None,
263
+ inputs=tool_selector,
264
+ outputs=None,
265
+ js="(selected_tool) => { const buttons = document.querySelectorAll('.hidden-tabs button'); buttons.forEach(btn => { if (btn.innerText.trim() === selected_tool) { btn.click(); } }); }"
266
+ )
267
+
268
+ if __name__ == "__main__":
269
+ demo.launch(mcp_server=True, theme="Nymbo/Nymbo_Theme", css=CSS_STYLES, ssr_mode=False)
memories.json CHANGED
@@ -1,20 +1,20 @@
1
- [
2
- {
3
- "id": "c8e3965d-270c-4baf-836f-33c6ed57f527",
4
- "text": "The user's personal website is driven by Markdown and Vue, hosted on Vercel.",
5
- "timestamp": "2025-09-06 02:21:17",
6
- "tags": "website,markdown,vue,vercel"
7
- },
8
- {
9
- "id": "17806073-cb86-472f-9b39-c1aaaf3ac058",
10
- "text": "The user lives in New York City.",
11
- "timestamp": "2025-09-06 17:07:27",
12
- "tags": "location,address"
13
- },
14
- {
15
- "id": "86e9f249-b43d-4aaa-bca0-b55fcb0c03be",
16
- "text": "The user has a pet Russian tortoise who is 8 years old.",
17
- "timestamp": "2025-09-06 02:20:59",
18
- "tags": "pet,tortoise,animals"
19
- }
20
  ]
 
1
+ [
2
+ {
3
+ "id": "c8e3965d-270c-4baf-836f-33c6ed57f527",
4
+ "text": "The user's personal website is driven by Markdown and Vue, hosted on Vercel.",
5
+ "timestamp": "2025-09-06 02:21:17",
6
+ "tags": "website,markdown,vue,vercel"
7
+ },
8
+ {
9
+ "id": "17806073-cb86-472f-9b39-c1aaaf3ac058",
10
+ "text": "The user lives in New York City.",
11
+ "timestamp": "2025-09-06 17:07:27",
12
+ "tags": "location,address"
13
+ },
14
+ {
15
+ "id": "86e9f249-b43d-4aaa-bca0-b55fcb0c03be",
16
+ "text": "The user has a pet Russian tortoise who is 8 years old.",
17
+ "timestamp": "2025-09-06 02:20:59",
18
+ "tags": "pet,tortoise,animals"
19
+ }
20
  ]
requirements.txt CHANGED
@@ -1,12 +1,14 @@
1
- gradio[mcp]==6.2.0
2
- requests
3
- beautifulsoup4
4
- lxml
5
- readability-lxml
6
- ddgs
7
- kokoro>=0.7.16
8
- numpy
9
- torch; platform_system != "Darwin" or platform_machine != "arm64"
10
- Pillow
11
- huggingface_hub>=0.30.0
12
- markdownify
 
 
 
1
+ gradio[mcp]==6.2.0
2
+ requests
3
+ beautifulsoup4
4
+ lxml
5
+ readability-lxml
6
+ ddgs
7
+ kokoro>=0.7.16
8
+ numpy
9
+ torch; platform_system != "Darwin" or platform_machine != "arm64"
10
+ Pillow
11
+ huggingface_hub>=0.30.0
12
+ markdownify
13
+ langchain-mistralai>=1.1.1
14
+ scrapegraphai>=1.75.1
styles.css CHANGED
@@ -1,308 +1,308 @@
1
- /* Style only the top-level app title to avoid affecting headings elsewhere */
2
- .app-title {
3
- text-align: center;
4
- /* Ensure main title appears first, then our two subtitle lines */
5
- display: grid;
6
- justify-items: center;
7
- }
8
- .app-title::after {
9
- grid-row: 2;
10
- content: "General purpose tools useful for any agent.";
11
- display: block;
12
- font-size: 1rem;
13
- font-weight: 400;
14
- opacity: 0.9;
15
- margin-top: 2px;
16
- white-space: pre-wrap;
17
- }
18
-
19
- /* Sidebar Container */
20
- .app-sidebar {
21
- background: var(--body-background-fill) !important;
22
- border-right: 1px solid rgba(255, 255, 255, 0.08) !important;
23
- }
24
- @media (prefers-color-scheme: light) {
25
- .app-sidebar {
26
- border-right: 1px solid rgba(0, 0, 0, 0.08) !important;
27
- }
28
- }
29
-
30
- /* Historical safeguard: if any h1 appears inside tabs, don't attach pseudo content */
31
- .gradio-container [role="tabpanel"] h1::before,
32
- .gradio-container [role="tabpanel"] h1::after {
33
- content: none !important;
34
- }
35
-
36
- /* Information accordion - modern info cards */
37
- .info-accordion {
38
- margin: 8px 0 2px;
39
- }
40
- .info-grid {
41
- display: grid;
42
- gap: 12px;
43
- /* Force a 2x2 layout on medium+ screens */
44
- grid-template-columns: repeat(2, minmax(0, 1fr));
45
- align-items: stretch;
46
- }
47
- /* On narrow screens, stack into a single column */
48
- @media (max-width: 800px) {
49
- .info-grid {
50
- grid-template-columns: 1fr;
51
- }
52
- }
53
- .info-card {
54
- display: flex;
55
- gap: 14px;
56
- padding: 14px 16px;
57
- border: 1px solid rgba(255, 255, 255, 0.08);
58
- background: linear-gradient(180deg, rgba(255,255,255,0.05), rgba(255,255,255,0.03));
59
- border-radius: 12px;
60
- box-shadow: 0 1px 2px rgba(0, 0, 0, 0.04);
61
- position: relative;
62
- overflow: hidden;
63
- -webkit-backdrop-filter: blur(2px);
64
- backdrop-filter: blur(2px);
65
- }
66
- .info-card::before {
67
- content: "";
68
- position: absolute;
69
- inset: 0;
70
- border-radius: 12px;
71
- pointer-events: none;
72
- background: linear-gradient(90deg, rgba(99,102,241,0.06), rgba(59,130,246,0.05));
73
- }
74
- .info-card__icon {
75
- font-size: 24px;
76
- flex: 0 0 28px;
77
- line-height: 1;
78
- filter: saturate(1.1);
79
- }
80
- .info-card__body {
81
- min-width: 0;
82
- }
83
- .info-card__body h3 {
84
- margin: 0 0 6px;
85
- font-size: 1.05rem;
86
- }
87
- .info-card__body p {
88
- margin: 6px 0;
89
- opacity: 0.95;
90
- }
91
- /* Readable code blocks inside info cards */
92
- .info-card pre {
93
- margin: 8px 0;
94
- padding: 10px 12px;
95
- background: rgba(20, 20, 30, 0.55);
96
- border: 1px solid rgba(255, 255, 255, 0.08);
97
- border-radius: 10px;
98
- overflow-x: auto;
99
- white-space: pre;
100
- }
101
- .info-card code {
102
- font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace;
103
- font-size: 0.95em;
104
- }
105
- .info-card pre code {
106
- display: block;
107
- }
108
- .info-card p {
109
- word-wrap: break-word;
110
- overflow-wrap: break-word;
111
- }
112
- .info-card p code {
113
- word-break: break-all;
114
- }
115
- .info-list {
116
- margin: 6px 0 0 18px;
117
- padding: 0;
118
- }
119
- .info-hint {
120
- margin-top: 8px;
121
- font-size: 0.9em;
122
- opacity: 0.9;
123
- }
124
-
125
- /* Light theme adjustments */
126
- @media (prefers-color-scheme: light) {
127
- .info-card {
128
- border-color: rgba(0, 0, 0, 0.08);
129
- background: linear-gradient(180deg, rgba(255,255,255,0.95), rgba(255,255,255,0.9));
130
- }
131
- .info-card::before {
132
- background: linear-gradient(90deg, rgba(99,102,241,0.08), rgba(59,130,246,0.06));
133
- }
134
- .info-card pre {
135
- background: rgba(245, 246, 250, 0.95);
136
- border-color: rgba(0, 0, 0, 0.08);
137
- }
138
- }
139
-
140
- /* Sidebar Navigation - styled like the previous tabs */
141
- .sidebar-nav {
142
- background: transparent !important;
143
- border: none !important;
144
- padding: 0 !important;
145
- }
146
- .sidebar-nav .form {
147
- gap: 8px !important;
148
- display: flex !important;
149
- flex-direction: column !important;
150
- border: none !important;
151
- background: transparent !important;
152
- }
153
- .sidebar-nav label {
154
- display: flex !important;
155
- align-items: center !important;
156
- padding: 10px 12px !important;
157
- border-radius: 10px !important;
158
- border: 1px solid rgba(255, 255, 255, 0.08) !important;
159
- background: linear-gradient(180deg, rgba(255,255,255,0.05), rgba(255,255,255,0.03)) !important;
160
- transition: background .2s ease, border-color .2s ease, box-shadow .2s ease, transform .06s ease !important;
161
- cursor: pointer !important;
162
- margin-bottom: 0 !important;
163
- width: 100% !important;
164
- justify-content: flex-start !important;
165
- text-align: left !important;
166
- }
167
- .sidebar-nav label:hover {
168
- border-color: rgba(99,102,241,0.28) !important;
169
- background: linear-gradient(180deg, rgba(99,102,241,0.10), rgba(59,130,246,0.08)) !important;
170
- }
171
- /* Selected state - Gradio adds 'selected' class to the label in some versions, or we check input:checked */
172
- .sidebar-nav label.selected {
173
- border-color: rgba(99,102,241,0.35) !important;
174
- box-shadow: inset 0 0 0 1px rgba(99,102,241,0.25), 0 1px 2px rgba(0,0,0,0.25) !important;
175
- background: linear-gradient(180deg, rgba(99,102,241,0.18), rgba(59,130,246,0.14)) !important;
176
- color: rgba(255, 255, 255, 0.95) !important;
177
- }
178
-
179
- /* Light theme adjustments for sidebar */
180
- @media (prefers-color-scheme: light) {
181
- .sidebar-nav label {
182
- border-color: rgba(0, 0, 0, 0.08) !important;
183
- background: linear-gradient(180deg, rgba(255,255,255,0.95), rgba(255,255,255,0.90)) !important;
184
- color: rgba(0, 0, 0, 0.85) !important;
185
- }
186
- .sidebar-nav label:hover {
187
- border-color: rgba(99,102,241,0.25) !important;
188
- background: linear-gradient(180deg, rgba(99,102,241,0.08), rgba(59,130,246,0.06)) !important;
189
- }
190
- .sidebar-nav label.selected {
191
- border-color: rgba(99,102,241,0.35) !important;
192
- background: linear-gradient(180deg, rgba(99,102,241,0.16), rgba(59,130,246,0.12)) !important;
193
- color: rgba(0, 0, 0, 0.85) !important;
194
- }
195
- }
196
-
197
- /* Hide scrollbars/arrows that can appear on the description block in some browsers */
198
- /* stylelint-disable compat-api/css */
199
- article.prose, .prose, .gr-prose {
200
- overflow: visible !important;
201
- max-height: none !important;
202
- -ms-overflow-style: none !important; /* IE/Edge */
203
- scrollbar-width: none !important; /* Firefox */
204
- }
205
- /* stylelint-enable compat-api/css */
206
- article.prose::-webkit-scrollbar,
207
- .prose::-webkit-scrollbar,
208
- .gr-prose::-webkit-scrollbar {
209
- display: none !important; /* Chrome/Safari */
210
- }
211
-
212
- /* Fix for white background on single-line inputs in dark mode */
213
- .gradio-container input[type="text"],
214
- .gradio-container input[type="password"],
215
- .gradio-container input[type="number"],
216
- .gradio-container input[type="email"] {
217
- background-color: var(--input-background-fill) !important;
218
- color: var(--body-text-color) !important;
219
- }
220
-
221
- /* Custom glossy purple styling for primary action buttons */
222
- .gradio-container button.primary {
223
- border: 1px solid rgba(99, 102, 241, 0.35) !important;
224
- background: linear-gradient(180deg, rgba(99, 102, 241, 0.25), rgba(59, 130, 246, 0.20)) !important;
225
- box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.12), 0 2px 4px rgba(0, 0, 0, 0.15) !important;
226
- color: rgba(255, 255, 255, 0.95) !important;
227
- transition: background .2s ease, border-color .2s ease, box-shadow .2s ease, transform .06s ease !important;
228
- }
229
- .gradio-container button.primary:hover {
230
- border-color: rgba(99, 102, 241, 0.5) !important;
231
- background: linear-gradient(180deg, rgba(99, 102, 241, 0.35), rgba(59, 130, 246, 0.28)) !important;
232
- box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 3px 6px rgba(0, 0, 0, 0.2) !important;
233
- }
234
- .gradio-container button.primary:active {
235
- transform: scale(0.98) !important;
236
- box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.2), 0 1px 2px rgba(0, 0, 0, 0.1) !important;
237
- }
238
- @media (prefers-color-scheme: light) {
239
- .gradio-container button.primary {
240
- border-color: rgba(99, 102, 241, 0.4) !important;
241
- background: linear-gradient(180deg, rgba(99, 102, 241, 0.85), rgba(79, 70, 229, 0.75)) !important;
242
- box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.25), 0 2px 4px rgba(0, 0, 0, 0.12) !important;
243
- color: rgba(255, 255, 255, 0.98) !important;
244
- }
245
- .gradio-container button.primary:hover {
246
- background: linear-gradient(180deg, rgba(99, 102, 241, 0.95), rgba(79, 70, 229, 0.85)) !important;
247
- box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.3), 0 3px 6px rgba(0, 0, 0, 0.15) !important;
248
- }
249
- }
250
-
251
- /* Hide the actual tabs since we use the sidebar to control them */
252
- .hidden-tabs .tab-nav,
253
- .hidden-tabs [role="tablist"] {
254
- display: none !important;
255
- }
256
- /* Hide the entire first row of the tabs container (contains tab buttons + overflow) */
257
- .hidden-tabs > div:first-child {
258
- display: none !important;
259
- }
260
- /* Ensure audio component buttons remain visible - they're inside tab panels, not the first row */
261
- .hidden-tabs [role="tabpanel"] button {
262
- display: inline-flex !important;
263
- }
264
-
265
- /* Custom scrollbar styling - Progressive enhancement, falls back to default scrollbars */
266
- /* stylelint-disable compat-api/css */
267
- * {
268
- scrollbar-width: thin;
269
- scrollbar-color: rgba(61, 212, 159, 0.4) rgba(255, 255, 255, 0.05);
270
- }
271
- *::-webkit-scrollbar {
272
- width: 8px;
273
- height: 8px;
274
- }
275
- *::-webkit-scrollbar-track {
276
- background: rgba(255, 255, 255, 0.05);
277
- border-radius: 4px;
278
- }
279
- *::-webkit-scrollbar-thumb {
280
- background: linear-gradient(180deg, rgba(61, 212, 159, 0.5), rgba(17, 186, 136, 0.4));
281
- border-radius: 4px;
282
- border: 1px solid rgba(119, 247, 209, 0.2);
283
- }
284
- *::-webkit-scrollbar-thumb:hover {
285
- background: linear-gradient(180deg, rgba(85, 250, 192, 0.7), rgba(65, 184, 131, 0.6));
286
- }
287
- *::-webkit-scrollbar-corner {
288
- background: rgba(255, 255, 255, 0.05);
289
- }
290
- @media (prefers-color-scheme: light) {
291
- * {
292
- scrollbar-color: rgba(61, 212, 159, 0.4) rgba(0, 0, 0, 0.05);
293
- }
294
- *::-webkit-scrollbar-track {
295
- background: rgba(0, 0, 0, 0.05);
296
- }
297
- *::-webkit-scrollbar-thumb {
298
- background: linear-gradient(180deg, rgba(61, 212, 159, 0.5), rgba(17, 186, 136, 0.4));
299
- border-color: rgba(0, 0, 0, 0.1);
300
- }
301
- *::-webkit-scrollbar-thumb:hover {
302
- background: linear-gradient(180deg, rgba(85, 250, 192, 0.7), rgba(65, 184, 131, 0.6));
303
- }
304
- *::-webkit-scrollbar-corner {
305
- background: rgba(0, 0, 0, 0.05);
306
- }
307
- }
308
  /* stylelint-enable compat-api/css */
 
1
+ /* Style only the top-level app title to avoid affecting headings elsewhere */
2
+ .app-title {
3
+ text-align: center;
4
+ /* Ensure main title appears first, then our two subtitle lines */
5
+ display: grid;
6
+ justify-items: center;
7
+ }
8
+ .app-title::after {
9
+ grid-row: 2;
10
+ content: "General purpose tools useful for any agent.";
11
+ display: block;
12
+ font-size: 1rem;
13
+ font-weight: 400;
14
+ opacity: 0.9;
15
+ margin-top: 2px;
16
+ white-space: pre-wrap;
17
+ }
18
+
19
+ /* Sidebar Container */
20
+ .app-sidebar {
21
+ background: var(--body-background-fill) !important;
22
+ border-right: 1px solid rgba(255, 255, 255, 0.08) !important;
23
+ }
24
+ @media (prefers-color-scheme: light) {
25
+ .app-sidebar {
26
+ border-right: 1px solid rgba(0, 0, 0, 0.08) !important;
27
+ }
28
+ }
29
+
30
+ /* Historical safeguard: if any h1 appears inside tabs, don't attach pseudo content */
31
+ .gradio-container [role="tabpanel"] h1::before,
32
+ .gradio-container [role="tabpanel"] h1::after {
33
+ content: none !important;
34
+ }
35
+
36
+ /* Information accordion - modern info cards */
37
+ .info-accordion {
38
+ margin: 8px 0 2px;
39
+ }
40
+ .info-grid {
41
+ display: grid;
42
+ gap: 12px;
43
+ /* Force a 2x2 layout on medium+ screens */
44
+ grid-template-columns: repeat(2, minmax(0, 1fr));
45
+ align-items: stretch;
46
+ }
47
+ /* On narrow screens, stack into a single column */
48
+ @media (max-width: 800px) {
49
+ .info-grid {
50
+ grid-template-columns: 1fr;
51
+ }
52
+ }
53
+ .info-card {
54
+ display: flex;
55
+ gap: 14px;
56
+ padding: 14px 16px;
57
+ border: 1px solid rgba(255, 255, 255, 0.08);
58
+ background: linear-gradient(180deg, rgba(255,255,255,0.05), rgba(255,255,255,0.03));
59
+ border-radius: 12px;
60
+ box-shadow: 0 1px 2px rgba(0, 0, 0, 0.04);
61
+ position: relative;
62
+ overflow: hidden;
63
+ -webkit-backdrop-filter: blur(2px);
64
+ backdrop-filter: blur(2px);
65
+ }
66
+ .info-card::before {
67
+ content: "";
68
+ position: absolute;
69
+ inset: 0;
70
+ border-radius: 12px;
71
+ pointer-events: none;
72
+ background: linear-gradient(90deg, rgba(99,102,241,0.06), rgba(59,130,246,0.05));
73
+ }
74
+ .info-card__icon {
75
+ font-size: 24px;
76
+ flex: 0 0 28px;
77
+ line-height: 1;
78
+ filter: saturate(1.1);
79
+ }
80
+ .info-card__body {
81
+ min-width: 0;
82
+ }
83
+ .info-card__body h3 {
84
+ margin: 0 0 6px;
85
+ font-size: 1.05rem;
86
+ }
87
+ .info-card__body p {
88
+ margin: 6px 0;
89
+ opacity: 0.95;
90
+ }
91
+ /* Readable code blocks inside info cards */
92
+ .info-card pre {
93
+ margin: 8px 0;
94
+ padding: 10px 12px;
95
+ background: rgba(20, 20, 30, 0.55);
96
+ border: 1px solid rgba(255, 255, 255, 0.08);
97
+ border-radius: 10px;
98
+ overflow-x: auto;
99
+ white-space: pre;
100
+ }
101
+ .info-card code {
102
+ font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace;
103
+ font-size: 0.95em;
104
+ }
105
+ .info-card pre code {
106
+ display: block;
107
+ }
108
+ .info-card p {
109
+ word-wrap: break-word;
110
+ overflow-wrap: break-word;
111
+ }
112
+ .info-card p code {
113
+ word-break: break-all;
114
+ }
115
+ .info-list {
116
+ margin: 6px 0 0 18px;
117
+ padding: 0;
118
+ }
119
+ .info-hint {
120
+ margin-top: 8px;
121
+ font-size: 0.9em;
122
+ opacity: 0.9;
123
+ }
124
+
125
+ /* Light theme adjustments */
126
+ @media (prefers-color-scheme: light) {
127
+ .info-card {
128
+ border-color: rgba(0, 0, 0, 0.08);
129
+ background: linear-gradient(180deg, rgba(255,255,255,0.95), rgba(255,255,255,0.9));
130
+ }
131
+ .info-card::before {
132
+ background: linear-gradient(90deg, rgba(99,102,241,0.08), rgba(59,130,246,0.06));
133
+ }
134
+ .info-card pre {
135
+ background: rgba(245, 246, 250, 0.95);
136
+ border-color: rgba(0, 0, 0, 0.08);
137
+ }
138
+ }
139
+
140
+ /* Sidebar Navigation - styled like the previous tabs */
141
+ .sidebar-nav {
142
+ background: transparent !important;
143
+ border: none !important;
144
+ padding: 0 !important;
145
+ }
146
+ .sidebar-nav .form {
147
+ gap: 8px !important;
148
+ display: flex !important;
149
+ flex-direction: column !important;
150
+ border: none !important;
151
+ background: transparent !important;
152
+ }
153
+ .sidebar-nav label {
154
+ display: flex !important;
155
+ align-items: center !important;
156
+ padding: 10px 12px !important;
157
+ border-radius: 10px !important;
158
+ border: 1px solid rgba(255, 255, 255, 0.08) !important;
159
+ background: linear-gradient(180deg, rgba(255,255,255,0.05), rgba(255,255,255,0.03)) !important;
160
+ transition: background .2s ease, border-color .2s ease, box-shadow .2s ease, transform .06s ease !important;
161
+ cursor: pointer !important;
162
+ margin-bottom: 0 !important;
163
+ width: 100% !important;
164
+ justify-content: flex-start !important;
165
+ text-align: left !important;
166
+ }
167
+ .sidebar-nav label:hover {
168
+ border-color: rgba(99,102,241,0.28) !important;
169
+ background: linear-gradient(180deg, rgba(99,102,241,0.10), rgba(59,130,246,0.08)) !important;
170
+ }
171
+ /* Selected state - some Gradio versions add a 'selected' class to the label; this rule matches only that class (there is no input:checked fallback here) */
172
+ .sidebar-nav label.selected {
173
+ border-color: rgba(99,102,241,0.35) !important;
174
+ box-shadow: inset 0 0 0 1px rgba(99,102,241,0.25), 0 1px 2px rgba(0,0,0,0.25) !important;
175
+ background: linear-gradient(180deg, rgba(99,102,241,0.18), rgba(59,130,246,0.14)) !important;
176
+ color: rgba(255, 255, 255, 0.95) !important;
177
+ }
178
+
179
+ /* Light theme adjustments for sidebar */
180
+ @media (prefers-color-scheme: light) {
181
+ .sidebar-nav label {
182
+ border-color: rgba(0, 0, 0, 0.08) !important;
183
+ background: linear-gradient(180deg, rgba(255,255,255,0.95), rgba(255,255,255,0.90)) !important;
184
+ color: rgba(0, 0, 0, 0.85) !important;
185
+ }
186
+ .sidebar-nav label:hover {
187
+ border-color: rgba(99,102,241,0.25) !important;
188
+ background: linear-gradient(180deg, rgba(99,102,241,0.08), rgba(59,130,246,0.06)) !important;
189
+ }
190
+ .sidebar-nav label.selected {
191
+ border-color: rgba(99,102,241,0.35) !important;
192
+ background: linear-gradient(180deg, rgba(99,102,241,0.16), rgba(59,130,246,0.12)) !important;
193
+ color: rgba(0, 0, 0, 0.85) !important;
194
+ }
195
+ }
196
+
197
+ /* Hide scrollbars/arrows that can appear on the description block in some browsers */
198
+ /* stylelint-disable compat-api/css */
199
+ article.prose, .prose, .gr-prose {
200
+ overflow: visible !important;
201
+ max-height: none !important;
202
+ -ms-overflow-style: none !important; /* IE/Edge */
203
+ scrollbar-width: none !important; /* Firefox */
204
+ }
205
+ /* stylelint-enable compat-api/css */
206
+ article.prose::-webkit-scrollbar,
207
+ .prose::-webkit-scrollbar,
208
+ .gr-prose::-webkit-scrollbar {
209
+ display: none !important; /* Chrome/Safari */
210
+ }
211
+
212
+ /* Fix for white background on single-line inputs in dark mode */
213
+ .gradio-container input[type="text"],
214
+ .gradio-container input[type="password"],
215
+ .gradio-container input[type="number"],
216
+ .gradio-container input[type="email"] {
217
+ background-color: var(--input-background-fill) !important;
218
+ color: var(--body-text-color) !important;
219
+ }
220
+
221
+ /* Custom glossy purple styling for primary action buttons */
222
+ .gradio-container button.primary {
223
+ border: 1px solid rgba(99, 102, 241, 0.35) !important;
224
+ background: linear-gradient(180deg, rgba(99, 102, 241, 0.25), rgba(59, 130, 246, 0.20)) !important;
225
+ box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.12), 0 2px 4px rgba(0, 0, 0, 0.15) !important;
226
+ color: rgba(255, 255, 255, 0.95) !important;
227
+ transition: background .2s ease, border-color .2s ease, box-shadow .2s ease, transform .06s ease !important;
228
+ }
229
+ .gradio-container button.primary:hover {
230
+ border-color: rgba(99, 102, 241, 0.5) !important;
231
+ background: linear-gradient(180deg, rgba(99, 102, 241, 0.35), rgba(59, 130, 246, 0.28)) !important;
232
+ box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 3px 6px rgba(0, 0, 0, 0.2) !important;
233
+ }
234
+ .gradio-container button.primary:active {
235
+ transform: scale(0.98) !important;
236
+ box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.2), 0 1px 2px rgba(0, 0, 0, 0.1) !important;
237
+ }
238
+ @media (prefers-color-scheme: light) {
239
+ .gradio-container button.primary {
240
+ border-color: rgba(99, 102, 241, 0.4) !important;
241
+ background: linear-gradient(180deg, rgba(99, 102, 241, 0.85), rgba(79, 70, 229, 0.75)) !important;
242
+ box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.25), 0 2px 4px rgba(0, 0, 0, 0.12) !important;
243
+ color: rgba(255, 255, 255, 0.98) !important;
244
+ }
245
+ .gradio-container button.primary:hover {
246
+ background: linear-gradient(180deg, rgba(99, 102, 241, 0.95), rgba(79, 70, 229, 0.85)) !important;
247
+ box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.3), 0 3px 6px rgba(0, 0, 0, 0.15) !important;
248
+ }
249
+ }
250
+
251
+ /* Hide the built-in tab bar (the tab buttons, not the tab panels) since we use the sidebar to control the tabs */
252
+ .hidden-tabs .tab-nav,
253
+ .hidden-tabs [role="tablist"] {
254
+ display: none !important;
255
+ }
256
+ /* Hide the entire first row of the tabs container (contains tab buttons + overflow) */
257
+ .hidden-tabs > div:first-child {
258
+ display: none !important;
259
+ }
260
+ /* Ensure audio component buttons remain visible - they're inside tab panels, not the first row. Note: this selector matches every button inside a tab panel, not only audio controls */
261
+ .hidden-tabs [role="tabpanel"] button {
262
+ display: inline-flex !important;
263
+ }
264
+
265
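+ /* Custom scrollbar styling - progressive enhancement; browsers supporting neither mechanism fall back to default scrollbars. Note: where both are supported (Chromium 121+), the standard scrollbar-width/scrollbar-color properties take precedence over the ::-webkit-scrollbar rules */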
266
+ /* stylelint-disable compat-api/css */
267
+ * {
268
+ scrollbar-width: thin;
269
+ scrollbar-color: rgba(61, 212, 159, 0.4) rgba(255, 255, 255, 0.05);
270
+ }
271
+ *::-webkit-scrollbar {
272
+ width: 8px;
273
+ height: 8px;
274
+ }
275
+ *::-webkit-scrollbar-track {
276
+ background: rgba(255, 255, 255, 0.05);
277
+ border-radius: 4px;
278
+ }
279
+ *::-webkit-scrollbar-thumb {
280
+ background: linear-gradient(180deg, rgba(61, 212, 159, 0.5), rgba(17, 186, 136, 0.4));
281
+ border-radius: 4px;
282
+ border: 1px solid rgba(119, 247, 209, 0.2);
283
+ }
284
+ *::-webkit-scrollbar-thumb:hover {
285
+ background: linear-gradient(180deg, rgba(85, 250, 192, 0.7), rgba(65, 184, 131, 0.6));
286
+ }
287
+ *::-webkit-scrollbar-corner {
288
+ background: rgba(255, 255, 255, 0.05);
289
+ }
290
+ @media (prefers-color-scheme: light) {
291
+ * {
292
+ scrollbar-color: rgba(61, 212, 159, 0.4) rgba(0, 0, 0, 0.05);
293
+ }
294
+ *::-webkit-scrollbar-track {
295
+ background: rgba(0, 0, 0, 0.05);
296
+ }
297
+ *::-webkit-scrollbar-thumb {
298
+ background: linear-gradient(180deg, rgba(61, 212, 159, 0.5), rgba(17, 186, 136, 0.4));
299
+ border-color: rgba(0, 0, 0, 0.1);
300
+ }
301
+ *::-webkit-scrollbar-thumb:hover {
302
+ background: linear-gradient(180deg, rgba(85, 250, 192, 0.7), rgba(65, 184, 131, 0.6));
303
+ }
304
+ *::-webkit-scrollbar-corner {
305
+ background: rgba(0, 0, 0, 0.05);
306
+ }
307
+ }
308
  /* stylelint-enable compat-api/css */