"""Plugin registry for scrapeRL - manages all available plugins and tools.""" from __future__ import annotations from dataclasses import dataclass, field from typing import Any, Callable, Optional from enum import Enum class PluginCategory(str, Enum): """Categories of plugins.""" BROWSER = "browser" PARSER = "parser" DATA = "data" NETWORK = "network" MEDIA = "media" ANALYSIS = "analysis" EXTRACTION = "extraction" VALIDATION = "validation" STORAGE = "storage" AI = "ai" @dataclass class ToolDefinition: """Definition of a tool that can be called by agents.""" name: str description: str category: PluginCategory parameters: dict[str, Any] = field(default_factory=dict) returns: dict[str, Any] = field(default_factory=dict) examples: list[str] = field(default_factory=list) @dataclass class PluginDefinition: """Definition of a plugin with its tools.""" id: str name: str description: str category: PluginCategory tools: list[ToolDefinition] = field(default_factory=list) enabled: bool = True version: str = "1.0.0" # ============================================================================== # BROWSER TOOLS # ============================================================================== BROWSER_TOOLS = [ ToolDefinition( name="browser.navigate", description="Navigate browser to a URL and wait for page load", category=PluginCategory.BROWSER, parameters={"url": "string", "wait_for": "string (page_load|network_idle)"}, returns={"success": "bool", "html_length": "int", "status_code": "int"}, ), ToolDefinition( name="browser.click", description="Click on an element matching the selector", category=PluginCategory.BROWSER, parameters={"selector": "string", "wait_after": "int (ms)"}, returns={"clicked": "bool", "element_found": "bool"}, ), ToolDefinition( name="browser.type", description="Type text into an input field", category=PluginCategory.BROWSER, parameters={"selector": "string", "text": "string", "clear_first": "bool"}, returns={"typed": "bool", "element_found": "bool"}, ), ToolDefinition( name="browser.scroll", description="Scroll the page or element", category=PluginCategory.BROWSER, parameters={"direction": "string (up|down|top|bottom)", "amount": "int (px)"}, returns={"scrolled": "bool", "new_position": "int"}, ), ToolDefinition( name="browser.screenshot", description="Capture a screenshot of the page or element", category=PluginCategory.BROWSER, parameters={"selector": "string (optional)", "full_page": "bool"}, returns={"captured": "bool", "size_bytes": "int", "dimensions": "dict"}, ), ToolDefinition( name="browser.wait", description="Wait for an element or condition", category=PluginCategory.BROWSER, parameters={"selector": "string", "timeout_ms": "int", "state": "string"}, returns={"found": "bool", "waited_ms": "int"}, ), ToolDefinition( name="browser.execute_js", description="Execute JavaScript in browser context", category=PluginCategory.BROWSER, parameters={"script": "string", "args": "list"}, returns={"result": "any", "error": "string|null"}, ), ToolDefinition( name="browser.get_cookies", description="Get cookies for current domain", category=PluginCategory.BROWSER, parameters={"domain": "string (optional)"}, returns={"cookies": "list[dict]", "count": "int"}, ), ] # ============================================================================== # HTML/DOM PARSING TOOLS # ============================================================================== HTML_TOOLS = [ ToolDefinition( name="html.parse", description="Parse HTML document into structured DOM", category=PluginCategory.PARSER, parameters={"parser": "string (html.parser|lxml)", "content_length": "int"}, returns={"parsed": "bool", "soup_type": "string"}, ), ToolDefinition( name="html.select", description="Select elements using CSS selector", category=PluginCategory.PARSER, parameters={"selector": "string", "limit": "int (optional)"}, returns={"elements_found": "int", "selector_used": "string"}, ), ToolDefinition( name="html.select_one", description="Select first element matching CSS selector", category=PluginCategory.PARSER, parameters={"selector": "string"}, returns={"found": "bool", "tag": "string", "text": "string"}, ), ToolDefinition( name="html.find_all", description="Find all elements by tag and attributes (bs4)", category=PluginCategory.PARSER, parameters={"tag": "string", "attrs": "dict", "recursive": "bool"}, returns={"elements_found": "int", "tags": "list[string]"}, ), ToolDefinition( name="html.get_text", description="Extract text content from element or page", category=PluginCategory.PARSER, parameters={"selector": "string (optional)", "separator": "string"}, returns={"text": "string", "length": "int"}, ), ToolDefinition( name="html.get_attribute", description="Get attribute value from element", category=PluginCategory.PARSER, parameters={"selector": "string", "attribute": "string"}, returns={"value": "string|null", "found": "bool"}, ), ToolDefinition( name="html.extract_links", description="Extract all links from page", category=PluginCategory.PARSER, parameters={"base_url": "string", "filter_pattern": "string (optional)"}, returns={"links": "list[dict]", "count": "int"}, ), ToolDefinition( name="html.extract_images", description="Extract all images with src and alt", category=PluginCategory.PARSER, parameters={"include_lazy": "bool"}, returns={"images": "list[dict]", "count": "int"}, ), ToolDefinition( name="html.extract_tables", description="Extract HTML tables as structured data", category=PluginCategory.PARSER, parameters={"selector": "string (optional)"}, returns={"tables": "list[list[list]]", "count": "int"}, ), ToolDefinition( name="html.extract_forms", description="Extract form structure and fields", category=PluginCategory.PARSER, parameters={"selector": "string (optional)"}, returns={"forms": "list[dict]", "count": "int"}, ), ToolDefinition( name="html.extract_meta", description="Extract page title and meta tags", category=PluginCategory.PARSER, parameters={"include_og": "bool"}, returns={"title": "string", "meta": "dict[string, string]", "count": "int"}, ), ToolDefinition( name="html.extract_jsonld", description="Extract JSON-LD structured data blocks", category=PluginCategory.PARSER, parameters={"include_arrays": "bool"}, returns={"items": "list[dict]", "count": "int"}, ), ToolDefinition( name="html.detect_repeating_blocks", description="Find repeated DOM block signatures for list extraction", category=PluginCategory.PARSER, parameters={"min_repetitions": "int"}, returns={"candidates": "list[dict]", "count": "int"}, ), ] # ============================================================================== # DATA PROCESSING TOOLS # ============================================================================== DATA_TOOLS = [ ToolDefinition( name="json.parse", description="Parse JSON string into object", category=PluginCategory.DATA, parameters={"text": "string"}, returns={"data": "any", "valid": "bool"}, ), ToolDefinition( name="json.dumps", description="Convert object to JSON string", category=PluginCategory.DATA, parameters={"data": "any", "indent": "int", "sort_keys": "bool"}, returns={"output": "string", "length": "int"}, ), ToolDefinition( name="csv.generate", description="Generate CSV from data", category=PluginCategory.DATA, parameters={"data": "list[dict]", "fields": "list[string]"}, returns={"csv": "string", "rows": "int", "columns": "int"}, ), ToolDefinition( name="csv.parse", description="Parse CSV string into records", category=PluginCategory.DATA, parameters={"text": "string", "delimiter": "string", "has_header": "bool"}, returns={"records": "list[dict]", "rows": "int", "columns": "int"}, ), ToolDefinition( name="pandas.DataFrame", description="Create pandas DataFrame from data", category=PluginCategory.DATA, parameters={"data": "list[dict]|dict"}, returns={"shape": "tuple", "columns": "list[string]", "dtypes": "dict"}, ), ToolDefinition( name="pandas.read_csv", description="Read CSV data into DataFrame", category=PluginCategory.DATA, parameters={"content": "string", "sep": "string"}, returns={"shape": "tuple", "columns": "list[string]"}, ), ToolDefinition( name="pandas.to_csv", description="Convert DataFrame to CSV", category=PluginCategory.DATA, parameters={"index": "bool"}, returns={"csv": "string", "rows": "int"}, ), ToolDefinition( name="pandas.describe", description="Generate descriptive statistics", category=PluginCategory.DATA, parameters={"include": "string (all|numeric)"}, returns={"statistics": "dict", "columns": "list[string]"}, ), ToolDefinition( name="pandas.groupby", description="Group data by columns and aggregate", category=PluginCategory.DATA, parameters={"by": "list[string]", "agg": "dict"}, returns={"groups": "int", "result": "dict"}, ), ToolDefinition( name="pandas.filter", description="Filter DataFrame rows by condition", category=PluginCategory.DATA, parameters={"condition": "string"}, returns={"filtered_rows": "int", "original_rows": "int"}, ), ToolDefinition( name="data.dedupe_rows", description="Remove duplicate rows from list-of-dicts data", category=PluginCategory.DATA, parameters={"rows": "list[dict]", "key_fields": "list[string]"}, returns={"rows": "list[dict]", "removed": "int", "count": "int"}, ), ToolDefinition( name="data.rank_rows", description="Rank rows by score/value field", category=PluginCategory.DATA, parameters={"rows": "list[dict]", "sort_field": "string", "descending": "bool", "limit": "int"}, returns={"rows": "list[dict]", "sort_field": "string", "count": "int"}, ), ToolDefinition( name="data.select_columns", description="Project rows to requested output columns", category=PluginCategory.DATA, parameters={"rows": "list[dict]", "columns": "list[string]"}, returns={"rows": "list[dict]", "columns": "list[string]", "count": "int"}, ), ] # ============================================================================== # REGEX/TEXT TOOLS # ============================================================================== REGEX_TOOLS = [ ToolDefinition( name="regex.match", description="Match pattern at start of string", category=PluginCategory.EXTRACTION, parameters={"pattern": "string", "text": "string", "flags": "string"}, returns={"matched": "bool", "groups": "list[string]"}, ), ToolDefinition( name="regex.search", description="Search for pattern anywhere in string", category=PluginCategory.EXTRACTION, parameters={"pattern": "string", "text": "string"}, returns={"found": "bool", "position": "int", "match": "string"}, ), ToolDefinition( name="regex.findall", description="Find all matches of pattern", category=PluginCategory.EXTRACTION, parameters={"pattern": "string", "text": "string"}, returns={"matches": "list[string]", "count": "int"}, ), ToolDefinition( name="regex.sub", description="Replace pattern matches in string", category=PluginCategory.EXTRACTION, parameters={"pattern": "string", "replacement": "string", "text": "string"}, returns={"result": "string", "replacements": "int"}, ), ToolDefinition( name="regex.split", description="Split string by pattern", category=PluginCategory.EXTRACTION, parameters={"pattern": "string", "text": "string", "maxsplit": "int"}, returns={"parts": "list[string]", "count": "int"}, ), ] # ============================================================================== # NETWORK/API TOOLS # ============================================================================== NETWORK_TOOLS = [ ToolDefinition( name="http.get", description="Make HTTP GET request", category=PluginCategory.NETWORK, parameters={"url": "string", "headers": "dict", "timeout": "int"}, returns={"status_code": "int", "content_length": "int", "headers": "dict"}, ), ToolDefinition( name="http.post", description="Make HTTP POST request", category=PluginCategory.NETWORK, parameters={"url": "string", "data": "dict", "json": "dict", "headers": "dict"}, returns={"status_code": "int", "response": "any"}, ), ToolDefinition( name="http.head", description="Make HTTP HEAD request to get headers", category=PluginCategory.NETWORK, parameters={"url": "string", "timeout": "int"}, returns={"status_code": "int", "headers": "dict"}, ), ToolDefinition( name="url.parse", description="Parse URL into components", category=PluginCategory.NETWORK, parameters={"url": "string"}, returns={"scheme": "string", "domain": "string", "path": "string", "params": "dict"}, ), ToolDefinition( name="url.join", description="Join base URL with relative path", category=PluginCategory.NETWORK, parameters={"base": "string", "path": "string"}, returns={"url": "string"}, ), ] # ============================================================================== # MEDIA TOOLS # ============================================================================== MEDIA_TOOLS = [ ToolDefinition( name="image.download", description="Download image from URL", category=PluginCategory.MEDIA, parameters={"url": "string", "timeout": "int"}, returns={"size_bytes": "int", "format": "string", "dimensions": "dict"}, ), ToolDefinition( name="image.analyze", description="Analyze image properties", category=PluginCategory.MEDIA, parameters={"url": "string"}, returns={"width": "int", "height": "int", "format": "string", "has_transparency": "bool"}, ), ToolDefinition( name="pdf.extract_text", description="Extract text content from PDF", category=PluginCategory.MEDIA, parameters={"url": "string", "pages": "list[int]"}, returns={"text": "string", "pages": "int", "words": "int"}, ), ToolDefinition( name="video.metadata", description="Extract video metadata", category=PluginCategory.MEDIA, parameters={"url": "string"}, returns={"duration": "int", "resolution": "string", "format": "string"}, ), ] # ============================================================================== # ANALYSIS TOOLS # ============================================================================== ANALYSIS_TOOLS = [ ToolDefinition( name="stats.describe", description="Calculate descriptive statistics", category=PluginCategory.ANALYSIS, parameters={"data": "list[number]"}, returns={"mean": "float", "median": "float", "std": "float", "min": "float", "max": "float"}, ), ToolDefinition( name="stats.correlation", description="Calculate correlation between datasets", category=PluginCategory.ANALYSIS, parameters={"x": "list[number]", "y": "list[number]"}, returns={"correlation": "float", "p_value": "float"}, ), ToolDefinition( name="text.sentiment", description="Analyze sentiment of text", category=PluginCategory.ANALYSIS, parameters={"text": "string"}, returns={"score": "float", "label": "string (positive|negative|neutral)"}, ), ToolDefinition( name="text.entities", description="Extract named entities from text", category=PluginCategory.ANALYSIS, parameters={"text": "string", "types": "list[string]"}, returns={"entities": "list[dict]", "count": "int"}, ), ToolDefinition( name="text.keywords", description="Extract keywords from text", category=PluginCategory.ANALYSIS, parameters={"text": "string", "top_k": "int"}, returns={"keywords": "list[string]", "scores": "list[float]"}, ), ToolDefinition( name="analysis.infer_schema", description="Infer field types and nullability from extracted rows", category=PluginCategory.ANALYSIS, parameters={"rows": "list[dict]"}, returns={"schema": "dict[string, dict]", "columns": "list[string]"}, ), ToolDefinition( name="analysis.score_relevance", description="Score row relevance against user query/instructions", category=PluginCategory.ANALYSIS, parameters={"rows": "list[dict]", "query": "string"}, returns={"rows": "list[dict]", "count": "int"}, ), ] # ============================================================================== # EXTRACTION TOOLS # ============================================================================== EXTRACTION_TOOLS = [ ToolDefinition( name="extract.emails", description="Extract email addresses from text", category=PluginCategory.EXTRACTION, parameters={"text": "string"}, returns={"emails": "list[string]", "count": "int"}, ), ToolDefinition( name="extract.phones", description="Extract phone numbers from text", category=PluginCategory.EXTRACTION, parameters={"text": "string", "country_code": "string"}, returns={"phones": "list[string]", "count": "int"}, ), ToolDefinition( name="extract.urls", description="Extract URLs from text", category=PluginCategory.EXTRACTION, parameters={"text": "string"}, returns={"urls": "list[string]", "count": "int"}, ), ToolDefinition( name="extract.dates", description="Extract and parse dates from text", category=PluginCategory.EXTRACTION, parameters={"text": "string", "format": "string"}, returns={"dates": "list[string]", "count": "int"}, ), ToolDefinition( name="extract.prices", description="Extract prices and currencies from text", category=PluginCategory.EXTRACTION, parameters={"text": "string"}, returns={"prices": "list[dict]", "count": "int"}, ), ToolDefinition( name="extract.addresses", description="Extract physical addresses from text", category=PluginCategory.EXTRACTION, parameters={"text": "string"}, returns={"addresses": "list[dict]", "count": "int"}, ), ToolDefinition( name="extract.social_handles", description="Extract social media handles", category=PluginCategory.EXTRACTION, parameters={"text": "string", "platforms": "list[string]"}, returns={"handles": "dict[string, list]", "count": "int"}, ), ToolDefinition( name="extract.top_n", description="Select top N rows from extracted dataset", category=PluginCategory.EXTRACTION, parameters={"rows": "list[dict]", "n": "int", "sort_field": "string"}, returns={"rows": "list[dict]", "count": "int"}, ), ] # ============================================================================== # VALIDATION TOOLS # ============================================================================== VALIDATION_TOOLS = [ ToolDefinition( name="validate.url", description="Validate URL format and accessibility", category=PluginCategory.VALIDATION, parameters={"url": "string", "check_accessibility": "bool"}, returns={"valid": "bool", "accessible": "bool", "status_code": "int"}, ), ToolDefinition( name="validate.email", description="Validate email format", category=PluginCategory.VALIDATION, parameters={"email": "string"}, returns={"valid": "bool", "normalized": "string"}, ), ToolDefinition( name="validate.json", description="Validate JSON format", category=PluginCategory.VALIDATION, parameters={"text": "string"}, returns={"valid": "bool", "error": "string|null"}, ), ToolDefinition( name="validate.html", description="Validate HTML structure", category=PluginCategory.VALIDATION, parameters={"html": "string"}, returns={"valid": "bool", "errors": "list[string]"}, ), ToolDefinition( name="validate.schema", description="Validate data against JSON schema", category=PluginCategory.VALIDATION, parameters={"data": "any", "schema": "dict"}, returns={"valid": "bool", "errors": "list[string]"}, ), ToolDefinition( name="validate.data_completeness", description="Score completeness of extracted rows against required fields", category=PluginCategory.VALIDATION, parameters={"rows": "list[dict]", "fields": "list[string]"}, returns={"score": "float", "missing_counts": "dict[string, int]", "fields": "list[string]"}, ), ToolDefinition( name="validate.row_signal", description="Estimate quality signal of extracted rows", category=PluginCategory.VALIDATION, parameters={"rows": "list[dict]"}, returns={"signal": "float", "completeness": "float", "uniqueness": "float"}, ), ] # ============================================================================== # STORAGE TOOLS # ============================================================================== STORAGE_TOOLS = [ ToolDefinition( name="memory.store", description="Store data in long-term memory", category=PluginCategory.STORAGE, parameters={"key": "string", "value": "any", "ttl": "int"}, returns={"stored": "bool", "key": "string"}, ), ToolDefinition( name="memory.retrieve", description="Retrieve data from memory", category=PluginCategory.STORAGE, parameters={"key": "string"}, returns={"found": "bool", "value": "any"}, ), ToolDefinition( name="memory.search", description="Search memory by semantic similarity", category=PluginCategory.STORAGE, parameters={"query": "string", "limit": "int"}, returns={"results": "list[dict]", "count": "int"}, ), ToolDefinition( name="cache.get", description="Get value from session cache", category=PluginCategory.STORAGE, parameters={"key": "string"}, returns={"found": "bool", "value": "any"}, ), ToolDefinition( name="cache.set", description="Set value in session cache", category=PluginCategory.STORAGE, parameters={"key": "string", "value": "any"}, returns={"stored": "bool"}, ), ] # ============================================================================== # SANDBOX TOOLS # ============================================================================== SANDBOX_TOOLS = [ ToolDefinition( name="sandbox.execute", description="Execute Python code in sandboxed environment", category=PluginCategory.AI, parameters={"code": "string", "payload": "dict", "timeout": "int"}, returns={"success": "bool", "output": "any", "stdout": "string"}, ), ToolDefinition( name="sandbox.analyze", description="Run data analysis in sandbox", category=PluginCategory.AI, parameters={"data": "list[dict]", "analysis_type": "string"}, returns={"result": "dict", "visualizations": "list"}, ), ToolDefinition( name="sandbox.transform", description="Transform data using sandbox code", category=PluginCategory.AI, parameters={"data": "any", "transform_code": "string"}, returns={"transformed": "any", "success": "bool"}, ), ] # ============================================================================== # AI TOOLS # ============================================================================== AI_TOOLS = [ ToolDefinition( name="ai.complete", description="Generate text completion using AI model", category=PluginCategory.AI, parameters={"prompt": "string", "model": "string", "max_tokens": "int"}, returns={"text": "string", "tokens_used": "int"}, ), ToolDefinition( name="ai.embed", description="Generate embeddings for text", category=PluginCategory.AI, parameters={"text": "string", "model": "string"}, returns={"embedding": "list[float]", "dimensions": "int"}, ), ToolDefinition( name="ai.classify", description="Classify text into categories", category=PluginCategory.AI, parameters={"text": "string", "labels": "list[string]"}, returns={"label": "string", "confidence": "float"}, ), ToolDefinition( name="ai.summarize", description="Summarize text content", category=PluginCategory.AI, parameters={"text": "string", "max_length": "int"}, returns={"summary": "string", "reduction_ratio": "float"}, ), ] # ============================================================================== # PLUGIN DEFINITIONS # ============================================================================== PLUGINS: list[PluginDefinition] = [ PluginDefinition( id="browser", name="Browser Automation", description="Control browser navigation, clicks, typing, and screenshots", category=PluginCategory.BROWSER, tools=BROWSER_TOOLS, ), PluginDefinition( id="html-parser", name="HTML/DOM Parser", description="Parse and query HTML documents using BeautifulSoup", category=PluginCategory.PARSER, tools=HTML_TOOLS, ), PluginDefinition( id="data-processing", name="Data Processing", description="JSON, CSV, and Pandas data processing tools", category=PluginCategory.DATA, tools=DATA_TOOLS, ), PluginDefinition( id="regex", name="Regular Expressions", description="Pattern matching and text extraction using regex", category=PluginCategory.EXTRACTION, tools=REGEX_TOOLS, ), PluginDefinition( id="network", name="Network/HTTP", description="HTTP requests and URL handling", category=PluginCategory.NETWORK, tools=NETWORK_TOOLS, ), PluginDefinition( id="media", name="Media Processing", description="Image, PDF, and video processing tools", category=PluginCategory.MEDIA, tools=MEDIA_TOOLS, ), PluginDefinition( id="analysis", name="Analysis", description="Statistical analysis and NLP tools", category=PluginCategory.ANALYSIS, tools=ANALYSIS_TOOLS, ), PluginDefinition( id="extraction", name="Data Extraction", description="Extract structured data like emails, phones, addresses", category=PluginCategory.EXTRACTION, tools=EXTRACTION_TOOLS, ), PluginDefinition( id="validation", name="Validation", description="Validate URLs, emails, JSON, HTML, and schemas", category=PluginCategory.VALIDATION, tools=VALIDATION_TOOLS, ), PluginDefinition( id="storage", name="Storage/Memory", description="Long-term memory and session cache", category=PluginCategory.STORAGE, tools=STORAGE_TOOLS, ), PluginDefinition( id="sandbox", name="Python Sandbox", description="Execute Python code in isolated sandbox", category=PluginCategory.AI, tools=SANDBOX_TOOLS, ), PluginDefinition( id="ai", name="AI/LLM", description="AI completion, embeddings, and classification", category=PluginCategory.AI, tools=AI_TOOLS, ), ] def get_all_plugins() -> list[PluginDefinition]: """Get all registered plugins.""" return PLUGINS def get_plugin(plugin_id: str) -> Optional[PluginDefinition]: """Get plugin by ID.""" for plugin in PLUGINS: if plugin.id == plugin_id: return plugin return None def get_all_tools() -> list[ToolDefinition]: """Get all registered tools across all plugins.""" tools = [] for plugin in PLUGINS: tools.extend(plugin.tools) return tools def get_tool(tool_name: str) -> Optional[ToolDefinition]: """Get tool definition by name.""" for plugin in PLUGINS: for tool in plugin.tools: if tool.name == tool_name: return tool return None def get_tools_by_category(category: PluginCategory) -> list[ToolDefinition]: """Get all tools in a category.""" tools = [] for plugin in PLUGINS: if plugin.category == category: tools.extend(plugin.tools) return tools def get_plugin_summary() -> dict[str, Any]: """Get summary of all plugins and tools.""" return { "plugins_count": len(PLUGINS), "tools_count": sum(len(p.tools) for p in PLUGINS), "categories": list(set(p.category.value for p in PLUGINS)), "plugins": [ { "id": p.id, "name": p.name, "category": p.category.value, "tools_count": len(p.tools), } for p in PLUGINS ], }