File size: 2,301 Bytes
2447eba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
"""
URL Validator Tool — Extracts and validates URLs found in text.

Assigned To: Safety Guardian agent ONLY
Reference: system_design.md — Tool 5 (Lines 543-577)
Reference: engineering_guardrails.md — §2 Tool-Call Argument Validation

Key guardrails:
  - Extracts all URLs, checks against known malicious domain patterns
  - Flags shortened URLs (bit.ly, tinyurl) and data: URIs
  - Caps at 50 URLs
  - Returns error STRINGS, never raises exceptions
"""

import json
import re
from urllib.parse import urlsplit

from crewai.tools import tool


@tool
def url_validator_tool(text: str = "") -> str:
    """Extract and validate URLs found in text. Checks against blocklist of
    suspicious URL patterns. Returns JSON with URL analysis. Pass the text to scan as the 'text' argument."""

    # === INPUT VALIDATION ===
    if not text or not isinstance(text, str):
        return json.dumps({
            "total_urls": 0, "checked": 0,
            "malicious_urls": [], "is_safe": True,
            "error": "Empty or invalid input"
        })

    if len(text.strip()) == 0:
        return json.dumps({
            "total_urls": 0, "checked": 0,
            "malicious_urls": [], "is_safe": True,
            "error": "Empty text provided"
        })

    # === URL EXTRACTION AND VALIDATION ===
    try:
        url_pattern = r'https?://[^\s<>"\')\]]+' 
        urls = re.findall(url_pattern, text)

        suspicious_indicators = [
            "bit.ly", "tinyurl", "t.co", "goo.gl",
            "data:", "javascript:", "file://",
            "malware", "phishing",
        ]

        results = []
        malicious = []

        for url in urls[:50]:  # Cap at 50 URLs
            is_suspicious = any(ind in url.lower() for ind in suspicious_indicators)
            results.append({"url": url, "suspicious": is_suspicious})
            if is_suspicious:
                malicious.append(url)

        return json.dumps({
            "total_urls": len(urls),
            "checked": len(results),
            "malicious_urls": malicious,
            "is_safe": len(malicious) == 0,
        })

    except Exception as e:
        return json.dumps({
            "total_urls": 0, "checked": 0,
            "malicious_urls": [], "is_safe": False,
            "error": f"URL validation failed: {type(e).__name__}: {str(e)}"
        })