# AI-Research-Paper-Analyst / tools / url_validator.py
# Clean deployment to HuggingFace Space (commit 2447eba, author: Saleh)
"""
URL Validator Tool — Extracts and validates URLs found in text.
Assigned To: Safety Guardian agent ONLY
Reference: system_design.md — Tool 5 (Lines 543-577)
Reference: engineering_guardrails.md — §2 Tool-Call Argument Validation
Key guardrails:
- Extracts all URLs, checks against known malicious domain patterns
- Flags shortened URLs (bit.ly, tinyurl) and data: URIs
- Caps at 50 URLs
- Returns error STRINGS, never raises exceptions
"""
import re
import json
from crewai.tools import tool
@tool
def url_validator_tool(text: str = "") -> str:
    """Extract and validate URLs found in text. Checks against blocklist of
    suspicious URL patterns. Returns JSON with URL analysis. Pass the text to scan as the 'text' argument."""
    # === INPUT VALIDATION ===
    # Guardrail: always return an error STRING (JSON), never raise.
    if not text or not isinstance(text, str):
        return json.dumps({
            "total_urls": 0, "checked": 0,
            "malicious_urls": [], "is_safe": True,
            "error": "Empty or invalid input"
        })
    if len(text.strip()) == 0:
        # Whitespace-only text gets its own message (distinct from None/non-str above).
        return json.dumps({
            "total_urls": 0, "checked": 0,
            "malicious_urls": [], "is_safe": True,
            "error": "Empty text provided"
        })
    # === URL EXTRACTION AND VALIDATION ===
    try:
        # BUG FIX: the previous pattern (r'https?://...') only extracted
        # http/https URLs, so the "data:", "javascript:" and "file://"
        # indicators below could never fire on real URIs of those schemes --
        # contradicting the module docstring, which promises to flag data:
        # URIs. Match those schemes too so the guardrail actually works.
        url_pattern = r'(?:https?://|file://|data:|javascript:)[^\s<>"\')\]]+'
        urls = re.findall(url_pattern, text)
        # Substring blocklist: URL shorteners, dangerous URI schemes, and
        # obviously hostile keywords. Matched case-insensitively below.
        suspicious_indicators = [
            "bit.ly", "tinyurl", "t.co", "goo.gl",
            "data:", "javascript:", "file://",
            "malware", "phishing",
        ]
        results = []
        malicious = []
        for url in urls[:50]:  # Guardrail: cap analysis at 50 URLs
            is_suspicious = any(ind in url.lower() for ind in suspicious_indicators)
            results.append({"url": url, "suspicious": is_suspicious})
            if is_suspicious:
                malicious.append(url)
        return json.dumps({
            "total_urls": len(urls),   # total found (may exceed the 50 cap)
            "checked": len(results),   # number actually analyzed (<= 50)
            "malicious_urls": malicious,
            "is_safe": len(malicious) == 0,
        })
    except Exception as e:
        # Guardrail: never raise -- report the failure as an error string.
        # is_safe=False here: fail closed when validation itself breaks.
        return json.dumps({
            "total_urls": 0, "checked": 0,
            "malicious_urls": [], "is_safe": False,
            "error": f"URL validation failed: {type(e).__name__}: {str(e)}"
        })