# AI-Research-Paper-Analyst / tools / url_validator.py
# Clean deployment to HuggingFace Space (commit 2447eba, author: Saleh)
"""
URL Validator Tool — Extracts and validates URLs found in text.
Assigned To: Safety Guardian agent ONLY
Reference: system_design.md — Tool 5 (Lines 543-577)
Reference: engineering_guardrails.md — §2 Tool-Call Argument Validation
Key guardrails:
- Extracts all URLs, checks against known malicious domain patterns
- Flags shortened URLs (bit.ly, tinyurl) and data: URIs
- Caps at 50 URLs
- Returns error STRINGS, never raises exceptions
"""
import re
import json
from crewai.tools import tool
@tool
def url_validator_tool(text: str = "") -> str:
    """Extract and validate URLs found in text. Checks against blocklist of
    suspicious URL patterns. Returns JSON with URL analysis. Pass the text to scan as the 'text' argument."""
    # === INPUT VALIDATION ===
    # Guardrail: always return an error STRING (JSON), never raise.
    if not text or not isinstance(text, str):
        return json.dumps({
            "total_urls": 0, "checked": 0,
            "malicious_urls": [], "is_safe": True,
            "error": "Empty or invalid input"
        })
    if len(text.strip()) == 0:
        # Whitespace-only text gets its own message (distinct from None/non-str above).
        return json.dumps({
            "total_urls": 0, "checked": 0,
            "malicious_urls": [], "is_safe": True,
            "error": "Empty text provided"
        })
    # === URL EXTRACTION AND VALIDATION ===
    try:
        # BUG FIX: the previous pattern (r'https?://...') only extracted
        # http/https URLs, so the "data:", "javascript:" and "file://"
        # indicators below could never fire on real URIs of those schemes --
        # contradicting the module docstring, which promises to flag data:
        # URIs. Match those schemes too so the guardrail actually works.
        url_pattern = r'(?:https?://|file://|data:|javascript:)[^\s<>"\')\]]+'
        urls = re.findall(url_pattern, text)
        # Substring blocklist: URL shorteners, dangerous URI schemes, and
        # obviously hostile keywords. Matched case-insensitively below.
        suspicious_indicators = [
            "bit.ly", "tinyurl", "t.co", "goo.gl",
            "data:", "javascript:", "file://",
            "malware", "phishing",
        ]
        results = []
        malicious = []
        for url in urls[:50]:  # Guardrail: cap analysis at 50 URLs
            is_suspicious = any(ind in url.lower() for ind in suspicious_indicators)
            results.append({"url": url, "suspicious": is_suspicious})
            if is_suspicious:
                malicious.append(url)
        return json.dumps({
            "total_urls": len(urls),   # total found (may exceed the 50 cap)
            "checked": len(results),   # number actually analyzed (<= 50)
            "malicious_urls": malicious,
            "is_safe": len(malicious) == 0,
        })
    except Exception as e:
        # Guardrail: never raise -- report the failure as an error string.
        # is_safe=False here: fail closed when validation itself breaks.
        return json.dumps({
            "total_urls": 0, "checked": 0,
            "malicious_urls": [], "is_safe": False,
            "error": f"URL validation failed: {type(e).__name__}: {str(e)}"
        })