Spaces:

AISA-Framework
/

PolicySummarizer

Sleeping

App Files Files Community

Nadasr committed 23 days ago

Commit

1b963f1

verified ·

1 Parent(s): c9f8ea8

Upload 2 files

Browse files

Files changed (2) hide show

utils/logger.py +81 -0
utils/validators.py +107 -0

utils/logger.py ADDED Viewed

	@@ -0,0 +1,81 @@

+"""
+Logging utility for agent actions - Policy Summarizer
+"""
+import logging
+import time
+from typing import Optional, List
+from functools import wraps
# Install the root logging configuration at import time: INFO and above,
# pipe-separated fields, timestamps with one-second resolution.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
    level=logging.INFO,
)
# Module-wide logger; every record it emits is named 'PolicySummarizer'.
logger = logging.getLogger('PolicySummarizer')
# In-memory record of agent actions, kept so the UI can display them.
# The helpers below always operate on this one list object in place, so any
# reference to it stays valid for the lifetime of the process.
_agent_logs: List[dict] = []


def get_logs() -> List[dict]:
    """Return a snapshot of every agent action logged so far.

    Returns:
        A new list containing a copy of each stored entry, in the order the
        entries were logged. Both the list and the entry dicts are copies, so
        callers may modify the result without altering the stored log.
    """
    # Copy each entry as well as the list: a plain list copy would still hand
    # out the stored dicts and let callers rewrite the log by accident.
    return [dict(entry) for entry in _agent_logs]


def clear_logs() -> None:
    """Discard every stored agent log entry.

    The shared list is emptied in place rather than replaced, so code that
    already holds a reference to it observes the cleared state too.
    """
    _agent_logs.clear()
def _cap_summary(text: Optional[str], limit: int = 200) -> str:
    """Return *text* cut to *limit* characters, with "..." appended when cut.

    ``None`` becomes an empty string and any other non-string value is
    converted with ``str()``, so recording an action never raises on a
    missing or oddly typed summary.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        text = str(text)
    if len(text) > limit:
        return text[:limit] + "..."
    return text


def log_agent_action(
    agent_name: str,
    action: str,
    input_summary: str,
    output_summary: str,
    duration_seconds: float,
    success: bool = True,
    error: Optional[str] = None
) -> None:
    """Record one agent action in the in-memory log and emit it to the logger.

    Only the caller-supplied summaries are stored, each capped at 200
    characters. Nothing here redacts content, so callers must pass text that
    is already safe to display.

    Args:
        agent_name: Name of the agent that performed the action.
        action: Short description of what was done.
        input_summary: Summary of the input; ``None`` is stored as "".
        output_summary: Summary of the output; ``None`` is stored as "".
        duration_seconds: Elapsed time; stored rounded to two decimals.
        success: Whether the action succeeded.
        error: Optional error text; logged at ERROR level when truthy.
    """
    log_entry = {
        "agent_name": agent_name,
        "action": action,
        "input_summary": _cap_summary(input_summary),
        "output_summary": _cap_summary(output_summary),
        "duration_seconds": round(duration_seconds, 2),
        "success": success,
        "error": error
    }
    _agent_logs.append(log_entry)

    status = "✓" if success else "✗"
    logger.info(f"{status} [{agent_name}] {action} ({duration_seconds:.2f}s)")
    if error:
        logger.error(f"  Error: {error}")
def format_logs_for_display() -> str:
    """Render the stored agent log as a Markdown string for the UI.

    Returns:
        "No logs yet." when nothing has been recorded. Otherwise a heading
        followed by one numbered section per entry giving its action, a
        success/failure icon, its duration, and its error text if any.
    """
    if not _agent_logs:
        return "No logs yet."

    lines = ["## Agent Activity Log\n"]
    for i, log in enumerate(_agent_logs, start=1):
        if log["success"]:
            status = "✅"
        else:
            status = "❌"
        lines.extend([
            f"### Step {i}: {log['agent_name']}",
            f"- **Action:** {log['action']}",
            f"- **Status:** {status}",
            f"- **Duration:** {log['duration_seconds']}s",
        ])
        if log.get("error"):
            lines.append(f"- **Error:** {log['error']}")
        # Blank element so consecutive steps are separated by an empty line.
        lines.append("")
    return "\n".join(lines)

utils/validators.py ADDED Viewed

	@@ -0,0 +1,107 @@

+"""
+Input validation utilities - Policy Summarizer
+"""
+import re
+from urllib.parse import urlparse
+from typing import Tuple
# Upper bound, in characters, on the content length that will be processed.
MAX_CONTENT_LENGTH = 50_000
# URL validation pattern: an http(s) scheme, then a dotted domain name, the
# bare word "localhost", or a dotted-quad IPv4 address, then an optional port
# and an optional path/query containing no whitespace.
URL_PATTERN = re.compile(
    r'^https?://'
    # Domain labels followed by a TLD. A TLD may be as long as any DNS label
    # (63 characters, e.g. "photography") or be punycode (e.g. "xn--p1ai").
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+'
    r'(?:[A-Z]{2,63}|XN--[A-Z0-9-]{1,59})\.?|'
    r'localhost|'
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
    r'(?::\d+)?'
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def validate_url(url: str) -> Tuple[bool, str]:
    """Check that *url* is a well-formed http(s) URL that is safe to fetch.

    Surrounding whitespace is ignored. The URL is rejected when it is empty,
    longer than 2048 characters, does not match ``URL_PATTERN``, or points at
    localhost, a ``*.localhost`` name, or an IP literal in a loopback,
    private, link-local, unspecified, reserved or multicast range.

    Host names are not resolved, so a public-looking name whose DNS record
    points at an internal address is not detected here.

    Args:
        url: The URL string supplied by the user.

    Returns:
        ``(True, "")`` when the URL is acceptable, otherwise ``(False, msg)``
        where ``msg`` is a human-readable reason.
    """
    import ipaddress  # local import keeps the file's import block untouched

    if not url or not isinstance(url, str):
        return False, "URL cannot be empty"
    url = url.strip()
    if len(url) > 2048:
        return False, "URL is too long (max 2048 characters)"
    if not URL_PATTERN.match(url):
        return False, "Invalid URL format. Must start with http:// or https://"
    try:
        parsed = urlparse(url)
    except ValueError as e:
        return False, f"Failed to parse URL: {str(e)}"
    if parsed.scheme not in ('http', 'https'):
        return False, "URL must use http or https protocol"
    if not parsed.netloc:
        return False, "URL must have a valid domain"

    # Normalise the host: drop a trailing root dot so "localhost." and
    # "app.localhost." are treated like their dot-less forms.
    host = (parsed.hostname or "").rstrip('.').lower()
    blocked_hosts = ('localhost', '127.0.0.1', '0.0.0.0', '::1')
    if host in blocked_hosts or host.endswith('.localhost'):
        return False, "Cannot scrape localhost or private addresses"

    try:
        address = ipaddress.ip_address(host)
    except ValueError:
        address = None  # a domain name rather than an IP literal
    if address is not None and (
        address.is_private
        or address.is_loopback
        or address.is_link_local
        or address.is_unspecified
        or address.is_reserved
        or address.is_multicast
    ):
        return False, "Cannot scrape localhost or private addresses"
    return True, ""
def is_likely_policy_url(url: str) -> bool:
    """Report whether *url* contains a word typical of policy pages.

    The test is a case-insensitive substring search anywhere in the URL, so
    a keyword embedded in a longer word also counts.
    """
    url_lower = url.lower()
    for keyword in ('privacy', 'policy', 'terms', 'tos', 'legal', 'service', 'conditions'):
        if keyword in url_lower:
            return True
    return False
# Phrases commonly used to hijack an LLM prompt, compiled once at import time.
_INJECTION_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # "ignore previous instructions", "disregard all of the above
        # instructions", "forget all prior instructions", ... One or more
        # qualifiers are allowed so the very common "ignore all previous
        # instructions" wording is caught as well.
        r'(?:ignore|disregard|forget)\s+'
        r'(?:(?:all|any|of|the|previous|prior|above|earlier)\s+)+instructions',
        r'new\s+instructions?\s*:',
        # Word boundary so ordinary words such as "ecosystem:" are left alone.
        r'\bsystem\s*:\s*',
    )
)


def sanitize_text(text: str) -> str:
    """Clean scraped text and mask common prompt-injection phrases.

    Processing steps, in order: remove NUL characters, collapse runs of three
    or more newlines to two, collapse runs of three or more spaces to two,
    replace each injection phrase with ``[FILTERED]``, and strip leading and
    trailing whitespace.

    This is a best-effort filter over a fixed phrase list, not a guarantee
    against prompt injection.

    Args:
        text: Raw text; ``None`` or an empty string yields "".

    Returns:
        The cleaned text.
    """
    if not text:
        return ""
    text = text.replace('\x00', '')
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' {3,}', '  ', text)
    for pattern in _INJECTION_PATTERNS:
        text = pattern.sub('[FILTERED]', text)
    return text.strip()
def truncate_content(content: str, max_length: int = MAX_CONTENT_LENGTH) -> str:
    """Shorten over-long content, ending on a sentence boundary when close.

    Content no longer than *max_length* is returned unchanged. Otherwise the
    text is cut at *max_length* characters, or just after the last period
    inside that cut when the period lies beyond 80% of *max_length*. A
    truncation notice is then appended, so the returned string can be longer
    than *max_length*.
    """
    if len(content) <= max_length:
        return content

    last_period = content[:max_length].rfind('.')
    # Only back up to the period when doing so discards less than ~20%.
    if last_period > max_length * 0.8:
        keep = last_period + 1
    else:
        keep = max_length
    return content[:keep] + "\n\n[Content truncated due to length...]"
def validate_content_length(content: str) -> Tuple[bool, str]:
    """Check that extracted content exists and has at least 50 words.

    Returns:
        ``(True, "")`` when the content is long enough, otherwise
        ``(False, msg)`` where ``msg`` explains the problem. Words are
        counted by splitting on whitespace.
    """
    is_blank = not content or not content.strip()
    if is_blank:
        return False, "No content was extracted from the page"

    word_count = len(content.split())
    if word_count >= 50:
        return True, ""
    return False, f"Content too short ({word_count} words). This may not be a valid policy page."