Upload 3 files
- tools/text_analyzer.py +162 -0
- tools/text_chunker.py +142 -0
- tools/web_scraper.py +130 -0
tools/text_analyzer.py
ADDED
@@ -0,0 +1,162 @@
"""
Text Analyzer Tool - Analyzes policy text to identify sections and concerns
"""
from crewai.tools import tool
from typing import List, Dict
import re
import time

from utils.logger import log_agent_action

# Keywords for identifying sections
SECTION_KEYWORDS = {
    'data_collection': ['collect', 'gather', 'information we collect', 'personal data'],
    'data_sharing': ['share', 'third party', 'partners', 'disclose', 'sell'],
    'user_rights': ['your rights', 'opt-out', 'delete', 'access your data', 'gdpr', 'ccpa'],
    'data_retention': ['retain', 'retention', 'how long', 'keep your'],
    'security': ['security', 'protect', 'encryption', 'safeguard'],
    'cookies': ['cookie', 'tracking', 'analytics'],
}

# Red flag keywords
RED_FLAG_KEYWORDS = [
    'sell your data', 'sell your information', 'share with third parties',
    'advertising partners', 'indefinitely', 'without notice',
    'at our discretion', 'waive your right', 'arbitration', 'class action waiver'
]


def chunk_text(text: str, chunk_size: int = 2000, overlap: int = 200) -> List[str]:
    """Split text into overlapping chunks."""
    if len(text) <= chunk_size:
        return [text]

    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size
        if end < len(text):
            para_break = text.rfind('\n\n', start, end)
            if para_break > start + chunk_size // 2:
                end = para_break

        chunks.append(text[start:end].strip())
        start = end - overlap

        if start >= len(text) - overlap:
            break

    return chunks


def identify_sections(text: str) -> Dict[str, List[str]]:
    """Identify relevant sections in the policy text."""
    sections = {key: [] for key in SECTION_KEYWORDS}
    paragraphs = re.split(r'\n{2,}', text)

    for paragraph in paragraphs:
        para_lower = paragraph.lower()
        for section_type, keywords in SECTION_KEYWORDS.items():
            for keyword in keywords:
                if keyword in para_lower:
                    excerpt = paragraph[:500] + "..." if len(paragraph) > 500 else paragraph
                    if excerpt not in sections[section_type]:
                        sections[section_type].append(excerpt)
                    break

    return sections


def find_red_flags(text: str) -> List[Dict[str, str]]:
    """Find potential concerns in the policy."""
    red_flags = []
    text_lower = text.lower()

    for keyword in RED_FLAG_KEYWORDS:
        if keyword in text_lower:
            idx = text_lower.find(keyword)
            start = max(0, idx - 100)
            end = min(len(text), idx + len(keyword) + 100)
            context = text[start:end].strip()
            red_flags.append({'keyword': keyword, 'context': context})

    return red_flags


@tool("text_analyzer")
def text_analyzer_tool(text: str) -> str:
    """
    Analyzes policy text to identify key sections and potential concerns.

    Args:
        text: The policy text content to analyze

    Returns:
        Structured analysis with sections and red flags
    """
    start_time = time.time()

    if not text or len(text.strip()) < 100:
        error_msg = "Text too short for analysis"
        log_agent_action("Text Analyzer Tool", "Validation", f"Received {len(text) if text else 0} chars",
                         error_msg, time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"

    try:
        chunks = chunk_text(text)
        all_sections = {key: [] for key in SECTION_KEYWORDS}
        all_red_flags = []

        for chunk in chunks:
            sections = identify_sections(chunk)
            for key, excerpts in sections.items():
                all_sections[key].extend(excerpts)

            flags = find_red_flags(chunk)
            all_red_flags.extend(flags)

        # Deduplicate
        for key in all_sections:
            all_sections[key] = list(set(all_sections[key]))[:3]

        seen_keywords = set()
        unique_flags = []
        for flag in all_red_flags:
            if flag['keyword'] not in seen_keywords:
                seen_keywords.add(flag['keyword'])
                unique_flags.append(flag)
        all_red_flags = unique_flags[:10]

        # Build result
        result_parts = ["=== POLICY ANALYSIS ===\n"]

        result_parts.append("## KEY SECTIONS:\n")
        for section_type, excerpts in all_sections.items():
            if excerpts:
                result_parts.append(f"\n### {section_type.upper().replace('_', ' ')}:")
                for i, excerpt in enumerate(excerpts, 1):
                    result_parts.append(f"{i}. {excerpt[:300]}...")

        result_parts.append("\n\n## POTENTIAL CONCERNS:\n")
        if all_red_flags:
            for i, flag in enumerate(all_red_flags, 1):
                result_parts.append(f"{i}. **{flag['keyword'].upper()}**")
                result_parts.append(f"   Context: \"{flag['context']}\"")
        else:
            result_parts.append("No major red flags identified.")

        result_parts.append(f"\n\n## STATS: {len(text)} chars, {len(chunks)} chunks, {len(all_red_flags)} concerns")

        result = "\n".join(result_parts)

        log_agent_action("Text Analyzer Tool", "Analysis", f"Analyzed {len(chunks)} chunks",
                         f"Found {len(all_red_flags)} concerns", time.time() - start_time, True)

        return result

    except Exception as e:
        error_msg = f"Analysis error: {str(e)}"
        log_agent_action("Text Analyzer Tool", "Analysis", "Processing text", error_msg,
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"
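A quick way to sanity-check the plain helpers above (not part of the commit; assumes the repo's crewai and utils dependencies are installed so the module imports cleanly, and the sample policy text is made up):

from tools.text_analyzer import identify_sections, find_red_flags

# Made-up two-paragraph policy snippet, for illustration only.
sample = (
    "We collect personal data when you register for an account.\n\n"
    "We may share your information with advertising partners and retain it indefinitely."
)

print(identify_sections(sample)["data_sharing"])       # paragraph matched by the 'share' keyword
print([f["keyword"] for f in find_red_flags(sample)])  # ['advertising partners', 'indefinitely']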
tools/text_chunker.py
ADDED
@@ -0,0 +1,142 @@
"""
Text Chunker Tool - Splits and processes long policy texts
"""
from crewai.tools import tool
from typing import List
import sys
import os

# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils.logger import get_logs

logger = get_logs("TextChunkerTool")

# Configuration
DEFAULT_CHUNK_SIZE = 4000
DEFAULT_OVERLAP = 200


@tool("text_chunker")
def text_chunker(text: str, chunk_size: int = DEFAULT_CHUNK_SIZE) -> str:
    """
    Splits long text into manageable chunks for analysis.
    Use this tool when the policy text is too long to process at once.

    Args:
        text: The text to split into chunks
        chunk_size: Maximum size of each chunk (default 4000)

    Returns:
        Chunked text with section markers
    """
    logger.log_step("Starting text chunking", f"Input length: {len(text)}")

    if not text or len(text.strip()) == 0:
        logger.log_error("Empty text provided")
        return "Error: No text provided to chunk"

    # If text is short enough, return as is
    if len(text) <= chunk_size:
        logger.log_result("Chunking", "Text short enough, no chunking needed")
        return text

    chunks = []
    paragraphs = text.split('\n\n')
    current_chunk = ""
    chunk_num = 1

    for para in paragraphs:
        # If adding this paragraph would exceed chunk size
        if len(current_chunk) + len(para) + 2 > chunk_size:
            if current_chunk:
                chunks.append(f"[Section {chunk_num}]\n{current_chunk.strip()}")
                chunk_num += 1
                current_chunk = para
            else:
                # Paragraph itself is too long, split by sentences
                sentences = para.replace('. ', '.\n').split('\n')
                for sentence in sentences:
                    if len(current_chunk) + len(sentence) + 1 > chunk_size:
                        if current_chunk:
                            chunks.append(f"[Section {chunk_num}]\n{current_chunk.strip()}")
                            chunk_num += 1
                        current_chunk = sentence
                    else:
                        current_chunk += " " + sentence if current_chunk else sentence
        else:
            current_chunk += "\n\n" + para if current_chunk else para

    # Add remaining content
    if current_chunk:
        chunks.append(f"[Section {chunk_num}]\n{current_chunk.strip()}")

    result = "\n\n---\n\n".join(chunks)

    logger.log_tool_call("text_chunker", "success")
    logger.log_result("Chunking", f"Split into {len(chunks)} sections")

    return result


@tool("extract_sections")
def extract_sections(text: str) -> str:
    """
    Extracts and identifies key sections from policy text.
    Looks for common policy sections like Privacy, Data Collection, User Rights, etc.

    Args:
        text: The policy text to analyze

    Returns:
        Identified sections with their content
    """
    logger.log_step("Extracting sections from policy")

    # Common section headers in policies
    section_keywords = [
        "privacy", "data collection", "data we collect", "information we collect",
        "how we use", "data use", "sharing", "third party", "third-party",
        "your rights", "user rights", "your choices", "opt-out", "opt out",
        "cookies", "tracking", "retention", "how long", "security",
        "children", "minors", "contact", "changes", "updates"
    ]

    lines = text.split('\n')
    sections = {}
    current_section = "Introduction"
    current_content = []

    for line in lines:
        line_lower = line.lower().strip()

        # Check if this line is a section header
        is_header = False
        for keyword in section_keywords:
            if keyword in line_lower and len(line) < 100:
                is_header = True
                # Save previous section
                if current_content:
                    sections[current_section] = '\n'.join(current_content)
                current_section = line.strip()
                current_content = []
                break

        if not is_header:
            current_content.append(line)

    # Save last section
    if current_content:
        sections[current_section] = '\n'.join(current_content)

    # Format output
    result = "Identified Policy Sections:\n\n"
    for section_name, content in sections.items():
        preview = content[:300] + "..." if len(content) > 300 else content
        result += f"## {section_name}\n{preview}\n\n"

    logger.log_tool_call("extract_sections", "success")
    logger.log_result("Section extraction", f"Found {len(sections)} sections")

    return result
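For orientation, a minimal sketch of how the two decorated tools above might be attached to a CrewAI agent; the agent's role, goal, and backstory here are illustrative placeholders, not part of this commit:

from crewai import Agent
from tools.text_chunker import text_chunker, extract_sections

# Hypothetical agent definition; field values are placeholders.
policy_reader = Agent(
    role="Policy Reader",
    goal="Split long policy documents into analyzable sections",
    backstory="Prepares raw policy text for downstream analysis agents.",
    tools=[text_chunker, extract_sections],
    verbose=True,
)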
tools/web_scraper.py
ADDED
@@ -0,0 +1,130 @@
"""
Web Scraper Tool - Fetches and extracts text from policy pages
"""
import requests
from bs4 import BeautifulSoup
from crewai.tools import tool
import time

from utils.validators import validate_url, sanitize_text, truncate_content, validate_content_length
from utils.logger import log_agent_action

# Configuration
REQUEST_TIMEOUT = 30
MAX_RETRIES = 2
RETRY_DELAY = 2

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
}


def extract_text_from_html(html: str) -> str:
    """Extract clean text from HTML content."""
    soup = BeautifulSoup(html, 'html.parser')

    # Remove unwanted elements
    for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']):
        element.decompose()

    # Try to find main content
    main_content = None
    for selector in ['main', 'article', '[role="main"]', '.content', '.policy-content', '#content']:
        main_content = soup.select_one(selector)
        if main_content:
            break

    if not main_content:
        main_content = soup.body if soup.body else soup

    text = main_content.get_text(separator='\n', strip=True)

    lines = [line.strip() for line in text.split('\n') if line.strip() and len(line.strip()) > 2]
    return '\n'.join(lines)


def get_page_title(html: str) -> str:
    """Extract page title from HTML."""
    soup = BeautifulSoup(html, 'html.parser')
    if soup.title and soup.title.string:
        return soup.title.string.strip()
    h1 = soup.find('h1')
    if h1:
        return h1.get_text(strip=True)
    return "Unknown Policy"


@tool("web_scraper")
def web_scraper_tool(url: str) -> str:
    """
    Scrapes text content from a policy webpage.

    Args:
        url: The URL of the policy page to scrape

    Returns:
        Extracted text content from the policy page
    """
    start_time = time.time()

    # Validate URL
    is_valid, error_msg = validate_url(url)
    if not is_valid:
        log_agent_action("Web Scraper Tool", "URL Validation", "URL provided", f"Failed: {error_msg}",
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"

    try:
        # Fetch with retry
        response = None
        for attempt in range(MAX_RETRIES + 1):
            try:
                response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
                break
            except requests.exceptions.RequestException as e:
                if attempt < MAX_RETRIES:
                    time.sleep(RETRY_DELAY)
                else:
                    raise e

        # Extract content
        html = response.text
        title = get_page_title(html)
        content = extract_text_from_html(html)
        content = sanitize_text(content)

        # Validate content
        is_valid, error_msg = validate_content_length(content)
        if not is_valid:
            log_agent_action("Web Scraper Tool", "Content Extraction", "HTML received", error_msg,
                             time.time() - start_time, False, error_msg)
            return f"Error: {error_msg}"

        content = truncate_content(content)
        word_count = len(content.split())

        log_agent_action("Web Scraper Tool", "Page Scraping", "URL fetched",
                         f"Extracted {word_count} words", time.time() - start_time, True)

        return f"TITLE: {title}\nWORD_COUNT: {word_count}\nCONTENT:\n{content}"

    except requests.exceptions.Timeout:
        error_msg = f"Request timed out after {REQUEST_TIMEOUT} seconds"
        log_agent_action("Web Scraper Tool", "Page Fetching", "Attempting fetch", error_msg,
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"

    except requests.exceptions.HTTPError as e:
        error_msg = f"HTTP error: {e.response.status_code}"
        log_agent_action("Web Scraper Tool", "Page Fetching", "Attempting fetch", error_msg,
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"

    except Exception as e:
        error_msg = f"Unexpected error: {str(e)}"
        log_agent_action("Web Scraper Tool", "Page Scraping", "Processing", error_msg,
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"
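The two extraction helpers can be exercised on their own with a tiny made-up HTML snippet (requires beautifulsoup4; the output comments describe expected behavior, not captured results):

from tools.web_scraper import extract_text_from_html, get_page_title

# Minimal fake policy page, for illustration only.
html = """
<html><head><title>Acme Privacy Policy</title></head>
<body><nav>Home | About</nav>
<main><h1>Privacy Policy</h1><p>We collect account information you provide.</p></main>
<footer>Copyright Acme</footer></body></html>
"""

print(get_page_title(html))           # "Acme Privacy Policy"
print(extract_text_from_html(html))   # nav/footer stripped; only the <main> text remains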