Nadasr commited on
Commit
81ddc8e
·
verified ·
1 Parent(s): 9b6536a

Upload 3 files

Browse files
Files changed (3) hide show
  1. tools/text_analyzer.py +162 -0
  2. tools/text_chunker.py +142 -0
  3. tools/web_scraper.py +130 -0
tools/text_analyzer.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text Analyzer Tool - Analyzes policy text to identify sections and concerns
3
+ """
4
+ from crewai.tools import tool
5
+ from typing import List, Dict
6
+ import re
7
+ import time
8
+
9
+ from utils.logger import log_agent_action
10
+
11
+ # Keywords for identifying sections
12
+ SECTION_KEYWORDS = {
13
+ 'data_collection': ['collect', 'gather', 'information we collect', 'personal data'],
14
+ 'data_sharing': ['share', 'third party', 'partners', 'disclose', 'sell'],
15
+ 'user_rights': ['your rights', 'opt-out', 'delete', 'access your data', 'gdpr', 'ccpa'],
16
+ 'data_retention': ['retain', 'retention', 'how long', 'keep your'],
17
+ 'security': ['security', 'protect', 'encryption', 'safeguard'],
18
+ 'cookies': ['cookie', 'tracking', 'analytics'],
19
+ }
20
+
21
+ # Red flag keywords
22
+ RED_FLAG_KEYWORDS = [
23
+ 'sell your data', 'sell your information', 'share with third parties',
24
+ 'advertising partners', 'indefinitely', 'without notice',
25
+ 'at our discretion', 'waive your right', 'arbitration', 'class action waiver'
26
+ ]
27
+
28
+
29
def chunk_text(text: str, chunk_size: int = 2000, overlap: int = 200) -> List[str]:
    """Split *text* into overlapping chunks of at most ``chunk_size`` characters.

    A chunk prefers to end on a paragraph boundary (``"\\n\\n"``) when one falls
    in the second half of the window, so paragraphs are not cut mid-way.
    Adjacent chunks share ``overlap`` characters of context.

    Args:
        text: The text to split.
        chunk_size: Maximum characters per chunk.
        overlap: Characters of shared context between adjacent chunks.

    Returns:
        A list of stripped chunk strings covering all of ``text``.
    """
    if len(text) <= chunk_size:
        return [text]

    # Guard: overlap >= chunk_size would keep `start` from ever advancing
    # (infinite loop); clamp it to a safe fraction of the chunk size.
    if overlap >= chunk_size:
        overlap = chunk_size // 4

    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size
        if end < len(text):
            # Prefer a paragraph boundary in the second half of the window.
            para_break = text.rfind('\n\n', start, end)
            if para_break > start + chunk_size // 2:
                end = para_break

        chunks.append(text[start:end].strip())

        # Step forward, but never backwards or in place: a paragraph break
        # close to `start` combined with a large overlap could otherwise
        # stall the loop forever.
        next_start = end - overlap
        if next_start <= start:
            next_start = end
        start = next_start

        # Equivalent to `end >= len(text)`: the last chunk reached the end.
        if start >= len(text) - overlap:
            break

    return chunks
51
+
52
+
53
def identify_sections(text: str) -> Dict[str, List[str]]:
    """Bucket paragraphs of *text* under policy section types.

    A paragraph is attributed to a section type when it contains any of
    that type's keywords (case-insensitive match). Paragraphs longer than
    500 characters are trimmed to an excerpt; duplicates are skipped.
    """
    found = {section: [] for section in SECTION_KEYWORDS}

    for para in re.split(r'\n{2,}', text):
        lowered = para.lower()
        for section, words in SECTION_KEYWORDS.items():
            if any(word in lowered for word in words):
                snippet = para[:500] + "..." if len(para) > 500 else para
                if snippet not in found[section]:
                    found[section].append(snippet)

    return found
69
+
70
+
71
def find_red_flags(text: str) -> List[Dict[str, str]]:
    """Locate red-flag phrases in *text* with their surrounding context.

    Only the first occurrence of each keyword is reported; the context
    window extends up to 100 characters on either side of the match.
    """
    lowered = text.lower()
    hits = []

    for phrase in RED_FLAG_KEYWORDS:
        position = lowered.find(phrase)
        if position == -1:
            continue
        left = max(0, position - 100)
        right = min(len(text), position + len(phrase) + 100)
        hits.append({'keyword': phrase, 'context': text[left:right].strip()})

    return hits
85
+
86
+
87
@tool("text_analyzer")
def text_analyzer_tool(text: str) -> str:
    """
    Analyzes policy text to identify key sections and potential concerns.

    Args:
        text: The policy text content to analyze

    Returns:
        Structured analysis with sections and red flags
    """
    start_time = time.time()

    # Reject inputs too small to contain meaningful policy content.
    if not text or len(text.strip()) < 100:
        error_msg = "Text too short for analysis"
        log_agent_action("Text Analyzer Tool", "Validation", f"Received {len(text) if text else 0} chars",
                        error_msg, time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"

    try:
        # Analyze overlapping chunks so long policies are fully covered.
        chunks = chunk_text(text)
        all_sections = {key: [] for key in SECTION_KEYWORDS}
        all_red_flags = []

        for chunk in chunks:
            sections = identify_sections(chunk)
            for key, excerpts in sections.items():
                all_sections[key].extend(excerpts)

            flags = find_red_flags(chunk)
            all_red_flags.extend(flags)

        # Deduplicate excerpts preserving first-seen order, keep at most 3.
        # The previous list(set(...)) made the ordering — and therefore
        # which 3 excerpts survived the cap — non-deterministic per run.
        for key in all_sections:
            all_sections[key] = list(dict.fromkeys(all_sections[key]))[:3]

        # Deduplicate red flags by keyword (overlapping chunks can report
        # the same phrase twice), capped at 10.
        seen_keywords = set()
        unique_flags = []
        for flag in all_red_flags:
            if flag['keyword'] not in seen_keywords:
                seen_keywords.add(flag['keyword'])
                unique_flags.append(flag)
        all_red_flags = unique_flags[:10]

        # Build result
        result_parts = ["=== POLICY ANALYSIS ===\n"]

        result_parts.append("## KEY SECTIONS:\n")
        for section_type, excerpts in all_sections.items():
            if excerpts:
                result_parts.append(f"\n### {section_type.upper().replace('_', ' ')}:")
                for i, excerpt in enumerate(excerpts, 1):
                    result_parts.append(f"{i}. {excerpt[:300]}...")

        result_parts.append("\n\n## POTENTIAL CONCERNS:\n")
        if all_red_flags:
            for i, flag in enumerate(all_red_flags, 1):
                result_parts.append(f"{i}. **{flag['keyword'].upper()}**")
                result_parts.append(f"   Context: \"{flag['context']}\"")
        else:
            result_parts.append("No major red flags identified.")

        result_parts.append(f"\n\n## STATS: {len(text)} chars, {len(chunks)} chunks, {len(all_red_flags)} concerns")

        result = "\n".join(result_parts)

        log_agent_action("Text Analyzer Tool", "Analysis", f"Analyzed {len(chunks)} chunks",
                        f"Found {len(all_red_flags)} concerns", time.time() - start_time, True)

        return result

    except Exception as e:
        error_msg = f"Analysis error: {str(e)}"
        log_agent_action("Text Analyzer Tool", "Analysis", "Processing text", error_msg,
                        time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"
tools/text_chunker.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text Chunker Tool - Splits and processes long policy texts
3
+ """
4
+ from crewai.tools import tool
5
+ from typing import List
6
+ import sys
7
+ import os
8
+
9
+ # Add parent directory to path for imports
10
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
11
+
12
+ from utils.logger import get_logs
13
+
14
+ logger = get_logs("TextChunkerTool")
15
+
16
+ # Configuration
17
+ DEFAULT_CHUNK_SIZE = 4000
18
+ DEFAULT_OVERLAP = 200
19
+
20
+
21
@tool("text_chunker")
def text_chunker(text: str, chunk_size: int = DEFAULT_CHUNK_SIZE) -> str:
    """
    Splits long text into manageable chunks for analysis.
    Use this tool when the policy text is too long to process at once.

    Args:
        text: The text to split into chunks
        chunk_size: Maximum size of each chunk (default 4000)

    Returns:
        Chunked text with section markers
    """
    logger.log_step("Starting text chunking", f"Input length: {len(text)}")

    if not text or len(text.strip()) == 0:
        logger.log_error("Empty text provided")
        return "Error: No text provided to chunk"

    # If text is short enough, return as is
    if len(text) <= chunk_size:
        logger.log_result("Chunking", "Text short enough, no chunking needed")
        return text

    chunks = []
    paragraphs = text.split('\n\n')
    current_chunk = ""
    chunk_num = 1

    for para in paragraphs:
        # If adding this paragraph would exceed chunk size, close out the
        # current chunk first.
        if len(current_chunk) + len(para) + 2 > chunk_size:
            if current_chunk:
                chunks.append(f"[Section {chunk_num}]\n{current_chunk.strip()}")
                chunk_num += 1
                current_chunk = ""

            # BUG FIX: previously an oversized paragraph was only split by
            # sentences when `current_chunk` happened to be empty; after a
            # flush it was kept whole, producing chunks larger than
            # chunk_size. Now the oversize check is made unconditionally.
            if len(para) <= chunk_size:
                current_chunk = para
            else:
                # Paragraph itself is too long, split by sentences
                sentences = para.replace('. ', '.\n').split('\n')
                for sentence in sentences:
                    if len(current_chunk) + len(sentence) + 1 > chunk_size:
                        if current_chunk:
                            chunks.append(f"[Section {chunk_num}]\n{current_chunk.strip()}")
                            chunk_num += 1
                        current_chunk = sentence
                    else:
                        current_chunk += " " + sentence if current_chunk else sentence
        else:
            current_chunk += "\n\n" + para if current_chunk else para

    # Add remaining content
    if current_chunk:
        chunks.append(f"[Section {chunk_num}]\n{current_chunk.strip()}")

    result = "\n\n---\n\n".join(chunks)

    logger.log_tool_call("text_chunker", "success")
    logger.log_result("Chunking", f"Split into {len(chunks)} sections")

    return result
81
+
82
+
83
@tool("extract_sections")
def extract_sections(text: str) -> str:
    """
    Extracts and identifies key sections from policy text.
    Looks for common policy sections like Privacy, Data Collection, User Rights, etc.

    Args:
        text: The policy text to analyze

    Returns:
        Identified sections with their content
    """
    logger.log_step("Extracting sections from policy")

    # Common section headers in policies
    section_keywords = [
        "privacy", "data collection", "data we collect", "information we collect",
        "how we use", "data use", "sharing", "third party", "third-party",
        "your rights", "user rights", "your choices", "opt-out", "opt out",
        "cookies", "tracking", "retention", "how long", "security",
        "children", "minors", "contact", "changes", "updates"
    ]

    lines = text.split('\n')
    sections = {}
    current_section = "Introduction"
    current_content = []

    def _save_current():
        # Persist accumulated lines under the current section name. When
        # the same header text appears more than once, append instead of
        # overwriting — the previous plain dict assignment silently
        # discarded the earlier occurrence's content.
        if not current_content:
            return
        body = '\n'.join(current_content)
        if current_section in sections:
            sections[current_section] += '\n' + body
        else:
            sections[current_section] = body

    for line in lines:
        line_lower = line.lower().strip()

        # A short line containing a known keyword is treated as a header.
        is_header = False
        for keyword in section_keywords:
            if keyword in line_lower and len(line) < 100:
                is_header = True
                _save_current()
                current_section = line.strip()
                current_content = []
                break

        if not is_header:
            current_content.append(line)

    # Save last section
    _save_current()

    # Format output
    result = "Identified Policy Sections:\n\n"
    for section_name, content in sections.items():
        preview = content[:300] + "..." if len(content) > 300 else content
        result += f"## {section_name}\n{preview}\n\n"

    logger.log_tool_call("extract_sections", "success")
    logger.log_result("Section extraction", f"Found {len(sections)} sections")

    return result
tools/web_scraper.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Web Scraper Tool - Fetches and extracts text from policy pages
3
+ """
4
+ import requests
5
+ from bs4 import BeautifulSoup
6
+ from crewai.tools import tool
7
+ import time
8
+
9
+ from utils.validators import validate_url, sanitize_text, truncate_content, validate_content_length
10
+ from utils.logger import log_agent_action
11
+
12
+ # Configuration
13
+ REQUEST_TIMEOUT = 30
14
+ MAX_RETRIES = 2
15
+ RETRY_DELAY = 2
16
+
17
+ HEADERS = {
18
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
19
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
20
+ 'Accept-Language': 'en-US,en;q=0.5',
21
+ }
22
+
23
+
24
def extract_text_from_html(html: str) -> str:
    """Extract clean, readable text from an HTML document.

    Removes scripts, styles and page chrome, prefers a recognizable
    main-content container when present, and drops leftover lines of two
    characters or fewer.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Drop non-content elements before extracting text.
    for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']):
        tag.decompose()

    # Prefer a dedicated main-content container when one exists.
    candidates = ['main', 'article', '[role="main"]', '.content', '.policy-content', '#content']
    main_content = next(
        (node for node in (soup.select_one(sel) for sel in candidates) if node),
        None,
    )
    if main_content is None:
        main_content = soup.body if soup.body else soup

    raw = main_content.get_text(separator='\n', strip=True)

    # Keep only lines with real content (more than two characters).
    kept = []
    for line in raw.split('\n'):
        stripped = line.strip()
        if stripped and len(stripped) > 2:
            kept.append(stripped)
    return '\n'.join(kept)
46
+
47
+
48
def get_page_title(html: str) -> str:
    """Best-effort page title: <title> first, then the first <h1>, else a placeholder."""
    soup = BeautifulSoup(html, 'html.parser')

    title_tag = soup.title
    if title_tag is not None and title_tag.string:
        return title_tag.string.strip()

    heading = soup.find('h1')
    if heading is not None:
        return heading.get_text(strip=True)

    return "Unknown Policy"
57
+
58
+
59
@tool("web_scraper")
def web_scraper_tool(url: str) -> str:
    """
    Scrapes text content from a policy webpage.

    Args:
        url: The URL of the policy page to scrape

    Returns:
        Extracted text content from the policy page
    """
    start_time = time.time()

    # Reject malformed or disallowed URLs before any network activity.
    is_valid, error_msg = validate_url(url)
    if not is_valid:
        log_agent_action("Web Scraper Tool", "URL Validation", f"URL provided", f"Failed: {error_msg}",
                        time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"

    try:
        # Fetch the page, retrying transient failures with a short delay.
        response = None
        attempt = 0
        while True:
            try:
                response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
                break
            except requests.exceptions.RequestException as exc:
                attempt += 1
                if attempt > MAX_RETRIES:
                    raise exc
                time.sleep(RETRY_DELAY)

        # Pull the title and body text out of the HTML, then clean it up.
        html = response.text
        title = get_page_title(html)
        content = sanitize_text(extract_text_from_html(html))

        # Refuse pages whose extracted text is outside acceptable bounds.
        is_valid, error_msg = validate_content_length(content)
        if not is_valid:
            log_agent_action("Web Scraper Tool", "Content Extraction", "HTML received", error_msg,
                            time.time() - start_time, False, error_msg)
            return f"Error: {error_msg}"

        content = truncate_content(content)
        word_count = len(content.split())

        log_agent_action("Web Scraper Tool", "Page Scraping", "URL fetched",
                        f"Extracted {word_count} words", time.time() - start_time, True)

        return f"TITLE: {title}\nWORD_COUNT: {word_count}\nCONTENT:\n{content}"

    except requests.exceptions.Timeout:
        error_msg = f"Request timed out after {REQUEST_TIMEOUT} seconds"
        log_agent_action("Web Scraper Tool", "Page Fetching", "Attempting fetch", error_msg,
                        time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"

    except requests.exceptions.HTTPError as e:
        error_msg = f"HTTP error: {e.response.status_code}"
        log_agent_action("Web Scraper Tool", "Page Fetching", "Attempting fetch", error_msg,
                        time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"

    except Exception as e:
        error_msg = f"Unexpected error: {str(e)}"
        log_agent_action("Web Scraper Tool", "Page Scraping", "Processing", error_msg,
                        time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"