Nadasr committed on
Commit
7fae465
·
verified ·
1 Parent(s): ba11860

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +73 -0
  2. crew.py +127 -0
  3. evaluation.ipynb +180 -0
  4. requirements.txt +25 -0
app.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Policy Summarizer - Gradio App
3
+ """
4
+ import gradio as gr
5
+ from dotenv import load_dotenv
6
+
7
+ from crew import run_policy_analysis
8
+ from utils.validators import validate_url, is_likely_policy_url
9
+ from utils.logger import format_logs_for_display, clear_logs
10
+
11
+ load_dotenv()
12
+
13
def process_policy(url: str):
    """Validate *url*, run the CrewAI policy analysis, and return the pair
    (summary_markdown, logs_markdown) consumed by the Gradio outputs."""
    clear_logs()

    # Reject malformed or unsafe URLs before doing any network work.
    ok, error_msg = validate_url(url)
    if not ok:
        return f"❌ **Invalid URL:** {error_msg}", "Validation failed"

    # Heuristic hint shown above the summary when the URL does not look
    # like a policy page; analysis still proceeds.
    warning = "" if is_likely_policy_url(url) else "⚠️ This URL may not be a policy page.\n\n"

    try:
        summary = run_policy_analysis(url)
    except Exception as exc:
        # Surface the failure in the summary tab; keep whatever logs exist.
        return f"❌ **Error:** {str(exc)}", format_logs_for_display()
    return warning + summary, format_logs_for_display()
34
+
35
+
36
# ---------------------------------------------------------------------------
# Gradio UI: URL input plus analyze button, feeding Summary / Logs tabs.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Policy Summarizer") as app:

    gr.Markdown("# 🔍 Policy Summarizer")
    gr.Markdown("""
    Paste a link to any Privacy Policy or Terms of Service, and AI agents will:
    - 📄 **Summarize** the key points
    - ✅ **Highlight** your rights
    - ⚠️ **Warn** about concerns
    """)

    # Wide textbox next to a narrow primary button.
    with gr.Row():
        policy_url_box = gr.Textbox(
            label="Policy URL",
            placeholder="https://example.com/privacy-policy",
            scale=4,
        )
        run_button = gr.Button("🔍 Analyze", variant="primary", scale=1)

    gr.Markdown("### Examples:")
    for example_line in (
        "- https://discord.com/privacy",
        "- https://www.spotify.com/legal/privacy-policy/",
    ):
        gr.Markdown(example_line)

    # Results: rendered summary in one tab, agent activity logs in the other.
    with gr.Tabs():
        with gr.TabItem("📋 Summary"):
            summary_view = gr.Markdown(value="*Enter a URL and click Analyze*")
        with gr.TabItem("📊 Logs"):
            logs_view = gr.Markdown(value="*Logs appear here*")

    run_button.click(
        fn=process_policy,
        inputs=[policy_url_box],
        outputs=[summary_view, logs_view],
    )

if __name__ == "__main__":
    # Bind 0.0.0.0 so the app is reachable when run inside a container/Space.
    app.launch(server_name="0.0.0.0", server_port=7860)
73
+
crew.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CrewAI Configuration - Policy Summarizer
3
+ """
4
+ import os
5
+ from crewai import Agent, Task, Crew, Process
6
+ from tools.web_scraper import web_scraper_tool
7
+ from tools.text_analyzer import text_analyzer_tool
8
+ from utils.logger import log_agent_action, clear_logs
9
+
10
+
11
def create_agents():
    """Instantiate the three CrewAI agents used by the pipeline.

    Returns:
        tuple: ``(orchestrator, scraper, analyzer)`` in that order, matching
        what ``run_policy_analysis`` unpacks.
    """
    # Delegating coordinator that assembles the final user-facing summary.
    coordinator = Agent(
        role="Policy Analysis Orchestrator",
        goal="Coordinate the policy analysis and create a user-friendly summary",
        backstory="""You are an expert at analyzing legal documents and presenting
        complex information in simple terms. You coordinate the analysis workflow.""",
        verbose=True,
        allow_delegation=True,
    )

    # Fetches and cleans the raw policy text; no delegation, single tool.
    fetcher = Agent(
        role="Web Content Scraper",
        goal="Extract clean policy text from web URLs",
        backstory="""You specialize in web scraping and content extraction.
        You can extract policy text while filtering out irrelevant content.""",
        verbose=True,
        allow_delegation=False,
        tools=[web_scraper_tool],
    )

    # Legal-analysis specialist; no delegation, single tool.
    reviewer = Agent(
        role="Policy Analyzer",
        goal="Analyze policies to identify key points, rights, and concerns",
        backstory="""You are a legal expert who analyzes terms of service and
        privacy policies. You identify user rights and potential red flags.""",
        verbose=True,
        allow_delegation=False,
        tools=[text_analyzer_tool],
    )

    return coordinator, fetcher, reviewer
44
+
45
+
46
def create_tasks(orchestrator, scraper, analyzer, url: str):
    """Wire up the three tasks of the pipeline (scrape -> analyze -> summarize).

    Args:
        orchestrator: agent that writes the final summary.
        scraper: agent that fetches the policy text.
        analyzer: agent that extracts rights and concerns.
        url: policy page to analyze (interpolated into the scrape prompt).

    Returns:
        list: tasks in execution order, for a sequential Crew.
    """
    fetch = Task(
        description=f"""
        Scrape the policy content from: {url}
        Use the web_scraper_tool to fetch and extract the text.
        Return the full policy text content.
        """,
        expected_output="The extracted policy text content",
        agent=scraper,
    )

    review = Task(
        description="""
        Analyze the scraped policy content:
        1. Use text_analyzer_tool to identify key sections
        2. Find user rights (deletion, access, opt-out, etc.)
        3. Identify concerns and red flags
        4. Note data collection and sharing practices
        """,
        expected_output="Structured analysis with sections, rights, and concerns",
        agent=analyzer,
        context=[fetch],  # consumes the scraped text
    )

    digest = Task(
        description="""
        Create a user-friendly summary with these sections:

        ## 📄 Policy Summary
        [3-5 key points about this policy]

        ## ✅ Your Rights
        [List user rights with brief explanations]

        ## ⚠️ Concerns & Warnings
        [List red flags with severity: 🔴 High, 🟡 Medium, 🟢 Low]

        ## 💡 Recommendation
        [Overall assessment and advice]

        Use simple language, avoid legal jargon.
        """,
        expected_output="A formatted, user-friendly policy summary",
        agent=orchestrator,
        context=[fetch, review],  # sees both raw text and analysis
    )

    return [fetch, review, digest]
96
+
97
+
98
def run_policy_analysis(url: str) -> str:
    """Run the full scrape -> analyze -> summarize pipeline for *url*.

    Returns the final markdown summary on success, or a string prefixed
    with "❌ Error:" on any failure (the caller renders it directly in
    the UI rather than catching exceptions).
    """
    clear_logs()

    # Record the kickoff; note only the URL *length* is logged, not the URL.
    log_agent_action(
        agent_name="System",
        action="Starting Analysis",
        input_summary=f"URL length: {len(url)}",
        output_summary="Initializing agents...",
        duration_seconds=0,
        success=True,
    )

    try:
        coordinator, fetcher, reviewer = create_agents()
        pipeline = Crew(
            agents=[coordinator, fetcher, reviewer],
            tasks=create_tasks(coordinator, fetcher, reviewer, url),
            process=Process.sequential,  # tasks run strictly in order
            verbose=True,
        )
        return str(pipeline.kickoff())
    except Exception as e:
        # UI boundary: report the failure as displayable text.
        return f"❌ Error: {str(e)}"
evaluation.ipynb ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from datetime import datetime\n",
10
+ "from utils.validators import validate_url, is_likely_policy_url\n",
11
+ "\n"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "TEST_URLS = [\n",
21
+ " {\"company\": \"Discord\", \"url\": \"https://discord.com/privacy\"},\n",
22
+ " {\"company\": \"Spotify\", \"url\": \"https://www.spotify.com/legal/privacy-policy/\"},\n",
23
+ " {\"company\": \"Reddit\", \"url\": \"https://www.reddit.com/policies/privacy-policy\"},\n",
24
+ " {\"company\": \"Netflix\", \"url\": \"https://www.netflix.com/privacy\"},\n",
25
+ " {\"company\": \"Twitter\", \"url\": \"https://twitter.com/en/tos\"},\n",
26
+ " {\"company\": \"TikTok\", \"url\": \"https://www.tiktok.com/legal/privacy-policy\"},\n",
27
+ " {\"company\": \"LinkedIn\", \"url\": \"https://www.linkedin.com/legal/privacy-policy\"},\n",
28
+ " {\"company\": \"Google\", \"url\": \"https://policies.google.com/privacy\"},\n",
29
+ " {\"company\": \"Apple\", \"url\": \"https://www.apple.com/legal/privacy/\"},\n",
30
+ " {\"company\": \"Amazon\", \"url\": \"https://www.amazon.com/gp/help/customer/display.html\"}\n",
31
+ "]\n",
32
+ "\n",
33
+ "BAD_URLS = [\n",
34
+ " {\"name\": \"Empty\", \"url\": \"\"},\n",
35
+ " {\"name\": \"No protocol\", \"url\": \"google.com\"},\n",
36
+ " {\"name\": \"Localhost\", \"url\": \"http://localhost/test\"},\n",
37
+ " {\"name\": \"Private IP\", \"url\": \"http://192.168.1.1/page\"}\n",
38
+ "]"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 3,
44
+ "metadata": {},
45
+ "outputs": [
46
+ {
47
+ "data": {
48
+ "text/plain": [
49
+ "{'Discord': {'valid': True, 'is_policy': True, 'status': 'PASS'},\n",
50
+ " 'Spotify': {'valid': True, 'is_policy': True, 'status': 'PASS'},\n",
51
+ " 'Reddit': {'valid': True, 'is_policy': True, 'status': 'PASS'},\n",
52
+ " 'Netflix': {'valid': True, 'is_policy': True, 'status': 'PASS'},\n",
53
+ " 'Twitter': {'valid': True, 'is_policy': True, 'status': 'PASS'},\n",
54
+ " 'TikTok': {'valid': True, 'is_policy': True, 'status': 'PASS'},\n",
55
+ " 'LinkedIn': {'valid': True, 'is_policy': True, 'status': 'PASS'},\n",
56
+ " 'Google': {'valid': True, 'is_policy': True, 'status': 'PASS'},\n",
57
+ " 'Apple': {'valid': True, 'is_policy': True, 'status': 'PASS'},\n",
58
+ " 'Amazon': {'valid': True, 'is_policy': False, 'status': 'PASS'}}"
59
+ ]
60
+ },
61
+ "execution_count": 3,
62
+ "metadata": {},
63
+ "output_type": "execute_result"
64
+ }
65
+ ],
66
+ "source": [
67
+ "url_results = {}\n",
68
+ "\n",
69
+ "for test in TEST_URLS:\n",
70
+ " valid, msg = validate_url(test['url'])\n",
71
+ " is_policy = is_likely_policy_url(test['url'])\n",
72
+ " url_results[test['company']] = {\n",
73
+ " \"valid\": valid,\n",
74
+ " \"is_policy\": is_policy,\n",
75
+ " \"status\": \"PASS\" if valid else \"FAIL\"\n",
76
+ " }\n",
77
+ "\n",
78
+ "url_results"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": 4,
84
+ "metadata": {},
85
+ "outputs": [
86
+ {
87
+ "data": {
88
+ "text/plain": [
89
+ "{'Empty': {'blocked': True,\n",
90
+ " 'message': 'URL cannot be empty',\n",
91
+ " 'status': 'PASS'},\n",
92
+ " 'No protocol': {'blocked': True,\n",
93
+ " 'message': 'Invalid URL format. Must start with http:// or https://',\n",
94
+ " 'status': 'PASS'},\n",
95
+ " 'Localhost': {'blocked': True,\n",
96
+ " 'message': 'Cannot scrape localhost or private addresses',\n",
97
+ " 'status': 'PASS'},\n",
98
+ " 'Private IP': {'blocked': False, 'message': '', 'status': 'FAIL'}}"
99
+ ]
100
+ },
101
+ "execution_count": 4,
102
+ "metadata": {},
103
+ "output_type": "execute_result"
104
+ }
105
+ ],
106
+ "source": [
107
+ "safety_results = {}\n",
108
+ "\n",
109
+ "for test in BAD_URLS:\n",
110
+ " valid, msg = validate_url(test['url'])\n",
111
+ " safety_results[test['name']] = {\n",
112
+ " \"blocked\": not valid,\n",
113
+ " \"message\": msg,\n",
114
+ " \"status\": \"PASS\" if not valid else \"FAIL\"\n",
115
+ " }\n",
116
+ "\n",
117
+ "safety_results"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": 6,
123
+ "metadata": {},
124
+ "outputs": [
125
+ {
126
+ "data": {
127
+ "text/plain": [
128
+ "{'date': '2026-02-12 20:27',\n",
129
+ " 'url_validation': '10/10',\n",
130
+ " 'safety_tests': '3/4',\n",
131
+ " 'overall': 'PASS'}"
132
+ ]
133
+ },
134
+ "execution_count": 6,
135
+ "metadata": {},
136
+ "output_type": "execute_result"
137
+ }
138
+ ],
139
+ "source": [
140
+ "url_passed = sum(1 for r in url_results.values() if r['status'] == 'PASS')\n",
141
+ "safety_passed = sum(1 for r in safety_results.values() if r['status'] == 'PASS')\n",
142
+ "\n",
143
+ "{\n",
144
+ " \"date\": datetime.now().strftime(\"%Y-%m-%d %H:%M\"),\n",
145
+ " \"url_validation\": f\"{url_passed}/10\",\n",
146
+ " \"safety_tests\": f\"{safety_passed}/4\",\n",
147
+ " \"overall\": \"PASS\" if url_passed >= 8 and safety_passed >=3 else \"FAIL\"\n",
148
+ "}"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": null,
154
+ "metadata": {},
155
+ "outputs": [],
156
+ "source": []
157
+ }
158
+ ],
159
+ "metadata": {
160
+ "kernelspec": {
161
+ "display_name": "Python 3",
162
+ "language": "python",
163
+ "name": "python3"
164
+ },
165
+ "language_info": {
166
+ "codemirror_mode": {
167
+ "name": "ipython",
168
+ "version": 3
169
+ },
170
+ "file_extension": ".py",
171
+ "mimetype": "text/x-python",
172
+ "name": "python",
173
+ "nbconvert_exporter": "python",
174
+ "pygments_lexer": "ipython3",
175
+ "version": "3.12.10"
176
+ }
177
+ },
178
+ "nbformat": 4,
179
+ "nbformat_minor": 4
180
+ }
requirements.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Policy Summarizer - Requirements
2
+ # Install with: pip install -r requirements.txt
3
+
4
+ # CrewAI Framework
5
+ crewai
6
+ crewai-tools
7
+ # LangChain for LLM
8
+ langchain>=0.1.0
9
+ langchain-openai>=0.0.5
10
+
11
+ # Web Scraping
12
+ requests>=2.31.0
13
+ beautifulsoup4>=4.12.0
14
+
15
+ # UI
16
+ gradio>=4.0.0
17
+
18
+ # Data Validation
19
+ pydantic>=2.0.0
20
+
21
+ # Environment Variables
22
+ python-dotenv>=1.0.0
23
+
24
+ # Utilities
25
+ tenacity>=8.2.0 # For retries