ShreeDeepthi committed on
Commit
84cab31
·
verified ·
1 Parent(s): fa3fa6e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +200 -190
app.py CHANGED
@@ -1,190 +1,200 @@
1
- import streamlit as st
2
- from pdfminer.high_level import extract_text
3
- import smtplib
4
- from email.message import EmailMessage
5
- from email_validator import validate_email, EmailNotValidError
6
- import spacy
7
- from collections import Counter
8
- import heapq
9
- from fpdf import FPDF
10
- import pandas as pd
11
- import matplotlib.pyplot as plt
12
- import requests
13
-
14
- # Load spaCy model
15
- nlp = spacy.load("en_core_web_sm")
16
-
17
- # Predefined risk-related words
18
- RISK_WORDS = [
19
- "fraud", "penalty", "violation", "risk", "lawsuit", "breach",
20
- "noncompliance", "litigation", "regulatory", "fine"
21
- ]
22
-
23
- HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
24
- SENDER_EMAIL = "shreedeepthi2005@gmail.com"
25
- SENDER_PASSWORD = "qntm oher jqfz oflt"
26
-
27
- def extract_text_from_pdf(uploaded_file):
28
- return extract_text(uploaded_file)
29
-
30
- def extract_key_clauses(text):
31
- doc = nlp(text)
32
- sentences = list(doc.sents)
33
- clauses = [str(sentence).strip() for sentence in sentences if len(sentence) > 10]
34
- return clauses[:10] # Return top 10 clauses for simplicity
35
-
36
- def summarize_text(text, num_sentences=5):
37
- doc = nlp(text)
38
- sentences = list(doc.sents)
39
- word_frequencies = Counter([token.text.lower() for token in doc if token.is_alpha and not token.is_stop])
40
- sentence_scores = {}
41
- for sent in sentences:
42
- sentence_score = 0
43
- for word in sent:
44
- if word.text.lower() in word_frequencies:
45
- sentence_score += word_frequencies[word.text.lower()]
46
- sentence_scores[sent] = sentence_score
47
- summarized_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
48
- summary = ' '.join([str(sentence) for sentence in summarized_sentences])
49
- return summary
50
-
51
- def detect_risks(text):
52
- doc = nlp(text.lower())
53
- detected_risks = [token.text for token in doc if token.text in RISK_WORDS]
54
- return list(set(detected_risks))
55
-
56
- def get_regulatory_updates():
57
- # Fallback: Pre-defined updates
58
- predefined_updates = [
59
- {"title": "New Compliance Guidelines", "summary": "SEC released new guidelines for regulatory compliance."},
60
- {"title": "Update on Financial Risks", "summary": "New policies to mitigate risks in the financial sector."},
61
- ]
62
- url = "https://www.sec.gov/newsroom/press-releases"
63
- try:
64
- response = requests.get(url, headers=HEADERS)
65
- response.raise_for_status()
66
- updates = [] # Placeholder for parsed updates (needs a proper parsing method)
67
- # Process response.content with BeautifulSoup or similar parser if allowed
68
- return updates if updates else predefined_updates
69
- except requests.exceptions.RequestException:
70
- return predefined_updates
71
-
72
- def generate_pdf(summary, clauses, risks, updates, pdf_path="Analysis_Results.pdf"):
73
- pdf = FPDF()
74
- pdf.set_auto_page_break(auto=True, margin=15)
75
- pdf.add_page()
76
- pdf.set_font("Arial", size=12)
77
-
78
- pdf.cell(200, 10, txt="Legal Document Analysis Results", ln=True, align="C")
79
-
80
- # Summary
81
- pdf.ln(10)
82
- pdf.cell(200, 10, txt="Summary", ln=True, align="L")
83
- pdf.set_font("Arial", size=10)
84
- pdf.multi_cell(0, 10, summary)
85
-
86
- # Key Clauses
87
- pdf.ln(10)
88
- pdf.set_font("Arial", size=12)
89
- pdf.cell(200, 10, txt="Key Clauses", ln=True, align="L")
90
- pdf.set_font("Arial", size=10)
91
- for clause in clauses:
92
- pdf.multi_cell(0, 10, f"- {clause}")
93
-
94
- # Risks
95
- pdf.ln(10)
96
- pdf.set_font("Arial", size=12)
97
- pdf.cell(200, 10, txt="Detected Risks", ln=True, align="L")
98
- pdf.set_font("Arial", size=10)
99
- pdf.multi_cell(0, 10, ", ".join(risks))
100
-
101
- # Regulatory Updates
102
- pdf.ln(10)
103
- pdf.set_font("Arial", size=12)
104
- pdf.cell(200, 10, txt="Regulatory Updates", ln=True, align="L")
105
- pdf.set_font("Arial", size=10)
106
- if updates:
107
- for update in updates:
108
- pdf.multi_cell(0, 10, f"- {update.get('title', 'N/A')}: {update.get('summary', 'N/A')}")
109
-
110
- pdf.output(pdf_path)
111
-
112
- def send_email(pdf_path, recipient_email):
113
- msg = EmailMessage()
114
- msg["Subject"] = "Legal Document Analysis Results"
115
- msg["From"] = SENDER_EMAIL
116
- msg["To"] = recipient_email
117
-
118
- msg.set_content("Please find attached the analysis results PDF.")
119
-
120
- with open(pdf_path, "rb") as file:
121
- msg.add_attachment(file.read(), maintype="application", subtype="pdf", filename="Analysis_Results.pdf")
122
-
123
- with smtplib.SMTP("smtp.gmail.com", 587) as server:
124
- server.starttls()
125
- server.login(SENDER_EMAIL, SENDER_PASSWORD)
126
- server.send_message(msg)
127
-
128
- def main():
129
- st.title("Interactive Legal Document Analysis Dashboard")
130
-
131
- # Sidebar options
132
- st.sidebar.title("Options")
133
- features = st.sidebar.multiselect("Select Features",
134
- ["Data Visualization", "Summary", "Key Clauses", "Risk Detection", "Regulatory Updates"])
135
-
136
- # File upload
137
- uploaded_file = st.file_uploader("Upload a legal document (PDF)", type="pdf")
138
- recipient_email = st.text_input("Enter your email to receive the analysis results (optional)")
139
-
140
- if uploaded_file is not None:
141
- try:
142
- text = extract_text_from_pdf(uploaded_file)
143
- st.success("Text extracted successfully!")
144
- except Exception as e:
145
- st.error(f"Error extracting text from PDF: {e}")
146
- return
147
-
148
- summary = ""
149
- clauses, risks, updates = [], [], []
150
-
151
- if "Summary" in features:
152
- summary = summarize_text(text)
153
- st.subheader("Summary")
154
- st.write(summary)
155
-
156
- if "Key Clauses" in features:
157
- clauses = extract_key_clauses(text)
158
- st.subheader("Key Clauses")
159
- for i, clause in enumerate(clauses, 1):
160
- st.write(f"{i}. {clause}")
161
-
162
- if "Risk Detection" in features:
163
- risks = detect_risks(text)
164
- st.subheader("Detected Risks")
165
- st.write(", ".join(risks) if risks else "No risks detected.")
166
-
167
- if "Regulatory Updates" in features:
168
- updates = get_regulatory_updates()
169
- st.subheader("Regulatory Updates")
170
- for update in updates:
171
- st.write(f"- **{update.get('title')}**: {update.get('summary')}")
172
-
173
- # Generate PDF
174
- if st.button("Generate PDF Report"):
175
- pdf_path = "Analysis_Results.pdf"
176
- generate_pdf(summary, clauses, risks, updates, pdf_path)
177
- with open(pdf_path, "rb") as file:
178
- st.download_button("Download PDF Report", file, file_name="Analysis_Results.pdf", mime="application/pdf")
179
-
180
- # Email PDF
181
- if recipient_email:
182
- try:
183
- validate_email(recipient_email)
184
- send_email(pdf_path, recipient_email)
185
- st.success(f"PDF sent to {recipient_email} successfully!")
186
- except EmailNotValidError:
187
- st.error("Invalid email address. Please enter a valid one.")
188
-
189
- if __name__ == "__main__":
190
- main()
 
 
 
 
 
 
 
 
 
 
 
1
import heapq
import os
import smtplib
import ssl
import subprocess
import sys
from collections import Counter
from email.message import EmailMessage

import matplotlib.pyplot as plt
import pandas as pd
import requests
import spacy
import streamlit as st
from email_validator import validate_email, EmailNotValidError
from fpdf import FPDF
from pdfminer.high_level import extract_text
15
+
16
# Load the spaCy English model once at import time. The `spacy` package itself
# is already imported at the top of the file, so the only thing that can be
# missing at this point is the model package; spacy.load reports that by
# raising OSError.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Fresh environment: download the model with the interpreter that is
    # running the app, then retry the load.
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")
26
+
27
# Lower-case keywords that mark a passage as potentially risky. They are
# compared against lower-cased document tokens, so keep them lower-case.
RISK_WORDS = [
    "fraud", "penalty", "violation", "risk", "lawsuit", "breach",
    "noncompliance", "litigation", "regulatory", "fine",
]

# Browser-like User-Agent sent with outgoing HTTP requests.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# SMTP credentials are read from the environment so that no secret lives in
# the repository. Set SENDER_PASSWORD (a Gmail app password) before starting
# the app; without it the e-mail feature fails at login.
# NOTE(review): the app password that was previously hard-coded here has been
# published in the commit history and must be revoked.
SENDER_EMAIL = os.environ.get("SENDER_EMAIL", "shreedeepthi2005@gmail.com")
SENDER_PASSWORD = os.environ.get("SENDER_PASSWORD", "")
36
+
37
def extract_text_from_pdf(uploaded_file):
    """Return the plain text of a PDF.

    ``uploaded_file`` is handed unchanged to pdfminer's ``extract_text`` and
    its result is returned as-is.
    """
    pdf_text = extract_text(uploaded_file)
    return pdf_text
39
+
40
def extract_key_clauses(text, max_clauses=10, min_tokens=10):
    """Return the first sufficiently long sentences of ``text``.

    No ranking is performed: sentences are taken in document order and the
    scan stops as soon as ``max_clauses`` have been collected.

    Args:
        text: Document text; it is split into sentences by the spaCy pipeline.
        max_clauses: Maximum number of clauses to return (default 10).
        min_tokens: A sentence is kept only when it contains more than this
            many spaCy tokens (default 10).

    Returns:
        A list of whitespace-stripped sentence strings.
    """
    if max_clauses <= 0:
        return []

    clauses = []
    for sentence in nlp(text).sents:
        # len() of a spaCy sentence span is its token count, not characters.
        if len(sentence) > min_tokens:
            clauses.append(str(sentence).strip())
            if len(clauses) == max_clauses:
                break
    return clauses
45
+
46
def summarize_text(text, num_sentences=5):
    """Build an extractive summary from the highest-scoring sentences.

    Each sentence is scored by summing the document-wide frequency of its
    alphabetic, non-stop-word tokens. The ``num_sentences`` best sentences are
    selected and then emitted in their original document order so the summary
    reads coherently.

    Args:
        text: Document text to summarise.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The selected sentences, stripped and joined by single spaces; an
        empty string when the text yields no sentences.
    """
    doc = nlp(text)
    word_frequencies = Counter(
        token.text.lower()
        for token in doc
        if token.is_alpha and not token.is_stop
    )

    # (score, position, sentence); a Counter yields 0 for words it never saw,
    # so no membership test is needed before the lookup.
    scored_sentences = [
        (
            sum(word_frequencies[token.text.lower()] for token in sent),
            position,
            sent,
        )
        for position, sent in enumerate(doc.sents)
    ]

    best = heapq.nlargest(num_sentences, scored_sentences, key=lambda item: item[0])
    # Restore document order among the selected sentences.
    best.sort(key=lambda item: item[1])
    return ' '.join(str(sent).strip() for _, _, sent in best)
60
+
61
def detect_risks(text):
    """Return the risk keywords that occur as tokens in ``text``.

    Matching is case-insensitive (the text is lower-cased before tokenising)
    and works on whole tokens only.

    Args:
        text: Document text to scan.

    Returns:
        The matching entries of ``RISK_WORDS``, without duplicates and in the
        order they are listed in ``RISK_WORDS`` so the result is stable
        between runs.
    """
    # Only tokens are needed, so run the tokenizer alone instead of the full
    # tagging/parsing pipeline.
    tokens_seen = {token.text for token in nlp.make_doc(text.lower())}
    return [word for word in RISK_WORDS if word in tokens_seen]
65
+
66
def get_regulatory_updates(timeout=10):
    """Return regulatory updates as a list of ``{"title", "summary"}`` dicts.

    The SEC press-release page is requested, but its content is not parsed
    yet, so the predefined fallback entries are what callers currently
    receive. The fallback is also returned when the request fails.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the app.

    Returns:
        A non-empty list of update dictionaries.
    """
    # Fallback: Pre-defined updates
    predefined_updates = [
        {"title": "New Compliance Guidelines", "summary": "SEC released new guidelines for regulatory compliance."},
        {"title": "Update on Financial Risks", "summary": "New policies to mitigate risks in the financial sector."},
    ]
    url = "https://www.sec.gov/newsroom/press-releases"
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # Covers connection errors, timeouts and HTTP error statuses.
        return predefined_updates

    # TODO: parse response.content (e.g. with an HTML parser) into updates.
    updates = []
    return updates if updates else predefined_updates
81
+
82
def _latin1_safe(text):
    """Return ``text`` with characters outside Latin-1 replaced by '?'."""
    return str(text).encode("latin-1", "replace").decode("latin-1")


def _add_pdf_section(pdf, heading, lines):
    """Write a 12pt section heading followed by each of ``lines`` in 10pt."""
    pdf.ln(10)
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt=heading, ln=True, align="L")
    pdf.set_font("Arial", size=10)
    for line in lines:
        pdf.multi_cell(0, 10, _latin1_safe(line))


def generate_pdf(summary, clauses, risks, updates, pdf_path="Analysis_Results.pdf"):
    """Write the analysis results to a PDF report.

    The report has a centred title and four sections: Summary, Key Clauses,
    Detected Risks and Regulatory Updates.

    Args:
        summary: Summary text (may be empty).
        clauses: Iterable of clause strings; each is written as a "- " bullet.
        risks: Iterable of risk keywords; written comma-separated on one line.
        updates: Iterable of dicts with optional "title" and "summary" keys,
            or a falsy value for no updates.
        pdf_path: Destination path of the generated file.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt="Legal Document Analysis Results", ln=True, align="C")

    # Text extracted from PDFs often contains curly quotes, dashes and other
    # characters the built-in Arial font cannot encode; the section helper
    # replaces them so writing the report does not fail on such input.
    _add_pdf_section(pdf, "Summary", [summary])
    _add_pdf_section(pdf, "Key Clauses", [f"- {clause}" for clause in clauses])
    _add_pdf_section(pdf, "Detected Risks", [", ".join(risks)])
    _add_pdf_section(
        pdf,
        "Regulatory Updates",
        [
            f"- {update.get('title', 'N/A')}: {update.get('summary', 'N/A')}"
            for update in (updates or [])
        ],
    )

    pdf.output(pdf_path)
121
+
122
def send_email(pdf_path, recipient_email):
    """E-mail the PDF report to ``recipient_email`` through Gmail's SMTP server.

    Args:
        pdf_path: Path of the PDF file to attach.
        recipient_email: Destination address.

    Raises:
        OSError: If the PDF cannot be read or the connection fails or times out.
        smtplib.SMTPException: If the TLS upgrade, login or delivery fails.
    """
    msg = EmailMessage()
    msg["Subject"] = "Legal Document Analysis Results"
    msg["From"] = SENDER_EMAIL
    msg["To"] = recipient_email
    msg.set_content("Please find attached the analysis results PDF.")

    with open(pdf_path, "rb") as file:
        msg.add_attachment(file.read(), maintype="application", subtype="pdf", filename="Analysis_Results.pdf")

    # An explicit default context makes STARTTLS verify the server certificate
    # and host name before the credentials are sent.
    tls_context = ssl.create_default_context()
    # The timeout keeps an unreachable mail server from blocking the app.
    with smtplib.SMTP("smtp.gmail.com", 587, timeout=30) as server:
        server.starttls(context=tls_context)
        server.login(SENDER_EMAIL, SENDER_PASSWORD)
        server.send_message(msg)
137
+
138
def main():
    """Render the Streamlit dashboard and run the selected analyses.

    The user uploads a PDF, picks features in the sidebar, and can generate a
    PDF report that is offered for download and, when an address was entered,
    e-mailed as well.
    """
    st.title("Interactive Legal Document Analysis Dashboard")

    # Sidebar options
    # NOTE: "Data Visualization" is offered here but nothing below acts on it yet.
    st.sidebar.title("Options")
    features = st.sidebar.multiselect(
        "Select Features",
        ["Data Visualization", "Summary", "Key Clauses", "Risk Detection", "Regulatory Updates"],
    )

    # File upload
    uploaded_file = st.file_uploader("Upload a legal document (PDF)", type="pdf")
    recipient_email = st.text_input("Enter your email to receive the analysis results (optional)")

    if uploaded_file is None:
        return

    try:
        text = extract_text_from_pdf(uploaded_file)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return

    if text.strip():
        st.success("Text extracted successfully!")
    else:
        # Image-only (scanned) PDFs yield no text; say so instead of
        # reporting success and showing empty results.
        st.warning("No text could be extracted from this PDF.")

    summary = ""
    clauses, risks, updates = [], [], []

    if "Summary" in features:
        summary = summarize_text(text)
        st.subheader("Summary")
        st.write(summary)

    if "Key Clauses" in features:
        clauses = extract_key_clauses(text)
        st.subheader("Key Clauses")
        for i, clause in enumerate(clauses, 1):
            st.write(f"{i}. {clause}")

    if "Risk Detection" in features:
        risks = detect_risks(text)
        st.subheader("Detected Risks")
        st.write(", ".join(risks) if risks else "No risks detected.")

    if "Regulatory Updates" in features:
        updates = get_regulatory_updates()
        st.subheader("Regulatory Updates")
        for update in updates:
            st.write(f"- **{update.get('title')}**: {update.get('summary')}")

    # Generate PDF
    if st.button("Generate PDF Report"):
        pdf_path = "Analysis_Results.pdf"
        generate_pdf(summary, clauses, risks, updates, pdf_path)
        with open(pdf_path, "rb") as file:
            st.download_button("Download PDF Report", file, file_name="Analysis_Results.pdf", mime="application/pdf")

        # Email PDF
        if recipient_email:
            try:
                validate_email(recipient_email)
            except EmailNotValidError:
                st.error("Invalid email address. Please enter a valid one.")
            else:
                try:
                    send_email(pdf_path, recipient_email)
                except (smtplib.SMTPException, OSError) as e:
                    # Login, TLS or network failures must not crash the app.
                    st.error(f"Could not send the email: {e}")
                else:
                    st.success(f"PDF sent to {recipient_email} successfully!")


if __name__ == "__main__":
    main()