Update app.py
Browse files
app.py
CHANGED
|
@@ -1,19 +1,18 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
from pdfminer.high_level import extract_text
|
| 3 |
-
import smtplib
|
| 4 |
from email.message import EmailMessage
|
| 5 |
from email_validator import validate_email, EmailNotValidError
|
| 6 |
-
import
|
| 7 |
from collections import Counter
|
| 8 |
import heapq
|
| 9 |
from fpdf import FPDF
|
| 10 |
-
import pandas as pd
|
| 11 |
import matplotlib.pyplot as plt
|
|
|
|
| 12 |
import requests
|
| 13 |
import subprocess
|
| 14 |
import sys
|
| 15 |
|
| 16 |
-
#
|
| 17 |
try:
|
| 18 |
import spacy
|
| 19 |
except ImportError:
|
|
@@ -33,38 +32,40 @@ RISK_WORDS = [
|
|
| 33 |
]
|
| 34 |
|
| 35 |
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
|
| 36 |
-
SENDER_EMAIL = "
|
| 37 |
-
SENDER_PASSWORD = "
|
| 38 |
|
|
|
|
| 39 |
def extract_text_from_pdf(uploaded_file):
|
| 40 |
return extract_text(uploaded_file)
|
| 41 |
|
|
|
|
| 42 |
def extract_key_clauses(text):
|
| 43 |
doc = nlp(text)
|
| 44 |
sentences = list(doc.sents)
|
| 45 |
clauses = [str(sentence).strip() for sentence in sentences if len(sentence) > 10]
|
| 46 |
return clauses[:10]
|
| 47 |
|
|
|
|
| 48 |
def summarize_text(text, num_sentences=5):
|
| 49 |
doc = nlp(text)
|
| 50 |
sentences = list(doc.sents)
|
| 51 |
word_frequencies = Counter([token.text.lower() for token in doc if token.is_alpha and not token.is_stop])
|
| 52 |
sentence_scores = {}
|
| 53 |
for sent in sentences:
|
| 54 |
-
sentence_score = 0
|
| 55 |
-
for word in sent:
|
| 56 |
-
if word.text.lower() in word_frequencies:
|
| 57 |
-
sentence_score += word_frequencies[word.text.lower()]
|
| 58 |
sentence_scores[sent] = sentence_score
|
| 59 |
summarized_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
|
| 60 |
summary = ' '.join([str(sentence) for sentence in summarized_sentences])
|
| 61 |
return summary
|
| 62 |
|
|
|
|
| 63 |
def detect_risks(text):
|
| 64 |
doc = nlp(text.lower())
|
| 65 |
detected_risks = [token.text for token in doc if token.text in RISK_WORDS]
|
| 66 |
return list(set(detected_risks))
|
| 67 |
|
|
|
|
| 68 |
def get_regulatory_updates():
|
| 69 |
predefined_updates = [
|
| 70 |
{"title": "New Compliance Guidelines", "summary": "SEC released new guidelines for regulatory compliance."},
|
|
@@ -74,11 +75,12 @@ def get_regulatory_updates():
|
|
| 74 |
try:
|
| 75 |
response = requests.get(url, headers=HEADERS)
|
| 76 |
response.raise_for_status()
|
| 77 |
-
updates = []
|
| 78 |
return updates if updates else predefined_updates
|
| 79 |
except requests.exceptions.RequestException:
|
| 80 |
return predefined_updates
|
| 81 |
|
|
|
|
| 82 |
def generate_pdf(summary, clauses, risks, updates, pdf_path="Analysis_Results.pdf"):
|
| 83 |
pdf = FPDF()
|
| 84 |
pdf.set_auto_page_break(auto=True, margin=15)
|
|
@@ -115,6 +117,7 @@ def generate_pdf(summary, clauses, risks, updates, pdf_path="Analysis_Results.pd
|
|
| 115 |
|
| 116 |
pdf.output(pdf_path)
|
| 117 |
|
|
|
|
| 118 |
def send_email(pdf_path, recipient_email):
|
| 119 |
msg = EmailMessage()
|
| 120 |
msg["Subject"] = "Legal Document Analysis Results"
|
|
@@ -131,6 +134,7 @@ def send_email(pdf_path, recipient_email):
|
|
| 131 |
server.login(SENDER_EMAIL, SENDER_PASSWORD)
|
| 132 |
server.send_message(msg)
|
| 133 |
|
|
|
|
| 134 |
def plot_word_frequencies(text):
|
| 135 |
doc = nlp(text)
|
| 136 |
word_frequencies = Counter([token.text.lower() for token in doc if token.is_alpha and not token.is_stop])
|
|
@@ -143,6 +147,7 @@ def plot_word_frequencies(text):
|
|
| 143 |
plt.ylabel("Frequency")
|
| 144 |
st.pyplot(plt)
|
| 145 |
|
|
|
|
| 146 |
def main():
|
| 147 |
st.title("Interactive Legal Document Analysis Dashboard")
|
| 148 |
|
|
@@ -154,52 +159,49 @@ def main():
|
|
| 154 |
recipient_email = st.text_input("Enter your email to receive the analysis results (optional)")
|
| 155 |
|
| 156 |
if st.button("Submit"):
|
| 157 |
-
if not
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
else:
|
| 160 |
-
st.
|
| 161 |
-
|
| 162 |
-
if uploaded_file is not None:
|
| 163 |
-
text = extract_text_from_pdf(uploaded_file)
|
| 164 |
-
|
| 165 |
-
summary, clauses, risks, updates = "", [], [], []
|
| 166 |
-
|
| 167 |
-
if "Summary" in features:
|
| 168 |
-
summary = summarize_text(text)
|
| 169 |
-
st.subheader("Summary")
|
| 170 |
-
st.write(summary)
|
| 171 |
-
|
| 172 |
-
if "Key Clauses" in features:
|
| 173 |
-
clauses = extract_key_clauses(text)
|
| 174 |
-
st.subheader("Key Clauses")
|
| 175 |
-
for i, clause in enumerate(clauses, 1):
|
| 176 |
-
st.write(f"{i}. {clause}")
|
| 177 |
-
|
| 178 |
-
if "Risk Detection" in features:
|
| 179 |
-
risks = detect_risks(text)
|
| 180 |
-
st.subheader("Detected Risks")
|
| 181 |
-
st.write(", ".join(risks) if risks else "No risks detected.")
|
| 182 |
-
|
| 183 |
-
if "Regulatory Updates" in features:
|
| 184 |
-
updates = get_regulatory_updates()
|
| 185 |
-
st.subheader("Regulatory Updates")
|
| 186 |
-
for update in updates:
|
| 187 |
-
st.write(f"- **{update.get('title')}**: {update.get('summary')}")
|
| 188 |
-
|
| 189 |
-
if "Data Visualization" in features:
|
| 190 |
-
st.subheader("Word Frequency Visualization")
|
| 191 |
-
plot_word_frequencies(text)
|
| 192 |
-
|
| 193 |
-
pdf_path = "Analysis_Results.pdf"
|
| 194 |
-
generate_pdf(summary, clauses, risks, updates, pdf_path)
|
| 195 |
-
|
| 196 |
-
if recipient_email:
|
| 197 |
-
try:
|
| 198 |
-
validate_email(recipient_email)
|
| 199 |
-
send_email(pdf_path, recipient_email)
|
| 200 |
-
st.success("Analysis PDF has been sent to your email.")
|
| 201 |
-
except EmailNotValidError:
|
| 202 |
-
st.error("Invalid email address. Please enter a valid email.")
|
| 203 |
|
| 204 |
if __name__ == "__main__":
|
| 205 |
main()
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
from pdfminer.high_level import extract_text
|
|
|
|
| 3 |
from email.message import EmailMessage
|
| 4 |
from email_validator import validate_email, EmailNotValidError
|
| 5 |
+
import smtplib
|
| 6 |
from collections import Counter
|
| 7 |
import heapq
|
| 8 |
from fpdf import FPDF
|
|
|
|
| 9 |
import matplotlib.pyplot as plt
|
| 10 |
+
import spacy
|
| 11 |
import requests
|
| 12 |
import subprocess
|
| 13 |
import sys
|
| 14 |
|
| 15 |
+
# Ensure spaCy is installed and 'en_core_web_sm' is loaded
|
| 16 |
try:
|
| 17 |
import spacy
|
| 18 |
except ImportError:
|
|
|
|
| 32 |
]
|
| 33 |
|
| 34 |
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
|
| 35 |
+
SENDER_EMAIL = "your_email@gmail.com" # Replace with your email
|
| 36 |
+
SENDER_PASSWORD = "your_app_password" # Replace with your app password
|
| 37 |
|
| 38 |
+
# Extract text from PDF
|
| 39 |
def extract_text_from_pdf(uploaded_file):
    """Return the text extracted from an uploaded PDF.

    Args:
        uploaded_file: The PDF handed straight to ``extract_text``
            (imported from ``pdfminer.high_level`` at the top of the file).

    Returns:
        Whatever ``extract_text`` returns for ``uploaded_file``.
    """
    pdf_text = extract_text(uploaded_file)
    return pdf_text
|
| 41 |
|
| 42 |
+
# Extract key clauses
|
| 43 |
def extract_key_clauses(text, max_clauses=10, min_length=10):
    """Return the first few sufficiently long sentences of ``text``.

    The text is run through the module-level ``nlp`` pipeline and its
    sentences are scanned in document order. A sentence is kept when
    ``len(sentence)`` is strictly greater than ``min_length``.
    NOTE(review): ``nlp`` is presumably a spaCy pipeline, in which case
    ``len()`` of a sentence span counts tokens, not characters -- confirm.

    Args:
        text: Raw document text to analyse.
        max_clauses: Maximum number of clauses to return. Defaults to 10,
            the value that was previously hard-coded.
        min_length: A sentence must be longer than this to be kept.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        A list of at most ``max_clauses`` stripped sentence strings, in the
        order they appear in the document.
    """
    doc = nlp(text)
    clauses = []
    for sentence in doc.sents:
        # Stop as soon as the cap is reached instead of collecting every
        # qualifying sentence and slicing afterwards.
        if len(clauses) >= max_clauses:
            break
        if len(sentence) > min_length:
            clauses.append(str(sentence).strip())
    return clauses
|
| 48 |
|
| 49 |
+
# Summarize text
|
| 50 |
def summarize_text(text, num_sentences=5):
    """Build an extractive summary from the highest-scoring sentences.

    Every alphabetic, non-stop-word token in the document is counted
    (lower-cased). A sentence's score is the sum of those counts over all
    of its tokens. The ``num_sentences`` best sentences are joined with
    single spaces, highest score first.

    Args:
        text: Raw document text to summarise.
        num_sentences: How many sentences to keep in the summary.

    Returns:
        The selected sentences joined into one string.
    """
    parsed = nlp(text)

    # Document-wide frequency of each meaningful (alphabetic, non-stop) word.
    frequency = Counter(
        tok.text.lower()
        for tok in parsed
        if tok.is_alpha and not tok.is_stop
    )

    # Keyed by sentence, in document order, so ties keep their original order.
    scores = {
        span: sum(frequency.get(tok.text.lower(), 0) for tok in span)
        for span in parsed.sents
    }

    best = heapq.nlargest(num_sentences, scores, key=scores.get)
    return ' '.join(map(str, best))
|
| 61 |
|
| 62 |
+
# Detect risks
|
| 63 |
def detect_risks(text):
    """Return the distinct risk words that occur in ``text``.

    The text is lower-cased, run through the module-level ``nlp``
    pipeline, and each token's text is checked against ``RISK_WORDS``.
    Because matching is per token, only single-token entries of
    ``RISK_WORDS`` can match.

    Args:
        text: Raw document text to scan.

    Returns:
        A sorted list of the unique matching tokens. Sorting keeps the
        output stable between runs; the previous ``list(set(...))`` order
        could change from one process to the next.
    """
    # Build the lookup set once so each token test is a hash lookup rather
    # than a scan of the RISK_WORDS list.
    risk_vocabulary = set(RISK_WORDS)
    doc = nlp(text.lower())
    found = {token.text for token in doc if token.text in risk_vocabulary}
    return sorted(found)
|
| 67 |
|
| 68 |
+
# Fetch regulatory updates
|
| 69 |
def get_regulatory_updates():
|
| 70 |
predefined_updates = [
|
| 71 |
{"title": "New Compliance Guidelines", "summary": "SEC released new guidelines for regulatory compliance."},
|
|
|
|
| 75 |
try:
|
| 76 |
response = requests.get(url, headers=HEADERS)
|
| 77 |
response.raise_for_status()
|
| 78 |
+
updates = [] # Placeholder for scraped updates
|
| 79 |
return updates if updates else predefined_updates
|
| 80 |
except requests.exceptions.RequestException:
|
| 81 |
return predefined_updates
|
| 82 |
|
| 83 |
+
# Generate PDF
|
| 84 |
def generate_pdf(summary, clauses, risks, updates, pdf_path="Analysis_Results.pdf"):
|
| 85 |
pdf = FPDF()
|
| 86 |
pdf.set_auto_page_break(auto=True, margin=15)
|
|
|
|
| 117 |
|
| 118 |
pdf.output(pdf_path)
|
| 119 |
|
| 120 |
+
# Send email
|
| 121 |
def send_email(pdf_path, recipient_email):
|
| 122 |
msg = EmailMessage()
|
| 123 |
msg["Subject"] = "Legal Document Analysis Results"
|
|
|
|
| 134 |
server.login(SENDER_EMAIL, SENDER_PASSWORD)
|
| 135 |
server.send_message(msg)
|
| 136 |
|
| 137 |
+
# Plot word frequencies
|
| 138 |
def plot_word_frequencies(text):
|
| 139 |
doc = nlp(text)
|
| 140 |
word_frequencies = Counter([token.text.lower() for token in doc if token.is_alpha and not token.is_stop])
|
|
|
|
| 147 |
plt.ylabel("Frequency")
|
| 148 |
st.pyplot(plt)
|
| 149 |
|
| 150 |
+
# Main function
|
| 151 |
def main():
|
| 152 |
st.title("Interactive Legal Document Analysis Dashboard")
|
| 153 |
|
|
|
|
| 159 |
recipient_email = st.text_input("Enter your email to receive the analysis results (optional)")
|
| 160 |
|
| 161 |
if st.button("Submit"):
|
| 162 |
+
if uploaded_file is not None:
|
| 163 |
+
text = extract_text_from_pdf(uploaded_file)
|
| 164 |
+
|
| 165 |
+
summary, clauses, risks, updates = "", [], [], []
|
| 166 |
+
|
| 167 |
+
if "Summary" in features:
|
| 168 |
+
summary = summarize_text(text)
|
| 169 |
+
st.subheader("Summary")
|
| 170 |
+
st.write(summary)
|
| 171 |
+
|
| 172 |
+
if "Key Clauses" in features:
|
| 173 |
+
clauses = extract_key_clauses(text)
|
| 174 |
+
st.subheader("Key Clauses")
|
| 175 |
+
for i, clause in enumerate(clauses, 1):
|
| 176 |
+
st.write(f"{i}. {clause}")
|
| 177 |
+
|
| 178 |
+
if "Risk Detection" in features:
|
| 179 |
+
risks = detect_risks(text)
|
| 180 |
+
st.subheader("Detected Risks")
|
| 181 |
+
st.write(", ".join(risks) if risks else "No risks detected.")
|
| 182 |
+
|
| 183 |
+
if "Regulatory Updates" in features:
|
| 184 |
+
updates = get_regulatory_updates()
|
| 185 |
+
st.subheader("Regulatory Updates")
|
| 186 |
+
for update in updates:
|
| 187 |
+
st.write(f"- **{update.get('title')}**: {update.get('summary')}")
|
| 188 |
+
|
| 189 |
+
if "Data Visualization" in features:
|
| 190 |
+
st.subheader("Word Frequency Visualization")
|
| 191 |
+
plot_word_frequencies(text)
|
| 192 |
+
|
| 193 |
+
pdf_path = "Analysis_Results.pdf"
|
| 194 |
+
generate_pdf(summary, clauses, risks, updates, pdf_path)
|
| 195 |
+
|
| 196 |
+
if recipient_email:
|
| 197 |
+
try:
|
| 198 |
+
validate_email(recipient_email)
|
| 199 |
+
send_email(pdf_path, recipient_email)
|
| 200 |
+
st.success("Analysis PDF has been sent to your email.")
|
| 201 |
+
except EmailNotValidError:
|
| 202 |
+
st.error("Invalid email address. Please enter a valid email.")
|
| 203 |
else:
|
| 204 |
+
st.error("Please upload a PDF document for analysis.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
if __name__ == "__main__":
|
| 207 |
main()
|