|
|
import heapq
import os
import smtplib
import subprocess
import sys
from collections import Counter
from email.message import EmailMessage

import matplotlib.pyplot as plt
import pandas as pd
import requests
import spacy
import streamlit as st
from email_validator import validate_email, EmailNotValidError
from fpdf import FPDF
from pdfminer.high_level import extract_text
|
|
|
|
|
|
|
|
# Bootstrap spaCy and its small English model, installing either on demand.
# NOTE: the file also imports spaCy unconditionally near the top; if that
# import fails, execution never reaches this fallback.
try:
    import spacy
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "spacy"])
    # Re-import after installing; without this `spacy` would stay unbound
    # and the model load below would fail with NameError.
    import spacy

# Load the pipeline exactly once and keep the result, instead of loading it
# once as a probe and then a second time for real.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # The model package is missing: download it, then retry the load.
    subprocess.check_call(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"]
    )
    nlp = spacy.load("en_core_web_sm")
|
|
|
|
|
|
|
|
# Lower-case terms whose presence in a document is reported as a risk.
RISK_WORDS = [
    "fraud", "penalty", "violation", "risk", "lawsuit", "breach",
    "noncompliance", "litigation", "regulatory", "fine",
]

# Browser-like User-Agent header (presumably meant for `requests` calls;
# nothing in this file uses it yet).
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# SECURITY: credentials must not live in source control. The environment
# variables SENDER_EMAIL / SENDER_PASSWORD take precedence; the literals below
# are kept only as a backward-compatible fallback. The password has already
# been exposed in this file, so it should be rotated and the fallback removed.
SENDER_EMAIL = os.environ.get("SENDER_EMAIL", "shreedeepthi2005@gmail.com")
SENDER_PASSWORD = os.environ.get("SENDER_PASSWORD", "qntm oher jqfz oflt")
|
|
|
|
|
def extract_text_from_pdf(uploaded_file):
    """Return the text that pdfminer's ``extract_text`` reads from *uploaded_file*.

    The argument is handed to ``extract_text`` unchanged; any exception it
    raises propagates to the caller.
    """
    pdf_text = extract_text(uploaded_file)
    return pdf_text
|
|
|
|
|
def extract_key_clauses(text, max_clauses=10, min_tokens=10):
    """Return the first sufficiently long sentences of *text*.

    Args:
        text: Document text to split into sentences with the spaCy pipeline.
        max_clauses: Maximum number of sentences to return (default 10,
            the previously hard-coded limit).
        min_tokens: A sentence is kept only when it contains MORE than this
            many tokens (default 10, the previously hard-coded threshold).

    Returns:
        A list of whitespace-stripped sentence strings in document order;
        empty when *text* is empty or blank.
    """
    if not text or not text.strip():
        return []

    doc = nlp(text)
    # len(span) counts tokens, not characters.
    clauses = [
        str(sentence).strip()
        for sentence in doc.sents
        if len(sentence) > min_tokens
    ]
    return clauses[:max_clauses]
|
|
|
|
|
def summarize_text(text, num_sentences=5):
    """Build an extractive summary from the highest-scoring sentences.

    Each sentence is scored by summing the document-wide frequency of its
    alphabetic, non-stop-word tokens. The ``num_sentences`` best sentences
    are selected and then emitted in their original document order, so the
    summary reads in the same sequence as the source text.

    Args:
        text: Document text to summarize.
        num_sentences: Number of sentences to include (default 5).

    Returns:
        The selected sentences joined with single spaces; ``""`` when *text*
        is empty or blank.
    """
    if not text or not text.strip():
        return ""

    doc = nlp(text)
    word_frequencies = Counter(
        token.text.lower()
        for token in doc
        if token.is_alpha and not token.is_stop
    )
    # Counter returns 0 for unseen words (stop words, punctuation), so those
    # tokens do not contribute to a sentence's score.
    sentence_scores = {
        sent: sum(word_frequencies[token.text.lower()] for token in sent)
        for sent in doc.sents
    }
    best = heapq.nlargest(
        num_sentences, sentence_scores, key=sentence_scores.get
    )
    # nlargest yields score order; restore document order for readability.
    best.sort(key=lambda sent: sent.start)
    return " ".join(str(sent) for sent in best)
|
|
|
|
|
def detect_risks(text):
    """Return the distinct risk terms from ``RISK_WORDS`` that occur in *text*.

    Matching is case-insensitive and works on whole tokens as produced by the
    spaCy tokenizer.

    Args:
        text: Document text to scan.

    Returns:
        Alphabetically sorted list of matched terms (stable across runs);
        empty when nothing matches or *text* is empty.
    """
    if not text:
        return []

    risk_terms = set(RISK_WORDS)  # O(1) membership per token
    # Only token texts are needed, so run the tokenizer alone instead of the
    # whole tagging/parsing pipeline.
    doc = nlp.make_doc(text.lower())
    return sorted({token.text for token in doc if token.text in risk_terms})
|
|
|
|
|
def get_regulatory_updates():
    """Return the built-in list of regulatory updates.

    Returns:
        A fresh list of dicts, each with a ``"title"`` and a ``"summary"``
        key, in a fixed order.
    """
    entries = (
        ("π New Compliance Guidelines",
         "SEC released new guidelines for regulatory compliance."),
        ("βοΈ Update on Financial Risks",
         "New policies to mitigate risks in the financial sector."),
    )
    return [{"title": title, "summary": summary} for title, summary in entries]
|
|
|
|
|
def visualize_key_clauses_frequency(clauses):
    """Render a horizontal bar chart of how often each clause occurs.

    Args:
        clauses: Iterable of clause strings (may be empty).

    Side effects:
        Writes either the chart or a "nothing to visualize" note to the
        Streamlit page.
    """
    common_clauses = Counter(clauses).most_common()
    if not common_clauses:
        st.write("π« No key clauses to visualize.")
        return

    labels, values = zip(*common_clauses)
    # Draw on an explicit figure rather than pyplot's global state, and close
    # it afterwards so figures do not accumulate across Streamlit reruns.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.barh(labels, values, color='skyblue')
        ax.set_xlabel('Frequency')
        ax.set_title('π Key Clauses Frequency')
        st.pyplot(fig)
    finally:
        plt.close(fig)
|
|
|
|
|
def _latin1_safe(value):
    """Return *value* as text with non-Latin-1 characters replaced by '?'.

    FPDF's built-in fonts can only encode Latin-1 text.
    """
    return str(value).encode("latin-1", "replace").decode("latin-1")


def _build_report_pdf(summary, clauses, risks, updates):
    """Render the analysis results into a PDF and return its raw bytes."""
    import os
    import tempfile

    pdf = FPDF()
    pdf.add_page()

    def add_section(title, lines):
        pdf.set_font("Helvetica", "B", 14)
        # Reset x before every multi_cell: some FPDF versions leave the
        # cursor at the right margin after a multi_cell call.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, _latin1_safe(title))
        pdf.set_font("Helvetica", "", 11)
        for line in lines:
            if not line:
                continue
            pdf.set_x(pdf.l_margin)
            pdf.multi_cell(0, 7, _latin1_safe(line))
        pdf.ln(4)

    add_section("Legal Document Analysis Report", [])
    if summary:
        add_section("Summary", [summary])
    if clauses:
        add_section(
            "Key Clauses",
            [f"{i}. {clause}" for i, clause in enumerate(clauses, 1)],
        )
    if risks:
        add_section("Detected Risks", [", ".join(risks)])
    if updates:
        add_section(
            "Regulatory Updates",
            [f"- {u.get('title')}: {u.get('summary')}" for u in updates],
        )
    if not (summary or clauses or risks or updates):
        add_section("Results", ["No analysis results were produced."])

    # Writing to a named file is the output mode the code relies on here; a
    # private temp directory avoids clashes between concurrent sessions.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_file = os.path.join(tmp_dir, "Analysis_Results.pdf")
        pdf.output(pdf_file)
        with open(pdf_file, "rb") as handle:
            return handle.read()


def _send_report_email(recipient, pdf_bytes, filename):
    """Email the PDF report to *recipient* using the configured sender account.

    Raises:
        smtplib.SMTPException / OSError: on connection, login or send failure.
    """
    message = EmailMessage()
    message["Subject"] = "Legal Document Analysis Report"
    message["From"] = SENDER_EMAIL
    message["To"] = recipient
    message.set_content("Please find the analysis report attached.")
    message.add_attachment(
        pdf_bytes, maintype="application", subtype="pdf", filename=filename
    )
    # NOTE(review): the Gmail SMTP endpoint is assumed from the sender
    # address's domain -- confirm before deploying with another provider.
    with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
        server.login(SENDER_EMAIL, SENDER_PASSWORD)
        server.send_message(message)


def main():
    """Run the Streamlit dashboard.

    Lets the user upload a PDF, choose analysis features in the sidebar,
    shows the selected results, and on request builds a PDF report that can
    be downloaded and, when a valid address was entered, emailed.
    """
    # Feature labels are defined once so the sidebar options and the checks
    # below cannot drift apart.
    feat_visualization = "π Data Visualization"
    feat_summary = "π Summary"
    feat_clauses = "π Key Clauses"
    feat_risks = "β οΈ Risk Detection"
    feat_updates = "βοΈ Regulatory Updates"

    st.title("π Interactive Legal Document Analysis Dashboard")
    st.sidebar.title("βοΈ Options")
    features = st.sidebar.multiselect(
        "π Select Features",
        [feat_visualization, feat_summary, feat_clauses, feat_risks, feat_updates],
    )
    uploaded_file = st.file_uploader("π Upload a legal document (PDF)", type="pdf")
    recipient_email = st.text_input(
        "π§ Enter your email to receive the analysis results (optional)"
    ).strip()

    if uploaded_file is None:
        return

    try:
        text = extract_text_from_pdf(uploaded_file)
    except Exception as e:
        st.error(f"β Error extracting text from PDF: {e}")
        return
    st.success("✅ Text extracted successfully!")

    summary, clauses, risks, updates = "", [], [], []

    if feat_summary in features:
        summary = summarize_text(text)
        st.subheader("π Summary")
        st.write(summary)

    # The chart needs the clauses even when the clause list itself is not
    # displayed, so compute them when either feature is selected.
    if feat_clauses in features or feat_visualization in features:
        clauses = extract_key_clauses(text)
    if feat_clauses in features:
        st.subheader("π Key Clauses")
        for i, clause in enumerate(clauses, 1):
            st.write(f"{i}. {clause}")
    if feat_visualization in features:
        visualize_key_clauses_frequency(clauses)

    if feat_risks in features:
        risks = detect_risks(text)
        st.subheader("β οΈ Detected Risks")
        st.write(", ".join(risks) if risks else "✅ No risks detected.")

    if feat_updates in features:
        updates = get_regulatory_updates()
        st.subheader("βοΈ Regulatory Updates")
        for update in updates:
            st.write(f"- **{update.get('title')}**: {update.get('summary')}")

    if not st.button("π Generate PDF Report"):
        return

    pdf_name = "Analysis_Results.pdf"
    # Actually build the report; previously a file that was never written
    # was opened here, which raised FileNotFoundError.
    try:
        pdf_bytes = _build_report_pdf(summary, clauses, risks, updates)
    except Exception as e:
        st.error(f"❌ Error generating PDF report: {e}")
        return

    st.success("π₯ PDF Report Ready! Download Below")
    st.download_button(
        "π₯ Download PDF Report",
        pdf_bytes,
        file_name=pdf_name,
        mime="application/pdf",
    )

    if not recipient_email:
        return

    try:
        validate_email(recipient_email)
    except EmailNotValidError:
        st.error("β Invalid email address. Please enter a valid one.")
        return

    # Report success only after the message has really been handed to the
    # SMTP server.
    try:
        _send_report_email(recipient_email, pdf_bytes, pdf_name)
    except (smtplib.SMTPException, OSError) as e:
        st.error(f"❌ Error sending email: {e}")
    else:
        st.success(f"π§ PDF sent to {recipient_email} successfully!")
|
|
|
|
|
# Start the dashboard only when this file is the entry script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|
|
|