ShreeDeepthi committed on
Commit
005ea1c
·
verified ·
1 Parent(s): fcfffa3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -73
app.py CHANGED
@@ -1,28 +1,17 @@
1
  import streamlit as st
2
  from pdfminer.high_level import extract_text
 
3
  from email.message import EmailMessage
4
  from email_validator import validate_email, EmailNotValidError
5
- import smtplib
6
  from collections import Counter
7
  import heapq
8
  from fpdf import FPDF
 
9
  import matplotlib.pyplot as plt
10
- import spacy
11
  import requests
12
- import subprocess
13
- import sys
14
-
15
- # Ensure spaCy is installed and 'en_core_web_sm' is loaded
16
- try:
17
- import spacy
18
- except ImportError:
19
- subprocess.check_call([sys.executable, "-m", "pip", "install", "spacy"])
20
-
21
- try:
22
- spacy.load("en_core_web_sm")
23
- except OSError:
24
- subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
25
 
 
26
  nlp = spacy.load("en_core_web_sm")
27
 
28
  # Predefined risk-related words
@@ -32,41 +21,40 @@ RISK_WORDS = [
32
  ]
33
 
34
  HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
35
- SENDER_EMAIL = "your_email@gmail.com" # Replace with your email
36
- SENDER_PASSWORD = "your_app_password" # Replace with your app password
37
 
38
- # Extract text from PDF
39
  def extract_text_from_pdf(uploaded_file):
40
  return extract_text(uploaded_file)
41
 
42
- # Extract key clauses
43
  def extract_key_clauses(text):
44
  doc = nlp(text)
45
  sentences = list(doc.sents)
46
  clauses = [str(sentence).strip() for sentence in sentences if len(sentence) > 10]
47
- return clauses[:10]
48
 
49
- # Summarize text
50
  def summarize_text(text, num_sentences=5):
51
  doc = nlp(text)
52
  sentences = list(doc.sents)
53
  word_frequencies = Counter([token.text.lower() for token in doc if token.is_alpha and not token.is_stop])
54
  sentence_scores = {}
55
  for sent in sentences:
56
- sentence_score = sum(word_frequencies.get(word.text.lower(), 0) for word in sent)
 
 
 
57
  sentence_scores[sent] = sentence_score
58
  summarized_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
59
  summary = ' '.join([str(sentence) for sentence in summarized_sentences])
60
  return summary
61
 
62
- # Detect risks
63
  def detect_risks(text):
64
  doc = nlp(text.lower())
65
  detected_risks = [token.text for token in doc if token.text in RISK_WORDS]
66
  return list(set(detected_risks))
67
 
68
- # Fetch regulatory updates
69
  def get_regulatory_updates():
 
70
  predefined_updates = [
71
  {"title": "New Compliance Guidelines", "summary": "SEC released new guidelines for regulatory compliance."},
72
  {"title": "Update on Financial Risks", "summary": "New policies to mitigate risks in the financial sector."},
@@ -75,12 +63,11 @@ def get_regulatory_updates():
75
  try:
76
  response = requests.get(url, headers=HEADERS)
77
  response.raise_for_status()
78
- updates = [] # Placeholder for scraped updates
79
  return updates if updates else predefined_updates
80
  except requests.exceptions.RequestException:
81
  return predefined_updates
82
 
83
- # Generate PDF
84
  def generate_pdf(summary, clauses, risks, updates, pdf_path="Analysis_Results.pdf"):
85
  pdf = FPDF()
86
  pdf.set_auto_page_break(auto=True, margin=15)
@@ -89,11 +76,13 @@ def generate_pdf(summary, clauses, risks, updates, pdf_path="Analysis_Results.pd
89
 
90
  pdf.cell(200, 10, txt="Legal Document Analysis Results", ln=True, align="C")
91
 
 
92
  pdf.ln(10)
93
  pdf.cell(200, 10, txt="Summary", ln=True, align="L")
94
  pdf.set_font("Arial", size=10)
95
  pdf.multi_cell(0, 10, summary)
96
 
 
97
  pdf.ln(10)
98
  pdf.set_font("Arial", size=12)
99
  pdf.cell(200, 10, txt="Key Clauses", ln=True, align="L")
@@ -101,12 +90,14 @@ def generate_pdf(summary, clauses, risks, updates, pdf_path="Analysis_Results.pd
101
  for clause in clauses:
102
  pdf.multi_cell(0, 10, f"- {clause}")
103
 
 
104
  pdf.ln(10)
105
  pdf.set_font("Arial", size=12)
106
  pdf.cell(200, 10, txt="Detected Risks", ln=True, align="L")
107
  pdf.set_font("Arial", size=10)
108
  pdf.multi_cell(0, 10, ", ".join(risks))
109
 
 
110
  pdf.ln(10)
111
  pdf.set_font("Arial", size=12)
112
  pdf.cell(200, 10, txt="Regulatory Updates", ln=True, align="L")
@@ -117,7 +108,6 @@ def generate_pdf(summary, clauses, risks, updates, pdf_path="Analysis_Results.pd
117
 
118
  pdf.output(pdf_path)
119
 
120
- # Send email
121
  def send_email(pdf_path, recipient_email):
122
  msg = EmailMessage()
123
  msg["Subject"] = "Legal Document Analysis Results"
@@ -134,74 +124,82 @@ def send_email(pdf_path, recipient_email):
134
  server.login(SENDER_EMAIL, SENDER_PASSWORD)
135
  server.send_message(msg)
136
 
137
- # Plot word frequencies
138
- def plot_word_frequencies(text):
139
- doc = nlp(text)
140
- word_frequencies = Counter([token.text.lower() for token in doc if token.is_alpha and not token.is_stop])
141
- top_words = word_frequencies.most_common(10)
142
- words, frequencies = zip(*top_words)
143
- plt.figure(figsize=(10, 5))
144
- plt.bar(words, frequencies, color="skyblue")
145
- plt.title("Top 10 Word Frequencies")
146
- plt.xlabel("Words")
147
- plt.ylabel("Frequency")
148
- st.pyplot(plt)
149
-
150
- # Main function
151
  def main():
152
  st.title("Interactive Legal Document Analysis Dashboard")
153
 
 
154
  st.sidebar.title("Options")
155
  features = st.sidebar.multiselect("Select Features",
156
  ["Data Visualization", "Summary", "Key Clauses", "Risk Detection", "Regulatory Updates"])
157
 
 
158
  uploaded_file = st.file_uploader("Upload a legal document (PDF)", type="pdf")
159
  recipient_email = st.text_input("Enter your email to receive the analysis results (optional)")
160
 
161
- if st.button("Submit"):
162
- if uploaded_file is not None:
163
  text = extract_text_from_pdf(uploaded_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
- summary, clauses, risks, updates = "", [], [], []
166
-
167
- if "Summary" in features:
168
- summary = summarize_text(text)
169
- st.subheader("Summary")
170
- st.write(summary)
171
-
172
- if "Key Clauses" in features:
173
- clauses = extract_key_clauses(text)
174
- st.subheader("Key Clauses")
175
- for i, clause in enumerate(clauses, 1):
176
- st.write(f"{i}. {clause}")
177
-
178
- if "Risk Detection" in features:
179
- risks = detect_risks(text)
180
- st.subheader("Detected Risks")
181
- st.write(", ".join(risks) if risks else "No risks detected.")
182
-
183
- if "Regulatory Updates" in features:
184
- updates = get_regulatory_updates()
185
- st.subheader("Regulatory Updates")
186
- for update in updates:
187
- st.write(f"- **{update.get('title')}**: {update.get('summary')}")
188
 
189
- if "Data Visualization" in features:
190
- st.subheader("Word Frequency Visualization")
191
- plot_word_frequencies(text)
 
 
192
 
 
 
193
  pdf_path = "Analysis_Results.pdf"
194
  generate_pdf(summary, clauses, risks, updates, pdf_path)
 
 
195
 
 
196
  if recipient_email:
197
  try:
198
  validate_email(recipient_email)
199
  send_email(pdf_path, recipient_email)
200
- st.success("Analysis PDF has been sent to your email.")
201
  except EmailNotValidError:
202
- st.error("Invalid email address. Please enter a valid email.")
203
- else:
204
- st.error("Please upload a PDF document for analysis.")
205
 
206
  if __name__ == "__main__":
207
  main()
 
1
  import streamlit as st
2
  from pdfminer.high_level import extract_text
3
+ import smtplib
4
  from email.message import EmailMessage
5
  from email_validator import validate_email, EmailNotValidError
6
+ import spacy
7
  from collections import Counter
8
  import heapq
9
  from fpdf import FPDF
10
+ import pandas as pd
11
  import matplotlib.pyplot as plt
 
12
  import requests
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
+ # Load spaCy model
15
  nlp = spacy.load("en_core_web_sm")
16
 
17
  # Predefined risk-related words
 
21
  ]
22
 
23
  HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
24
+ SENDER_EMAIL = "shreedeepthi2005@gmail.com"
25
+ SENDER_PASSWORD = "qntm oher jqfz oflt"
26
 
 
27
  def extract_text_from_pdf(uploaded_file):
28
  return extract_text(uploaded_file)
29
 
 
30
  def extract_key_clauses(text):
31
  doc = nlp(text)
32
  sentences = list(doc.sents)
33
  clauses = [str(sentence).strip() for sentence in sentences if len(sentence) > 10]
34
+ return clauses[:10] # Return top 10 clauses for simplicity
35
 
 
36
  def summarize_text(text, num_sentences=5):
37
  doc = nlp(text)
38
  sentences = list(doc.sents)
39
  word_frequencies = Counter([token.text.lower() for token in doc if token.is_alpha and not token.is_stop])
40
  sentence_scores = {}
41
  for sent in sentences:
42
+ sentence_score = 0
43
+ for word in sent:
44
+ if word.text.lower() in word_frequencies:
45
+ sentence_score += word_frequencies[word.text.lower()]
46
  sentence_scores[sent] = sentence_score
47
  summarized_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
48
  summary = ' '.join([str(sentence) for sentence in summarized_sentences])
49
  return summary
50
 
 
51
  def detect_risks(text):
52
  doc = nlp(text.lower())
53
  detected_risks = [token.text for token in doc if token.text in RISK_WORDS]
54
  return list(set(detected_risks))
55
 
 
56
  def get_regulatory_updates():
57
+ # Fallback: Pre-defined updates
58
  predefined_updates = [
59
  {"title": "New Compliance Guidelines", "summary": "SEC released new guidelines for regulatory compliance."},
60
  {"title": "Update on Financial Risks", "summary": "New policies to mitigate risks in the financial sector."},
 
63
  try:
64
  response = requests.get(url, headers=HEADERS)
65
  response.raise_for_status()
66
+ updates = [] # Placeholder for parsed updates (needs a proper parsing method)
67
  return updates if updates else predefined_updates
68
  except requests.exceptions.RequestException:
69
  return predefined_updates
70
 
 
71
  def generate_pdf(summary, clauses, risks, updates, pdf_path="Analysis_Results.pdf"):
72
  pdf = FPDF()
73
  pdf.set_auto_page_break(auto=True, margin=15)
 
76
 
77
  pdf.cell(200, 10, txt="Legal Document Analysis Results", ln=True, align="C")
78
 
79
+ # Summary
80
  pdf.ln(10)
81
  pdf.cell(200, 10, txt="Summary", ln=True, align="L")
82
  pdf.set_font("Arial", size=10)
83
  pdf.multi_cell(0, 10, summary)
84
 
85
+ # Key Clauses
86
  pdf.ln(10)
87
  pdf.set_font("Arial", size=12)
88
  pdf.cell(200, 10, txt="Key Clauses", ln=True, align="L")
 
90
  for clause in clauses:
91
  pdf.multi_cell(0, 10, f"- {clause}")
92
 
93
+ # Risks
94
  pdf.ln(10)
95
  pdf.set_font("Arial", size=12)
96
  pdf.cell(200, 10, txt="Detected Risks", ln=True, align="L")
97
  pdf.set_font("Arial", size=10)
98
  pdf.multi_cell(0, 10, ", ".join(risks))
99
 
100
+ # Regulatory Updates
101
  pdf.ln(10)
102
  pdf.set_font("Arial", size=12)
103
  pdf.cell(200, 10, txt="Regulatory Updates", ln=True, align="L")
 
108
 
109
  pdf.output(pdf_path)
110
 
 
111
  def send_email(pdf_path, recipient_email):
112
  msg = EmailMessage()
113
  msg["Subject"] = "Legal Document Analysis Results"
 
124
  server.login(SENDER_EMAIL, SENDER_PASSWORD)
125
  server.send_message(msg)
126
 
127
def visualize_key_clauses_frequency(clauses):
    """Render a horizontal bar chart of how often each clause text occurs.

    Args:
        clauses: Iterable of clause strings. Identical strings are counted
            together.

    When ``clauses`` is empty a short message is written to the page instead
    of a chart.
    """
    common_clauses = Counter(clauses).most_common()
    if not common_clauses:
        st.write("No key clauses to visualize.")
        return

    labels, values = zip(*common_clauses)
    # Use an explicit Figure rather than pyplot's implicit global one: the
    # script is re-executed on every interaction, and a figure that is never
    # closed would accumulate in pyplot's figure registry.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(labels, values, color='skyblue')
    ax.set_xlabel('Frequency')
    ax.set_title('Key Clauses Frequency')
    st.pyplot(fig)
    plt.close(fig)
139
+
 
140
  def main():
141
  st.title("Interactive Legal Document Analysis Dashboard")
142
 
143
+ # Sidebar options
144
  st.sidebar.title("Options")
145
  features = st.sidebar.multiselect("Select Features",
146
  ["Data Visualization", "Summary", "Key Clauses", "Risk Detection", "Regulatory Updates"])
147
 
148
+ # File upload
149
  uploaded_file = st.file_uploader("Upload a legal document (PDF)", type="pdf")
150
  recipient_email = st.text_input("Enter your email to receive the analysis results (optional)")
151
 
152
+ if uploaded_file is not None:
153
+ try:
154
  text = extract_text_from_pdf(uploaded_file)
155
+ st.success("Text extracted successfully!")
156
+ except Exception as e:
157
+ st.error(f"Error extracting text from PDF: {e}")
158
+ return
159
+
160
+ summary = ""
161
+ clauses, risks, updates = [], [], []
162
+
163
+ if "Summary" in features:
164
+ summary = summarize_text(text)
165
+ st.subheader("Summary")
166
+ st.write(summary)
167
+
168
+ if "Key Clauses" in features:
169
+ clauses = extract_key_clauses(text)
170
+ st.subheader("Key Clauses")
171
+ for i, clause in enumerate(clauses, 1):
172
+ st.write(f"{i}. {clause}")
173
+ # Visualize key clauses frequency
174
+ if "Data Visualization" in features:
175
+ visualize_key_clauses_frequency(clauses)
176
 
177
+ if "Risk Detection" in features:
178
+ risks = detect_risks(text)
179
+ st.subheader("Detected Risks")
180
+ st.write(", ".join(risks) if risks else "No risks detected.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
+ if "Regulatory Updates" in features:
183
+ updates = get_regulatory_updates()
184
+ st.subheader("Regulatory Updates")
185
+ for update in updates:
186
+ st.write(f"- **{update.get('title')}**: {update.get('summary')}")
187
 
188
+ # Generate PDF and send via email if required
189
+ if st.button("Generate PDF Report"):
190
  pdf_path = "Analysis_Results.pdf"
191
  generate_pdf(summary, clauses, risks, updates, pdf_path)
192
+ with open(pdf_path, "rb") as file:
193
+ st.download_button("Download PDF Report", file, file_name="Analysis_Results.pdf", mime="application/pdf")
194
 
195
+ # Email PDF
196
  if recipient_email:
197
  try:
198
  validate_email(recipient_email)
199
  send_email(pdf_path, recipient_email)
200
+ st.success(f"PDF sent to {recipient_email} successfully!")
201
  except EmailNotValidError:
202
+ st.error("Invalid email address. Please enter a valid one.")
 
 
203
 
204
  # Entry point: call main() only when this file is executed as a script.
  if __name__ == "__main__":
205
  main()