LDS_AI / app.py
cyrus-spc's picture
Upload app.py
46031ff verified
import streamlit as st
from pdfminer.high_level import extract_text
import spacy
from collections import Counter
import heapq
from fpdf import FPDF
import matplotlib.pyplot as plt
import re
import pandas as pd
@st.cache_resource
def load_spacy():
    """Load the ``en_core_web_sm`` spaCy pipeline, or return ``None``.

    When ``spacy.load`` raises ``OSError`` a Streamlit warning is shown and
    ``None`` is returned, which the rest of the module treats as "use the
    basic text-processing fallback".
    """
    try:
        model = spacy.load("en_core_web_sm")
    except OSError:
        st.warning("⚠️ spaCy model not found. Using basic text processing.")
        model = None
    return model
# Module-level spaCy pipeline shared by the analysis functions in this file;
# it is None when load_spacy() could not load the model, in which case those
# functions take their regex/string-based fallback paths.
nlp = load_spacy()
# Lower-case vocabulary that detect_risks() looks for in a document.
RISK_WORDS = [
    "fraud",
    "penalty",
    "violation",
    "risk",
    "lawsuit",
    "breach",
    "noncompliance",
    "litigation",
    "regulatory",
    "fine",
]
def extract_text_from_pdf(uploaded_file):
    """Return the text that pdfminer's ``extract_text`` yields for *uploaded_file*.

    The argument is passed to ``extract_text`` unchanged; any exception it
    raises propagates to the caller.
    """
    pdf_text = extract_text(uploaded_file)
    return pdf_text
@st.cache_data
def process_text(text):
    """Run the shared spaCy pipeline over *text*.

    Returns the object produced by calling ``nlp(text)``, or ``None`` when no
    spaCy model is loaded. Results are memoised by ``st.cache_data``.
    """
    return nlp(text) if nlp is not None else None
def extract_key_clauses(text):
    """Return up to ten sentence-like clauses taken from the start of *text*.

    With a spaCy model loaded, sentences come from ``doc.sents`` and are kept
    when ``len(sentence) > 10`` (the length of the spaCy sentence object).
    Without a model, the text is split on runs of ``.``, ``!`` and ``?`` and
    fragments longer than ten characters after stripping are kept.
    """
    if nlp is not None:
        doc = process_text(text)
        kept = []
        for sentence in doc.sents:
            if len(sentence) > 10:
                kept.append(str(sentence).strip())
        return kept[:10]

    # Fallback: split on basic sentence punctuation.
    fragments = (piece.strip() for piece in re.split(r'[.!?]+', text))
    return [fragment for fragment in fragments if len(fragment) > 10][:10]
def summarize_text(text, num_sentences=5):
    """Build an extractive summary of *text* with at most *num_sentences* sentences.

    With a spaCy model loaded, every sentence is scored by summing the
    document-wide frequencies of its tokens (only alphabetic, non-stop-word
    tokens contribute to the frequency table); the highest-scoring sentences
    are joined with single spaces in the order ``heapq.nlargest`` returns them.
    Without a model, the first *num_sentences* punctuation-delimited fragments
    longer than ten characters are joined with ``'. '``.
    """
    if nlp is None:
        # Fallback: take the leading fragments verbatim.
        fragments = []
        for piece in re.split(r'[.!?]+', text):
            piece = piece.strip()
            if len(piece) > 10:
                fragments.append(piece)
        return '. '.join(fragments[:num_sentences])

    doc = process_text(text)

    # Frequency of each lower-cased content word across the whole document.
    word_frequencies = Counter()
    for token in doc:
        if token.is_alpha and not token.is_stop:
            word_frequencies[token.text.lower()] += 1

    # A sentence's score is the sum of the frequencies of its tokens.
    sentence_scores = {}
    for sent in doc.sents:
        sentence_scores[sent] = sum(
            word_frequencies.get(word.text.lower(), 0) for word in sent
        )

    best = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return ' '.join(str(sentence) for sentence in best)
def detect_risks(text):
    """Return the distinct entries of ``RISK_WORDS`` that occur in *text*.

    Matching is case-insensitive and on whole words in both modes:

    * with a spaCy model, the lower-cased text is tokenised and tokens are
      compared against ``RISK_WORDS``;
    * without one, each risk word is searched for with word boundaries.

    The fallback previously used plain substring tests, which reported
    "fine" for "defined" and "risk" for "asterisk" and so disagreed with the
    token-based branch.

    Returns:
        list[str]: matched risk words, without duplicates, in no particular
        order.
    """
    lowered = text.lower()

    if nlp is None:
        # Whole-word search keeps the fallback consistent with token matching.
        matches = {
            word
            for word in RISK_WORDS
            if re.search(rf"\b{re.escape(word)}\b", lowered)
        }
        return list(matches)

    doc = process_text(lowered)
    return list({token.text for token in doc if token.text in RISK_WORDS})
def extract_legal_entities(text):
    """Collect people, organisations, dates, money amounts and legal terms from *text*.

    With a spaCy model loaded, the first four categories come from the named
    entities labelled ``PERSON``, ``ORG``, ``DATE`` and ``MONEY``. Without a
    model, dates and monetary amounts are found with regular expressions and
    the person/organisation lists stay empty.

    Legal terms are found by case-insensitive substring lookup in both modes.
    (Previously they were only filled in when spaCy was missing, so the
    category was always empty on the normal path.)

    Returns:
        dict[str, list[str]]: keys ``PERSONS``, ``ORGANIZATIONS``, ``DATES``,
        ``MONETARY`` and ``LEGAL_TERMS``; each list has duplicates removed,
        keeps first-seen order and holds at most ten items.
    """
    entities = {
        "PERSONS": [],
        "ORGANIZATIONS": [],
        "DATES": [],
        "MONETARY": [],
        "LEGAL_TERMS": [],
    }

    if nlp is None:
        # Basic regex-based extraction: numeric dates or bare 4-digit numbers.
        date_pattern = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}\b'
        entities["DATES"] = re.findall(date_pattern, text)
        # "$1,234.56"-style amounts, or a number followed by a currency word.
        money_pattern = r'\$\d+(?:,\d{3})*(?:\.\d{2})?|\d+(?:,\d{3})*(?:\.\d{2})?\s*(?:USD|dollars?|cents?)'
        entities["MONETARY"] = re.findall(money_pattern, text, re.IGNORECASE)
    else:
        label_to_key = {
            "PERSON": "PERSONS",
            "ORG": "ORGANIZATIONS",
            "DATE": "DATES",
            "MONEY": "MONETARY",
        }
        doc = process_text(text)
        for ent in doc.ents:
            key = label_to_key.get(ent.label_)
            if key is not None:
                entities[key].append(ent.text)

    # Keyword lookup does not depend on the NLP model, so it runs in both modes.
    legal_terms = ["contract", "agreement", "liability", "indemnity", "warranty", "termination", "jurisdiction", "governing law"]
    text_lower = text.lower()
    entities["LEGAL_TERMS"] = [term for term in legal_terms if term in text_lower]

    # dict.fromkeys de-duplicates while preserving order, so the ten items
    # kept are the first ten seen rather than an arbitrary set-ordered slice.
    return {key: list(dict.fromkeys(values))[:10] for key, values in entities.items()}
def check_compliance(text):
    """Report which standard clauses appear to be missing from *text*.

    Each rule pairs a group of keywords with a message; the message is
    reported when none of the group's keywords occurs in the lower-cased
    text. Messages are returned in rule order; an empty list means every
    rule found at least one of its keywords.
    """
    lowered = text.lower()
    rules = (
        (("governing law", "jurisdiction"),
         "Missing governing law or jurisdiction clause"),
        (("termination",),
         "No termination clause found"),
        (("confidential", "proprietary"),
         "No confidentiality or proprietary information clause"),
        (("liability",),
         "Liability terms not clearly defined"),
        # Signature requirements.
        (("signature", "signed"),
         "Document may lack proper signature requirements"),
    )
    return [
        message
        for keywords, message in rules
        if not any(keyword in lowered for keyword in keywords)
    ]
def generate_legal_report(summary, clauses, risks, entities, compliance_issues, filename):
    """Write the analysis results to a PDF file and return its path.

    Args:
        summary: summary text; the section is skipped when falsy.
        clauses: iterable of clause strings, rendered as a numbered list.
        risks: iterable of risk words, rendered title-cased, one per line.
        entities: mapping of category name to list of strings; categories
            with empty lists are skipped.
        compliance_issues: iterable of issue messages.
        filename: name of the analysed document; it is shown in the report
            and, with ``.pdf`` removed, used to build the output file name.

    Returns:
        str: path of the written file, ``Legal_Analysis_<name>.pdf``, relative
        to the current working directory.

    The built-in "Arial" font used here only covers Latin-1. Text extracted
    from PDFs often contains characters outside that range (curly quotes,
    dashes, bullets), which makes FPDF raise while rendering. All text is
    therefore passed through ``latin1_safe`` first, and the list markers are
    plain ASCII.
    """

    def latin1_safe(value):
        # Replace anything the core font cannot encode with "?".
        return str(value).encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()

    # Title and document header.
    pdf.set_font("Arial", "B", 16)
    pdf.cell(0, 10, "Legal Document Analysis Report", ln=True, align="C")
    pdf.ln(10)
    pdf.set_font("Arial", "B", 12)
    pdf.cell(0, 10, latin1_safe(f"Document: {filename}"), ln=True)
    pdf.cell(0, 10, f"Analysis Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=True)
    pdf.ln(10)

    # Executive Summary
    if summary:
        pdf.set_font("Arial", "B", 14)
        pdf.cell(0, 10, "Executive Summary:", ln=True)
        pdf.set_font("Arial", "", 11)
        pdf.multi_cell(0, 8, latin1_safe(summary))
        pdf.ln(5)

    # Key Legal Clauses
    if clauses:
        pdf.set_font("Arial", "B", 12)
        pdf.cell(0, 10, "Key Legal Clauses:", ln=True)
        pdf.set_font("Arial", "", 10)
        for i, clause in enumerate(clauses, 1):
            pdf.multi_cell(0, 8, latin1_safe(f"{i}. {clause}"))
        pdf.ln(5)

    # Risk Assessment
    if risks:
        pdf.set_font("Arial", "B", 12)
        pdf.cell(0, 10, "Risk Assessment:", ln=True)
        pdf.set_font("Arial", "", 10)
        for risk in risks:
            pdf.cell(0, 8, latin1_safe(f"- {risk.title()}"), ln=True)
        pdf.ln(5)

    # Entity Recognition
    if entities:
        pdf.set_font("Arial", "B", 12)
        pdf.cell(0, 10, "Identified Entities:", ln=True)
        pdf.set_font("Arial", "", 10)
        for entity_type, entity_list in entities.items():
            if entity_list:
                pdf.cell(0, 8, latin1_safe(f"{entity_type}: {', '.join(entity_list)}"), ln=True)
        pdf.ln(5)

    # Compliance Issues
    if compliance_issues:
        pdf.set_font("Arial", "B", 12)
        pdf.cell(0, 10, "Compliance Issues:", ln=True)
        pdf.set_font("Arial", "", 10)
        for issue in compliance_issues:
            pdf.cell(0, 8, latin1_safe(f"! {issue}"), ln=True)
        pdf.ln(5)

    pdf_path = f"Legal_Analysis_{filename.replace('.pdf', '')}.pdf"
    pdf.output(pdf_path)
    return pdf_path
def get_regulatory_updates():
    """Return the hard-coded list of regulatory update entries.

    Each entry is a dict with ``"title"`` and ``"summary"`` keys. A new list
    is built on every call.
    """
    compliance_update = {
        "title": "πŸ“œ New Compliance Guidelines",
        "summary": "SEC released new guidelines for regulatory compliance.",
    }
    financial_update = {
        "title": "βš–οΈ Update on Financial Risks",
        "summary": "New policies to mitigate risks in the financial sector.",
    }
    return [compliance_update, financial_update]
def visualize_key_clauses_frequency(clauses):
    """Render a horizontal bar chart of how often each clause string occurs.

    Args:
        clauses: iterable of clause strings; identical strings are counted
            together. When it is empty a short message is written instead of
            a chart.

    The chart is drawn on its own figure, handed to ``st.pyplot`` and then
    closed. The earlier version drew on pyplot's global state, passed the
    ``plt`` module itself to Streamlit, and never closed the figure, so a
    new figure stayed open after every call.
    """
    clause_counts = Counter(clauses)
    common_clauses = clause_counts.most_common()
    if not common_clauses:
        st.write("🚫 No key clauses to visualize.")
        return

    labels, values = zip(*common_clauses)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(labels, values, color='skyblue')
    ax.set_xlabel('Frequency')
    ax.set_title('πŸ“Š Key Clauses Frequency')
    st.pyplot(fig)
    # Release the figure once Streamlit has rendered it.
    plt.close(fig)
def generate_pdf_report(summary, clauses, risks, updates):
    """Write summary, clauses, risks and regulatory updates to ``Analysis_Results.pdf``.

    Args:
        summary: summary text; skipped when falsy.
        clauses: iterable of clause strings, rendered as a numbered list.
        risks: iterable of risk words, rendered as one comma-separated line.
        updates: iterable of dicts read via ``.get('title')`` and
            ``.get('summary')``.

    Returns:
        str: the fixed output path ``"Analysis_Results.pdf"``.

    The built-in "Arial" font only covers Latin-1, so every piece of text is
    passed through ``latin1_safe`` before rendering. Without it, characters
    outside Latin-1 in the document text or in the update titles make FPDF
    raise.
    """

    def latin1_safe(value):
        # Replace anything the core font cannot encode with "?".
        return str(value).encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", "B", 16)
    pdf.cell(0, 10, "Legal Document Analysis Report", ln=True, align="C")
    pdf.ln(10)

    if summary:
        pdf.set_font("Arial", "B", 12)
        pdf.cell(0, 10, "Summary:", ln=True)
        pdf.set_font("Arial", "", 10)
        pdf.multi_cell(0, 10, latin1_safe(summary))
        pdf.ln(5)

    if clauses:
        pdf.set_font("Arial", "B", 12)
        pdf.cell(0, 10, "Key Clauses:", ln=True)
        pdf.set_font("Arial", "", 10)
        for i, clause in enumerate(clauses, 1):
            pdf.multi_cell(0, 10, latin1_safe(f"{i}. {clause}"))
        pdf.ln(5)

    if risks:
        pdf.set_font("Arial", "B", 12)
        pdf.cell(0, 10, "Detected Risks:", ln=True)
        pdf.set_font("Arial", "", 10)
        pdf.multi_cell(0, 10, latin1_safe(", ".join(risks)))
        pdf.ln(5)

    if updates:
        pdf.set_font("Arial", "B", 12)
        pdf.cell(0, 10, "Regulatory Updates:", ln=True)
        pdf.set_font("Arial", "", 10)
        for update in updates:
            pdf.multi_cell(0, 10, latin1_safe(f"- {update.get('title')}: {update.get('summary')}"))

    pdf_path = "Analysis_Results.pdf"
    pdf.output(pdf_path)
    return pdf_path
def main():
    """Streamlit entry point: upload a PDF, run the selected analyses, offer a report.

    Flow: render the title and sidebar feature selector, wait for a PDF
    upload, extract its text, show document statistics, run each selected
    analysis, and on request generate a PDF report and offer it for download.

    Fixes relative to the earlier version:

    * Each feature label is bound once and reused for both the multiselect
      options and the checks. The Entity Recognition and Data Visualization
      checks used literals that differed from the options, so those features
      could never run.
    * The "no entities" message is shown when every category is empty;
      ``if entities:`` was always true for the returned dict.
    * ``entities`` defaults to a dict, matching how it is used.
    * The download name reuses the generated file name instead of appending
      a second ``.pdf`` to the uploaded name.
    """
    # Single source of truth for the feature labels.
    feature_summary = "οΏ½ Document Summary"
    feature_clauses = "πŸ”‘ Key Legal Clauses"
    feature_risks = "βš–οΈ Risk Assessment"
    feature_entities = "πŸ“Š Entity Recognition"
    feature_compliance = "🎯 Compliance Check"
    feature_visualization = "πŸ“ˆ Data Visualization"

    st.title("βš–οΈ Legal Document NLP Toolkit")
    st.markdown("### πŸ“‹ Advanced Legal Document Analysis & Summarization Platform")
    st.sidebar.title("πŸ”§ NLP Toolkit Options")
    features = st.sidebar.multiselect(
        "πŸ” Select Analysis Features",
        [
            feature_summary,
            feature_clauses,
            feature_risks,
            feature_entities,
            feature_compliance,
            feature_visualization,
        ],
    )
    uploaded_file = st.file_uploader(
        "πŸ“‚ Upload Legal Document (PDF)",
        type="pdf",
        help="Upload contracts, agreements, or legal documents for analysis",
    )

    # Nothing to analyse until a file has been uploaded.
    if uploaded_file is None:
        return

    # Document info section.
    col1, col2 = st.columns([2, 1])
    with col1:
        st.info(f"πŸ“„ **Document:** {uploaded_file.name}")
    with col2:
        file_size = len(uploaded_file.getvalue()) / 1024
        st.info(f"πŸ“Š **Size:** {file_size:.1f} KB")

    try:
        text = extract_text_from_pdf(uploaded_file)
        st.success("βœ… Legal document processed successfully!")
        # Document statistics
        with st.expander("πŸ“Š Document Statistics", expanded=False):
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("πŸ“ Words", len(text.split()))
            with col2:
                st.metric("πŸ“„ Characters", len(text))
            with col3:
                st.metric("πŸ”€ Sentences", len(re.split(r'[.!?]+', text)))
            with col4:
                st.metric("πŸ“– Paragraphs", len(text.split('\n\n')))
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the app.
        st.error(f"❌ Error processing legal document: {e}")
        return

    summary, clauses, risks, entities, compliance_issues = "", [], [], {}, []

    if feature_summary in features:
        with st.spinner("πŸ€– Generating AI-powered summary..."):
            summary = summarize_text(text)
        st.subheader("οΏ½ Executive Summary")
        st.write(summary)
        st.info("πŸ’‘ **Summary Method:** " + ("Advanced NLP with spaCy" if nlp else "Basic text extraction"))

    if feature_clauses in features:
        with st.spinner("βš–οΈ Extracting legal clauses..."):
            clauses = extract_key_clauses(text)
        st.subheader("πŸ”‘ Key Legal Clauses")
        for i, clause in enumerate(clauses, 1):
            st.write(f"**{i}.** {clause}")
        # The chart is built from the extracted clauses, so it needs them.
        if feature_visualization in features and clauses:
            visualize_key_clauses_frequency(clauses)

    if feature_risks in features:
        with st.spinner("πŸ” Analyzing legal risks..."):
            risks = detect_risks(text)
        st.subheader("βš–οΈ Risk Assessment Report")
        if risks:
            risk_colors = {"high": "πŸ”΄", "medium": "🟑", "low": "🟒"}
            for risk in risks:
                risk_level = "medium" if risk in ["violation", "breach"] else "low"
                marker = risk_colors.get(risk_level, 'βšͺ')
                st.write(f"{marker} **{risk.title()}** - {risk_level.upper()} priority")
        else:
            st.success("βœ… No significant legal risks detected")

    if feature_entities in features:
        with st.spinner("🏷️ Identifying legal entities..."):
            entities = extract_legal_entities(text)
        st.subheader("🏷️ Legal Entity Recognition")
        # The result always has every category key; look at the lists themselves.
        if any(entities.values()):
            for entity_type, entity_list in entities.items():
                if entity_list:
                    st.write(f"**{entity_type}:** {', '.join(entity_list[:5])}")
        else:
            st.info("ℹ️ No specific legal entities identified")

    if feature_compliance in features:
        with st.spinner("βœ… Checking compliance requirements..."):
            compliance_issues = check_compliance(text)
        st.subheader("🎯 Compliance Analysis")
        if compliance_issues:
            for issue in compliance_issues:
                st.warning(f"⚠️ {issue}")
        else:
            st.success("βœ… Document appears compliant with standard legal requirements")

    # Generate comprehensive report
    if features:
        st.markdown("---")
        if st.button("οΏ½ Generate Comprehensive Legal Report"):
            pdf_path = generate_legal_report(
                summary, clauses, risks, entities, compliance_issues, uploaded_file.name
            )
            st.success("πŸ“₯ Legal Analysis Report Ready!")
            with open(pdf_path, "rb") as file:
                st.download_button(
                    "πŸ“₯ Download Legal Report",
                    file,
                    # Reuse the generated name; appending ".pdf" to the upload
                    # name produced "<name>.pdf.pdf".
                    file_name=pdf_path,
                    mime="application/pdf",
                )
# Run the app only when this file is executed as the main script, not when it is imported.
if __name__ == "__main__":
    main()