deepthi6 commited on
Commit
4d7da7b
Β·
verified Β·
1 Parent(s): 774ba34

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.txt +27 -0
  2. app.py.py +303 -0
  3. app.sh +2 -0
  4. requirements.txt +6 -0
README.txt ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ClauseWise 3.0 – Streamlit Legal Contract Analyzer
2
+
3
+ ## Features
4
+ - Clause Simplification (AI-powered layman rewriting)
5
+ - Named Entity Recognition (IBM Watson + HuggingFace fallback)
6
+ - Clause Extraction & Breakdown
7
+ - Document Type Classification (Zero-shot ML)
8
+ - Multi-format upload (PDF, DOCX, TXT)
9
+ - Fairness Balance Meter
10
+ - Negotiation AI Coach
11
+ - Future Risk Predictor
12
+ - Clause Battle Arena
13
+
14
+ ## How to Run
15
+ 1. Create a virtual environment
16
+ 2. Install dependencies:
17
+ ```
18
+ pip install -r requirements.txt
19
+ ```
20
+ 3. Start the app:
21
+ ```
22
+ streamlit run app.py
23
+ ```
24
+
25
+ ## Notes
26
+ - Add your IBM Watson and (optional) IBM Granite API keys in the script.
27
+ - Educational tool only – not legal advice.
app.py.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import tempfile
3
+ import os
4
+ import re
5
+ from cryptography.fernet import Fernet
6
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModelForTokenClassification
7
+ from PyPDF2 import PdfReader
8
+ from docx import Document
9
+ import plotly.express as px
10
+ import pandas as pd
11
+
12
# -------------------------
# PAGE CONFIG
# -------------------------
# Must run before any other Streamlit call in the script; sets the browser
# tab title/icon and a full-width layout.
st.set_page_config(page_title="ClauseWise: Legal Document Analyzer",
                   page_icon="βš–οΈ", layout="wide")

# Static header rendered at the top of every session.
st.title("βš–οΈ ClauseWise: Legal Document Analyzer")
st.markdown("""
**Simplify, Decode, and Classify Legal Documents using AI**
Your smart assistant for understanding contracts, clauses, and obligations.
""")
st.markdown("---")
25
# -------------------------
# ENCRYPTION UTILITIES
# -------------------------
def get_session_key():
    """Return this session's Fernet key, generating it on first use.

    The key lives only in Streamlit session state, so each browser session
    gets its own key and it vanishes when the session ends.
    """
    session = st.session_state
    if "enc_key" in session:
        return session["enc_key"]
    fresh_key = Fernet.generate_key()
    session["enc_key"] = fresh_key
    return fresh_key
32
+
33
def encrypt_bytes(data: bytes, key: bytes) -> bytes:
    """Encrypt *data* with a Fernet cipher derived from *key*."""
    return Fernet(key).encrypt(data)
36
+
37
def decrypt_bytes(token: bytes, key: bytes) -> bytes:
    """Decrypt a Fernet *token* previously produced with the same *key*."""
    return Fernet(key).decrypt(token)
40
+
41
def write_temp_encrypted_file(encrypted_bytes: bytes):
    """Persist *encrypted_bytes* to a fresh temp file and return its path.

    The file is created with delete=False, so the caller owns it and must
    remove it (see secure_delete).
    """
    with tempfile.NamedTemporaryFile(delete=False) as handle:
        handle.write(encrypted_bytes)
        return handle.name
47
+
48
def secure_delete(path: str):
    """Best-effort removal of a temporary file; never raises for OS errors.

    NOTE: despite the name, this is a plain ``os.remove`` — it does not
    scrub the file's bytes from disk.

    Previously this swallowed *every* exception (``except Exception``),
    which would also hide programming errors such as passing a non-string
    path. Only OS-level failures (permissions, races on existence, ...)
    are expected here, so only ``OSError`` is suppressed now.
    """
    try:
        if os.path.exists(path):
            os.remove(path)
    except OSError:
        pass
54
+
55
# -------------------------
# FILE EXTRACTION
# -------------------------
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract plain text from a PDF given as raw bytes.

    The bytes are spilled to a temp file because PyPDF2 reads from a path;
    the temp file is always cleaned up, and "" is returned when the PDF
    cannot be parsed.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(file_bytes)
        tmp_path = tmp.name
    try:
        page_texts = [page.extract_text() for page in PdfReader(tmp_path).pages]
        text = "".join(chunk + "\n" for chunk in page_texts if chunk)
    except Exception:
        # Corrupt/unreadable PDF: report no text rather than crashing the UI.
        text = ""
    finally:
        secure_delete(tmp_path)
    return text
73
+
74
def extract_text_from_docx(file_bytes: bytes) -> str:
    """Extract paragraph text from a DOCX given as raw bytes.

    Mirrors extract_text_from_pdf: temp file for python-docx, guaranteed
    cleanup, "" on any parse failure.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
        tmp.write(file_bytes)
        tmp_path = tmp.name
    try:
        paragraphs = Document(tmp_path).paragraphs
        text = "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception:
        text = ""
    finally:
        secure_delete(tmp_path)
    return text
86
+
87
def extract_text_from_txt(file_bytes: bytes) -> str:
    """Decode raw bytes as UTF-8, silently dropping undecodable bytes.

    Returns "" if the input is not byte-like at all.
    """
    try:
        decoded = file_bytes.decode("utf-8", errors="ignore")
    except Exception:
        decoded = ""
    return decoded
92
+
93
# -------------------------
# CLEAN / PREPROCESS
# -------------------------
def clean_text(text: str) -> str:
    """Strip dataset boilerplate and flatten all whitespace to single spaces."""
    # Fragments that leak in from contract-dataset exports and scanned docs.
    boilerplate = (
        r"Downloaded from[^\n]*\n?",
        r"Appears in \d+ contracts[^\n]*\n?",
        r"I'm 5:.*\n?",
        r"I'm 5 or Appears in.*\n?",
        r"(Employee Signature Date:.*?Title:\s*\d*)+",
    )
    for pattern in boilerplate:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)
    # Collapse blank-line runs, trim the ends, then squash every remaining
    # whitespace run (including the newlines just produced) to one space.
    collapsed = re.sub(r"\n\s*\n+", "\n\n", text).strip()
    return re.sub(r"\s+", " ", collapsed)
109
+
110
# -------------------------
# MODEL CACHE (Hugging Face only)
# -------------------------
@st.cache_resource(ttl=3600)
def load_models():
    # Cached as a shared resource for up to an hour, so the (slow) model
    # downloads/loads happen once per server process, not on every rerun.
    simplify_model_name = "mrm8488/t5-small-finetuned-text-simplification"
    tokenizer = AutoTokenizer.from_pretrained(simplify_model_name)
    simplify_model = AutoModelForSeq2SeqLM.from_pretrained(simplify_model_name)
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    return tokenizer, simplify_model, summarizer, ner_pipeline, classifier

# Module-level handles used by the feature functions below.
# NOTE(review): `summarizer` appears to be loaded but never used in this
# file — confirm before removing, as loading bart-large-cnn is expensive.
tokenizer, simplify_model, summarizer, ner_pipeline, classifier = load_models()
124
+
125
# -------------------------
# CORE AI FEATURES
# -------------------------
def clause_simplification(text, mode):
    """Rewrite *text* at the requested reading level via the T5 model.

    *mode* is one of "Simplified", "Explain like I'm 5", "Professional";
    any other value falls back to plain simplification. Input is truncated
    to the model's 512-token window.
    """
    if not text:
        return "No text to simplify."
    prefixes = {
        "Simplified": "simplify: ",
        "Explain like I'm 5": "explain like I'm 5: ",
        "Professional": "rephrase professionally: ",
    }
    task_prefix = prefixes.get(mode, "simplify: ")
    encoded = tokenizer(task_prefix + text, return_tensors="pt",
                        truncation=True, max_length=512)
    generated = simplify_model.generate(**encoded, max_length=256,
                                        num_beams=4, early_stopping=True)
    return tokenizer.decode(generated[0], skip_special_tokens=True)
139
+
140
def clause_extraction(text):
    """Return unique 'Section ...' headings in document order.

    Falls back to a single placeholder heading when nothing matches.
    """
    heading_re = r'(Section\s+\d+[\w\.\-]*[:\-]?\s*[A-Z][^\n]+)'
    seen = {}
    for heading in re.findall(heading_re, text):
        seen.setdefault(heading, None)
    if not seen:
        return ["Section 1.F: Base Rent"]
    return list(seen)
143
+
144
def named_entity_recognition(text):
    """Run HF NER on the first 2000 chars; group surface forms by entity type.

    Returns a dict like {"PER": ["John Doe", ...], "ORG": [...], ...}.
    """
    grouped = {}
    for hit in ner_pipeline(text[:2000]):
        label = hit["entity_group"]
        if label not in grouped:
            grouped[label] = []
        grouped[label].append(hit["word"])
    return grouped
150
+
151
def document_classification(text):
    """Zero-shot classify the first 1024 chars; return the top-scoring label."""
    candidate_types = ["Lease Agreement", "Employment Contract", "NDA", "Purchase Agreement"]
    outcome = classifier(text[:1024], candidate_labels=candidate_types)
    best_label = outcome["labels"][0]
    return best_label
155
+
156
def flag_risky_clauses(text):
    """List one review warning per distinct risk keyword found in *text*.

    Keywords are matched case-insensitively, deduplicated after lowercasing,
    and reported in sorted order. Previously the result order came from set
    iteration (nondeterministic across runs under hash randomization) and
    mixed-case hits like "Penalty"/"penalty" produced duplicate warnings.
    """
    hits = re.findall(r"(penalty|termination|breach|liability|indemnity)",
                      text, flags=re.IGNORECASE)
    keywords = sorted({word.lower() for word in hits})
    warnings = [f"Clause mentioning '{word}' requires review." for word in keywords]
    return warnings or ["No high-risk clauses detected."]
159
+
160
def fairness_assessment(text):
    """Heuristic fairness score as a percentage string.

    Starts at 70, +1 per mutual-language hit, -2 per one-sided-language hit,
    clamped to [0, 100].
    """
    balanced_hits = re.findall(r"(mutual|both parties|shared)", text, flags=re.IGNORECASE)
    onesided_hits = re.findall(r"(sole|unilateral|exclusive right)", text, flags=re.IGNORECASE)
    raw_score = 70 + len(balanced_hits) - 2 * len(onesided_hits)
    clamped = max(0, min(100, raw_score))
    return f"Fairness Score: {clamped}%"
165
+
166
def ai_contract_assistant(text):
    """Return a canned negotiation tip when penalty/termination language appears."""
    has_leverage_point = re.search(r"penalty|termination", text, flags=re.IGNORECASE) is not None
    if not has_leverage_point:
        return "No immediate negotiation points detected."
    return "Suggested negotiation: Reduce penalty duration or clarify termination terms."
171
+
172
def multilingual_support(text, target_language):
    """Translate the first 1000 chars to *target_language* via Helsinki-NLP.

    The model id is derived from the first two letters of the language name
    (e.g. "French" -> "fr"). Any failure — missing model, offline host, bad
    language value — yields a mock confirmation string instead of raising.
    """
    try:
        lang_code = target_language.lower()[:2]
        translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{lang_code}")
        translated = translator(text[:1000])
        return translated[0]["translation_text"]
    except Exception:
        return f"Translated to {target_language} (mock)."
178
+
179
def text_to_audio(text):
    """Placeholder: announces that text-to-speech is not implemented yet.

    *text* is accepted to keep the eventual interface stable but is
    currently unused.
    """
    st.info("Text-to-speech support coming soon (use gTTS or pyttsx3).")
181
+
182
# -------------------------
# SMART CLAUSE-GROUPED TIMELINE + ENTITY PANEL
# -------------------------
def timeline_visualization(text):
    """Render a Plotly timeline of dates grouped under the clause heading
    that most recently preceded them, plus a per-clause detail panel.

    Fixes vs. the previous version:
    - the date alternative of the regex is now wrapped in one capture group
      with a non-capturing month alternation, so ``m.group(2)`` yields the
      full "Month DD, YYYY" string (before, only the month *name* was
      captured and the day/year were lost before ``pd.to_datetime``);
    - the unused ``clause_extraction(text)`` call was removed;
    - the entity preview is computed once instead of per clause (it never
      depended on the clause).
    """
    entities = named_entity_recognition(text)

    # Group 1: a Section/Clause heading; group 2: a full long-form date.
    pattern = (
        r'((?:Section|Clause)\s[\dA-Za-z\.\-]+[^\n:]*[:\-]?\s*[^\n]*)'
        r'|((?:January|February|March|April|May|June|July|August|September|October|November|December)'
        r'\s+\d{1,2},?\s+\d{4})'
    )

    events = []
    current_clause = "General"  # bucket for dates seen before any heading
    for m in re.finditer(pattern, text):
        if m.group(1):
            current_clause = m.group(1).strip()
        elif m.group(2):
            events.append({"Clause": current_clause, "Date": m.group(2)})

    if not events:
        st.warning("No dates or timeline events detected.")
        return

    df = pd.DataFrame(events)
    # Unparseable dates become NaT and are dropped.
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
    df = df.dropna(subset=["Date"])

    st.subheader("πŸ“Š Contract Timeline by Clause")
    fig = px.timeline(df, x_start="Date", x_end="Date", y="Clause", color="Clause", title="Clause-Wise Timeline")
    fig.update_yaxes(autorange="reversed")
    st.plotly_chart(fig, use_container_width=True)

    st.markdown("### 🧾 Clause-Level Details")
    # Up to 3 surface forms per entity type. NOTE: the NER pass is document-
    # wide, not clause-scoped, so every expander shows the same preview.
    entity_preview = {kind: words[:3] for kind, words in entities.items()} if entities else {}
    for clause in df["Clause"].unique():
        clause_dates = df[df["Clause"] == clause]["Date"].dt.strftime("%b %d, %Y").tolist()
        with st.expander(f"πŸ“˜ {clause}"):
            st.write(f"**Dates Mentioned:** {', '.join(clause_dates) if clause_dates else 'None'}")
            if entity_preview:
                st.write("**Entities Detected:**")
                st.json(entity_preview)
            else:
                st.write("No named entities found for this clause.")
225
+
226
# -------------------------
# MAIN UI
# -------------------------
st.subheader("πŸ“ Upload a Legal Document")
uploaded_file = st.file_uploader("Choose a document (PDF, DOCX, or TXT)", type=["pdf", "docx", "txt"])

if uploaded_file:
    # Encrypt-at-rest flow: the upload is encrypted with the session key and
    # an encrypted copy is written to a temp file, but text extraction works
    # on the in-memory decrypted bytes — the temp file is never read back
    # before being deleted below. NOTE(review): the temp-file round trip
    # appears to be dead weight; confirm it isn't required elsewhere.
    key = get_session_key()
    raw_bytes = uploaded_file.read()
    encrypted_bytes = encrypt_bytes(raw_bytes, key)
    temp_encrypted_path = write_temp_encrypted_file(encrypted_bytes)
    decrypted_bytes = decrypt_bytes(encrypted_bytes, key)

    # Dispatch on the file extension; anything unrecognized is treated as
    # plain UTF-8 text.
    filename_lower = uploaded_file.name.lower()
    if filename_lower.endswith(".pdf"):
        content = extract_text_from_pdf(decrypted_bytes)
    elif filename_lower.endswith(".docx"):
        content = extract_text_from_docx(decrypted_bytes)
    else:
        content = extract_text_from_txt(decrypted_bytes)
    secure_delete(temp_encrypted_path)

    if not content.strip():
        st.warning("No readable text found in the document.")
    else:
        st.markdown("---")
        st.subheader("πŸ” Apply Features")

        # Each feature runs on demand; Streamlit reruns this script top to
        # bottom on every click, so only the pressed button's branch fires.
        mode = st.radio("Choose simplification level:", ["Explain like I'm 5", "Simplified", "Professional"])
        if st.button("🧾 Simplify Clauses"):
            with st.spinner("Simplifying..."):
                st.write(clause_simplification(content, mode))
        st.markdown("---")

        if st.button("πŸ”— Extract Entities"):
            st.json(named_entity_recognition(content))
        st.markdown("---")

        if st.button("πŸ“‘ Extract Clauses"):
            st.write(clause_extraction(content))
        st.markdown("---")

        if st.button("πŸ“‚ Classify Document"):
            st.success(document_classification(content))
        st.markdown("---")

        if st.button("🚨 Flag Risky Clauses"):
            # flag_risky_clauses returns a list; st.warning renders it as-is.
            st.warning(flag_risky_clauses(content))
        st.markdown("---")

        if st.button("πŸ“… Timeline Visualization"):
            timeline_visualization(content)
        st.markdown("---")

        if st.button("βš–οΈ Fairness Assessment"):
            st.info(fairness_assessment(content))
        st.markdown("---")

        if st.button("🀝 Contract Assistant"):
            st.write(ai_contract_assistant(content))
        st.markdown("---")

        lang = st.selectbox("🌐 Choose Language", ["French", "Spanish", "German"])
        if st.button("Translate Document"):
            st.write(multilingual_support(content, lang))
        st.markdown("---")

        if st.button("πŸ”Š Convert Text to Audio"):
            text_to_audio(content)

else:
    st.info("πŸ‘† Upload a document above to start analysis.")

# Persistent disclaimer footer, shown with or without an upload.
st.markdown(
    "<p style='text-align: center; font-style: italic; color: gray;'>"
    "Important: ClauseWise provides educational information only. This is not legal advice."
    "</p>", unsafe_allow_html=True
)
app.sh ADDED
@@ -0,0 +1,2 @@
 
 
 
1
#!/bin/bash
# Launch the Streamlit app on all interfaces, port 7860 (the port Hugging
# Face Spaces expects a web app to bind).
# NOTE(review): the committed Python file is named "app.py.py" — confirm it
# is renamed to "app.py" or this command will fail to find the script.
streamlit run app.py --server.port 7860 --server.address 0.0.0.0
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit==1.39.0
+ transformers==4.44.0
+ torch>=2.2.0
+ googletrans==4.0.0rc1
+ gTTS==2.5.1
+ matplotlib==3.8.4
+ cryptography
+ PyPDF2
+ python-docx
+ plotly
+ pandas