deepthi6 commited on
Commit
ef5a56e
·
verified ·
1 Parent(s): ab6e1a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +422 -303
app.py CHANGED
@@ -1,303 +1,422 @@
1
- import streamlit as st
2
- import tempfile
3
- import os
4
- import re
5
- from cryptography.fernet import Fernet
6
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModelForTokenClassification
7
- from PyPDF2 import PdfReader
8
- from docx import Document
9
- import plotly.express as px
10
- import pandas as pd
11
-
12
- # -------------------------
13
- # PAGE CONFIG
14
- # -------------------------
15
- st.set_page_config(page_title="ClauseWise: Legal Document Analyzer",
16
- page_icon="⚖️", layout="wide")
17
-
18
- st.title("⚖️ ClauseWise: Legal Document Analyzer")
19
- st.markdown("""
20
- **Simplify, Decode, and Classify Legal Documents using AI**
21
- Your smart assistant for understanding contracts, clauses, and obligations.
22
- """)
23
- st.markdown("---")
24
-
25
- # -------------------------
26
- # ENCRYPTION UTILITIES
27
- # -------------------------
28
- def get_session_key():
29
- if "enc_key" not in st.session_state:
30
- st.session_state["enc_key"] = Fernet.generate_key()
31
- return st.session_state["enc_key"]
32
-
33
- def encrypt_bytes(data: bytes, key: bytes) -> bytes:
34
- cipher = Fernet(key)
35
- return cipher.encrypt(data)
36
-
37
- def decrypt_bytes(token: bytes, key: bytes) -> bytes:
38
- cipher = Fernet(key)
39
- return cipher.decrypt(token)
40
-
41
- def write_temp_encrypted_file(encrypted_bytes: bytes):
42
- tmp = tempfile.NamedTemporaryFile(delete=False)
43
- tmp.write(encrypted_bytes)
44
- tmp.flush()
45
- tmp.close()
46
- return tmp.name
47
-
48
- def secure_delete(path: str):
49
- try:
50
- if os.path.exists(path):
51
- os.remove(path)
52
- except Exception:
53
- pass
54
-
55
- # -------------------------
56
- # FILE EXTRACTION
57
- # -------------------------
58
- def extract_text_from_pdf(file_bytes: bytes) -> str:
59
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
60
- tmp.write(file_bytes)
61
- tmp_path = tmp.name
62
- text = ""
63
- try:
64
- reader = PdfReader(tmp_path)
65
- for page in reader.pages:
66
- page_text = page.extract_text()
67
- if page_text:
68
- text += page_text + "\n"
69
- except Exception:
70
- text = ""
71
- secure_delete(tmp_path)
72
- return text
73
-
74
- def extract_text_from_docx(file_bytes: bytes) -> str:
75
- with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as tmp:
76
- tmp.write(file_bytes)
77
- tmp_path = tmp.name
78
- text = ""
79
- try:
80
- doc = Document(tmp_path)
81
- text = "\n".join([p.text for p in doc.paragraphs])
82
- except Exception:
83
- text = ""
84
- secure_delete(tmp_path)
85
- return text
86
-
87
- def extract_text_from_txt(file_bytes: bytes) -> str:
88
- try:
89
- return file_bytes.decode("utf-8", errors="ignore")
90
- except Exception:
91
- return ""
92
-
93
- # -------------------------
94
- # CLEAN / PREPROCESS
95
- # -------------------------
96
- def clean_text(text: str) -> str:
97
- patterns = [
98
- r"Downloaded from[^\n]*\n?",
99
- r"Appears in \d+ contracts[^\n]*\n?",
100
- r"I'm 5:.*\n?",
101
- r"I'm 5 or Appears in.*\n?",
102
- r"(Employee Signature Date:.*?Title:\s*\d*)+",
103
- ]
104
- for p in patterns:
105
- text = re.sub(p, "", text, flags=re.IGNORECASE)
106
- text = re.sub(r"\n\s*\n+", "\n\n", text).strip()
107
- text = re.sub(r"\s+", " ", text)
108
- return text
109
-
110
- # -------------------------
111
- # MODEL CACHE (Hugging Face only)
112
- # -------------------------
113
- @st.cache_resource(ttl=3600)
114
- def load_models():
115
- simplify_model_name = "mrm8488/t5-small-finetuned-text-simplification"
116
- tokenizer = AutoTokenizer.from_pretrained(simplify_model_name)
117
- simplify_model = AutoModelForSeq2SeqLM.from_pretrained(simplify_model_name)
118
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
119
- ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
120
- classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
121
- return tokenizer, simplify_model, summarizer, ner_pipeline, classifier
122
-
123
- tokenizer, simplify_model, summarizer, ner_pipeline, classifier = load_models()
124
-
125
- # -------------------------
126
- # CORE AI FEATURES
127
- # -------------------------
128
- def clause_simplification(text, mode):
129
- if not text:
130
- return "No text to simplify."
131
- prefix = {
132
- "Simplified": "simplify: ",
133
- "Explain like I'm 5": "explain like I'm 5: ",
134
- "Professional": "rephrase professionally: "
135
- }.get(mode, "simplify: ")
136
- inputs = tokenizer(prefix + text, return_tensors="pt", truncation=True, max_length=512)
137
- outputs = simplify_model.generate(**inputs, max_length=256, num_beams=4, early_stopping=True)
138
- return tokenizer.decode(outputs[0], skip_special_tokens=True)
139
-
140
- def clause_extraction(text):
141
- matches = re.findall(r'(Section\s+\d+[\w\.\-]*[:\-]?\s*[A-Z][^\n]+)', text)
142
- return list(dict.fromkeys(matches)) if matches else ["Section 1.F: Base Rent"]
143
-
144
- def named_entity_recognition(text):
145
- entities = ner_pipeline(text[:2000])
146
- grouped = {}
147
- for ent in entities:
148
- grouped.setdefault(ent["entity_group"], []).append(ent["word"])
149
- return grouped
150
-
151
- def document_classification(text):
152
- labels = ["Lease Agreement", "Employment Contract", "NDA", "Purchase Agreement"]
153
- result = classifier(text[:1024], candidate_labels=labels)
154
- return result["labels"][0]
155
-
156
- def flag_risky_clauses(text):
157
- risky = re.findall(r"(penalty|termination|breach|liability|indemnity)", text, flags=re.IGNORECASE)
158
- return [f"Clause mentioning '{w}' requires review." for w in set(risky)] or ["No high-risk clauses detected."]
159
-
160
- def fairness_assessment(text):
161
- pos = len(re.findall(r"(mutual|both parties|shared)", text, flags=re.IGNORECASE))
162
- neg = len(re.findall(r"(sole|unilateral|exclusive right)", text, flags=re.IGNORECASE))
163
- score = max(0, min(100, 70 + pos - neg * 2))
164
- return f"Fairness Score: {score}%"
165
-
166
- def ai_contract_assistant(text):
167
- suggestion = re.search(r"penalty|termination", text, flags=re.IGNORECASE)
168
- if suggestion:
169
- return "Suggested negotiation: Reduce penalty duration or clarify termination terms."
170
- return "No immediate negotiation points detected."
171
-
172
- def multilingual_support(text, target_language):
173
- try:
174
- translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_language.lower()[:2]}")
175
- return translator(text[:1000])[0]["translation_text"]
176
- except Exception:
177
- return f"Translated to {target_language} (mock)."
178
-
179
- def text_to_audio(text):
180
- st.info("Text-to-speech support coming soon (use gTTS or pyttsx3).")
181
-
182
- # -------------------------
183
- # SMART CLAUSE-GROUPED TIMELINE + ENTITY PANEL
184
- # -------------------------
185
- def timeline_visualization(text):
186
- clauses = clause_extraction(text)
187
- entities = named_entity_recognition(text)
188
- events = []
189
-
190
- date_matches = re.finditer(
191
- r'((?:Section|Clause)\s[\dA-Za-z\.\-]+[^\n:]*[:\-]?\s*[^\n]*)|(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}',
192
- text)
193
-
194
- current_clause = "General"
195
- for m in date_matches:
196
- if m.group(1):
197
- current_clause = m.group(1).strip()
198
- elif m.group(2):
199
- events.append({"Clause": current_clause, "Date": m.group(2)})
200
-
201
- if not events:
202
- st.warning("No dates or timeline events detected.")
203
- return
204
-
205
- df = pd.DataFrame(events)
206
- df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
207
- df = df.dropna(subset=["Date"])
208
-
209
- st.subheader("📊 Contract Timeline by Clause")
210
- fig = px.timeline(df, x_start="Date", x_end="Date", y="Clause", color="Clause", title="Clause-Wise Timeline")
211
- fig.update_yaxes(autorange="reversed")
212
- st.plotly_chart(fig, use_container_width=True)
213
-
214
- st.markdown("### 🧾 Clause-Level Details")
215
- for clause in df["Clause"].unique():
216
- clause_dates = df[df["Clause"] == clause]["Date"].dt.strftime("%b %d, %Y").tolist()
217
- clause_entities = {k: v[:3] for k, v in entities.items()} if entities else {}
218
- with st.expander(f"📘 {clause}"):
219
- st.write(f"**Dates Mentioned:** {', '.join(clause_dates) if clause_dates else 'None'}")
220
- if clause_entities:
221
- st.write("**Entities Detected:**")
222
- st.json(clause_entities)
223
- else:
224
- st.write("No named entities found for this clause.")
225
-
226
- # -------------------------
227
- # MAIN UI
228
- # -------------------------
229
- st.subheader("📁 Upload a Legal Document")
230
- uploaded_file = st.file_uploader("Choose a document (PDF, DOCX, or TXT)", type=["pdf", "docx", "txt"])
231
-
232
- if uploaded_file:
233
- key = get_session_key()
234
- raw_bytes = uploaded_file.read()
235
- encrypted_bytes = encrypt_bytes(raw_bytes, key)
236
- temp_encrypted_path = write_temp_encrypted_file(encrypted_bytes)
237
- decrypted_bytes = decrypt_bytes(encrypted_bytes, key)
238
-
239
- filename_lower = uploaded_file.name.lower()
240
- if filename_lower.endswith(".pdf"):
241
- content = extract_text_from_pdf(decrypted_bytes)
242
- elif filename_lower.endswith(".docx"):
243
- content = extract_text_from_docx(decrypted_bytes)
244
- else:
245
- content = extract_text_from_txt(decrypted_bytes)
246
- secure_delete(temp_encrypted_path)
247
-
248
- if not content.strip():
249
- st.warning("No readable text found in the document.")
250
- else:
251
- st.markdown("---")
252
- st.subheader("🔍 Apply Features")
253
-
254
- mode = st.radio("Choose simplification level:", ["Explain like I'm 5", "Simplified", "Professional"])
255
- if st.button("🧾 Simplify Clauses"):
256
- with st.spinner("Simplifying..."):
257
- st.write(clause_simplification(content, mode))
258
- st.markdown("---")
259
-
260
- if st.button("🔗 Extract Entities"):
261
- st.json(named_entity_recognition(content))
262
- st.markdown("---")
263
-
264
- if st.button("📑 Extract Clauses"):
265
- st.write(clause_extraction(content))
266
- st.markdown("---")
267
-
268
- if st.button("📂 Classify Document"):
269
- st.success(document_classification(content))
270
- st.markdown("---")
271
-
272
- if st.button("🚨 Flag Risky Clauses"):
273
- st.warning(flag_risky_clauses(content))
274
- st.markdown("---")
275
-
276
- if st.button("📅 Timeline Visualization"):
277
- timeline_visualization(content)
278
- st.markdown("---")
279
-
280
- if st.button("⚖️ Fairness Assessment"):
281
- st.info(fairness_assessment(content))
282
- st.markdown("---")
283
-
284
- if st.button("🤝 Contract Assistant"):
285
- st.write(ai_contract_assistant(content))
286
- st.markdown("---")
287
-
288
- lang = st.selectbox("🌐 Choose Language", ["French", "Spanish", "German"])
289
- if st.button("Translate Document"):
290
- st.write(multilingual_support(content, lang))
291
- st.markdown("---")
292
-
293
- if st.button("🔊 Convert Text to Audio"):
294
- text_to_audio(content)
295
-
296
- else:
297
- st.info("👆 Upload a document above to start analysis.")
298
-
299
- st.markdown(
300
- "<p style='text-align: center; font-style: italic; color: gray;'>"
301
- "Important: ClauseWise provides educational information only. This is not legal advice."
302
- "</p>", unsafe_allow_html=True
303
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import math
4
+ import re
5
+ import io
6
+ import asyncio
7
+ from typing import List, Dict, Tuple, Optional, Any
8
+
9
+ import torch
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
11
+ from pypdf import PdfReader
12
+ import docx
13
+ import spacy
14
+ import gradio as gr
15
+
16
+ # -----------------------------
17
+ # Model: IBM Granite 3.2 2B Instruct
18
+ # -----------------------------
19
+ MODEL_ID = "ibm-granite/granite-3.2-2b-instruct"
20
+
21
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
22
+ DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
23
+
24
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
25
+ model = AutoModelForCausalLM.from_pretrained(
26
+ MODEL_ID,
27
+ torch_dtype=DTYPE,
28
+ device_map="auto" if DEVICE == "cuda" else None
29
+ )
30
+ if DEVICE != "cuda":
31
+ model.to(DEVICE)
32
+
33
+ # -----------------------------
34
+ # spaCy for NER
35
+ # -----------------------------
36
+ nlp = spacy.load("en_core_web_sm")
37
+
38
+ # -----------------------------
39
+ # Helper: chat templating for Granite or fallback
40
+ # -----------------------------
41
def build_chat_prompt(system_prompt: str, user_prompt: str) -> str:
    """Render a (system, user) message pair into a single prompt string.

    Prefers the tokenizer's own chat template; when the tokenizer has no
    template, falls back to a plain [SYSTEM]/[USER]/[ASSISTANT] layout.
    """
    chat = [{"role": "system", "content": system_prompt}] if system_prompt else []
    chat.append({"role": "user", "content": user_prompt})
    try:
        return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    except Exception:
        # No chat template available: concatenate with simple role tags.
        prefix = f"[SYSTEM]\n{system_prompt}\n" if system_prompt else ""
        return prefix + f"[USER]\n{user_prompt}\n[ASSISTANT]\n"
53
+
54
+ # -----------------------------
55
+ # LLM generation
56
+ # -----------------------------
57
def llm_generate(
    system_prompt: str,
    user_prompt: str,
    max_new_tokens: int = 512,
    temperature: float = 0.3,
    top_p: float = 0.9
) -> str:
    """Generate a completion from the Granite model for a system/user prompt.

    Args:
        system_prompt: Instruction text for the system role (may be empty).
        user_prompt: The user's request.
        max_new_tokens: Generation budget for newly produced tokens.
        temperature: Sampling temperature (sampling is always enabled).
        top_p: Nucleus-sampling cutoff.

    Returns:
        The assistant's reply text with the prompt portion removed.
    """
    prompt = build_chat_prompt(system_prompt, user_prompt)
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    # Decode only the newly generated tokens. The original compared the decoded
    # full text against the raw prompt string, but skip_special_tokens=True
    # alters the text so that prefix check routinely failed, returning the
    # prompt glued to the answer. Slicing by input length is robust.
    generated = tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
    # Fallback-template case: keep only the assistant turn if the tag appears.
    if "[ASSISTANT]" in generated:
        generated = generated.split("[ASSISTANT]")[-1]
    return generated.strip()
83
+
84
+ # -----------------------------
85
+ # Document loading (PDF/DOCX/TXT)
86
+ # -----------------------------
87
def load_text_from_pdf(file_obj) -> str:
    """Extract text from every page of a PDF; unreadable pages become ""."""
    extracted = []
    for pg in PdfReader(file_obj).pages:
        try:
            extracted.append(pg.extract_text() or "")
        except Exception:
            # A single corrupt page should not abort the whole document.
            extracted.append("")
    return "\n".join(extracted).strip()
96
+
97
def load_text_from_docx(file_obj) -> str:
    """Read all paragraph text from a DOCX file-like object."""
    # python-docx needs a seekable stream, so buffer the raw bytes into
    # BytesIO and rewind the caller's handle for any later reads.
    raw = file_obj.read()
    file_obj.seek(0)
    document = docx.Document(io.BytesIO(raw))
    return "\n".join(paragraph.text for paragraph in document.paragraphs).strip()
105
+
106
def load_text_from_txt(file_obj) -> str:
    """Decode a text file-like object to a stripped string.

    Bytes are decoded as strict UTF-8; on a genuine decode failure we retry
    with Latin-1, which accepts any byte sequence. The original passed
    errors="ignore" on the first attempt, which can never raise, so its
    Latin-1 fallback was unreachable dead code.
    """
    data = file_obj.read()
    if isinstance(data, bytes):
        try:
            data = data.decode("utf-8")
        except UnicodeDecodeError:
            data = data.decode("latin-1")
    return str(data).strip()
114
+
115
def load_document(file: Optional[gr.File]) -> str:
    """Return the text of an uploaded file, choosing a loader by extension.

    Unknown extensions are tried as PDF, then DOCX, then plain text;
    returns "" when no file was given or every loader fails.
    """
    if not file:
        return ""
    lowered = (file.name or "").lower()
    if lowered.endswith(".pdf"):
        return load_text_from_pdf(file)
    if lowered.endswith(".docx"):
        return load_text_from_docx(file)
    if lowered.endswith(".txt"):
        return load_text_from_txt(file)
    # Extension unrecognized: attempt each loader in turn.
    for loader in (load_text_from_pdf, load_text_from_docx, load_text_from_txt):
        try:
            return loader(file)
        except Exception:
            continue
    return ""
140
+
141
+ # -----------------------------
142
+ # Clause extraction heuristics
143
+ # -----------------------------
144
# Split points for clause detection, one alternative per heading style:
#   - numbered headings like "1. ", "2) ", "3.1. "  (the original required a
#     dotted sub-number via (?:\.\d+), so a plain "1. " never matched — the
#     group is now optional/repeatable)
#   - lettered bullets like "A. " / "B) "  (the original's ^\s demanded
#     exactly one whitespace character before the letter — now ^\s*)
#   - semicolon- or newline-terminated runs
CLAUSE_SPLIT_REGEX = re.compile(
    r"(?:(?:^\s*\d+(?:\.\d+)*[.)]\s+)|(?:^\s*[A-Z]\s*[.)]\s+)|(?:;?\s*\n))",
    re.MULTILINE
)
148
+
149
def split_into_clauses(text: str, min_len: int = 40) -> List[str]:
    """Split a document into candidate clauses of at least min_len characters.

    Structured numbering/bullet boundaries are tried first; if that yields
    fewer than two pieces, a sentence-like punctuation split is used instead.
    Near-duplicate snippets (identical up to case and whitespace) are kept
    only once, in order of first appearance.
    """
    if not text:
        return []
    pieces = CLAUSE_SPLIT_REGEX.split(text)
    if len(pieces) < 2:
        # Too little structure detected: split on sentence punctuation.
        pieces = re.split(r"(?<=[.;])\s+\n?\s*", text)
    kept: List[str] = []
    fingerprints = set()
    for piece in pieces:
        candidate = piece.strip()
        if len(candidate) < min_len:
            continue
        fingerprint = re.sub(r"\s+", " ", candidate.lower())
        if fingerprint not in fingerprints:
            fingerprints.add(fingerprint)
            kept.append(candidate)
    return kept
167
+
168
+ # -----------------------------
169
+ # Feature: Clause Simplification / Plain English
170
+ # -----------------------------
171
def simplify_clause(clause: str) -> str:
    """Rewrite one clause into plain English (with risk bullets) via the LLM."""
    role = "You are a legal assistant that rewrites clauses into plain, layman-friendly English while preserving legal meaning."
    request = (
        "Rewrite the following clause in plain English, preserving intent. "
        "Highlight any risks with bullet points at the end.\n\nClause:\n"
        + clause
    )
    return llm_generate(role, request, max_new_tokens=400)
175
+
176
+ # -----------------------------
177
+ # Feature: Named Entity Recognition (NER)
178
+ # -----------------------------
179
def ner_entities(text: str) -> Dict[str, List[str]]:
    """Run spaCy NER and group entity texts by label, sorted and de-duplicated."""
    if not text:
        return {}
    grouped: Dict[str, List[str]] = {}
    for span in nlp(text).ents:
        grouped.setdefault(span.label_, []).append(span.text)
    # Collapse duplicates per label and sort for stable display.
    return {label: sorted(set(values)) for label, values in grouped.items()}
189
+
190
+ # -----------------------------
191
+ # Feature: Clause Extraction and Breakdown
192
+ # -----------------------------
193
def extract_clauses(text: str) -> List[str]:
    """Feature-facing alias for clause splitting."""
    return split_into_clauses(text)
195
+
196
+ # -----------------------------
197
+ # Feature: Document Type Classification (LLM zero-shot)
198
+ # -----------------------------
199
+ DOC_TYPES = [
200
+ "Non-Disclosure Agreement (NDA)",
201
+ "Lease Agreement",
202
+ "Employment Contract",
203
+ "Service Agreement",
204
+ "Sales Agreement",
205
+ "Consulting Agreement",
206
+ "End User License Agreement (EULA)",
207
+ "Terms of Service",
208
+ ]
209
+
210
def classify_document(text: str) -> str:
    """Classify a document into one of DOC_TYPES via the LLM.

    Falls back to keyword heuristics when the model's reply names none of
    the candidate labels; the ultimate default is the first DOC_TYPES entry.
    """
    role = "You are a legal document classifier. Choose the single best-matching document type from the provided list."
    menu = "\n".join(f"- {t}" for t in DOC_TYPES)
    answer = llm_generate(
        role,
        f"Classify the following document into one of these types:\n{menu}\n\nDocument:\n{text[:5000]}",
        max_new_tokens=200,
    )
    # Pick the first listed type the model actually mentioned.
    answer_lower = answer.lower()
    for doc_type in DOC_TYPES:
        if doc_type.lower() in answer_lower:
            return doc_type
    # Model named none of the labels: keyword heuristics over the document.
    lower = text.lower()
    if "confidential" in lower or "non-disclosure" in lower or "nda" in lower:
        return "Non-Disclosure Agreement (NDA)"
    if "lease" in lower or "tenant" in lower or "landlord" in lower:
        return "Lease Agreement"
    if "employment" in lower or "employee" in lower or "employer" in lower:
        return "Employment Contract"
    if "services" in lower or "service" in lower or "statement of work" in lower:
        return "Service Agreement"
    return DOC_TYPES[0]
230
+
231
+ # -----------------------------
232
+ # Feature: Negotiation Coach (3 alternatives with acceptance rates)
233
+ # -----------------------------
234
def negotiation_coach(clause: str) -> Tuple[str, List[Dict[str, Any]]]:
    """Ask the LLM for three ranked clause alternatives.

    Returns (pretty-printed JSON string, list of alternative dicts). When
    the model's reply contains no parseable JSON, the reply is carved into
    numbered chunks with heuristic acceptance rates.
    """
    request = (
        "Given the clause below, propose 3 alternative versions ranked by expected acceptance rate. "
        "Provide JSON with fields: alternatives: [ {rank, acceptance_rate_percent, title, clause_text, rationale} ]. "
        "Rank 1 is highest acceptance rate. Keep acceptance_rate_percent as integer. "
        f"\n\nClause:\n{clause}"
    )
    reply = llm_generate("You are an AI negotiation coach for contracts.", request, max_new_tokens=700)
    try:
        parsed = json.loads(re.search(r"\{[\s\S]*\}", reply).group(0))
    except Exception:
        # No valid JSON: split the reply on "1.", "2)", ... and keep up to 3.
        parsed = {"alternatives": []}
        numbered = re.split(r"\n\s*\d+[.)]\s*", reply)
        for rank, body in enumerate(numbered[1:4], start=1):
            parsed["alternatives"].append({
                "rank": rank,
                "acceptance_rate_percent": max(50, 90 - (rank - 1) * 10),
                "title": f"Alternative {rank}",
                "clause_text": body.strip()[:800],
                "rationale": "Heuristic parse from model response.",
            })
    return json.dumps(parsed, indent=2), parsed.get("alternatives", [])
263
+
264
+ # -----------------------------
265
+ # Feature: Future Risk Predictor (1–5+ years timeline)
266
+ # -----------------------------
267
def future_risk_predictor(clause: str) -> Tuple[str, List[Dict[str, Any]]]:
    """Forecast 1–5 year risks for a clause via the LLM.

    Returns (pretty JSON string, timeline list). On a JSON-parse failure a
    synthetic rising-risk timeline is substituted.
    """
    ask = (
        "Analyze the clause and forecast risks over the next 1 to 5 years. "
        "Return strict JSON: {timeline: [ {year: int, risk_score_0_100: int, key_risks: [str], mitigation: [str]} ]}. "
        "risk_score_0_100 is an integer. Keep the list length between 5 and 6."
        f"\n\nClause:\n{clause}"
    )
    reply = llm_generate("You analyze contractual clauses and forecast future risks over time.", ask, max_new_tokens=700)
    try:
        forecast = json.loads(re.search(r"\{[\s\S]*\}", reply).group(0))
    except Exception:
        # Unparseable output: build a monotonically rising placeholder.
        forecast = {"timeline": []}
        for yr in range(1, 6):
            forecast["timeline"].append({
                "year": yr,
                "risk_score_0_100": min(95, 40 + yr * 8),
                "key_risks": ["Heuristic timeline due to JSON parse fallback."],
                "mitigation": ["Seek legal review", "Adjust clause terms", "Add notice/cure period"],
            })
    # NOTE(review): a parsed reply missing "timeline" raises KeyError here,
    # matching the original's behavior.
    return json.dumps(forecast, indent=2), forecast["timeline"]
291
+
292
+ # -----------------------------
293
+ # Feature: Fairness Balance Meter (power distribution)
294
+ # -----------------------------
295
def fairness_balance_meter(clause: str) -> Tuple[str, int, str]:
    """Score which party a clause favors (0=A favored, 50=balanced, 100=B favored).

    Returns (pretty JSON string, score, rationale).
    """
    role = "You evaluate which party a clause favors on a 0-100 scale (0=Party A heavily favored, 50=balanced, 100=Party B heavily favored)."
    ask = (
        "Return strict JSON: {score_0_100: int, rationale: str, notes: [str]}. "
        "Do not include anything else."
        f"\n\nClause:\n{clause}"
    )
    reply = llm_generate(role, ask, max_new_tokens=400)
    try:
        parsed = json.loads(re.search(r"\{[\s\S]*\}", reply).group(0))
        score = int(parsed.get("score_0_100", 50))
        rationale = parsed.get("rationale", "")
    except Exception:
        score, rationale = 50, "Fallback balanced score due to JSON parse."
    # Normalize the payload regardless of what the model returned.
    payload = {"score_0_100": score, "rationale": rationale, "notes": []}
    return json.dumps(payload, indent=2), score, rationale
312
+
313
+ # -----------------------------
314
+ # Feature: Clause Battle Arena (head-to-head)
315
+ # -----------------------------
316
def clause_battle_arena(text_a: str, text_b: str) -> Tuple[str, str]:
    """Head-to-head LLM comparison of two drafts across six categories.

    Returns (pretty JSON string, markdown summary). A full all-Draw result
    is substituted when the model's JSON cannot be parsed.
    """
    ask = (
        "Compare Document A vs Document B across: Liability, Termination, IP, Payment, Confidentiality, Governing Law. "
        "Return JSON: {rounds: [ {category, winner: 'A'|'B'|'Draw', rationale} ], overall_winner: 'A'|'B'|'Draw', summary: str}.\n"
        f"Document A:\n{text_a[:4000]}\n\nDocument B:\n{text_b[:4000]}"
    )
    reply = llm_generate(
        "You compare two contract drafts across objective criteria and declare an overall winner.",
        ask,
        max_new_tokens=900,
    )
    try:
        verdict = json.loads(re.search(r"\{[\s\S]*\}", reply).group(0))
    except Exception:
        categories = ["Liability", "Termination", "IP", "Payment", "Confidentiality", "Governing Law"]
        verdict = {
            "rounds": [{"category": c, "winner": "Draw", "rationale": "Fallback"} for c in categories],
            "overall_winner": "Draw",
            "summary": "JSON parse fallback.",
        }
    round_lines = "\n".join(
        f"- {r['category']}: {r['winner']} — {r.get('rationale','')}"
        for r in verdict.get("rounds", [])
    )
    md = f"Overall Winner: {verdict.get('overall_winner','Draw')}\n\nRounds:\n{round_lines}\n\nSummary:\n{verdict.get('summary','')}"
    return json.dumps(verdict, indent=2), md
343
+
344
+ # -----------------------------
345
+ # Feature: Sensitive Data Sniffer
346
+ # -----------------------------
347
+ PII_REGEXES = {
348
+ "Email": r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}",
349
+ "Phone": r"\+?\d[\d\-\s]{7,}\d",
350
+ "SSN (US)": r"\b\d{3}-\d{2}-\d{4}\b",
351
+ "Credit Card": r"\b(?:\d[ -]*?){13,16}\b",
352
+ }
353
+
354
def sensitive_data_sniffer(text: str) -> Tuple[str, Dict[str, List[str]]]:
    """Find privacy traps via the LLM plus a regex PII scan.

    Returns (pretty JSON combining both analyses, regex hits by category).
    """
    ask = (
        "Return strict JSON: {data_categories: [str], sharing_parties: [str], processing_purposes: [str], risks: [str], recommendations: [str]}.\n"
        f"Text:\n{text[:6000]}"
    )
    reply = llm_generate(
        "You find hidden privacy traps in legal text and list personal data categories being shared or processed.",
        ask,
        max_new_tokens=700,
    )
    try:
        llm_view = json.loads(re.search(r"\{[\s\S]*\}", reply).group(0))
    except Exception:
        # Generic placeholder when the model's JSON cannot be parsed.
        llm_view = {
            "data_categories": ["Name", "Email"],
            "sharing_parties": ["Service Provider"],
            "processing_purposes": ["Service delivery"],
            "risks": ["Potential over-collection"],
            "recommendations": ["Narrow purpose", "Limit retention"],
        }
    # Regex pass: concrete PII strings actually present in the text.
    findings: Dict[str, List[str]] = {}
    for category, pattern in PII_REGEXES.items():
        matches = re.findall(pattern, text or "", flags=re.IGNORECASE)
        if matches:
            findings[category] = sorted({m.strip() for m in matches})
    return json.dumps({"llm": llm_view, "regex_hits": findings}, indent=2), findings
381
+
382
+ # -----------------------------
383
+ # Feature: Litigation Risk Radar
384
+ # -----------------------------
385
def litigation_risk_radar(text: str) -> Tuple[str, str]:
    """Spot dispute-prone clauses with sample scenarios via the LLM.

    Returns (pretty JSON string, markdown bullet list). At most the first
    eight detected clauses are sent to the model.
    """
    clauses = split_into_clauses(text)
    excerpt = "\n\n".join(clauses[:8]) if clauses else text[:4000]
    ask = (
        "Analyze the clauses and return JSON: {hotspots: [ {clause_excerpt, risk_level: 'Low'|'Medium'|'High', why, sample_dispute_scenario} ]}.\n"
        f"Clauses:\n{excerpt}"
    )
    reply = llm_generate(
        "You identify clauses most likely to trigger disputes or litigation and provide sample dispute scenarios.",
        ask,
        max_new_tokens=900,
    )
    try:
        radar = json.loads(re.search(r"\{[\s\S]*\}", reply).group(0))
    except Exception:
        # Single medium-risk placeholder built from the first clause.
        radar = {
            "hotspots": [
                {
                    "clause_excerpt": (clauses[0][:280] if clauses else text[:280]),
                    "risk_level": "Medium",
                    "why": "Ambiguous obligations.",
                    "sample_dispute_scenario": "Party A alleges non-performance due to unclear milestones.",
                }
            ]
        }
    bullets = [
        f"- [{h.get('risk_level','Medium')}] {h.get('clause_excerpt','')}\n Why: {h.get('why','')}\n Scenario: {h.get('sample_dispute_scenario','')}"
        for h in radar.get("hotspots", [])
    ]
    return json.dumps(radar, indent=2), "\n".join(bullets)
413
+
414
+ # -----------------------------
415
+ # Glue: Input handling (upload or paste)
416
+ # -----------------------------
417
def get_text_from_inputs(file: Optional[gr.File], text: str) -> str:
    """Choose between uploaded-file text and pasted text.

    Whichever source yields more characters wins; ties go to the pasted
    text. The original's final ``return`` line carried stray non-standard
    leading whitespace (inconsistent indentation is a SyntaxError in
    Python) — normalized here without changing the selection logic.
    """
    file_text = load_document(file) if file else ""
    pasted = (text or "").strip()
    return file_text if len(file_text) > len(pasted) else pasted