bhoomi19 committed on
Commit
e631f5a
·
verified ·
1 Parent(s): 89c4cef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +291 -432
app.py CHANGED
@@ -1,502 +1,361 @@
1
  import streamlit as st
2
- import tempfile
3
  import os
4
  import re
5
- import io
6
  import json
7
- from typing import List, Dict, Tuple, Any, Optional
8
  import torch
9
- from transformers import AutoTokenizer, AutoModelForCausalLM
10
  from pypdf import PdfReader
11
  import docx
12
- import spacy
13
- import math
14
- import sys
15
- import subprocess
16
-
17
- # -------------------------
18
- # SPACES-SPECIFIC CONFIG
19
- # -------------------------
20
- # Hugging Face Spaces provide these tokens automatically
21
- HF_TOKEN = os.environ.get("HF_TOKEN")
22
 
23
- # Set page config as the VERY FIRST Streamlit command
24
  st.set_page_config(
25
- page_title="ClauseWise – Granite 3.2 (2B) Legal Assistant",
26
  page_icon="βš–οΈ",
27
- layout="wide",
28
- initial_sidebar_state="expanded"
29
  )
30
 
31
- # -------------------------
32
- # MODEL SETUP - Optimized for Spaces
33
- # -------------------------
34
- MODEL_ID = "ibm-granite/granite-3.2-2b-instruct"
35
-
36
- # Spaces hardware detection
37
- if torch.cuda.is_available():
38
- DEVICE = "cuda"
39
- DTYPE = torch.float16 # Use float16 for better memory usage
40
- elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
41
- DEVICE = "mps"
42
- DTYPE = torch.float16
43
- else:
44
- DEVICE = "cpu"
45
- DTYPE = torch.float32
46
 
47
- # Cache model properly for Spaces
48
- @st.cache_resource(show_spinner=True)
49
- def load_llm_model():
50
- """Load the LLM model optimized for Spaces"""
51
  try:
52
- st.info("πŸš€ Loading AI model... This may take a few minutes on first run.")
53
-
54
- # Load tokenizer
55
- tokenizer = AutoTokenizer.from_pretrained(
56
- MODEL_ID,
57
- token=HF_TOKEN,
58
- trust_remote_code=True
59
- )
60
-
61
- # Load model with optimized settings for Spaces
62
- model = AutoModelForCausalLM.from_pretrained(
63
- MODEL_ID,
64
- token=HF_TOKEN,
65
- torch_dtype=DTYPE,
66
- trust_remote_code=True,
67
- device_map="auto" if DEVICE != "cpu" else None,
68
- low_cpu_mem_usage=True
69
  )
70
-
71
- # If no device map, move manually
72
- if DEVICE != "cpu" and model.device.type != DEVICE:
73
- model = model.to(DEVICE)
74
-
75
- st.success("βœ… Model loaded successfully!")
76
- return tokenizer, model
77
-
78
  except Exception as e:
79
- st.error(f"❌ Error loading model: {str(e)}")
80
- # Return a fallback that won't break the app
81
- return None, None
82
-
83
- # -------------------------
84
- # SPAcy SETUP
85
- # -------------------------
86
- try:
87
- nlp = spacy.load("en_core_web_sm")
88
- except OSError:
89
- with st.spinner("Downloading spaCy model..."):
90
- subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
91
- nlp = spacy.load("en_core_web_sm")
92
-
93
- # -------------------------
94
- # SESSION STATE INITIALIZATION
95
- # -------------------------
96
- def init_session_state():
97
- """Initialize all session state variables"""
98
- defaults = {
99
- 'model_loaded': False,
100
- 'text_data': "",
101
- 'simplified_clause': "",
102
- 'classification_results': "",
103
- 'ner_results': {},
104
- 'extracted_clauses': [],
105
- 'negotiation_alternatives': {},
106
- 'risk_prediction': {},
107
- 'fairness_score': 50,
108
- 'fairness_rationale': "",
109
- 'battle_results': "",
110
- 'sensitive_data_results': "",
111
- 'litigation_risk_results': "",
112
- }
113
-
114
- for key, value in defaults.items():
115
- if key not in st.session_state:
116
- st.session_state[key] = value
117
-
118
- init_session_state()
119
-
120
- # -------------------------
121
- # STREAMLIT UI - Spaces Optimized
122
- # -------------------------
123
-
124
- # Sidebar with Spaces info
125
- with st.sidebar:
126
- st.title("βš–οΈ ClauseWise")
127
- st.markdown("Legal AI Assistant powered by Granite 3.2 2B")
128
-
129
- # Spaces info
130
- st.markdown("---")
131
- st.markdown("**Hardware Info:**")
132
- st.write(f"Device: {DEVICE}")
133
- if torch.cuda.is_available():
134
- st.write(f"GPU: {torch.cuda.get_device_name()}")
135
-
136
- # File upload
137
- st.markdown("---")
138
- st.subheader("πŸ“ Document Input")
139
- uploaded_file = st.file_uploader(
140
- "Upload PDF/DOCX/TXT",
141
- type=["pdf", "docx", "txt"],
142
- help="Supported formats: PDF, Word, Text"
143
- )
144
-
145
- # Text input
146
- pasted_text = st.text_area(
147
- "Or paste text directly",
148
- height=150,
149
- placeholder="Paste your legal text here...",
150
- help="For best results, provide clear legal clauses or contract text"
151
- )
152
-
153
- # Load model button
154
- st.markdown("---")
155
- if st.button("πŸ”„ Initialize AI Model", type="primary"):
156
- with st.spinner("Loading AI model..."):
157
- tokenizer, model = load_llm_model()
158
- if tokenizer and model:
159
- st.session_state.model_loaded = True
160
- st.success("AI model ready!")
161
- else:
162
- st.error("Failed to load model")
163
-
164
- # Main area
165
- st.title("βš–οΈ ClauseWise – Legal AI Assistant")
166
- st.markdown("Analyze legal documents with AI-powered insights using IBM's Granite 3.2 2B model")
167
-
168
- # Process document input
169
- if uploaded_file or pasted_text:
170
- with st.spinner("Processing document..."):
171
- if uploaded_file:
172
- text_data = load_document(uploaded_file)
173
- else:
174
- text_data = pasted_text
175
-
176
- st.session_state.text_data = text_data
177
-
178
- # Show document preview
179
- with st.expander("πŸ“„ Document Preview", expanded=False):
180
- preview_text = text_data[:1500] + ("..." if len(text_data) > 1500 else "")
181
- st.text_area("Preview", preview_text, height=200, label_visibility="collapsed")
182
- st.caption(f"Document length: {len(text_data)} characters")
183
-
184
- # Warning if no model loaded
185
- if not st.session_state.model_loaded:
186
- st.warning("⚠️ Please initialize the AI model first using the button in the sidebar")
187
-
188
- # -------------------------
189
- # HELPER FUNCTIONS - Optimized for Spaces
190
- # -------------------------
191
-
192
- def load_document(file) -> str:
193
- """Load text from various document formats"""
194
- if not file:
195
- return ""
196
-
197
- name = (file.name or "").lower()
198
 
199
  try:
200
- if name.endswith(".pdf"):
201
- return load_text_from_pdf(file)
202
- elif name.endswith(".docx"):
203
- return load_text_from_docx(file)
204
- elif name.endswith(".txt"):
205
- return load_text_from_txt(file)
206
- else:
207
- # Try all formats
208
- for loader in [load_text_from_pdf, load_text_from_docx, load_text_from_txt]:
209
- try:
210
- return loader(file)
211
- except:
212
- continue
213
- return ""
214
  except Exception as e:
215
- st.error(f"Error reading document: {str(e)}")
216
- return ""
217
 
218
- def load_text_from_pdf(file_obj) -> str:
219
- """Extract text from PDF"""
220
  try:
221
- reader = PdfReader(file_obj)
 
 
 
222
  text = ""
223
  for page in reader.pages:
224
- page_text = page.extract_text() or ""
225
- text += page_text + "\n"
 
226
  return text.strip()
227
  except Exception as e:
228
- st.error(f"PDF reading error: {str(e)}")
229
- return ""
230
 
231
- def load_text_from_docx(file_obj) -> str:
232
- """Extract text from Word document"""
233
  try:
234
- doc = docx.Document(file_obj)
235
- return "\n".join([para.text for para in doc.paragraphs]).strip()
 
 
236
  except Exception as e:
237
- st.error(f"DOCX reading error: {str(e)}")
238
- return ""
239
 
240
- def load_text_from_txt(file_obj) -> str:
241
- """Extract text from text file"""
242
  try:
243
- content = file_obj.read()
244
- if isinstance(content, bytes):
245
- content = content.decode('utf-8', errors='ignore')
246
- return str(content).strip()
 
247
  except Exception as e:
248
- st.error(f"TXT reading error: {str(e)}")
249
- return ""
250
 
251
- def build_chat_prompt(system_prompt: str, user_prompt: str) -> str:
252
- """Build chat prompt for the model"""
253
- tokenizer, model = load_llm_model()
254
- if tokenizer is None:
255
- return f"{system_prompt}\n\n{user_prompt}"
256
 
257
- messages = [
258
- {"role": "system", "content": system_prompt},
259
- {"role": "user", "content": user_prompt}
260
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
- try:
263
- return tokenizer.apply_chat_template(
264
- messages,
265
- tokenize=False,
266
- add_generation_prompt=True
267
- )
268
- except Exception:
269
- # Fallback template
270
- return f"System: {system_prompt}\n\nUser: {user_prompt}\n\nAssistant:"
271
-
272
def llm_generate(system_prompt: str, user_prompt: str, max_new_tokens=512, temperature=0.3, top_p=0.9) -> str:
    """Generate a reply from the cached LLM for a system/user prompt pair.

    Args:
        system_prompt: Instruction text placed in the system role.
        user_prompt: The user's request.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.

    Returns:
        The generated text with the prompt removed, or a message starting
        with "❌" when the model is not loaded or generation fails.
    """
    tokenizer, model = load_llm_model()

    if tokenizer is None or model is None:
        return "❌ AI model not loaded. Please initialize the model first."

    try:
        prompt = build_chat_prompt(system_prompt, user_prompt)

        # Truncate the prompt to keep memory use bounded.
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=2048,
        ).to(DEVICE)

        # torch.no_grad() is the real context manager; the previous
        # torch.no_generation() does not exist and made every call fail.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
            )

        # A causal LM returns prompt + continuation. Drop the prompt tokens
        # by position instead of searching the decoded text for a marker,
        # which breaks once special tokens have been stripped.
        prompt_token_count = inputs["input_ids"].shape[-1]
        response = tokenizer.decode(
            outputs[0][prompt_token_count:],
            skip_special_tokens=True,
        )
        return response.strip()

    except Exception as e:
        return f"❌ Generation error: {str(e)}"
314
-
315
- # -------------------------
316
- # ANALYSIS FUNCTIONS - Simplified for Spaces
317
- # -------------------------
318
-
319
- def simplify_clause(clause: str) -> str:
320
- """Simplify legal clause to plain English"""
321
- system = """You are a legal assistant that rewrites complex legal clauses into simple, plain English.
322
- Keep the meaning exactly the same but make it easy for non-lawyers to understand.
323
- Focus on clarity and simplicity."""
324
 
325
- user = f"Rewrite this legal clause in plain English:\n\n{clause}"
326
- return llm_generate(system, user, max_new_tokens=400)
327
-
328
- def ner_entities(text: str) -> Dict[str, List[str]]:
329
- """Extract named entities using spaCy"""
330
- if not text:
331
- return {}
332
 
333
- doc = nlp(text[:10000]) # Limit for performance
334
- entities = {}
 
335
 
336
- for ent in doc.ents:
337
- entities.setdefault(ent.label_, []).append(ent.text)
 
 
 
 
 
 
 
 
338
 
339
- # Remove duplicates
340
- return {k: list(set(v)) for k, v in entities.items()}
341
 
342
- CLAUSE_SPLIT_REGEX = re.compile(r"(?:(?:^\s*\d+(?:\.\d+)*[.)]\s+)|(?:(?<=[.;])\s+(?=[A-Z]))", re.MULTILINE)
343
-
344
- def extract_clauses(text: str) -> List[str]:
345
- """Extract individual clauses from legal text"""
346
- if not text:
347
- return []
 
 
 
 
 
348
 
349
- # Simple clause splitting
350
- clauses = re.split(CLAUSE_SPLIT_REGEX, text)
351
- clauses = [c.strip() for c in clauses if len(c.strip()) > 50] # Minimum length
 
 
 
 
 
 
 
 
 
 
352
 
353
- # Remove duplicates based on simplified text
354
- seen = set()
355
- unique_clauses = []
 
 
 
 
 
 
 
 
356
 
357
- for clause in clauses:
358
- # Create a simple fingerprint
359
- simple = re.sub(r'\s+', ' ', clause.lower())[:100]
360
- if simple not in seen:
361
- seen.add(simple)
362
- unique_clauses.append(clause)
363
 
364
- return unique_clauses[:20] # Limit for performance
365
 
366
- # -------------------------
367
- # MAIN TABS INTERFACE
368
- # -------------------------
 
 
 
 
369
 
370
- if st.session_state.text_data:
371
- tab1, tab2, tab3, tab4 = st.tabs([
372
- "πŸ” Clause Analysis",
373
- "πŸ“Š Document Insights",
374
- "βš–οΈ Legal Review",
375
- "πŸ›‘οΈ Risk Assessment"
376
- ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
 
378
- with tab1:
379
- st.subheader("Clause Analysis")
 
 
 
380
 
381
- col1, col2 = st.columns(2)
 
382
 
383
  with col1:
384
- if st.button("🧹 Simplify Clauses", use_container_width=True):
385
- if st.session_state.model_loaded:
386
- with st.spinner("Simplifying clauses..."):
387
- simplified = simplify_clause(st.session_state.text_data[:2000])
388
- st.session_state.simplified_clause = simplified
389
- else:
390
- st.warning("Please initialize AI model first")
391
-
392
- if st.session_state.simplified_clause:
393
- st.subheader("Simplified Version")
394
- st.write(st.session_state.simplified_clause)
395
 
396
  with col2:
397
- if st.button("πŸ” Extract Entities", use_container_width=True):
398
- with st.spinner("Extracting named entities..."):
399
- entities = ner_entities(st.session_state.text_data)
400
- st.session_state.ner_results = entities
401
-
402
- if st.session_state.ner_results:
403
- st.subheader("Named Entities")
404
- for label, values in st.session_state.ner_results.items():
405
- with st.expander(f"{label} ({len(values)})"):
406
- st.write(", ".join(values[:10])) # Limit display
407
-
408
- with tab2:
409
- st.subheader("Document Insights")
410
 
411
- col1, col2 = st.columns(2)
 
412
 
413
- with col1:
414
- if st.button("πŸ“‘ Extract Clauses", use_container_width=True):
415
- with st.spinner("Extracting clauses..."):
416
- clauses = extract_clauses(st.session_state.text_data)
417
- st.session_state.extracted_clauses = clauses
 
 
 
 
 
 
 
 
 
418
 
419
- if st.session_state.extracted_clauses:
420
- st.subheader(f"Extracted Clauses ({len(st.session_state.extracted_clauses)})")
421
- for i, clause in enumerate(st.session_state.extracted_clauses[:5], 1):
422
- with st.expander(f"Clause {i}"):
423
- st.write(clause[:500] + "..." if len(clause) > 500 else clause)
424
 
425
- with col2:
426
- if st.button("πŸ“‹ Classify Document", use_container_width=True):
427
- if st.session_state.model_loaded:
428
- with st.spinner("Classifying document..."):
429
- doc_type = classify_document(st.session_state.text_data)
430
- st.session_state.classification_results = doc_type
431
- else:
432
- st.warning("Please initialize AI model first")
433
-
434
- if st.session_state.classification_results:
435
- st.subheader("Document Type")
436
- st.info(st.session_state.classification_results)
437
-
438
- with tab3:
439
- st.subheader("Legal Review Tools")
440
- st.info("More advanced legal review features will be available here")
441
 
442
- with tab4:
443
- st.subheader("Risk Assessment")
444
- st.info("Risk analysis features will be available here")
 
 
 
 
 
445
 
446
  else:
447
- # Welcome screen when no document is loaded
448
  st.markdown("""
449
- ## πŸ‘‹ Welcome to ClauseWise
450
 
451
- To get started:
452
 
453
- 1. **Upload a document** (PDF, Word, or Text) in the sidebar, OR
 
454
  2. **Paste your legal text** in the text area
455
- 3. **Initialize the AI model** using the button in the sidebar
456
- 4. **Choose an analysis tool** from the tabs above
457
-
458
- ### πŸ“‹ Supported Analyses:
459
 
460
- - **Clause Simplification**: Rewrite legal jargon in plain English
461
- - **Entity Extraction**: Identify people, organizations, dates
462
- - **Clause Extraction**: Break down documents into individual clauses
463
- - **Document Classification**: Identify the type of legal document
 
464
 
465
- ### ⚠️ Important Notes for Spaces:
 
 
 
 
 
 
 
 
466
 
467
- - Model loading may take 2-5 minutes on first use
468
- - Some features require GPU acceleration
469
- - Large documents may be processed in chunks
 
 
470
  """)
471
 
472
- # -------------------------
473
- # MISSING FUNCTION IMPLEMENTATIONS
474
- # -------------------------
475
-
476
- def classify_document(text: str) -> str:
477
- """Classify document type"""
478
- system = """You are a legal document classifier. Analyze the text and classify it into one of these types:
479
- - Non-Disclosure Agreement (NDA)
480
- - Employment Contract
481
- - Service Agreement
482
- - Lease Agreement
483
- - Sales Agreement
484
- - Terms of Service
485
- - Other Legal Document
486
-
487
- Respond with ONLY the document type name."""
488
-
489
- user = f"Classify this legal document:\n\n{text[:3000]}"
490
- response = llm_generate(system, user, max_new_tokens=100)
491
- return response.strip()
492
-
493
- # Add other functions as needed with simplified implementations for Spaces
494
-
495
- # -------------------------
496
- # FOOTER
497
- # -------------------------
498
  st.markdown("---")
499
- st.markdown(
500
- "**ClauseWise** | Powered by IBM Granite 3.2 2B | "
501
- "Deployed on Hugging Face Spaces πŸ€—"
502
- )
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
 
2
  import os
3
  import re
 
4
  import json
5
+ from typing import List, Dict
6
  import torch
7
+ from transformers import pipeline
8
  from pypdf import PdfReader
9
  import docx
10
+ import io
 
 
 
 
 
 
 
 
 
11
 
12
# Set page config FIRST - this is critical for Streamlit
st.set_page_config(
    page_title="ClauseWise Legal Assistant",
    page_icon="⚖️",
    layout="wide",
)

# Use a small model so the optional AI summary fits in Spaces memory.
# NOTE(review): DialoGPT-small is the ~117M-parameter checkpoint according to
# its model card (the earlier "334M" figure looks like the file size in MB).
MODEL_ID = "microsoft/DialoGPT-small"
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
@st.cache_resource(show_spinner=False)
def _load_pipeline():
    """Build the text-generation pipeline for ``MODEL_ID``.

    Raises on failure instead of returning ``None`` so that a failed load
    is not stored by ``st.cache_resource`` and can be retried on the next
    call.
    """
    use_gpu = torch.cuda.is_available()
    return pipeline(
        "text-generation",
        model=MODEL_ID,
        torch_dtype=torch.float16 if use_gpu else torch.float32,
        device_map="auto" if use_gpu else None,
        max_length=512,
    )


def load_model():
    """Return the cached text-generation pipeline, or ``None`` if loading fails.

    The failure is reported with ``st.error`` here, outside the cached
    helper, so the ``None`` result itself is never cached.
    """
    try:
        return _load_pipeline()
    except Exception as e:
        st.error(f"Model loading failed: {e}")
        return None
38
+
39
def simple_llm_generate(prompt: str, max_length=200) -> str:
    """Generate a continuation of ``prompt`` with the cached pipeline.

    Args:
        prompt: Text to continue.
        max_length: Maximum number of *newly generated* tokens. It is passed
            as ``max_new_tokens`` so that a long prompt no longer uses up
            the whole budget before any output is produced.

    Returns:
        The generated text without the prompt, a fixed notice when the model
        is unavailable, or a "Generation error: ..." message on failure.
    """
    generator = load_model()
    if generator is None:
        return "Model not available. Using demo mode."

    try:
        result = generator(
            prompt,
            max_new_tokens=max_length,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            # Take the padding id from the loaded tokenizer rather than
            # hard-coding the GPT-2 value.
            pad_token_id=generator.tokenizer.eos_token_id,
            # Ask the pipeline to return only the continuation.
            return_full_text=False,
        )
        return result[0]['generated_text'].strip()
    except Exception as e:
        return f"Generation error: {str(e)}"
 
61
 
62
+ # Document loading functions
63
def load_text_from_pdf(file_obj) -> str:
    """Extract the text of every page of a PDF supplied as a file-like object.

    Args:
        file_obj: Object with ``read()`` and ``seek()``; it is rewound to
            the start after reading so it can be read again.

    Returns:
        The page texts joined by newlines and stripped, or a string starting
        with "Error reading PDF: " if anything fails (callers test for the
        "Error" prefix).
    """
    try:
        file_content = file_obj.read()
        file_obj.seek(0)  # Reset file pointer
        reader = PdfReader(io.BytesIO(file_content))
        # Collect the non-empty pages and join once instead of growing a
        # string page by page.
        page_texts = []
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text + "\n")
        return "".join(page_texts).strip()
    except Exception as e:
        return f"Error reading PDF: {str(e)}"
 
77
 
78
def load_text_from_docx(file_obj):
    """Return the non-blank paragraphs of a Word document, newline-joined.

    The file-like object is read in full and rewound. Any failure is
    reported as a string starting with "Error reading DOCX: ".
    """
    try:
        raw_bytes = file_obj.read()
        file_obj.seek(0)
        document = docx.Document(io.BytesIO(raw_bytes))
        kept = []
        for paragraph in document.paragraphs:
            if paragraph.text.strip():
                kept.append(paragraph.text)
        return "\n".join(kept)
    except Exception as e:
        return f"Error reading DOCX: {str(e)}"
 
86
 
87
def load_text_from_txt(file_obj) -> str:
    """Read a plain-text upload and return its content as ``str``.

    Bytes are decoded as UTF-8 with undecodable bytes dropped. The
    ``utf-8-sig`` codec is used so that a leading byte-order mark (common
    in files saved on Windows) does not end up in the text.

    Args:
        file_obj: Object with ``read()`` and ``seek()``; it is rewound after
            reading.

    Returns:
        The decoded text, or a string starting with "Error reading TXT: "
        if reading fails.
    """
    try:
        file_content = file_obj.read()
        file_obj.seek(0)
        if isinstance(file_content, bytes):
            return file_content.decode('utf-8-sig', errors='ignore')
        return str(file_content)
    except Exception as e:
        return f"Error reading TXT: {str(e)}"
 
96
 
97
def load_document(file) -> str:
    """Universal document loader: pick a loader from the file extension.

    Args:
        file: Uploaded file object exposing ``name``, ``read()`` and
            ``seek()``; a falsy value yields an empty string.

    Returns:
        The extracted text. On failure the result starts with "Error",
        which is the prefix callers in this file test for. The previous
        fallback message did not carry that prefix and was therefore
        analysed as if it were the document.
    """
    if not file:
        return ""

    # Tolerate a missing or None name instead of raising AttributeError.
    filename = (getattr(file, "name", None) or "").lower()
    if filename.endswith('.pdf'):
        return load_text_from_pdf(file)
    if filename.endswith('.docx'):
        return load_text_from_docx(file)
    if filename.endswith('.txt'):
        return load_text_from_txt(file)

    # Unknown extension: try each loader in turn and keep the first usable text.
    for loader in (load_text_from_pdf, load_text_from_docx, load_text_from_txt):
        try:
            result = loader(file)
        except Exception:
            # Best effort: a loader that fails just means "not this format".
            continue
        if result and not result.startswith("Error"):
            return result
    return "Error: could not read document"
119
+
120
+ # FIXED regex patterns - simple and working
121
def extract_clauses_simple(text: str) -> List[str]:
    """Split legal text into candidate clauses using three regex strategies.

    The text is split (1) after sentence punctuation, (2) at numbered or
    bulleted lines and (3) at "Heading:" style section markers. The pieces
    from all three passes are concatenated in that order, stripped,
    filtered to lengths strictly between 30 and 1000 characters, and
    de-duplicated ignoring case and whitespace differences.

    Args:
        text: Raw document text; an empty value yields an empty list.

    Returns:
        At most 20 clauses, in first-seen order.
    """
    if not text:
        return []

    candidates = (
        re.split(r'[.;!?]\s+', text)                    # sentence-like pieces
        + re.split(r'\n\s*(?:\d+\.|\*|\-)\s+', text)    # numbered / bulleted items
        + re.split(r'\n\s*[A-Z][A-Za-z\s]+\:', text)    # "Section Name:" markers
    )

    seen = set()  # normalised fingerprints, for O(1) duplicate checks
    cleaned_clauses = []
    for candidate in candidates:
        clause = candidate.strip()
        if not 30 < len(clause) < 1000:
            continue
        fingerprint = re.sub(r'\s+', ' ', clause.lower())
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        cleaned_clauses.append(clause)
        if len(cleaned_clauses) == 20:  # limit to 20 clauses
            break

    return cleaned_clauses
 
154
 
155
def rule_based_analysis(text):
    """Rule-based analysis without AI.

    Keyword matching is anchored at the start of a word so that, for
    example, "nda" no longer matches inside "standard", "rent" inside
    "current" or "term" inside "determine". Inflected forms such as
    "terms" or "rental" still match.

    Args:
        text: Document text to analyse.

    Returns:
        A ``(results, clauses)`` tuple. ``results`` is a dict with the keys
        ``character_count``, ``word_count``, ``clauses_found``,
        ``risk_terms`` (risk level -> matched terms), ``doc_type`` and
        ``doc_type_confidence`` (0-100). ``clauses`` is the list returned
        by ``extract_clauses_simple``.
    """
    results = {}

    # Basic statistics
    results['character_count'] = len(text)
    results['word_count'] = len(text.split())

    # Clause analysis
    clauses = extract_clauses_simple(text)
    results['clauses_found'] = len(clauses)

    text_lower = text.lower()

    # Risk word detection
    risk_words = {
        'high_risk': ['liable', 'indemnify', 'damages', 'breach', 'termination', 'penalty'],
        'medium_risk': ['confidential', 'proprietary', 'warranty', 'obligation'],
        'low_risk': ['agree', 'party', 'contract', 'term'],
    }

    found_risks = {}
    for risk_level, words in risk_words.items():
        found = [
            word for word in words
            if re.search(r'\b' + re.escape(word), text_lower)
        ]
        if found:
            found_risks[risk_level] = found

    results['risk_terms'] = found_risks

    # Simple document type detection: count keyword hits per document type.
    doc_type_patterns = {
        "Non-Disclosure Agreement": r'\b(?:confidential|non.?disclosure|ndas?\b)',
        "Employment Contract": r'\b(?:employ|salary|duties|terminat)',
        "Lease Agreement": r'\b(?:lease|tenant|rent|property)',
        "Service Agreement": r'\b(?:service|provider|client|deliverable)',
        "Sales Agreement": r'\b(?:sale|purchase|price|payment)',
    }
    doc_type_scores = {
        doc_type: len(re.findall(pattern, text_lower))
        for doc_type, pattern in doc_type_patterns.items()
    }

    best_type = max(doc_type_scores.items(), key=lambda x: x[1])
    results['doc_type'] = best_type[0] if best_type[1] > 0 else "General Contract"
    results['doc_type_confidence'] = min(100, best_type[1] * 20)  # Simple confidence score

    return results, clauses
198
 
199
# Seed the session keys used by the UI on the first run of a session.
# Factories are used so every session gets its own fresh dict / list.
for _state_key, _make_default in (
    ('text_data', str),
    ('analysis_results', dict),
    ('clauses', list),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()
206
 
207
# UI Layout
st.title("⚖️ ClauseWise Legal Assistant")
st.markdown("**Lightweight legal document analysis**")

# Sidebar: document input and the "Analyze" action
with st.sidebar:
    st.header("📁 Document Input")

    uploaded_file = st.file_uploader(
        "Upload Document",
        type=["pdf", "docx", "txt"],
        help="Supported formats: PDF, Word, Text",
    )

    pasted_text = st.text_area("Or paste text below:", height=150, placeholder="Paste your legal text here...")

    process_btn = st.button("📊 Analyze Document", type="primary", use_container_width=True)

    if process_btn:
        # None means "the user supplied nothing". In that case only the
        # input prompt is shown, and text left over from an earlier run is
        # neither re-analysed nor reported as a load failure.
        new_text = None
        if uploaded_file:
            with st.spinner("Reading document..."):
                new_text = load_document(uploaded_file)
        elif pasted_text.strip():
            new_text = pasted_text.strip()
        else:
            st.error("Please upload a file or paste some text")

        if new_text is not None:
            st.session_state.text_data = new_text
            # Loader failures are signalled by a string starting with "Error".
            if new_text and not new_text.startswith("Error"):
                st.success(f"✅ Loaded {len(new_text)} characters")

                with st.spinner("Analyzing content..."):
                    st.session_state.analysis_results, st.session_state.clauses = rule_based_analysis(new_text)
            else:
                st.error("Failed to load document text")
241
+
242
# Main content area: results when a document is loaded, otherwise the welcome text
if st.session_state.text_data and not st.session_state.text_data.startswith("Error"):
    # Document preview (first 1500 characters)
    with st.expander("📄 Document Preview", expanded=False):
        preview_text = st.session_state.text_data
        # A non-empty label is supplied and hidden via label_visibility;
        # Streamlit discourages empty widget labels.
        if len(preview_text) > 1500:
            st.text_area("Document preview", preview_text[:1500] + "...", height=200, label_visibility="collapsed")
            st.caption(f"Preview truncated. Full document: {len(preview_text)} characters")
        else:
            st.text_area("Document preview", preview_text, height=200, label_visibility="collapsed")

    # Analysis results
    if st.session_state.analysis_results:
        results = st.session_state.analysis_results

        st.subheader("📊 Analysis Results")

        # Key metrics
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("Document Type", results['doc_type'])

        with col2:
            st.metric("Confidence", f"{results['doc_type_confidence']}%")

        with col3:
            st.metric("Clauses Found", results['clauses_found'])

        with col4:
            st.metric("Word Count", results['word_count'])

        # Risk analysis
        if results['risk_terms']:
            st.subheader("⚠️ Risk Analysis")

            risk_colors = {
                'high_risk': 'red',
                'medium_risk': 'orange',
                'low_risk': 'green',
            }
            for risk_level, terms in results['risk_terms'].items():
                risk_display = risk_level.replace('_', ' ').title()
                color = risk_colors.get(risk_level, 'gray')
                # The colour was computed but never used before; apply it
                # with Streamlit's ":color[text]" markdown directive.
                st.markdown(f":{color}[**{risk_display}**]: {', '.join(terms)}")

    # Clauses display (first 10 only)
    if st.session_state.clauses:
        st.subheader(f"📑 Extracted Clauses ({len(st.session_state.clauses)})")

        for i, clause in enumerate(st.session_state.clauses[:10], 1):
            with st.expander(f"Clause {i} ({len(clause)} chars)"):
                st.write(clause)

        if len(st.session_state.clauses) > 10:
            st.info(f"Showing first 10 of {len(st.session_state.clauses)} clauses")

    # AI Analysis Section (optional)
    st.subheader("🤖 AI Analysis (Optional)")

    if st.button("Generate AI Summary", key="ai_summary"):
        if len(st.session_state.text_data) > 100:
            with st.spinner("AI is analyzing..."):
                # Only the first 1000 characters are sent to the model.
                prompt = f"Provide a concise summary of this legal document:\n\n{st.session_state.text_data[:1000]}"
                ai_summary = simple_llm_generate(prompt, max_length=300)
                st.write(ai_summary)
        else:
            st.warning("Document too short for AI analysis")

else:
    # Welcome screen
    st.markdown("""
## 👋 Welcome to ClauseWise!

A lightweight legal document analyzer optimized for Hugging Face Spaces.

### 🚀 How to use:
1. **Upload a document** (PDF, DOCX, TXT) in the sidebar **OR**
2. **Paste your legal text** in the text area
3. Click **"Analyze Document"** to process
4. Review the automated analysis results

### 📋 What it analyzes:
- **Document type** (NDA, Employment, Lease, etc.)
- **Risk terms** and potential issues
- **Clause extraction** and organization
- **Basic statistics** and metrics

### 🧪 Try this sample text:
```
This Non-Disclosure Agreement (the "Agreement") is entered into between
Company ABC ("Disclosing Party") and John Smith ("Receiving Party").
The Receiving Party agrees to maintain the confidentiality of all
proprietary information disclosed under this Agreement for a period
of three years following termination. Any breach of this Agreement
may result in legal action and liability for damages.
```

### ⚠️ Important Notes:
- Uses rule-based analysis for reliability
- Optional AI features use small, fast models
- Works best with clear legal text
- Free and open source
""")
346
 
347
# Footer
st.markdown("---")
st.caption("🔒 ClauseWise Demo | Optimized for Hugging Face Spaces | No data stored")

# Custom CSS: add top padding to the main container and make buttons full width
st.markdown("""
<style>
.main .block-container {
    padding-top: 2rem;
}
.stButton button {
    width: 100%;
}
</style>
""", unsafe_allow_html=True)