File size: 21,313 Bytes
9b1c753
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
"""Unsupervised Risk Discovery System - No Hardcoded Categories!
"""
import re
from typing import Dict, List, Tuple, Any
from collections import Counter
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation

class UnsupervisedRiskDiscovery:
    """
    Discovers risk patterns in legal contracts using unsupervised learning.

    Clauses are vectorized with TF-IDF and grouped with K-Means. Each cluster
    is then summarized by its highest-weighted terms and by regex-based risk
    features. Cluster membership is learned from the text; the readable
    cluster names are derived from a small keyword table in
    ``_generate_cluster_name``.
    """

    def __init__(self, n_clusters: int = 7, random_state: int = 42):
        """
        Set up the vectorizer, the clustering model and the regex tables.

        Args:
            n_clusters: Number of K-Means clusters (risk patterns) to discover.
            random_state: Seed passed to K-Means.
        """
        self.n_clusters = n_clusters
        self.random_state = random_state

        # Initialize components
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=10000,
            ngram_range=(1, 3),
            stop_words='english',
            lowercase=True,
            min_df=2,
            max_df=0.95
        )

        self.kmeans = KMeans(
            n_clusters=n_clusters,
            random_state=random_state,
            n_init=10
        )

        # Risk pattern storage (filled by discover_risk_patterns)
        self.discovered_patterns = {}
        self.risk_features = {}
        self.cluster_labels = None
        self.feature_matrix = None

        # Legal language patterns (domain-agnostic).
        # All patterns are applied case-insensitively by extract_risk_features,
        # so capitalized alternatives such as "Party" or "USD" also match.
        self.legal_indicators = {
            'obligation_strength': r'\b(?:shall|must|required|mandatory|obligated|bound)\b',
            'prohibition_terms': r'\b(?:shall not|must not|prohibited|forbidden|restricted)\b',
            'conditional_risk': r'\b(?:if|unless|provided|subject to|in the event|failure to)\b',
            'liability_terms': r'\b(?:liable|responsibility|damages|penalty|loss|harm)\b',
            # "expir" is a stem: \w* lets it match "expire", "expires", "expiration", ...
            # (a bare stem between \b anchors could never match a real word).
            'temporal_urgency': r'\b(?:immediately|within|before|after|deadline|expir\w*)\b',
            'monetary_terms': r'\$|USD|dollar|payment|fee|cost|expense|fine',
            'parties': r'\b(?:Party|Parties|Company|Corporation|Licensor|Licensee|Vendor|Customer)\b',
            'dates': r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}|\d{1,2}[/-]\d{1,2}[/-]\d{2,4}'
        }

        # Legal complexity indicators
        self.complexity_indicators = {
            'modal_verbs': r'\b(?:shall|must|may|should|will|might|could|would)\b',
            'conditional_terms': r'\b(?:if|unless|provided|subject to|in the event|notwithstanding)\b',
            'legal_conjunctions': r'\b(?:whereas|therefore|furthermore|moreover|however)\b',
            'obligation_terms': r'\b(?:agrees?|undertakes?|covenants?|warrants?|represents?)\b'
        }

    def clean_clause_text(self, text: str) -> str:
        """
        Clean and normalize clause text.

        Characters other than word characters, whitespace and the punctuation
        ``. , ; : ( ) " -`` are replaced by spaces, then every run of
        whitespace is collapsed to a single space and the ends are trimmed.

        Args:
            text: Raw clause text.

        Returns:
            The normalized text, or ``""`` when ``text`` is not a string.
        """
        if not isinstance(text, str):
            return ""

        # Remove special characters but keep legal punctuation
        text = re.sub(r'[^\w\s.,;:()"-]', ' ', text)

        # Collapse whitespace AFTER the substitution above, otherwise the
        # replacement spaces would leave runs of blanks in the result.
        return re.sub(r'\s+', ' ', text).strip()

    def extract_risk_features(self, clause_text: str) -> Dict[str, float]:
        """
        Extract numerical features that indicate risk levels (domain-agnostic).

        Args:
            clause_text: Raw clause text. Non-string values are treated as an
                empty clause so that one bad record does not abort a batch.

        Returns:
            A dict with basic text statistics (``clause_length``,
            ``sentence_count``, ``avg_word_length``), a ``<name>_count`` and
            ``<name>_density`` entry per legal indicator, a
            ``<name>_complexity`` entry per complexity indicator, and the
            composite scores ``obligation_strength``, ``legal_complexity`` and
            ``risk_intensity``. Densities are matches per whitespace-separated
            word and are 0 for an empty clause.
        """
        if not isinstance(clause_text, str):
            clause_text = ""

        words = clause_text.lower().split()
        n_words = len(words)

        features = {}

        # Basic text statistics
        features['clause_length'] = n_words
        # Count only non-empty segments: splitting "A. B." yields a trailing
        # empty string that must not be counted as a sentence.
        features['sentence_count'] = sum(
            1 for segment in re.split(r'[.!?]+', clause_text) if segment.strip()
        )
        features['avg_word_length'] = (
            float(np.mean([len(word) for word in words])) if words else 0.0
        )

        # Legal language intensity. Matching is case-insensitive because
        # several patterns contain capitalized alternatives.
        for pattern_name, pattern in self.legal_indicators.items():
            matches = len(re.findall(pattern, clause_text, flags=re.IGNORECASE))
            features[f'{pattern_name}_count'] = matches
            features[f'{pattern_name}_density'] = matches / n_words if n_words else 0

        # Legal complexity features
        for pattern_name, pattern in self.complexity_indicators.items():
            matches = len(re.findall(pattern, clause_text, flags=re.IGNORECASE))
            features[f'{pattern_name}_complexity'] = matches / n_words if n_words else 0

        # Risk intensity indicators (weighted sums of the densities above)
        features['obligation_strength'] = (
            features.get('obligation_strength_density', 0) * 2 +
            features.get('modal_verbs_complexity', 0)
        )

        features['legal_complexity'] = (
            features.get('conditional_terms_complexity', 0) +
            features.get('legal_conjunctions_complexity', 0) +
            features.get('obligation_terms_complexity', 0)
        )

        features['risk_intensity'] = (
            features.get('liability_terms_density', 0) * 2 +
            features.get('prohibition_terms_density', 0) +
            features.get('conditional_risk_density', 0)
        )

        return features

    def discover_risk_patterns(self, clause_texts: List[str]) -> Dict[str, Any]:
        """
        Discover risk patterns using unsupervised clustering.

        Fits the TF-IDF vectorizer and K-Means on ``clause_texts``, stores the
        feature matrix, labels and discovered patterns on the instance, and
        prints a progress report.

        Args:
            clause_texts: Clause texts to cluster.

        Returns:
            A dict with keys ``method``, ``n_clusters``,
            ``discovered_patterns``, ``cluster_labels`` and
            ``quality_metrics`` (``silhouette_score`` and ``n_patterns``).

        Raises:
            ValueError: If ``clause_texts`` is empty.
        """
        if len(clause_texts) == 0:
            raise ValueError("clause_texts must contain at least one clause")

        print(f"🔍 Discovering risk patterns from {len(clause_texts)} clauses...")

        # Clean texts
        cleaned_texts = [self.clean_clause_text(text) for text in clause_texts]

        # Extract TF-IDF features
        print("📊 Extracting TF-IDF features...")
        self.feature_matrix = self.tfidf_vectorizer.fit_transform(cleaned_texts)

        # Perform clustering
        print(f"🎯 Clustering into {self.n_clusters} risk patterns...")
        self.cluster_labels = self.kmeans.fit_predict(self.feature_matrix)

        # Extract risk features for each clause (from the raw text)
        print("⚖️ Extracting legal risk features...")
        risk_features_list = [self.extract_risk_features(text) for text in clause_texts]

        # Analyze discovered clusters
        self.discovered_patterns = self._analyze_clusters(
            cleaned_texts, self.cluster_labels, risk_features_list
        )

        print("✅ Risk pattern discovery complete!")
        print(f"📋 Discovered {len(self.discovered_patterns)} risk patterns:")

        for position, (pattern_name, details) in enumerate(self.discovered_patterns.items(), start=1):
            print(f"  {position}. {pattern_name}: {details['clause_count']} clauses")
            print(f"     Key terms: {', '.join(details['key_terms'][:5])}")
            print(f"     Risk intensity: {details['avg_risk_intensity']:.3f}")

        # Calculate quality metrics
        from sklearn.metrics import silhouette_score
        try:
            silhouette = silhouette_score(self.feature_matrix, self.cluster_labels)
        except Exception as exc:
            # The score is a best-effort diagnostic: report 0.0 instead of
            # failing the whole discovery, but do not swallow
            # KeyboardInterrupt/SystemExit as a bare ``except`` would.
            print(f"Warning: silhouette score unavailable ({exc}); using 0.0")
            silhouette = 0.0

        # Return structured results for comparison
        return {
            'method': 'K-Means_Clustering',
            'n_clusters': self.n_clusters,
            'discovered_patterns': self.discovered_patterns,
            'cluster_labels': self.cluster_labels,
            'quality_metrics': {
                'silhouette_score': silhouette,
                'n_patterns': len(self.discovered_patterns)
            }
        }

    def _analyze_clusters(self, texts: List[str], labels: np.ndarray,
                         risk_features: List[Dict]) -> Dict[str, Any]:
        """
        Analyze and name discovered clusters.

        Args:
            texts: Cleaned clause texts, aligned with ``labels``.
            labels: Cluster id assigned to each clause.
            risk_features: Per-clause feature dicts, aligned with ``texts``.

        Returns:
            A dict mapping a generated cluster name to its summary
            (``cluster_id``, ``clause_count``, ``key_terms``, averaged risk
            scores, up to three ``sample_clauses`` and ``risk_features``).
            When two clusters would get the same name, the later one is
            suffixed with ``_<cluster_id>`` so that no cluster is lost.
        """
        patterns = {}
        labels = np.asarray(labels)

        # Get feature names
        feature_names = self.tfidf_vectorizer.get_feature_names_out()

        for cluster_id in range(self.n_clusters):
            # Get clauses in this cluster
            member_indices = np.flatnonzero(labels == cluster_id)
            cluster_texts = [texts[i] for i in member_indices]
            cluster_features = [risk_features[i] for i in member_indices]

            # Get top terms for this cluster (largest centroid weights first)
            cluster_center = self.kmeans.cluster_centers_[cluster_id]
            top_indices = cluster_center.argsort()[-20:][::-1]
            top_terms = [feature_names[i] for i in top_indices]

            # Calculate average risk features
            avg_features = {}
            if cluster_features:
                for key in cluster_features[0]:
                    avg_features[key] = float(np.mean([f.get(key, 0) for f in cluster_features]))

            # Generate cluster name based on top terms and risk characteristics
            cluster_name = self._generate_cluster_name(top_terms, avg_features)
            if cluster_name in patterns:
                # Names are built from a handful of themes and intensities, so
                # collisions are likely; keep both clusters instead of
                # silently overwriting the earlier one.
                cluster_name = f"{cluster_name}_{cluster_id}"

            patterns[cluster_name] = {
                'cluster_id': cluster_id,
                'clause_count': len(cluster_texts),
                'key_terms': top_terms,
                'avg_risk_intensity': avg_features.get('risk_intensity', 0),
                'avg_legal_complexity': avg_features.get('legal_complexity', 0),
                'avg_obligation_strength': avg_features.get('obligation_strength', 0),
                'sample_clauses': cluster_texts[:3],
                'risk_features': avg_features
            }

        return patterns

    def _generate_cluster_name(self, top_terms: List[str], avg_features: Dict[str, float]) -> str:
        """
        Generate a readable name for a discovered cluster.

        Args:
            top_terms: Cluster terms, most important first; only the first ten
                are scored.
            avg_features: Averaged risk features; ``risk_intensity`` selects
                the intensity prefix.

        Returns:
            ``"<intensity>_<theme>_pattern"``, where the theme is the keyword
            group matching the most top terms, or ``general`` when no group
            matches at all.
        """
        # Analyze top terms to identify risk theme.
        # NOTE: keywords are matched as substrings, so a short keyword such as
        # 'end' also hits terms like 'vendor'.
        term_analysis = {
            'liability': ['liable', 'liability', 'damages', 'loss', 'harm', 'injury'],
            'obligation': ['shall', 'must', 'required', 'obligation', 'duty'],
            'indemnity': ['indemnify', 'indemnification', 'defend', 'hold harmless'],
            'termination': ['terminate', 'termination', 'end', 'expire', 'breach'],
            'intellectual_property': ['intellectual', 'property', 'patent', 'copyright', 'trademark'],
            'confidentiality': ['confidential', 'confidentiality', 'non-disclosure', 'proprietary'],
            'compliance': ['comply', 'compliance', 'regulation', 'law', 'legal']
        }

        # Score each theme based on term presence
        leading_terms = [term.lower() for term in top_terms[:10]]
        theme_scores = {
            theme: sum(1 for term in leading_terms if any(kw in term for kw in keywords))
            for theme, keywords in term_analysis.items()
        }

        # Get best matching theme; ties go to the first theme in the table.
        best_theme = max(theme_scores, key=theme_scores.get)
        if theme_scores[best_theme] == 0:
            # Nothing matched: do not mislabel the cluster with the first theme.
            best_theme = 'general'

        # Add intensity modifier based on risk features
        risk_intensity = avg_features.get('risk_intensity', 0)
        if risk_intensity > 0.1:
            intensity = 'high_risk'
        elif risk_intensity > 0.05:
            intensity = 'moderate_risk'
        else:
            intensity = 'low_risk'

        return f"{intensity}_{best_theme}_pattern"

    def get_risk_labels(self, clause_texts: List[str]) -> np.ndarray:
        """
        Get risk cluster labels for new clause texts.

        Args:
            clause_texts: Clause texts to assign to the fitted clusters.

        Returns:
            The result of ``KMeans.predict``: one cluster id per clause.

        Raises:
            ValueError: If ``discover_risk_patterns`` has not been run yet.
        """
        if self.cluster_labels is None:
            raise ValueError("Must discover patterns first using discover_risk_patterns()")

        cleaned_texts = [self.clean_clause_text(text) for text in clause_texts]
        feature_matrix = self.tfidf_vectorizer.transform(cleaned_texts)

        return self.kmeans.predict(feature_matrix)

    def get_discovered_risk_names(self) -> List[str]:
        """
        Get list of discovered risk pattern names.

        Raises:
            ValueError: If no patterns have been discovered yet.
        """
        if not self.discovered_patterns:
            raise ValueError("Must discover patterns first using discover_risk_patterns()")

        return list(self.discovered_patterns.keys())


class LDARiskDiscovery:
    """
    LDA-based risk discovery system - wrapper around TopicModelingRiskDiscovery.
    Provides a compatible interface with UnsupervisedRiskDiscovery while using LDA underneath.

    LDA (Latent Dirichlet Allocation) is used for legal text because it:
    - Discovers overlapping risk categories (clauses can belong to multiple topics)
    - Provides probability distributions over risk types
    - Tends to balance clauses across discovered patterns
    - Yields interpretable topic-word distributions
    """

    def __init__(self, n_clusters: int = 7, doc_topic_prior: float = 0.1,
                 topic_word_prior: float = 0.01, max_iter: int = 20,
                 max_features: int = 5000, learning_method: str = 'batch',
                 random_state: int = 42):
        """
        Initialize LDA risk discovery system.

        Args:
            n_clusters: Number of risk topics to discover
            doc_topic_prior: Alpha parameter (document-topic concentration, lower = more focused)
            topic_word_prior: Beta parameter (topic-word concentration, lower = more focused)
            max_iter: Maximum iterations for LDA training
            max_features: Vocabulary size for feature extraction
            learning_method: 'batch' (more accurate) or 'online' (faster for large datasets)
            random_state: Random seed for reproducibility
        """
        # Imported here so that this module can be loaded without the backend
        # module unless this class is actually instantiated.
        from risk_discovery_alternatives import TopicModelingRiskDiscovery

        self.n_clusters = n_clusters
        self.random_state = random_state

        # Initialize LDA backend
        self.lda_backend = TopicModelingRiskDiscovery(
            n_topics=n_clusters,
            random_state=random_state
        )

        # Override LDA parameters on the backend's (not yet fitted) estimators
        self.lda_backend.lda_model.doc_topic_prior = doc_topic_prior
        self.lda_backend.lda_model.topic_word_prior = topic_word_prior
        self.lda_backend.lda_model.max_iter = max_iter
        self.lda_backend.lda_model.learning_method = learning_method
        self.lda_backend.vectorizer.max_features = max_features

        # Storage for compatibility
        self.discovered_patterns = {}
        self.cluster_labels = None  # Will store dominant topic per document
        self.feature_matrix = None

        # Legal language patterns (same as UnsupervisedRiskDiscovery for compatibility).
        # All patterns are applied case-insensitively by extract_risk_features,
        # so capitalized alternatives such as "Party" or "USD" also match.
        self.legal_indicators = {
            'obligation_strength': r'\b(?:shall|must|required|mandatory|obligated|bound)\b',
            'prohibition_terms': r'\b(?:shall not|must not|prohibited|forbidden|restricted)\b',
            'conditional_risk': r'\b(?:if|unless|provided|subject to|in the event|failure to)\b',
            'liability_terms': r'\b(?:liable|responsibility|damages|penalty|loss|harm)\b',
            # "expir" is a stem: \w* lets it match "expire", "expires", "expiration", ...
            # (a bare stem between \b anchors could never match a real word).
            'temporal_urgency': r'\b(?:immediately|within|before|after|deadline|expir\w*)\b',
            'monetary_terms': r'\$|USD|dollar|payment|fee|cost|expense|fine',
            'parties': r'\b(?:Party|Parties|Company|Corporation|Licensor|Licensee|Vendor|Customer)\b',
            'dates': r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}|\d{1,2}[/-]\d{1,2}[/-]\d{2,4}'
        }

        # Legal complexity indicators
        self.complexity_indicators = {
            'modal_verbs': r'\b(?:shall|must|may|should|will|might|could|would)\b',
            'conditional_terms': r'\b(?:if|unless|provided|subject to|in the event|notwithstanding)\b',
            'legal_conjunctions': r'\b(?:whereas|therefore|furthermore|moreover|however)\b',
            'obligation_terms': r'\b(?:agrees?|undertakes?|covenants?|warrants?|represents?)\b'
        }

    def discover_risk_patterns(self, clause_texts: List[str]) -> Dict[str, Any]:
        """
        Discover risk patterns using LDA topic modeling.
        Compatible with UnsupervisedRiskDiscovery interface.

        Runs the backend's ``discover_risk_patterns`` and mirrors its
        ``discovered_topics`` / ``topic_labels`` results and its feature
        matrix onto this instance.

        Args:
            clause_texts: List of legal clause texts

        Returns:
            The backend's result dictionary, returned unchanged apart from
            the ``keywords`` alias added to each discovered topic.
        """
        print(f"🔍 Discovering risk patterns using LDA (n_topics={self.n_clusters})...")
        print("   📊 LDA provides balanced, overlapping risk categories")
        print("   🎯 Best for legal text with multi-faceted risks")

        # Run LDA discovery
        results = self.lda_backend.discover_risk_patterns(clause_texts)

        # Store results for compatibility
        self.discovered_patterns = results.get('discovered_topics', {})
        self.cluster_labels = results.get('topic_labels', None)
        self.feature_matrix = self.lda_backend.feature_matrix

        # Add keywords field for compatibility with trainer
        for topic_info in self.discovered_patterns.values():
            if 'keywords' not in topic_info and 'top_words' in topic_info:
                topic_info['keywords'] = topic_info['top_words']

        print(f"✅ LDA discovery complete: {len(self.discovered_patterns)} risk topics found")

        return results

    def get_risk_labels(self, clause_texts: List[str]) -> List[int]:
        """
        Get dominant topic labels for new clause texts.
        Returns the most probable topic for each clause.

        Args:
            clause_texts: List of legal clause texts

        Returns:
            List of topic IDs (0 to n_clusters-1)

        Raises:
            ValueError: If ``discover_risk_patterns`` has not been run yet.
        """
        if self.cluster_labels is None:
            raise ValueError("Must discover patterns first using discover_risk_patterns()")

        # The dominant topic is the arg-max of the per-clause distribution;
        # reuse get_topic_distribution instead of repeating its pipeline.
        doc_topic_dist = self.get_topic_distribution(clause_texts)
        return doc_topic_dist.argmax(axis=1).tolist()

    def get_discovered_risk_names(self) -> List[str]:
        """
        Get list of discovered risk topic names.

        Raises:
            ValueError: If no topics have been discovered yet.
        """
        if not self.discovered_patterns:
            raise ValueError("Must discover patterns first using discover_risk_patterns()")

        return list(self.discovered_patterns.keys())

    def get_topic_distribution(self, clause_texts: List[str]) -> np.ndarray:
        """
        Get full probability distribution over topics for clauses.
        This is unique to LDA - shows membership in ALL topics with probabilities.

        The clauses are cleaned with the backend's ``_clean_text``, vectorized
        with the backend's vectorizer and passed to the backend's LDA model.

        Args:
            clause_texts: List of legal clause texts

        Returns:
            Array of shape (n_clauses, n_topics) with probability distributions
        """
        cleaned = [self.lda_backend._clean_text(c) for c in clause_texts]
        feature_matrix = self.lda_backend.vectorizer.transform(cleaned)
        return self.lda_backend.lda_model.transform(feature_matrix)

    def clean_clause_text(self, text: str) -> str:
        """
        Clean and normalize clause text - for compatibility with trainer.

        Characters other than word characters, whitespace and the punctuation
        ``. , ; : ( ) " -`` are replaced by spaces, then every run of
        whitespace is collapsed to a single space and the ends are trimmed.

        Args:
            text: Raw clause text.

        Returns:
            The normalized text, or ``""`` when ``text`` is not a string.
        """
        if not isinstance(text, str):
            return ""

        # Remove special characters but keep legal punctuation
        text = re.sub(r'[^\w\s.,;:()"-]', ' ', text)

        # Collapse whitespace AFTER the substitution above, otherwise the
        # replacement spaces would leave runs of blanks in the result.
        return re.sub(r'\s+', ' ', text).strip()

    def extract_risk_features(self, clause_text: str) -> Dict[str, float]:
        """
        Extract numerical features that indicate risk levels.
        Required by trainer for generating synthetic severity/importance scores.

        Args:
            clause_text: Raw clause text. Non-string values are treated as an
                empty clause so that one bad record does not abort a batch.

        Returns:
            A dict with basic text statistics (``clause_length``,
            ``sentence_count``, ``avg_word_length``), a ``<name>_count`` and
            ``<name>_density`` entry per legal indicator, a
            ``<name>_complexity`` entry per complexity indicator, and the
            composite scores ``obligation_strength``, ``legal_complexity`` and
            ``risk_intensity``. Densities are matches per whitespace-separated
            word and are 0 for an empty clause.
        """
        if not isinstance(clause_text, str):
            clause_text = ""

        words = clause_text.lower().split()
        n_words = len(words)

        features = {}

        # Basic text statistics
        features['clause_length'] = n_words
        # Count only non-empty segments: splitting "A. B." yields a trailing
        # empty string that must not be counted as a sentence.
        features['sentence_count'] = sum(
            1 for segment in re.split(r'[.!?]+', clause_text) if segment.strip()
        )
        features['avg_word_length'] = (
            float(np.mean([len(word) for word in words])) if words else 0.0
        )

        # Legal language intensity. Matching is case-insensitive because
        # several patterns contain capitalized alternatives.
        for pattern_name, pattern in self.legal_indicators.items():
            matches = len(re.findall(pattern, clause_text, flags=re.IGNORECASE))
            features[f'{pattern_name}_count'] = matches
            features[f'{pattern_name}_density'] = matches / n_words if n_words else 0

        # Legal complexity features
        for pattern_name, pattern in self.complexity_indicators.items():
            matches = len(re.findall(pattern, clause_text, flags=re.IGNORECASE))
            features[f'{pattern_name}_complexity'] = matches / n_words if n_words else 0

        # Risk intensity indicators (weighted sums of the densities above)
        features['obligation_strength'] = (
            features.get('obligation_strength_density', 0) * 2 +
            features.get('modal_verbs_complexity', 0)
        )

        features['legal_complexity'] = (
            features.get('conditional_terms_complexity', 0) +
            features.get('legal_conjunctions_complexity', 0) +
            features.get('obligation_terms_complexity', 0)
        )

        features['risk_intensity'] = (
            features.get('liability_terms_density', 0) * 2 +
            features.get('prohibition_terms_density', 0) +
            features.get('conditional_risk_density', 0)
        )

        return features