# risk_discovery_alternatives.py
"""
Alternative Risk Discovery Methods for Comparison
This module implements alternative approaches to risk pattern discovery:
1. Topic Modeling (LDA) - Discovers latent risk topics
2. Hierarchical Clustering (Agglomerative) - Discovers nested risk hierarchies
3. Density-Based Clustering (DBSCAN) - Discovers risk clusters of varying shapes and flags outliers
4. Non-negative Matrix Factorization (NMF) - Discovers additive, parts-based risk components
5. Spectral Clustering - Discovers non-convex risk clusters via graph structure
6. Gaussian Mixture Models (GMM) - Discovers soft, probabilistic risk components
7. Mini-Batch K-Means - Discovers risk clusters at scale for very large corpora
Each method provides a different perspective on risk patterns in legal contracts.
"""
import re
import numpy as np
from typing import Dict, List, Any
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
class TopicModelingRiskDiscovery:
"""
Risk discovery using Latent Dirichlet Allocation (LDA) topic modeling.
Discovers risk patterns as latent topics where each clause is a mixture of topics.
Better for discovering overlapping risk categories and multi-faceted risks.
Advantages:
- Handles overlapping risk types naturally
- Provides probability distribution over risk types
- Discovers interpretable topic words
- Works well with legal text (documents with multiple themes)
Disadvantages:
- Requires more tuning (alpha, beta parameters)
- Slower than K-Means
- Less clear cluster boundaries
"""
def __init__(self, n_topics: int = 7, random_state: int = 42):
self.n_topics = n_topics
self.random_state = random_state
        # LDA models raw term counts, so use CountVectorizer rather than TF-IDF
self.vectorizer = CountVectorizer(
max_features=5000,
ngram_range=(1, 2),
stop_words='english',
lowercase=True,
min_df=3,
max_df=0.85
)
# LDA model
self.lda_model = LatentDirichletAllocation(
n_components=n_topics,
random_state=random_state,
max_iter=20,
learning_method='batch',
doc_topic_prior=0.1, # Alpha - document-topic density
topic_word_prior=0.01, # Beta - topic-word density
n_jobs=-1
)
self.discovered_topics = {}
self.topic_labels = None
self.feature_matrix = None
self.topic_word_distribution = None
def discover_risk_patterns(self, clauses: List[str]) -> Dict[str, Any]:
"""
Discover risk patterns using LDA topic modeling.
Args:
clauses: List of legal clause texts
Returns:
Dictionary with discovered topics and assignments
"""
print(f"πŸ” Discovering risk topics using LDA (n_topics={self.n_topics})...")
# Clean clauses
cleaned_clauses = [self._clean_text(c) for c in clauses]
# Create document-term matrix
print(" πŸ“Š Creating document-term matrix...")
self.feature_matrix = self.vectorizer.fit_transform(cleaned_clauses)
feature_names = self.vectorizer.get_feature_names_out()
# Fit LDA model
print(" 🧠 Fitting LDA model...")
self.lda_model.fit(self.feature_matrix)
# Get topic-word distribution
self.topic_word_distribution = self.lda_model.components_
# Get document-topic distribution
doc_topic_dist = self.lda_model.transform(self.feature_matrix)
# Assign each document to dominant topic
self.topic_labels = np.argmax(doc_topic_dist, axis=1)
# Extract top words for each topic
print(" πŸ“ Extracting topic keywords...")
n_top_words = 15
for topic_idx in range(self.n_topics):
top_word_indices = np.argsort(self.topic_word_distribution[topic_idx])[-n_top_words:][::-1]
top_words = [feature_names[i] for i in top_word_indices]
top_weights = [self.topic_word_distribution[topic_idx][i] for i in top_word_indices]
# Generate topic name from top words
topic_name = self._generate_topic_name(top_words)
# Count clauses in this topic
clause_count = np.sum(self.topic_labels == topic_idx)
self.discovered_topics[topic_idx] = {
'topic_id': topic_idx,
'topic_name': topic_name,
'top_words': top_words,
'word_weights': top_weights,
'clause_count': int(clause_count),
'proportion': float(clause_count / len(clauses))
}
# Compute perplexity and log-likelihood
perplexity = self.lda_model.perplexity(self.feature_matrix)
log_likelihood = self.lda_model.score(self.feature_matrix)
print(f"βœ… LDA discovery complete: {self.n_topics} topics found")
print(f" Perplexity: {perplexity:.2f} (lower is better)")
print(f" Log-likelihood: {log_likelihood:.2f}")
return {
'method': 'LDA_Topic_Modeling',
'n_topics': self.n_topics,
'discovered_topics': self.discovered_topics,
'topic_labels': self.topic_labels,
'doc_topic_distribution': doc_topic_dist,
'perplexity': perplexity,
'log_likelihood': log_likelihood,
'quality_metrics': {
'perplexity': perplexity,
'avg_topic_diversity': self._compute_topic_diversity()
}
}
def get_clause_topic_distribution(self, clause_idx: int) -> Dict[int, float]:
"""Get probability distribution over topics for a specific clause"""
if self.feature_matrix is None:
return {}
doc_topic_dist = self.lda_model.transform(self.feature_matrix)
return {topic_id: float(prob) for topic_id, prob in enumerate(doc_topic_dist[clause_idx])}
def _clean_text(self, text: str) -> str:
"""Clean clause text"""
if not isinstance(text, str):
return ""
text = re.sub(r'\s+', ' ', text)
return text.strip()
def _generate_topic_name(self, top_words: List[str]) -> str:
"""Generate descriptive name from top words"""
# Look for common legal risk themes
themes = {
'liability': ['liability', 'liable', 'damages', 'loss', 'harm', 'injury'],
'indemnity': ['indemnify', 'indemnification', 'hold', 'harmless', 'defend'],
'termination': ['terminate', 'termination', 'cancel', 'end', 'expire'],
'intellectual_property': ['intellectual', 'property', 'ip', 'patent', 'copyright', 'trademark'],
'confidentiality': ['confidential', 'confidentiality', 'disclosure', 'nda', 'secret'],
'payment': ['payment', 'pay', 'fee', 'price', 'cost', 'charge'],
'compliance': ['comply', 'compliance', 'regulation', 'law', 'legal', 'regulatory'],
'warranty': ['warranty', 'warrant', 'represent', 'guarantee', 'assure']
}
# Score each theme
theme_scores = defaultdict(int)
for word in top_words[:10]:
for theme, keywords in themes.items():
if any(keyword in word.lower() for keyword in keywords):
theme_scores[theme] += 1
# Pick best theme or use top words
if theme_scores:
best_theme = max(theme_scores.items(), key=lambda x: x[1])[0]
return f"Topic_{best_theme.upper()}"
else:
return f"Topic_{top_words[0].upper()}_{top_words[1].upper()}"
def _compute_topic_diversity(self) -> float:
"""Compute average diversity of topics (entropy of word distribution)"""
diversities = []
for topic_idx in range(self.n_topics):
word_dist = self.topic_word_distribution[topic_idx]
word_dist = word_dist / np.sum(word_dist) # Normalize
entropy = -np.sum(word_dist * np.log(word_dist + 1e-10))
diversities.append(entropy)
return float(np.mean(diversities))
class HierarchicalRiskDiscovery:
"""
Risk discovery using Hierarchical Agglomerative Clustering.
Discovers nested risk hierarchies where similar risks are grouped at multiple levels.
Better for understanding relationships between risk types.
Advantages:
- Discovers hierarchical structure (parent-child risk relationships)
- No need to specify number of clusters upfront
- Deterministic results
- Can cut dendrogram at different levels
Disadvantages:
    - Slower for large datasets (O(n²) or O(n³))
- Memory intensive
- Cannot handle very large datasets
"""
def __init__(self, n_clusters: int = 7, linkage: str = 'ward', random_state: int = 42):
self.n_clusters = n_clusters
self.linkage = linkage # 'ward', 'average', 'complete', 'single'
self.random_state = random_state
# TF-IDF vectorizer
self.vectorizer = TfidfVectorizer(
max_features=8000,
ngram_range=(1, 3),
stop_words='english',
lowercase=True,
min_df=2,
max_df=0.90
)
# Hierarchical clustering model
self.clustering_model = AgglomerativeClustering(
n_clusters=n_clusters,
linkage=linkage
)
self.discovered_clusters = {}
self.cluster_labels = None
self.feature_matrix = None
def discover_risk_patterns(self, clauses: List[str]) -> Dict[str, Any]:
"""
Discover risk patterns using hierarchical clustering.
Args:
clauses: List of legal clause texts
Returns:
Dictionary with discovered clusters and hierarchy
"""
print(f"πŸ” Discovering risk patterns using Hierarchical Clustering (n_clusters={self.n_clusters})...")
# Clean clauses
cleaned_clauses = [self._clean_text(c) for c in clauses]
# Create TF-IDF matrix
print(" πŸ“Š Creating TF-IDF feature matrix...")
self.feature_matrix = self.vectorizer.fit_transform(cleaned_clauses)
feature_names = self.vectorizer.get_feature_names_out()
# Fit hierarchical clustering
print(f" 🧠 Fitting Hierarchical Clustering (linkage={self.linkage})...")
self.cluster_labels = self.clustering_model.fit_predict(self.feature_matrix.toarray())
# Analyze each cluster
print(" πŸ“ Analyzing discovered clusters...")
for cluster_id in range(self.n_clusters):
cluster_mask = self.cluster_labels == cluster_id
cluster_indices = np.where(cluster_mask)[0]
# Get representative clauses
cluster_clauses = [clauses[i] for i in cluster_indices]
# Extract top TF-IDF terms for this cluster
cluster_tfidf = self.feature_matrix[cluster_mask].mean(axis=0)
top_term_indices = np.argsort(np.asarray(cluster_tfidf).flatten())[-15:][::-1]
top_terms = [feature_names[i] for i in top_term_indices]
top_scores = [float(cluster_tfidf[0, i]) for i in top_term_indices]
# Generate cluster name
cluster_name = self._generate_cluster_name(top_terms)
self.discovered_clusters[cluster_id] = {
'cluster_id': cluster_id,
'cluster_name': cluster_name,
'top_terms': top_terms,
'term_scores': top_scores,
'clause_count': int(len(cluster_indices)),
'proportion': float(len(cluster_indices) / len(clauses)),
'sample_clauses': cluster_clauses[:3] # First 3 clauses as examples
}
# Compute silhouette score
if len(clauses) < 10000: # Only for reasonable sizes
silhouette = silhouette_score(self.feature_matrix, self.cluster_labels)
else:
silhouette = None
print(f"βœ… Hierarchical clustering complete: {self.n_clusters} clusters found")
        if silhouette is not None:
print(f" Silhouette Score: {silhouette:.3f} (range: -1 to 1, higher is better)")
return {
'method': 'Hierarchical_Agglomerative_Clustering',
'n_clusters': self.n_clusters,
'linkage': self.linkage,
'discovered_clusters': self.discovered_clusters,
'cluster_labels': self.cluster_labels,
'quality_metrics': {
                'silhouette_score': silhouette if silhouette is not None else 'N/A',
'avg_cluster_size': float(np.mean([c['clause_count'] for c in self.discovered_clusters.values()]))
}
}
def _clean_text(self, text: str) -> str:
"""Clean clause text"""
if not isinstance(text, str):
return ""
text = re.sub(r'\s+', ' ', text)
return text.strip()
def _generate_cluster_name(self, top_terms: List[str]) -> str:
"""Generate descriptive name from top terms"""
# Legal risk theme detection
themes = {
'LIABILITY': ['liability', 'liable', 'damages', 'loss'],
'INDEMNITY': ['indemnify', 'indemnification', 'hold', 'harmless'],
'TERMINATION': ['terminate', 'termination', 'cancel', 'expire'],
'IP': ['intellectual', 'property', 'patent', 'copyright'],
'CONFIDENTIAL': ['confidential', 'nda', 'disclosure', 'secret'],
'PAYMENT': ['payment', 'pay', 'fee', 'price'],
'COMPLIANCE': ['comply', 'compliance', 'regulation', 'law'],
'WARRANTY': ['warranty', 'warrant', 'represent', 'guarantee']
}
for theme, keywords in themes.items():
if any(keyword in term.lower() for term in top_terms[:5] for keyword in keywords):
return f"RISK_{theme}"
return f"RISK_{top_terms[0].upper()}_{top_terms[1].upper()}"
class DensityBasedRiskDiscovery:
"""
Risk discovery using DBSCAN (Density-Based Spatial Clustering).
Discovers risk clusters based on density, identifying core risks and outliers.
Better for finding unusual/rare risk patterns and handling noise.
Advantages:
- Discovers clusters of arbitrary shapes
- Identifies outliers/noise (rare risk patterns)
- No need to specify number of clusters
- Robust to outliers
Disadvantages:
- Sensitive to hyperparameters (eps, min_samples)
- Struggles with varying density clusters
- Can produce many small clusters
"""
def __init__(self, eps: float = 0.5, min_samples: int = 5, random_state: int = 42):
self.eps = eps # Maximum distance between samples
self.min_samples = min_samples # Minimum samples in neighborhood
self.random_state = random_state
# TF-IDF vectorizer
self.vectorizer = TfidfVectorizer(
max_features=6000,
ngram_range=(1, 2),
stop_words='english',
lowercase=True,
min_df=3,
max_df=0.85
)
# DBSCAN model
self.dbscan_model = DBSCAN(
eps=eps,
min_samples=min_samples,
metric='cosine',
n_jobs=-1
)
self.discovered_clusters = {}
self.cluster_labels = None
self.feature_matrix = None
self.outlier_indices = []
def discover_risk_patterns(self, clauses: List[str], auto_tune: bool = True) -> Dict[str, Any]:
"""
Discover risk patterns using DBSCAN.
Args:
clauses: List of legal clause texts
auto_tune: If True, automatically tune eps parameter
Returns:
Dictionary with discovered clusters and outliers
"""
print(f"πŸ” Discovering risk patterns using DBSCAN...")
# Clean clauses
cleaned_clauses = [self._clean_text(c) for c in clauses]
# Create TF-IDF matrix
print(" πŸ“Š Creating TF-IDF feature matrix...")
self.feature_matrix = self.vectorizer.fit_transform(cleaned_clauses)
feature_names = self.vectorizer.get_feature_names_out()
# Auto-tune eps if requested
if auto_tune:
print(" πŸ”§ Auto-tuning eps parameter...")
self.eps = self._auto_tune_eps(self.feature_matrix)
self.dbscan_model.eps = self.eps
print(f" Selected eps={self.eps:.3f}")
# Fit DBSCAN
print(f" 🧠 Fitting DBSCAN (eps={self.eps}, min_samples={self.min_samples})...")
self.cluster_labels = self.dbscan_model.fit_predict(self.feature_matrix)
# Identify unique clusters (excluding noise label -1)
unique_clusters = [c for c in np.unique(self.cluster_labels) if c != -1]
n_clusters = len(unique_clusters)
n_noise = np.sum(self.cluster_labels == -1)
print(f" πŸ“Š Found {n_clusters} clusters and {n_noise} outliers/noise points")
# Analyze each cluster
print(" πŸ“ Analyzing discovered clusters...")
for cluster_id in unique_clusters:
cluster_mask = self.cluster_labels == cluster_id
cluster_indices = np.where(cluster_mask)[0]
# Get representative clauses
cluster_clauses = [clauses[i] for i in cluster_indices]
# Extract top TF-IDF terms
cluster_tfidf = self.feature_matrix[cluster_mask].mean(axis=0)
top_term_indices = np.argsort(np.asarray(cluster_tfidf).flatten())[-15:][::-1]
top_terms = [feature_names[i] for i in top_term_indices]
top_scores = [float(cluster_tfidf[0, i]) for i in top_term_indices]
# Generate cluster name
cluster_name = self._generate_cluster_name(top_terms, cluster_id)
self.discovered_clusters[cluster_id] = {
'cluster_id': cluster_id,
'cluster_name': cluster_name,
'top_terms': top_terms,
'term_scores': top_scores,
'clause_count': int(len(cluster_indices)),
'proportion': float(len(cluster_indices) / len(clauses)),
'is_core_cluster': len(cluster_indices) >= self.min_samples * 3
}
# Analyze outliers/noise
self.outlier_indices = np.where(self.cluster_labels == -1)[0]
outlier_clauses = [clauses[i] for i in self.outlier_indices]
print(f"βœ… DBSCAN discovery complete: {n_clusters} clusters, {n_noise} outliers")
return {
'method': 'DBSCAN_Density_Based_Clustering',
'n_clusters': n_clusters,
'n_outliers': int(n_noise),
'eps': self.eps,
'min_samples': self.min_samples,
'discovered_clusters': self.discovered_clusters,
'cluster_labels': self.cluster_labels,
'outlier_indices': self.outlier_indices.tolist(),
'outlier_clauses': outlier_clauses[:10], # First 10 outliers
'quality_metrics': {
'n_clusters': n_clusters,
'outlier_ratio': float(n_noise / len(clauses)),
'avg_cluster_size': float(np.mean([c['clause_count'] for c in self.discovered_clusters.values()])) if n_clusters > 0 else 0
}
}
def _clean_text(self, text: str) -> str:
"""Clean clause text"""
if not isinstance(text, str):
return ""
text = re.sub(r'\s+', ' ', text)
return text.strip()
def _auto_tune_eps(self, feature_matrix, sample_size: int = 1000) -> float:
"""
Auto-tune eps parameter using k-distance graph.
Uses a sample of data to estimate optimal eps.
"""
from sklearn.neighbors import NearestNeighbors
        # Sample data if too large (seeded so auto-tuning is reproducible)
        n_samples = min(sample_size, feature_matrix.shape[0])
        if feature_matrix.shape[0] > n_samples:
            rng = np.random.default_rng(self.random_state)
            indices = rng.choice(feature_matrix.shape[0], n_samples, replace=False)
            sample_matrix = feature_matrix[indices]
        else:
            sample_matrix = feature_matrix
# Compute k-nearest neighbors
k = self.min_samples
nbrs = NearestNeighbors(n_neighbors=k, metric='cosine').fit(sample_matrix)
distances, _ = nbrs.kneighbors(sample_matrix)
# Get k-th nearest neighbor distance
k_distances = np.sort(distances[:, -1])
# Use elbow method: find point where distances increase rapidly
# Simple heuristic: use 90th percentile
eps = np.percentile(k_distances, 90)
return float(eps)
def _generate_cluster_name(self, top_terms: List[str], cluster_id: int) -> str:
"""Generate descriptive name from top terms"""
# Legal risk theme detection
themes = {
'LIABILITY': ['liability', 'liable', 'damages', 'loss'],
'INDEMNITY': ['indemnify', 'indemnification', 'hold', 'harmless'],
'TERMINATION': ['terminate', 'termination', 'cancel', 'expire'],
'IP': ['intellectual', 'property', 'patent', 'copyright'],
'CONFIDENTIAL': ['confidential', 'nda', 'disclosure', 'secret'],
'PAYMENT': ['payment', 'pay', 'fee', 'price'],
'COMPLIANCE': ['comply', 'compliance', 'regulation', 'law'],
'WARRANTY': ['warranty', 'warrant', 'represent', 'guarantee']
}
for theme, keywords in themes.items():
if any(keyword in term.lower() for term in top_terms[:5] for keyword in keywords):
return f"RISK_{theme}_C{cluster_id}"
return f"RISK_CLUSTER_{cluster_id}_{top_terms[0].upper()}"
def get_outlier_analysis(self) -> Dict[str, Any]:
"""
Analyze outlier/noise points to identify rare risk patterns.
Returns:
Dictionary with outlier analysis
"""
if len(self.outlier_indices) == 0:
return {'message': 'No outliers found'}
return {
'n_outliers': len(self.outlier_indices),
'outlier_ratio': len(self.outlier_indices) / len(self.cluster_labels),
'interpretation': 'Outliers may represent rare or unique risk patterns that do not fit common categories'
}
class NMFRiskDiscovery:
"""
Risk discovery using Non-negative Matrix Factorization (NMF).
NMF decomposes the document-term matrix into interpretable parts-based representations.
Different from clustering - learns additive combinations of basis patterns.
Advantages:
    - ✅ Parts-based decomposition (additive patterns)
    - ✅ Highly interpretable results
    - ✅ Non-negative weights (intuitive)
    - ✅ Fast convergence
    - ✅ Works well with TF-IDF
Disadvantages:
- ❌ Requires non-negative features
- ❌ Sensitive to initialization
- ❌ May not capture global structure
"""
def __init__(self, n_components: int = 7, random_state: int = 42):
self.n_components = n_components
self.random_state = random_state
# TF-IDF vectorizer
self.vectorizer = TfidfVectorizer(
max_features=8000,
ngram_range=(1, 2),
stop_words='english',
lowercase=True,
min_df=3,
max_df=0.85,
norm='l2' # Important for NMF
)
# NMF model - handle different scikit-learn versions
# Versions < 1.0: use 'alpha' and 'l1_ratio'
# Versions >= 1.0: use 'alpha_W', 'alpha_H', 'l1_ratio'
# Very old versions: neither parameter exists
import sklearn
sklearn_version = tuple(map(int, sklearn.__version__.split('.')[:2]))
nmf_params = {
'n_components': n_components,
'random_state': random_state,
'init': 'nndsvda',
'max_iter': 500
}
# Add regularization params if supported
if sklearn_version >= (1, 0):
# scikit-learn >= 1.0
nmf_params['alpha_W'] = 0.1
nmf_params['alpha_H'] = 0.1
nmf_params['l1_ratio'] = 0.5
elif sklearn_version >= (0, 19):
# scikit-learn 0.19 to 0.24
nmf_params['alpha'] = 0.1
nmf_params['l1_ratio'] = 0.5
# else: very old version, use basic params only
self.nmf_model = NMF(**nmf_params)
self.discovered_components = {}
self.component_labels = None
self.feature_matrix = None
self.W_matrix = None # Document-component matrix
self.H_matrix = None # Component-feature matrix
def discover_risk_patterns(self, clauses: List[str]) -> Dict[str, Any]:
"""
Discover risk patterns using NMF decomposition.
Args:
clauses: List of legal clause texts
Returns:
Dictionary with discovered components and assignments
"""
print(f"πŸ” Discovering risk patterns using NMF (n_components={self.n_components})...")
# Clean clauses
cleaned_clauses = [self._clean_text(c) for c in clauses]
# Create TF-IDF matrix
print(" πŸ“Š Creating TF-IDF feature matrix...")
self.feature_matrix = self.vectorizer.fit_transform(cleaned_clauses)
feature_names = self.vectorizer.get_feature_names_out()
# Fit NMF model
print(" 🧠 Fitting NMF model...")
self.W_matrix = self.nmf_model.fit_transform(self.feature_matrix)
self.H_matrix = self.nmf_model.components_
# Assign each document to dominant component
self.component_labels = np.argmax(self.W_matrix, axis=1)
# Extract top words for each component
print(" πŸ“ Extracting component keywords...")
n_top_words = 15
for component_idx in range(self.n_components):
top_word_indices = np.argsort(self.H_matrix[component_idx])[-n_top_words:][::-1]
top_words = [feature_names[i] for i in top_word_indices]
top_weights = [self.H_matrix[component_idx][i] for i in top_word_indices]
# Generate component name
component_name = self._generate_component_name(top_words)
# Count clauses in this component
clause_count = np.sum(self.component_labels == component_idx)
# Get average component weight (strength)
avg_weight = np.mean(self.W_matrix[:, component_idx])
self.discovered_components[component_idx] = {
'component_id': component_idx,
'component_name': component_name,
'top_words': top_words,
'word_weights': top_weights,
'clause_count': int(clause_count),
'proportion': float(clause_count / len(clauses)),
'avg_strength': float(avg_weight)
}
# Compute reconstruction error
reconstruction_error = self.nmf_model.reconstruction_err_
# Compute sparsity (how sparse are the representations)
sparsity = np.mean(self.W_matrix == 0)
print(f"βœ… NMF discovery complete: {self.n_components} components found")
print(f" Reconstruction error: {reconstruction_error:.2f}")
print(f" Sparsity: {sparsity:.2%}")
return {
'method': 'NMF_Matrix_Factorization',
'n_components': self.n_components,
'discovered_components': self.discovered_components,
'component_labels': self.component_labels,
'component_strengths': self.W_matrix,
'quality_metrics': {
'reconstruction_error': float(reconstruction_error),
'sparsity': float(sparsity),
'avg_component_strength': float(np.mean(np.max(self.W_matrix, axis=1)))
}
}
def get_clause_composition(self, clause_idx: int) -> Dict[int, float]:
"""Get component composition for a specific clause"""
if self.W_matrix is None:
return {}
return {comp_id: float(weight) for comp_id, weight in enumerate(self.W_matrix[clause_idx])}
def _clean_text(self, text: str) -> str:
"""Clean clause text"""
if not isinstance(text, str):
return ""
text = re.sub(r'\s+', ' ', text)
return text.strip()
def _generate_component_name(self, top_words: List[str]) -> str:
"""Generate descriptive name from top words"""
themes = {
'LIABILITY': ['liability', 'liable', 'damages', 'loss'],
'INDEMNITY': ['indemnify', 'indemnification', 'hold', 'harmless'],
'TERMINATION': ['terminate', 'termination', 'cancel', 'expire'],
'IP': ['intellectual', 'property', 'patent', 'copyright'],
'CONFIDENTIAL': ['confidential', 'nda', 'disclosure', 'secret'],
'PAYMENT': ['payment', 'pay', 'fee', 'price'],
'COMPLIANCE': ['comply', 'compliance', 'regulation', 'law'],
'WARRANTY': ['warranty', 'warrant', 'represent', 'guarantee']
}
for theme, keywords in themes.items():
if any(keyword in term.lower() for term in top_words[:5] for keyword in keywords):
return f"COMPONENT_{theme}"
return f"COMPONENT_{top_words[0].upper()}_{top_words[1].upper()}"
class SpectralClusteringRiskDiscovery:
"""
Risk discovery using Spectral Clustering.
Uses graph theory and eigenvalues to cluster data. Excellent for non-convex clusters
that other methods miss. Based on similarity graph construction.
Advantages:
    - ✅ Handles non-convex clusters (arbitrary shapes)
    - ✅ Uses graph structure (captures relationships)
    - ✅ Theoretically sound (spectral graph theory)
    - ✅ Good for manifold-structured data
Disadvantages:
- ❌ Computationally expensive (eigenvalue decomposition)
- ❌ Memory intensive for large datasets
- ❌ Sensitive to similarity metric
- ❌ Requires number of clusters
"""
def __init__(self, n_clusters: int = 7, affinity: str = 'rbf', random_state: int = 42):
self.n_clusters = n_clusters
self.affinity = affinity # 'rbf', 'nearest_neighbors', 'precomputed'
self.random_state = random_state
# TF-IDF vectorizer
self.vectorizer = TfidfVectorizer(
max_features=6000,
ngram_range=(1, 2),
stop_words='english',
lowercase=True,
min_df=3,
max_df=0.85
)
# Import spectral clustering
from sklearn.cluster import SpectralClustering
# Spectral clustering model
self.spectral_model = SpectralClustering(
n_clusters=n_clusters,
affinity=affinity,
random_state=random_state,
n_init=10,
assign_labels='kmeans' # or 'discretize'
)
self.discovered_clusters = {}
self.cluster_labels = None
self.feature_matrix = None
def discover_risk_patterns(self, clauses: List[str]) -> Dict[str, Any]:
"""
Discover risk patterns using Spectral Clustering.
Args:
clauses: List of legal clause texts
Returns:
Dictionary with discovered clusters
"""
print(f"πŸ” Discovering risk patterns using Spectral Clustering (n_clusters={self.n_clusters})...")
# Clean clauses
cleaned_clauses = [self._clean_text(c) for c in clauses]
# Create TF-IDF matrix
print(" πŸ“Š Creating TF-IDF feature matrix...")
self.feature_matrix = self.vectorizer.fit_transform(cleaned_clauses)
feature_names = self.vectorizer.get_feature_names_out()
# Fit spectral clustering
print(f" 🧠 Fitting Spectral Clustering (affinity={self.affinity})...")
print(" (This may take a while for large datasets...)")
# For very large datasets, sample for affinity matrix
if self.feature_matrix.shape[0] > 5000:
print(f" Large dataset detected ({self.feature_matrix.shape[0]} clauses)")
print(" Using nearest neighbors affinity for efficiency...")
self.spectral_model.affinity = 'nearest_neighbors'
self.spectral_model.n_neighbors = 10
self.cluster_labels = self.spectral_model.fit_predict(self.feature_matrix)
# Analyze each cluster
print(" πŸ“ Analyzing discovered clusters...")
for cluster_id in range(self.n_clusters):
cluster_mask = self.cluster_labels == cluster_id
cluster_indices = np.where(cluster_mask)[0]
if len(cluster_indices) == 0:
continue
# Get representative clauses
cluster_clauses = [clauses[i] for i in cluster_indices]
# Extract top TF-IDF terms
cluster_tfidf = self.feature_matrix[cluster_mask].mean(axis=0)
top_term_indices = np.argsort(np.asarray(cluster_tfidf).flatten())[-15:][::-1]
top_terms = [feature_names[i] for i in top_term_indices]
top_scores = [float(cluster_tfidf[0, i]) for i in top_term_indices]
# Generate cluster name
cluster_name = self._generate_cluster_name(top_terms)
self.discovered_clusters[cluster_id] = {
'cluster_id': cluster_id,
'cluster_name': cluster_name,
'top_terms': top_terms,
'term_scores': top_scores,
'clause_count': int(len(cluster_indices)),
'proportion': float(len(cluster_indices) / len(clauses))
}
# Compute silhouette score if dataset not too large
if len(clauses) < 10000:
from sklearn.metrics import silhouette_score
silhouette = silhouette_score(self.feature_matrix, self.cluster_labels)
else:
silhouette = None
print(f"βœ… Spectral clustering complete: {len(self.discovered_clusters)} clusters found")
        if silhouette is not None:
print(f" Silhouette Score: {silhouette:.3f}")
return {
'method': 'Spectral_Clustering',
'n_clusters': self.n_clusters,
'affinity': self.affinity,
'discovered_clusters': self.discovered_clusters,
'cluster_labels': self.cluster_labels,
'quality_metrics': {
                'silhouette_score': silhouette if silhouette is not None else 'N/A',
'n_clusters_found': len(self.discovered_clusters)
}
}
def _clean_text(self, text: str) -> str:
"""Clean clause text"""
if not isinstance(text, str):
return ""
text = re.sub(r'\s+', ' ', text)
return text.strip()
def _generate_cluster_name(self, top_terms: List[str]) -> str:
"""Generate descriptive name from top terms"""
themes = {
'LIABILITY': ['liability', 'liable', 'damages', 'loss'],
'INDEMNITY': ['indemnify', 'indemnification', 'hold', 'harmless'],
'TERMINATION': ['terminate', 'termination', 'cancel', 'expire'],
'IP': ['intellectual', 'property', 'patent', 'copyright'],
'CONFIDENTIAL': ['confidential', 'nda', 'disclosure', 'secret'],
'PAYMENT': ['payment', 'pay', 'fee', 'price'],
'COMPLIANCE': ['comply', 'compliance', 'regulation', 'law'],
'WARRANTY': ['warranty', 'warrant', 'represent', 'guarantee']
}
for theme, keywords in themes.items():
if any(keyword in term.lower() for term in top_terms[:5] for keyword in keywords):
return f"SPECTRAL_{theme}"
return f"SPECTRAL_{top_terms[0].upper()}_{top_terms[1].upper()}"
class GaussianMixtureRiskDiscovery:
"""
Risk discovery using Gaussian Mixture Models (GMM).
Probabilistic model that assumes data comes from mixture of Gaussian distributions.
Provides soft clustering with probability estimates.
Advantages:
    - ✅ Probabilistic (soft clustering)
    - ✅ Provides uncertainty estimates
    - ✅ Can model elliptical clusters
    - ✅ Flexible covariance structures
    - ✅ Fitted with the EM algorithm
Disadvantages:
- ❌ Assumes Gaussian distributions
- ❌ Sensitive to initialization
- ❌ Can get stuck in local optima
- ❌ Computationally intensive
"""
def __init__(self, n_components: int = 7, covariance_type: str = 'diag', random_state: int = 42):
self.n_components = n_components
self.covariance_type = covariance_type # 'full', 'tied', 'diag', 'spherical'
self.random_state = random_state
# TF-IDF vectorizer
self.vectorizer = TfidfVectorizer(
max_features=5000,
ngram_range=(1, 2),
stop_words='english',
lowercase=True,
min_df=3,
max_df=0.85
)
# Import GMM
from sklearn.mixture import GaussianMixture
# GMM model
self.gmm_model = GaussianMixture(
n_components=n_components,
covariance_type=covariance_type,
random_state=random_state,
n_init=10,
max_iter=200
)
self.discovered_components = {}
self.component_labels = None
self.feature_matrix = None
self.probabilities = None
def discover_risk_patterns(self, clauses: List[str]) -> Dict[str, Any]:
"""
Discover risk patterns using Gaussian Mixture Model.
Args:
clauses: List of legal clause texts
Returns:
Dictionary with discovered components and probabilities
"""
print(f"πŸ” Discovering risk patterns using GMM (n_components={self.n_components})...")
# Clean clauses
cleaned_clauses = [self._clean_text(c) for c in clauses]
# Create TF-IDF matrix
print(" πŸ“Š Creating TF-IDF feature matrix...")
self.feature_matrix = self.vectorizer.fit_transform(cleaned_clauses)
feature_names = self.vectorizer.get_feature_names_out()
# Reduce dimensionality for GMM (dense matrix needed)
print(" πŸ”„ Reducing dimensionality (GMM requires dense matrix)...")
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=min(100, self.feature_matrix.shape[1] - 1), random_state=self.random_state)
X_reduced = svd.fit_transform(self.feature_matrix)
# Fit GMM model
print(f" 🧠 Fitting Gaussian Mixture Model (covariance={self.covariance_type})...")
self.gmm_model.fit(X_reduced)
# Get predictions and probabilities
self.component_labels = self.gmm_model.predict(X_reduced)
self.probabilities = self.gmm_model.predict_proba(X_reduced)
# Analyze each component
print(" πŸ“ Analyzing discovered components...")
for component_id in range(self.n_components):
component_mask = self.component_labels == component_id
component_indices = np.where(component_mask)[0]
if len(component_indices) == 0:
continue
# Get representative clauses
component_clauses = [clauses[i] for i in component_indices]
# Extract top TF-IDF terms
component_tfidf = self.feature_matrix[component_mask].mean(axis=0)
top_term_indices = np.argsort(np.asarray(component_tfidf).flatten())[-15:][::-1]
top_terms = [feature_names[i] for i in top_term_indices]
top_scores = [float(component_tfidf[0, i]) for i in top_term_indices]
# Generate component name
component_name = self._generate_component_name(top_terms)
# Compute average probability for this component
avg_probability = np.mean(self.probabilities[component_mask, component_id])
self.discovered_components[component_id] = {
'component_id': component_id,
'component_name': component_name,
'top_terms': top_terms,
'term_scores': top_scores,
'clause_count': int(len(component_indices)),
'proportion': float(len(component_indices) / len(clauses)),
'avg_confidence': float(avg_probability)
}
# Compute BIC and AIC (model selection criteria)
bic = self.gmm_model.bic(X_reduced)
aic = self.gmm_model.aic(X_reduced)
print(f"βœ… GMM discovery complete: {len(self.discovered_components)} components found")
print(f" BIC: {bic:.2f} (lower is better)")
print(f" AIC: {aic:.2f} (lower is better)")
return {
'method': 'Gaussian_Mixture_Model',
'n_components': self.n_components,
'covariance_type': self.covariance_type,
'discovered_components': self.discovered_components,
'component_labels': self.component_labels,
'probabilities': self.probabilities,
'quality_metrics': {
'bic': float(bic),
'aic': float(aic),
'avg_confidence': float(np.mean(np.max(self.probabilities, axis=1)))
}
}
def get_clause_probabilities(self, clause_idx: int) -> Dict[int, float]:
"""Get probability distribution over components for a specific clause"""
if self.probabilities is None:
return {}
return {comp_id: float(prob) for comp_id, prob in enumerate(self.probabilities[clause_idx])}
def _clean_text(self, text: str) -> str:
"""Clean clause text"""
if not isinstance(text, str):
return ""
text = re.sub(r'\s+', ' ', text)
return text.strip()
def _generate_component_name(self, top_terms: List[str]) -> str:
"""Generate descriptive name from top terms"""
themes = {
'LIABILITY': ['liability', 'liable', 'damages', 'loss'],
'INDEMNITY': ['indemnify', 'indemnification', 'hold', 'harmless'],
'TERMINATION': ['terminate', 'termination', 'cancel', 'expire'],
'IP': ['intellectual', 'property', 'patent', 'copyright'],
'CONFIDENTIAL': ['confidential', 'nda', 'disclosure', 'secret'],
'PAYMENT': ['payment', 'pay', 'fee', 'price'],
'COMPLIANCE': ['comply', 'compliance', 'regulation', 'law'],
'WARRANTY': ['warranty', 'warrant', 'represent', 'guarantee']
}
for theme, keywords in themes.items():
if any(keyword in term.lower() for term in top_terms[:5] for keyword in keywords):
return f"GMM_{theme}"
return f"GMM_{top_terms[0].upper()}_{top_terms[1].upper()}"
class MiniBatchKMeansRiskDiscovery:
"""
Risk discovery using Mini-Batch K-Means.
Scalable version of K-Means that uses mini-batches for faster computation.
Ideal for very large datasets (100K+ clauses).
Advantages:
    - ✅ Extremely fast (processes mini-batches)
    - ✅ Scalable to millions of samples
    - ✅ Low memory footprint
    - ✅ Online learning (can update incrementally)
    - ✅ Similar quality to standard K-Means
Disadvantages:
- ❌ Slightly less accurate than standard K-Means
- ❌ Results vary with batch size
- ❌ Still requires number of clusters
"""
def __init__(self, n_clusters: int = 7, batch_size: int = 1000, random_state: int = 42):
self.n_clusters = n_clusters
self.batch_size = batch_size
self.random_state = random_state
# TF-IDF vectorizer
self.vectorizer = TfidfVectorizer(
max_features=10000,
ngram_range=(1, 3),
stop_words='english',
lowercase=True,
min_df=2,
max_df=0.95
)
# Import Mini-Batch K-Means
from sklearn.cluster import MiniBatchKMeans
# Mini-Batch K-Means model
self.kmeans_model = MiniBatchKMeans(
n_clusters=n_clusters,
random_state=random_state,
batch_size=batch_size,
n_init=10,
max_iter=300,
reassignment_ratio=0.01
)
self.discovered_clusters = {}
self.cluster_labels = None
self.feature_matrix = None
def discover_risk_patterns(self, clauses: List[str]) -> Dict[str, Any]:
"""
Discover risk patterns using Mini-Batch K-Means.
Args:
clauses: List of legal clause texts
Returns:
Dictionary with discovered clusters
"""
print(f"πŸ” Discovering risk patterns using Mini-Batch K-Means (n_clusters={self.n_clusters})...")
# Clean clauses
cleaned_clauses = [self._clean_text(c) for c in clauses]
# Create TF-IDF matrix
print(" πŸ“Š Creating TF-IDF feature matrix...")
self.feature_matrix = self.vectorizer.fit_transform(cleaned_clauses)
feature_names = self.vectorizer.get_feature_names_out()
# Fit Mini-Batch K-Means
print(f" 🧠 Fitting Mini-Batch K-Means (batch_size={self.batch_size})...")
self.cluster_labels = self.kmeans_model.fit_predict(self.feature_matrix)
# Analyze each cluster
print(" πŸ“ Analyzing discovered clusters...")
for cluster_id in range(self.n_clusters):
cluster_mask = self.cluster_labels == cluster_id
cluster_indices = np.where(cluster_mask)[0]
if len(cluster_indices) == 0:
continue
# Get cluster center
cluster_center = self.kmeans_model.cluster_centers_[cluster_id]
# Get top terms from cluster center
top_term_indices = np.argsort(cluster_center)[-15:][::-1]
top_terms = [feature_names[i] for i in top_term_indices]
top_scores = [float(cluster_center[i]) for i in top_term_indices]
# Generate cluster name
cluster_name = self._generate_cluster_name(top_terms)
# Compute cluster cohesion (inertia contribution)
from scipy.spatial.distance import cdist
distances = cdist(
self.feature_matrix[cluster_mask].toarray(),
[cluster_center],
metric='euclidean'
)
avg_distance = np.mean(distances)
self.discovered_clusters[cluster_id] = {
'cluster_id': cluster_id,
'cluster_name': cluster_name,
'top_terms': top_terms,
'term_scores': top_scores,
'clause_count': int(len(cluster_indices)),
'proportion': float(len(cluster_indices) / len(clauses)),
'avg_distance_to_center': float(avg_distance)
}
# Compute inertia (total within-cluster sum of squares)
inertia = self.kmeans_model.inertia_
print(f"βœ… Mini-Batch K-Means complete: {self.n_clusters} clusters found")
print(f" Inertia: {inertia:.2f} (lower is better)")
print(f" Speed boost vs standard K-Means: ~3-5x faster")
return {
'method': 'MiniBatch_KMeans',
'n_clusters': self.n_clusters,
'batch_size': self.batch_size,
'discovered_clusters': self.discovered_clusters,
'cluster_labels': self.cluster_labels,
'quality_metrics': {
'inertia': float(inertia),
'avg_cluster_cohesion': float(np.mean([c['avg_distance_to_center'] for c in self.discovered_clusters.values()]))
}
}
def _clean_text(self, text: str) -> str:
"""Clean clause text"""
if not isinstance(text, str):
return ""
text = re.sub(r'\s+', ' ', text)
return text.strip()
def _generate_cluster_name(self, top_terms: List[str]) -> str:
"""Generate descriptive name from top terms"""
themes = {
'LIABILITY': ['liability', 'liable', 'damages', 'loss'],
'INDEMNITY': ['indemnify', 'indemnification', 'hold', 'harmless'],
'TERMINATION': ['terminate', 'termination', 'cancel', 'expire'],
'IP': ['intellectual', 'property', 'patent', 'copyright'],
'CONFIDENTIAL': ['confidential', 'nda', 'disclosure', 'secret'],
'PAYMENT': ['payment', 'pay', 'fee', 'price'],
'COMPLIANCE': ['comply', 'compliance', 'regulation', 'law'],
'WARRANTY': ['warranty', 'warrant', 'represent', 'guarantee']
}
for theme, keywords in themes.items():
if any(keyword in term.lower() for term in top_terms[:5] for keyword in keywords):
return f"MB_{theme}"
return f"MB_{top_terms[0].upper()}_{top_terms[1].upper()}"
# Utility function to compare all methods
def compare_risk_discovery_methods(clauses: List[str], n_patterns: int = 7,
include_advanced: bool = True) -> Dict[str, Any]:
"""
Compare all risk discovery methods on the same dataset.
Args:
clauses: List of legal clause texts
n_patterns: Number of risk patterns/clusters to discover
include_advanced: If True, includes advanced methods (slower but comprehensive)
Returns:
Comparison results with metrics for each method
"""
print("="*80)
print("πŸ”¬ COMPARING RISK DISCOVERY METHODS")
print(f" Methods to test: {9 if include_advanced else 4}")
print("="*80)
results = {}
# ===== BASIC METHODS (Fast) =====
# 1. K-Means (Original)
print("\n" + "="*80)
print("METHOD 1: K-Means Clustering (Original) - FAST")
print("="*80)
from risk_discovery import UnsupervisedRiskDiscovery
kmeans_discovery = UnsupervisedRiskDiscovery(n_clusters=n_patterns)
results['kmeans'] = kmeans_discovery.discover_risk_patterns(clauses)
# 2. LDA Topic Modeling
print("\n" + "="*80)
print("METHOD 2: LDA Topic Modeling - PROBABILISTIC")
print("="*80)
lda_discovery = TopicModelingRiskDiscovery(n_topics=n_patterns)
results['lda'] = lda_discovery.discover_risk_patterns(clauses)
# 3. Hierarchical Clustering
print("\n" + "="*80)
print("METHOD 3: Hierarchical Clustering - STRUCTURE")
print("="*80)
hierarchical_discovery = HierarchicalRiskDiscovery(n_clusters=n_patterns)
results['hierarchical'] = hierarchical_discovery.discover_risk_patterns(clauses)
# 4. DBSCAN
print("\n" + "="*80)
print("METHOD 4: DBSCAN (Density-Based) - OUTLIERS")
print("="*80)
dbscan_discovery = DensityBasedRiskDiscovery(eps=0.3, min_samples=5)
results['dbscan'] = dbscan_discovery.discover_risk_patterns(clauses, auto_tune=True)
if include_advanced:
# ===== ADVANCED METHODS =====
# 5. NMF (Non-negative Matrix Factorization)
print("\n" + "="*80)
print("METHOD 5: NMF (Matrix Factorization) - PARTS-BASED")
print("="*80)
nmf_discovery = NMFRiskDiscovery(n_components=n_patterns)
results['nmf'] = nmf_discovery.discover_risk_patterns(clauses)
# 6. Spectral Clustering
print("\n" + "="*80)
print("METHOD 6: Spectral Clustering - GRAPH-BASED")
print("="*80)
spectral_discovery = SpectralClusteringRiskDiscovery(n_clusters=n_patterns)
results['spectral'] = spectral_discovery.discover_risk_patterns(clauses)
# 7. Gaussian Mixture Model
print("\n" + "="*80)
print("METHOD 7: Gaussian Mixture Model - PROBABILISTIC SOFT")
print("="*80)
gmm_discovery = GaussianMixtureRiskDiscovery(n_components=n_patterns)
results['gmm'] = gmm_discovery.discover_risk_patterns(clauses)
# 8. Mini-Batch K-Means
print("\n" + "="*80)
print("METHOD 8: Mini-Batch K-Means - ULTRA FAST")
print("="*80)
minibatch_discovery = MiniBatchKMeansRiskDiscovery(n_clusters=n_patterns)
results['minibatch_kmeans'] = minibatch_discovery.discover_risk_patterns(clauses)
# 9. Risk-o-meter (Doc2Vec + SVM) - Chakrabarti et al., 2018
print("\n" + "="*80)
print("METHOD 9: Risk-o-meter (Doc2Vec + SVM) - PAPER BASELINE")
print("="*80)
print("πŸ“„ Based on: Chakrabarti et al., 2018")
print(" Achievement: 91% accuracy on termination clauses")
try:
from risk_o_meter import RiskOMeterFramework
risk_o_meter = RiskOMeterFramework(
vector_size=100,
epochs=30,
verbose=True
)
results['risk_o_meter'] = risk_o_meter.discover_risk_patterns(clauses, n_patterns)
except ImportError:
print("⚠️ Risk-o-meter requires gensim. Install with: pip install gensim>=4.3.0")
print(" Skipping Risk-o-meter comparison...")
except Exception as e:
print(f"⚠️ Risk-o-meter error: {e}")
print(" Skipping Risk-o-meter comparison...")
# Generate comparison summary
print("\n" + "="*80)
print("πŸ“Š COMPARISON SUMMARY")
print("="*80)
summary = {
'n_clauses': len(clauses),
'target_patterns': n_patterns,
'methods_compared': 9 if include_advanced else 4,
'method_results': {}
}
for method_name, method_results in results.items():
        n_discovered = (method_results.get('n_clusters')
                        or method_results.get('n_topics')
                        or method_results.get('n_components', 0))
print(f"\n{method_name.upper()}:")
print(f" Patterns Discovered: {n_discovered}")
if 'quality_metrics' in method_results:
print(f" Quality Metrics: {method_results['quality_metrics']}")
summary['method_results'][method_name] = {
'n_patterns': n_discovered,
'method': method_results['method'],
'quality_metrics': method_results.get('quality_metrics', {})
}
print("\n" + "="*80)
print("βœ… COMPARISON COMPLETE")
print("="*80)
return {
'summary': summary,
'detailed_results': results
}
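# Illustrative usage sketch (not executed). `load_clauses` is a hypothetical helper that
# returns a list of clause strings from your contract corpus; swap in your own loader:
#   clauses = load_clauses('contracts.jsonl')
#   comparison = compare_risk_discovery_methods(clauses, n_patterns=7, include_advanced=False)
#   print(comparison['summary']['method_results'])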