File size: 18,990 Bytes
6d5953d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
#!/usr/bin/env python3
"""
Description Clustering Script

This script provides multiple clustering techniques for analyzing descriptions from CSV files.
It supports K-means clustering, hierarchical clustering, and topic modeling using LDA.

Usage:
    python description_clustering.py --input data.csv --column descriptions --method kmeans --clusters 5
"""

import pandas as pd
import numpy as np
import argparse
import json
from pathlib import Path
from typing import List, Dict, Any, Optional
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data (each download is skipped when already present)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# NLTK >= 3.8.2 ships the pre-trained sentence tokenizer as 'punkt_tab';
# without it, word_tokenize raises LookupError on newer NLTK releases.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')


class DescriptionClusterer:
    """A comprehensive class for clustering text descriptions.

    Typical workflow::

        clusterer = DescriptionClusterer()
        cleaned = clusterer.preprocess_text(texts)
        matrix = clusterer.vectorize_text(cleaned)
        labels = clusterer.kmeans_clustering(matrix, n_clusters=5)

    The fitted vectorizer, sparse TF-IDF matrix, vocabulary, and cluster
    labels are stored on the instance so the reporting helpers
    (evaluate_clustering, get_cluster_keywords, visualize_clusters) can be
    called afterwards without re-passing state.
    """

    def __init__(self, random_state: int = 42):
        """Initialize an unfitted clusterer.

        Args:
            random_state: Seed forwarded to every stochastic estimator
                (KMeans, LDA, NMF, t-SNE) for reproducible runs.
        """
        self.random_state = random_state
        self.vectorizer = None      # fitted TfidfVectorizer (set by vectorize_text)
        self.model = None           # most recently fitted estimator
        self.cluster_labels = None  # per-document cluster ids (numpy array)
        self.feature_names = None   # vocabulary aligned with matrix columns
        self.embeddings = None      # sparse TF-IDF matrix (set by vectorize_text)

    def preprocess_text(self, texts: List[str]) -> List[str]:
        """Preprocess text data by tokenizing, removing stopwords, and lemmatizing.

        Args:
            texts: Raw description strings; NaN/empty entries become ''.

        Returns:
            One space-joined string of cleaned tokens per input text.
        """
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))

        processed_texts = []
        for text in texts:
            if pd.isna(text) or text == '':
                processed_texts.append('')
                continue

            # Tokenize
            tokens = word_tokenize(text.lower())

            # Keep alphanumeric tokens longer than 2 chars, drop stopwords,
            # and reduce each surviving token to its lemma.
            tokens = [lemmatizer.lemmatize(token) for token in tokens 
                     if token.isalnum() and token not in stop_words and len(token) > 2]

            processed_texts.append(' '.join(tokens))

        return processed_texts

    def vectorize_text(self, texts: List[str], max_features: int = 1000, 
                      min_df: int = 2, max_df: float = 0.95) -> np.ndarray:
        """Convert text to TF-IDF vectors.

        Stores the fitted vectorizer, the sparse matrix (self.embeddings),
        and the vocabulary (self.feature_names) on the instance.

        Args:
            texts: Preprocessed documents.
            max_features: Vocabulary size cap.
            min_df: Ignore terms in fewer than this many documents.
            max_df: Ignore terms in more than this fraction of documents.

        Returns:
            Dense (n_documents, n_features) TF-IDF array.
        """
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            min_df=min_df,
            max_df=max_df,
            ngram_range=(1, 2),   # unigrams and bigrams
            stop_words='english'
        )

        self.embeddings = self.vectorizer.fit_transform(texts)
        self.feature_names = self.vectorizer.get_feature_names_out()
        return self.embeddings.toarray()

    def kmeans_clustering(self, embeddings: np.ndarray, n_clusters: int = 5) -> np.ndarray:
        """Perform K-means clustering; returns per-document cluster labels."""
        self.model = KMeans(
            n_clusters=n_clusters,
            random_state=self.random_state,
            n_init=10
        )
        self.cluster_labels = self.model.fit_predict(embeddings)
        return self.cluster_labels

    def hierarchical_clustering(self, embeddings: np.ndarray, n_clusters: int = 5) -> np.ndarray:
        """Perform agglomerative (Ward-linkage) clustering; requires a dense matrix."""
        self.model = AgglomerativeClustering(
            n_clusters=n_clusters,
            linkage='ward'
        )
        self.cluster_labels = self.model.fit_predict(embeddings)
        return self.cluster_labels

    def topic_modeling_lda(self, embeddings: np.ndarray, n_topics: int = 5) -> np.ndarray:
        """Assign each document to its dominant LDA topic.

        Note: fits on the sparse TF-IDF matrix stored by vectorize_text()
        (self.embeddings); the ``embeddings`` argument is accepted only for
        interface symmetry with the other clustering methods.
        """
        self.model = LatentDirichletAllocation(
            n_components=n_topics,
            random_state=self.random_state,
            max_iter=100
        )
        # LDA expects non-negative values, so we use the raw TF-IDF matrix
        topic_distributions = self.model.fit_transform(self.embeddings)
        self.cluster_labels = np.argmax(topic_distributions, axis=1)
        return self.cluster_labels

    def topic_modeling_nmf(self, embeddings: np.ndarray, n_topics: int = 5) -> np.ndarray:
        """Assign each document to its dominant NMF topic.

        Note: like topic_modeling_lda, this fits on self.embeddings rather
        than the ``embeddings`` argument.
        """
        self.model = NMF(
            n_components=n_topics,
            random_state=self.random_state,
            max_iter=200
        )
        topic_distributions = self.model.fit_transform(self.embeddings)
        self.cluster_labels = np.argmax(topic_distributions, axis=1)
        return self.cluster_labels

    def evaluate_clustering(self, embeddings: np.ndarray) -> Dict[str, float]:
        """Evaluate clustering quality using silhouette and Calinski-Harabasz scores.

        Returns an empty dict when no labels exist yet, or when every
        document landed in a single cluster — both metrics are undefined
        there and sklearn would raise ValueError.
        """
        if self.cluster_labels is None:
            return {}

        # Both metrics require at least 2 distinct labels.
        if len(np.unique(self.cluster_labels)) < 2:
            return {}

        # Silhouette score (higher is better, range: -1 to 1)
        silhouette_avg = silhouette_score(embeddings, self.cluster_labels)

        # Calinski-Harabasz score (higher is better)
        calinski_harabasz = calinski_harabasz_score(embeddings, self.cluster_labels)

        # Cast to plain floats so the result is JSON-serializable as-is.
        return {
            'silhouette_score': float(silhouette_avg),
            'calinski_harabasz_score': float(calinski_harabasz)
        }

    def get_cluster_keywords(self, n_keywords: int = 10) -> Dict[int, List[str]]:
        """Extract top keywords for each cluster.

        Keys are plain Python ints (not numpy integers) so the mapping can
        be passed straight to json.dump.
        """
        if self.cluster_labels is None or self.feature_names is None:
            return {}

        cluster_keywords = {}
        unique_clusters = np.unique(self.cluster_labels)

        for cluster_id in unique_clusters:
            cluster_mask = self.cluster_labels == cluster_id
            cluster_embeddings = self.embeddings[cluster_mask]

            if cluster_embeddings.shape[0] == 0:
                continue

            # Calculate mean TF-IDF scores for this cluster
            cluster_means = np.mean(cluster_embeddings.toarray(), axis=0)

            # Highest-scoring terms first
            top_indices = np.argsort(cluster_means)[-n_keywords:][::-1]
            keywords = [self.feature_names[i] for i in top_indices]
            # int() avoids numpy int64 keys, which json.dump rejects.
            cluster_keywords[int(cluster_id)] = keywords

        return cluster_keywords

    def visualize_clusters(self, embeddings: np.ndarray, output_path: str = "cluster_visualization.png"):
        """Create a t-SNE scatter plot of the clusters and save it to output_path."""
        if self.cluster_labels is None:
            print("No clustering results to visualize.")
            return

        # Reduce dimensionality for visualization; perplexity must be < n_samples.
        tsne = TSNE(n_components=2, random_state=self.random_state, perplexity=min(30, len(embeddings)-1))
        embeddings_2d = tsne.fit_transform(embeddings)

        # Create visualization
        plt.figure(figsize=(12, 8))
        scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], 
                            c=self.cluster_labels, cmap='viridis', alpha=0.7)
        plt.colorbar(scatter)
        plt.title('t-SNE Visualization of Clusters')
        plt.xlabel('t-SNE Component 1')
        plt.ylabel('t-SNE Component 2')
        plt.tight_layout()
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.show()
        print(f"Visualization saved to {output_path}")


def create_sample_data(output_path: str = "sample_descriptions.csv"):
    """Write a 100-row demonstration CSV of descriptions and return it as a DataFrame.

    The rows span ten themed categories (ten descriptions each); the 'id'
    column numbers them sequentially from 1.
    """
    descriptions = [
        # Technology/Software descriptions
        "Advanced machine learning algorithm for predictive analytics",
        "Cloud-based software solution for enterprise management",
        "Mobile app development framework with cross-platform support",
        "Data visualization tool for business intelligence",
        "Cybersecurity software for threat detection",
        "API integration platform for third-party services",
        "Database management system with real-time synchronization",
        "Web development framework with modern UI components",
        "Artificial intelligence chatbot for customer service",
        "Blockchain technology for secure transactions",

        # Food/Restaurant descriptions
        "Authentic Italian restaurant with traditional recipes",
        "Organic farm-to-table dining experience",
        "Gourmet burger joint with craft beer selection",
        "Sushi bar with fresh daily ingredients",
        "Vegan restaurant with plant-based alternatives",
        "Pizza place with wood-fired oven cooking",
        "Mexican restaurant with homemade tortillas",
        "Coffee shop with artisanal brewing methods",
        "Bakery with fresh pastries and bread",
        "Seafood restaurant with ocean views",

        # Travel/Tourism descriptions
        "Luxury hotel with spa and wellness facilities",
        "Adventure tour company for outdoor activities",
        "Cultural heritage site with guided tours",
        "Beach resort with water sports equipment",
        "Mountain hiking trail with scenic viewpoints",
        "City walking tour with historical landmarks",
        "Eco-tourism lodge in rainforest setting",
        "Ski resort with modern lift systems",
        "Wine tasting tour in vineyard region",
        "Wildlife safari with expert guides",

        # Health/Wellness descriptions
        "Fitness center with personal training programs",
        "Yoga studio with meditation classes",
        "Medical clinic with specialized treatments",
        "Nutrition counseling for healthy eating",
        "Physical therapy center for rehabilitation",
        "Mental health counseling services",
        "Alternative medicine practice with holistic approach",
        "Dental clinic with modern equipment",
        "Pharmacy with prescription services",
        "Wellness spa with massage therapy",

        # Education/Training descriptions
        "Online learning platform with interactive courses",
        "Language school with native speakers",
        "Professional certification program",
        "Technical training institute for skills development",
        "University course with research opportunities",
        "Workshop series for creative skills",
        "Corporate training program for leadership",
        "Vocational school with hands-on experience",
        "Tutoring service for academic support",
        "Distance learning program with flexible schedules",

        # Entertainment/Recreation descriptions
        "Movie theater with premium seating options",
        "Concert venue with state-of-the-art sound",
        "Gaming arcade with virtual reality experiences",
        "Art gallery with contemporary exhibitions",
        "Sports complex with multiple facilities",
        "Bowling alley with family entertainment",
        "Karaoke bar with private rooms",
        "Comedy club with stand-up performances",
        "Dance studio with various styles",
        "Music school with instrument lessons",

        # Business/Professional descriptions
        "Consulting firm with strategic planning services",
        "Marketing agency with digital expertise",
        "Legal practice with specialized areas",
        "Accounting firm with tax preparation",
        "Real estate agency with property management",
        "Insurance company with comprehensive coverage",
        "Financial planning service for investments",
        "Public relations firm with media relations",
        "Human resources consulting for recruitment",
        "IT consulting with system integration",

        # Retail/Shopping descriptions
        "Fashion boutique with designer clothing",
        "Electronics store with latest gadgets",
        "Bookstore with rare and used books",
        "Home improvement store with tools",
        "Jewelry store with custom designs",
        "Toy store with educational games",
        "Pet store with grooming services",
        "Garden center with plants and supplies",
        "Sporting goods store with equipment",
        "Antique shop with vintage items",

        # Transportation/Logistics descriptions
        "Delivery service with same-day options",
        "Moving company with packing services",
        "Taxi service with luxury vehicles",
        "Car rental agency with flexible terms",
        "Freight forwarding with international shipping",
        "Warehouse storage with climate control",
        "Courier service with tracking systems",
        "Transportation company with fleet management",
        "Logistics provider with supply chain solutions",
        "Shipping company with express delivery",

        # Home/Services descriptions
        "Cleaning service with eco-friendly products",
        "Plumbing company with emergency repairs",
        "Electrical contractor with safety certification",
        "HVAC service with maintenance plans",
        "Landscaping company with design services",
        "Roofing contractor with warranty coverage",
        "Painting service with color consultation",
        "Carpentry workshop with custom furniture",
        "Security system installation with monitoring",
        "Home automation with smart technology",
    ]

    # Ids track the description count so the two columns can never diverge.
    df = pd.DataFrame({
        'id': range(1, len(descriptions) + 1),
        'description': descriptions,
    })
    df.to_csv(output_path, index=False)
    print(f"Sample data created with {len(df)} descriptions: {output_path}")
    return df


def main():
    """CLI entry point: load a CSV, cluster one text column, report results.

    Side effects: writes a JSON results file, a clustered copy of the input
    CSV, and optionally a t-SNE visualization PNG.
    """
    parser = argparse.ArgumentParser(description='Cluster descriptions from CSV file')
    parser.add_argument('--input', type=str, help='Input CSV file path')
    parser.add_argument('--column', type=str, default='description', help='Column name containing descriptions')
    parser.add_argument('--method', type=str, choices=['kmeans', 'hierarchical', 'lda', 'nmf'], 
                       default='kmeans', help='Clustering method')
    parser.add_argument('--clusters', type=int, default=5, help='Number of clusters')
    parser.add_argument('--max-features', type=int, default=1000, help='Maximum features for vectorization')
    parser.add_argument('--output', type=str, default='clustering_results.json', help='Output file for results')
    parser.add_argument('--visualize', action='store_true', help='Generate visualization')
    parser.add_argument('--create-sample', action='store_true', help='Create sample data for testing')
    
    args = parser.parse_args()
    
    # Create sample data if requested
    if args.create_sample:
        create_sample_data()
        return
    
    # Fall back to generated sample data when no input file is given.
    if not args.input:
        print("No input file specified. Creating sample data...")
        create_sample_data()
        args.input = "sample_descriptions.csv"
    
    if not Path(args.input).exists():
        print(f"Input file {args.input} not found.")
        return
    
    print(f"Loading data from {args.input}...")
    df = pd.read_csv(args.input)
    
    if args.column not in df.columns:
        print(f"Column '{args.column}' not found in CSV. Available columns: {list(df.columns)}")
        return
    
    # Initialize clusterer
    clusterer = DescriptionClusterer()
    
    # Preprocess text
    print("Preprocessing text...")
    descriptions = df[args.column].fillna('').astype(str).tolist()
    processed_descriptions = clusterer.preprocess_text(descriptions)
    
    # Vectorize text
    print("Vectorizing text...")
    embeddings = clusterer.vectorize_text(processed_descriptions, max_features=args.max_features)
    
    # Perform clustering (argparse `choices` guarantees one branch matches)
    print(f"Performing {args.method} clustering with {args.clusters} clusters...")
    if args.method == 'kmeans':
        cluster_labels = clusterer.kmeans_clustering(embeddings, args.clusters)
    elif args.method == 'hierarchical':
        cluster_labels = clusterer.hierarchical_clustering(embeddings, args.clusters)
    elif args.method == 'lda':
        cluster_labels = clusterer.topic_modeling_lda(embeddings, args.clusters)
    elif args.method == 'nmf':
        cluster_labels = clusterer.topic_modeling_nmf(embeddings, args.clusters)
    
    # Add cluster labels to dataframe
    df['cluster'] = cluster_labels
    
    # Evaluate clustering
    print("Evaluating clustering quality...")
    evaluation_scores = clusterer.evaluate_clustering(embeddings)
    
    # Get cluster keywords
    print("Extracting cluster keywords...")
    cluster_keywords = clusterer.get_cluster_keywords()
    
    # Generate results. BUG FIX: value_counts() and the keyword mapping
    # produce numpy int64 keys/values, which json.dump rejects ("keys must
    # be str, int, float, bool or None"); convert everything to plain
    # Python ints/floats before serializing.
    results = {
        'method': args.method,
        'n_clusters': args.clusters,
        'n_samples': len(df),
        'evaluation_scores': {k: float(v) for k, v in evaluation_scores.items()},
        'cluster_keywords': {int(k): list(v) for k, v in cluster_keywords.items()},
        'cluster_distribution': {int(k): int(v) for k, v in df['cluster'].value_counts().items()},
        'sample_descriptions': {}
    }
    
    # Add sample descriptions for each cluster
    for cluster_id in sorted(df['cluster'].unique()):
        cluster_samples = df[df['cluster'] == cluster_id][args.column].head(3).tolist()
        results['sample_descriptions'][f'cluster_{int(cluster_id)}'] = cluster_samples
    
    # Save results
    with open(args.output, 'w') as f:
        json.dump(results, f, indent=2)
    
    print(f"\nClustering completed!")
    print(f"Results saved to: {args.output}")
    print(f"\nEvaluation Scores:")
    for metric, score in evaluation_scores.items():
        print(f"  {metric}: {score:.4f}")
    
    print(f"\nCluster Distribution:")
    for cluster_id, count in sorted(results['cluster_distribution'].items()):
        print(f"  Cluster {cluster_id}: {count} descriptions")
    
    print(f"\nTop Keywords by Cluster:")
    for cluster_id, keywords in cluster_keywords.items():
        print(f"  Cluster {cluster_id}: {', '.join(keywords[:5])}")
    
    # Save clustered data
    output_csv = f"clustered_{Path(args.input).name}"
    df.to_csv(output_csv, index=False)
    print(f"\nClustered data saved to: {output_csv}")
    
    # Generate visualization if requested
    if args.visualize:
        clusterer.visualize_clusters(embeddings)


# Allow the module to be imported (e.g. for testing) without running the CLI.
if __name__ == "__main__":
    main()