#!/usr/bin/env python3
"""
Example script demonstrating description clustering functionality.

This script shows how to use the DescriptionClusterer class to cluster text descriptions
from a CSV file or sample data.
"""

import pandas as pd
from description_clustering import DescriptionClusterer, create_sample_data
import json

def run_clustering_example():
    """Run a complete clustering example with sample data.

    Generates sample descriptions, preprocesses and vectorizes them, clusters
    the embeddings with three methods (k-means, hierarchical, LDA), prints a
    comparison summary, and writes three artifacts to the current directory:
    ``example_descriptions.csv``, ``example_clustering_results.json``, and
    ``example_clustering_comparison.csv``.
    """

    print("=== Description Clustering Example ===\n")

    # Step 1: Create sample data
    print("1. Creating sample data...")
    df = create_sample_data("example_descriptions.csv")
    print(f"   Created {len(df)} sample descriptions\n")

    # Step 2: Initialize the clusterer (fixed seed for reproducible runs)
    print("2. Initializing clusterer...")
    clusterer = DescriptionClusterer(random_state=42)

    # Step 3: Preprocess the text
    print("3. Preprocessing text...")
    descriptions = df['description'].tolist()
    processed_descriptions = clusterer.preprocess_text(descriptions)
    print(f"   Preprocessed {len(processed_descriptions)} descriptions\n")

    # Step 4: Vectorize the text
    print("4. Vectorizing text...")
    embeddings = clusterer.vectorize_text(processed_descriptions, max_features=500)
    print(f"   Created embeddings with shape: {embeddings.shape}\n")

    # Step 5: Perform clustering with each method.
    # Dispatch table avoids the if/elif chain and a potentially-unbound
    # cluster_labels variable if the method list ever grows.
    cluster_fns = {
        'kmeans': lambda: clusterer.kmeans_clustering(embeddings, n_clusters=10),
        'hierarchical': lambda: clusterer.hierarchical_clustering(embeddings, n_clusters=10),
        'lda': lambda: clusterer.topic_modeling_lda(embeddings, n_topics=10),
    }
    methods = ['kmeans', 'hierarchical', 'lda']
    results = {}

    for method in methods:
        print(f"5. Performing {method.upper()} clustering...")

        cluster_labels = cluster_fns[method]()

        # Evaluate clustering (presumably scores the labels stored on the
        # clusterer by the call above — TODO confirm against DescriptionClusterer)
        evaluation_scores = clusterer.evaluate_clustering(embeddings)

        # Get cluster keywords
        cluster_keywords = clusterer.get_cluster_keywords(n_keywords=5)

        # Store results. Cast the distribution's keys/values to plain int:
        # value_counts() on numpy-backed labels yields numpy integers, which
        # json.dump rejects as dict keys (TypeError: keys must be str, int, ...).
        distribution = {
            int(cluster_id): int(count)
            for cluster_id, count in pd.Series(cluster_labels).value_counts().items()
        }
        results[method] = {
            'cluster_labels': cluster_labels.tolist(),
            'evaluation_scores': evaluation_scores,
            'cluster_keywords': cluster_keywords,
            'cluster_distribution': distribution
        }

        print(f"   Silhouette Score: {evaluation_scores['silhouette_score']:.4f}")
        print(f"   Calinski-Harabasz Score: {evaluation_scores['calinski_harabasz_score']:.2f}")
        print()

    # Step 6: Display results
    print("6. Clustering Results Summary:")
    print("=" * 50)

    for method, result in results.items():
        print(f"\n{method.upper()} Clustering:")
        print(f"  Silhouette Score: {result['evaluation_scores']['silhouette_score']:.4f}")
        print(f"  Calinski-Harabasz Score: {result['evaluation_scores']['calinski_harabasz_score']:.2f}")

        print(f"  Cluster Distribution:")
        for cluster_id, count in sorted(result['cluster_distribution'].items()):
            print(f"    Cluster {cluster_id}: {count} descriptions")

        print(f"  Top Keywords by Cluster:")
        for cluster_id, keywords in result['cluster_keywords'].items():
            print(f"    Cluster {cluster_id}: {', '.join(keywords)}")

    # Step 7: Save results
    print("\n7. Saving results...")
    with open('example_clustering_results.json', 'w') as f:
        json.dump(results, f, indent=2)
    print("   Results saved to: example_clustering_results.json")

    # Step 8: Create a comparison dataframe with one label column per method
    print("\n8. Creating comparison dataframe...")
    comparison_df = df.copy()
    for method in methods:
        comparison_df[f'cluster_{method}'] = results[method]['cluster_labels']

    comparison_df.to_csv('example_clustering_comparison.csv', index=False)
    print("   Comparison saved to: example_clustering_comparison.csv")

    print("\n=== Example completed successfully! ===")
    print("\nFiles created:")
    print("  - example_descriptions.csv (sample data)")
    print("  - example_clustering_results.json (detailed results)")
    print("  - example_clustering_comparison.csv (comparison table)")

def analyze_specific_cluster(method='kmeans', cluster_id=0):
    """Print a detailed view of one cluster from a previous example run.

    Reads ``example_clustering_comparison.csv`` and
    ``example_clustering_results.json`` (both produced by
    ``run_clustering_example``), lists every description assigned to
    ``cluster_id`` under the given ``method``, and shows its top keywords.
    """

    print(f"\n=== Detailed Analysis of {method.upper()} Cluster {cluster_id} ===")

    # Pull the descriptions that landed in the requested cluster.
    comparison = pd.read_csv('example_clustering_comparison.csv')
    member_rows = comparison.loc[comparison[f'cluster_{method}'] == cluster_id]
    members = member_rows['description'].tolist()

    print(f"\nCluster {cluster_id} contains {len(members)} descriptions:")
    print("-" * 40)

    for rank, text in enumerate(members, 1):
        print(f"{rank:2d}. {text}")

    # The detailed results file keys clusters by string (JSON object keys),
    # hence the str() lookup below.
    with open('example_clustering_results.json', 'r') as fh:
        saved_results = json.load(fh)

    keywords = saved_results[method]['cluster_keywords'].get(str(cluster_id), [])
    print(f"\nTop keywords for this cluster: {', '.join(keywords)}")

if __name__ == "__main__":
    # Run the full pipeline: sample data -> clustering -> saved artifacts.
    run_clustering_example()
    
    # Drill into one cluster from the files written above
    # (adjust method/cluster_id to inspect a different cluster).
    analyze_specific_cluster(method='kmeans', cluster_id=0)