#!/usr/bin/env python3
"""
Example script demonstrating description clustering functionality.

This script shows how to use the DescriptionClusterer class to cluster
text descriptions from a CSV file or sample data.
"""

import pandas as pd
from description_clustering import DescriptionClusterer, create_sample_data
import json


def run_clustering_example():
    """Run a complete clustering example with sample data.

    Creates sample descriptions, vectorizes them, clusters with three
    methods (k-means, hierarchical, LDA), evaluates each run, and writes
    the results to JSON/CSV files in the working directory.
    """
    print("=== Description Clustering Example ===\n")

    # Step 1: Create sample data
    print("1. Creating sample data...")
    df = create_sample_data("example_descriptions.csv")
    print(f"   Created {len(df)} sample descriptions\n")

    # Step 2: Initialize the clusterer (fixed seed for reproducibility)
    print("2. Initializing clusterer...")
    clusterer = DescriptionClusterer(random_state=42)

    # Step 3: Preprocess the text
    print("3. Preprocessing text...")
    descriptions = df['description'].tolist()
    processed_descriptions = clusterer.preprocess_text(descriptions)
    print(f"   Preprocessed {len(processed_descriptions)} descriptions\n")

    # Step 4: Vectorize the text
    print("4. Vectorizing text...")
    embeddings = clusterer.vectorize_text(processed_descriptions, max_features=500)
    print(f"   Created embeddings with shape: {embeddings.shape}\n")

    # Step 5: Perform clustering with different methods
    methods = ['kmeans', 'hierarchical', 'lda']
    results = {}

    for method in methods:
        print(f"5. Performing {method.upper()} clustering...")

        if method == 'kmeans':
            cluster_labels = clusterer.kmeans_clustering(embeddings, n_clusters=10)
        elif method == 'hierarchical':
            cluster_labels = clusterer.hierarchical_clustering(embeddings, n_clusters=10)
        elif method == 'lda':
            cluster_labels = clusterer.topic_modeling_lda(embeddings, n_topics=10)

        # Evaluate clustering
        evaluation_scores = clusterer.evaluate_clustering(embeddings)

        # Get cluster keywords
        cluster_keywords = clusterer.get_cluster_keywords(n_keywords=5)

        # BUG FIX: value_counts().to_dict() can carry numpy integer keys and
        # values (labels usually arrive as a numpy array); json.dump rejects
        # numpy-typed dict keys with TypeError. Coerce both to builtin int.
        cluster_distribution = {
            int(label): int(count)
            for label, count in pd.Series(cluster_labels).value_counts().items()
        }

        # Store results (labels converted to a plain list for JSON)
        results[method] = {
            'cluster_labels': cluster_labels.tolist(),
            'evaluation_scores': evaluation_scores,
            'cluster_keywords': cluster_keywords,
            'cluster_distribution': cluster_distribution,
        }

        print(f"   Silhouette Score: {evaluation_scores['silhouette_score']:.4f}")
        print(f"   Calinski-Harabasz Score: {evaluation_scores['calinski_harabasz_score']:.2f}")
        print()

    # Step 6: Display results
    print("6. Clustering Results Summary:")
    print("=" * 50)

    for method, result in results.items():
        print(f"\n{method.upper()} Clustering:")
        print(f"   Silhouette Score: {result['evaluation_scores']['silhouette_score']:.4f}")
        print(f"   Calinski-Harabasz Score: {result['evaluation_scores']['calinski_harabasz_score']:.2f}")

        print(f"   Cluster Distribution:")
        for cluster_id, count in sorted(result['cluster_distribution'].items()):
            print(f"     Cluster {cluster_id}: {count} descriptions")

        print(f"   Top Keywords by Cluster:")
        for cluster_id, keywords in result['cluster_keywords'].items():
            print(f"     Cluster {cluster_id}: {', '.join(keywords)}")

    # Step 7: Save results
    print("\n7. Saving results...")
    with open('example_clustering_results.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)
    print("   Results saved to: example_clustering_results.json")

    # Step 8: Create a comparison dataframe (one cluster column per method)
    print("\n8. Creating comparison dataframe...")
    comparison_df = df.copy()
    for method in methods:
        comparison_df[f'cluster_{method}'] = results[method]['cluster_labels']

    comparison_df.to_csv('example_clustering_comparison.csv', index=False)
    print("   Comparison saved to: example_clustering_comparison.csv")

    print("\n=== Example completed successfully! ===")
    print("\nFiles created:")
    print("   - example_descriptions.csv (sample data)")
    print("   - example_clustering_results.json (detailed results)")
    print("   - example_clustering_comparison.csv (comparison table)")


def analyze_specific_cluster(method='kmeans', cluster_id=0):
    """Analyze a specific cluster in detail.

    Reads the comparison CSV and results JSON produced by
    run_clustering_example(), so that function must run first.

    Args:
        method: clustering method whose labels to inspect
            ('kmeans', 'hierarchical', or 'lda').
        cluster_id: integer id of the cluster to display.
    """
    print(f"\n=== Detailed Analysis of {method.upper()} Cluster {cluster_id} ===")

    # Load the comparison data
    df = pd.read_csv('example_clustering_comparison.csv')

    # Filter for the specific cluster
    cluster_mask = df[f'cluster_{method}'] == cluster_id
    cluster_descriptions = df[cluster_mask]['description'].tolist()

    print(f"\nCluster {cluster_id} contains {len(cluster_descriptions)} descriptions:")
    print("-" * 40)
    for i, desc in enumerate(cluster_descriptions, 1):
        print(f"{i:2d}. {desc}")

    # Load the detailed results; JSON round-trip turns dict keys into
    # strings, hence the str(cluster_id) lookup below.
    with open('example_clustering_results.json', 'r', encoding='utf-8') as f:
        results = json.load(f)

    keywords = results[method]['cluster_keywords'].get(str(cluster_id), [])
    print(f"\nTop keywords for this cluster: {', '.join(keywords)}")


if __name__ == "__main__":
    # Run the main example
    run_clustering_example()

    # Analyze a specific cluster (you can modify the parameters)
    analyze_specific_cluster(method='kmeans', cluster_id=0)