Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Example script demonstrating description clustering functionality. | |
| This script shows how to use the DescriptionClusterer class to cluster text descriptions | |
| from a CSV file or sample data. | |
| """ | |
| import pandas as pd | |
| from description_clustering import DescriptionClusterer, create_sample_data | |
| import json | |
def run_clustering_example(n_clusters=10, max_features=500, random_state=42):
    """Run a complete clustering example with sample data.

    Generates sample descriptions, preprocesses and vectorizes them, clusters
    the embeddings with K-means, hierarchical clustering and LDA, prints a
    summary for each method, and saves the results to
    ``example_clustering_results.json`` and
    ``example_clustering_comparison.csv`` in the current directory.

    Args:
        n_clusters: Number of clusters requested from K-means and
            hierarchical clustering, and number of topics requested from LDA.
        max_features: Passed to ``DescriptionClusterer.vectorize_text``.
        random_state: Passed to the ``DescriptionClusterer`` constructor.

    Returns:
        None. Progress and results are printed and written to disk.
    """
    print("=== Description Clustering Example ===\n")

    # Step 1: Create sample data
    print("1. Creating sample data...")
    df = create_sample_data("example_descriptions.csv")
    print(f" Created {len(df)} sample descriptions\n")

    # Step 2: Initialize the clusterer
    print("2. Initializing clusterer...")
    clusterer = DescriptionClusterer(random_state=random_state)

    # Step 3: Preprocess the text
    print("3. Preprocessing text...")
    descriptions = df['description'].tolist()
    processed_descriptions = clusterer.preprocess_text(descriptions)
    print(f" Preprocessed {len(processed_descriptions)} descriptions\n")

    # Step 4: Vectorize the text
    print("4. Vectorizing text...")
    embeddings = clusterer.vectorize_text(
        processed_descriptions, max_features=max_features
    )
    print(f" Created embeddings with shape: {embeddings.shape}\n")

    # Step 5: Perform clustering with different methods.
    # A dispatch table (instead of an if/elif chain) guarantees that every
    # listed method has a callable, so the labels can never be left unbound
    # or carried over from a previous iteration.
    clustering_runners = {
        'kmeans': lambda: clusterer.kmeans_clustering(
            embeddings, n_clusters=n_clusters
        ),
        'hierarchical': lambda: clusterer.hierarchical_clustering(
            embeddings, n_clusters=n_clusters
        ),
        'lda': lambda: clusterer.topic_modeling_lda(
            embeddings, n_topics=n_clusters
        ),
    }
    results = {}
    for method, run_method in clustering_runners.items():
        print(f"5. Performing {method.upper()} clustering...")
        cluster_labels = run_method()

        # NOTE(review): these two calls receive no labels, so they presumably
        # use state left on the clusterer by the clustering call above; keep
        # them directly after it - confirm against DescriptionClusterer.
        evaluation_scores = clusterer.evaluate_clustering(embeddings)
        cluster_keywords = clusterer.get_cluster_keywords(n_keywords=5)

        # Store results
        results[method] = {
            'cluster_labels': cluster_labels.tolist(),
            'evaluation_scores': evaluation_scores,
            'cluster_keywords': cluster_keywords,
            'cluster_distribution': pd.Series(cluster_labels).value_counts().to_dict(),
        }
        print(f" Silhouette Score: {evaluation_scores['silhouette_score']:.4f}")
        print(f" Calinski-Harabasz Score: {evaluation_scores['calinski_harabasz_score']:.2f}")
        print()

    # Step 6: Display results
    print("6. Clustering Results Summary:")
    print("=" * 50)
    for method, result in results.items():
        scores = result['evaluation_scores']
        print(f"\n{method.upper()} Clustering:")
        print(f" Silhouette Score: {scores['silhouette_score']:.4f}")
        print(f" Calinski-Harabasz Score: {scores['calinski_harabasz_score']:.2f}")
        print(" Cluster Distribution:")
        for cluster_id, count in sorted(result['cluster_distribution'].items()):
            print(f" Cluster {cluster_id}: {count} descriptions")
        print(" Top Keywords by Cluster:")
        for cluster_id, keywords in result['cluster_keywords'].items():
            print(f" Cluster {cluster_id}: {', '.join(keywords)}")

    # Step 7: Save results
    print("\n7. Saving results...")
    # NOTE(review): json.dump needs plain Python types; evaluation_scores and
    # cluster_keywords are stored exactly as the clusterer returned them -
    # confirm they are JSON-serializable.
    # Explicit encoding keeps the file identical across platforms.
    with open('example_clustering_results.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)
    print(" Results saved to: example_clustering_results.json")

    # Step 8: Create a comparison dataframe (one label column per method)
    print("\n8. Creating comparison dataframe...")
    comparison_df = df.copy()
    for method, result in results.items():
        comparison_df[f'cluster_{method}'] = result['cluster_labels']
    comparison_df.to_csv('example_clustering_comparison.csv', index=False)
    print(" Comparison saved to: example_clustering_comparison.csv")

    print("\n=== Example completed successfully! ===")
    print("\nFiles created:")
    print(" - example_descriptions.csv (sample data)")
    print(" - example_clustering_results.json (detailed results)")
    print(" - example_clustering_comparison.csv (comparison table)")
def analyze_specific_cluster(method='kmeans', cluster_id=0,
                             comparison_path='example_clustering_comparison.csv',
                             results_path='example_clustering_results.json'):
    """Analyze a specific cluster in detail.

    Prints every description assigned to ``cluster_id`` by ``method``,
    followed by the top keywords recorded for that cluster.

    Args:
        method: Clustering method name; selects the ``cluster_<method>``
            column of the comparison CSV and the matching entry of the
            results JSON.
        cluster_id: Cluster label to inspect.
        comparison_path: CSV file with a ``description`` column and one
            ``cluster_<method>`` column per method. Defaults to the file
            written by ``run_clustering_example``.
        results_path: JSON file mapping each method to a dict that holds a
            ``cluster_keywords`` mapping. Defaults to the file written by
            ``run_clustering_example``.

    Returns:
        None. The analysis is printed to stdout.

    Raises:
        KeyError: If the CSV has no ``cluster_<method>`` column or the JSON
            has no entry for ``method``.
    """
    print(f"\n=== Detailed Analysis of {method.upper()} Cluster {cluster_id} ===")

    # Load the comparison data and keep only the rows of the requested cluster
    df = pd.read_csv(comparison_path)
    in_cluster = df[f'cluster_{method}'] == cluster_id
    cluster_descriptions = df.loc[in_cluster, 'description'].tolist()

    print(f"\nCluster {cluster_id} contains {len(cluster_descriptions)} descriptions:")
    print("-" * 40)
    for i, desc in enumerate(cluster_descriptions, 1):
        print(f"{i:2d}. {desc}")

    # Load the detailed results; explicit encoding avoids depending on the
    # platform's locale default.
    with open(results_path, 'r', encoding='utf-8') as f:
        results = json.load(f)

    # JSON object keys are always strings, so the cluster id is looked up in
    # its string form; an unknown cluster yields an empty keyword list.
    keywords = results[method]['cluster_keywords'].get(str(cluster_id), [])
    print(f"\nTop keywords for this cluster: {', '.join(keywords)}")
if __name__ == "__main__":
    # Produce the sample data, cluster it, and write the result files.
    run_clustering_example()
    # Then drill into a single cluster from those files; adjust the
    # arguments to inspect a different method or cluster.
    analyze_specific_cluster(cluster_id=0, method='kmeans')