# NOTE: removed non-Python web-scrape artifacts (page header, file-size line,
# commit hash, and line-number gutter) that preceded the script.
#!/usr/bin/env python3
"""
Example script demonstrating description clustering functionality.
This script shows how to use the DescriptionClusterer class to cluster text descriptions
from a CSV file or sample data.
"""
import pandas as pd
from description_clustering import DescriptionClusterer, create_sample_data
import json
def run_clustering_example(n_clusters=10):
    """Run a complete clustering example with sample data.

    Creates a sample description CSV, preprocesses and vectorizes the text,
    clusters it with k-means, hierarchical clustering and LDA, prints
    evaluation scores and per-cluster keywords for each method, and saves
    all results to JSON/CSV files in the working directory.

    Args:
        n_clusters: Number of clusters (and LDA topics) each method should
            produce. Defaults to 10, the previously hard-coded value, so
            existing callers are unaffected.
    """
    print("=== Description Clustering Example ===\n")

    # Step 1: Create sample data
    print("1. Creating sample data...")
    df = create_sample_data("example_descriptions.csv")
    print(f" Created {len(df)} sample descriptions\n")

    # Step 2: Initialize the clusterer (fixed seed for reproducible runs)
    print("2. Initializing clusterer...")
    clusterer = DescriptionClusterer(random_state=42)

    # Step 3: Preprocess the text
    print("3. Preprocessing text...")
    descriptions = df['description'].tolist()
    processed_descriptions = clusterer.preprocess_text(descriptions)
    print(f" Preprocessed {len(processed_descriptions)} descriptions\n")

    # Step 4: Vectorize the text
    print("4. Vectorizing text...")
    embeddings = clusterer.vectorize_text(processed_descriptions, max_features=500)
    print(f" Created embeddings with shape: {embeddings.shape}\n")

    # Step 5: Perform clustering with each method and collect its results
    methods = ['kmeans', 'hierarchical', 'lda']
    results = {}
    for method in methods:
        print(f"5. Performing {method.upper()} clustering...")
        if method == 'kmeans':
            cluster_labels = clusterer.kmeans_clustering(embeddings, n_clusters=n_clusters)
        elif method == 'hierarchical':
            cluster_labels = clusterer.hierarchical_clustering(embeddings, n_clusters=n_clusters)
        elif method == 'lda':
            cluster_labels = clusterer.topic_modeling_lda(embeddings, n_topics=n_clusters)

        # Evaluate the clustering just produced on the same embeddings
        evaluation_scores = clusterer.evaluate_clustering(embeddings)

        # Get the most representative keywords per cluster
        cluster_keywords = clusterer.get_cluster_keywords(n_keywords=5)

        # Store everything as JSON-serializable values (labels -> plain list)
        results[method] = {
            'cluster_labels': cluster_labels.tolist(),
            'evaluation_scores': evaluation_scores,
            'cluster_keywords': cluster_keywords,
            'cluster_distribution': pd.Series(cluster_labels).value_counts().to_dict()
        }
        print(f" Silhouette Score: {evaluation_scores['silhouette_score']:.4f}")
        print(f" Calinski-Harabasz Score: {evaluation_scores['calinski_harabasz_score']:.2f}")
        print()

    # Step 6: Display a summary for every method
    print("6. Clustering Results Summary:")
    print("=" * 50)
    for method, result in results.items():
        print(f"\n{method.upper()} Clustering:")
        print(f" Silhouette Score: {result['evaluation_scores']['silhouette_score']:.4f}")
        print(f" Calinski-Harabasz Score: {result['evaluation_scores']['calinski_harabasz_score']:.2f}")
        print(" Cluster Distribution:")
        for cluster_id, count in sorted(result['cluster_distribution'].items()):
            print(f" Cluster {cluster_id}: {count} descriptions")
        print(" Top Keywords by Cluster:")
        for cluster_id, keywords in result['cluster_keywords'].items():
            print(f" Cluster {cluster_id}: {', '.join(keywords)}")

    # Step 7: Save the detailed results
    print("\n7. Saving results...")
    with open('example_clustering_results.json', 'w') as f:
        json.dump(results, f, indent=2)
    print(" Results saved to: example_clustering_results.json")

    # Step 8: Side-by-side label comparison table, one column per method
    print("\n8. Creating comparison dataframe...")
    comparison_df = df.copy()
    for method in methods:
        comparison_df[f'cluster_{method}'] = results[method]['cluster_labels']
    comparison_df.to_csv('example_clustering_comparison.csv', index=False)
    print(" Comparison saved to: example_clustering_comparison.csv")

    print("\n=== Example completed successfully! ===")
    print("\nFiles created:")
    print(" - example_descriptions.csv (sample data)")
    print(" - example_clustering_results.json (detailed results)")
    print(" - example_clustering_comparison.csv (comparison table)")
def analyze_specific_cluster(method='kmeans', cluster_id=0):
    """Print a detailed report for one cluster produced by a given method.

    Reads the comparison CSV and JSON results written by
    run_clustering_example() from the working directory and prints the
    cluster's member descriptions plus its recorded top keywords.

    Args:
        method: Name of the clustering method whose labels to inspect
            (must match a ``cluster_<method>`` column in the CSV).
        cluster_id: Integer id of the cluster to report on.
    """
    print(f"\n=== Detailed Analysis of {method.upper()} Cluster {cluster_id} ===")

    # Pull the rows assigned to this cluster out of the comparison table.
    frame = pd.read_csv('example_clustering_comparison.csv')
    members = frame.loc[frame[f'cluster_{method}'] == cluster_id, 'description'].tolist()

    print(f"\nCluster {cluster_id} contains {len(members)} descriptions:")
    print("-" * 40)
    for position, text in enumerate(members, 1):
        print(f"{position:2d}. {text}")

    # Look up the keywords saved for this cluster; JSON object keys are
    # strings, so the integer id is stringified for the lookup.
    with open('example_clustering_results.json', 'r') as handle:
        saved = json.load(handle)
    keywords = saved[method]['cluster_keywords'].get(str(cluster_id), [])
    print(f"\nTop keywords for this cluster: {', '.join(keywords)}")
if __name__ == "__main__":
# Run the main example
run_clustering_example()
# Analyze a specific cluster (you can modify the parameters)
analyze_specific_cluster(method='kmeans', cluster_id=0) |