chat_bot / example_clustering.py
tomthekkan's picture
Upload folder using huggingface_hub
6d5953d verified
#!/usr/bin/env python3
"""
Example script demonstrating description clustering functionality.
This script shows how to use the DescriptionClusterer class to cluster text descriptions
from a CSV file or sample data.
"""
import pandas as pd
from description_clustering import DescriptionClusterer, create_sample_data
import json
def run_clustering_example():
    """Walk through the whole clustering workflow on generated sample data.

    The function creates sample descriptions, preprocesses and vectorizes
    them, clusters the vectors with three methods (k-means, hierarchical,
    LDA), prints a per-method summary, and writes two output files to the
    current working directory:

    * ``example_clustering_results.json`` -- labels, scores, keywords and
      cluster sizes for every method.
    * ``example_clustering_comparison.csv`` -- the sample dataframe with one
      ``cluster_<method>`` column per method.
    """
    print("=== Description Clustering Example ===\n")

    # Step 1: build the sample dataset (the helper is handed the CSV path).
    print("1. Creating sample data...")
    sample_df = create_sample_data("example_descriptions.csv")
    print(f" Created {len(sample_df)} sample descriptions\n")

    # Step 2: one clusterer instance is shared by every step below.
    print("2. Initializing clusterer...")
    clusterer = DescriptionClusterer(random_state=42)

    # Step 3: clean up the raw description strings.
    print("3. Preprocessing text...")
    raw_texts = sample_df['description'].tolist()
    clean_texts = clusterer.preprocess_text(raw_texts)
    print(f" Preprocessed {len(clean_texts)} descriptions\n")

    # Step 4: turn the cleaned text into a feature matrix.
    print("4. Vectorizing text...")
    embeddings = clusterer.vectorize_text(clean_texts, max_features=500)
    print(f" Created embeddings with shape: {embeddings.shape}\n")

    # Step 5: run each clustering method on the same embeddings.
    # The table maps a method name to a zero-argument callable that runs it;
    # insertion order defines the order in which the methods are executed.
    run_method = {
        'kmeans': lambda: clusterer.kmeans_clustering(embeddings, n_clusters=10),
        'hierarchical': lambda: clusterer.hierarchical_clustering(embeddings, n_clusters=10),
        'lda': lambda: clusterer.topic_modeling_lda(embeddings, n_topics=10),
    }
    methods = list(run_method)
    results = {}
    for method in methods:
        print(f"5. Performing {method.upper()} clustering...")
        labels = run_method[method]()

        # Both calls are made right after clustering, before the next
        # method runs, exactly once per method.
        scores = clusterer.evaluate_clustering(embeddings)
        keywords_by_cluster = clusterer.get_cluster_keywords(n_keywords=5)

        # Keep everything as plain containers so it can be dumped to JSON.
        label_list = labels.tolist()
        cluster_sizes = pd.Series(labels).value_counts().to_dict()
        results[method] = {
            'cluster_labels': label_list,
            'evaluation_scores': scores,
            'cluster_keywords': keywords_by_cluster,
            'cluster_distribution': cluster_sizes,
        }

        print(f" Silhouette Score: {scores['silhouette_score']:.4f}")
        print(f" Calinski-Harabasz Score: {scores['calinski_harabasz_score']:.2f}")
        print()

    # Step 6: print a summary for every method.
    print("6. Clustering Results Summary:")
    print("=" * 50)
    for method, result in results.items():
        print(f"\n{method.upper()} Clustering:")
        summary_scores = result['evaluation_scores']
        print(f" Silhouette Score: {summary_scores['silhouette_score']:.4f}")
        print(f" Calinski-Harabasz Score: {summary_scores['calinski_harabasz_score']:.2f}")

        print(f" Cluster Distribution:")
        sizes = result['cluster_distribution']
        for cluster_id in sorted(sizes):
            print(f" Cluster {cluster_id}: {sizes[cluster_id]} descriptions")

        print(f" Top Keywords by Cluster:")
        for cluster_id, keywords in result['cluster_keywords'].items():
            joined = ', '.join(keywords)
            print(f" Cluster {cluster_id}: {joined}")

    # Step 7: persist the detailed results.
    print("\n7. Saving results...")
    with open('example_clustering_results.json', 'w') as results_file:
        json.dump(results, results_file, indent=2)
    print(" Results saved to: example_clustering_results.json")

    # Step 8: add one label column per method to a copy of the sample data.
    print("\n8. Creating comparison dataframe...")
    comparison_df = sample_df.copy()
    for method, result in results.items():
        comparison_df[f'cluster_{method}'] = result['cluster_labels']
    comparison_df.to_csv('example_clustering_comparison.csv', index=False)
    print(" Comparison saved to: example_clustering_comparison.csv")

    closing_lines = (
        "\n=== Example completed successfully! ===",
        "\nFiles created:",
        " - example_descriptions.csv (sample data)",
        " - example_clustering_results.json (detailed results)",
        " - example_clustering_comparison.csv (comparison table)",
    )
    for line in closing_lines:
        print(line)
def analyze_specific_cluster(method='kmeans', cluster_id=0):
    """Print the member descriptions and top keywords of one saved cluster.

    Both input files are read from the current working directory:
    ``example_clustering_comparison.csv`` and
    ``example_clustering_results.json``.

    Args:
        method: Clustering method name. It selects the ``cluster_<method>``
            column of the CSV and the matching top-level entry of the JSON.
        cluster_id: Label of the cluster to inspect.
    """
    print(f"\n=== Detailed Analysis of {method.upper()} Cluster {cluster_id} ===")

    # Pick the descriptions whose label for this method equals cluster_id.
    comparison = pd.read_csv('example_clustering_comparison.csv')
    in_cluster = comparison[f'cluster_{method}'] == cluster_id
    members = comparison.loc[in_cluster, 'description'].tolist()

    print(f"\nCluster {cluster_id} contains {len(members)} descriptions:")
    print("-" * 40)
    for position, text in enumerate(members, start=1):
        print(f"{position:2d}. {text}")

    # Fetch the keywords stored alongside the labels.
    with open('example_clustering_results.json', 'r') as results_file:
        saved_results = json.load(results_file)

    # JSON object keys are always strings, hence the str() on the id; a
    # cluster without an entry simply yields no keywords.
    keywords_by_cluster = saved_results[method]['cluster_keywords']
    keywords = keywords_by_cluster.get(str(cluster_id), [])
    print(f"\nTop keywords for this cluster: {', '.join(keywords)}")
if __name__ == "__main__":
    # The full example runs first: it writes the CSV and JSON files that the
    # detailed analysis below reads back.
    run_clustering_example()

    # Drill into a single cluster; change method / cluster_id to explore
    # other results.
    analyze_specific_cluster(method='kmeans', cluster_id=0)