#!/usr/bin/env python3
"""
Example script demonstrating description clustering functionality.

This script shows how to use the DescriptionClusterer class to cluster text descriptions
from a CSV file or sample data.
"""

import pandas as pd
from description_clustering import DescriptionClusterer, create_sample_data
import json

def run_clustering_example():
    """Run a complete clustering example with sample data.

    Generates sample descriptions, preprocesses and vectorizes them, clusters
    the embeddings with three methods (k-means, hierarchical, LDA), prints a
    comparison summary, and writes three artifacts to the current directory:
    ``example_descriptions.csv``, ``example_clustering_results.json``, and
    ``example_clustering_comparison.csv``.
    """

    print("=== Description Clustering Example ===\n")

    # Step 1: Create sample data
    print("1. Creating sample data...")
    df = create_sample_data("example_descriptions.csv")
    print(f"   Created {len(df)} sample descriptions\n")

    # Step 2: Initialize the clusterer (fixed seed for reproducible runs)
    print("2. Initializing clusterer...")
    clusterer = DescriptionClusterer(random_state=42)

    # Step 3: Preprocess the text
    print("3. Preprocessing text...")
    descriptions = df['description'].tolist()
    processed_descriptions = clusterer.preprocess_text(descriptions)
    print(f"   Preprocessed {len(processed_descriptions)} descriptions\n")

    # Step 4: Vectorize the text
    print("4. Vectorizing text...")
    embeddings = clusterer.vectorize_text(processed_descriptions, max_features=500)
    print(f"   Created embeddings with shape: {embeddings.shape}\n")

    # Step 5: Perform clustering with each method.
    # Dispatch table avoids the if/elif chain and a potentially-unbound
    # cluster_labels variable if the method list ever grows.
    cluster_fns = {
        'kmeans': lambda: clusterer.kmeans_clustering(embeddings, n_clusters=10),
        'hierarchical': lambda: clusterer.hierarchical_clustering(embeddings, n_clusters=10),
        'lda': lambda: clusterer.topic_modeling_lda(embeddings, n_topics=10),
    }
    methods = ['kmeans', 'hierarchical', 'lda']
    results = {}

    for method in methods:
        print(f"5. Performing {method.upper()} clustering...")

        cluster_labels = cluster_fns[method]()

        # Evaluate clustering (presumably scores the labels stored on the
        # clusterer by the call above — TODO confirm against DescriptionClusterer)
        evaluation_scores = clusterer.evaluate_clustering(embeddings)

        # Get cluster keywords
        cluster_keywords = clusterer.get_cluster_keywords(n_keywords=5)

        # Store results. Cast the distribution's keys/values to plain int:
        # value_counts() on numpy-backed labels yields numpy integers, which
        # json.dump rejects as dict keys (TypeError: keys must be str, int, ...).
        distribution = {
            int(cluster_id): int(count)
            for cluster_id, count in pd.Series(cluster_labels).value_counts().items()
        }
        results[method] = {
            'cluster_labels': cluster_labels.tolist(),
            'evaluation_scores': evaluation_scores,
            'cluster_keywords': cluster_keywords,
            'cluster_distribution': distribution
        }

        print(f"   Silhouette Score: {evaluation_scores['silhouette_score']:.4f}")
        print(f"   Calinski-Harabasz Score: {evaluation_scores['calinski_harabasz_score']:.2f}")
        print()

    # Step 6: Display results
    print("6. Clustering Results Summary:")
    print("=" * 50)

    for method, result in results.items():
        print(f"\n{method.upper()} Clustering:")
        print(f"  Silhouette Score: {result['evaluation_scores']['silhouette_score']:.4f}")
        print(f"  Calinski-Harabasz Score: {result['evaluation_scores']['calinski_harabasz_score']:.2f}")

        print(f"  Cluster Distribution:")
        for cluster_id, count in sorted(result['cluster_distribution'].items()):
            print(f"    Cluster {cluster_id}: {count} descriptions")

        print(f"  Top Keywords by Cluster:")
        for cluster_id, keywords in result['cluster_keywords'].items():
            print(f"    Cluster {cluster_id}: {', '.join(keywords)}")

    # Step 7: Save results
    print("\n7. Saving results...")
    with open('example_clustering_results.json', 'w') as f:
        json.dump(results, f, indent=2)
    print("   Results saved to: example_clustering_results.json")

    # Step 8: Create a comparison dataframe with one label column per method
    print("\n8. Creating comparison dataframe...")
    comparison_df = df.copy()
    for method in methods:
        comparison_df[f'cluster_{method}'] = results[method]['cluster_labels']

    comparison_df.to_csv('example_clustering_comparison.csv', index=False)
    print("   Comparison saved to: example_clustering_comparison.csv")

    print("\n=== Example completed successfully! ===")
    print("\nFiles created:")
    print("  - example_descriptions.csv (sample data)")
    print("  - example_clustering_results.json (detailed results)")
    print("  - example_clustering_comparison.csv (comparison table)")

def analyze_specific_cluster(method='kmeans', cluster_id=0):
    """Print a detailed view of one cluster from a previous example run.

    Reads ``example_clustering_comparison.csv`` and
    ``example_clustering_results.json`` (both produced by
    ``run_clustering_example``), lists every description assigned to
    ``cluster_id`` under the given ``method``, and shows its top keywords.
    """

    print(f"\n=== Detailed Analysis of {method.upper()} Cluster {cluster_id} ===")

    # Pull the descriptions that landed in the requested cluster.
    comparison = pd.read_csv('example_clustering_comparison.csv')
    member_rows = comparison.loc[comparison[f'cluster_{method}'] == cluster_id]
    members = member_rows['description'].tolist()

    print(f"\nCluster {cluster_id} contains {len(members)} descriptions:")
    print("-" * 40)

    for rank, text in enumerate(members, 1):
        print(f"{rank:2d}. {text}")

    # The detailed results file keys clusters by string (JSON object keys),
    # hence the str() lookup below.
    with open('example_clustering_results.json', 'r') as fh:
        saved_results = json.load(fh)

    keywords = saved_results[method]['cluster_keywords'].get(str(cluster_id), [])
    print(f"\nTop keywords for this cluster: {', '.join(keywords)}")

if __name__ == "__main__":
    # Run the full pipeline: sample data -> clustering -> saved artifacts.
    run_clustering_example()
    
    # Drill into one cluster from the files written above
    # (adjust method/cluster_id to inspect a different cluster).
    analyze_specific_cluster(method='kmeans', cluster_id=0)