Spaces:

tomthekkan
/

chat_bot

Sleeping

App Files Files Community

chat_bot / example_clustering.py

tomthekkan

Upload folder using huggingface_hub

6d5953d verified 6 months ago

raw

history blame contribute delete

5.49 kB

	#!/usr/bin/env python3
	"""
	Example script demonstrating description clustering functionality.

	This script shows how to use the DescriptionClusterer class to cluster text descriptions
	from a CSV file or sample data.
	"""

	import pandas as pd
	from description_clustering import DescriptionClusterer, create_sample_data
	import json

	def run_clustering_example():
	    """Walk through the full clustering demo on generated sample data.

	    The steps are: obtain sample data from ``create_sample_data``, preprocess
	    and vectorize its ``description`` column with a ``DescriptionClusterer``,
	    run the k-means, hierarchical and LDA methods one after another, print a
	    per-method summary, and write two files to the current working directory:

	    * ``example_clustering_results.json`` - labels, evaluation scores,
	      keywords and cluster sizes for every method.
	    * ``example_clustering_comparison.csv`` - the sample data plus one
	      ``cluster_<method>`` column per method.

	    Progress is reported on stdout; nothing is returned.
	    """
	    print("=== Description Clustering Example ===\n")

	    # Step 1: sample data (the helper is handed the CSV name to use).
	    print("1. Creating sample data...")
	    samples = create_sample_data("example_descriptions.csv")
	    print(f" Created {len(samples)} sample descriptions\n")

	    # Step 2: one clusterer, seeded so repeated runs are comparable.
	    print("2. Initializing clusterer...")
	    clusterer = DescriptionClusterer(random_state=42)

	    # Step 3: text clean-up.
	    print("3. Preprocessing text...")
	    cleaned = clusterer.preprocess_text(samples['description'].tolist())
	    print(f" Preprocessed {len(cleaned)} descriptions\n")

	    # Step 4: turn the cleaned text into a feature matrix.
	    print("4. Vectorizing text...")
	    embeddings = clusterer.vectorize_text(cleaned, max_features=500)
	    print(f" Created embeddings with shape: {embeddings.shape}\n")

	    # Step 5: every method is a zero-argument callable; the dict's insertion
	    # order is the order in which the methods run and are reported.
	    runners = {
	        'kmeans': lambda: clusterer.kmeans_clustering(embeddings, n_clusters=10),
	        'hierarchical': lambda: clusterer.hierarchical_clustering(embeddings, n_clusters=10),
	        'lda': lambda: clusterer.topic_modeling_lda(embeddings, n_topics=10),
	    }
	    methods = list(runners)
	    results = {}

	    for method in methods:
	        print(f"5. Performing {method.upper()} clustering...")
	        labels = runners[method]()

	        # Both calls run right after the clustering call and receive no
	        # labels, so the clusterer presumably remembers its latest run.
	        scores = clusterer.evaluate_clustering(embeddings)
	        keywords_by_cluster = clusterer.get_cluster_keywords(n_keywords=5)

	        results[method] = {
	            'cluster_labels': labels.tolist(),
	            'evaluation_scores': scores,
	            'cluster_keywords': keywords_by_cluster,
	            'cluster_distribution': pd.Series(labels).value_counts().to_dict(),
	        }

	        print(f" Silhouette Score: {scores['silhouette_score']:.4f}")
	        print(f" Calinski-Harabasz Score: {scores['calinski_harabasz_score']:.2f}")
	        print()

	    # Step 6: side-by-side recap of all methods.
	    print("6. Clustering Results Summary:")
	    print("=" * 50)

	    for method, outcome in results.items():
	        metrics = outcome['evaluation_scores']
	        print(f"\n{method.upper()} Clustering:")
	        print(f" Silhouette Score: {metrics['silhouette_score']:.4f}")
	        print(f" Calinski-Harabasz Score: {metrics['calinski_harabasz_score']:.2f}")

	        print(" Cluster Distribution:")
	        for label, size in sorted(outcome['cluster_distribution'].items()):
	            print(f" Cluster {label}: {size} descriptions")

	        print(" Top Keywords by Cluster:")
	        for label, words in outcome['cluster_keywords'].items():
	            print(f" Cluster {label}: {', '.join(words)}")

	    # Step 7: persist the detailed results.
	    print("\n7. Saving results...")
	    with open('example_clustering_results.json', 'w') as out_file:
	        json.dump(results, out_file, indent=2)
	    print(" Results saved to: example_clustering_results.json")

	    # Step 8: original rows plus one label column per method.
	    print("\n8. Creating comparison dataframe...")
	    label_columns = {
	        f'cluster_{method}': results[method]['cluster_labels']
	        for method in methods
	    }
	    comparison_df = samples.assign(**label_columns)
	    comparison_df.to_csv('example_clustering_comparison.csv', index=False)
	    print(" Comparison saved to: example_clustering_comparison.csv")

	    print(
	        "\n=== Example completed successfully! ===",
	        "\nFiles created:",
	        " - example_descriptions.csv (sample data)",
	        " - example_clustering_results.json (detailed results)",
	        " - example_clustering_comparison.csv (comparison table)",
	        sep="\n",
	    )

	def analyze_specific_cluster(method='kmeans', cluster_id=0, *,
	                             comparison_path='example_clustering_comparison.csv',
	                             results_path='example_clustering_results.json'):
	    """Print every description and the top keywords of one cluster.

	    Args:
	        method: Clustering method name; selects the ``cluster_<method>``
	            column of the comparison CSV and the matching entry of the
	            results JSON.
	        cluster_id: Label of the cluster to inspect.
	        comparison_path: CSV with a ``description`` column and one
	            ``cluster_<method>`` column per method. Defaults to the file
	            name this script uses in the working directory.
	        results_path: JSON file mapping each method to a dict that holds a
	            ``cluster_keywords`` mapping. Defaults to the file name this
	            script uses in the working directory.

	    Raises:
	        FileNotFoundError: If either input file does not exist.
	        KeyError: If the CSV lacks the ``cluster_<method>`` or
	            ``description`` column, or the JSON has no entry for ``method``.
	    """
	    print(f"\n=== Detailed Analysis of {method.upper()} Cluster {cluster_id} ===")

	    # Select the descriptions assigned to the requested cluster in one
	    # .loc step (row mask + column) instead of chained indexing.
	    comparison = pd.read_csv(comparison_path)
	    in_cluster = comparison[f'cluster_{method}'] == cluster_id
	    cluster_descriptions = comparison.loc[in_cluster, 'description'].tolist()

	    print(f"\nCluster {cluster_id} contains {len(cluster_descriptions)} descriptions:")
	    print("-" * 40)

	    for i, desc in enumerate(cluster_descriptions, 1):
	        print(f"{i:2d}. {desc}")

	    with open(results_path, 'r', encoding='utf-8') as f:
	        results = json.load(f)

	    # JSON object keys are always strings, so the cluster id has to be
	    # converted before the lookup; an unknown id yields no keywords.
	    keywords = results[method]['cluster_keywords'].get(str(cluster_id), [])
	    print(f"\nTop keywords for this cluster: {', '.join(keywords)}")

	def _main():
	    """Run the complete example, then print a detailed view of one cluster."""
	    run_clustering_example()

	    # Change these arguments to inspect a different method or cluster.
	    analyze_specific_cluster(method='kmeans', cluster_id=0)


	if __name__ == "__main__":
	    _main()