# chat_bot/description_clustering.py
# Uploaded via huggingface_hub by tomthekkan (commit 6d5953d, verified).
#!/usr/bin/env python3
"""
Description Clustering Script
This script provides multiple clustering techniques for analyzing descriptions from CSV files.
It supports K-means clustering, hierarchical clustering, and topic modeling using LDA.
Usage:
python description_clustering.py --input data.csv --column descriptions --method kmeans --clusters 5
"""
import pandas as pd
import numpy as np
import argparse
import json
from pathlib import Path
from typing import List, Dict, Any, Optional
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')
# Ensure required NLTK corpora are available; download each only when
# a local lookup fails (no-op on machines that already have the data).
for _resource, _package in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_resource)
    except LookupError:
        nltk.download(_package)
class DescriptionClusterer:
    """A comprehensive class for clustering text descriptions.

    Typical workflow:
        1. ``preprocess_text``   -> cleaned, lemmatized strings
        2. ``vectorize_text``    -> TF-IDF feature matrix
        3. one of ``kmeans_clustering`` / ``hierarchical_clustering`` /
           ``topic_modeling_lda`` / ``topic_modeling_nmf``
        4. ``evaluate_clustering`` / ``get_cluster_keywords`` /
           ``visualize_clusters``
    """

    def __init__(self, random_state: int = 42):
        self.random_state = random_state
        self.vectorizer = None      # fitted TfidfVectorizer (set by vectorize_text)
        self.model = None           # last fitted clustering / topic model
        self.cluster_labels = None  # per-document cluster/topic assignments
        self.feature_names = None   # vocabulary of the fitted vectorizer
        self.embeddings = None      # sparse TF-IDF matrix (set by vectorize_text)

    def preprocess_text(self, texts: List[str]) -> List[str]:
        """Tokenize, remove stopwords/short tokens, and lemmatize each text.

        NaN and empty entries are preserved as empty strings so the output
        list always has the same length as the input.
        """
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))
        processed_texts = []
        for text in texts:
            if pd.isna(text) or text == '':
                processed_texts.append('')
                continue
            tokens = word_tokenize(text.lower())
            # Keep alphanumeric, non-stopword tokens longer than 2 chars.
            tokens = [lemmatizer.lemmatize(token) for token in tokens
                      if token.isalnum() and token not in stop_words and len(token) > 2]
            processed_texts.append(' '.join(tokens))
        return processed_texts

    def vectorize_text(self, texts: List[str], max_features: int = 1000,
                       min_df: int = 2, max_df: float = 0.95) -> np.ndarray:
        """Convert text to TF-IDF vectors.

        Stores the sparse matrix on ``self.embeddings`` (used by the topic
        models and keyword extraction) and returns a dense copy for the
        distance-based clusterers.
        """
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            min_df=min_df,
            max_df=max_df,
            ngram_range=(1, 2),
            stop_words='english'
        )
        self.embeddings = self.vectorizer.fit_transform(texts)
        self.feature_names = self.vectorizer.get_feature_names_out()
        return self.embeddings.toarray()

    def kmeans_clustering(self, embeddings: np.ndarray, n_clusters: int = 5) -> np.ndarray:
        """Perform K-means clustering and return per-sample labels."""
        self.model = KMeans(
            n_clusters=n_clusters,
            random_state=self.random_state,
            n_init=10
        )
        self.cluster_labels = self.model.fit_predict(embeddings)
        return self.cluster_labels

    def hierarchical_clustering(self, embeddings: np.ndarray, n_clusters: int = 5) -> np.ndarray:
        """Perform agglomerative (Ward-linkage) clustering on dense vectors."""
        self.model = AgglomerativeClustering(
            n_clusters=n_clusters,
            linkage='ward'
        )
        self.cluster_labels = self.model.fit_predict(embeddings)
        return self.cluster_labels

    def topic_modeling_lda(self, embeddings: np.ndarray, n_topics: int = 5) -> np.ndarray:
        """Topic modeling via LDA; each document is labeled with its top topic.

        LDA requires a non-negative input. The stored sparse TF-IDF matrix
        is preferred for efficiency; the ``embeddings`` argument is used as
        a fallback (the original implementation silently ignored it).
        """
        self.model = LatentDirichletAllocation(
            n_components=n_topics,
            random_state=self.random_state,
            max_iter=100
        )
        matrix = self.embeddings if self.embeddings is not None else embeddings
        topic_distributions = self.model.fit_transform(matrix)
        self.cluster_labels = np.argmax(topic_distributions, axis=1)
        return self.cluster_labels

    def topic_modeling_nmf(self, embeddings: np.ndarray, n_topics: int = 5) -> np.ndarray:
        """Topic modeling via NMF; each document is labeled with its top topic.

        Like LDA, NMF needs non-negative input; prefer the stored sparse
        TF-IDF matrix, falling back to the supplied ``embeddings``.
        """
        self.model = NMF(
            n_components=n_topics,
            random_state=self.random_state,
            max_iter=200
        )
        matrix = self.embeddings if self.embeddings is not None else embeddings
        topic_distributions = self.model.fit_transform(matrix)
        self.cluster_labels = np.argmax(topic_distributions, axis=1)
        return self.cluster_labels

    def evaluate_clustering(self, embeddings: np.ndarray) -> Dict[str, float]:
        """Evaluate clustering quality using silhouette and Calinski-Harabasz.

        Returns an empty dict when no clustering has been run, or when the
        label count makes the metrics undefined (both require at least 2
        and at most n_samples - 1 distinct labels; the original code let
        sklearn raise ValueError in that case).
        """
        if self.cluster_labels is None:
            return {}
        n_labels = len(np.unique(self.cluster_labels))
        if n_labels < 2 or n_labels >= len(self.cluster_labels):
            return {}
        # Silhouette score (higher is better, range: -1 to 1)
        silhouette_avg = silhouette_score(embeddings, self.cluster_labels)
        # Calinski-Harabasz score (higher is better)
        calinski_harabasz = calinski_harabasz_score(embeddings, self.cluster_labels)
        return {
            'silhouette_score': silhouette_avg,
            'calinski_harabasz_score': calinski_harabasz
        }

    def get_cluster_keywords(self, n_keywords: int = 10) -> Dict[int, List[str]]:
        """Return the top mean-TF-IDF terms for each cluster.

        Requires both ``vectorize_text`` and a clustering method to have
        been run; returns an empty dict otherwise.
        """
        if self.cluster_labels is None or self.feature_names is None:
            return {}
        cluster_keywords = {}
        for cluster_id in np.unique(self.cluster_labels):
            cluster_mask = self.cluster_labels == cluster_id
            cluster_embeddings = self.embeddings[cluster_mask]
            if cluster_embeddings.shape[0] == 0:
                continue
            # Mean TF-IDF weight per term across the cluster's documents.
            cluster_means = np.mean(cluster_embeddings.toarray(), axis=0)
            # Highest-weighted terms first.
            top_indices = np.argsort(cluster_means)[-n_keywords:][::-1]
            cluster_keywords[cluster_id] = [self.feature_names[i] for i in top_indices]
        return cluster_keywords

    def visualize_clusters(self, embeddings: np.ndarray, output_path: str = "cluster_visualization.png"):
        """Create and save a 2-D t-SNE scatter plot colored by cluster."""
        if self.cluster_labels is None:
            print("No clustering results to visualize.")
            return
        n_samples = len(embeddings)
        # t-SNE requires perplexity < n_samples and at least a few points;
        # the original min(30, n-1) produced an invalid perplexity for n <= 1.
        if n_samples < 3:
            print("Not enough samples to visualize.")
            return
        tsne = TSNE(n_components=2, random_state=self.random_state,
                    perplexity=min(30, n_samples - 1))
        embeddings_2d = tsne.fit_transform(embeddings)
        plt.figure(figsize=(12, 8))
        scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
                              c=self.cluster_labels, cmap='viridis', alpha=0.7)
        plt.colorbar(scatter)
        plt.title('t-SNE Visualization of Clusters')
        plt.xlabel('t-SNE Component 1')
        plt.ylabel('t-SNE Component 2')
        plt.tight_layout()
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.show()
        print(f"Visualization saved to {output_path}")
def create_sample_data(output_path: str = "sample_descriptions.csv"):
    """Write a demo CSV of 100 short business descriptions and return it.

    The descriptions span ten themes (technology, food, travel, health,
    education, entertainment, business, retail, transportation, home
    services), ten entries each, so clustering demos have clear structure.
    """
    descriptions = [
        # Technology/Software descriptions
        "Advanced machine learning algorithm for predictive analytics",
        "Cloud-based software solution for enterprise management",
        "Mobile app development framework with cross-platform support",
        "Data visualization tool for business intelligence",
        "Cybersecurity software for threat detection",
        "API integration platform for third-party services",
        "Database management system with real-time synchronization",
        "Web development framework with modern UI components",
        "Artificial intelligence chatbot for customer service",
        "Blockchain technology for secure transactions",
        # Food/Restaurant descriptions
        "Authentic Italian restaurant with traditional recipes",
        "Organic farm-to-table dining experience",
        "Gourmet burger joint with craft beer selection",
        "Sushi bar with fresh daily ingredients",
        "Vegan restaurant with plant-based alternatives",
        "Pizza place with wood-fired oven cooking",
        "Mexican restaurant with homemade tortillas",
        "Coffee shop with artisanal brewing methods",
        "Bakery with fresh pastries and bread",
        "Seafood restaurant with ocean views",
        # Travel/Tourism descriptions
        "Luxury hotel with spa and wellness facilities",
        "Adventure tour company for outdoor activities",
        "Cultural heritage site with guided tours",
        "Beach resort with water sports equipment",
        "Mountain hiking trail with scenic viewpoints",
        "City walking tour with historical landmarks",
        "Eco-tourism lodge in rainforest setting",
        "Ski resort with modern lift systems",
        "Wine tasting tour in vineyard region",
        "Wildlife safari with expert guides",
        # Health/Wellness descriptions
        "Fitness center with personal training programs",
        "Yoga studio with meditation classes",
        "Medical clinic with specialized treatments",
        "Nutrition counseling for healthy eating",
        "Physical therapy center for rehabilitation",
        "Mental health counseling services",
        "Alternative medicine practice with holistic approach",
        "Dental clinic with modern equipment",
        "Pharmacy with prescription services",
        "Wellness spa with massage therapy",
        # Education/Training descriptions
        "Online learning platform with interactive courses",
        "Language school with native speakers",
        "Professional certification program",
        "Technical training institute for skills development",
        "University course with research opportunities",
        "Workshop series for creative skills",
        "Corporate training program for leadership",
        "Vocational school with hands-on experience",
        "Tutoring service for academic support",
        "Distance learning program with flexible schedules",
        # Entertainment/Recreation descriptions
        "Movie theater with premium seating options",
        "Concert venue with state-of-the-art sound",
        "Gaming arcade with virtual reality experiences",
        "Art gallery with contemporary exhibitions",
        "Sports complex with multiple facilities",
        "Bowling alley with family entertainment",
        "Karaoke bar with private rooms",
        "Comedy club with stand-up performances",
        "Dance studio with various styles",
        "Music school with instrument lessons",
        # Business/Professional descriptions
        "Consulting firm with strategic planning services",
        "Marketing agency with digital expertise",
        "Legal practice with specialized areas",
        "Accounting firm with tax preparation",
        "Real estate agency with property management",
        "Insurance company with comprehensive coverage",
        "Financial planning service for investments",
        "Public relations firm with media relations",
        "Human resources consulting for recruitment",
        "IT consulting with system integration",
        # Retail/Shopping descriptions
        "Fashion boutique with designer clothing",
        "Electronics store with latest gadgets",
        "Bookstore with rare and used books",
        "Home improvement store with tools",
        "Jewelry store with custom designs",
        "Toy store with educational games",
        "Pet store with grooming services",
        "Garden center with plants and supplies",
        "Sporting goods store with equipment",
        "Antique shop with vintage items",
        # Transportation/Logistics descriptions
        "Delivery service with same-day options",
        "Moving company with packing services",
        "Taxi service with luxury vehicles",
        "Car rental agency with flexible terms",
        "Freight forwarding with international shipping",
        "Warehouse storage with climate control",
        "Courier service with tracking systems",
        "Transportation company with fleet management",
        "Logistics provider with supply chain solutions",
        "Shipping company with express delivery",
        # Home/Services descriptions
        "Cleaning service with eco-friendly products",
        "Plumbing company with emergency repairs",
        "Electrical contractor with safety certification",
        "HVAC service with maintenance plans",
        "Landscaping company with design services",
        "Roofing contractor with warranty coverage",
        "Painting service with color consultation",
        "Carpentry workshop with custom furniture",
        "Security system installation with monitoring",
        "Home automation with smart technology",
    ]
    # ids 1..N alongside the descriptions (N == 100 here).
    frame = pd.DataFrame({
        'id': range(1, len(descriptions) + 1),
        'description': descriptions,
    })
    frame.to_csv(output_path, index=False)
    print(f"Sample data created with {len(frame)} descriptions: {output_path}")
    return frame
def main():
    """CLI entry point: load a CSV, cluster one text column, save results.

    Writes a JSON summary (``--output``), a ``clustered_<input>`` CSV with
    a ``cluster`` column, and optionally a t-SNE visualization.
    """
    parser = argparse.ArgumentParser(description='Cluster descriptions from CSV file')
    parser.add_argument('--input', type=str, help='Input CSV file path')
    parser.add_argument('--column', type=str, default='description', help='Column name containing descriptions')
    parser.add_argument('--method', type=str, choices=['kmeans', 'hierarchical', 'lda', 'nmf'],
                        default='kmeans', help='Clustering method')
    parser.add_argument('--clusters', type=int, default=5, help='Number of clusters')
    parser.add_argument('--max-features', type=int, default=1000, help='Maximum features for vectorization')
    parser.add_argument('--output', type=str, default='clustering_results.json', help='Output file for results')
    parser.add_argument('--visualize', action='store_true', help='Generate visualization')
    parser.add_argument('--create-sample', action='store_true', help='Create sample data for testing')
    args = parser.parse_args()

    # Create sample data if requested, then exit.
    if args.create_sample:
        create_sample_data()
        return

    # Fall back to generated sample data when no input file was given.
    if not args.input:
        print("No input file specified. Creating sample data...")
        create_sample_data()
        args.input = "sample_descriptions.csv"
    if not Path(args.input).exists():
        print(f"Input file {args.input} not found.")
        return

    print(f"Loading data from {args.input}...")
    df = pd.read_csv(args.input)
    if args.column not in df.columns:
        print(f"Column '{args.column}' not found in CSV. Available columns: {list(df.columns)}")
        return

    clusterer = DescriptionClusterer()

    print("Preprocessing text...")
    descriptions = df[args.column].fillna('').astype(str).tolist()
    processed_descriptions = clusterer.preprocess_text(descriptions)

    print("Vectorizing text...")
    embeddings = clusterer.vectorize_text(processed_descriptions, max_features=args.max_features)

    print(f"Performing {args.method} clustering with {args.clusters} clusters...")
    if args.method == 'kmeans':
        cluster_labels = clusterer.kmeans_clustering(embeddings, args.clusters)
    elif args.method == 'hierarchical':
        cluster_labels = clusterer.hierarchical_clustering(embeddings, args.clusters)
    elif args.method == 'lda':
        cluster_labels = clusterer.topic_modeling_lda(embeddings, args.clusters)
    elif args.method == 'nmf':
        cluster_labels = clusterer.topic_modeling_nmf(embeddings, args.clusters)
    df['cluster'] = cluster_labels

    print("Evaluating clustering quality...")
    evaluation_scores = clusterer.evaluate_clustering(embeddings)

    print("Extracting cluster keywords...")
    cluster_keywords = clusterer.get_cluster_keywords()

    # Build the JSON summary. Labels and counts come back as numpy scalars
    # (np.int64 is not a Python int subclass, and json.dump rejects it both
    # as a dict key and as a value), so cast everything to native types.
    results = {
        'method': args.method,
        'n_clusters': args.clusters,
        'n_samples': len(df),
        'evaluation_scores': {k: float(v) for k, v in evaluation_scores.items()},
        'cluster_keywords': {int(k): list(v) for k, v in cluster_keywords.items()},
        'cluster_distribution': {int(k): int(v)
                                 for k, v in df['cluster'].value_counts().items()},
        'sample_descriptions': {}
    }
    # Up to three example rows per cluster.
    for cluster_id in sorted(df['cluster'].unique()):
        cluster_samples = df[df['cluster'] == cluster_id][args.column].head(3).tolist()
        results['sample_descriptions'][f'cluster_{cluster_id}'] = cluster_samples

    with open(args.output, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"\nClustering completed!")
    print(f"Results saved to: {args.output}")
    print(f"\nEvaluation Scores:")
    for metric, score in evaluation_scores.items():
        print(f"  {metric}: {score:.4f}")
    print(f"\nCluster Distribution:")
    for cluster_id, count in sorted(results['cluster_distribution'].items()):
        print(f"  Cluster {cluster_id}: {count} descriptions")
    print(f"\nTop Keywords by Cluster:")
    for cluster_id, keywords in cluster_keywords.items():
        print(f"  Cluster {cluster_id}: {', '.join(keywords[:5])}")

    # Save the input rows with their assigned cluster labels.
    output_csv = f"clustered_{Path(args.input).name}"
    df.to_csv(output_csv, index=False)
    print(f"\nClustered data saved to: {output_csv}")

    if args.visualize:
        clusterer.visualize_clusters(embeddings)


if __name__ == "__main__":
    main()