# chat_bot/description_clustering.py
# Uploaded via huggingface_hub by tomthekkan (commit 6d5953d, verified).
#!/usr/bin/env python3
"""
Description Clustering Script
This script provides multiple clustering techniques for analyzing descriptions from CSV files.
It supports K-means clustering, hierarchical clustering, and topic modeling using LDA.
Usage:
python description_clustering.py --input data.csv --column descriptions --method kmeans --clusters 5
"""
import pandas as pd
import numpy as np
import argparse
import json
from pathlib import Path
from typing import List, Dict, Any, Optional
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')
# Ensure required NLTK corpora are available; download each only when
# a local lookup fails (no-op on machines that already have the data).
for _resource, _package in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_resource)
    except LookupError:
        nltk.download(_package)
class DescriptionClusterer:
    """A comprehensive class for clustering text descriptions.

    Typical workflow:
        1. ``preprocess_text``   -> cleaned, lemmatized strings
        2. ``vectorize_text``    -> TF-IDF feature matrix
        3. one of ``kmeans_clustering`` / ``hierarchical_clustering`` /
           ``topic_modeling_lda`` / ``topic_modeling_nmf``
        4. ``evaluate_clustering`` / ``get_cluster_keywords`` /
           ``visualize_clusters``
    """

    def __init__(self, random_state: int = 42):
        self.random_state = random_state
        self.vectorizer = None      # fitted TfidfVectorizer (set by vectorize_text)
        self.model = None           # last fitted clustering / topic model
        self.cluster_labels = None  # per-document cluster/topic assignments
        self.feature_names = None   # vocabulary of the fitted vectorizer
        self.embeddings = None      # sparse TF-IDF matrix (set by vectorize_text)

    def preprocess_text(self, texts: List[str]) -> List[str]:
        """Tokenize, remove stopwords/short tokens, and lemmatize each text.

        NaN and empty entries are preserved as empty strings so the output
        list always has the same length as the input.
        """
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))
        processed_texts = []
        for text in texts:
            if pd.isna(text) or text == '':
                processed_texts.append('')
                continue
            tokens = word_tokenize(text.lower())
            # Keep alphanumeric, non-stopword tokens longer than 2 chars.
            tokens = [lemmatizer.lemmatize(token) for token in tokens
                      if token.isalnum() and token not in stop_words and len(token) > 2]
            processed_texts.append(' '.join(tokens))
        return processed_texts

    def vectorize_text(self, texts: List[str], max_features: int = 1000,
                       min_df: int = 2, max_df: float = 0.95) -> np.ndarray:
        """Convert text to TF-IDF vectors.

        Stores the sparse matrix on ``self.embeddings`` (used by the topic
        models and keyword extraction) and returns a dense copy for the
        distance-based clusterers.
        """
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            min_df=min_df,
            max_df=max_df,
            ngram_range=(1, 2),
            stop_words='english'
        )
        self.embeddings = self.vectorizer.fit_transform(texts)
        self.feature_names = self.vectorizer.get_feature_names_out()
        return self.embeddings.toarray()

    def kmeans_clustering(self, embeddings: np.ndarray, n_clusters: int = 5) -> np.ndarray:
        """Perform K-means clustering and return per-sample labels."""
        self.model = KMeans(
            n_clusters=n_clusters,
            random_state=self.random_state,
            n_init=10
        )
        self.cluster_labels = self.model.fit_predict(embeddings)
        return self.cluster_labels

    def hierarchical_clustering(self, embeddings: np.ndarray, n_clusters: int = 5) -> np.ndarray:
        """Perform agglomerative (Ward-linkage) clustering on dense vectors."""
        self.model = AgglomerativeClustering(
            n_clusters=n_clusters,
            linkage='ward'
        )
        self.cluster_labels = self.model.fit_predict(embeddings)
        return self.cluster_labels

    def topic_modeling_lda(self, embeddings: np.ndarray, n_topics: int = 5) -> np.ndarray:
        """Topic modeling via LDA; each document is labeled with its top topic.

        LDA requires a non-negative input. The stored sparse TF-IDF matrix
        is preferred for efficiency; the ``embeddings`` argument is used as
        a fallback (the original implementation silently ignored it).
        """
        self.model = LatentDirichletAllocation(
            n_components=n_topics,
            random_state=self.random_state,
            max_iter=100
        )
        matrix = self.embeddings if self.embeddings is not None else embeddings
        topic_distributions = self.model.fit_transform(matrix)
        self.cluster_labels = np.argmax(topic_distributions, axis=1)
        return self.cluster_labels

    def topic_modeling_nmf(self, embeddings: np.ndarray, n_topics: int = 5) -> np.ndarray:
        """Topic modeling via NMF; each document is labeled with its top topic.

        Like LDA, NMF needs non-negative input; prefer the stored sparse
        TF-IDF matrix, falling back to the supplied ``embeddings``.
        """
        self.model = NMF(
            n_components=n_topics,
            random_state=self.random_state,
            max_iter=200
        )
        matrix = self.embeddings if self.embeddings is not None else embeddings
        topic_distributions = self.model.fit_transform(matrix)
        self.cluster_labels = np.argmax(topic_distributions, axis=1)
        return self.cluster_labels

    def evaluate_clustering(self, embeddings: np.ndarray) -> Dict[str, float]:
        """Evaluate clustering quality using silhouette and Calinski-Harabasz.

        Returns an empty dict when no clustering has been run, or when the
        label count makes the metrics undefined (both require at least 2
        and at most n_samples - 1 distinct labels; the original code let
        sklearn raise ValueError in that case).
        """
        if self.cluster_labels is None:
            return {}
        n_labels = len(np.unique(self.cluster_labels))
        if n_labels < 2 or n_labels >= len(self.cluster_labels):
            return {}
        # Silhouette score (higher is better, range: -1 to 1)
        silhouette_avg = silhouette_score(embeddings, self.cluster_labels)
        # Calinski-Harabasz score (higher is better)
        calinski_harabasz = calinski_harabasz_score(embeddings, self.cluster_labels)
        return {
            'silhouette_score': silhouette_avg,
            'calinski_harabasz_score': calinski_harabasz
        }

    def get_cluster_keywords(self, n_keywords: int = 10) -> Dict[int, List[str]]:
        """Return the top mean-TF-IDF terms for each cluster.

        Requires both ``vectorize_text`` and a clustering method to have
        been run; returns an empty dict otherwise.
        """
        if self.cluster_labels is None or self.feature_names is None:
            return {}
        cluster_keywords = {}
        for cluster_id in np.unique(self.cluster_labels):
            cluster_mask = self.cluster_labels == cluster_id
            cluster_embeddings = self.embeddings[cluster_mask]
            if cluster_embeddings.shape[0] == 0:
                continue
            # Mean TF-IDF weight per term across the cluster's documents.
            cluster_means = np.mean(cluster_embeddings.toarray(), axis=0)
            # Highest-weighted terms first.
            top_indices = np.argsort(cluster_means)[-n_keywords:][::-1]
            cluster_keywords[cluster_id] = [self.feature_names[i] for i in top_indices]
        return cluster_keywords

    def visualize_clusters(self, embeddings: np.ndarray, output_path: str = "cluster_visualization.png"):
        """Create and save a 2-D t-SNE scatter plot colored by cluster."""
        if self.cluster_labels is None:
            print("No clustering results to visualize.")
            return
        n_samples = len(embeddings)
        # t-SNE requires perplexity < n_samples and at least a few points;
        # the original min(30, n-1) produced an invalid perplexity for n <= 1.
        if n_samples < 3:
            print("Not enough samples to visualize.")
            return
        tsne = TSNE(n_components=2, random_state=self.random_state,
                    perplexity=min(30, n_samples - 1))
        embeddings_2d = tsne.fit_transform(embeddings)
        plt.figure(figsize=(12, 8))
        scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
                              c=self.cluster_labels, cmap='viridis', alpha=0.7)
        plt.colorbar(scatter)
        plt.title('t-SNE Visualization of Clusters')
        plt.xlabel('t-SNE Component 1')
        plt.ylabel('t-SNE Component 2')
        plt.tight_layout()
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.show()
        print(f"Visualization saved to {output_path}")
def create_sample_data(output_path: str = "sample_descriptions.csv"):
    """Write a demo CSV of 100 short business descriptions and return it.

    The descriptions span ten themes (technology, food, travel, health,
    education, entertainment, business, retail, transportation, home
    services), ten entries each, so clustering demos have clear structure.
    """
    descriptions = [
        # Technology/Software descriptions
        "Advanced machine learning algorithm for predictive analytics",
        "Cloud-based software solution for enterprise management",
        "Mobile app development framework with cross-platform support",
        "Data visualization tool for business intelligence",
        "Cybersecurity software for threat detection",
        "API integration platform for third-party services",
        "Database management system with real-time synchronization",
        "Web development framework with modern UI components",
        "Artificial intelligence chatbot for customer service",
        "Blockchain technology for secure transactions",
        # Food/Restaurant descriptions
        "Authentic Italian restaurant with traditional recipes",
        "Organic farm-to-table dining experience",
        "Gourmet burger joint with craft beer selection",
        "Sushi bar with fresh daily ingredients",
        "Vegan restaurant with plant-based alternatives",
        "Pizza place with wood-fired oven cooking",
        "Mexican restaurant with homemade tortillas",
        "Coffee shop with artisanal brewing methods",
        "Bakery with fresh pastries and bread",
        "Seafood restaurant with ocean views",
        # Travel/Tourism descriptions
        "Luxury hotel with spa and wellness facilities",
        "Adventure tour company for outdoor activities",
        "Cultural heritage site with guided tours",
        "Beach resort with water sports equipment",
        "Mountain hiking trail with scenic viewpoints",
        "City walking tour with historical landmarks",
        "Eco-tourism lodge in rainforest setting",
        "Ski resort with modern lift systems",
        "Wine tasting tour in vineyard region",
        "Wildlife safari with expert guides",
        # Health/Wellness descriptions
        "Fitness center with personal training programs",
        "Yoga studio with meditation classes",
        "Medical clinic with specialized treatments",
        "Nutrition counseling for healthy eating",
        "Physical therapy center for rehabilitation",
        "Mental health counseling services",
        "Alternative medicine practice with holistic approach",
        "Dental clinic with modern equipment",
        "Pharmacy with prescription services",
        "Wellness spa with massage therapy",
        # Education/Training descriptions
        "Online learning platform with interactive courses",
        "Language school with native speakers",
        "Professional certification program",
        "Technical training institute for skills development",
        "University course with research opportunities",
        "Workshop series for creative skills",
        "Corporate training program for leadership",
        "Vocational school with hands-on experience",
        "Tutoring service for academic support",
        "Distance learning program with flexible schedules",
        # Entertainment/Recreation descriptions
        "Movie theater with premium seating options",
        "Concert venue with state-of-the-art sound",
        "Gaming arcade with virtual reality experiences",
        "Art gallery with contemporary exhibitions",
        "Sports complex with multiple facilities",
        "Bowling alley with family entertainment",
        "Karaoke bar with private rooms",
        "Comedy club with stand-up performances",
        "Dance studio with various styles",
        "Music school with instrument lessons",
        # Business/Professional descriptions
        "Consulting firm with strategic planning services",
        "Marketing agency with digital expertise",
        "Legal practice with specialized areas",
        "Accounting firm with tax preparation",
        "Real estate agency with property management",
        "Insurance company with comprehensive coverage",
        "Financial planning service for investments",
        "Public relations firm with media relations",
        "Human resources consulting for recruitment",
        "IT consulting with system integration",
        # Retail/Shopping descriptions
        "Fashion boutique with designer clothing",
        "Electronics store with latest gadgets",
        "Bookstore with rare and used books",
        "Home improvement store with tools",
        "Jewelry store with custom designs",
        "Toy store with educational games",
        "Pet store with grooming services",
        "Garden center with plants and supplies",
        "Sporting goods store with equipment",
        "Antique shop with vintage items",
        # Transportation/Logistics descriptions
        "Delivery service with same-day options",
        "Moving company with packing services",
        "Taxi service with luxury vehicles",
        "Car rental agency with flexible terms",
        "Freight forwarding with international shipping",
        "Warehouse storage with climate control",
        "Courier service with tracking systems",
        "Transportation company with fleet management",
        "Logistics provider with supply chain solutions",
        "Shipping company with express delivery",
        # Home/Services descriptions
        "Cleaning service with eco-friendly products",
        "Plumbing company with emergency repairs",
        "Electrical contractor with safety certification",
        "HVAC service with maintenance plans",
        "Landscaping company with design services",
        "Roofing contractor with warranty coverage",
        "Painting service with color consultation",
        "Carpentry workshop with custom furniture",
        "Security system installation with monitoring",
        "Home automation with smart technology",
    ]
    # ids 1..N alongside the descriptions (N == 100 here).
    frame = pd.DataFrame({
        'id': range(1, len(descriptions) + 1),
        'description': descriptions,
    })
    frame.to_csv(output_path, index=False)
    print(f"Sample data created with {len(frame)} descriptions: {output_path}")
    return frame
def main():
    """CLI entry point: load a CSV, cluster one text column, save results.

    Writes a JSON summary (``--output``), a ``clustered_<input>`` CSV with
    a ``cluster`` column, and optionally a t-SNE visualization.
    """
    parser = argparse.ArgumentParser(description='Cluster descriptions from CSV file')
    parser.add_argument('--input', type=str, help='Input CSV file path')
    parser.add_argument('--column', type=str, default='description', help='Column name containing descriptions')
    parser.add_argument('--method', type=str, choices=['kmeans', 'hierarchical', 'lda', 'nmf'],
                        default='kmeans', help='Clustering method')
    parser.add_argument('--clusters', type=int, default=5, help='Number of clusters')
    parser.add_argument('--max-features', type=int, default=1000, help='Maximum features for vectorization')
    parser.add_argument('--output', type=str, default='clustering_results.json', help='Output file for results')
    parser.add_argument('--visualize', action='store_true', help='Generate visualization')
    parser.add_argument('--create-sample', action='store_true', help='Create sample data for testing')
    args = parser.parse_args()

    # Create sample data if requested, then exit.
    if args.create_sample:
        create_sample_data()
        return

    # Fall back to generated sample data when no input file was given.
    if not args.input:
        print("No input file specified. Creating sample data...")
        create_sample_data()
        args.input = "sample_descriptions.csv"
    if not Path(args.input).exists():
        print(f"Input file {args.input} not found.")
        return

    print(f"Loading data from {args.input}...")
    df = pd.read_csv(args.input)
    if args.column not in df.columns:
        print(f"Column '{args.column}' not found in CSV. Available columns: {list(df.columns)}")
        return

    clusterer = DescriptionClusterer()

    print("Preprocessing text...")
    descriptions = df[args.column].fillna('').astype(str).tolist()
    processed_descriptions = clusterer.preprocess_text(descriptions)

    print("Vectorizing text...")
    embeddings = clusterer.vectorize_text(processed_descriptions, max_features=args.max_features)

    print(f"Performing {args.method} clustering with {args.clusters} clusters...")
    if args.method == 'kmeans':
        cluster_labels = clusterer.kmeans_clustering(embeddings, args.clusters)
    elif args.method == 'hierarchical':
        cluster_labels = clusterer.hierarchical_clustering(embeddings, args.clusters)
    elif args.method == 'lda':
        cluster_labels = clusterer.topic_modeling_lda(embeddings, args.clusters)
    elif args.method == 'nmf':
        cluster_labels = clusterer.topic_modeling_nmf(embeddings, args.clusters)
    df['cluster'] = cluster_labels

    print("Evaluating clustering quality...")
    evaluation_scores = clusterer.evaluate_clustering(embeddings)

    print("Extracting cluster keywords...")
    cluster_keywords = clusterer.get_cluster_keywords()

    # Build the JSON summary. Labels and counts come back as numpy scalars
    # (np.int64 is not a Python int subclass, and json.dump rejects it both
    # as a dict key and as a value), so cast everything to native types.
    results = {
        'method': args.method,
        'n_clusters': args.clusters,
        'n_samples': len(df),
        'evaluation_scores': {k: float(v) for k, v in evaluation_scores.items()},
        'cluster_keywords': {int(k): list(v) for k, v in cluster_keywords.items()},
        'cluster_distribution': {int(k): int(v)
                                 for k, v in df['cluster'].value_counts().items()},
        'sample_descriptions': {}
    }
    # Up to three example rows per cluster.
    for cluster_id in sorted(df['cluster'].unique()):
        cluster_samples = df[df['cluster'] == cluster_id][args.column].head(3).tolist()
        results['sample_descriptions'][f'cluster_{cluster_id}'] = cluster_samples

    with open(args.output, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"\nClustering completed!")
    print(f"Results saved to: {args.output}")
    print(f"\nEvaluation Scores:")
    for metric, score in evaluation_scores.items():
        print(f"  {metric}: {score:.4f}")
    print(f"\nCluster Distribution:")
    for cluster_id, count in sorted(results['cluster_distribution'].items()):
        print(f"  Cluster {cluster_id}: {count} descriptions")
    print(f"\nTop Keywords by Cluster:")
    for cluster_id, keywords in cluster_keywords.items():
        print(f"  Cluster {cluster_id}: {', '.join(keywords[:5])}")

    # Save the input rows with their assigned cluster labels.
    output_csv = f"clustered_{Path(args.input).name}"
    df.to_csv(output_csv, index=False)
    print(f"\nClustered data saved to: {output_csv}")

    if args.visualize:
        clusterer.visualize_clusters(embeddings)


if __name__ == "__main__":
    main()