# NOTE(review): removed page-scrape artifact ("Spaces: Sleeping") — it was
# residue from the hosting page, not part of the program, and is not valid Python.
| #!/usr/bin/env python3 | |
| """ | |
| Description Clustering Script | |
| This script provides multiple clustering techniques for analyzing descriptions from CSV files. | |
| It supports K-means clustering, hierarchical clustering, and topic modeling using LDA. | |
| Usage: | |
| python description_clustering.py --input data.csv --column descriptions --method kmeans --clusters 5 | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| import argparse | |
| import json | |
| from pathlib import Path | |
| from typing import List, Dict, Any, Optional | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.cluster import KMeans, AgglomerativeClustering | |
| from sklearn.decomposition import LatentDirichletAllocation, NMF | |
| from sklearn.manifold import TSNE | |
| from sklearn.metrics import silhouette_score, calinski_harabasz_score | |
| from sklearn.preprocessing import StandardScaler | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| from nltk.stem import WordNetLemmatizer | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
# Ensure required NLTK resources are present; download each only on first run.
for _resource_path, _package in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package)
class DescriptionClusterer:
    """A comprehensive class for clustering text descriptions.

    Pipeline: preprocess raw texts, build a TF-IDF representation, then fit
    one of K-means, Ward hierarchical clustering, LDA, or NMF. Evaluation,
    per-cluster keyword extraction, and a t-SNE visualization are provided.
    """

    def __init__(self, random_state: int = 42):
        """Initialize with a seed shared by all stochastic models for reproducibility."""
        self.random_state = random_state
        self.vectorizer = None      # fitted TfidfVectorizer (set by vectorize_text)
        self.model = None           # most recently fitted clustering/topic model
        self.cluster_labels = None  # per-sample integer labels from the last fit
        self.feature_names = None   # vocabulary array of the fitted vectorizer
        self.embeddings = None      # sparse TF-IDF matrix from vectorize_text

    def preprocess_text(self, texts: List[str]) -> List[str]:
        """Lowercase, tokenize, drop stopwords/short tokens, and lemmatize each text.

        Missing or empty entries are kept as '' so the output stays aligned
        with the input row-for-row.
        """
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))
        processed_texts = []
        for text in texts:
            if pd.isna(text) or text == '':
                processed_texts.append('')
                continue
            tokens = word_tokenize(text.lower())
            # Keep alphanumeric, non-stopword tokens longer than 2 characters.
            tokens = [lemmatizer.lemmatize(token) for token in tokens
                      if token.isalnum() and token not in stop_words and len(token) > 2]
            processed_texts.append(' '.join(tokens))
        return processed_texts

    def vectorize_text(self, texts: List[str], max_features: int = 1000,
                       min_df: int = 2, max_df: float = 0.95) -> np.ndarray:
        """Fit a TF-IDF vectorizer (uni- and bi-grams) and return a dense matrix.

        The sparse matrix is retained on self.embeddings for keyword
        extraction and topic modeling; the dense copy is returned for the
        clustering methods and t-SNE.
        """
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            min_df=min_df,
            max_df=max_df,
            ngram_range=(1, 2),
            stop_words='english'
        )
        self.embeddings = self.vectorizer.fit_transform(texts)
        self.feature_names = self.vectorizer.get_feature_names_out()
        return self.embeddings.toarray()

    def kmeans_clustering(self, embeddings: np.ndarray, n_clusters: int = 5) -> np.ndarray:
        """Perform K-means clustering and return the per-sample labels."""
        self.model = KMeans(
            n_clusters=n_clusters,
            random_state=self.random_state,
            n_init=10
        )
        self.cluster_labels = self.model.fit_predict(embeddings)
        return self.cluster_labels

    def hierarchical_clustering(self, embeddings: np.ndarray, n_clusters: int = 5) -> np.ndarray:
        """Perform agglomerative (Ward-linkage) clustering; returns labels."""
        self.model = AgglomerativeClustering(
            n_clusters=n_clusters,
            linkage='ward'
        )
        self.cluster_labels = self.model.fit_predict(embeddings)
        return self.cluster_labels

    def topic_modeling_lda(self, embeddings: np.ndarray, n_topics: int = 5) -> np.ndarray:
        """Topic modeling via LDA; each sample is labeled with its dominant topic.

        Prefers the stored sparse TF-IDF matrix, falling back to the passed
        `embeddings` if vectorize_text was not used (previously the argument
        was ignored and a missing self.embeddings caused a TypeError).
        NOTE(review): LDA is conventionally fit on term counts; TF-IDF works
        but is a known approximation here.
        """
        self.model = LatentDirichletAllocation(
            n_components=n_topics,
            random_state=self.random_state,
            max_iter=100
        )
        # LDA requires non-negative input; TF-IDF satisfies that.
        matrix = self.embeddings if self.embeddings is not None else embeddings
        topic_distributions = self.model.fit_transform(matrix)
        self.cluster_labels = np.argmax(topic_distributions, axis=1)
        return self.cluster_labels

    def topic_modeling_nmf(self, embeddings: np.ndarray, n_topics: int = 5) -> np.ndarray:
        """Topic modeling via NMF; each sample is labeled with its dominant topic.

        Same input-selection fix as topic_modeling_lda: use the stored sparse
        matrix when available, otherwise the provided `embeddings`.
        """
        self.model = NMF(
            n_components=n_topics,
            random_state=self.random_state,
            max_iter=200
        )
        matrix = self.embeddings if self.embeddings is not None else embeddings
        topic_distributions = self.model.fit_transform(matrix)
        self.cluster_labels = np.argmax(topic_distributions, axis=1)
        return self.cluster_labels

    def evaluate_clustering(self, embeddings: np.ndarray) -> Dict[str, float]:
        """Evaluate clustering quality; returns {} when scores are undefined.

        silhouette_score and calinski_harabasz_score both require at least 2
        and at most n_samples - 1 distinct labels; previously a degenerate
        labeling (e.g. a single cluster) raised ValueError.
        """
        if self.cluster_labels is None:
            return {}
        n_labels = len(np.unique(self.cluster_labels))
        if n_labels < 2 or n_labels >= len(self.cluster_labels):
            return {}
        # Silhouette score (higher is better, range: -1 to 1).
        silhouette_avg = silhouette_score(embeddings, self.cluster_labels)
        # Calinski-Harabasz score (higher is better).
        calinski_harabasz = calinski_harabasz_score(embeddings, self.cluster_labels)
        # Cast to built-in floats so the results are directly JSON-serializable.
        return {
            'silhouette_score': float(silhouette_avg),
            'calinski_harabasz_score': float(calinski_harabasz)
        }

    def get_cluster_keywords(self, n_keywords: int = 10) -> Dict[int, List[str]]:
        """Return the top TF-IDF keywords per cluster, keyed by native int.

        Keys are cast to built-in int (np.unique yields numpy integers, which
        json.dump rejects as dictionary keys).
        """
        if self.cluster_labels is None or self.feature_names is None:
            return {}
        cluster_keywords = {}
        unique_clusters = np.unique(self.cluster_labels)
        for cluster_id in unique_clusters:
            cluster_mask = self.cluster_labels == cluster_id
            cluster_embeddings = self.embeddings[cluster_mask]
            if cluster_embeddings.shape[0] == 0:
                continue
            # Mean TF-IDF score per term across the cluster's documents.
            cluster_means = np.mean(cluster_embeddings.toarray(), axis=0)
            # Indices of the n_keywords highest-scoring terms, best first.
            top_indices = np.argsort(cluster_means)[-n_keywords:][::-1]
            keywords = [self.feature_names[i] for i in top_indices]
            cluster_keywords[int(cluster_id)] = keywords
        return cluster_keywords

    def visualize_clusters(self, embeddings: np.ndarray, output_path: str = "cluster_visualization.png"):
        """Save (and show) a 2-D t-SNE scatter plot colored by cluster label."""
        if self.cluster_labels is None:
            print("No clustering results to visualize.")
            return
        # Reduce dimensionality for visualization. Perplexity must be below
        # the sample count; NOTE(review): with <2 samples t-SNE still fails.
        tsne = TSNE(n_components=2, random_state=self.random_state, perplexity=min(30, len(embeddings) - 1))
        embeddings_2d = tsne.fit_transform(embeddings)
        plt.figure(figsize=(12, 8))
        scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
                              c=self.cluster_labels, cmap='viridis', alpha=0.7)
        plt.colorbar(scatter)
        plt.title('t-SNE Visualization of Clusters')
        plt.xlabel('t-SNE Component 1')
        plt.ylabel('t-SNE Component 2')
        plt.tight_layout()
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.show()
        print(f"Visualization saved to {output_path}")
def create_sample_data(output_path: str = "sample_descriptions.csv"):
    """Write a 100-row demonstration CSV of short business descriptions.

    Ten thematic groups of ten descriptions each; ids run 1..100. The
    DataFrame is also returned for programmatic use.
    """
    category_descriptions = {
        'technology': [
            "Advanced machine learning algorithm for predictive analytics",
            "Cloud-based software solution for enterprise management",
            "Mobile app development framework with cross-platform support",
            "Data visualization tool for business intelligence",
            "Cybersecurity software for threat detection",
            "API integration platform for third-party services",
            "Database management system with real-time synchronization",
            "Web development framework with modern UI components",
            "Artificial intelligence chatbot for customer service",
            "Blockchain technology for secure transactions",
        ],
        'food': [
            "Authentic Italian restaurant with traditional recipes",
            "Organic farm-to-table dining experience",
            "Gourmet burger joint with craft beer selection",
            "Sushi bar with fresh daily ingredients",
            "Vegan restaurant with plant-based alternatives",
            "Pizza place with wood-fired oven cooking",
            "Mexican restaurant with homemade tortillas",
            "Coffee shop with artisanal brewing methods",
            "Bakery with fresh pastries and bread",
            "Seafood restaurant with ocean views",
        ],
        'travel': [
            "Luxury hotel with spa and wellness facilities",
            "Adventure tour company for outdoor activities",
            "Cultural heritage site with guided tours",
            "Beach resort with water sports equipment",
            "Mountain hiking trail with scenic viewpoints",
            "City walking tour with historical landmarks",
            "Eco-tourism lodge in rainforest setting",
            "Ski resort with modern lift systems",
            "Wine tasting tour in vineyard region",
            "Wildlife safari with expert guides",
        ],
        'health': [
            "Fitness center with personal training programs",
            "Yoga studio with meditation classes",
            "Medical clinic with specialized treatments",
            "Nutrition counseling for healthy eating",
            "Physical therapy center for rehabilitation",
            "Mental health counseling services",
            "Alternative medicine practice with holistic approach",
            "Dental clinic with modern equipment",
            "Pharmacy with prescription services",
            "Wellness spa with massage therapy",
        ],
        'education': [
            "Online learning platform with interactive courses",
            "Language school with native speakers",
            "Professional certification program",
            "Technical training institute for skills development",
            "University course with research opportunities",
            "Workshop series for creative skills",
            "Corporate training program for leadership",
            "Vocational school with hands-on experience",
            "Tutoring service for academic support",
            "Distance learning program with flexible schedules",
        ],
        'entertainment': [
            "Movie theater with premium seating options",
            "Concert venue with state-of-the-art sound",
            "Gaming arcade with virtual reality experiences",
            "Art gallery with contemporary exhibitions",
            "Sports complex with multiple facilities",
            "Bowling alley with family entertainment",
            "Karaoke bar with private rooms",
            "Comedy club with stand-up performances",
            "Dance studio with various styles",
            "Music school with instrument lessons",
        ],
        'business': [
            "Consulting firm with strategic planning services",
            "Marketing agency with digital expertise",
            "Legal practice with specialized areas",
            "Accounting firm with tax preparation",
            "Real estate agency with property management",
            "Insurance company with comprehensive coverage",
            "Financial planning service for investments",
            "Public relations firm with media relations",
            "Human resources consulting for recruitment",
            "IT consulting with system integration",
        ],
        'retail': [
            "Fashion boutique with designer clothing",
            "Electronics store with latest gadgets",
            "Bookstore with rare and used books",
            "Home improvement store with tools",
            "Jewelry store with custom designs",
            "Toy store with educational games",
            "Pet store with grooming services",
            "Garden center with plants and supplies",
            "Sporting goods store with equipment",
            "Antique shop with vintage items",
        ],
        'transportation': [
            "Delivery service with same-day options",
            "Moving company with packing services",
            "Taxi service with luxury vehicles",
            "Car rental agency with flexible terms",
            "Freight forwarding with international shipping",
            "Warehouse storage with climate control",
            "Courier service with tracking systems",
            "Transportation company with fleet management",
            "Logistics provider with supply chain solutions",
            "Shipping company with express delivery",
        ],
        'home_services': [
            "Cleaning service with eco-friendly products",
            "Plumbing company with emergency repairs",
            "Electrical contractor with safety certification",
            "HVAC service with maintenance plans",
            "Landscaping company with design services",
            "Roofing contractor with warranty coverage",
            "Painting service with color consultation",
            "Carpentry workshop with custom furniture",
            "Security system installation with monitoring",
            "Home automation with smart technology",
        ],
    }
    # Flatten groups in insertion order so rows match the original layout.
    descriptions = [text for group in category_descriptions.values() for text in group]
    df = pd.DataFrame({
        'id': range(1, len(descriptions) + 1),
        'description': descriptions,
    })
    df.to_csv(output_path, index=False)
    print(f"Sample data created with {len(df)} descriptions: {output_path}")
    return df
def main():
    """CLI entry point: load a CSV, cluster one text column, report results.

    Writes a JSON summary (method, scores, keywords, distribution, samples),
    a copy of the input CSV with a 'cluster' column, and optionally a t-SNE
    plot. Fix: all numpy scalar types are coerced to built-in int/float/str
    before json.dump — previously np.int64 keys/values raised
    'TypeError: Object of type int64 is not JSON serializable'.
    """
    parser = argparse.ArgumentParser(description='Cluster descriptions from CSV file')
    parser.add_argument('--input', type=str, help='Input CSV file path')
    parser.add_argument('--column', type=str, default='description', help='Column name containing descriptions')
    parser.add_argument('--method', type=str, choices=['kmeans', 'hierarchical', 'lda', 'nmf'],
                        default='kmeans', help='Clustering method')
    parser.add_argument('--clusters', type=int, default=5, help='Number of clusters')
    parser.add_argument('--max-features', type=int, default=1000, help='Maximum features for vectorization')
    parser.add_argument('--output', type=str, default='clustering_results.json', help='Output file for results')
    parser.add_argument('--visualize', action='store_true', help='Generate visualization')
    parser.add_argument('--create-sample', action='store_true', help='Create sample data for testing')
    args = parser.parse_args()

    # Create sample data if requested, then exit.
    if args.create_sample:
        create_sample_data()
        return

    # Fall back to generated sample data when no input file was given.
    if not args.input:
        print("No input file specified. Creating sample data...")
        create_sample_data()
        args.input = "sample_descriptions.csv"
    if not Path(args.input).exists():
        print(f"Input file {args.input} not found.")
        return

    print(f"Loading data from {args.input}...")
    df = pd.read_csv(args.input)
    if args.column not in df.columns:
        print(f"Column '{args.column}' not found in CSV. Available columns: {list(df.columns)}")
        return

    # Initialize clusterer and prepare the text column (NaN -> '').
    clusterer = DescriptionClusterer()
    print("Preprocessing text...")
    descriptions = df[args.column].fillna('').astype(str).tolist()
    processed_descriptions = clusterer.preprocess_text(descriptions)

    print("Vectorizing text...")
    embeddings = clusterer.vectorize_text(processed_descriptions, max_features=args.max_features)

    # Dispatch to the selected clustering method.
    print(f"Performing {args.method} clustering with {args.clusters} clusters...")
    method_dispatch = {
        'kmeans': clusterer.kmeans_clustering,
        'hierarchical': clusterer.hierarchical_clustering,
        'lda': clusterer.topic_modeling_lda,
        'nmf': clusterer.topic_modeling_nmf,
    }
    cluster_labels = method_dispatch[args.method](embeddings, args.clusters)

    # Add cluster labels to the dataframe.
    df['cluster'] = cluster_labels

    print("Evaluating clustering quality...")
    evaluation_scores = clusterer.evaluate_clustering(embeddings)

    print("Extracting cluster keywords...")
    cluster_keywords = clusterer.get_cluster_keywords()

    # Build a JSON-safe results dict: coerce every numpy scalar to a
    # built-in type (json.dump rejects np.int64 keys and values).
    results = {
        'method': args.method,
        'n_clusters': args.clusters,
        'n_samples': int(len(df)),
        'evaluation_scores': {k: float(v) for k, v in evaluation_scores.items()},
        'cluster_keywords': {int(k): [str(w) for w in v] for k, v in cluster_keywords.items()},
        'cluster_distribution': {int(k): int(v) for k, v in df['cluster'].value_counts().items()},
        'sample_descriptions': {}
    }
    # Record up to three example descriptions per cluster.
    for cluster_id in sorted(df['cluster'].unique()):
        cluster_samples = df[df['cluster'] == cluster_id][args.column].head(3).tolist()
        results['sample_descriptions'][f'cluster_{int(cluster_id)}'] = cluster_samples

    # Save results.
    with open(args.output, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"\nClustering completed!")
    print(f"Results saved to: {args.output}")
    print(f"\nEvaluation Scores:")
    for metric, score in evaluation_scores.items():
        print(f"  {metric}: {score:.4f}")
    print(f"\nCluster Distribution:")
    for cluster_id, count in sorted(results['cluster_distribution'].items()):
        print(f"  Cluster {cluster_id}: {count} descriptions")
    print(f"\nTop Keywords by Cluster:")
    for cluster_id, keywords in cluster_keywords.items():
        print(f"  Cluster {cluster_id}: {', '.join(keywords[:5])}")

    # Save clustered data alongside the JSON summary.
    output_csv = f"clustered_{Path(args.input).name}"
    df.to_csv(output_csv, index=False)
    print(f"\nClustered data saved to: {output_csv}")

    # Generate visualization if requested.
    if args.visualize:
        clusterer.visualize_clusters(embeddings)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()