import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob
from sklearn.linear_model import LinearRegression


def perform_customer_segmentation(sales_df, n_clusters=4):
    """Cluster customers by RFM (Recency, Frequency, Monetary) metrics.

    Args:
        sales_df: Transaction-level DataFrame with the columns
            'CustomerID', 'Date', 'TransactionID' and 'TotalPrice'.
            'Date' must support subtraction yielding an object with a
            ``.days`` attribute (e.g. a datetime64 column).
        n_clusters: Requested number of KMeans clusters. If there are
            fewer customers than this, the number of customers is used
            instead, because KMeans cannot fit more clusters than samples.

    Returns:
        DataFrame indexed by 'CustomerID' with the columns 'Recency'
        (days between the customer's last purchase and the newest date in
        the data), 'Frequency' (transaction count), 'Monetary' (sum of
        'TotalPrice') and 'Cluster' (integer KMeans label). The cluster
        ids are arbitrary labels; no business meaning is attached to them.
        An empty frame with these columns is returned when there is
        nothing to cluster.
    """
    # Nothing to cluster: no rows, or no row carries a customer id
    # (groupby drops null keys by default). Scaling/fitting zero samples
    # would raise, so return an empty result with the expected layout.
    if sales_df.empty or sales_df['CustomerID'].isna().all():
        return pd.DataFrame(
            columns=['Recency', 'Frequency', 'Monetary', 'Cluster'],
            index=pd.Index([], name='CustomerID'),
        )

    # Recency is measured against the newest transaction in the data
    # rather than the wall clock, so a fixed dataset gives fixed results.
    current_date = sales_df['Date'].max()

    rfm = sales_df.groupby('CustomerID').agg({
        'Date': lambda x: (current_date - x.max()).days,  # Recency
        'TransactionID': 'count',  # Frequency
        'TotalPrice': 'sum',  # Monetary
    }).rename(columns={
        'Date': 'Recency',
        'TransactionID': 'Frequency',
        'TotalPrice': 'Monetary',
    })

    # Normalize so that no single metric dominates the distance measure.
    scaler = StandardScaler()
    rfm_scaled = scaler.fit_transform(rfm)

    # KMeans raises ValueError when n_clusters > n_samples; clamp instead.
    effective_clusters = min(n_clusters, len(rfm))
    kmeans = KMeans(n_clusters=effective_clusters, random_state=42, n_init=10)
    rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

    # Only the raw cluster id is attached; mapping ids to segment names
    # (e.g. from the centroids) is left to the caller.
    return rfm


def _empty_forecast():
    """Return an empty forecast frame with the regular column layout."""
    return pd.DataFrame({
        'Date': pd.Series(dtype='datetime64[ns]'),
        'Predicted_Sales': pd.Series(dtype='float64'),
    })


def forecast_sales(sales_df, days_to_forecast=30):
    """Forecast daily sales with a linear trend fitted on past daily totals.

    Args:
        sales_df: Transaction-level DataFrame with a datetime 'Date'
            column and a numeric 'TotalPrice' column.
        days_to_forecast: Number of days after the last observed day to
            predict. Values <= 0 produce an empty forecast.

    Returns:
        Tuple ``(daily_sales, forecast_df)``:
        ``daily_sales`` has the columns 'Date', 'TotalPrice' (sum per
        calendar day; days without rows are included by the resample)
        and 'DayIndex' (0-based position used as the regression feature).
        ``forecast_df`` has the columns 'Date' and 'Predicted_Sales'.
        Both are empty when ``sales_df`` is empty.
    """
    if sales_df.empty:
        daily_sales = pd.DataFrame({
            'Date': pd.Series(dtype='datetime64[ns]'),
            'TotalPrice': pd.Series(dtype='float64'),
            'DayIndex': pd.Series(dtype='int64'),
        })
        return daily_sales, _empty_forecast()

    daily_sales = (
        sales_df.set_index('Date')
        .resample('D')['TotalPrice']
        .sum()
        .reset_index()
    )
    daily_sales['DayIndex'] = np.arange(len(daily_sales))

    # predict() rejects a zero-row feature matrix, so short-circuit here.
    if days_to_forecast <= 0:
        return daily_sales, _empty_forecast()

    X = daily_sales[['DayIndex']]
    y = daily_sales['TotalPrice']

    model = LinearRegression()
    model.fit(X, y)

    # Future day indices continue directly after the last observed one.
    first_future_idx = int(daily_sales['DayIndex'].max()) + 1
    future_indices = np.arange(
        first_future_idx, first_future_idx + days_to_forecast
    ).reshape(-1, 1)
    # Wrap in a DataFrame with the training column name so the model is
    # queried with the same feature name it was fitted on.
    future_df = pd.DataFrame(future_indices, columns=['DayIndex'])
    predictions = model.predict(future_df)

    last_date = daily_sales['Date'].max()
    future_dates = [
        last_date + pd.Timedelta(days=i)
        for i in range(1, days_to_forecast + 1)
    ]

    forecast_df = pd.DataFrame({
        'Date': future_dates,
        'Predicted_Sales': predictions,
    })

    return daily_sales, forecast_df


def analyze_sentiment(reviews_df):
    """Score each review with TextBlob polarity and attach a label.

    Note:
        ``reviews_df`` is modified in place (two columns are added) and
        the same object is returned.

    Args:
        reviews_df: DataFrame with a 'ReviewText' column. Each value is
            converted with ``str()`` before scoring, so non-string values
            (including missing ones) are scored on their string form.

    Returns:
        The same DataFrame with two added columns: 'Sentiment_Score'
        (TextBlob polarity) and 'Sentiment_Label' ('Positive' for scores
        above 0.1, 'Negative' for scores below -0.1, otherwise 'Neutral').
    """
    def get_polarity(text):
        return TextBlob(str(text)).sentiment.polarity

    reviews_df['Sentiment_Score'] = reviews_df['ReviewText'].apply(get_polarity)

    # Categorize; the +/-0.1 band keeps near-zero scores as 'Neutral'.
    def label_sentiment(score):
        if score > 0.1:
            return 'Positive'
        if score < -0.1:
            return 'Negative'
        return 'Neutral'

    reviews_df['Sentiment_Label'] = reviews_df['Sentiment_Score'].apply(label_sentiment)

    return reviews_df