File size: 2,737 Bytes
8f69dec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob
from sklearn.linear_model import LinearRegression

def perform_customer_segmentation(sales_df, n_clusters=4):
    """
    Cluster customers with KMeans on RFM (Recency, Frequency, Monetary) features.

    Args:
        sales_df: Transaction-level DataFrame with 'CustomerID', 'Date',
            'TransactionID' and 'TotalPrice' columns. Subtracting two 'Date'
            values must yield an object with a ``.days`` attribute
            (presumably a datetime64 column -- confirm against the loader).
        n_clusters: Requested number of clusters. It is capped at the number
            of distinct customers, because KMeans cannot fit more clusters
            than it has samples.

    Returns:
        DataFrame indexed by CustomerID with the columns:
        'Recency'   -- days between the customer's latest 'Date' and the
                       latest 'Date' in the whole frame,
        'Frequency' -- count of non-null 'TransactionID' values,
        'Monetary'  -- sum of 'TotalPrice',
        'Cluster'   -- KMeans label fitted on the standardized RFM values.
        If there is nothing to cluster (no rows, or no non-null CustomerID),
        an empty frame with these same columns is returned.
    """
    # groupby drops rows whose key is NaN, so a frame without any usable
    # CustomerID is as empty as a frame without rows; the scaler and KMeans
    # would both reject a zero-sample matrix.
    if sales_df.empty or sales_df['CustomerID'].isna().all():
        empty_rfm = pd.DataFrame(columns=['Recency', 'Frequency', 'Monetary', 'Cluster'])
        empty_rfm.index.name = 'CustomerID'
        return empty_rfm

    # Recency is measured against the newest transaction in the data rather
    # than the wall clock, so results are reproducible for a given dataset.
    current_date = sales_df['Date'].max()

    rfm = sales_df.groupby('CustomerID').agg({
        'Date': lambda dates: (current_date - dates.max()).days,  # Recency
        'TransactionID': 'count',  # Frequency
        'TotalPrice': 'sum',  # Monetary
    }).rename(columns={
        'Date': 'Recency',
        'TransactionID': 'Frequency',
        'TotalPrice': 'Monetary',
    })

    # Standardize so that no single metric dominates the distance computation.
    scaler = StandardScaler()
    rfm_scaled = scaler.fit_transform(rfm)

    # KMeans raises when asked for more clusters than samples; fall back to
    # one cluster per customer in that case. Invalid requests such as
    # n_clusters <= 0 are deliberately left for KMeans to reject.
    effective_clusters = min(n_clusters, len(rfm))
    kmeans = KMeans(n_clusters=effective_clusters, random_state=42, n_init=10)
    rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

    # Only the numeric cluster id is attached; no descriptive segment labels
    # are derived from the centroids.
    return rfm

def forecast_sales(sales_df, days_to_forecast=30):
    """
    Forecast daily sales with a simple linear regression over a day index.

    Args:
        sales_df: Transaction-level DataFrame with a 'Date' column usable as
            a resampling index and a numeric 'TotalPrice' column.
        days_to_forecast: Number of days after the last observed day to
            predict. Zero or negative values yield an empty forecast.

    Returns:
        Tuple ``(daily_sales, forecast_df)``:
        daily_sales -- one row per calendar day between the first and last
            observed date, with 'Date', 'TotalPrice' (daily sum) and
            'DayIndex' (0-based position of the day).
        forecast_df -- 'Date' and 'Predicted_Sales' for the requested future
            days; empty when there is no history or nothing was requested.
    """
    daily_sales = sales_df.set_index('Date').resample('D')['TotalPrice'].sum().reset_index()
    daily_sales['DayIndex'] = np.arange(len(daily_sales))

    # The regression cannot be fitted on zero samples, nor asked to predict
    # zero samples, so answer these cases with an empty, correctly-shaped
    # forecast instead of raising.
    if daily_sales.empty or days_to_forecast <= 0:
        empty_forecast = pd.DataFrame({
            'Date': pd.DatetimeIndex([]),
            'Predicted_Sales': np.array([], dtype=float),
        })
        return daily_sales, empty_forecast

    X = daily_sales[['DayIndex']]
    y = daily_sales['TotalPrice']

    model = LinearRegression()
    model.fit(X, y)

    # Future day indices continue directly after the last observed index.
    # The frame keeps the 'DayIndex' column name so the feature name matches
    # the one used at fit time.
    first_future_idx = int(daily_sales['DayIndex'].max()) + 1
    future_df = pd.DataFrame({
        'DayIndex': np.arange(first_future_idx, first_future_idx + days_to_forecast),
    })
    predictions = model.predict(future_df)

    # Offsets of 1..days_to_forecast days from the last observed date.
    last_date = daily_sales['Date'].max()
    future_dates = last_date + pd.to_timedelta(np.arange(1, days_to_forecast + 1), unit='D')

    forecast_df = pd.DataFrame({
        'Date': future_dates,
        'Predicted_Sales': predictions,
    })

    return daily_sales, forecast_df

def analyze_sentiment(reviews_df):
    """
    Score each review with TextBlob and bucket the score into a label.

    Two columns are written onto ``reviews_df`` itself (the frame is
    modified in place and also returned):
    'Sentiment_Score' -- TextBlob polarity of ``str(ReviewText)``,
    'Sentiment_Label' -- 'Positive' above 0.1, 'Negative' below -0.1,
                         otherwise 'Neutral'.
    """
    def _polarity_of(raw_review):
        # str() lets non-string cells be passed to TextBlob as text.
        blob = TextBlob(str(raw_review))
        return blob.sentiment.polarity

    def _bucket(polarity):
        # Scores inside the closed band [-0.1, 0.1] count as neutral.
        if polarity > 0.1:
            label = 'Positive'
        elif polarity < -0.1:
            label = 'Negative'
        else:
            label = 'Neutral'
        return label

    review_texts = reviews_df['ReviewText']
    reviews_df['Sentiment_Score'] = review_texts.apply(_polarity_of)

    stored_scores = reviews_df['Sentiment_Score']
    reviews_df['Sentiment_Label'] = stored_scores.apply(_bucket)

    return reviews_df