File size: 9,856 Bytes
aa893a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
import pandas as pd
import matplotlib.pyplot as plt


def _safe_title(text):
    """Return ``text`` as a string suitable for a label; ``None`` becomes ``""``."""
    return "" if text is None else str(text)


def create_time_series_plot(df, date_col, value_col, agg_func="sum", freq="M", category_col=None):
    """
    Create a time series line plot.

    The values in ``value_col`` are bucketed by ``freq`` along ``date_col``
    and aggregated with ``agg_func``. If ``category_col`` is provided and
    exists in ``df`` (for example "Region" or "Model"), one line is drawn
    per category and a legend is added; otherwise a single line is drawn
    for the whole data set.

    Parameters
    ----------
    df : pd.DataFrame
        Input data. It is not modified.
    date_col : str
        Column name with datetime values (e.g. "Date"). Non-datetime
        columns are converted with ``pd.to_datetime``.
    value_col : str
        Numeric column to aggregate (e.g. "Estimated_Deliveries").
    agg_func : str
        Aggregation method: "sum", "mean", "count", "median".
        Any other value falls back to "sum".
    freq : str
        Resampling frequency ("M" = month, "Q" = quarter).
    category_col : str or None
        Optional column to group by (e.g. "Region" or "Model").
        Ignored when it is None or not a column of ``df``.

    Returns
    -------
    fig : matplotlib.figure.Figure or None
        The figure object with the plot. None is returned when a required
        column is missing, the dates cannot be parsed, or there is no
        usable row left to plot.
    """
    if date_col not in df.columns or value_col not in df.columns:
        return None

    data = df.copy()

    # Make sure date column is datetime
    if not pd.api.types.is_datetime64_any_dtype(data[date_col]):
        try:
            data[date_col] = pd.to_datetime(data[date_col])
        except Exception:
            # Best effort by design: unparseable dates mean "no plot".
            return None

    # Unknown aggregation names fall back to "sum". The effective name is
    # kept so that the plot title describes what is actually drawn.
    agg_name = agg_func if agg_func in ("mean", "count", "median") else "sum"

    # If no category, just aggregate the whole series over time
    if category_col is None or category_col not in df.columns:
        series = data.set_index(date_col)[value_col]

        # Rows without a date cannot be placed on the time axis. If nothing
        # is left, return None like the category path does instead of
        # producing an empty chart.
        series = series[series.index.notna()]
        if series.empty:
            return None

        grouped = getattr(series.resample(freq), agg_name)()

        fig, ax = plt.subplots()
        ax.plot(grouped.index, grouped.values)
        ax.set_xlabel("Date")
        ax.set_ylabel(_safe_title(value_col))
        ax.set_title(f"Time Series of {value_col} ({agg_name})")
        ax.grid(True)
        fig.tight_layout()
        return fig

    # If category_col is given (e.g. Region / Model), draw a line per category.
    data = data[[date_col, value_col, category_col]].dropna()
    if data.empty:
        return None

    # Group by category and date (using Grouper for frequency)
    per_group = data.groupby(
        [category_col, pd.Grouper(key=date_col, freq=freq)]
    )[value_col]
    grouped = getattr(per_group, agg_name)()

    # grouped is a Series with MultiIndex (category, date); unstacking the
    # first level gives a table indexed by date with one column per category.
    table = grouped.unstack(0)

    fig, ax = plt.subplots()

    # Plot one line per category
    for col in table.columns:
        # Periods where a category has no value are NaN after unstacking;
        # drop them, and skip categories that end up with nothing to draw.
        series = table[col].dropna()
        if series.empty:
            continue
        ax.plot(series.index, series.values, label=str(col))

    ax.set_xlabel("Date")
    ax.set_ylabel(_safe_title(value_col))
    ax.set_title(f"Time Series of {value_col} by {category_col} ({agg_name})")
    ax.grid(True)
    ax.legend(title=_safe_title(category_col))

    fig.tight_layout()
    return fig


def create_distribution_plot(df, numeric_col, kind="hist", bins=30):
    """
    Draw the distribution of one column as a histogram or a box plot.

    Missing values are dropped before plotting.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    numeric_col : str
        Numeric column to visualize.
    kind : str
        "box" draws a box plot; any other value draws a histogram.
    bins : int
        Number of bins for the histogram (unused for box plots).

    Returns
    -------
    fig : matplotlib.figure.Figure or None
        The figure object with the plot, or None when the column is
        missing or holds no non-null values.
    """
    if numeric_col not in df.columns:
        return None

    values = df[numeric_col].dropna()
    if values.empty:
        return None

    label = _safe_title(numeric_col)
    fig, ax = plt.subplots()

    if kind != "box":
        # Anything other than "box" is treated as a histogram request.
        ax.hist(values.values, bins=bins)
        ax.set_xlabel(label)
        ax.set_ylabel("Frequency")
        ax.set_title(f"Histogram of {numeric_col}")
    else:
        ax.boxplot(values.values, vert=True)
        # A single box sits at x position 1; label it with the column name.
        ax.set_xticks([1])
        ax.set_xticklabels([label])
        ax.set_ylabel(label)
        ax.set_title(f"Box Plot of {numeric_col}")

    fig.tight_layout()
    return fig


def create_category_bar_plot(df, category_col, value_col=None, agg_func="count", top_n=10):
    """
    Create a bar chart for a categorical column.

    - If value_col is None or agg_func == "count":
        Show counts of each category.
    - If value_col is numeric:
        Aggregate using sum / mean / median.

    This works nicely with columns like "Region" or "Model"
    for the Tesla dataset.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    category_col : str
        Name of the categorical column (e.g. "Region").
    value_col : str or None
        Numeric column to aggregate (e.g. "Estimated_Deliveries"), or None.
    agg_func : str
        "sum", "mean", "median", or "count".
    top_n : int
        Show only the top N categories.

    Returns
    -------
    fig : matplotlib.figure.Figure or None
        The figure object with the plot, or None if something fails.
    """
    if category_col not in df.columns:
        return None

    data = df.copy()

    # Pure counts mode
    if value_col is None or agg_func == "count":
        counts = data[category_col].value_counts().head(top_n)
        y_values = counts.values
        x_labels = counts.index.astype(str).tolist()
        title = f"Top {top_n} {category_col} by Count"
        y_label = "Count"
    else:
        if value_col not in df.columns:
            return None

        data = data[[category_col, value_col]].dropna()
        if data.empty:
            return None

        if agg_func == "mean":
            grouped = data.groupby(category_col)[value_col].mean()
        elif agg_func == "median":
            grouped = data.groupby(category_col)[value_col].median()
        else:
            # default to sum
            grouped = data.groupby(category_col)[value_col].sum()

        grouped = grouped.sort_values(ascending=False).head(top_n)
        y_values = grouped.values
        x_labels = grouped.index.astype(str).tolist()
        title = f"Top {top_n} {category_col} by {agg_func} of {value_col}"
        y_label = _safe_title(value_col)

    fig, ax = plt.subplots()
    ax.bar(x_labels, y_values)
    ax.set_xlabel(_safe_title(category_col))
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.tick_params(axis="x", rotation=45)

    fig.tight_layout()
    return fig


def create_scatter_plot(df, x_col, y_col, category_col=None):
    """
    Draw a scatter plot of ``y_col`` against ``x_col``.

    When ``category_col`` names an existing column (e.g. "Region" or
    "Model"), each category is drawn as its own point set and a legend
    is shown. Rows with a missing value in any used column are dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    x_col : str
        Column for x-axis.
    y_col : str
        Column for y-axis.
    category_col : str or None
        Optional column to group points by.

    Returns
    -------
    fig : matplotlib.figure.Figure or None
        The figure object with the plot, or None when a required column
        is missing or no complete row remains.
    """
    available = df.columns
    if not (x_col in available and y_col in available):
        return None

    # Collect the columns that take part in the plot.
    selected = [x_col, y_col]
    if category_col and category_col in available:
        selected.append(category_col)

    points = df[selected].dropna()
    if points.empty:
        return None

    fig, ax = plt.subplots()

    if category_col is not None and category_col in points.columns:
        # One point set per category so the legend can tell them apart.
        for label, subset in points.groupby(category_col):
            ax.scatter(subset[x_col], subset[y_col], label=str(label))

        ax.set_title(f"Scatter Plot: {x_col} vs {y_col} by {category_col}")
        ax.legend(title=_safe_title(category_col))
    else:
        # Plain scatter without grouping.
        ax.scatter(points[x_col], points[y_col])
        ax.set_title(f"Scatter Plot: {x_col} vs {y_col}")

    ax.set_xlabel(_safe_title(x_col))
    ax.set_ylabel(_safe_title(y_col))
    ax.grid(True)

    fig.tight_layout()
    return fig


def create_correlation_heatmap(df, numeric_cols):
    """
    Create a correlation heatmap for numeric columns,
    with the numeric values displayed on the cells.

    Requested columns that are missing from ``df`` or that are not
    numeric (or boolean) are left out, so a stray text column yields a
    smaller heatmap or None rather than an error.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    numeric_cols : list of str
        List of numeric column names to include in the correlation matrix.

    Returns
    -------
    fig : matplotlib.figure.Figure or None
        The figure object with the heatmap, or None when fewer than two
        usable columns remain.
    """
    requested = [c for c in numeric_cols if c in df.columns]
    if len(requested) < 2:
        return None

    # Only numeric/boolean columns can be correlated; filtering up front
    # keeps corr() from raising on text columns.
    numeric = df[requested].select_dtypes(include=["number", "bool"])
    if numeric.shape[1] < 2:
        return None

    corr = numeric.corr()

    # Take labels and sizes from the matrix itself so ticks and annotations
    # always line up with what corr() actually returned.
    labels = list(corr.columns)
    positions = range(len(labels))
    matrix = corr.values

    fig, ax = plt.subplots()
    cax = ax.matshow(matrix)
    fig.colorbar(cax)

    ax.set_xticks(positions)
    ax.set_yticks(positions)
    ax.set_xticklabels(labels, rotation=45, ha="left")
    ax.set_yticklabels(labels)
    ax.set_title("Correlation Heatmap", pad=20)

    # Add the correlation values on top of each cell (x = column, y = row).
    for i, row in enumerate(matrix):
        for j, value in enumerate(row):
            ax.text(j, i, f"{value:.2f}", va="center", ha="center", fontsize=8)

    fig.tight_layout()
    return fig