Update app.py
Browse files
app.py
CHANGED
|
@@ -22,20 +22,23 @@ def process_file(file, instructions, api_key):
|
|
| 22 |
Analyze the following dataset and instructions:
|
| 23 |
|
| 24 |
Data columns: {list(df.columns)}
|
|
|
|
| 25 |
Instructions: {instructions}
|
| 26 |
|
| 27 |
-
Based on this, create 3 appropriate visualizations. For each visualization
|
| 28 |
-
1.
|
| 29 |
-
2.
|
| 30 |
-
3.
|
| 31 |
-
4.
|
| 32 |
-
5. Any necessary data preprocessing steps (e.g., grouping, sorting, etc.)
|
| 33 |
|
|
|
|
|
|
|
|
|
|
| 34 |
Return your response as a Python list of dictionaries:
|
| 35 |
[
|
| 36 |
-
{{"title": "...", "plot_type": "...", "x": "...", "y": "...", "
|
| 37 |
-
{{"title": "...", "plot_type": "...", "x": "...", "y": "...", "
|
| 38 |
-
{{"title": "...", "plot_type": "...", "x": "...", "y": "...", "
|
| 39 |
]
|
| 40 |
""")
|
| 41 |
|
|
@@ -56,29 +59,41 @@ def process_file(file, instructions, api_key):
|
|
| 56 |
for plot in plots[:3]: # Ensure max 3 plots
|
| 57 |
fig, ax = plt.subplots(figsize=(10, 6))
|
| 58 |
|
| 59 |
-
# Apply preprocessing
|
| 60 |
plot_df = df.copy()
|
| 61 |
-
if '
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
plot_df = plot_df.groupby(
|
| 65 |
-
|
| 66 |
-
plot_df = plot_df.
|
| 67 |
-
|
| 68 |
-
|
|
|
|
| 69 |
|
| 70 |
if plot['plot_type'] == 'bar':
|
| 71 |
plot_df.plot(kind='bar', x=plot['x'], y=plot['y'], ax=ax)
|
| 72 |
elif plot['plot_type'] == 'line':
|
| 73 |
plot_df.plot(kind='line', x=plot['x'], y=plot['y'], ax=ax)
|
| 74 |
elif plot['plot_type'] == 'scatter':
|
| 75 |
-
plot_df.plot(kind='scatter', x=plot['x'], y=plot['y'], ax=ax
|
|
|
|
| 76 |
elif plot['plot_type'] == 'hist':
|
| 77 |
-
plot_df[plot['x']].hist(ax=ax)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
ax.set_title(plot['title'])
|
| 80 |
-
|
| 81 |
-
|
|
|
|
| 82 |
plt.tight_layout()
|
| 83 |
|
| 84 |
buf = io.BytesIO()
|
|
|
|
| 22 |
Analyze the following dataset and instructions:
|
| 23 |
|
| 24 |
Data columns: {list(df.columns)}
|
| 25 |
+
Data shape: {df.shape}
|
| 26 |
Instructions: {instructions}
|
| 27 |
|
| 28 |
+
Based on this, create 3 appropriate visualizations that provide meaningful insights. For each visualization:
|
| 29 |
+
1. Choose the most suitable plot type (bar, line, scatter, hist, pie, heatmap)
|
| 30 |
+
2. Determine appropriate data aggregation (e.g., top 5 categories, monthly averages)
|
| 31 |
+
3. Select relevant columns for x-axis, y-axis, and any additional dimensions (color, size)
|
| 32 |
+
4. Provide a clear, concise title that explains the insight
|
|
|
|
| 33 |
|
| 34 |
+
Consider data density and choose visualizations that simplify and clarify the information.
|
| 35 |
+
Limit the number of data points displayed to ensure readability (e.g., top 5, top 10).
|
| 36 |
+
|
| 37 |
Return your response as a Python list of dictionaries:
|
| 38 |
[
|
| 39 |
+
{{"title": "...", "plot_type": "...", "x": "...", "y": "...", "agg_func": "...", "top_n": ..., "additional": {{"color": "...", "size": "..."}}}},
|
| 40 |
+
{{"title": "...", "plot_type": "...", "x": "...", "y": "...", "agg_func": "...", "top_n": ..., "additional": {{"color": "...", "size": "..."}}}},
|
| 41 |
+
{{"title": "...", "plot_type": "...", "x": "...", "y": "...", "agg_func": "...", "top_n": ..., "additional": {{"color": "...", "size": "..."}}}
|
| 42 |
]
|
| 43 |
""")
|
| 44 |
|
|
|
|
| 59 |
for plot in plots[:3]: # Ensure max 3 plots
|
| 60 |
fig, ax = plt.subplots(figsize=(10, 6))
|
| 61 |
|
| 62 |
+
# Apply preprocessing and aggregation
|
| 63 |
plot_df = df.copy()
|
| 64 |
+
if plot['agg_func'] == 'sum':
|
| 65 |
+
plot_df = plot_df.groupby(plot['x'])[plot['y']].sum().reset_index()
|
| 66 |
+
elif plot['agg_func'] == 'mean':
|
| 67 |
+
plot_df = plot_df.groupby(plot['x'])[plot['y']].mean().reset_index()
|
| 68 |
+
elif plot['agg_func'] == 'count':
|
| 69 |
+
plot_df = plot_df.groupby(plot['x']).size().reset_index(name=plot['y'])
|
| 70 |
+
|
| 71 |
+
if 'top_n' in plot and plot['top_n']:
|
| 72 |
+
plot_df = plot_df.nlargest(plot['top_n'], plot['y'])
|
| 73 |
|
| 74 |
if plot['plot_type'] == 'bar':
|
| 75 |
plot_df.plot(kind='bar', x=plot['x'], y=plot['y'], ax=ax)
|
| 76 |
elif plot['plot_type'] == 'line':
|
| 77 |
plot_df.plot(kind='line', x=plot['x'], y=plot['y'], ax=ax)
|
| 78 |
elif plot['plot_type'] == 'scatter':
|
| 79 |
+
plot_df.plot(kind='scatter', x=plot['x'], y=plot['y'], ax=ax,
|
| 80 |
+
c=plot['additional'].get('color'), s=plot_df[plot['additional'].get('size', 'y')])
|
| 81 |
elif plot['plot_type'] == 'hist':
|
| 82 |
+
plot_df[plot['x']].hist(ax=ax, bins=20)
|
| 83 |
+
elif plot['plot_type'] == 'pie':
|
| 84 |
+
plot_df.plot(kind='pie', y=plot['y'], labels=plot_df[plot['x']], ax=ax, autopct='%1.1f%%')
|
| 85 |
+
elif plot['plot_type'] == 'heatmap':
|
| 86 |
+
pivot_df = plot_df.pivot(index=plot['x'], columns=plot['additional']['color'], values=plot['y'])
|
| 87 |
+
ax.imshow(pivot_df, cmap='YlOrRd')
|
| 88 |
+
ax.set_xticks(range(len(pivot_df.columns)))
|
| 89 |
+
ax.set_yticks(range(len(pivot_df.index)))
|
| 90 |
+
ax.set_xticklabels(pivot_df.columns)
|
| 91 |
+
ax.set_yticklabels(pivot_df.index)
|
| 92 |
|
| 93 |
ax.set_title(plot['title'])
|
| 94 |
+
if plot['plot_type'] != 'pie':
|
| 95 |
+
ax.set_xlabel(plot['x'])
|
| 96 |
+
ax.set_ylabel(plot['y'])
|
| 97 |
plt.tight_layout()
|
| 98 |
|
| 99 |
buf = io.BytesIO()
|