salihfurkaan committed on
Commit
6b37a61
·
1 Parent(s): 841e95d

Refine visualizations: horizontal bars, smart filtering, label truncation

Browse files
cleaned_data.csv CHANGED
@@ -1,6 +1,6 @@
1
- name,age,tags,details
2
- Alice,25,['HR' 'admin'],{'role': 'staff'}
3
- Bob,30,['eng' 'dev'],{'role': 'dev'}
4
- Charlie,35,['eng' 'lead'],{'role': 'lead'}
5
- David,40,['mgmt'],{'role': 'manager'}
6
- Eve,22,['HR'],{'role': 'staff'}
 
1
+ name,age,tags,details,longtext,shortcategory
2
+ Alice,25,['HR' 'admin'],{'role': 'staff'},This is a very long text string that should be filtered out from categorical plots because it is too long.,Cat A
3
+ Bob,30,['eng' 'dev'],{'role': 'dev'},Another very long text string that serves the same purpose of testing the filtration logic.,Cat B
4
+ Charlie,35,['eng' 'lead'],{'role': 'lead'},Yet another long string to ensure the average length is high enough.,Cat A
5
+ David,40,['mgmt'],{'role': 'manager'},Repeating the long string to have some frequency but still be long.,Cat C
6
+ Eve,22,['HR'],{'role': 'staff'},This is a very long text string that should be filtered out from categorical plots because it is too long.,Cat B
src/__pycache__/visualization.cpython-313.pyc CHANGED
Binary files a/src/__pycache__/visualization.cpython-313.pyc and b/src/__pycache__/visualization.cpython-313.pyc differ
 
src/visualization.py CHANGED
@@ -27,17 +27,36 @@ def generate_charts(df, profile):
27
  figures.append(fig_hist)
28
 
29
  # 3. Categorical Counts - Top 3
 
30
  cat_cols = profile.get('categorical_columns', [])
31
- for col in cat_cols[:3]:
 
 
 
 
 
 
 
 
 
 
32
  unique_count = df[col].nunique()
33
- if unique_count < 50: # Don't plot high cardinality
34
- counts = df[col].value_counts().head(15) # Top 15
35
- fig_bar = px.bar(x=counts.index, y=counts.values, labels={'x': col, 'y': 'Count'}, title=f"Top Categories in {col}")
 
 
 
 
 
 
36
  figures.append(fig_bar)
37
  else:
38
- # For high cardinality, maybe just plot top 10?
39
  counts = df[col].value_counts().head(10)
40
- fig_bar = px.bar(x=counts.index, y=counts.values, labels={'x': col, 'y': 'Count'}, title=f"Top 10 Most Frequent in {col}")
 
 
 
41
  figures.append(fig_bar)
42
 
43
  # 4. Text Length Distribution (Fallback for Text Data)
 
27
  figures.append(fig_hist)
28
 
29
  # 3. Categorical Counts - Top 3
30
+ # Filter out columns that are likely free text (avg length > 30 and more than 10 unique values)
31
  cat_cols = profile.get('categorical_columns', [])
32
+ valid_cat_cols = []
33
+
34
+ for col in cat_cols:
35
+ # Check average length if object
36
+ if df[col].dtype == 'object':
37
+ avg_len = df[col].astype(str).str.len().mean()
38
+ if avg_len > 30 and df[col].nunique() > 10:
39
+ continue # Skip likely text/path columns
40
+ valid_cat_cols.append(col)
41
+
42
+ for col in valid_cat_cols[:3]:
43
  unique_count = df[col].nunique()
44
+ # For layout, horizontal bars are better for text labels
45
+ if unique_count < 50:
46
+ counts = df[col].value_counts().head(15)
47
+ # Truncate labels for display
48
+ short_labels = [str(x)[:30] + "..." if len(str(x)) > 30 else str(x) for x in counts.index]
49
+
50
+ fig_bar = px.bar(x=counts.values, y=short_labels, orientation='h',
51
+ labels={'x': 'Count', 'y': col}, title=f"Top Categories in {col}")
52
+ fig_bar.update_layout(yaxis=dict(autorange="reversed")) # Top to bottom
53
  figures.append(fig_bar)
54
  else:
 
55
  counts = df[col].value_counts().head(10)
56
+ short_labels = [str(x)[:30] + "..." if len(str(x)) > 30 else str(x) for x in counts.index]
57
+ fig_bar = px.bar(x=counts.values, y=short_labels, orientation='h',
58
+ labels={'x': 'Count', 'y': col}, title=f"Top 10 Most Frequent in {col}")
59
+ fig_bar.update_layout(yaxis=dict(autorange="reversed"))
60
  figures.append(fig_bar)
61
 
62
  # 4. Text Length Distribution (Fallback for Text Data)