Nucha committed on
Commit
611a545
·
verified ·
1 Parent(s): ad31e83

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -35
app.py CHANGED
@@ -6,14 +6,15 @@ from PIL import Image
6
  import gradio as gr
7
  from wordcloud import WordCloud
8
 
9
- DEFAULT_CSV = "Soft_Skills__Top_5000.csv" # Put this file in the Space repo root
 
 
 
 
10
  DEFAULT_TEXT_COL = "ทักษะ"
11
  DEFAULT_FREQ_COL = "จำนวนความถี่ที่พบ"
12
 
13
  def _load_dataframe(file):
14
- """
15
- Load a CSV either from the uploaded file or from DEFAULT_CSV if present.
16
- """
17
  if file is not None:
18
  return pd.read_csv(file.name if hasattr(file, "name") else file)
19
  if os.path.exists(DEFAULT_CSV):
@@ -21,14 +22,30 @@ def _load_dataframe(file):
21
  raise gr.Error("No CSV provided and default file not found. Please upload a CSV.")
22
 
23
  def _detect_columns(df):
24
- # Try defaults first; else guess the first two columns
25
  if DEFAULT_TEXT_COL in df.columns and DEFAULT_FREQ_COL in df.columns:
26
  return DEFAULT_TEXT_COL, DEFAULT_FREQ_COL
27
  if len(df.columns) >= 2:
28
  return df.columns[0], df.columns[1]
29
  raise gr.Error("CSV must have at least 2 columns: [text/skill, frequency].")
30
 
31
- def generate_wordcloud(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  csv_file,
33
  text_col,
34
  freq_col,
@@ -42,34 +59,27 @@ def generate_wordcloud(
42
  collocations,
43
  stopwords_text,
44
  mask_image,
45
- random_state
 
 
46
  ):
47
- # Load data
48
  df = _load_dataframe(csv_file)
49
- # Auto-pick columns if "auto"
50
  if text_col == "auto" or freq_col == "auto":
51
  auto_text, auto_freq = _detect_columns(df)
52
- if text_col == "auto":
53
- text_col = auto_text
54
- if freq_col == "auto":
55
- freq_col = auto_freq
56
 
57
  if text_col not in df.columns or freq_col not in df.columns:
58
  raise gr.Error(f"Columns not found. Available columns: {list(df.columns)}")
59
 
60
- # Clean and build frequency dict
61
  sub = df[[text_col, freq_col]].dropna()
62
- # Coerce frequency to numeric
63
  sub[freq_col] = pd.to_numeric(sub[freq_col], errors="coerce").fillna(0).astype(float)
64
- # Keep top N by frequency
65
  sub = sub.sort_values(freq_col, ascending=False).head(max_words)
66
 
67
  frequencies = {str(k): float(v) for k, v in zip(sub[text_col], sub[freq_col]) if str(k).strip()}
68
-
69
  if not frequencies:
70
  raise gr.Error("No words found after processing. Please check your CSV columns.")
71
 
72
- # Stopwords
73
  stopwords = set()
74
  if stopwords_text:
75
  for w in stopwords_text.splitlines():
@@ -77,7 +87,6 @@ def generate_wordcloud(
77
  if w:
78
  stopwords.add(w)
79
 
80
- # Optional mask
81
  mask = None
82
  if mask_image is not None:
83
  try:
@@ -86,7 +95,6 @@ def generate_wordcloud(
86
  except Exception as e:
87
  raise gr.Error(f"Failed to read mask image: {e}")
88
 
89
- # Try to use Thai-capable font if present
90
  font_path = None
91
  for cand in ["NotoSansThai-Regular.ttf", "NotoSansThai.ttf", "/usr/share/fonts/truetype/noto/NotoSansThai-Regular.ttf"]:
92
  if os.path.exists(cand):
@@ -106,16 +114,19 @@ def generate_wordcloud(
106
  font_path=font_path,
107
  random_state=random_state,
108
  )
109
-
110
  wc.generate_from_frequencies(frequencies)
111
- img = wc.to_image()
112
 
113
- # Return image and also a CSV preview (top words used)
114
  preview = sub.rename(columns={text_col: "word", freq_col: "frequency"})
115
- return img, preview
116
 
117
- with gr.Blocks(title="Soft Skills WordCloud") as demo:
118
- gr.Markdown("# Soft Skills Word Cloud\nUpload a CSV or place **Soft_Skills__Top_5000_.csv** in the repo.")
 
 
 
 
 
 
119
 
120
  with gr.Row():
121
  with gr.Column(scale=1):
@@ -123,11 +134,7 @@ with gr.Blocks(title="Soft Skills WordCloud") as demo:
123
  text_col = gr.Dropdown(choices=["auto"], value="auto", label="Text/Skill column")
124
  freq_col = gr.Dropdown(choices=["auto"], value="auto", label="Frequency column")
125
  max_words = gr.Slider(10, 1000, value=300, step=10, label="Max words")
126
- background_color = gr.Dropdown(
127
- choices=["white", "black"],
128
- value="white",
129
- label="Background color"
130
- )
131
  colormap = gr.Dropdown(
132
  choices=["default","viridis","plasma","inferno","magma","cividis","terrain","tab20","tab10","Pastel1","Set3"],
133
  value="default",
@@ -142,16 +149,21 @@ with gr.Blocks(title="Soft Skills WordCloud") as demo:
142
  mask_image = gr.Image(type="filepath", label="Mask image (optional)")
143
  random_state = gr.Slider(0, 100, value=42, step=1, label="Random state (for reproducibility)")
144
 
145
- run_btn = gr.Button("Generate Word Cloud", variant="primary")
 
 
 
 
146
 
147
  with gr.Column(scale=1):
148
  out_img = gr.Image(label="Word Cloud", type="pil")
 
149
  out_table = gr.Dataframe(label="Top words used", wrap=True)
150
 
151
  run_btn.click(
152
- fn=generate_wordcloud,
153
- inputs=[csv_file, text_col, freq_col, max_words, background_color, colormap, width, height, scale, prefer_horizontal, collocations, stopwords_text, mask_image, random_state],
154
- outputs=[out_img, out_table],
155
  )
156
 
157
  if __name__ == "__main__":
 
6
  import gradio as gr
7
  from wordcloud import WordCloud
8
 
9
+ import matplotlib
10
+ matplotlib.use("Agg")
11
+ import matplotlib.pyplot as plt
12
+
13
+ DEFAULT_CSV = "Soft_Skills__Top_5000_.csv"
14
  DEFAULT_TEXT_COL = "ทักษะ"
15
  DEFAULT_FREQ_COL = "จำนวนความถี่ที่พบ"
16
 
17
  def _load_dataframe(file):
 
 
 
18
  if file is not None:
19
  return pd.read_csv(file.name if hasattr(file, "name") else file)
20
  if os.path.exists(DEFAULT_CSV):
 
22
  raise gr.Error("No CSV provided and default file not found. Please upload a CSV.")
23
 
24
  def _detect_columns(df):
 
25
  if DEFAULT_TEXT_COL in df.columns and DEFAULT_FREQ_COL in df.columns:
26
  return DEFAULT_TEXT_COL, DEFAULT_FREQ_COL
27
  if len(df.columns) >= 2:
28
  return df.columns[0], df.columns[1]
29
  raise gr.Error("CSV must have at least 2 columns: [text/skill, frequency].")
30
 
31
+ def _to_bar_chart_image(df_words, top_k):
32
+ sub = df_words.head(int(top_k)).copy()
33
+ sub = sub.iloc[::-1]
34
+ fig = plt.figure(figsize=(8, max(3, 0.35*len(sub))))
35
+ ax = fig.add_subplot(111)
36
+ ax.barh(sub["word"], sub["frequency"])
37
+ ax.set_xlabel("Frequency")
38
+ ax.set_ylabel("Word")
39
+ ax.set_title(f"Top {len(sub)} Words by Frequency")
40
+ fig.tight_layout()
41
+ import io as _io
42
+ buf = _io.BytesIO()
43
+ fig.savefig(buf, format="png", dpi=160, bbox_inches="tight")
44
+ plt.close(fig)
45
+ buf.seek(0)
46
+ return Image.open(buf)
47
+
48
+ def generate_wordcloud_and_bar(
49
  csv_file,
50
  text_col,
51
  freq_col,
 
59
  collocations,
60
  stopwords_text,
61
  mask_image,
62
+ random_state,
63
+ show_bar,
64
+ topk_bar
65
  ):
 
66
  df = _load_dataframe(csv_file)
 
67
  if text_col == "auto" or freq_col == "auto":
68
  auto_text, auto_freq = _detect_columns(df)
69
+ if text_col == "auto": text_col = auto_text
70
+ if freq_col == "auto": freq_col = auto_freq
 
 
71
 
72
  if text_col not in df.columns or freq_col not in df.columns:
73
  raise gr.Error(f"Columns not found. Available columns: {list(df.columns)}")
74
 
 
75
  sub = df[[text_col, freq_col]].dropna()
 
76
  sub[freq_col] = pd.to_numeric(sub[freq_col], errors="coerce").fillna(0).astype(float)
 
77
  sub = sub.sort_values(freq_col, ascending=False).head(max_words)
78
 
79
  frequencies = {str(k): float(v) for k, v in zip(sub[text_col], sub[freq_col]) if str(k).strip()}
 
80
  if not frequencies:
81
  raise gr.Error("No words found after processing. Please check your CSV columns.")
82
 
 
83
  stopwords = set()
84
  if stopwords_text:
85
  for w in stopwords_text.splitlines():
 
87
  if w:
88
  stopwords.add(w)
89
 
 
90
  mask = None
91
  if mask_image is not None:
92
  try:
 
95
  except Exception as e:
96
  raise gr.Error(f"Failed to read mask image: {e}")
97
 
 
98
  font_path = None
99
  for cand in ["NotoSansThai-Regular.ttf", "NotoSansThai.ttf", "/usr/share/fonts/truetype/noto/NotoSansThai-Regular.ttf"]:
100
  if os.path.exists(cand):
 
114
  font_path=font_path,
115
  random_state=random_state,
116
  )
 
117
  wc.generate_from_frequencies(frequencies)
118
+ img_cloud = wc.to_image()
119
 
 
120
  preview = sub.rename(columns={text_col: "word", freq_col: "frequency"})
 
121
 
122
+ img_bar = None
123
+ if show_bar:
124
+ img_bar = _to_bar_chart_image(preview, topk_bar)
125
+
126
+ return img_cloud, img_bar, preview
127
+
128
+ with gr.Blocks(title="Soft Skills Word Cloud + Bar Chart") as demo:
129
+ gr.Markdown("# Soft Skills Word Cloud + Bar Chart\nUpload a CSV or place **Soft_Skills__Top_5000_.csv** in the repo.")
130
 
131
  with gr.Row():
132
  with gr.Column(scale=1):
 
134
  text_col = gr.Dropdown(choices=["auto"], value="auto", label="Text/Skill column")
135
  freq_col = gr.Dropdown(choices=["auto"], value="auto", label="Frequency column")
136
  max_words = gr.Slider(10, 1000, value=300, step=10, label="Max words")
137
+ background_color = gr.Dropdown(choices=["white", "black"], value="white", label="Background color")
 
 
 
 
138
  colormap = gr.Dropdown(
139
  choices=["default","viridis","plasma","inferno","magma","cividis","terrain","tab20","tab10","Pastel1","Set3"],
140
  value="default",
 
149
  mask_image = gr.Image(type="filepath", label="Mask image (optional)")
150
  random_state = gr.Slider(0, 100, value=42, step=1, label="Random state (for reproducibility)")
151
 
152
+ gr.Markdown("### Bar Chart Options")
153
+ show_bar = gr.Checkbox(value=True, label="Show bar chart")
154
+ topk_bar = gr.Slider(5, 50, value=20, step=1, label="Top K for bar chart")
155
+
156
+ run_btn = gr.Button("Generate", variant="primary")
157
 
158
  with gr.Column(scale=1):
159
  out_img = gr.Image(label="Word Cloud", type="pil")
160
+ out_bar = gr.Image(label="Bar Chart (Top K)", type="pil")
161
  out_table = gr.Dataframe(label="Top words used", wrap=True)
162
 
163
  run_btn.click(
164
+ fn=generate_wordcloud_and_bar,
165
+ inputs=[csv_file, text_col, freq_col, max_words, background_color, colormap, width, height, scale, prefer_horizontal, collocations, stopwords_text, mask_image, random_state, show_bar, topk_bar],
166
+ outputs=[out_img, out_bar, out_table],
167
  )
168
 
169
  if __name__ == "__main__":