Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -6,14 +6,15 @@ from PIL import Image
|
|
| 6 |
import gradio as gr
|
| 7 |
from wordcloud import WordCloud
|
| 8 |
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
DEFAULT_TEXT_COL = "ทักษะ"
|
| 11 |
DEFAULT_FREQ_COL = "จำนวนความถี่ที่พบ"
|
| 12 |
|
| 13 |
def _load_dataframe(file):
|
| 14 |
-
"""
|
| 15 |
-
Load a CSV either from the uploaded file or from DEFAULT_CSV if present.
|
| 16 |
-
"""
|
| 17 |
if file is not None:
|
| 18 |
return pd.read_csv(file.name if hasattr(file, "name") else file)
|
| 19 |
if os.path.exists(DEFAULT_CSV):
|
|
@@ -21,14 +22,30 @@ def _load_dataframe(file):
|
|
| 21 |
raise gr.Error("No CSV provided and default file not found. Please upload a CSV.")
|
| 22 |
|
| 23 |
def _detect_columns(df):
    """Choose the (text, frequency) column names to read from ``df``.

    The module-level default column names win when both are present in
    ``df.columns``; otherwise the first two columns are used, in order.

    Raises:
        gr.Error: if the defaults are absent and ``df`` has fewer than two
            columns.
    """
    columns = df.columns
    has_defaults = DEFAULT_TEXT_COL in columns and DEFAULT_FREQ_COL in columns
    if has_defaults:
        return DEFAULT_TEXT_COL, DEFAULT_FREQ_COL
    # No defaults available: fall back to positional guessing.
    if len(columns) < 2:
        raise gr.Error("CSV must have at least 2 columns: [text/skill, frequency].")
    return columns[0], columns[1]
|
| 30 |
|
| 31 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
csv_file,
|
| 33 |
text_col,
|
| 34 |
freq_col,
|
|
@@ -42,34 +59,27 @@ def generate_wordcloud(
|
|
| 42 |
collocations,
|
| 43 |
stopwords_text,
|
| 44 |
mask_image,
|
| 45 |
-
random_state
|
|
|
|
|
|
|
| 46 |
):
|
| 47 |
-
# Load data
|
| 48 |
df = _load_dataframe(csv_file)
|
| 49 |
-
# Auto-pick columns if "auto"
|
| 50 |
if text_col == "auto" or freq_col == "auto":
|
| 51 |
auto_text, auto_freq = _detect_columns(df)
|
| 52 |
-
if text_col == "auto":
|
| 53 |
-
|
| 54 |
-
if freq_col == "auto":
|
| 55 |
-
freq_col = auto_freq
|
| 56 |
|
| 57 |
if text_col not in df.columns or freq_col not in df.columns:
|
| 58 |
raise gr.Error(f"Columns not found. Available columns: {list(df.columns)}")
|
| 59 |
|
| 60 |
-
# Clean and build frequency dict
|
| 61 |
sub = df[[text_col, freq_col]].dropna()
|
| 62 |
-
# Coerce frequency to numeric
|
| 63 |
sub[freq_col] = pd.to_numeric(sub[freq_col], errors="coerce").fillna(0).astype(float)
|
| 64 |
-
# Keep top N by frequency
|
| 65 |
sub = sub.sort_values(freq_col, ascending=False).head(max_words)
|
| 66 |
|
| 67 |
frequencies = {str(k): float(v) for k, v in zip(sub[text_col], sub[freq_col]) if str(k).strip()}
|
| 68 |
-
|
| 69 |
if not frequencies:
|
| 70 |
raise gr.Error("No words found after processing. Please check your CSV columns.")
|
| 71 |
|
| 72 |
-
# Stopwords
|
| 73 |
stopwords = set()
|
| 74 |
if stopwords_text:
|
| 75 |
for w in stopwords_text.splitlines():
|
|
@@ -77,7 +87,6 @@ def generate_wordcloud(
|
|
| 77 |
if w:
|
| 78 |
stopwords.add(w)
|
| 79 |
|
| 80 |
-
# Optional mask
|
| 81 |
mask = None
|
| 82 |
if mask_image is not None:
|
| 83 |
try:
|
|
@@ -86,7 +95,6 @@ def generate_wordcloud(
|
|
| 86 |
except Exception as e:
|
| 87 |
raise gr.Error(f"Failed to read mask image: {e}")
|
| 88 |
|
| 89 |
-
# Try to use Thai-capable font if present
|
| 90 |
font_path = None
|
| 91 |
for cand in ["NotoSansThai-Regular.ttf", "NotoSansThai.ttf", "/usr/share/fonts/truetype/noto/NotoSansThai-Regular.ttf"]:
|
| 92 |
if os.path.exists(cand):
|
|
@@ -106,16 +114,19 @@ def generate_wordcloud(
|
|
| 106 |
font_path=font_path,
|
| 107 |
random_state=random_state,
|
| 108 |
)
|
| 109 |
-
|
| 110 |
wc.generate_from_frequencies(frequencies)
|
| 111 |
-
|
| 112 |
|
| 113 |
-
# Return image and also a CSV preview (top words used)
|
| 114 |
preview = sub.rename(columns={text_col: "word", freq_col: "frequency"})
|
| 115 |
-
return img, preview
|
| 116 |
|
| 117 |
-
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
with gr.Row():
|
| 121 |
with gr.Column(scale=1):
|
|
@@ -123,11 +134,7 @@ with gr.Blocks(title="Soft Skills WordCloud") as demo:
|
|
| 123 |
text_col = gr.Dropdown(choices=["auto"], value="auto", label="Text/Skill column")
|
| 124 |
freq_col = gr.Dropdown(choices=["auto"], value="auto", label="Frequency column")
|
| 125 |
max_words = gr.Slider(10, 1000, value=300, step=10, label="Max words")
|
| 126 |
-
background_color = gr.Dropdown(
|
| 127 |
-
choices=["white", "black"],
|
| 128 |
-
value="white",
|
| 129 |
-
label="Background color"
|
| 130 |
-
)
|
| 131 |
colormap = gr.Dropdown(
|
| 132 |
choices=["default","viridis","plasma","inferno","magma","cividis","terrain","tab20","tab10","Pastel1","Set3"],
|
| 133 |
value="default",
|
|
@@ -142,16 +149,21 @@ with gr.Blocks(title="Soft Skills WordCloud") as demo:
|
|
| 142 |
mask_image = gr.Image(type="filepath", label="Mask image (optional)")
|
| 143 |
random_state = gr.Slider(0, 100, value=42, step=1, label="Random state (for reproducibility)")
|
| 144 |
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
with gr.Column(scale=1):
|
| 148 |
out_img = gr.Image(label="Word Cloud", type="pil")
|
|
|
|
| 149 |
out_table = gr.Dataframe(label="Top words used", wrap=True)
|
| 150 |
|
| 151 |
run_btn.click(
|
| 152 |
-
fn=
|
| 153 |
-
inputs=[csv_file, text_col, freq_col, max_words, background_color, colormap, width, height, scale, prefer_horizontal, collocations, stopwords_text, mask_image, random_state],
|
| 154 |
-
outputs=[out_img, out_table],
|
| 155 |
)
|
| 156 |
|
| 157 |
if __name__ == "__main__":
|
|
|
|
| 6 |
import gradio as gr
|
| 7 |
from wordcloud import WordCloud
|
| 8 |
|
| 9 |
+
import matplotlib
|
| 10 |
+
matplotlib.use("Agg")
|
| 11 |
+
import matplotlib.pyplot as plt
|
| 12 |
+
|
| 13 |
+
DEFAULT_CSV = "Soft_Skills__Top_5000_.csv"
|
| 14 |
DEFAULT_TEXT_COL = "ทักษะ"
|
| 15 |
DEFAULT_FREQ_COL = "จำนวนความถี่ที่พบ"
|
| 16 |
|
| 17 |
def _load_dataframe(file):
|
|
|
|
|
|
|
|
|
|
| 18 |
if file is not None:
|
| 19 |
return pd.read_csv(file.name if hasattr(file, "name") else file)
|
| 20 |
if os.path.exists(DEFAULT_CSV):
|
|
|
|
| 22 |
raise gr.Error("No CSV provided and default file not found. Please upload a CSV.")
|
| 23 |
|
| 24 |
def _detect_columns(df):
    """Choose the (text, frequency) column names to read from ``df``.

    The module-level default column names win when both are present in
    ``df.columns``; otherwise the first two columns are used, in order.

    Raises:
        gr.Error: if the defaults are absent and ``df`` has fewer than two
            columns.
    """
    columns = df.columns
    has_defaults = DEFAULT_TEXT_COL in columns and DEFAULT_FREQ_COL in columns
    if has_defaults:
        return DEFAULT_TEXT_COL, DEFAULT_FREQ_COL
    # No defaults available: fall back to positional guessing.
    if len(columns) < 2:
        raise gr.Error("CSV must have at least 2 columns: [text/skill, frequency].")
    return columns[0], columns[1]
|
| 30 |
|
| 31 |
+
def _to_bar_chart_image(df_words, top_k):
    """Render the first ``top_k`` rows of ``df_words`` as a horizontal bar chart.

    Args:
        df_words: DataFrame with "word" and "frequency" columns. Rows are
            plotted in their existing order, first row at the top
            (presumably already sorted by descending frequency -- verify
            against the caller).
        top_k: Number of leading rows to plot; coerced with ``int``.

    Returns:
        A PIL image decoded from the PNG rendering of the chart.
    """
    import io as _io
    import os as _os
    from matplotlib import font_manager as _font_manager

    # Reverse so the first row ends up at the top of the horizontal chart.
    sub = df_words.head(int(top_k)).iloc[::-1]
    # Plot against integer positions and label the ticks explicitly: passing
    # the words themselves as categories merges duplicate words into one bar
    # and treats numeric "words" as axis coordinates.
    labels = [str(word) for word in sub["word"]]
    positions = list(range(len(labels)))

    # matplotlib's default font has no Thai glyphs, so look for a
    # Thai-capable font file and apply it to the word labels when found.
    font_prop = None
    for cand in (
        "NotoSansThai-Regular.ttf",
        "NotoSansThai.ttf",
        "/usr/share/fonts/truetype/noto/NotoSansThai-Regular.ttf",
    ):
        if _os.path.exists(cand):
            font_prop = _font_manager.FontProperties(fname=cand)
            break

    fig = plt.figure(figsize=(8, max(3, 0.35 * len(labels))))
    try:
        ax = fig.add_subplot(111)
        ax.barh(positions, sub["frequency"])
        ax.set_yticks(positions)
        ax.set_yticklabels(labels)
        if font_prop is not None:
            for tick_label in ax.get_yticklabels():
                tick_label.set_fontproperties(font_prop)
        ax.set_xlabel("Frequency")
        ax.set_ylabel("Word")
        ax.set_title(f"Top {len(labels)} Words by Frequency")
        fig.tight_layout()
        buf = _io.BytesIO()
        fig.savefig(buf, format="png", dpi=160, bbox_inches="tight")
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
|
| 47 |
+
|
| 48 |
+
def generate_wordcloud_and_bar(
|
| 49 |
csv_file,
|
| 50 |
text_col,
|
| 51 |
freq_col,
|
|
|
|
| 59 |
collocations,
|
| 60 |
stopwords_text,
|
| 61 |
mask_image,
|
| 62 |
+
random_state,
|
| 63 |
+
show_bar,
|
| 64 |
+
topk_bar
|
| 65 |
):
|
|
|
|
| 66 |
df = _load_dataframe(csv_file)
|
|
|
|
| 67 |
if text_col == "auto" or freq_col == "auto":
|
| 68 |
auto_text, auto_freq = _detect_columns(df)
|
| 69 |
+
if text_col == "auto": text_col = auto_text
|
| 70 |
+
if freq_col == "auto": freq_col = auto_freq
|
|
|
|
|
|
|
| 71 |
|
| 72 |
if text_col not in df.columns or freq_col not in df.columns:
|
| 73 |
raise gr.Error(f"Columns not found. Available columns: {list(df.columns)}")
|
| 74 |
|
|
|
|
| 75 |
sub = df[[text_col, freq_col]].dropna()
|
|
|
|
| 76 |
sub[freq_col] = pd.to_numeric(sub[freq_col], errors="coerce").fillna(0).astype(float)
|
|
|
|
| 77 |
sub = sub.sort_values(freq_col, ascending=False).head(max_words)
|
| 78 |
|
| 79 |
frequencies = {str(k): float(v) for k, v in zip(sub[text_col], sub[freq_col]) if str(k).strip()}
|
|
|
|
| 80 |
if not frequencies:
|
| 81 |
raise gr.Error("No words found after processing. Please check your CSV columns.")
|
| 82 |
|
|
|
|
| 83 |
stopwords = set()
|
| 84 |
if stopwords_text:
|
| 85 |
for w in stopwords_text.splitlines():
|
|
|
|
| 87 |
if w:
|
| 88 |
stopwords.add(w)
|
| 89 |
|
|
|
|
| 90 |
mask = None
|
| 91 |
if mask_image is not None:
|
| 92 |
try:
|
|
|
|
| 95 |
except Exception as e:
|
| 96 |
raise gr.Error(f"Failed to read mask image: {e}")
|
| 97 |
|
|
|
|
| 98 |
font_path = None
|
| 99 |
for cand in ["NotoSansThai-Regular.ttf", "NotoSansThai.ttf", "/usr/share/fonts/truetype/noto/NotoSansThai-Regular.ttf"]:
|
| 100 |
if os.path.exists(cand):
|
|
|
|
| 114 |
font_path=font_path,
|
| 115 |
random_state=random_state,
|
| 116 |
)
|
|
|
|
| 117 |
wc.generate_from_frequencies(frequencies)
|
| 118 |
+
img_cloud = wc.to_image()
|
| 119 |
|
|
|
|
| 120 |
preview = sub.rename(columns={text_col: "word", freq_col: "frequency"})
|
|
|
|
| 121 |
|
| 122 |
+
img_bar = None
|
| 123 |
+
if show_bar:
|
| 124 |
+
img_bar = _to_bar_chart_image(preview, topk_bar)
|
| 125 |
+
|
| 126 |
+
return img_cloud, img_bar, preview
|
| 127 |
+
|
| 128 |
+
with gr.Blocks(title="Soft Skills Word Cloud + Bar Chart") as demo:
|
| 129 |
+
gr.Markdown("# Soft Skills Word Cloud + Bar Chart\nUpload a CSV or place **Soft_Skills__Top_5000_.csv** in the repo.")
|
| 130 |
|
| 131 |
with gr.Row():
|
| 132 |
with gr.Column(scale=1):
|
|
|
|
| 134 |
text_col = gr.Dropdown(choices=["auto"], value="auto", label="Text/Skill column")
|
| 135 |
freq_col = gr.Dropdown(choices=["auto"], value="auto", label="Frequency column")
|
| 136 |
max_words = gr.Slider(10, 1000, value=300, step=10, label="Max words")
|
| 137 |
+
background_color = gr.Dropdown(choices=["white", "black"], value="white", label="Background color")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
colormap = gr.Dropdown(
|
| 139 |
choices=["default","viridis","plasma","inferno","magma","cividis","terrain","tab20","tab10","Pastel1","Set3"],
|
| 140 |
value="default",
|
|
|
|
| 149 |
mask_image = gr.Image(type="filepath", label="Mask image (optional)")
|
| 150 |
random_state = gr.Slider(0, 100, value=42, step=1, label="Random state (for reproducibility)")
|
| 151 |
|
| 152 |
+
gr.Markdown("### Bar Chart Options")
|
| 153 |
+
show_bar = gr.Checkbox(value=True, label="Show bar chart")
|
| 154 |
+
topk_bar = gr.Slider(5, 50, value=20, step=1, label="Top K for bar chart")
|
| 155 |
+
|
| 156 |
+
run_btn = gr.Button("Generate", variant="primary")
|
| 157 |
|
| 158 |
with gr.Column(scale=1):
|
| 159 |
out_img = gr.Image(label="Word Cloud", type="pil")
|
| 160 |
+
out_bar = gr.Image(label="Bar Chart (Top K)", type="pil")
|
| 161 |
out_table = gr.Dataframe(label="Top words used", wrap=True)
|
| 162 |
|
| 163 |
run_btn.click(
|
| 164 |
+
fn=generate_wordcloud_and_bar,
|
| 165 |
+
inputs=[csv_file, text_col, freq_col, max_words, background_color, colormap, width, height, scale, prefer_horizontal, collocations, stopwords_text, mask_image, random_state, show_bar, topk_bar],
|
| 166 |
+
outputs=[out_img, out_bar, out_table],
|
| 167 |
)
|
| 168 |
|
| 169 |
if __name__ == "__main__":
|