# Nucha's picture
# Upload app.py
# 611a545 verified
import os
import io
import numpy as np
import pandas as pd
from PIL import Image
import gradio as gr
from wordcloud import WordCloud
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# Fallback CSV path, read from the working directory when no file is uploaded.
DEFAULT_CSV = "Soft_Skills__Top_5000_.csv"
# Preferred text column header (Thai for "skill"); used when present in the CSV.
DEFAULT_TEXT_COL = "ทักษะ"
# Preferred frequency column header (Thai, roughly "number of occurrences found").
DEFAULT_FREQ_COL = "จำนวนความถี่ที่พบ"
def _load_dataframe(file):
if file is not None:
return pd.read_csv(file.name if hasattr(file, "name") else file)
if os.path.exists(DEFAULT_CSV):
return pd.read_csv(DEFAULT_CSV)
raise gr.Error("No CSV provided and default file not found. Please upload a CSV.")
def _detect_columns(df):
    """Choose the (text, frequency) column pair for *df*.

    The default Thai headers are used when both are present; otherwise the
    first two columns are taken, in order.

    Returns:
        A ``(text_column, frequency_column)`` tuple of column labels.

    Raises:
        gr.Error: If neither default header pair is present and *df* has
            fewer than two columns.
    """
    columns = df.columns
    has_default_headers = DEFAULT_TEXT_COL in columns and DEFAULT_FREQ_COL in columns
    if has_default_headers:
        return DEFAULT_TEXT_COL, DEFAULT_FREQ_COL

    if len(columns) < 2:
        raise gr.Error("CSV must have at least 2 columns: [text/skill, frequency].")

    # Positional fallback: first column is text, second is frequency.
    return columns[0], columns[1]
def _to_bar_chart_image(df_words, top_k):
    """Render the first ``top_k`` rows of *df_words* as a horizontal bar chart.

    Args:
        df_words: DataFrame with ``"word"`` and ``"frequency"`` columns,
            already ordered so that the rows to plot come first.
        top_k: Number of leading rows to plot; coerced with ``int()``.

    Returns:
        A fully loaded ``PIL.Image.Image`` containing the PNG rendering.
    """
    # Reverse so the first (largest) row ends up at the top of the barh plot.
    top_rows = df_words.head(int(top_k)).iloc[::-1]

    # Grow the figure with the number of bars so labels do not overlap.
    fig = plt.figure(figsize=(8, max(3, 0.35 * len(top_rows))))
    try:
        ax = fig.add_subplot(111)
        # Force string labels so numeric-looking words are plotted as
        # categories rather than as numeric y positions.
        # NOTE(review): non-Latin labels (e.g. Thai) need a matplotlib font
        # with those glyphs; no font is configured here — confirm rendering.
        ax.barh(top_rows["word"].astype(str), top_rows["frequency"])
        ax.set_xlabel("Frequency")
        ax.set_ylabel("Word")
        ax.set_title(f"Top {len(top_rows)} Words by Frequency")
        fig.tight_layout()

        buf = io.BytesIO()
        fig.savefig(buf, format="png", dpi=160, bbox_inches="tight")
    finally:
        # Always release the figure: pyplot keeps a global reference to every
        # open figure, so a failed render would otherwise leak it.
        plt.close(fig)

    buf.seek(0)
    image = Image.open(buf)
    # Image.open is lazy; decode now so the result no longer depends on ``buf``.
    image.load()
    return image
def _parse_stopwords(stopwords_text):
    """Return the set of stripped, non-blank lines in *stopwords_text*."""
    if not stopwords_text:
        return set()
    stripped = (line.strip() for line in stopwords_text.splitlines())
    return {word for word in stripped if word}


def _read_mask(mask_image):
    """Load *mask_image* as a grayscale array, or return None if not given."""
    if mask_image is None:
        return None
    source = mask_image.name if hasattr(mask_image, "name") else mask_image
    try:
        # Context manager closes the underlying file once pixels are copied.
        with Image.open(source) as img:
            return np.array(img.convert("L"))
    except Exception as exc:
        raise gr.Error(f"Failed to read mask image: {exc}") from exc


def _find_thai_font():
    """Return the first existing Noto Sans Thai font path, or None."""
    candidates = (
        "NotoSansThai-Regular.ttf",
        "NotoSansThai.ttf",
        "/usr/share/fonts/truetype/noto/NotoSansThai-Regular.ttf",
    )
    return next((path for path in candidates if os.path.exists(path)), None)


def _build_word_table(df, text_col, freq_col, stopwords, max_words):
    """Return the cleaned top-``max_words`` rows of (text, frequency)."""
    table = df[[text_col, freq_col]].dropna().copy()
    table[text_col] = table[text_col].astype(str).str.strip()
    table[freq_col] = (
        pd.to_numeric(table[freq_col], errors="coerce").fillna(0).astype(float)
    )

    # Blank words and non-positive counts cannot be drawn; an all-zero input
    # would otherwise reach the word cloud's frequency normalisation.
    keep = (table[text_col] != "") & (table[freq_col] > 0)
    if stopwords:
        blocked = {word.lower() for word in stopwords}
        keep &= ~table[text_col].str.lower().isin(blocked)
    table = table[keep]

    # Merge duplicate words so that no count is silently discarded when the
    # table is turned into a dict.
    table = table.groupby(text_col, as_index=False, sort=False)[freq_col].sum()
    return table.sort_values(freq_col, ascending=False).head(max_words)


def generate_wordcloud_and_bar(
    csv_file,
    text_col,
    freq_col,
    max_words,
    background_color,
    colormap,
    width,
    height,
    scale,
    prefer_horizontal,
    collocations,
    stopwords_text,
    mask_image,
    random_state,
    show_bar,
    topk_bar
):
    """Build a word cloud (and optional bar chart) from a word/frequency CSV.

    Args:
        csv_file: Uploaded CSV (or None to use the default file); forwarded
            to ``_load_dataframe``.
        text_col: Name of the word column, or ``"auto"``/empty to detect it.
        freq_col: Name of the frequency column, or ``"auto"``/empty to detect it.
        max_words: Maximum number of distinct words to keep and draw.
        background_color: Background colour passed to ``WordCloud``.
        colormap: Matplotlib colormap name, or ``"default"`` for the
            library default.
        width: Canvas width in pixels.
        height: Canvas height in pixels.
        scale: Rendering scale factor passed to ``WordCloud``.
        prefer_horizontal: Horizontal-placement preference; a boolean is
            passed on as ``1.0`` / ``0.0``.
        collocations: Passed to ``WordCloud`` unchanged.
        stopwords_text: Newline-separated words to exclude
            (matched case-insensitively after stripping).
        mask_image: Optional mask image path (or object with ``.name``).
        random_state: Seed for reproducible layouts, or None.
        show_bar: Whether to render the bar chart.
        topk_bar: Number of top words shown in the bar chart.

    Returns:
        ``(cloud_image, bar_image_or_None, preview_dataframe)`` where the
        preview has columns ``"word"`` and ``"frequency"``.

    Raises:
        gr.Error: On missing/identical columns, an unreadable mask image, or
            when no drawable words remain after cleaning.
    """
    df = _load_dataframe(csv_file)

    text_is_auto = not text_col or text_col == "auto"
    freq_is_auto = not freq_col or freq_col == "auto"
    if text_is_auto or freq_is_auto:
        auto_text, auto_freq = _detect_columns(df)
        if text_is_auto:
            text_col = auto_text
        if freq_is_auto:
            freq_col = auto_freq
    if text_col not in df.columns or freq_col not in df.columns:
        raise gr.Error(f"Columns not found. Available columns: {list(df.columns)}")
    if text_col == freq_col:
        raise gr.Error("Text and frequency columns must be different.")

    # UI sliders may deliver floats; the consumers below need integers.
    max_words = int(max_words)

    # Stopwords are applied here because generate_from_frequencies() does not
    # filter the supplied frequencies itself.
    stopwords = _parse_stopwords(stopwords_text)
    table = _build_word_table(df, text_col, freq_col, stopwords, max_words)
    frequencies = {
        word: float(freq) for word, freq in zip(table[text_col], table[freq_col])
    }
    if not frequencies:
        raise gr.Error("No words found after processing. Please check your CSV columns.")

    mask = _read_mask(mask_image)
    font_path = _find_thai_font()

    wc = WordCloud(
        width=int(width),
        height=int(height),
        scale=scale,
        # Without this the library's own default cap applies and the
        # "max words" setting never reaches the cloud.
        max_words=max_words,
        background_color=background_color,
        colormap=None if colormap == "default" else colormap,
        prefer_horizontal=float(prefer_horizontal),
        collocations=collocations,
        stopwords=stopwords,
        mask=mask,
        font_path=font_path,
        random_state=None if random_state is None else int(random_state),
    )
    wc.generate_from_frequencies(frequencies)
    img_cloud = wc.to_image()

    preview = table.rename(columns={text_col: "word", freq_col: "frequency"})
    img_bar = _to_bar_chart_image(preview, topk_bar) if show_bar else None
    return img_cloud, img_bar, preview
# Gradio UI: controls in the left column, rendered outputs in the right column.
with gr.Blocks(title="Soft Skills Word Cloud + Bar Chart") as demo:
    gr.Markdown("# Soft Skills Word Cloud + Bar Chart\nUpload a CSV or place **Soft_Skills__Top_5000_.csv** in the repo.")

    with gr.Row():
        # ---- Input controls ----
        with gr.Column(scale=1):
            csv_file = gr.File(
                label="Upload CSV (optional)",
                file_count="single",
                file_types=[".csv"],
            )
            text_col = gr.Dropdown(
                choices=["auto"],
                value="auto",
                label="Text/Skill column",
            )
            freq_col = gr.Dropdown(
                choices=["auto"],
                value="auto",
                label="Frequency column",
            )
            max_words = gr.Slider(10, 1000, value=300, step=10, label="Max words")
            background_color = gr.Dropdown(
                choices=["white", "black"],
                value="white",
                label="Background color",
            )
            colormap = gr.Dropdown(
                choices=[
                    "default",
                    "viridis",
                    "plasma",
                    "inferno",
                    "magma",
                    "cividis",
                    "terrain",
                    "tab20",
                    "tab10",
                    "Pastel1",
                    "Set3",
                ],
                value="default",
                label="Colormap",
            )
            width = gr.Slider(400, 2000, value=1200, step=50, label="Width (px)")
            height = gr.Slider(300, 1500, value=700, step=50, label="Height (px)")
            scale = gr.Slider(1, 5, value=2, step=1, label="Scale")
            prefer_horizontal = gr.Checkbox(
                value=True,
                label="Prefer horizontal layout",
            )
            collocations = gr.Checkbox(
                value=False,
                label="Allow collocations (word pairs)",
            )
            stopwords_text = gr.Textbox(
                lines=4,
                label="Stopwords (one per line)",
                placeholder="e.g.\nและ\nกับ\nของ\nthe\nand",
            )
            mask_image = gr.Image(type="filepath", label="Mask image (optional)")
            random_state = gr.Slider(
                0,
                100,
                value=42,
                step=1,
                label="Random state (for reproducibility)",
            )

            gr.Markdown("### Bar Chart Options")
            show_bar = gr.Checkbox(value=True, label="Show bar chart")
            topk_bar = gr.Slider(5, 50, value=20, step=1, label="Top K for bar chart")

            run_btn = gr.Button("Generate", variant="primary")

        # ---- Outputs ----
        with gr.Column(scale=1):
            out_img = gr.Image(label="Word Cloud", type="pil")
            out_bar = gr.Image(label="Bar Chart (Top K)", type="pil")
            out_table = gr.Dataframe(label="Top words used", wrap=True)

    # Input order must match the parameter order of generate_wordcloud_and_bar.
    run_btn.click(
        fn=generate_wordcloud_and_bar,
        inputs=[
            csv_file,
            text_col,
            freq_col,
            max_words,
            background_color,
            colormap,
            width,
            height,
            scale,
            prefer_horizontal,
            collocations,
            stopwords_text,
            mask_image,
            random_state,
            show_bar,
            topk_bar,
        ],
        outputs=[out_img, out_bar, out_table],
    )

if __name__ == "__main__":
    demo.launch()