"""Gradio app that renders a word cloud and a top-K bar chart from a CSV.

The CSV is expected to hold one word/skill column and one numeric frequency
column.  When no file is uploaded, ``DEFAULT_CSV`` is read from the working
directory.
"""

import io
import os

import gradio as gr
import matplotlib
import numpy as np
import pandas as pd
from PIL import Image
from wordcloud import WordCloud

# The non-interactive backend must be selected before pyplot is imported.
matplotlib.use("Agg")
import matplotlib.pyplot as plt  # noqa: E402
from matplotlib import font_manager  # noqa: E402

DEFAULT_CSV = "Soft_Skills__Top_5000_.csv"
DEFAULT_TEXT_COL = "ทักษะ"
DEFAULT_FREQ_COL = "จำนวนความถี่ที่พบ"

# Font files probed in order; the first one that exists is used so that Thai
# text is drawn with real glyphs.
_FONT_CANDIDATES = (
    "NotoSansThai-Regular.ttf",
    "NotoSansThai.ttf",
    "/usr/share/fonts/truetype/noto/NotoSansThai-Regular.ttf",
)


def _load_dataframe(file):
    """Read the uploaded CSV, falling back to ``DEFAULT_CSV``.

    Args:
        file: ``None``, a path-like value, or an object exposing the path
            through a ``name`` attribute.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        gr.Error: If no file was given and ``DEFAULT_CSV`` does not exist.
    """
    if file is not None:
        return pd.read_csv(getattr(file, "name", file))
    if os.path.exists(DEFAULT_CSV):
        return pd.read_csv(DEFAULT_CSV)
    raise gr.Error("No CSV provided and default file not found. Please upload a CSV.")


def _detect_columns(df):
    """Pick the (text, frequency) column names for ``df``.

    The default Thai column names win when both are present; otherwise the
    first two columns are used.

    Raises:
        gr.Error: If ``df`` has fewer than two columns.
    """
    if DEFAULT_TEXT_COL in df.columns and DEFAULT_FREQ_COL in df.columns:
        return DEFAULT_TEXT_COL, DEFAULT_FREQ_COL
    if len(df.columns) >= 2:
        return df.columns[0], df.columns[1]
    raise gr.Error("CSV must have at least 2 columns: [text/skill, frequency].")


def _find_font():
    """Return the first existing path in ``_FONT_CANDIDATES``, or ``None``."""
    return next((path for path in _FONT_CANDIDATES if os.path.exists(path)), None)


def _parse_stopwords(stopwords_text):
    """Turn a one-word-per-line text block into a set of lowercased stopwords."""
    if not stopwords_text:
        return set()
    return {line.strip().lower() for line in stopwords_text.splitlines() if line.strip()}


def _load_mask(mask_image):
    """Load the optional mask image as a greyscale numpy array.

    Returns:
        ``None`` when no mask was supplied, else a 2-D array.

    Raises:
        gr.Error: If the image cannot be opened or decoded.
    """
    if mask_image is None:
        return None
    try:
        with Image.open(getattr(mask_image, "name", mask_image)) as pil:
            return np.array(pil.convert("L"))
    except Exception as e:  # UI boundary: surface any failure as a user-facing error
        raise gr.Error(f"Failed to read mask image: {e}") from e


def _to_bar_chart_image(df_words, top_k, font_path=None):
    """Render the ``top_k`` first rows of ``df_words`` as a horizontal bar chart.

    Args:
        df_words: Frame with ``word`` and ``frequency`` columns, already
            ordered from most to least frequent.
        top_k: Number of rows to plot (cast to ``int``).
        font_path: Optional font file used for the word labels; without it
            matplotlib's default font is used.

    Returns:
        The chart as a ``PIL.Image.Image`` (PNG-decoded).
    """
    # Reverse so the most frequent word ends up at the top of the chart.
    sub = df_words.head(int(top_k)).iloc[::-1]
    labels = [str(word) for word in sub["word"]]
    label_kwargs = {}
    if font_path:
        label_kwargs["fontproperties"] = font_manager.FontProperties(fname=font_path)

    fig = plt.figure(figsize=(8, max(3, 0.35 * len(sub))))
    try:
        ax = fig.add_subplot(111)
        # Numeric positions keep one bar per row even when two rows share a
        # label; a categorical axis would merge them.
        positions = list(range(len(sub)))
        ax.barh(positions, sub["frequency"].to_numpy())
        ax.set_yticks(positions)
        ax.set_yticklabels(labels, **label_kwargs)
        ax.set_xlabel("Frequency")
        ax.set_ylabel("Word")
        ax.set_title(f"Top {len(sub)} Words by Frequency")
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format="png", dpi=160, bbox_inches="tight")
    finally:
        # Always release the figure, even if rendering failed.
        plt.close(fig)
    buf.seek(0)
    image = Image.open(buf)
    image.load()  # decode now so the result does not depend on lazy reads
    return image


def generate_wordcloud_and_bar(
    csv_file, text_col, freq_col, max_words, background_color, colormap,
    width, height, scale, prefer_horizontal, collocations, stopwords_text,
    mask_image, random_state, show_bar, topk_bar
):
    """Build the word cloud, the optional bar chart and the preview table.

    Args:
        csv_file: Uploaded CSV (or ``None`` to use ``DEFAULT_CSV``).
        text_col: Word column name, or ``"auto"`` to detect it.
        freq_col: Frequency column name, or ``"auto"`` to detect it.
        max_words: Maximum number of words kept and drawn.
        background_color: Word cloud background colour name.
        colormap: Matplotlib colormap name, or ``"default"`` for WordCloud's own.
        width: Canvas width in pixels.
        height: Canvas height in pixels.
        scale: WordCloud rendering scale factor.
        prefer_horizontal: Forwarded to WordCloud as a ratio; a checkbox value
            becomes 1.0 (checked) or 0.0 (unchecked).
        collocations: Forwarded to WordCloud unchanged.
        stopwords_text: Words to exclude, one per line (case-insensitive).
        mask_image: Optional mask image path.
        random_state: Seed for a reproducible layout.
        show_bar: Whether to render the bar chart.
        topk_bar: Number of bars in the chart.

    Returns:
        ``(cloud_image, bar_image_or_None, preview_dataframe)``; the preview
        holds exactly the rows that were fed to the word cloud.

    Raises:
        gr.Error: On missing CSV, unknown columns, identical text/frequency
            columns, no usable words, or an unreadable mask image.
    """
    df = _load_dataframe(csv_file)

    if text_col == "auto" or freq_col == "auto":
        auto_text, auto_freq = _detect_columns(df)
        if text_col == "auto":
            text_col = auto_text
        if freq_col == "auto":
            freq_col = auto_freq

    if text_col not in df.columns or freq_col not in df.columns:
        raise gr.Error(f"Columns not found. Available columns: {list(df.columns)}")
    if text_col == freq_col:
        # Selecting the same column twice would yield duplicate column labels.
        raise gr.Error("Text and frequency columns must be different.")

    # Slider values may arrive as floats; pandas and WordCloud need ints here.
    max_words = int(max_words)
    stopwords = _parse_stopwords(stopwords_text)

    sub = df[[text_col, freq_col]].dropna().copy()
    sub[text_col] = sub[text_col].astype(str)
    sub[freq_col] = pd.to_numeric(sub[freq_col], errors="coerce").fillna(0).astype(float)

    # generate_from_frequencies() does not apply WordCloud's stopwords, so they
    # are filtered here.  Non-positive frequencies are dropped as well: they
    # cannot be drawn, and an all-zero input would make WordCloud divide by a
    # zero maximum.
    stripped = sub[text_col].str.strip()
    keep = (stripped != "") & ~stripped.str.lower().isin(stopwords) & (sub[freq_col] > 0)
    sub = sub.loc[keep].sort_values(freq_col, ascending=False).head(max_words)

    frequencies = {word: float(freq) for word, freq in zip(sub[text_col], sub[freq_col])}
    if not frequencies:
        raise gr.Error("No words found after processing. Please check your CSV columns.")

    mask = _load_mask(mask_image)
    font_path = _find_font()

    wc = WordCloud(
        width=int(width),
        height=int(height),
        scale=scale,
        # Without this WordCloud silently caps the cloud at its own default.
        max_words=max_words,
        background_color=background_color,
        colormap=None if colormap == "default" else colormap,
        prefer_horizontal=float(prefer_horizontal),
        collocations=collocations,
        stopwords=stopwords,
        mask=mask,
        font_path=font_path,
        # WordCloud only seeds its RNG from an int.
        random_state=None if random_state is None else int(random_state),
    )
    wc.generate_from_frequencies(frequencies)
    img_cloud = wc.to_image()

    preview = sub.rename(columns={text_col: "word", freq_col: "frequency"})
    img_bar = _to_bar_chart_image(preview, topk_bar, font_path=font_path) if show_bar else None
    return img_cloud, img_bar, preview


with gr.Blocks(title="Soft Skills Word Cloud + Bar Chart") as demo:
    gr.Markdown("# Soft Skills Word Cloud + Bar Chart\nUpload a CSV or place **Soft_Skills__Top_5000_.csv** in the repo.")
    with gr.Row():
        with gr.Column(scale=1):
            csv_file = gr.File(label="Upload CSV (optional)", file_count="single", file_types=[".csv"])
            # NOTE(review): both dropdowns only offer "auto", so a custom column
            # cannot currently be selected from the UI.
            text_col = gr.Dropdown(choices=["auto"], value="auto", label="Text/Skill column")
            freq_col = gr.Dropdown(choices=["auto"], value="auto", label="Frequency column")
            max_words = gr.Slider(10, 1000, value=300, step=10, label="Max words")
            background_color = gr.Dropdown(choices=["white", "black"], value="white", label="Background color")
            colormap = gr.Dropdown(
                choices=["default","viridis","plasma","inferno","magma","cividis","terrain","tab20","tab10","Pastel1","Set3"],
                value="default",
                label="Colormap"
            )
            width = gr.Slider(400, 2000, value=1200, step=50, label="Width (px)")
            height = gr.Slider(300, 1500, value=700, step=50, label="Height (px)")
            scale = gr.Slider(1, 5, value=2, step=1, label="Scale")
            prefer_horizontal = gr.Checkbox(value=True, label="Prefer horizontal layout")
            collocations = gr.Checkbox(value=False, label="Allow collocations (word pairs)")
            stopwords_text = gr.Textbox(lines=4, label="Stopwords (one per line)", placeholder="e.g.\nและ\nกับ\nของ\nthe\nand")
            mask_image = gr.Image(type="filepath", label="Mask image (optional)")
            random_state = gr.Slider(0, 100, value=42, step=1, label="Random state (for reproducibility)")
            gr.Markdown("### Bar Chart Options")
            show_bar = gr.Checkbox(value=True, label="Show bar chart")
            topk_bar = gr.Slider(5, 50, value=20, step=1, label="Top K for bar chart")
            run_btn = gr.Button("Generate", variant="primary")
        with gr.Column(scale=1):
            out_img = gr.Image(label="Word Cloud", type="pil")
            out_bar = gr.Image(label="Bar Chart (Top K)", type="pil")
            out_table = gr.Dataframe(label="Top words used", wrap=True)

    run_btn.click(
        fn=generate_wordcloud_and_bar,
        inputs=[csv_file, text_col, freq_col, max_words, background_color, colormap,
                width, height, scale, prefer_horizontal, collocations, stopwords_text,
                mask_image, random_state, show_bar, topk_bar],
        outputs=[out_img, out_bar, out_table],
    )

if __name__ == "__main__":
    demo.launch()