# Spaces: Sleeping -- status text captured from the hosting page; not part of the program.
import io
import os

import gradio as gr
import matplotlib
import numpy as np
import pandas as pd
from PIL import Image
from wordcloud import WordCloud

matplotlib.use("Agg")  # select the non-interactive backend before pyplot is imported
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# CSV that is read when the user does not upload a file.
DEFAULT_CSV = "Soft_Skills__Top_5000_.csv"
# Column names looked for first when column detection is left on "auto".
DEFAULT_TEXT_COL = "ทักษะ"
DEFAULT_FREQ_COL = "จำนวนความถี่ที่พบ"
def _load_dataframe(file):
    """Read the word/frequency table into a DataFrame.

    Args:
        file: An uploaded file. Either a path-like value or an object whose
            ``name`` attribute holds the path. ``None`` means "nothing
            uploaded", in which case ``DEFAULT_CSV`` is used if it exists.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        gr.Error: If no file was given and ``DEFAULT_CSV`` does not exist, or
            if the chosen CSV cannot be read or parsed.
    """
    if file is not None:
        source = getattr(file, "name", file)
    elif os.path.exists(DEFAULT_CSV):
        source = DEFAULT_CSV
    else:
        raise gr.Error("No CSV provided and default file not found. Please upload a CSV.")

    try:
        return pd.read_csv(source)
    except (
        OSError,
        UnicodeDecodeError,
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
    ) as exc:
        # Report unreadable input the same way the mask-image loader does,
        # instead of letting a raw pandas/OS exception escape to the UI.
        raise gr.Error(f"Failed to read CSV: {exc}") from exc
def _detect_columns(df):
    """Choose which columns of *df* hold the words and their frequencies.

    The default column names are used when both are present; otherwise the
    first two columns are taken, in that order.

    Returns:
        A ``(text_column, frequency_column)`` pair of column labels.

    Raises:
        gr.Error: If the defaults are absent and *df* has fewer than two columns.
    """
    columns = df.columns
    has_defaults = DEFAULT_TEXT_COL in columns and DEFAULT_FREQ_COL in columns
    if has_defaults:
        return DEFAULT_TEXT_COL, DEFAULT_FREQ_COL

    if len(columns) < 2:
        raise gr.Error("CSV must have at least 2 columns: [text/skill, frequency].")

    first, second = columns[0], columns[1]
    return first, second
def _to_bar_chart_image(df_words, top_k):
    """Render the first *top_k* rows of *df_words* as a horizontal bar chart.

    Args:
        df_words: DataFrame with ``"word"`` and ``"frequency"`` columns, in
            the order the bars should appear from top to bottom.
        top_k: Number of leading rows to plot (coerced with ``int``).

    Returns:
        A ``PIL.Image.Image`` holding the chart rendered as PNG.
    """
    # Reverse so that the first row ends up at the top of the chart.
    top = df_words.head(int(top_k)).iloc[::-1]
    # Plot against explicit positions: string labels on a categorical axis
    # would merge duplicate words, and non-string values would be treated as
    # numeric coordinates.
    labels = [str(word) for word in top["word"]]
    positions = list(range(len(labels)))

    fig, ax = plt.subplots(figsize=(8, max(3, 0.35 * len(labels))))
    try:
        ax.barh(positions, top["frequency"].to_numpy())
        ax.set_yticks(positions)
        ax.set_yticklabels(labels)
        ax.set_xlabel("Frequency")
        ax.set_ylabel("Word")
        ax.set_title(f"Top {len(labels)} Words by Frequency")

        # Use the same Thai-capable font candidates as the word cloud so that
        # Thai labels are not drawn with a font that lacks those glyphs.
        font_candidates = (
            "NotoSansThai-Regular.ttf",
            "NotoSansThai.ttf",
            "/usr/share/fonts/truetype/noto/NotoSansThai-Regular.ttf",
        )
        font_path = next((p for p in font_candidates if os.path.exists(p)), None)
        if font_path is not None:
            label_font = FontProperties(fname=font_path)
            for tick_label in ax.get_yticklabels():
                tick_label.set_fontproperties(label_font)

        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format="png", dpi=160, bbox_inches="tight")
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)

    buf.seek(0)
    image = Image.open(buf)
    image.load()  # decode now so the image no longer depends on the buffer
    return image
# Fonts tried, in order, so that Thai text can be rendered in the word cloud.
_WORDCLOUD_FONT_CANDIDATES = (
    "NotoSansThai-Regular.ttf",
    "NotoSansThai.ttf",
    "/usr/share/fonts/truetype/noto/NotoSansThai-Regular.ttf",
)


def _parse_stopwords(stopwords_text):
    """Return the non-blank lines of *stopwords_text*, stripped and casefolded."""
    if not stopwords_text:
        return set()
    stripped = (line.strip() for line in stopwords_text.splitlines())
    return {word.casefold() for word in stripped if word}


def _read_mask(mask_image):
    """Load *mask_image* as a greyscale array, or return None when it is None."""
    if mask_image is None:
        return None
    source = getattr(mask_image, "name", mask_image)
    try:
        # The context manager closes the underlying file once it is converted.
        with Image.open(source) as img:
            return np.array(img.convert("L"))
    except Exception as exc:
        # Deliberately broad: any failure here is reported as a user-facing error.
        raise gr.Error(f"Failed to read mask image: {exc}") from exc


def generate_wordcloud_and_bar(
    csv_file,
    text_col,
    freq_col,
    max_words,
    background_color,
    colormap,
    width,
    height,
    scale,
    prefer_horizontal,
    collocations,
    stopwords_text,
    mask_image,
    random_state,
    show_bar,
    topk_bar
):
    """Build the word cloud, the optional bar chart and the table behind them.

    Args:
        csv_file: Uploaded CSV (or ``None`` to fall back to the default file).
        text_col: Name of the word column, or ``"auto"`` to detect it.
        freq_col: Name of the frequency column, or ``"auto"`` to detect it.
        max_words: Maximum number of words to keep (coerced with ``int``).
        background_color: Background colour passed to ``WordCloud``.
        colormap: Colormap name; ``"default"`` means "let WordCloud choose".
        width: Canvas width in pixels (coerced with ``int``).
        height: Canvas height in pixels (coerced with ``int``).
        scale: Scale factor passed to ``WordCloud``.
        prefer_horizontal: Either a boolean (``True`` = horizontal only,
            ``False`` = no orientation preference) or a numeric ratio that is
            passed through as a float.
        collocations: Passed to ``WordCloud``. NOTE: per the wordcloud docs
            this option is ignored when generating from frequencies.
        stopwords_text: Words to exclude, one per line.
        mask_image: Optional mask image (path or object with ``name``).
        random_state: Seed for the layout (coerced with ``int``), or ``None``.
        show_bar: Whether to render the bar chart.
        topk_bar: Number of rows shown in the bar chart.

    Returns:
        ``(word_cloud_image, bar_chart_image_or_None, table)`` where *table*
        is a DataFrame with ``"word"`` and ``"frequency"`` columns.

    Raises:
        gr.Error: If the columns cannot be resolved, if no usable words
            remain, or if the mask image cannot be read.
    """
    df = _load_dataframe(csv_file)

    if text_col == "auto" or freq_col == "auto":
        auto_text, auto_freq = _detect_columns(df)
        if text_col == "auto":
            text_col = auto_text
        if freq_col == "auto":
            freq_col = auto_freq
    if text_col not in df.columns or freq_col not in df.columns:
        raise gr.Error(f"Columns not found. Available columns: {list(df.columns)}")
    if text_col == freq_col:
        raise gr.Error("Text and frequency columns must be different.")

    # UI sliders may deliver floats; these values must be integers downstream.
    max_words = int(max_words)

    # Copy so the column assignment below does not act on a view of `df`.
    sub = df[[text_col, freq_col]].dropna().copy()
    sub[freq_col] = pd.to_numeric(sub[freq_col], errors="coerce").fillna(0).astype(float)

    # WordCloud ignores `stopwords` when generating from frequencies, so the
    # filtering has to happen here. Non-positive frequencies are dropped as
    # well: they cannot be drawn and an all-zero table makes WordCloud divide
    # by zero. Filtering before `head` keeps rejected rows from using up slots.
    stopwords = _parse_stopwords(stopwords_text)
    words = sub[text_col].astype(str).str.strip()
    usable = (sub[freq_col] > 0) & (words != "") & ~words.str.casefold().isin(stopwords)
    sub = sub[usable].sort_values(freq_col, ascending=False).head(max_words)

    frequencies = {
        str(word).strip(): float(freq)
        for word, freq in zip(sub[text_col], sub[freq_col])
    }
    if not frequencies:
        raise gr.Error("No words found after processing. Please check your CSV columns.")

    mask = _read_mask(mask_image)
    font_path = next(
        (path for path in _WORDCLOUD_FONT_CANDIDATES if os.path.exists(path)),
        None,
    )

    # WordCloud expects a ratio here. A bare False would be read as 0 and turn
    # every word vertical, so "not preferred" is mapped to an even split.
    if isinstance(prefer_horizontal, bool):
        horizontal_ratio = 1.0 if prefer_horizontal else 0.5
    else:
        horizontal_ratio = float(prefer_horizontal)

    wc = WordCloud(
        width=int(width),
        height=int(height),
        scale=scale,
        # Without this WordCloud silently caps the cloud at its default of 200.
        max_words=max_words,
        background_color=background_color,
        colormap=None if colormap == "default" else colormap,
        prefer_horizontal=horizontal_ratio,
        collocations=collocations,
        stopwords=stopwords,
        mask=mask,
        font_path=font_path,
        random_state=None if random_state is None else int(random_state),
    )
    wc.generate_from_frequencies(frequencies)
    img_cloud = wc.to_image()

    preview = sub.rename(columns={text_col: "word", freq_col: "frequency"})
    img_bar = _to_bar_chart_image(preview, topk_bar) if show_bar else None
    return img_cloud, img_bar, preview
# Gradio UI: inputs in the left column, generated outputs in the right column.
with gr.Blocks(title="Soft Skills Word Cloud + Bar Chart") as demo:
    gr.Markdown("# Soft Skills Word Cloud + Bar Chart\nUpload a CSV or place **Soft_Skills__Top_5000_.csv** in the repo.")
    with gr.Row():
        with gr.Column(scale=1):
            csv_file = gr.File(label="Upload CSV (optional)", file_count="single", file_types=[".csv"])
            # "auto" is the only preset choice, so custom values must be
            # allowed or a user could never name a column explicitly.
            text_col = gr.Dropdown(
                choices=["auto"],
                value="auto",
                label="Text/Skill column",
                allow_custom_value=True,
            )
            freq_col = gr.Dropdown(
                choices=["auto"],
                value="auto",
                label="Frequency column",
                allow_custom_value=True,
            )
            max_words = gr.Slider(10, 1000, value=300, step=10, label="Max words")
            background_color = gr.Dropdown(choices=["white", "black"], value="white", label="Background color")
            colormap = gr.Dropdown(
                choices=["default", "viridis", "plasma", "inferno", "magma", "cividis", "terrain", "tab20", "tab10", "Pastel1", "Set3"],
                value="default",
                label="Colormap",
            )
            width = gr.Slider(400, 2000, value=1200, step=50, label="Width (px)")
            height = gr.Slider(300, 1500, value=700, step=50, label="Height (px)")
            scale = gr.Slider(1, 5, value=2, step=1, label="Scale")
            prefer_horizontal = gr.Checkbox(value=True, label="Prefer horizontal layout")
            collocations = gr.Checkbox(value=False, label="Allow collocations (word pairs)")
            stopwords_text = gr.Textbox(lines=4, label="Stopwords (one per line)", placeholder="e.g.\nและ\nกับ\nของ\nthe\nand")
            mask_image = gr.Image(type="filepath", label="Mask image (optional)")
            random_state = gr.Slider(0, 100, value=42, step=1, label="Random state (for reproducibility)")
            gr.Markdown("### Bar Chart Options")
            show_bar = gr.Checkbox(value=True, label="Show bar chart")
            topk_bar = gr.Slider(5, 50, value=20, step=1, label="Top K for bar chart")
            run_btn = gr.Button("Generate", variant="primary")
        with gr.Column(scale=1):
            out_img = gr.Image(label="Word Cloud", type="pil")
            out_bar = gr.Image(label="Bar Chart (Top K)", type="pil")
            out_table = gr.Dataframe(label="Top words used", wrap=True)

    # The input order must match the parameter order of generate_wordcloud_and_bar.
    run_btn.click(
        fn=generate_wordcloud_and_bar,
        inputs=[
            csv_file,
            text_col,
            freq_col,
            max_words,
            background_color,
            colormap,
            width,
            height,
            scale,
            prefer_horizontal,
            collocations,
            stopwords_text,
            mask_image,
            random_state,
            show_bar,
            topk_bar,
        ],
        outputs=[out_img, out_bar, out_table],
    )

if __name__ == "__main__":
    demo.launch()