Nucha committed on
Commit
a79f725
·
verified ·
1 Parent(s): 67d52df

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +154 -75
  2. requirements.txt +4 -4
app.py CHANGED
@@ -1,80 +1,159 @@
 
1
  import os
2
  import io
3
- import re
4
- from typing import Dict, Optional, Tuple
5
-
6
-
7
- import gradio as gr
8
  import pandas as pd
 
 
9
  from wordcloud import WordCloud
10
 
11
-
12
- DEFAULT_CSV_PATH = "Soft_Skills__Top_5000.csv" # auto-load if present
13
- DEFAULT_MAX_WORDS = 400
14
- DEFAULT_BG = "white"
15
-
16
-
17
- # --- helpers ---
18
- LIKELY_SKILL_COLS = {"skill", "skills", "soft skill", "soft skills", "name", "keyword", "term"}
19
- LIKELY_COUNT_COLS = {"count", "counts", "frequency", "freq", "weight", "n"}
20
-
21
-
22
-
23
-
24
- def _find_column(cols, candidates):
25
- low = {c.lower(): c for c in cols}
26
- for cand in candidates:
27
- for c in cols:
28
- if cand == c.lower():
29
- return c
30
- # fuzzy contains
31
- for key, orig in low.items():
32
- if cand in key:
33
- return orig
34
- return None
35
-
36
-
37
-
38
-
39
def _load_csv(file_obj: Optional[gr.File]) -> pd.DataFrame:
    """Load a DataFrame from the uploaded file, else from DEFAULT_CSV_PATH.

    Raises:
        FileNotFoundError: when no upload is given and the default CSV is
        not present next to the app.
    """
    if file_obj and hasattr(file_obj, "name") and file_obj.name:
        return pd.read_csv(file_obj.name)
    if os.path.exists(DEFAULT_CSV_PATH):
        return pd.read_csv(DEFAULT_CSV_PATH)
    # Bug fix: the old message named "Soft_Skills__Top_5000_.csv", which does
    # not match DEFAULT_CSV_PATH ("Soft_Skills__Top_5000.csv"); build the
    # message from the constant so they can never drift apart again.
    raise FileNotFoundError(
        f"CSV not provided. Upload a file or add {DEFAULT_CSV_PATH} to the repo."
    )
46
-
47
-
48
-
49
-
50
def _guess_columns(df: pd.DataFrame) -> Tuple[Optional[str], Optional[str]]:
    """Best-effort guess of the (skill, count) columns of *df*.

    The skill column falls back to the first object-dtype column; returns
    None for it when the frame has no text-like column at all.
    """
    skill_col = _find_column(df.columns, LIKELY_SKILL_COLS)
    if not skill_col:
        # Bug fix: the old one-liner indexed [0] on the (possibly empty)
        # list of object-dtype columns, raising IndexError for all-numeric
        # frames; guard and return None instead.
        object_cols = df.select_dtypes(include=["object"]).columns.tolist()
        skill_col = object_cols[0] if object_cols else None
    count_col = _find_column(df.columns, LIKELY_COUNT_COLS)
    return skill_col, count_col
54
-
55
-
56
-
57
-
58
- def _clean_text(s: str) -> str:
59
- if not isinstance(s, str):
60
- s = str(s)
61
- s = s.strip()
62
- # preserve Thai/letters/numbers, replace other punctuation with spaces
63
- s = re.sub(r"[^\w\u0E00-\u0E7F\s\-]+", " ", s)
64
- s = re.sub(r"\s+", " ", s)
65
- return s
66
-
67
-
68
-
69
-
70
- def build_frequencies(df: pd.DataFrame, skill_col: str, count_col: Optional[str], stopwords_text: str) -> pd.DataFrame:
71
- if skill_col not in df.columns:
72
- raise ValueError(f"Skill column '{skill_col}' not found.")
73
-
74
-
75
- tmp = df.copy()
76
- tmp[skill_col] = tmp[skill_col].map(_clean_text)
77
- tmp = tmp.dropna(subset=[skill_col])
78
-
79
-
80
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ \
2
  import os
3
  import io
4
+ import numpy as np
 
 
 
 
5
  import pandas as pd
6
+ from PIL import Image
7
+ import gradio as gr
8
  from wordcloud import WordCloud
9
 
10
+ DEFAULT_CSV = "Soft_Skills__Top_5000_.csv" # Put this file in the Space repo root
11
+ DEFAULT_TEXT_COL = "ทักษะ"
12
+ DEFAULT_FREQ_COL = "จำนวนความถี่ที่พบ"
13
+
14
+ def _load_dataframe(file):
15
+ """
16
+ Load a CSV either from the uploaded file or from DEFAULT_CSV if present.
17
+ """
18
+ if file is not None:
19
+ return pd.read_csv(file.name if hasattr(file, "name") else file)
20
+ if os.path.exists(DEFAULT_CSV):
21
+ return pd.read_csv(DEFAULT_CSV)
22
+ raise gr.Error("No CSV provided and default file not found. Please upload a CSV.")
23
+
24
def _detect_columns(df):
    """Pick the (text, frequency) column pair for *df*.

    Prefers the Thai default column names when both exist; otherwise falls
    back to the frame's first two columns. Raises gr.Error for frames with
    fewer than two columns.
    """
    cols = df.columns
    defaults = (DEFAULT_TEXT_COL, DEFAULT_FREQ_COL)
    if all(name in cols for name in defaults):
        return defaults
    if len(cols) >= 2:
        return cols[0], cols[1]
    raise gr.Error("CSV must have at least 2 columns: [text/skill, frequency].")
31
+
32
def generate_wordcloud(
    csv_file,
    text_col,
    freq_col,
    max_words,
    background_color,
    colormap,
    width,
    height,
    scale,
    prefer_horizontal,
    collocations,
    stopwords_text,
    mask_image,
    random_state
):
    """Render a word cloud plus a preview table from a frequency CSV.

    Returns:
        (PIL.Image.Image, pandas.DataFrame): the rendered cloud and the
        top words/frequencies actually used.
    Raises:
        gr.Error: on missing CSV, unknown columns, empty result, or an
        unreadable mask image.
    """
    df = _load_dataframe(csv_file)

    # Resolve "auto" column choices from the data itself.
    if text_col == "auto" or freq_col == "auto":
        auto_text, auto_freq = _detect_columns(df)
        if text_col == "auto":
            text_col = auto_text
        if freq_col == "auto":
            freq_col = auto_freq

    if text_col not in df.columns or freq_col not in df.columns:
        raise gr.Error(f"Columns not found. Available columns: {list(df.columns)}")

    # Parse stopwords first so they can be applied to the data directly.
    # Bug fix: WordCloud.generate_from_frequencies() ignores the `stopwords`
    # constructor argument (it only applies in generate()/process_text()),
    # so the old code's stopword textbox had no effect at all.
    stopwords = set()
    if stopwords_text:
        for w in stopwords_text.splitlines():
            w = w.strip()
            if w:
                stopwords.add(w)

    # Clean and build the frequency dict.
    sub = df[[text_col, freq_col]].dropna()
    # Coerce frequency to numeric; unparseable cells become 0.
    sub[freq_col] = pd.to_numeric(sub[freq_col], errors="coerce").fillna(0).astype(float)
    # Drop stopworded rows BEFORE taking the top N, so the cloud still holds
    # up to `max_words` real words after filtering.
    if stopwords:
        sub = sub[~sub[text_col].astype(str).str.strip().isin(stopwords)]
    sub = sub.sort_values(freq_col, ascending=False).head(int(max_words))

    frequencies = {str(k): float(v) for k, v in zip(sub[text_col], sub[freq_col]) if str(k).strip()}

    if not frequencies:
        raise gr.Error("No words found after processing. Please check your CSV columns.")

    # Optional mask image, converted to a grayscale numpy array.
    mask = None
    if mask_image is not None:
        try:
            pil = Image.open(mask_image.name if hasattr(mask_image, "name") else mask_image).convert("L")
            mask = np.array(pil)
        except Exception as e:
            raise gr.Error(f"Failed to read mask image: {e}")

    # Prefer a Thai-capable font when one is bundled; wordcloud's default
    # font cannot render Thai glyphs.
    font_path = None
    for cand in ["NotoSansThai-Regular.ttf", "NotoSansThai.ttf", "/usr/share/fonts/truetype/noto/NotoSansThai-Regular.ttf"]:
        if os.path.exists(cand):
            font_path = cand
            break

    wc = WordCloud(
        width=int(width),              # sliders may yield floats; WordCloud wants ints
        height=int(height),
        scale=int(scale),
        max_words=int(max_words),      # bug fix: WordCloud's own cap defaults to 200,
                                       # silently truncating the 10-1000 slider range
        background_color=background_color,
        colormap=None if colormap == "default" else colormap,
        prefer_horizontal=prefer_horizontal,
        collocations=collocations,
        stopwords=stopwords,
        mask=mask,
        font_path=font_path,
        random_state=int(random_state),
    )

    wc.generate_from_frequencies(frequencies)
    img = wc.to_image()

    # Preview table of the words actually used.
    preview = sub.rename(columns={text_col: "word", freq_col: "frequency"})
    return img, preview
117
+
118
# Gradio UI: left column holds all generation controls, right column shows
# the rendered cloud plus the table of words actually used.
with gr.Blocks(title="Soft Skills WordCloud") as demo:
    gr.Markdown("# Soft Skills Word Cloud\nUpload a CSV or place **Soft_Skills__Top_5000_.csv** in the repo.")

    with gr.Row():
        with gr.Column(scale=1):
            csv_file = gr.File(label="Upload CSV (optional)", file_count="single", file_types=[".csv"])
            # Column dropdowns start at "auto"; generate_wordcloud resolves
            # them against the uploaded CSV at run time.
            text_col = gr.Dropdown(choices=["auto"], value="auto", label="Text/Skill column")
            freq_col = gr.Dropdown(choices=["auto"], value="auto", label="Frequency column")
            max_words = gr.Slider(10, 1000, value=300, step=10, label="Max words")
            background_color = gr.Dropdown(
                choices=["white", "black"],
                value="white",
                label="Background color"
            )
            colormap = gr.Dropdown(
                choices=["default","viridis","plasma","inferno","magma","cividis","terrain","tab20","tab10","Pastel1","Set3"],
                value="default",
                label="Colormap"
            )
            width = gr.Slider(400, 2000, value=1200, step=50, label="Width (px)")
            height = gr.Slider(300, 1500, value=700, step=50, label="Height (px)")
            scale = gr.Slider(1, 5, value=2, step=1, label="Scale")
            prefer_horizontal = gr.Checkbox(value=True, label="Prefer horizontal layout")
            collocations = gr.Checkbox(value=False, label="Allow collocations (word pairs)")
            stopwords_text = gr.Textbox(lines=4, label="Stopwords (one per line)", placeholder="e.g.\nและ\nกับ\nของ\nthe\nand")
            # Bug fix: the `tool=` parameter was removed from gr.Image in
            # Gradio 4.x; passing tool=None raises TypeError at startup
            # under the pinned gradio>=4.0.0.
            mask_image = gr.Image(type="filepath", label="Mask image (optional)")
            random_state = gr.Slider(0, 100, value=42, step=1, label="Random state (for reproducibility)")

            run_btn = gr.Button("Generate Word Cloud", variant="primary")

        with gr.Column(scale=1):
            out_img = gr.Image(label="Word Cloud", type="pil")
            out_table = gr.Dataframe(label="Top words used", wrap=True)

    run_btn.click(
        fn=generate_wordcloud,
        inputs=[csv_file, text_col, freq_col, max_words, background_color, colormap, width, height, scale, prefer_horizontal, collocations, stopwords_text, mask_image, random_state],
        outputs=[out_img, out_table],
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
- gradio>=4.44.0
2
- pandas>=2.1.0
3
  wordcloud>=1.9.3
4
- matplotlib>=3.8.0
5
- Pillow>=10.0.0
 
1
+ gradio>=4.0.0
2
+ pandas>=2.0.0
3
  wordcloud>=1.9.3
4
+ numpy
5
+ Pillow