Spaces:

build-small-hackathon
/

hackathon-advisor

Running on Zero

App Files Files Community

JacobLinCool Codex committed on Jun 8

Commit

04ad98e

verified ·

1 Parent(s): ca84660

fix: ignore hosting region tags

Browse files

Co-authored-by: Codex <noreply@openai.com>

Files changed (10) hide show

app.py +37 -3
hackathon_advisor/dashboard.py +164 -17
hackathon_advisor/data.py +22 -1
hackathon_advisor/quest_analysis.py +2 -2
scripts/crawl_hf_spaces.py +3 -2
static/app.js +5 -1
tests/test_app.py +18 -0
tests/test_crawl_hf_spaces.py +2 -1
tests/test_dashboard.py +70 -2
tests/test_data.py +17 -0

app.py CHANGED Viewed

@@ -10,6 +10,7 @@ import sys
 import tempfile
 from threading import Lock, Thread
 import time
 from typing import Any, Iterator
 from uuid import uuid4
@@ -29,7 +30,13 @@ from hackathon_advisor.dashboard_storage import (
     persist_refresh_artifacts,
     require_writable_cache_dir,
 )
-from hackathon_advisor.data import DEFAULT_EMBEDDING_MODEL_FILE, DEFAULT_EMBEDDING_MODEL_REPO, Project, ProjectIndex
 from hackathon_advisor.demo_rehearsal import build_demo_rehearsal
 from hackathon_advisor.model_runtime import create_tool_planner
 from hackathon_advisor.profiling import (
@@ -242,11 +249,13 @@ def _run_refresh_job(run_id: str, cache_dir: Path) -> None:
             },
         )
     except Exception as error:  # noqa: BLE001 - background job must report every failure as state
         _set_refresh_state(
             status="failed",
             stage="",
             finished_at=datetime.now(timezone.utc).isoformat(timespec="seconds"),
-            error=str(error),
             result=None,
         )
@@ -414,6 +423,17 @@ def _format_output_tail(output_tail: list[str]) -> str:
     return "\n".join(output_tail) if output_tail else "(no output)"
 def _replace_runtime_from_files(projects_path: Path, index_path: Path, refreshed_dashboard: dict[str, Any]) -> None:
     global index, engine, _cpu_engine, dashboard_payload
     new_index = ProjectIndex.from_files(projects_path, index_path)
@@ -425,6 +445,20 @@ def _replace_runtime_from_files(projects_path: Path, index_path: Path, refreshed
         dashboard_payload = refreshed_dashboard
 def _session_from_json(session_json: str = "{}") -> dict[str, Any]:
     try:
         session = json.loads(session_json or "{}")
@@ -521,7 +555,7 @@ def static_file(path: str) -> FileResponse:
 @app.get("/api/dashboard")
 def dashboard() -> dict:
     with _runtime_lock:
-        payload = dict(dashboard_payload)
     payload["refresh"] = _refresh_public_state()
     return payload

 import tempfile
 from threading import Lock, Thread
 import time
+import traceback
 from typing import Any, Iterator
 from uuid import uuid4
     persist_refresh_artifacts,
     require_writable_cache_dir,
 )
+from hackathon_advisor.data import (
+    DEFAULT_EMBEDDING_MODEL_FILE,
+    DEFAULT_EMBEDDING_MODEL_REPO,
+    Project,
+    ProjectIndex,
+    normalize_project_tags,
+)
 from hackathon_advisor.demo_rehearsal import build_demo_rehearsal
 from hackathon_advisor.model_runtime import create_tool_planner
 from hackathon_advisor.profiling import (
             },
         )
     except Exception as error:  # noqa: BLE001 - background job must report every failure as state
+        print("[dashboard-refresh] failed", flush=True)
+        traceback.print_exception(type(error), error, error.__traceback__)
         _set_refresh_state(
             status="failed",
             stage="",
             finished_at=datetime.now(timezone.utc).isoformat(timespec="seconds"),
+            error=_format_refresh_error(error),
             result=None,
         )
     return "\n".join(output_tail) if output_tail else "(no output)"
+def _format_refresh_error(error: BaseException) -> str:
+    parts = [f"{type(error).__name__}: {error}"]
+    cause = error.__cause__
+    if cause is not None:
+        parts.append(f"caused by {type(cause).__name__}: {cause}")
+    context = error.__context__
+    if context is not None and context is not cause:
+        parts.append(f"context {type(context).__name__}: {context}")
+    return "; ".join(parts)
 def _replace_runtime_from_files(projects_path: Path, index_path: Path, refreshed_dashboard: dict[str, Any]) -> None:
     global index, engine, _cpu_engine, dashboard_payload
     new_index = ProjectIndex.from_files(projects_path, index_path)
         dashboard_payload = refreshed_dashboard
+def _public_dashboard_payload(payload: dict[str, Any]) -> dict[str, Any]:
+    public_payload = dict(payload)
+    public_payload["points"] = [_public_dashboard_point(point) for point in payload.get("points") or []]
+    return public_payload
+def _public_dashboard_point(point: Any) -> dict[str, Any]:
+    if not isinstance(point, dict):
+        return {}
+    public_point = dict(point)
+    public_point["tags"] = list(normalize_project_tags(public_point.get("tags") or []))
+    return public_point
 def _session_from_json(session_json: str = "{}") -> dict[str, Any]:
     try:
         session = json.loads(session_json or "{}")
 @app.get("/api/dashboard")
 def dashboard() -> dict:
     with _runtime_lock:
+        payload = _public_dashboard_payload(dashboard_payload)
     payload["refresh"] = _refresh_public_state()
     return payload

hackathon_advisor/dashboard.py CHANGED Viewed

@@ -6,7 +6,14 @@ from datetime import datetime, timezone
 import math
 from typing import Any
-from hackathon_advisor.data import Project, ProjectIndex, public_project_summary, public_project_title, tokenize
 from hackathon_advisor.quest_taxonomy import QUESTS, normalize_match, quest_profiles
@@ -14,12 +21,30 @@ DASHBOARD_SCHEMA_VERSION = 1
 TSNE_RANDOM_STATE = 42
 TSNE_MIN_PROJECTS = 3
 LINKS_PER_PROJECT = 2
 STOPWORDS = {
     "agent",
     "app",
     "assistant",
     "build",
     "demo",
     "face",
     "for",
@@ -27,13 +52,55 @@ STOPWORDS = {
     "gradio",
     "hackathon",
     "hugging",
     "local",
     "model",
     "project",
     "small",
     "space",
     "this",
     "with",
 }
@@ -79,6 +146,7 @@ def build_dashboard_payload(
             "random_state": TSNE_RANDOM_STATE,
             "perplexity": _tsne_perplexity(len(projects)),
         },
         "points": points,
         "links": links,
         "clusters": clusters,
@@ -205,15 +273,25 @@ def _cluster_payloads(
     )
     cluster_id_by_raw = {label: f"cluster-{position + 1}" for position, label in enumerate(ordered_raw_labels)}
     clusters: list[dict[str, Any]] = []
     for raw_label in ordered_raw_labels:
         indexes = grouped[raw_label]
-        keywords = _cluster_keywords(projects[index] for index in indexes)
-        label = " / ".join(word.title() for word in keywords[:2]) if keywords else "Project cluster"
         representatives = sorted(
-            (projects[index] for index in indexes),
             key=lambda project: (project.likes, project.last_modified, project.title.lower()),
             reverse=True,
         )[:4]
         clusters.append(
             {
                 "id": cluster_id_by_raw[raw_label],
@@ -237,20 +315,89 @@ def _cluster_center(coordinates: Sequence[tuple[float, float]], indexes: Sequenc
     )
-def _cluster_keywords(projects: Sequence[Project]) -> list[str]:
-    counts: Counter[str] = Counter()
     for project in projects:
-        text = " ".join(
-            [
-                project.title,
-                project.slug.replace("-", " ").replace("_", " "),
-                project.summary,
-                " ".join(project.tags),
-                " ".join(project.models),
-            ]
         )
-        counts.update(token for token in tokenize(text) if token not in STOPWORDS and not token.startswith("region"))
-    return [token for token, _count in counts.most_common(5)]
 def _normalize_quest_matches(
@@ -302,7 +449,7 @@ def _point_payloads(
                 "likes": project.likes,
                 "sdk": project.sdk,
                 "models": list(project.models),
-                "tags": list(project.tags),
                 "last_modified": project.last_modified,
                 "x": x,
                 "y": y,

 import math
 from typing import Any
+from hackathon_advisor.data import (
+    Project,
+    ProjectIndex,
+    normalize_project_tags,
+    public_project_summary,
+    public_project_title,
+    tokenize,
+)
 from hackathon_advisor.quest_taxonomy import QUESTS, normalize_match, quest_profiles
 TSNE_RANDOM_STATE = 42
 TSNE_MIN_PROJECTS = 3
 LINKS_PER_PROJECT = 2
+CLUSTER_LABEL_ALGORITHM = "distinctive-keywords-v1"
 STOPWORDS = {
+    "about",
     "agent",
     "app",
+    "apps",
+    "ai",
+    "all",
+    "an",
+    "and",
+    "are",
+    "as",
+    "at",
+    "before",
     "assistant",
+    "be",
+    "been",
+    "being",
     "build",
+    "build-small",
+    "build-small-hackathon",
+    "built",
+    "by",
     "demo",
     "face",
     "for",
     "gradio",
     "hackathon",
     "hugging",
+    "huggingface",
+    "in",
+    "is",
+    "it",
+    "its",
+    "first",
     "local",
+    "make",
+    "makes",
+    "made",
+    "me",
     "model",
+    "models",
+    "my",
+    "of",
+    "on",
+    "or",
+    "our",
+    "one",
     "project",
+    "projects",
+    "pro",
+    "region",
+    "run",
+    "runs",
     "small",
     "space",
+    "spaces",
+    "submission",
+    "the",
+    "their",
+    "them",
+    "these",
+    "they",
     "this",
+    "those",
+    "to",
+    "tool",
+    "tools",
+    "try",
+    "us",
+    "use",
+    "used",
+    "uses",
+    "using",
+    "we",
     "with",
+    "you",
+    "your",
 }
             "random_state": TSNE_RANDOM_STATE,
             "perplexity": _tsne_perplexity(len(projects)),
         },
+        "cluster_label_algorithm": CLUSTER_LABEL_ALGORITHM,
         "points": points,
         "links": links,
         "clusters": clusters,
     )
     cluster_id_by_raw = {label: f"cluster-{position + 1}" for position, label in enumerate(ordered_raw_labels)}
     clusters: list[dict[str, Any]] = []
+    corpus_document_frequency = _corpus_document_frequency(projects)
     for raw_label in ordered_raw_labels:
         indexes = grouped[raw_label]
+        cluster_projects = [projects[index] for index in indexes]
         representatives = sorted(
+            cluster_projects,
             key=lambda project: (project.likes, project.last_modified, project.title.lower()),
             reverse=True,
         )[:4]
+        keywords = _cluster_keywords(
+            cluster_projects,
+            corpus_document_frequency=corpus_document_frequency,
+            corpus_project_count=len(projects),
+        )
+        label = (
+            " / ".join(word.title() for word in keywords[:2])
+            if keywords
+            else _representative_cluster_label(representatives)
+        )
         clusters.append(
             {
                 "id": cluster_id_by_raw[raw_label],
     )
+def _corpus_document_frequency(projects: Sequence[Project]) -> Counter[str]:
+    document_frequency: Counter[str] = Counter()
     for project in projects:
+        document_frequency.update(set(_project_keyword_tokens(project)))
+    return document_frequency
+def _cluster_keywords(
+    projects: Sequence[Project],
+    *,
+    corpus_document_frequency: Mapping[str, int],
+    corpus_project_count: int,
+) -> list[str]:
+    counts: Counter[str] = Counter()
+    document_frequency: Counter[str] = Counter()
+    project_list = list(projects)
+    for project in project_list:
+        tokens = _project_keyword_tokens(project)
+        counts.update(tokens)
+        document_frequency.update(set(tokens))
+    if not project_list:
+        return []
+    min_cluster_documents = 1 if len(project_list) <= 3 else 2
+    scored: list[tuple[float, int, int, str]] = []
+    for token, count in counts.items():
+        cluster_documents = document_frequency[token]
+        if cluster_documents < min_cluster_documents:
+            continue
+        corpus_documents = int(corpus_document_frequency.get(token) or 0)
+        if corpus_documents <= 0:
+            continue
+        inverse_document_frequency = math.log((1 + corpus_project_count) / (1 + corpus_documents))
+        if inverse_document_frequency <= 0.0:
+            continue
+        exclusivity = cluster_documents / corpus_documents
+        coverage = cluster_documents / len(project_list)
+        score = (
+            (1.0 + math.log(count))
+            * inverse_document_frequency
+            * (0.35 + 0.65 * exclusivity)
+            * (0.35 + 0.65 * coverage)
         )
+        scored.append((score, cluster_documents, count, token))
+    scored.sort(key=lambda item: (-item[0], -item[1], -item[2], item[3]))
+    return [token for _score, _cluster_documents, _count, token in scored[:5]]
+def _project_keyword_tokens(project: Project) -> list[str]:
+    text = " ".join(
+        [
+            project.title,
+            project.slug.replace("-", " ").replace("_", " "),
+            project.summary,
+            " ".join(normalize_project_tags(project.tags)),
+            " ".join(project.models),
+        ]
+    )
+    return [token for token in tokenize(text) if _is_cluster_keyword(token)]
+def _is_cluster_keyword(token: str) -> bool:
+    if token in STOPWORDS:
+        return False
+    if token.startswith("region"):
+        return False
+    if token.isdigit():
+        return False
+    return True
+def _representative_cluster_label(projects: Sequence[Project]) -> str:
+    labels: list[str] = []
+    for project in projects:
+        title = public_project_title(project.title)
+        if title == "Untitled project":
+            continue
+        labels.append(title)
+        if len(labels) == 2:
+            break
+    return " / ".join(labels) if labels else "Mixed projects"
 def _normalize_quest_matches(
                 "likes": project.likes,
                 "sdk": project.sdk,
                 "models": list(project.models),
+                "tags": list(normalize_project_tags(project.tags)),
                 "last_modified": project.last_modified,
                 "x": x,
                 "y": y,

hackathon_advisor/data.py CHANGED Viewed

@@ -32,6 +32,7 @@ DEFAULT_EMBEDDING_MODEL_REPO = "ggml-org/embeddinggemma-300m-qat-q8_0-GGUF"
 DEFAULT_EMBEDDING_MODEL_FILE = "embeddinggemma-300m-qat-Q8_0.gguf"
 DEFAULT_EMBEDDING_RUNTIME = "llama.cpp via llama-cpp-python"
 APP_FILE_EMBEDDING_CHAR_LIMIT = 2000
 EmbeddingFunction = Callable[[str], Sequence[float]]
@@ -108,7 +109,7 @@ class Project:
             "id": self.id,
             "title": public_project_title(self.title),
             "summary": public_project_summary(self.summary),
-            "tags": list(self.tags),
             "models": list(self.models),
             "datasets": list(self.datasets),
             "likes": self.likes,
@@ -150,6 +151,7 @@ class Project:
         )
         return payload
 @dataclass(frozen=True)
 class SearchHit:
     project: Project
@@ -185,6 +187,25 @@ def public_project_title(title: str) -> str:
     return cleaned
 def public_project_summary(summary: str) -> str:
     cleaned = " ".join(str(summary).split())
     if not cleaned:

 DEFAULT_EMBEDDING_MODEL_FILE = "embeddinggemma-300m-qat-Q8_0.gguf"
 DEFAULT_EMBEDDING_RUNTIME = "llama.cpp via llama-cpp-python"
 APP_FILE_EMBEDDING_CHAR_LIMIT = 2000
+HOSTING_METADATA_TAG_PREFIXES = ("region:",)
 EmbeddingFunction = Callable[[str], Sequence[float]]
             "id": self.id,
             "title": public_project_title(self.title),
             "summary": public_project_summary(self.summary),
+            "tags": list(normalize_project_tags(self.tags)),
             "models": list(self.models),
             "datasets": list(self.datasets),
             "likes": self.likes,
         )
         return payload
 @dataclass(frozen=True)
 class SearchHit:
     project: Project
     return cleaned
+def normalize_project_tags(tags: Sequence[Any]) -> tuple[str, ...]:
+    cleaned: list[str] = []
+    seen: set[str] = set()
+    for raw_tag in tags or ():
+        tag = " ".join(str(raw_tag or "").split())
+        if not tag or is_hosting_metadata_tag(tag):
+            continue
+        if tag in seen:
+            continue
+        seen.add(tag)
+        cleaned.append(tag)
+    return tuple(cleaned)
+def is_hosting_metadata_tag(tag: str) -> bool:
+    folded = str(tag or "").strip().casefold()
+    return any(folded.startswith(prefix) for prefix in HOSTING_METADATA_TAG_PREFIXES)
 def public_project_summary(summary: str) -> str:
     cleaned = " ".join(str(summary).split())
     if not cleaned:

hackathon_advisor/quest_analysis.py CHANGED Viewed

@@ -7,7 +7,7 @@ import json
 import os
 from typing import Any, Protocol
-from hackathon_advisor.data import Project
 from hackathon_advisor.model_runtime import (
     DEFAULT_MODEL_ID,
     _minicpm_generation_kwargs,
@@ -317,7 +317,7 @@ def render_project_quest_prompt(project: Project) -> str:
         title=project.title,
         sdk=project.sdk,
         declared_models=project.models,
-        tags=project.tags,
         readme_segment=build_readme_segment(project.readme_body),
         app_file_name=project.app_file,
         app_file_segment=build_app_segment(project.app_file_source, project.app_file_embedding_text),

 import os
 from typing import Any, Protocol
+from hackathon_advisor.data import Project, normalize_project_tags
 from hackathon_advisor.model_runtime import (
     DEFAULT_MODEL_ID,
     _minicpm_generation_kwargs,
         title=project.title,
         sdk=project.sdk,
         declared_models=project.models,
+        tags=normalize_project_tags(project.tags),
         readme_segment=build_readme_segment(project.readme_body),
         app_file_name=project.app_file,
         app_file_segment=build_app_segment(project.app_file_source, project.app_file_embedding_text),

scripts/crawl_hf_spaces.py CHANGED Viewed

@@ -19,7 +19,7 @@ from huggingface_hub.errors import EntryNotFoundError
 ROOT = Path(__file__).resolve().parents[1]
 sys.path.insert(0, str(ROOT))
-from hackathon_advisor.data import extract_app_file_embedding_text
 API = "https://huggingface.co/api"
@@ -88,11 +88,12 @@ def project_from_space(space: Any) -> dict[str, Any]:
     title = str(card.get("title") or humanize_slug(space_id.rsplit("/", 1)[-1]))
     summary = str(card.get("short_description") or card.get("description") or "")
     return {
         "id": space_id,
         "title": title,
         "summary": summary,
-        "tags": sorted(set(str(tag) for tag in (card.get("tags") or getattr(space, "tags", None) or []))),
         "models": [str(model) for model in getattr(space, "models", None) or card.get("models") or []],
         "datasets": [
             str(dataset) for dataset in getattr(space, "datasets", None) or card.get("datasets") or []

 ROOT = Path(__file__).resolve().parents[1]
 sys.path.insert(0, str(ROOT))
+from hackathon_advisor.data import extract_app_file_embedding_text, normalize_project_tags
 API = "https://huggingface.co/api"
     title = str(card.get("title") or humanize_slug(space_id.rsplit("/", 1)[-1]))
     summary = str(card.get("short_description") or card.get("description") or "")
+    raw_tags = sorted(set(str(tag) for tag in (card.get("tags") or getattr(space, "tags", None) or [])))
     return {
         "id": space_id,
         "title": title,
         "summary": summary,
+        "tags": list(normalize_project_tags(raw_tags)),
         "models": [str(model) for model in getattr(space, "models", None) or card.get("models") or []],
         "datasets": [
             str(dataset) for dataset in getattr(space, "datasets", None) or card.get("datasets") or []

static/app.js CHANGED Viewed

@@ -498,7 +498,7 @@ function renderAtlasDetail(point) {
       return `<span>${escapeHtml(atlasQuestLabel(match.quest))} ${confidence}%</span>`;
     })
     .join("");
-  const tags = [...(point.models || []).slice(0, 3), ...(point.tags || []).slice(0, 3)]
     .map((tag) => `<span>${escapeHtml(tag)}</span>`)
     .join("");
   atlasDetailEl.innerHTML = `
@@ -511,6 +511,10 @@ function renderAtlasDetail(point) {
   `;
 }
 function renderAtlasReport(data) {
   if (!atlasReportEl) return;
   const cluster = selectedClusterId

       return `<span>${escapeHtml(atlasQuestLabel(match.quest))} ${confidence}%</span>`;
     })
     .join("");
+  const tags = [...(point.models || []).slice(0, 3), ...visibleProjectTags(point.tags || []).slice(0, 3)]
     .map((tag) => `<span>${escapeHtml(tag)}</span>`)
     .join("");
   atlasDetailEl.innerHTML = `
   `;
 }
+function visibleProjectTags(tags) {
+  return (tags || []).filter((tag) => !String(tag || "").toLowerCase().startsWith("region:"));
+}
 function renderAtlasReport(data) {
   if (!atlasReportEl) return;
   const cluster = selectedClusterId

tests/test_app.py CHANGED Viewed

@@ -133,6 +133,24 @@ def test_dashboard_endpoint_exposes_atlas_payload() -> None:
     assert payload["links"]
     assert payload["quest_report"]["status"] in {"analyzed", "not_analyzed"}
     assert payload["refresh"]["status"] in {"idle", "running", "succeeded", "failed"}
 def test_dashboard_refresh_requires_bucket(monkeypatch) -> None:

     assert payload["links"]
     assert payload["quest_report"]["status"] in {"analyzed", "not_analyzed"}
     assert payload["refresh"]["status"] in {"idle", "running", "succeeded", "failed"}
+    assert all(
+        not str(tag).casefold().startswith("region:")
+        for point in payload["points"]
+        for tag in point.get("tags", [])
+    )
+def test_refresh_error_format_includes_exception_chain() -> None:
+    try:
+        try:
+            raise ValueError("bad quest")
+        except ValueError as cause:
+            raise RuntimeError("refresh failed") from cause
+    except RuntimeError as error:
+        message = app_module._format_refresh_error(error)
+    assert "RuntimeError: refresh failed" in message
+    assert "caused by ValueError: bad quest" in message
 def test_dashboard_refresh_requires_bucket(monkeypatch) -> None:

tests/test_crawl_hf_spaces.py CHANGED Viewed

@@ -44,7 +44,7 @@ def test_project_from_space_downloads_frontmatter_app_file(monkeypatch) -> None:
             SimpleNamespace(rfilename="README.md"),
             SimpleNamespace(rfilename="app.py"),
         ],
-        tags=["gradio"],
         models=[],
         datasets=[],
         likes=3,
@@ -61,6 +61,7 @@ def test_project_from_space_downloads_frontmatter_app_file(monkeypatch) -> None:
     assert project["app_file_source"] == "import gradio as gr\ngr.Textbox(label='Idea')\n"
     assert "gr.Textbox" in project["app_file_embedding_text"]
     assert "Idea" in project["app_file_embedding_text"]
 def test_project_from_space_tolerates_stale_frontmatter_app_file(monkeypatch) -> None:

             SimpleNamespace(rfilename="README.md"),
             SimpleNamespace(rfilename="app.py"),
         ],
+        tags=["gradio", "region:us"],
         models=[],
         datasets=[],
         likes=3,
     assert project["app_file_source"] == "import gradio as gr\ngr.Textbox(label='Idea')\n"
     assert "gr.Textbox" in project["app_file_embedding_text"]
     assert "Idea" in project["app_file_embedding_text"]
+    assert project["tags"] == ["gradio"]
 def test_project_from_space_tolerates_stale_frontmatter_app_file(monkeypatch) -> None:

tests/test_dashboard.py CHANGED Viewed

@@ -2,7 +2,11 @@ from __future__ import annotations
 from pathlib import Path
-from hackathon_advisor.dashboard import build_dashboard_payload, validate_dashboard_payload
 from hackathon_advisor.data import Project, ProjectIndex, build_index_payload
 from hackathon_advisor.quest_analysis import (
     MiniCPMQuestAnalyzer,
@@ -45,6 +49,7 @@ def test_dashboard_builder_projects_embeddings_with_tsne_and_clusters() -> None:
     assert payload["quest_report"]["status"] == "analyzed"
     assert all(0 <= point["x"] <= 100 and 0 <= point["y"] <= 100 for point in payload["points"])
     assert all(point["quest_ids"] for point in payload["points"])
 def test_dashboard_builder_is_deterministic_for_fixed_vectors() -> None:
@@ -59,6 +64,18 @@ def test_dashboard_builder_is_deterministic_for_fixed_vectors() -> None:
     assert left["clusters"] == right["clusters"]
 def test_quest_analysis_validation_accepts_strict_project_coverage() -> None:
     projects = fake_projects(4)
     raw = {
@@ -343,7 +360,7 @@ def test_quest_prompt_uses_raw_readme_and_app_source_segments() -> None:
         id="build-small-hackathon/two-segment",
         title="Two Segment",
         summary="card summary should not drive quest analysis",
-        tags=("gradio",),
         models=("openbmb/MiniCPM5-1B",),
         datasets=(),
         likes=1,
@@ -367,6 +384,7 @@ def test_quest_prompt_uses_raw_readme_and_app_source_segments() -> None:
     assert "from llama_cpp import Llama" in prompt
     assert "card summary should not drive quest analysis" not in prompt
     assert "compact app signals should not drive quest analysis" not in prompt
 def test_quest_analyzer_rejects_non_minicpm_backend(monkeypatch) -> None:
@@ -400,6 +418,56 @@ def fake_index() -> ProjectIndex:
     )
 def fake_projects(count: int) -> list[Project]:
     return [
         Project(

 from pathlib import Path
+from hackathon_advisor.dashboard import (
+    CLUSTER_LABEL_ALGORITHM,
+    build_dashboard_payload,
+    validate_dashboard_payload,
+)
 from hackathon_advisor.data import Project, ProjectIndex, build_index_payload
 from hackathon_advisor.quest_analysis import (
     MiniCPMQuestAnalyzer,
     assert payload["quest_report"]["status"] == "analyzed"
     assert all(0 <= point["x"] <= 100 and 0 <= point["y"] <= 100 for point in payload["points"])
     assert all(point["quest_ids"] for point in payload["points"])
+    assert payload["cluster_label_algorithm"] == CLUSTER_LABEL_ALGORITHM
 def test_dashboard_builder_is_deterministic_for_fixed_vectors() -> None:
     assert left["clusters"] == right["clusters"]
+def test_dashboard_cluster_labels_ignore_hackathon_wide_noise() -> None:
+    index = noisy_cluster_label_index()
+    payload = build_dashboard_payload(index, generated_at="2026-06-08T00:00:00+00:00")
+    banned = {"ai", "build-small-hackathon", "gradio", "hackathon", "project", "region", "us"}
+    keywords = {keyword for cluster in payload["clusters"] for keyword in cluster["keywords"]}
+    assert keywords.isdisjoint(banned)
+    assert {"dream", "family", "garden", "notice", "order", "repair"} & keywords
+    assert all("region:us" not in point["tags"] for point in payload["points"])
 def test_quest_analysis_validation_accepts_strict_project_coverage() -> None:
     projects = fake_projects(4)
     raw = {
         id="build-small-hackathon/two-segment",
         title="Two Segment",
         summary="card summary should not drive quest analysis",
+        tags=("gradio", "region:us"),
         models=("openbmb/MiniCPM5-1B",),
         datasets=(),
         likes=1,
     assert "from llama_cpp import Llama" in prompt
     assert "card summary should not drive quest analysis" not in prompt
     assert "compact app signals should not drive quest analysis" not in prompt
+    assert "region:us" not in prompt
 def test_quest_analyzer_rejects_non_minicpm_backend(monkeypatch) -> None:
     )
+def noisy_cluster_label_index() -> ProjectIndex:
+    themes = [
+        ("dream", ("Dream Lantern", "Dream Atlas"), "dream journal symbolic oracle"),
+        ("family", ("Family Ledger", "Care Kinship"), "family care bill coordination"),
+        ("garden", ("Garden Notebook", "Seed Exchange"), "garden seed neighborhood plants"),
+        ("notice", ("Notice Helper", "Scam Screen"), "notice scam safety verification"),
+        ("order", ("Order Desk", "Inventory Voice"), "order inventory audio assistant"),
+        ("repair", ("Repair Coach", "Tool Shed"), "repair maintenance workshop"),
+    ]
+    projects: list[Project] = []
+    embeddings = []
+    for theme_index, (theme, titles, summary) in enumerate(themes):
+        for title in titles:
+            projects.append(
+                Project(
+                    id=f"build-small-hackathon/{title.lower().replace(' ', '-')}",
+                    title=title,
+                    summary=(
+                        f"{summary} for a build-small-hackathon AI project in the US region "
+                        "with a Gradio demo."
+                    ),
+                    tags=("build-small-hackathon", "ai", "gradio", "region:us", theme),
+                    models=("tiny-model",),
+                    datasets=(),
+                    likes=theme_index,
+                    sdk="gradio",
+                    license="mit",
+                    created_at="2026-06-01T00:00:00+00:00",
+                    last_modified=f"2026-06-{theme_index + 1:02d}T00:00:00+00:00",
+                    host=f"https://{title.lower().replace(' ', '-')}.hf.space",
+                    url=f"https://huggingface.co/spaces/build-small-hackathon/{title.lower().replace(' ', '-')}",
+                    app_file="app.py",
+                    app_file_embedding_text="shared local small model app",
+                )
+            )
+            vector = [0.0] * len(themes)
+            vector[theme_index] = 1.0
+            embeddings.append(vector)
+    snapshot_generated_at = "2026-06-08T00:00:00+00:00"
+    source = "https://example.test/spaces"
+    payload = build_index_payload(projects, snapshot_generated_at, source, embeddings)
+    return ProjectIndex(
+        projects=projects,
+        generated_at=snapshot_generated_at,
+        source=source,
+        index_payload=payload,
+    )
 def fake_projects(count: int) -> list[Project]:
     return [
         Project(

tests/test_data.py CHANGED Viewed

@@ -85,6 +85,23 @@ def test_searchable_text_includes_main_app_file_signals() -> None:
     assert "Project idea" in searchable
 def test_searchable_text_excludes_refresh_readme_body_for_stable_reuse() -> None:
     project = Project(
         id="build-small-hackathon/long-readme",

     assert "Project idea" in searchable
+def test_public_project_tags_exclude_hosting_metadata() -> None:
+    project = Project.from_dict(
+        {
+            "id": "build-small-hackathon/idea-canvas",
+            "title": "Idea Canvas",
+            "summary": "",
+            "tags": ["gradio", "region:us", "local-first", "region:eu", "gradio"],
+            "models": [],
+            "datasets": [],
+            "url": "https://example.test",
+        }
+    )
+    assert project.tags == ("gradio", "region:us", "local-first", "region:eu", "gradio")
+    assert project.to_public_dict()["tags"] == ["gradio", "local-first"]
 def test_searchable_text_excludes_refresh_readme_body_for_stable_reuse() -> None:
     project = Project(
         id="build-small-hackathon/long-readme",