JacobLinCool Codex committed on
Commit
04ad98e
·
verified ·
1 Parent(s): ca84660

fix: ignore hosting region tags

Browse files

Co-authored-by: Codex <noreply@openai.com>

app.py CHANGED
@@ -10,6 +10,7 @@ import sys
10
  import tempfile
11
  from threading import Lock, Thread
12
  import time
 
13
  from typing import Any, Iterator
14
  from uuid import uuid4
15
 
@@ -29,7 +30,13 @@ from hackathon_advisor.dashboard_storage import (
29
  persist_refresh_artifacts,
30
  require_writable_cache_dir,
31
  )
32
- from hackathon_advisor.data import DEFAULT_EMBEDDING_MODEL_FILE, DEFAULT_EMBEDDING_MODEL_REPO, Project, ProjectIndex
 
 
 
 
 
 
33
  from hackathon_advisor.demo_rehearsal import build_demo_rehearsal
34
  from hackathon_advisor.model_runtime import create_tool_planner
35
  from hackathon_advisor.profiling import (
@@ -242,11 +249,13 @@ def _run_refresh_job(run_id: str, cache_dir: Path) -> None:
242
  },
243
  )
244
  except Exception as error: # noqa: BLE001 - background job must report every failure as state
 
 
245
  _set_refresh_state(
246
  status="failed",
247
  stage="",
248
  finished_at=datetime.now(timezone.utc).isoformat(timespec="seconds"),
249
- error=str(error),
250
  result=None,
251
  )
252
 
@@ -414,6 +423,17 @@ def _format_output_tail(output_tail: list[str]) -> str:
414
  return "\n".join(output_tail) if output_tail else "(no output)"
415
 
416
 
 
 
 
 
 
 
 
 
 
 
 
417
  def _replace_runtime_from_files(projects_path: Path, index_path: Path, refreshed_dashboard: dict[str, Any]) -> None:
418
  global index, engine, _cpu_engine, dashboard_payload
419
  new_index = ProjectIndex.from_files(projects_path, index_path)
@@ -425,6 +445,20 @@ def _replace_runtime_from_files(projects_path: Path, index_path: Path, refreshed
425
  dashboard_payload = refreshed_dashboard
426
 
427
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
  def _session_from_json(session_json: str = "{}") -> dict[str, Any]:
429
  try:
430
  session = json.loads(session_json or "{}")
@@ -521,7 +555,7 @@ def static_file(path: str) -> FileResponse:
521
  @app.get("/api/dashboard")
522
  def dashboard() -> dict:
523
  with _runtime_lock:
524
- payload = dict(dashboard_payload)
525
  payload["refresh"] = _refresh_public_state()
526
  return payload
527
 
 
10
  import tempfile
11
  from threading import Lock, Thread
12
  import time
13
+ import traceback
14
  from typing import Any, Iterator
15
  from uuid import uuid4
16
 
 
30
  persist_refresh_artifacts,
31
  require_writable_cache_dir,
32
  )
33
+ from hackathon_advisor.data import (
34
+ DEFAULT_EMBEDDING_MODEL_FILE,
35
+ DEFAULT_EMBEDDING_MODEL_REPO,
36
+ Project,
37
+ ProjectIndex,
38
+ normalize_project_tags,
39
+ )
40
  from hackathon_advisor.demo_rehearsal import build_demo_rehearsal
41
  from hackathon_advisor.model_runtime import create_tool_planner
42
  from hackathon_advisor.profiling import (
 
249
  },
250
  )
251
  except Exception as error: # noqa: BLE001 - background job must report every failure as state
252
+ print("[dashboard-refresh] failed", flush=True)
253
+ traceback.print_exception(type(error), error, error.__traceback__)
254
  _set_refresh_state(
255
  status="failed",
256
  stage="",
257
  finished_at=datetime.now(timezone.utc).isoformat(timespec="seconds"),
258
+ error=_format_refresh_error(error),
259
  result=None,
260
  )
261
 
 
423
  return "\n".join(output_tail) if output_tail else "(no output)"
424
 
425
 
426
+ def _format_refresh_error(error: BaseException) -> str:
427
+ parts = [f"{type(error).__name__}: {error}"]
428
+ cause = error.__cause__
429
+ if cause is not None:
430
+ parts.append(f"caused by {type(cause).__name__}: {cause}")
431
+ context = error.__context__
432
+ if context is not None and context is not cause:
433
+ parts.append(f"context {type(context).__name__}: {context}")
434
+ return "; ".join(parts)
435
+
436
+
437
  def _replace_runtime_from_files(projects_path: Path, index_path: Path, refreshed_dashboard: dict[str, Any]) -> None:
438
  global index, engine, _cpu_engine, dashboard_payload
439
  new_index = ProjectIndex.from_files(projects_path, index_path)
 
445
  dashboard_payload = refreshed_dashboard
446
 
447
 
448
def _public_dashboard_payload(payload: dict[str, Any]) -> dict[str, Any]:
    """Return a shallow copy of ``payload`` whose points are sanitized.

    Every entry of ``payload["points"]`` (a missing or falsy value counts as
    no points) is passed through ``_public_dashboard_point``; all other keys
    are copied unchanged. The input mapping is not modified.
    """
    raw_points = payload.get("points") or []
    sanitized = dict(payload)
    sanitized["points"] = list(map(_public_dashboard_point, raw_points))
    return sanitized
452
+
453
+
454
def _public_dashboard_point(point: Any) -> dict[str, Any]:
    """Return a copy of one dashboard point with its tags normalized.

    A value that is not a dict yields an empty dict. Otherwise the point is
    copied and its ``"tags"`` entry is replaced by the list form of
    ``normalize_project_tags`` applied to the existing tags (a missing or
    falsy value is treated as no tags).
    """
    if isinstance(point, dict):
        existing_tags = point.get("tags") or []
        return {**point, "tags": list(normalize_project_tags(existing_tags))}
    return {}
460
+
461
+
462
  def _session_from_json(session_json: str = "{}") -> dict[str, Any]:
463
  try:
464
  session = json.loads(session_json or "{}")
 
555
  @app.get("/api/dashboard")
556
  def dashboard() -> dict:
557
  with _runtime_lock:
558
+ payload = _public_dashboard_payload(dashboard_payload)
559
  payload["refresh"] = _refresh_public_state()
560
  return payload
561
 
hackathon_advisor/dashboard.py CHANGED
@@ -6,7 +6,14 @@ from datetime import datetime, timezone
6
  import math
7
  from typing import Any
8
 
9
- from hackathon_advisor.data import Project, ProjectIndex, public_project_summary, public_project_title, tokenize
 
 
 
 
 
 
 
10
  from hackathon_advisor.quest_taxonomy import QUESTS, normalize_match, quest_profiles
11
 
12
 
@@ -14,12 +21,30 @@ DASHBOARD_SCHEMA_VERSION = 1
14
  TSNE_RANDOM_STATE = 42
15
  TSNE_MIN_PROJECTS = 3
16
  LINKS_PER_PROJECT = 2
 
17
 
18
  STOPWORDS = {
 
19
  "agent",
20
  "app",
 
 
 
 
 
 
 
 
 
21
  "assistant",
 
 
 
22
  "build",
 
 
 
 
23
  "demo",
24
  "face",
25
  "for",
@@ -27,13 +52,55 @@ STOPWORDS = {
27
  "gradio",
28
  "hackathon",
29
  "hugging",
 
 
 
 
 
 
30
  "local",
 
 
 
 
31
  "model",
 
 
 
 
 
 
 
32
  "project",
 
 
 
 
 
33
  "small",
34
  "space",
 
 
 
 
 
 
 
35
  "this",
 
 
 
 
 
 
 
 
 
 
 
36
  "with",
 
 
37
  }
38
 
39
 
@@ -79,6 +146,7 @@ def build_dashboard_payload(
79
  "random_state": TSNE_RANDOM_STATE,
80
  "perplexity": _tsne_perplexity(len(projects)),
81
  },
 
82
  "points": points,
83
  "links": links,
84
  "clusters": clusters,
@@ -205,15 +273,25 @@ def _cluster_payloads(
205
  )
206
  cluster_id_by_raw = {label: f"cluster-{position + 1}" for position, label in enumerate(ordered_raw_labels)}
207
  clusters: list[dict[str, Any]] = []
 
208
  for raw_label in ordered_raw_labels:
209
  indexes = grouped[raw_label]
210
- keywords = _cluster_keywords(projects[index] for index in indexes)
211
- label = " / ".join(word.title() for word in keywords[:2]) if keywords else "Project cluster"
212
  representatives = sorted(
213
- (projects[index] for index in indexes),
214
  key=lambda project: (project.likes, project.last_modified, project.title.lower()),
215
  reverse=True,
216
  )[:4]
 
 
 
 
 
 
 
 
 
 
217
  clusters.append(
218
  {
219
  "id": cluster_id_by_raw[raw_label],
@@ -237,20 +315,89 @@ def _cluster_center(coordinates: Sequence[tuple[float, float]], indexes: Sequenc
237
  )
238
 
239
 
240
- def _cluster_keywords(projects: Sequence[Project]) -> list[str]:
241
- counts: Counter[str] = Counter()
242
  for project in projects:
243
- text = " ".join(
244
- [
245
- project.title,
246
- project.slug.replace("-", " ").replace("_", " "),
247
- project.summary,
248
- " ".join(project.tags),
249
- " ".join(project.models),
250
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  )
252
- counts.update(token for token in tokenize(text) if token not in STOPWORDS and not token.startswith("region"))
253
- return [token for token, _count in counts.most_common(5)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
 
255
 
256
  def _normalize_quest_matches(
@@ -302,7 +449,7 @@ def _point_payloads(
302
  "likes": project.likes,
303
  "sdk": project.sdk,
304
  "models": list(project.models),
305
- "tags": list(project.tags),
306
  "last_modified": project.last_modified,
307
  "x": x,
308
  "y": y,
 
6
  import math
7
  from typing import Any
8
 
9
+ from hackathon_advisor.data import (
10
+ Project,
11
+ ProjectIndex,
12
+ normalize_project_tags,
13
+ public_project_summary,
14
+ public_project_title,
15
+ tokenize,
16
+ )
17
  from hackathon_advisor.quest_taxonomy import QUESTS, normalize_match, quest_profiles
18
 
19
 
 
21
  TSNE_RANDOM_STATE = 42
22
  TSNE_MIN_PROJECTS = 3
23
  LINKS_PER_PROJECT = 2
24
+ CLUSTER_LABEL_ALGORITHM = "distinctive-keywords-v1"
25
 
26
  STOPWORDS = {
27
+ "about",
28
  "agent",
29
  "app",
30
+ "apps",
31
+ "ai",
32
+ "all",
33
+ "an",
34
+ "and",
35
+ "are",
36
+ "as",
37
+ "at",
38
+ "before",
39
  "assistant",
40
+ "be",
41
+ "been",
42
+ "being",
43
  "build",
44
+ "build-small",
45
+ "build-small-hackathon",
46
+ "built",
47
+ "by",
48
  "demo",
49
  "face",
50
  "for",
 
52
  "gradio",
53
  "hackathon",
54
  "hugging",
55
+ "huggingface",
56
+ "in",
57
+ "is",
58
+ "it",
59
+ "its",
60
+ "first",
61
  "local",
62
+ "make",
63
+ "makes",
64
+ "made",
65
+ "me",
66
  "model",
67
+ "models",
68
+ "my",
69
+ "of",
70
+ "on",
71
+ "or",
72
+ "our",
73
+ "one",
74
  "project",
75
+ "projects",
76
+ "pro",
77
+ "region",
78
+ "run",
79
+ "runs",
80
  "small",
81
  "space",
82
+ "spaces",
83
+ "submission",
84
+ "the",
85
+ "their",
86
+ "them",
87
+ "these",
88
+ "they",
89
  "this",
90
+ "those",
91
+ "to",
92
+ "tool",
93
+ "tools",
94
+ "try",
95
+ "us",
96
+ "use",
97
+ "used",
98
+ "uses",
99
+ "using",
100
+ "we",
101
  "with",
102
+ "you",
103
+ "your",
104
  }
105
 
106
 
 
146
  "random_state": TSNE_RANDOM_STATE,
147
  "perplexity": _tsne_perplexity(len(projects)),
148
  },
149
+ "cluster_label_algorithm": CLUSTER_LABEL_ALGORITHM,
150
  "points": points,
151
  "links": links,
152
  "clusters": clusters,
 
273
  )
274
  cluster_id_by_raw = {label: f"cluster-{position + 1}" for position, label in enumerate(ordered_raw_labels)}
275
  clusters: list[dict[str, Any]] = []
276
+ corpus_document_frequency = _corpus_document_frequency(projects)
277
  for raw_label in ordered_raw_labels:
278
  indexes = grouped[raw_label]
279
+ cluster_projects = [projects[index] for index in indexes]
 
280
  representatives = sorted(
281
+ cluster_projects,
282
  key=lambda project: (project.likes, project.last_modified, project.title.lower()),
283
  reverse=True,
284
  )[:4]
285
+ keywords = _cluster_keywords(
286
+ cluster_projects,
287
+ corpus_document_frequency=corpus_document_frequency,
288
+ corpus_project_count=len(projects),
289
+ )
290
+ label = (
291
+ " / ".join(word.title() for word in keywords[:2])
292
+ if keywords
293
+ else _representative_cluster_label(representatives)
294
+ )
295
  clusters.append(
296
  {
297
  "id": cluster_id_by_raw[raw_label],
 
315
  )
316
 
317
 
318
def _corpus_document_frequency(projects: Sequence[Project]) -> Counter[str]:
    """Count, for each keyword token, how many projects contain it.

    Tokens come from ``_project_keyword_tokens``; each project contributes at
    most one count per distinct token.
    """
    return Counter(
        token
        for project in projects
        for token in set(_project_keyword_tokens(project))
    )
323
+
324
+
325
def _cluster_keywords(
    projects: Sequence[Project],
    *,
    corpus_document_frequency: Mapping[str, int],
    corpus_project_count: int,
) -> list[str]:
    """Pick up to five keyword tokens that best distinguish a cluster.

    Args:
        projects: The projects that belong to the cluster.
        corpus_document_frequency: For each token, the number of projects in
            the whole corpus that contain it.
        corpus_project_count: Number of projects in the whole corpus.

    Returns:
        Tokens ordered by descending score, then by descending number of
        cluster projects containing the token, then by descending raw count,
        then alphabetically. Empty when the cluster has no projects or no
        token qualifies.
    """
    members = list(projects)
    if not members:
        return []

    occurrences: Counter[str] = Counter()
    member_frequency: Counter[str] = Counter()
    for member in members:
        member_tokens = _project_keyword_tokens(member)
        occurrences.update(member_tokens)
        member_frequency.update(set(member_tokens))

    # Clusters of up to three projects accept a token seen in a single
    # project; larger clusters require it in at least two.
    required_members = 2 if len(members) > 3 else 1

    candidates: list[tuple[float, int, int, str]] = []
    for token, count in occurrences.items():
        in_cluster = member_frequency[token]
        if in_cluster < required_members:
            continue
        in_corpus = int(corpus_document_frequency.get(token) or 0)
        if in_corpus <= 0:
            continue
        idf = math.log((1 + corpus_project_count) / (1 + in_corpus))
        if idf <= 0.0:
            continue
        term_weight = 1.0 + math.log(count)
        # Share of the corpus-wide occurrences that fall inside this cluster.
        exclusivity_weight = 0.35 + 0.65 * (in_cluster / in_corpus)
        # Share of this cluster's projects that contain the token.
        coverage_weight = 0.35 + 0.65 * (in_cluster / len(members))
        score = term_weight * idf * exclusivity_weight * coverage_weight
        candidates.append((score, in_cluster, count, token))

    ranked = sorted(
        candidates,
        key=lambda entry: (-entry[0], -entry[1], -entry[2], entry[3]),
    )
    return [entry[3] for entry in ranked[:5]]
366
+
367
+
368
def _project_keyword_tokens(project: Project) -> list[str]:
    """Tokenize a project's descriptive text and keep cluster-keyword tokens.

    The text is the space-joined title, slug (with ``-`` and ``_`` turned
    into spaces), summary, normalized tags and model names. Tokens rejected
    by ``_is_cluster_keyword`` are dropped; order and duplicates are kept.
    """
    fragments = (
        project.title,
        project.slug.replace("-", " ").replace("_", " "),
        project.summary,
        " ".join(normalize_project_tags(project.tags)),
        " ".join(project.models),
    )
    return list(filter(_is_cluster_keyword, tokenize(" ".join(fragments))))
379
+
380
+
381
def _is_cluster_keyword(token: str) -> bool:
    """Tell whether ``token`` may be used as a cluster keyword.

    A token is rejected when it is listed in ``STOPWORDS``, starts with
    ``"region"``, or consists only of digits.
    """
    rejected = token in STOPWORDS or token.startswith("region") or token.isdigit()
    return not rejected
389
+
390
+
391
def _representative_cluster_label(projects: Sequence[Project]) -> str:
    """Build a fallback cluster label from the first two usable project titles.

    Titles are passed through ``public_project_title``; those that come back
    as ``"Untitled project"`` are skipped. The first two remaining titles are
    joined with ``" / "``; when none remain the label is ``"Mixed projects"``.
    """
    usable_titles = (
        title
        for title in (public_project_title(project.title) for project in projects)
        if title != "Untitled project"
    )
    # zip() asks range(2) first, so iteration stops after two titles without
    # pulling a third project from the generator.
    chosen = [title for _, title in zip(range(2), usable_titles)]
    if not chosen:
        return "Mixed projects"
    return " / ".join(chosen)
401
 
402
 
403
  def _normalize_quest_matches(
 
449
  "likes": project.likes,
450
  "sdk": project.sdk,
451
  "models": list(project.models),
452
+ "tags": list(normalize_project_tags(project.tags)),
453
  "last_modified": project.last_modified,
454
  "x": x,
455
  "y": y,
hackathon_advisor/data.py CHANGED
@@ -32,6 +32,7 @@ DEFAULT_EMBEDDING_MODEL_REPO = "ggml-org/embeddinggemma-300m-qat-q8_0-GGUF"
32
  DEFAULT_EMBEDDING_MODEL_FILE = "embeddinggemma-300m-qat-Q8_0.gguf"
33
  DEFAULT_EMBEDDING_RUNTIME = "llama.cpp via llama-cpp-python"
34
  APP_FILE_EMBEDDING_CHAR_LIMIT = 2000
 
35
 
36
 
37
  EmbeddingFunction = Callable[[str], Sequence[float]]
@@ -108,7 +109,7 @@ class Project:
108
  "id": self.id,
109
  "title": public_project_title(self.title),
110
  "summary": public_project_summary(self.summary),
111
- "tags": list(self.tags),
112
  "models": list(self.models),
113
  "datasets": list(self.datasets),
114
  "likes": self.likes,
@@ -150,6 +151,7 @@ class Project:
150
  )
151
  return payload
152
 
 
153
  @dataclass(frozen=True)
154
  class SearchHit:
155
  project: Project
@@ -185,6 +187,25 @@ def public_project_title(title: str) -> str:
185
  return cleaned
186
 
187
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  def public_project_summary(summary: str) -> str:
189
  cleaned = " ".join(str(summary).split())
190
  if not cleaned:
 
32
  DEFAULT_EMBEDDING_MODEL_FILE = "embeddinggemma-300m-qat-Q8_0.gguf"
33
  DEFAULT_EMBEDDING_RUNTIME = "llama.cpp via llama-cpp-python"
34
  APP_FILE_EMBEDDING_CHAR_LIMIT = 2000
35
+ HOSTING_METADATA_TAG_PREFIXES = ("region:",)
36
 
37
 
38
  EmbeddingFunction = Callable[[str], Sequence[float]]
 
109
  "id": self.id,
110
  "title": public_project_title(self.title),
111
  "summary": public_project_summary(self.summary),
112
+ "tags": list(normalize_project_tags(self.tags)),
113
  "models": list(self.models),
114
  "datasets": list(self.datasets),
115
  "likes": self.likes,
 
151
  )
152
  return payload
153
 
154
+
155
  @dataclass(frozen=True)
156
  class SearchHit:
157
  project: Project
 
187
  return cleaned
188
 
189
 
190
def normalize_project_tags(tags: Sequence[Any]) -> tuple[str, ...]:
    """Return the cleaned, de-duplicated tags of a project.

    Each tag is converted to ``str`` (falsy values become the empty string)
    and its whitespace runs are collapsed to single spaces. Empty tags, tags
    for which ``is_hosting_metadata_tag`` is true, and repeats of an earlier
    tag are dropped. First-seen order is preserved.

    Args:
        tags: The raw tags. ``None`` or an empty sequence yields ``()``. A
            bare string is treated as one tag.

    Returns:
        The surviving tags as a tuple.
    """
    if isinstance(tags, str):
        # A str is itself a sequence; iterating it would turn "gradio" into
        # six one-letter tags, so wrap it as a single tag instead.
        tags = (tags,)
    cleaned: list[str] = []
    seen: set[str] = set()
    for raw_tag in tags or ():
        tag = " ".join(str(raw_tag or "").split())
        if not tag or tag in seen or is_hosting_metadata_tag(tag):
            continue
        seen.add(tag)
        cleaned.append(tag)
    return tuple(cleaned)
202
+
203
+
204
def is_hosting_metadata_tag(tag: str) -> bool:
    """Tell whether ``tag`` starts with a hosting-metadata prefix.

    The tag is stringified (falsy values become the empty string), stripped
    and case-folded before being compared against every entry of
    ``HOSTING_METADATA_TAG_PREFIXES``.
    """
    normalized = str(tag or "").strip().casefold()
    for prefix in HOSTING_METADATA_TAG_PREFIXES:
        if normalized.startswith(prefix):
            return True
    return False
207
+
208
+
209
  def public_project_summary(summary: str) -> str:
210
  cleaned = " ".join(str(summary).split())
211
  if not cleaned:
hackathon_advisor/quest_analysis.py CHANGED
@@ -7,7 +7,7 @@ import json
7
  import os
8
  from typing import Any, Protocol
9
 
10
- from hackathon_advisor.data import Project
11
  from hackathon_advisor.model_runtime import (
12
  DEFAULT_MODEL_ID,
13
  _minicpm_generation_kwargs,
@@ -317,7 +317,7 @@ def render_project_quest_prompt(project: Project) -> str:
317
  title=project.title,
318
  sdk=project.sdk,
319
  declared_models=project.models,
320
- tags=project.tags,
321
  readme_segment=build_readme_segment(project.readme_body),
322
  app_file_name=project.app_file,
323
  app_file_segment=build_app_segment(project.app_file_source, project.app_file_embedding_text),
 
7
  import os
8
  from typing import Any, Protocol
9
 
10
+ from hackathon_advisor.data import Project, normalize_project_tags
11
  from hackathon_advisor.model_runtime import (
12
  DEFAULT_MODEL_ID,
13
  _minicpm_generation_kwargs,
 
317
  title=project.title,
318
  sdk=project.sdk,
319
  declared_models=project.models,
320
+ tags=normalize_project_tags(project.tags),
321
  readme_segment=build_readme_segment(project.readme_body),
322
  app_file_name=project.app_file,
323
  app_file_segment=build_app_segment(project.app_file_source, project.app_file_embedding_text),
scripts/crawl_hf_spaces.py CHANGED
@@ -19,7 +19,7 @@ from huggingface_hub.errors import EntryNotFoundError
19
  ROOT = Path(__file__).resolve().parents[1]
20
  sys.path.insert(0, str(ROOT))
21
 
22
- from hackathon_advisor.data import extract_app_file_embedding_text
23
 
24
 
25
  API = "https://huggingface.co/api"
@@ -88,11 +88,12 @@ def project_from_space(space: Any) -> dict[str, Any]:
88
 
89
  title = str(card.get("title") or humanize_slug(space_id.rsplit("/", 1)[-1]))
90
  summary = str(card.get("short_description") or card.get("description") or "")
 
91
  return {
92
  "id": space_id,
93
  "title": title,
94
  "summary": summary,
95
- "tags": sorted(set(str(tag) for tag in (card.get("tags") or getattr(space, "tags", None) or []))),
96
  "models": [str(model) for model in getattr(space, "models", None) or card.get("models") or []],
97
  "datasets": [
98
  str(dataset) for dataset in getattr(space, "datasets", None) or card.get("datasets") or []
 
19
  ROOT = Path(__file__).resolve().parents[1]
20
  sys.path.insert(0, str(ROOT))
21
 
22
+ from hackathon_advisor.data import extract_app_file_embedding_text, normalize_project_tags
23
 
24
 
25
  API = "https://huggingface.co/api"
 
88
 
89
  title = str(card.get("title") or humanize_slug(space_id.rsplit("/", 1)[-1]))
90
  summary = str(card.get("short_description") or card.get("description") or "")
91
+ raw_tags = sorted(set(str(tag) for tag in (card.get("tags") or getattr(space, "tags", None) or [])))
92
  return {
93
  "id": space_id,
94
  "title": title,
95
  "summary": summary,
96
+ "tags": list(normalize_project_tags(raw_tags)),
97
  "models": [str(model) for model in getattr(space, "models", None) or card.get("models") or []],
98
  "datasets": [
99
  str(dataset) for dataset in getattr(space, "datasets", None) or card.get("datasets") or []
static/app.js CHANGED
@@ -498,7 +498,7 @@ function renderAtlasDetail(point) {
498
  return `<span>${escapeHtml(atlasQuestLabel(match.quest))} ${confidence}%</span>`;
499
  })
500
  .join("");
501
- const tags = [...(point.models || []).slice(0, 3), ...(point.tags || []).slice(0, 3)]
502
  .map((tag) => `<span>${escapeHtml(tag)}</span>`)
503
  .join("");
504
  atlasDetailEl.innerHTML = `
@@ -511,6 +511,10 @@ function renderAtlasDetail(point) {
511
  `;
512
  }
513
 
 
 
 
 
514
  function renderAtlasReport(data) {
515
  if (!atlasReportEl) return;
516
  const cluster = selectedClusterId
 
498
  return `<span>${escapeHtml(atlasQuestLabel(match.quest))} ${confidence}%</span>`;
499
  })
500
  .join("");
501
+ const tags = [...(point.models || []).slice(0, 3), ...visibleProjectTags(point.tags || []).slice(0, 3)]
502
  .map((tag) => `<span>${escapeHtml(tag)}</span>`)
503
  .join("");
504
  atlasDetailEl.innerHTML = `
 
511
  `;
512
  }
513
 
514
/**
 * Return the tags that should be shown to users, dropping hosting metadata.
 *
 * A tag is hidden when, after trimming surrounding whitespace and
 * lower-casing, it starts with "region:". Trimming keeps this filter in line
 * with the server-side check, which strips the tag before testing the prefix.
 *
 * @param {Array<*>} tags Raw tags; anything that is not an array yields [].
 * @returns {Array<*>} The tags that are not hosting metadata, in input order.
 */
function visibleProjectTags(tags) {
  // Guard against null/undefined and against truthy non-arrays (for example
  // a bare string), which have no usable .filter().
  if (!Array.isArray(tags)) return [];
  return tags.filter((tag) => !String(tag || "").trim().toLowerCase().startsWith("region:"));
}
517
+
518
  function renderAtlasReport(data) {
519
  if (!atlasReportEl) return;
520
  const cluster = selectedClusterId
tests/test_app.py CHANGED
@@ -133,6 +133,24 @@ def test_dashboard_endpoint_exposes_atlas_payload() -> None:
133
  assert payload["links"]
134
  assert payload["quest_report"]["status"] in {"analyzed", "not_analyzed"}
135
  assert payload["refresh"]["status"] in {"idle", "running", "succeeded", "failed"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
 
138
  def test_dashboard_refresh_requires_bucket(monkeypatch) -> None:
 
133
  assert payload["links"]
134
  assert payload["quest_report"]["status"] in {"analyzed", "not_analyzed"}
135
  assert payload["refresh"]["status"] in {"idle", "running", "succeeded", "failed"}
136
+ assert all(
137
+ not str(tag).casefold().startswith("region:")
138
+ for point in payload["points"]
139
+ for tag in point.get("tags", [])
140
+ )
141
+
142
+
143
def test_refresh_error_format_includes_exception_chain() -> None:
    """The formatted refresh error names both the error and its explicit cause."""

    def fail_refresh() -> None:
        try:
            raise ValueError("bad quest")
        except ValueError as cause:
            raise RuntimeError("refresh failed") from cause

    try:
        fail_refresh()
    except RuntimeError as error:
        message = app_module._format_refresh_error(error)

    for expected_part in (
        "RuntimeError: refresh failed",
        "caused by ValueError: bad quest",
    ):
        assert expected_part in message
154
 
155
 
156
  def test_dashboard_refresh_requires_bucket(monkeypatch) -> None:
tests/test_crawl_hf_spaces.py CHANGED
@@ -44,7 +44,7 @@ def test_project_from_space_downloads_frontmatter_app_file(monkeypatch) -> None:
44
  SimpleNamespace(rfilename="README.md"),
45
  SimpleNamespace(rfilename="app.py"),
46
  ],
47
- tags=["gradio"],
48
  models=[],
49
  datasets=[],
50
  likes=3,
@@ -61,6 +61,7 @@ def test_project_from_space_downloads_frontmatter_app_file(monkeypatch) -> None:
61
  assert project["app_file_source"] == "import gradio as gr\ngr.Textbox(label='Idea')\n"
62
  assert "gr.Textbox" in project["app_file_embedding_text"]
63
  assert "Idea" in project["app_file_embedding_text"]
 
64
 
65
 
66
  def test_project_from_space_tolerates_stale_frontmatter_app_file(monkeypatch) -> None:
 
44
  SimpleNamespace(rfilename="README.md"),
45
  SimpleNamespace(rfilename="app.py"),
46
  ],
47
+ tags=["gradio", "region:us"],
48
  models=[],
49
  datasets=[],
50
  likes=3,
 
61
  assert project["app_file_source"] == "import gradio as gr\ngr.Textbox(label='Idea')\n"
62
  assert "gr.Textbox" in project["app_file_embedding_text"]
63
  assert "Idea" in project["app_file_embedding_text"]
64
+ assert project["tags"] == ["gradio"]
65
 
66
 
67
  def test_project_from_space_tolerates_stale_frontmatter_app_file(monkeypatch) -> None:
tests/test_dashboard.py CHANGED
@@ -2,7 +2,11 @@ from __future__ import annotations
2
 
3
  from pathlib import Path
4
 
5
- from hackathon_advisor.dashboard import build_dashboard_payload, validate_dashboard_payload
 
 
 
 
6
  from hackathon_advisor.data import Project, ProjectIndex, build_index_payload
7
  from hackathon_advisor.quest_analysis import (
8
  MiniCPMQuestAnalyzer,
@@ -45,6 +49,7 @@ def test_dashboard_builder_projects_embeddings_with_tsne_and_clusters() -> None:
45
  assert payload["quest_report"]["status"] == "analyzed"
46
  assert all(0 <= point["x"] <= 100 and 0 <= point["y"] <= 100 for point in payload["points"])
47
  assert all(point["quest_ids"] for point in payload["points"])
 
48
 
49
 
50
  def test_dashboard_builder_is_deterministic_for_fixed_vectors() -> None:
@@ -59,6 +64,18 @@ def test_dashboard_builder_is_deterministic_for_fixed_vectors() -> None:
59
  assert left["clusters"] == right["clusters"]
60
 
61
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  def test_quest_analysis_validation_accepts_strict_project_coverage() -> None:
63
  projects = fake_projects(4)
64
  raw = {
@@ -343,7 +360,7 @@ def test_quest_prompt_uses_raw_readme_and_app_source_segments() -> None:
343
  id="build-small-hackathon/two-segment",
344
  title="Two Segment",
345
  summary="card summary should not drive quest analysis",
346
- tags=("gradio",),
347
  models=("openbmb/MiniCPM5-1B",),
348
  datasets=(),
349
  likes=1,
@@ -367,6 +384,7 @@ def test_quest_prompt_uses_raw_readme_and_app_source_segments() -> None:
367
  assert "from llama_cpp import Llama" in prompt
368
  assert "card summary should not drive quest analysis" not in prompt
369
  assert "compact app signals should not drive quest analysis" not in prompt
 
370
 
371
 
372
  def test_quest_analyzer_rejects_non_minicpm_backend(monkeypatch) -> None:
@@ -400,6 +418,56 @@ def fake_index() -> ProjectIndex:
400
  )
401
 
402
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  def fake_projects(count: int) -> list[Project]:
404
  return [
405
  Project(
 
2
 
3
  from pathlib import Path
4
 
5
+ from hackathon_advisor.dashboard import (
6
+ CLUSTER_LABEL_ALGORITHM,
7
+ build_dashboard_payload,
8
+ validate_dashboard_payload,
9
+ )
10
  from hackathon_advisor.data import Project, ProjectIndex, build_index_payload
11
  from hackathon_advisor.quest_analysis import (
12
  MiniCPMQuestAnalyzer,
 
49
  assert payload["quest_report"]["status"] == "analyzed"
50
  assert all(0 <= point["x"] <= 100 and 0 <= point["y"] <= 100 for point in payload["points"])
51
  assert all(point["quest_ids"] for point in payload["points"])
52
+ assert payload["cluster_label_algorithm"] == CLUSTER_LABEL_ALGORITHM
53
 
54
 
55
  def test_dashboard_builder_is_deterministic_for_fixed_vectors() -> None:
 
64
  assert left["clusters"] == right["clusters"]
65
 
66
 
67
def test_dashboard_cluster_labels_ignore_hackathon_wide_noise() -> None:
    """Cluster keywords skip corpus-wide boilerplate and points carry no region tag."""
    payload = build_dashboard_payload(
        noisy_cluster_label_index(),
        generated_at="2026-06-08T00:00:00+00:00",
    )

    keywords: set[str] = set()
    for cluster in payload["clusters"]:
        keywords.update(cluster["keywords"])

    noise = {"ai", "build-small-hackathon", "gradio", "hackathon", "project", "region", "us"}
    theme_words = {"dream", "family", "garden", "notice", "order", "repair"}

    assert not (keywords & noise)
    assert keywords & theme_words
    assert not any("region:us" in point["tags"] for point in payload["points"])
77
+
78
+
79
  def test_quest_analysis_validation_accepts_strict_project_coverage() -> None:
80
  projects = fake_projects(4)
81
  raw = {
 
360
  id="build-small-hackathon/two-segment",
361
  title="Two Segment",
362
  summary="card summary should not drive quest analysis",
363
+ tags=("gradio", "region:us"),
364
  models=("openbmb/MiniCPM5-1B",),
365
  datasets=(),
366
  likes=1,
 
384
  assert "from llama_cpp import Llama" in prompt
385
  assert "card summary should not drive quest analysis" not in prompt
386
  assert "compact app signals should not drive quest analysis" not in prompt
387
+ assert "region:us" not in prompt
388
 
389
 
390
  def test_quest_analyzer_rejects_non_minicpm_backend(monkeypatch) -> None:
 
418
  )
419
 
420
 
421
def noisy_cluster_label_index() -> ProjectIndex:
    """Build a fixture index of six themes with two projects each.

    Every project shares the same boilerplate wording and tags (including
    ``region:us``) and differs only in its theme tag, title and summary
    prefix. Projects of one theme share a one-hot embedding vector.
    """
    themes = [
        ("dream", ("Dream Lantern", "Dream Atlas"), "dream journal symbolic oracle"),
        ("family", ("Family Ledger", "Care Kinship"), "family care bill coordination"),
        ("garden", ("Garden Notebook", "Seed Exchange"), "garden seed neighborhood plants"),
        ("notice", ("Notice Helper", "Scam Screen"), "notice scam safety verification"),
        ("order", ("Order Desk", "Inventory Voice"), "order inventory audio assistant"),
        ("repair", ("Repair Coach", "Tool Shed"), "repair maintenance workshop"),
    ]
    snapshot_generated_at = "2026-06-08T00:00:00+00:00"
    source = "https://example.test/spaces"

    projects: list[Project] = []
    embeddings = []
    for theme_index, (theme, titles, summary) in enumerate(themes):
        for title in titles:
            slug = title.lower().replace(" ", "-")
            projects.append(
                Project(
                    id=f"build-small-hackathon/{slug}",
                    title=title,
                    summary=(
                        f"{summary} for a build-small-hackathon AI project in the US region "
                        "with a Gradio demo."
                    ),
                    tags=("build-small-hackathon", "ai", "gradio", "region:us", theme),
                    models=("tiny-model",),
                    datasets=(),
                    likes=theme_index,
                    sdk="gradio",
                    license="mit",
                    created_at="2026-06-01T00:00:00+00:00",
                    last_modified=f"2026-06-{theme_index + 1:02d}T00:00:00+00:00",
                    host=f"https://{slug}.hf.space",
                    url=f"https://huggingface.co/spaces/build-small-hackathon/{slug}",
                    app_file="app.py",
                    app_file_embedding_text="shared local small model app",
                )
            )
            # Fresh one-hot vector per project: 1.0 at the theme's position.
            embeddings.append(
                [float(position == theme_index) for position in range(len(themes))]
            )

    return ProjectIndex(
        projects=projects,
        generated_at=snapshot_generated_at,
        source=source,
        index_payload=build_index_payload(projects, snapshot_generated_at, source, embeddings),
    )
469
+
470
+
471
  def fake_projects(count: int) -> list[Project]:
472
  return [
473
  Project(
tests/test_data.py CHANGED
@@ -85,6 +85,23 @@ def test_searchable_text_includes_main_app_file_signals() -> None:
85
  assert "Project idea" in searchable
86
 
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  def test_searchable_text_excludes_refresh_readme_body_for_stable_reuse() -> None:
89
  project = Project(
90
  id="build-small-hackathon/long-readme",
 
85
  assert "Project idea" in searchable
86
 
87
 
88
def test_public_project_tags_exclude_hosting_metadata() -> None:
    """Stored tags stay raw while the public dict drops region tags and repeats."""
    record = {
        "id": "build-small-hackathon/idea-canvas",
        "title": "Idea Canvas",
        "summary": "",
        "tags": ["gradio", "region:us", "local-first", "region:eu", "gradio"],
        "models": [],
        "datasets": [],
        "url": "https://example.test",
    }

    project = Project.from_dict(record)
    public_tags = project.to_public_dict()["tags"]

    assert project.tags == ("gradio", "region:us", "local-first", "region:eu", "gradio")
    assert public_tags == ["gradio", "local-first"]
103
+
104
+
105
  def test_searchable_text_excludes_refresh_readme_body_for_stable_reuse() -> None:
106
  project = Project(
107
  id="build-small-hackathon/long-readme",