Spaces:

build-small-hackathon
/

hackathon-advisor

Running on Zero

App Files Files Community

JacobLinCool committed on Jun 7

Commit

d0718ca

verified ·

1 Parent(s): f5031de

feat: embed app file signals in project index

Browse files

Files changed (12) hide show

.gitattributes +1 -0
README.md +8 -3
data/project_index.json +0 -0
data/projects.json +0 -0
data/sample_trace.jsonl +4 -4
hackathon_advisor/data.py +122 -10
hackathon_advisor/llama_embedding.py +152 -4
scripts/build_project_index.py +1 -0
scripts/crawl_hf_spaces.py +133 -32
tests/test_crawl_hf_spaces.py +61 -0
tests/test_data.py +32 -1
tests/test_llama_embedding.py +36 -1

.gitattributes CHANGED Viewed

@@ -1,2 +1,3 @@
 # Auto detect text files and perform LF normalization
 * text=auto

 # Auto detect text files and perform LF normalization
 * text=auto
+static/assets/parchment.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -58,9 +58,11 @@ python scripts/generate_sample_trace.py --projects data/projects.json --index da
 ```
 The app uses `data/projects.json` and `data/project_index.json` at runtime. The index validates the snapshot timestamp,
-source, project order, digest, embedding dimensions, and normalized vector shape before the app starts. The canonical
-index is built on Modal with `ggml-org/embeddinggemma-300m-qat-q8_0-GGUF` through llama.cpp; runtime search embeds the
-user query with the same GGUF model and performs local cosine search over the checked-in vectors.
 ## Trace Artifact
@@ -194,6 +196,9 @@ ADVISOR_ASR_MODEL_ID=nvidia/nemotron-speech-streaming-en-0.6b
 `agent_turn` wraps the engine call with `spaces.GPU` when `ADVISOR_ZERO_GPU=1`, so model loading and generation run on
 the ZeroGPU allocation. The retrieval query embedder downloads the GGUF model through `huggingface_hub` unless
 `ADVISOR_EMBEDDING_MODEL_PATH` points to a local file. `/api/transcribe` uses the same ZeroGPU wrapper for Nemotron ASR.
 Local tests and CPU-only development still default to `ADVISOR_MODEL_BACKEND=rules`.
 ## Test

 ```
 The app uses `data/projects.json` and `data/project_index.json` at runtime. The index validates the snapshot timestamp,
+source, project order, searchable text digest, embedding dimensions, and normalized vector shape before the app starts.
+The crawler snapshots every public Space in the org and, when README frontmatter declares `app_file`, includes that main
+app file as the highest-signal project evidence for embedding. The canonical index is built on Modal with
+`ggml-org/embeddinggemma-300m-qat-q8_0-GGUF` through llama.cpp; runtime search embeds the user query with the same GGUF
+model and performs local cosine search over the checked-in vectors.
 ## Trace Artifact
 `agent_turn` wraps the engine call with `spaces.GPU` when `ADVISOR_ZERO_GPU=1`, so model loading and generation run on
 the ZeroGPU allocation. The retrieval query embedder downloads the GGUF model through `huggingface_hub` unless
 `ADVISOR_EMBEDDING_MODEL_PATH` points to a local file. `/api/transcribe` uses the same ZeroGPU wrapper for Nemotron ASR.
+On macOS local runs with `ADVISOR_MODEL_BACKEND=minicpm-transformers`, the app automatically runs llama.cpp query
+embedding in a worker process so the MiniCPM PyTorch runtime and llama.cpp do not load conflicting OpenMP runtimes in
+the same Python process.
 Local tests and CPU-only development still default to `ADVISOR_MODEL_BACKEND=rules`.
 ## Test

data/project_index.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/projects.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/sample_trace.jsonl CHANGED Viewed

@@ -1,4 +1,4 @@
-{"app": "hackathon-advisor", "generated_at": "2026-06-07T08:17:29+00:00", "idea_count": 2, "index": {"algorithm": "llama-cpp-embedding-v1", "index_generated_at": "2026-06-07T08:16:19+00:00", "snapshot_digest": "1d59c168b446bc1268ef648897fac523e85b258fd6048a37409b9cfbc5c28329", "snapshot_generated_at": "2026-06-06T19:20:47+00:00"}, "schema_version": 1, "turn_count": 3, "type": "trace_manifest"}
-{"artifact_title": "A local-first archive cartographer for family photos", "input": "A local-first archive cartographer for family photos", "overall": 4.6, "plan_steps": 0, "response": "The ink bleeds around A local-first archive cartographer for family photos. Closest echoes: page 30: Family Bill Assistant; page 50: Local in 30s — Lore Lens; page 41: Investigative News Agent. The seal reads ECHO x4 at 4.6/10. Keep the audience, but change the mechanism or artifact so the demo proves a gap instead of joining a cluster.", "schema_version": 1, "tool_resolution": {"call": {"arguments": {"pitch": "A local-first archive cartographer for family photos", "title": "A local-first archive cartographer for family photos"}, "name": "save_idea"}, "errors": [], "status": "valid"}, "tools": [{"name": "save_idea", "summary": "Wrote idea page 'A local-first archive cartographer for family photos'."}, {"name": "search_projects", "summary": "Checked 5 closest project echoes."}, {"name": "score_idea", "summary": "Pressed a five-quadrant seal: 4.6/10."}], "turn_index": 1, "type": "agent_turn", "verdict": "ECHO x4"}
-{"artifact_title": "Personal archive cartographer", "input": "write bolder and find whitespace", "overall": 4.8, "plan_steps": 0, "response": "Gold gathers on Personal archive cartographer. A tiny model that maps a private archive into stories without sending it to cloud APIs. Nearest echoes are weak: Dream Museum, Storybook. The seal reads ECHO x4 at 4.8/10. The next move is to make one concrete before/after scene and cite the two weakest nearby echoes in the margin.", "schema_version": 1, "tool_resolution": {"call": {"arguments": {}, "name": "find_whitespace"}, "errors": [], "status": "valid"}, "tools": [{"name": "find_whitespace", "summary": "Ranked 4 under-explored regions."}, {"name": "save_idea", "summary": "Wrote idea page 'Personal archive cartographer'."}, {"name": "score_idea", "summary": "Pressed a five-quadrant seal: 4.8/10."}], "turn_index": 2, "type": "agent_turn", "verdict": "ECHO x4"}
-{"artifact_title": "Personal archive cartographer", "input": "make a build plan", "overall": 4.8, "plan_steps": 6, "response": "The wax seal for Personal archive cartographer reads 4.8/10, ECHO x4. The build path is: 1. Lock a one-sentence promise and one test input that proves what is different. 2. Compare against the nearest echoes, then sharpen the part only this idea can own. 3. Build the smallest happy path: input, nearby project citations, score, and one shareable output. 4. Add one selected-goal feature only after the core loop is smooth enough to explain without narration. 5. Collect successful advisor examples before training a tiny LoRA. 6. Write build notes from the exact decisions, screenshots, and outputs.", "schema_version": 1, "tool_resolution": {"call": {"arguments": {}, "name": "make_plan"}, "errors": [], "status": "valid"}, "tools": [{"name": "score_idea", "summary": "Pressed a five-quadrant seal: 4.8/10."}, {"name": "make_plan", "summary": "Drafted 6 build steps."}], "turn_index": 3, "type": "agent_turn", "verdict": "ECHO x4"}

+{"app": "hackathon-advisor", "generated_at": "2026-06-07T11:52:22+00:00", "idea_count": 2, "index": {"algorithm": "llama-cpp-embedding-v1", "index_generated_at": "2026-06-07T11:52:16+00:00", "snapshot_digest": "a442d8146fa01965567a27e43d42587de7048b93352e016151a524d2ecbf2034", "snapshot_generated_at": "2026-06-07T11:51:09+00:00"}, "schema_version": 1, "turn_count": 3, "type": "trace_manifest"}
+{"artifact_title": "A local-first archive cartographer for family photos", "input": "A local-first archive cartographer for family photos", "overall": 4.6, "plan_steps": 0, "response": "The ink bleeds around A local-first archive cartographer for family photos. Closest echoes: page 67: Mini Fam; page 38: Family Bill Assistant; page 84: PawMap. The seal reads ECHO x4 at 4.6/10. Keep the audience, but change the mechanism or artifact so the demo proves a gap instead of joining a cluster.", "schema_version": 1, "tool_resolution": {"call": {"arguments": {"pitch": "A local-first archive cartographer for family photos", "title": "A local-first archive cartographer for family photos"}, "name": "save_idea"}, "errors": [], "status": "valid"}, "tools": [{"name": "save_idea", "summary": "Wrote idea page 'A local-first archive cartographer for family photos'."}, {"name": "search_projects", "summary": "Checked 5 closest project echoes."}, {"name": "score_idea", "summary": "Pressed a five-quadrant seal: 4.6/10."}], "turn_index": 1, "type": "agent_turn", "verdict": "ECHO x4"}
+{"artifact_title": "Hands-on science coach", "input": "write bolder and find whitespace", "overall": 4.0, "plan_steps": 0, "response": "Gold gathers on Hands-on science coach. A lab-notebook companion that designs safe experiments from household materials. Nearest echoes are weak: Storybook, Yui Home Assisstant. The seal reads ECHO x4 at 4.0/10. The next move is to make one concrete before/after scene and cite the two weakest nearby echoes in the margin.", "schema_version": 1, "tool_resolution": {"call": {"arguments": {}, "name": "find_whitespace"}, "errors": [], "status": "valid"}, "tools": [{"name": "find_whitespace", "summary": "Ranked 4 under-explored regions."}, {"name": "save_idea", "summary": "Wrote idea page 'Hands-on science coach'."}, {"name": "score_idea", "summary": "Pressed a five-quadrant seal: 4.0/10."}], "turn_index": 2, "type": "agent_turn", "verdict": "ECHO x4"}
+{"artifact_title": "Hands-on science coach", "input": "make a build plan", "overall": 4.0, "plan_steps": 6, "response": "The wax seal for Hands-on science coach reads 4.0/10, ECHO x4. The build path is: 1. Lock a one-sentence promise and one test input that proves what is different. 2. Compare against the nearest echoes, then sharpen the part only this idea can own. 3. Build the smallest happy path: input, nearby project citations, score, and one shareable output. 4. Add one selected-goal feature only after the core loop is smooth enough to explain without narration. 5. Collect successful advisor examples before training a tiny LoRA. 6. Write build notes from the exact decisions, screenshots, and outputs.", "schema_version": 1, "tool_resolution": {"call": {"arguments": {}, "name": "make_plan"}, "errors": [], "status": "valid"}, "tools": [{"name": "score_idea", "summary": "Pressed a five-quadrant seal: 4.0/10."}, {"name": "make_plan", "summary": "Drafted 6 build steps."}], "turn_index": 3, "type": "agent_turn", "verdict": "ECHO x4"}

hackathon_advisor/data.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
 from collections.abc import Callable, Sequence
 from dataclasses import dataclass
 from datetime import datetime, timezone
@@ -7,11 +8,13 @@ from hashlib import sha256
 import json
 import math
 from pathlib import Path
 import re
 from typing import Any
 TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9.+_-]*", re.IGNORECASE)
 GENERIC_PUBLIC_TITLE_RE = re.compile(
     r"^(?:my\s+)?build\s+small\s+hackathon$",
     re.IGNORECASE,
@@ -23,11 +26,12 @@ GENERIC_PUBLIC_SUMMARY_RE = re.compile(
     re.IGNORECASE,
 )
-INDEX_SCHEMA_VERSION = 2
 INDEX_ALGORITHM = "llama-cpp-embedding-v1"
 DEFAULT_EMBEDDING_MODEL_REPO = "ggml-org/embeddinggemma-300m-qat-q8_0-GGUF"
 DEFAULT_EMBEDDING_MODEL_FILE = "embeddinggemma-300m-qat-Q8_0.gguf"
 DEFAULT_EMBEDDING_RUNTIME = "llama.cpp via llama-cpp-python"
 EmbeddingFunction = Callable[[str], Sequence[float]]
@@ -48,6 +52,8 @@ class Project:
     last_modified: str
     host: str
     url: str
     @classmethod
     def from_dict(cls, data: dict) -> "Project":
@@ -65,6 +71,8 @@ class Project:
             last_modified=str(data.get("last_modified") or ""),
             host=str(data.get("host") or ""),
             url=str(data.get("url") or f"https://huggingface.co/spaces/{data['id']}"),
         )
     @property
@@ -73,15 +81,21 @@ class Project:
     @property
     def searchable_text(self) -> str:
-        return " ".join(
-            [
-                self.title,
-                self.slug.replace("-", " ").replace("_", " "),
-                self.summary,
-                " ".join(self.tags),
-                " ".join(self.models),
-                " ".join(self.datasets),
             ]
         )
     def to_public_dict(self) -> dict:
@@ -99,6 +113,7 @@ class Project:
             "last_modified": self.last_modified,
             "host": self.host,
             "url": self.url,
         }
     def to_snapshot_dict(self) -> dict:
@@ -116,6 +131,8 @@ class Project:
             "last_modified": self.last_modified,
             "host": self.host,
             "url": self.url,
         }
@@ -163,6 +180,99 @@ def public_project_summary(summary: str) -> str:
     return cleaned
 @dataclass(frozen=True)
 class WhitespaceSeed:
     label: str
@@ -433,7 +543,9 @@ def validate_index_payload(
     indexed_ids = [document.get("project_id") for document in documents]
     if indexed_ids != project_ids:
         raise ValueError("project index project order does not match projects snapshot")
-    for document in documents:
         vector = document.get("vector")
         if not isinstance(vector, list) or len(vector) != dimensions:
             raise ValueError("project index vector dimensions do not match embedding metadata")

 from __future__ import annotations
+import ast
 from collections.abc import Callable, Sequence
 from dataclasses import dataclass
 from datetime import datetime, timezone
 import json
 import math
 from pathlib import Path
+from pathlib import PurePosixPath
 import re
 from typing import Any
 TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9.+_-]*", re.IGNORECASE)
+HTML_TAG_RE = re.compile(r"<[^>]+>")
 GENERIC_PUBLIC_TITLE_RE = re.compile(
     r"^(?:my\s+)?build\s+small\s+hackathon$",
     re.IGNORECASE,
     re.IGNORECASE,
 )
+INDEX_SCHEMA_VERSION = 3
 INDEX_ALGORITHM = "llama-cpp-embedding-v1"
 DEFAULT_EMBEDDING_MODEL_REPO = "ggml-org/embeddinggemma-300m-qat-q8_0-GGUF"
 DEFAULT_EMBEDDING_MODEL_FILE = "embeddinggemma-300m-qat-Q8_0.gguf"
 DEFAULT_EMBEDDING_RUNTIME = "llama.cpp via llama-cpp-python"
+APP_FILE_EMBEDDING_CHAR_LIMIT = 8000
 EmbeddingFunction = Callable[[str], Sequence[float]]
     last_modified: str
     host: str
     url: str
+    app_file: str = ""
+    app_file_embedding_text: str = ""
     @classmethod
     def from_dict(cls, data: dict) -> "Project":
             last_modified=str(data.get("last_modified") or ""),
             host=str(data.get("host") or ""),
             url=str(data.get("url") or f"https://huggingface.co/spaces/{data['id']}"),
+            app_file=str(data.get("app_file") or ""),
+            app_file_embedding_text=str(data.get("app_file_embedding_text") or ""),
         )
     @property
     @property
     def searchable_text(self) -> str:
+        """Return the labelled, newline-joined text describing this project.
+        The title, slug (hyphens and underscores replaced by spaces), summary,
+        tags, models and datasets lines are always emitted, even for empty
+        values, because their label keeps the part non-blank. The main app
+        file name and its embedding text are appended only when set.
+        """
+        return "\n".join(
+            part
+            for part in [
+                f"title: {self.title}",
+                f"slug: {self.slug.replace('-', ' ').replace('_', ' ')}",
+                f"summary: {self.summary}",
+                f"tags: {' '.join(self.tags)}",
+                f"models: {' '.join(self.models)}",
+                f"datasets: {' '.join(self.datasets)}",
+                # The two app-file parts collapse to "" when unset and are
+                # then dropped by the ``part.strip()`` filter below.
+                f"main app file: {self.app_file}" if self.app_file else "",
+                f"main app file content:\n{self.app_file_embedding_text}"
+                if self.app_file_embedding_text
+                else "",
             ]
+            if part.strip()
         )
     def to_public_dict(self) -> dict:
             "last_modified": self.last_modified,
             "host": self.host,
             "url": self.url,
+            "app_file": self.app_file,
         }
     def to_snapshot_dict(self) -> dict:
             "last_modified": self.last_modified,
             "host": self.host,
             "url": self.url,
+            "app_file": self.app_file,
+            "app_file_embedding_text": self.app_file_embedding_text,
         }
     return cleaned
+def extract_app_file_embedding_text(app_file: str, text: str) -> str:
+    """Return the bounded embedding text for a Space's main app file.
+    Files with a ``.py`` suffix (case-insensitive) are reduced to signals by
+    ``python_app_signals``; any other file is used as-is. The result is then
+    passed through ``bounded_embedding_text`` with
+    ``APP_FILE_EMBEDDING_CHAR_LIMIT``. Returns ``""`` when either the file
+    name or the file content is missing or blank.
+    """
+    # Coerce a missing name the same way as missing content, so ``None`` means
+    # "no app file" instead of a file literally named "None".
+    cleaned_file = str(app_file or "").strip()
+    cleaned_text = str(text or "")
+    if not cleaned_file or not cleaned_text.strip():
+        return ""
+    suffix = PurePosixPath(cleaned_file).suffix.lower()
+    body = python_app_signals(cleaned_text) if suffix == ".py" else cleaned_text
+    return bounded_embedding_text(body, APP_FILE_EMBEDDING_CHAR_LIMIT)
+def python_app_signals(source: str) -> str:
+    """Reduce Python source to newline-separated identifying signals.
+    Signals are function and class names, regular (positional-or-keyword)
+    parameter names, dotted call targets, keyword names used in calls, and
+    string constants. They are collected in the order ``ast.walk`` yields
+    nodes, which is not guaranteed to be source order, then cleaned and
+    de-duplicated by ``ordered_normalized_text``.
+    Source that cannot be parsed is returned unchanged so the caller still
+    has text to embed.
+    """
+    try:
+        tree = ast.parse(source)
+    except (SyntaxError, ValueError, RecursionError):
+        # Crawled files are untrusted input: besides plain syntax errors,
+        # null bytes raise ValueError on older Python versions and deeply
+        # nested code can exhaust the recursion limit while parsing.
+        return source
+    signals: list[str] = []
+    for node in ast.walk(tree):
+        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+            signals.append(node.name)
+            signals.extend(arg.arg for arg in node.args.args)
+        elif isinstance(node, ast.ClassDef):
+            signals.append(node.name)
+        elif isinstance(node, ast.Call):
+            name = call_name(node.func)
+            if name:
+                signals.append(name)
+            # ``keyword.arg`` is None for ``**kwargs`` expansions; skip those.
+            signals.extend(keyword.arg for keyword in node.keywords if keyword.arg)
+        elif isinstance(node, ast.Constant) and isinstance(node.value, str):
+            signals.append(node.value)
+    return ordered_normalized_text(signals)
+def call_name(node: ast.AST) -> str:
+    """Return the dotted name for a call target, or ``""`` if it has none.
+    A bare name yields its identifier. An attribute access yields the dotted
+    path built from its owner, or just the attribute when the owner itself
+    has no name (for example the result of another call).
+    """
+    match node:
+        case ast.Name(id=identifier):
+            return identifier
+        case ast.Attribute(value=owner, attr=attribute):
+            prefix = call_name(owner)
+            if prefix:
+                return f"{prefix}.{attribute}"
+            return attribute
+        case _:
+            return ""
+def ordered_normalized_text(values: Sequence[str]) -> str:
+    """Clean each value and join the distinct results with newlines.
+    Values are passed through ``clean_embedding_signal``; empty results are
+    dropped and only the first occurrence of each cleaned value is kept, in
+    its original position.
+    """
+    cleaned_values = (clean_embedding_signal(value) for value in values)
+    # dict keys preserve insertion order, giving an order-stable de-duplication.
+    distinct = dict.fromkeys(signal for signal in cleaned_values if signal)
+    return "\n".join(distinct)
+def clean_embedding_signal(value: str) -> str:
+    """Strip HTML tags and collapse whitespace in one signal string.
+    Returns ``""`` when the cleaned text is judged to be a style blob by
+    ``looks_like_style_blob``.
+    """
+    without_tags = HTML_TAG_RE.sub(" ", str(value))
+    collapsed = " ".join(without_tags.split())
+    return "" if looks_like_style_blob(collapsed) else collapsed
+def looks_like_style_blob(text: str) -> bool:
+    """Report whether ``text`` is dense with CSS-style punctuation.
+    Text shorter than 80 characters is never a blob. Otherwise the
+    occurrences of braces, semicolons, ``!important``, ``rgba(`` and
+    ``linear-gradient`` are counted; the text is a blob when there are at
+    least 8 of them and they exceed 1.5% of the text length.
+    """
+    length = len(text)
+    if length < 80:
+        return False
+    markers = ("{", "}", ";", "!important", "rgba(", "linear-gradient")
+    hits = sum(text.count(marker) for marker in markers)
+    return hits >= 8 and hits / length > 0.015
+def bounded_embedding_text(text: str, limit: int) -> str:
+    """Collapse whitespace in ``text`` and cap it at ``limit`` characters.
+    Text that fits is returned whole. Longer text keeps an equal-sized head
+    and tail joined by ``" ... "``. When ``limit`` is too small to hold the
+    marker plus one character on each side, the text is simply cut to
+    ``limit`` characters so the result never exceeds the limit.
+    """
+    cleaned = " ".join(str(text).split())
+    if len(cleaned) <= limit:
+        return cleaned
+    marker = " ... "
+    edge = (limit - len(marker)) // 2
+    if edge < 1:
+        # No room for head + marker + tail; a plain cut honours the limit.
+        return cleaned[: max(limit, 0)].rstrip()
+    return f"{cleaned[:edge].rstrip()}{marker}{cleaned[-edge:].lstrip()}"
 @dataclass(frozen=True)
 class WhitespaceSeed:
     label: str
     indexed_ids = [document.get("project_id") for document in documents]
     if indexed_ids != project_ids:
         raise ValueError("project index project order does not match projects snapshot")
+    for project, document in zip(projects, documents, strict=True):
+        if document.get("text_digest") != sha256(project.searchable_text.encode("utf-8")).hexdigest():
+            raise ValueError("project index text digest does not match searchable project text")
         vector = document.get("vector")
         if not isinstance(vector, list) or len(vector) != dimensions:
             raise ValueError("project index vector dimensions do not match embedding metadata")

hackathon_advisor/llama_embedding.py CHANGED Viewed

@@ -1,8 +1,14 @@
 from __future__ import annotations
 from collections.abc import Sequence
-from pathlib import Path
 import os
 from typing import Any
 from hackathon_advisor.data import (
@@ -12,7 +18,8 @@ from hackathon_advisor.data import (
 TRUE_VALUES = {"1", "true", "yes", "on"}
-DEFAULT_N_CTX = 512
 class LlamaCppEmbedder:
@@ -74,8 +81,114 @@ class LlamaCppEmbedder:
         return self._model
-def create_llama_cpp_embedder(metadata: dict[str, Any]) -> LlamaCppEmbedder:
-    return LlamaCppEmbedder(
         model_repo=os.environ.get(
             "ADVISOR_EMBEDDING_MODEL_REPO",
             str(metadata.get("model_repo") or DEFAULT_EMBEDDING_MODEL_REPO),
@@ -111,3 +224,38 @@ def _optional_int_env(name: str) -> int | None:
     if value <= 0:
         raise RuntimeError(f"{name} must be a positive integer.")
     return value

 from __future__ import annotations
 from collections.abc import Sequence
+import atexit
+import json
 import os
+from pathlib import Path
+import platform
+import subprocess
+import sys
+import threading
 from typing import Any
 from hackathon_advisor.data import (
 TRUE_VALUES = {"1", "true", "yes", "on"}
+FALSE_VALUES = {"0", "false", "no", "off"}
+DEFAULT_N_CTX = 2048
 class LlamaCppEmbedder:
         return self._model
+class SubprocessLlamaCppEmbedder:
+    """Embed text through a llama.cpp worker running in a child process.
+    The worker is started lazily with
+    ``python -m hackathon_advisor.llama_embedding --worker`` and driven over
+    line-delimited JSON: one configuration line, then one ``{"id", "text"}``
+    request per line, each answered by a single line carrying the same ``id``
+    and either a ``vector`` or an ``error``. ``embed`` serialises callers with
+    an internal lock; ``close`` does not take that lock.
+    """
+    def __init__(
+        self,
+        *,
+        model_repo: str = DEFAULT_EMBEDDING_MODEL_REPO,
+        model_file: str = DEFAULT_EMBEDDING_MODEL_FILE,
+        model_path: str = "",
+        n_ctx: int = DEFAULT_N_CTX,
+        n_batch: int | None = None,
+        n_threads: int | None = None,
+        n_gpu_layers: int = 0,
+        verbose: bool = False,
+    ) -> None:
+        """Store the worker configuration; no process is started yet.
+        Blank ``model_repo``/``model_file`` fall back to the module defaults
+        and ``n_batch`` defaults to ``n_ctx``.
+        """
+        self.model_repo = model_repo.strip() or DEFAULT_EMBEDDING_MODEL_REPO
+        self.model_file = model_file.strip() or DEFAULT_EMBEDDING_MODEL_FILE
+        self.model_path = model_path.strip()
+        self.n_ctx = n_ctx
+        self.n_batch = n_batch or n_ctx
+        self.n_threads = n_threads
+        self.n_gpu_layers = n_gpu_layers
+        self.verbose = verbose
+        self._process: subprocess.Popen[str] | None = None
+        self._request_id = 0
+        self._lock = threading.Lock()
+        # Make sure the worker does not outlive the interpreter.
+        atexit.register(self.close)
+    def __call__(self, text: str) -> Sequence[float]:
+        """Alias for :meth:`embed` so the instance can be used as a callable."""
+        return self.embed(text)
+    def embed(self, text: str) -> Sequence[float]:
+        """Return the embedding vector the worker produces for ``text``.
+        Raises:
+            RuntimeError: if the worker cannot be reached, exits, breaks the
+                line protocol, or reports an error for this request.
+        """
+        with self._lock:
+            process = self._ensure_process()
+            self._request_id += 1
+            request_id = self._request_id
+            # ASCII-escaped JSON decodes to the same text in the worker while
+            # keeping the pipe independent of either side's locale encoding.
+            request = json.dumps({"id": request_id, "text": text})
+            stdin, stdout = process.stdin, process.stdout
+            if stdin is None or stdout is None:
+                self.close()
+                raise RuntimeError("llama.cpp embedding worker was started without pipes.")
+            try:
+                stdin.write(f"{request}\n")
+                stdin.flush()
+                line = stdout.readline()
+            except OSError as error:
+                self.close()
+                raise RuntimeError("llama.cpp embedding worker stopped before returning a vector.") from error
+            if not line:
+                returncode = process.poll()
+                self.close()
+                detail = f" with exit code {returncode}" if returncode is not None else ""
+                raise RuntimeError(f"llama.cpp embedding worker exited{detail}.")
+            try:
+                response = json.loads(line)
+            except json.JSONDecodeError as error:
+                # The stream can no longer be trusted to be line-aligned;
+                # drop the worker so the next call starts from a clean one.
+                self.close()
+                raise RuntimeError("llama.cpp embedding worker returned invalid JSON.") from error
+            if not isinstance(response, dict):
+                self.close()
+                raise RuntimeError("llama.cpp embedding worker returned invalid JSON.")
+            if response.get("id") != request_id:
+                # A stale reply would shift every later request by one line.
+                self.close()
+                raise RuntimeError("llama.cpp embedding worker returned an out-of-order response.")
+            if response.get("error"):
+                raise RuntimeError(str(response["error"]))
+            vector = response.get("vector")
+            if not isinstance(vector, list):
+                raise RuntimeError("llama.cpp embedding worker did not return a vector.")
+            return vector
+    def close(self) -> None:
+        """Stop the worker, if any, and release its pipes.
+        A running worker is terminated, then killed if it has not exited
+        within two seconds. Safe to call repeatedly.
+        """
+        process = self._process
+        self._process = None
+        if process is None:
+            return
+        try:
+            if process.poll() is None:
+                process.terminate()
+                try:
+                    process.wait(timeout=2)
+                except subprocess.TimeoutExpired:
+                    process.kill()
+                    process.wait(timeout=2)
+        finally:
+            for pipe in (process.stdin, process.stdout):
+                if pipe is None:
+                    continue
+                try:
+                    pipe.close()
+                except OSError:
+                    # Best effort: flushing into a dead worker may fail.
+                    pass
+    def _ensure_process(self) -> subprocess.Popen[str]:
+        """Return the live worker, starting and configuring one if needed.
+        Raises:
+            RuntimeError: if the new worker dies before reading its
+                configuration line.
+        """
+        if self._process is not None and self._process.poll() is None:
+            return self._process
+        # Release the pipes of a worker that exited on its own.
+        self.close()
+        process = subprocess.Popen(
+            [sys.executable, "-u", "-m", "hackathon_advisor.llama_embedding", "--worker"],
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=None if self.verbose else subprocess.DEVNULL,
+            text=True,
+            cwd=Path(__file__).resolve().parents[1],
+        )
+        self._process = process
+        config = json.dumps(
+            {
+                "model_repo": self.model_repo,
+                "model_file": self.model_file,
+                "model_path": self.model_path,
+                "n_ctx": self.n_ctx,
+                "n_batch": self.n_batch,
+                "n_threads": self.n_threads,
+                "n_gpu_layers": self.n_gpu_layers,
+                "verbose": self.verbose,
+            }
+        )
+        try:
+            if process.stdin is None:
+                raise OSError("worker stdin is not a pipe")
+            process.stdin.write(f"{config}\n")
+            process.stdin.flush()
+        except OSError as error:
+            self.close()
+            raise RuntimeError(
+                "llama.cpp embedding worker stopped before accepting its configuration."
+            ) from error
+        return process
+def create_llama_cpp_embedder(metadata: dict[str, Any]) -> LlamaCppEmbedder | SubprocessLlamaCppEmbedder:
+    embedder_cls = SubprocessLlamaCppEmbedder if _use_subprocess_embedder() else LlamaCppEmbedder
+    return embedder_cls(
         model_repo=os.environ.get(
             "ADVISOR_EMBEDDING_MODEL_REPO",
             str(metadata.get("model_repo") or DEFAULT_EMBEDDING_MODEL_REPO),
     if value <= 0:
         raise RuntimeError(f"{name} must be a positive integer.")
     return value
+def _use_subprocess_embedder() -> bool:
+    """Decide whether query embedding should run in a worker process.
+    ``ADVISOR_EMBEDDING_SUBPROCESS`` wins when it holds a recognised true or
+    false value. Otherwise the worker is used only on macOS ("Darwin") when
+    ``ADVISOR_MODEL_BACKEND`` is ``minicpm`` or ``minicpm-transformers``.
+    """
+    override = os.environ.get("ADVISOR_EMBEDDING_SUBPROCESS", "").strip().lower()
+    if override in TRUE_VALUES or override in FALSE_VALUES:
+        return override in TRUE_VALUES
+    if platform.system() != "Darwin":
+        return False
+    backend = os.environ.get("ADVISOR_MODEL_BACKEND", "").strip().lower()
+    return backend in {"minicpm", "minicpm-transformers"}
+def _worker_loop() -> None:
+    """Serve embedding requests on stdin/stdout until stdin is closed.
+    The first line is a JSON object of ``LlamaCppEmbedder`` keyword
+    arguments. Every following non-blank line is a JSON request with ``id``
+    and ``text``; each is answered with exactly one JSON line carrying the
+    same ``id`` and either ``vector`` or ``error``.
+    """
+    config_line = sys.stdin.readline()
+    if not config_line:
+        return
+    embedder = LlamaCppEmbedder(**json.loads(config_line))
+    for line in sys.stdin:
+        if not line.strip():
+            continue
+        request = json.loads(line)
+        request_id = request.get("id")
+        try:
+            vector = list(embedder.embed(str(request.get("text") or "")))
+            # Serialise inside the ``try`` so a vector that is not JSON
+            # encodable is reported to the parent instead of killing the worker.
+            payload = json.dumps({"id": request_id, "vector": vector})
+        except Exception as error:
+            # An empty message would be indistinguishable from "no error"
+            # for a reader that tests the field for truthiness.
+            payload = json.dumps({"id": request_id, "error": str(error) or repr(error)})
+        print(payload, flush=True)
+if __name__ == "__main__":
+    # The module's only command-line mode is the internal embedding worker.
+    if sys.argv[1:] != ["--worker"]:
+        raise SystemExit("usage: python -m hackathon_advisor.llama_embedding --worker")
+    _worker_loop()

scripts/build_project_index.py CHANGED Viewed

@@ -81,6 +81,7 @@ def build_payload(
         "build_source": build_source,
         "builder": builder,
         "llama_cpp_python_version": importlib.metadata.version("llama-cpp-python"),
     }
     if modal_app:
         metadata["modal_app"] = modal_app

         "build_source": build_source,
         "builder": builder,
         "llama_cpp_python_version": importlib.metadata.version("llama-cpp-python"),
+        "n_ctx": n_ctx,
     }
     if modal_app:
         metadata["modal_app"] = modal_app

scripts/crawl_hf_spaces.py CHANGED Viewed

@@ -5,11 +5,16 @@ import argparse
 from datetime import datetime, timezone
 import json
 from pathlib import Path
-import time
 from typing import Any
-from urllib.error import HTTPError
-from urllib.parse import quote
-from urllib.request import Request, urlopen
 API = "https://huggingface.co/api"
@@ -19,20 +24,13 @@ def main() -> None:
     parser = argparse.ArgumentParser(description="Snapshot public Spaces in a Hugging Face org.")
     parser.add_argument("--org", default="build-small-hackathon")
     parser.add_argument("--out", default="data/projects.json")
-    parser.add_argument("--limit", type=int, default=100)
     args = parser.parse_args()
-    spaces = fetch_json(f"{API}/spaces?author={quote(args.org)}&limit={args.limit}")
-    projects = []
-    for item in spaces:
-        space_id = item["id"]
-        detail = fetch_json(f"{API}/spaces/{quote(space_id, safe='/')}")
-        projects.append(project_from_detail(detail))
-        time.sleep(0.05)
     payload = {
         "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
-        "source": f"{API}/spaces?author={args.org}&limit={args.limit}",
         "projects": sorted(projects, key=lambda project: project["id"].lower()),
     }
     output = Path(args.out)
@@ -41,38 +39,141 @@ def main() -> None:
     print(f"wrote {len(projects)} projects to {output}")
-def fetch_json(url: str) -> Any:
-    request = Request(url, headers={"User-Agent": "hackathon-advisor-crawler/0.1"})
-    try:
-        with urlopen(request, timeout=30) as response:
-            return json.loads(response.read().decode("utf-8"))
-    except HTTPError as error:
-        raise RuntimeError(f"failed to fetch {url}: {error.code}") from error
-def project_from_detail(detail: dict[str, Any]) -> dict[str, Any]:
-    card = detail.get("cardData") or {}
-    space_id = str(detail["id"])
     title = str(card.get("title") or humanize_slug(space_id.rsplit("/", 1)[-1]))
     summary = str(card.get("short_description") or card.get("description") or "")
-    tags = sorted(set(str(tag) for tag in (card.get("tags") or detail.get("tags") or [])))
     return {
         "id": space_id,
         "title": title,
         "summary": summary,
-        "tags": tags,
-        "models": [str(model) for model in detail.get("models") or card.get("models") or []],
-        "datasets": [str(dataset) for dataset in detail.get("datasets") or card.get("datasets") or []],
-        "likes": int(detail.get("likes") or 0),
-        "sdk": str(card.get("sdk") or detail.get("sdk") or ""),
         "license": str(card.get("license") or ""),
-        "created_at": str(detail.get("createdAt") or ""),
-        "last_modified": str(detail.get("lastModified") or ""),
-        "host": str(detail.get("host") or ""),
         "url": f"https://huggingface.co/spaces/{space_id}",
     }
 def humanize_slug(slug: str) -> str:
     return " ".join(part for part in slug.replace("_", "-").split("-") if part).title()

 from datetime import datetime, timezone
 import json
 from pathlib import Path
+from pathlib import PurePosixPath
+import sys
 from typing import Any
+from huggingface_hub import HfApi, hf_hub_download
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+from hackathon_advisor.data import extract_app_file_embedding_text
 API = "https://huggingface.co/api"
     parser = argparse.ArgumentParser(description="Snapshot public Spaces in a Hugging Face org.")
     parser.add_argument("--org", default="build-small-hackathon")
     parser.add_argument("--out", default="data/projects.json")
     args = parser.parse_args()
+    projects = crawl_projects(args.org)
     payload = {
         "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
+        "source": f"{API}/spaces?author={args.org}",
         "projects": sorted(projects, key=lambda project: project["id"].lower()),
     }
     output = Path(args.out)
     print(f"wrote {len(projects)} projects to {output}")
+def crawl_projects(org: str) -> list[dict[str, Any]]:
+    """Build one project record for every non-private Space listed for ``org``.
+
+    The listing is requested with ``full=True`` and ``token=False``; each
+    listed Space whose ``private`` attribute is falsy (or absent) is converted
+    with ``project_from_space``.
+    """
+    client = HfApi(token=False)
+    projects: list[dict[str, Any]] = []
+    for space in client.list_spaces(author=org, full=True, token=False):
+        if getattr(space, "private", False):
+            continue
+        projects.append(project_from_space(space))
+    return projects
+def project_from_space(space: Any) -> dict[str, Any]:
+    """Convert one listed Space into the project record stored in the snapshot.
+
+    The README is downloaded only when it appears in the Space's file list.
+    If its frontmatter declares an ``app_file``, that path is validated, the
+    file is downloaded, and its text is passed to
+    ``extract_app_file_embedding_text``.
+
+    Raises:
+        RuntimeError: If the declared ``app_file`` path is rejected by
+            ``validate_app_file`` or is not among the Space's files.
+    """
+    card = card_data(space)
+    space_id = str(space.id)
+    siblings = sibling_names(space)
+    # Skip the download entirely when the Space has no README.
+    readme = download_repo_text(space_id, "README.md") if "README.md" in siblings else ""
+    frontmatter = readme_frontmatter(readme)
+    app_file = validate_app_file(str(frontmatter.get("app_file") or ""), space_id=space_id)
+    app_file_embedding_text = ""
+    if app_file:
+        # Fail before attempting a download of a file the listing does not contain.
+        if app_file not in siblings:
+            raise RuntimeError(f"{space_id} README frontmatter points to missing app_file: {app_file}")
+        app_file_embedding_text = extract_app_file_embedding_text(
+            app_file,
+            download_repo_text(space_id, app_file),
+        )
+    # Fall back to a title derived from the repo name when the card has none.
     title = str(card.get("title") or humanize_slug(space_id.rsplit("/", 1)[-1]))
     summary = str(card.get("short_description") or card.get("description") or "")
     return {
         "id": space_id,
         "title": title,
         "summary": summary,
+        "tags": sorted(set(str(tag) for tag in (card.get("tags") or getattr(space, "tags", None) or []))),
+        "models": [str(model) for model in getattr(space, "models", None) or card.get("models") or []],
+        "datasets": [
+            str(dataset) for dataset in getattr(space, "datasets", None) or card.get("datasets") or []
+        ],
+        "likes": int(getattr(space, "likes", None) or 0),
+        "sdk": str(card.get("sdk") or getattr(space, "sdk", None) or ""),
         "license": str(card.get("license") or ""),
+        "created_at": isoformat(getattr(space, "created_at", None)),
+        "last_modified": isoformat(getattr(space, "last_modified", None)),
+        "host": host_url(space),
         "url": f"https://huggingface.co/spaces/{space_id}",
+        "app_file": app_file,
+        "app_file_embedding_text": app_file_embedding_text,
     }
+def card_data(space: Any) -> dict[str, Any]:
+    """Return the Space's card metadata as a plain dict.
+
+    Reads ``space.card_data`` first and ``space.cardData`` second. A dict is
+    returned as is, an object exposing ``to_dict()`` is converted, and
+    anything else (including a missing or empty value) yields ``{}``.
+    """
+    raw = getattr(space, "card_data", None)
+    if not raw:
+        raw = getattr(space, "cardData", None)
+    if not raw:
+        return {}
+    if isinstance(raw, dict):
+        return raw
+    converter = getattr(raw, "to_dict", None)
+    return dict(converter()) if callable(converter) else {}
+def sibling_names(space: Any) -> set[str]:
+    """Return the ``rfilename`` of every entry in ``space.siblings`` as a set.
+
+    A missing, ``None`` or empty ``siblings`` attribute gives an empty set.
+    """
+    entries = getattr(space, "siblings", None)
+    if not entries:
+        return set()
+    return {str(entry.rfilename) for entry in entries}
+def download_repo_text(repo_id: str, filename: str) -> str:
+    """Download ``filename`` from the Space ``repo_id`` and return it decoded as UTF-8."""
+    local_copy = Path(
+        hf_hub_download(
+            repo_id=repo_id,
+            repo_type="space",
+            filename=filename,
+            token=False,
+            etag_timeout=30,
+        )
+    )
+    return local_copy.read_text(encoding="utf-8")
+def readme_frontmatter(readme: str) -> dict[str, str]:
+    """Parse the top-level ``key: value`` lines of a README frontmatter block.
+
+    The block must open with ``---`` on the first line and be closed by a
+    ``---`` or ``...`` line; otherwise an empty dict is returned. Blank
+    lines, comment lines, indented lines and list items are skipped, so only
+    unindented ``key: value`` lines are collected.
+    """
+    rows = readme.splitlines()
+    if not rows or rows[0].strip() != "---":
+        return {}
+    parsed: dict[str, str] = {}
+    for row in rows[1:]:
+        text = row.strip()
+        if text in {"---", "..."}:
+            # Closing fence reached: the block is well formed.
+            return parsed
+        if not text or text[0] in "#-" or row[0].isspace() or ":" not in row:
+            continue
+        name, _, remainder = row.partition(":")
+        name = name.strip()
+        if name:
+            parsed[name] = yaml_scalar(remainder)
+    # No closing fence was found, so nothing that was parsed is returned.
+    return {}
+def yaml_scalar(raw_value: str) -> str:
+    """Reduce the raw text after ``key:`` to its plain string content.
+
+    Surrounding whitespace is stripped, one layer of matching single or
+    double quotes is removed, and a trailing ``# comment`` is dropped.
+    Quotes are resolved before comments so that a ``#`` inside a quoted
+    value (``"Issue #5"``) is kept as data instead of truncating the value.
+    Escape sequences inside quotes are not interpreted.
+
+    Args:
+        raw_value: Text following the first ``:`` of a frontmatter line.
+
+    Returns:
+        The cleaned scalar, or ``""`` when the value is empty or consists
+        only of a comment.
+    """
+    value = raw_value.strip()
+    if not value or value.startswith("#"):
+        return ""
+    quote = value[0]
+    if quote in {"'", '"'}:
+        closing = value.find(quote, 1)
+        if closing != -1:
+            trailer = value[closing + 1 :]
+            # Accept the quoted span only when nothing but an optional,
+            # whitespace-separated comment follows it; anything else falls
+            # back to the plain handling below.
+            if not trailer or (trailer[0].isspace() and trailer.lstrip().startswith("#")):
+                return value[1:closing]
+    if " #" in value:
+        value = value.split(" #", 1)[0].rstrip()
+    if value[:1] in {"'", '"'} and value[-1:] == value[:1]:
+        return value[1:-1]
+    return value
+def validate_app_file(app_file: str, *, space_id: str) -> str:
+    """Normalize the ``app_file`` frontmatter value into a relative POSIX path.
+
+    An empty or whitespace-only value returns ``""``.
+
+    Raises:
+        RuntimeError: If the path is absolute, contains a ``..`` component,
+            or ends with ``/``.
+    """
+    candidate = app_file.strip()
+    if candidate == "":
+        return ""
+    posix = PurePosixPath(candidate)
+    escapes_repo = posix.is_absolute() or any(part == ".." for part in posix.parts)
+    if escapes_repo or candidate.endswith("/"):
+        raise RuntimeError(f"{space_id} README frontmatter has an invalid app_file path: {app_file}")
+    return posix.as_posix()
+def isoformat(value: Any) -> str:
+    """Render ``value`` as a string, preferring its own ``isoformat()`` method.
+
+    ``None`` becomes ``""``; objects without a callable ``isoformat`` are
+    passed through ``str``.
+    """
+    if value is None:
+        return ""
+    method = getattr(value, "isoformat", None)
+    return method() if callable(method) else str(value)
+def host_url(space: Any) -> str:
+    """Return the Space's ``host``, or a URL built from its ``subdomain``.
+
+    When ``host`` is missing or empty the result is
+    ``https://<subdomain>.hf.space``; with neither attribute it is ``""``.
+    """
+    explicit = str(getattr(space, "host", None) or "")
+    if explicit:
+        return explicit
+    subdomain = str(getattr(space, "subdomain", None) or "")
+    if not subdomain:
+        return ""
+    return f"https://{subdomain}.hf.space"
 def humanize_slug(slug: str) -> str:
+    """Turn a repo slug into a title: split on ``-``/``_``, drop empty parts, title-case."""
     return " ".join(part for part in slug.replace("_", "-").split("-") if part).title()

tests/test_crawl_hf_spaces.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from __future__ import annotations
+from types import SimpleNamespace
+import pytest
+from scripts import crawl_hf_spaces
+def test_readme_frontmatter_extracts_app_file() -> None:
+    """A quoted ``app_file`` with a trailing comment parses to the bare path."""
+    frontmatter = crawl_hf_spaces.readme_frontmatter(
+        """---
+title: Tiny Demo
+app_file: "src/app.py" # main entrypoint
+tags:
+  - gradio
+---
+# Tiny Demo
+"""
+    )
+    assert frontmatter["app_file"] == "src/app.py"
+def test_validate_app_file_rejects_untrusted_paths() -> None:
+    """A path containing ``..`` is rejected with the invalid-path RuntimeError."""
+    with pytest.raises(RuntimeError, match="invalid app_file path"):
+        crawl_hf_spaces.validate_app_file("../app.py", space_id="build-small-hackathon/demo")
+def test_project_from_space_downloads_frontmatter_app_file(monkeypatch) -> None:
+    """The app file named in README frontmatter is fetched and feeds the embedding text."""
+    # In-memory stand-in for the files the crawler would download.
+    downloads = {
+        ("build-small-hackathon/demo", "README.md"): "---\napp_file: app.py\n---\n",
+        ("build-small-hackathon/demo", "app.py"): "import gradio as gr\ngr.Textbox(label='Idea')\n",
+    }
+    def fake_download(repo_id: str, filename: str) -> str:
+        return downloads[(repo_id, filename)]
+    # Replace the downloader so project_from_space reads from the dict above.
+    monkeypatch.setattr(crawl_hf_spaces, "download_repo_text", fake_download)
+    space = SimpleNamespace(
+        id="build-small-hackathon/demo",
+        card_data={"title": "Demo", "short_description": "Advisor demo", "sdk": "gradio"},
+        siblings=[
+            SimpleNamespace(rfilename="README.md"),
+            SimpleNamespace(rfilename="app.py"),
+        ],
+        tags=["gradio"],
+        models=[],
+        datasets=[],
+        likes=3,
+        created_at=None,
+        last_modified=None,
+        host="https://example.test",
+        private=False,
+    )
+    project = crawl_hf_spaces.project_from_space(space)
+    assert project["app_file"] == "app.py"
+    assert "gr.Textbox" in project["app_file_embedding_text"]
+    assert "Idea" in project["app_file_embedding_text"]

tests/test_data.py CHANGED Viewed

@@ -3,7 +3,12 @@ from pathlib import Path
 from tests.helpers import load_test_index
 import json
-from hackathon_advisor.data import Project, ProjectIndex, public_project_summary, public_project_title
 def test_project_index_searches_snapshot() -> None:
@@ -54,6 +59,32 @@ def test_public_project_cards_hide_generic_submission_copy() -> None:
     assert public["summary"] == ""
 def test_project_index_rejects_mismatched_snapshot(tmp_path: Path) -> None:
     payload = json.loads(Path("data/project_index.json").read_text(encoding="utf-8"))
     payload["snapshot_generated_at"] = "2000-01-01T00:00:00+00:00"

 from tests.helpers import load_test_index
 import json
+from hackathon_advisor.data import (
+    Project,
+    ProjectIndex,
+    public_project_summary,
+    public_project_title,
+)
 def test_project_index_searches_snapshot() -> None:
     assert public["summary"] == ""
+def test_searchable_text_includes_main_app_file_signals() -> None:
+    """``searchable_text`` exposes the app file name and its embedding text."""
+    project = Project(
+        id="build-small-hackathon/idea-canvas",
+        title="Idea Canvas",
+        summary="",
+        tags=("gradio",),
+        models=(),
+        datasets=(),
+        likes=0,
+        sdk="gradio",
+        license="",
+        created_at="",
+        last_modified="",
+        host="",
+        url="https://example.test",
+        app_file="app.py",
+        app_file_embedding_text="score_idea\ngr.Textbox\nProject idea",
+    )
+    searchable = project.searchable_text
+    assert "main app file: app.py" in searchable
+    assert "score_idea" in searchable
+    assert "Project idea" in searchable
 def test_project_index_rejects_mismatched_snapshot(tmp_path: Path) -> None:
     payload = json.loads(Path("data/project_index.json").read_text(encoding="utf-8"))
     payload["snapshot_generated_at"] = "2000-01-01T00:00:00+00:00"

tests/test_llama_embedding.py CHANGED Viewed

@@ -3,7 +3,12 @@ import sys
 from types import ModuleType
 from hackathon_advisor.data import DEFAULT_EMBEDDING_MODEL_FILE, DEFAULT_EMBEDDING_MODEL_REPO
-from hackathon_advisor.llama_embedding import DEFAULT_N_CTX, LlamaCppEmbedder, create_llama_cpp_embedder
 def test_llama_embedder_uses_q8_defaults_and_configured_context(
@@ -60,3 +65,33 @@ def test_create_llama_embedder_accepts_explicit_batch(monkeypatch) -> None:
     embedder = create_llama_cpp_embedder({"dimensions": 768})
     assert embedder.n_batch == 256

 from types import ModuleType
 from hackathon_advisor.data import DEFAULT_EMBEDDING_MODEL_FILE, DEFAULT_EMBEDDING_MODEL_REPO
+from hackathon_advisor.llama_embedding import (
+    DEFAULT_N_CTX,
+    LlamaCppEmbedder,
+    SubprocessLlamaCppEmbedder,
+    create_llama_cpp_embedder,
+)
 def test_llama_embedder_uses_q8_defaults_and_configured_context(
     embedder = create_llama_cpp_embedder({"dimensions": 768})
     assert embedder.n_batch == 256
+def test_create_llama_embedder_can_isolate_native_runtime(monkeypatch) -> None:
+    """``ADVISOR_EMBEDDING_SUBPROCESS=1`` makes the factory return the subprocess embedder."""
+    monkeypatch.setenv("ADVISOR_EMBEDDING_SUBPROCESS", "1")
+    embedder = create_llama_cpp_embedder({"dimensions": 768})
+    assert isinstance(embedder, SubprocessLlamaCppEmbedder)
+    embedder.close()
+def test_create_llama_embedder_isolates_macos_minicpm_runtime(monkeypatch) -> None:
+    """With the env override unset, Darwin plus the minicpm-transformers backend selects the subprocess embedder."""
+    monkeypatch.delenv("ADVISOR_EMBEDDING_SUBPROCESS", raising=False)
+    monkeypatch.setenv("ADVISOR_MODEL_BACKEND", "minicpm-transformers")
+    monkeypatch.setattr("hackathon_advisor.llama_embedding.platform.system", lambda: "Darwin")
+    embedder = create_llama_cpp_embedder({"dimensions": 768})
+    assert isinstance(embedder, SubprocessLlamaCppEmbedder)
+    embedder.close()
+def test_create_llama_embedder_keeps_in_process_when_isolation_disabled(monkeypatch) -> None:
+    """``ADVISOR_EMBEDDING_SUBPROCESS=0`` keeps the in-process embedder even on Darwin with minicpm-transformers."""
+    monkeypatch.setenv("ADVISOR_EMBEDDING_SUBPROCESS", "0")
+    monkeypatch.setenv("ADVISOR_MODEL_BACKEND", "minicpm-transformers")
+    monkeypatch.setattr("hackathon_advisor.llama_embedding.platform.system", lambda: "Darwin")
+    embedder = create_llama_cpp_embedder({"dimensions": 768})
+    assert isinstance(embedder, LlamaCppEmbedder)