JacobLinCool committed on
Commit
d0718ca
·
verified ·
1 Parent(s): f5031de

feat: embed app file signals in project index

Browse files
.gitattributes CHANGED
@@ -1,2 +1,3 @@
1
  # Auto detect text files and perform LF normalization
2
  * text=auto
 
 
1
  # Auto detect text files and perform LF normalization
2
  * text=auto
3
+ static/assets/parchment.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -58,9 +58,11 @@ python scripts/generate_sample_trace.py --projects data/projects.json --index da
58
  ```
59
 
60
  The app uses `data/projects.json` and `data/project_index.json` at runtime. The index validates the snapshot timestamp,
61
- source, project order, digest, embedding dimensions, and normalized vector shape before the app starts. The canonical
62
- index is built on Modal with `ggml-org/embeddinggemma-300m-qat-q8_0-GGUF` through llama.cpp; runtime search embeds the
63
- user query with the same GGUF model and performs local cosine search over the checked-in vectors.
 
 
64
 
65
  ## Trace Artifact
66
 
@@ -194,6 +196,9 @@ ADVISOR_ASR_MODEL_ID=nvidia/nemotron-speech-streaming-en-0.6b
194
  `agent_turn` wraps the engine call with `spaces.GPU` when `ADVISOR_ZERO_GPU=1`, so model loading and generation run on
195
  the ZeroGPU allocation. The retrieval query embedder downloads the GGUF model through `huggingface_hub` unless
196
  `ADVISOR_EMBEDDING_MODEL_PATH` points to a local file. `/api/transcribe` uses the same ZeroGPU wrapper for Nemotron ASR.
 
 
 
197
  Local tests and CPU-only development still default to `ADVISOR_MODEL_BACKEND=rules`.
198
 
199
  ## Test
 
58
  ```
59
 
60
  The app uses `data/projects.json` and `data/project_index.json` at runtime. The index validates the snapshot timestamp,
61
+ source, project order, searchable text digest, embedding dimensions, and normalized vector shape before the app starts.
62
+ The crawler snapshots every public Space in the org and, when README frontmatter declares `app_file`, includes that main
63
+ app file as the highest-signal project evidence for embedding. The canonical index is built on Modal with
64
+ `ggml-org/embeddinggemma-300m-qat-q8_0-GGUF` through llama.cpp; runtime search embeds the user query with the same GGUF
65
+ model and performs local cosine search over the checked-in vectors.
66
 
67
  ## Trace Artifact
68
 
 
196
  `agent_turn` wraps the engine call with `spaces.GPU` when `ADVISOR_ZERO_GPU=1`, so model loading and generation run on
197
  the ZeroGPU allocation. The retrieval query embedder downloads the GGUF model through `huggingface_hub` unless
198
  `ADVISOR_EMBEDDING_MODEL_PATH` points to a local file. `/api/transcribe` uses the same ZeroGPU wrapper for Nemotron ASR.
199
+ On macOS local runs with `ADVISOR_MODEL_BACKEND=minicpm-transformers`, the app automatically runs llama.cpp query
200
+ embedding in a worker process so the MiniCPM PyTorch runtime and llama.cpp do not load conflicting OpenMP runtimes in
201
+ the same Python process.
202
  Local tests and CPU-only development still default to `ADVISOR_MODEL_BACKEND=rules`.
203
 
204
  ## Test
data/project_index.json CHANGED
The diff for this file is too large to render. See raw diff
 
data/projects.json CHANGED
The diff for this file is too large to render. See raw diff
 
data/sample_trace.jsonl CHANGED
@@ -1,4 +1,4 @@
1
- {"app": "hackathon-advisor", "generated_at": "2026-06-07T08:17:29+00:00", "idea_count": 2, "index": {"algorithm": "llama-cpp-embedding-v1", "index_generated_at": "2026-06-07T08:16:19+00:00", "snapshot_digest": "1d59c168b446bc1268ef648897fac523e85b258fd6048a37409b9cfbc5c28329", "snapshot_generated_at": "2026-06-06T19:20:47+00:00"}, "schema_version": 1, "turn_count": 3, "type": "trace_manifest"}
2
- {"artifact_title": "A local-first archive cartographer for family photos", "input": "A local-first archive cartographer for family photos", "overall": 4.6, "plan_steps": 0, "response": "The ink bleeds around A local-first archive cartographer for family photos. Closest echoes: page 30: Family Bill Assistant; page 50: Local in 30s — Lore Lens; page 41: Investigative News Agent. The seal reads ECHO x4 at 4.6/10. Keep the audience, but change the mechanism or artifact so the demo proves a gap instead of joining a cluster.", "schema_version": 1, "tool_resolution": {"call": {"arguments": {"pitch": "A local-first archive cartographer for family photos", "title": "A local-first archive cartographer for family photos"}, "name": "save_idea"}, "errors": [], "status": "valid"}, "tools": [{"name": "save_idea", "summary": "Wrote idea page 'A local-first archive cartographer for family photos'."}, {"name": "search_projects", "summary": "Checked 5 closest project echoes."}, {"name": "score_idea", "summary": "Pressed a five-quadrant seal: 4.6/10."}], "turn_index": 1, "type": "agent_turn", "verdict": "ECHO x4"}
3
- {"artifact_title": "Personal archive cartographer", "input": "write bolder and find whitespace", "overall": 4.8, "plan_steps": 0, "response": "Gold gathers on Personal archive cartographer. A tiny model that maps a private archive into stories without sending it to cloud APIs. Nearest echoes are weak: Dream Museum, Storybook. The seal reads ECHO x4 at 4.8/10. The next move is to make one concrete before/after scene and cite the two weakest nearby echoes in the margin.", "schema_version": 1, "tool_resolution": {"call": {"arguments": {}, "name": "find_whitespace"}, "errors": [], "status": "valid"}, "tools": [{"name": "find_whitespace", "summary": "Ranked 4 under-explored regions."}, {"name": "save_idea", "summary": "Wrote idea page 'Personal archive cartographer'."}, {"name": "score_idea", "summary": "Pressed a five-quadrant seal: 4.8/10."}], "turn_index": 2, "type": "agent_turn", "verdict": "ECHO x4"}
4
- {"artifact_title": "Personal archive cartographer", "input": "make a build plan", "overall": 4.8, "plan_steps": 6, "response": "The wax seal for Personal archive cartographer reads 4.8/10, ECHO x4. The build path is: 1. Lock a one-sentence promise and one test input that proves what is different. 2. Compare against the nearest echoes, then sharpen the part only this idea can own. 3. Build the smallest happy path: input, nearby project citations, score, and one shareable output. 4. Add one selected-goal feature only after the core loop is smooth enough to explain without narration. 5. Collect successful advisor examples before training a tiny LoRA. 6. Write build notes from the exact decisions, screenshots, and outputs.", "schema_version": 1, "tool_resolution": {"call": {"arguments": {}, "name": "make_plan"}, "errors": [], "status": "valid"}, "tools": [{"name": "score_idea", "summary": "Pressed a five-quadrant seal: 4.8/10."}, {"name": "make_plan", "summary": "Drafted 6 build steps."}], "turn_index": 3, "type": "agent_turn", "verdict": "ECHO x4"}
 
1
+ {"app": "hackathon-advisor", "generated_at": "2026-06-07T11:52:22+00:00", "idea_count": 2, "index": {"algorithm": "llama-cpp-embedding-v1", "index_generated_at": "2026-06-07T11:52:16+00:00", "snapshot_digest": "a442d8146fa01965567a27e43d42587de7048b93352e016151a524d2ecbf2034", "snapshot_generated_at": "2026-06-07T11:51:09+00:00"}, "schema_version": 1, "turn_count": 3, "type": "trace_manifest"}
2
+ {"artifact_title": "A local-first archive cartographer for family photos", "input": "A local-first archive cartographer for family photos", "overall": 4.6, "plan_steps": 0, "response": "The ink bleeds around A local-first archive cartographer for family photos. Closest echoes: page 67: Mini Fam; page 38: Family Bill Assistant; page 84: PawMap. The seal reads ECHO x4 at 4.6/10. Keep the audience, but change the mechanism or artifact so the demo proves a gap instead of joining a cluster.", "schema_version": 1, "tool_resolution": {"call": {"arguments": {"pitch": "A local-first archive cartographer for family photos", "title": "A local-first archive cartographer for family photos"}, "name": "save_idea"}, "errors": [], "status": "valid"}, "tools": [{"name": "save_idea", "summary": "Wrote idea page 'A local-first archive cartographer for family photos'."}, {"name": "search_projects", "summary": "Checked 5 closest project echoes."}, {"name": "score_idea", "summary": "Pressed a five-quadrant seal: 4.6/10."}], "turn_index": 1, "type": "agent_turn", "verdict": "ECHO x4"}
3
+ {"artifact_title": "Hands-on science coach", "input": "write bolder and find whitespace", "overall": 4.0, "plan_steps": 0, "response": "Gold gathers on Hands-on science coach. A lab-notebook companion that designs safe experiments from household materials. Nearest echoes are weak: Storybook, Yui Home Assisstant. The seal reads ECHO x4 at 4.0/10. The next move is to make one concrete before/after scene and cite the two weakest nearby echoes in the margin.", "schema_version": 1, "tool_resolution": {"call": {"arguments": {}, "name": "find_whitespace"}, "errors": [], "status": "valid"}, "tools": [{"name": "find_whitespace", "summary": "Ranked 4 under-explored regions."}, {"name": "save_idea", "summary": "Wrote idea page 'Hands-on science coach'."}, {"name": "score_idea", "summary": "Pressed a five-quadrant seal: 4.0/10."}], "turn_index": 2, "type": "agent_turn", "verdict": "ECHO x4"}
4
+ {"artifact_title": "Hands-on science coach", "input": "make a build plan", "overall": 4.0, "plan_steps": 6, "response": "The wax seal for Hands-on science coach reads 4.0/10, ECHO x4. The build path is: 1. Lock a one-sentence promise and one test input that proves what is different. 2. Compare against the nearest echoes, then sharpen the part only this idea can own. 3. Build the smallest happy path: input, nearby project citations, score, and one shareable output. 4. Add one selected-goal feature only after the core loop is smooth enough to explain without narration. 5. Collect successful advisor examples before training a tiny LoRA. 6. Write build notes from the exact decisions, screenshots, and outputs.", "schema_version": 1, "tool_resolution": {"call": {"arguments": {}, "name": "make_plan"}, "errors": [], "status": "valid"}, "tools": [{"name": "score_idea", "summary": "Pressed a five-quadrant seal: 4.0/10."}, {"name": "make_plan", "summary": "Drafted 6 build steps."}], "turn_index": 3, "type": "agent_turn", "verdict": "ECHO x4"}
hackathon_advisor/data.py CHANGED
@@ -1,5 +1,6 @@
1
  from __future__ import annotations
2
 
 
3
  from collections.abc import Callable, Sequence
4
  from dataclasses import dataclass
5
  from datetime import datetime, timezone
@@ -7,11 +8,13 @@ from hashlib import sha256
7
  import json
8
  import math
9
  from pathlib import Path
 
10
  import re
11
  from typing import Any
12
 
13
 
14
  TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9.+_-]*", re.IGNORECASE)
 
15
  GENERIC_PUBLIC_TITLE_RE = re.compile(
16
  r"^(?:my\s+)?build\s+small\s+hackathon$",
17
  re.IGNORECASE,
@@ -23,11 +26,12 @@ GENERIC_PUBLIC_SUMMARY_RE = re.compile(
23
  re.IGNORECASE,
24
  )
25
 
26
- INDEX_SCHEMA_VERSION = 2
27
  INDEX_ALGORITHM = "llama-cpp-embedding-v1"
28
  DEFAULT_EMBEDDING_MODEL_REPO = "ggml-org/embeddinggemma-300m-qat-q8_0-GGUF"
29
  DEFAULT_EMBEDDING_MODEL_FILE = "embeddinggemma-300m-qat-Q8_0.gguf"
30
  DEFAULT_EMBEDDING_RUNTIME = "llama.cpp via llama-cpp-python"
 
31
 
32
 
33
  EmbeddingFunction = Callable[[str], Sequence[float]]
@@ -48,6 +52,8 @@ class Project:
48
  last_modified: str
49
  host: str
50
  url: str
 
 
51
 
52
  @classmethod
53
  def from_dict(cls, data: dict) -> "Project":
@@ -65,6 +71,8 @@ class Project:
65
  last_modified=str(data.get("last_modified") or ""),
66
  host=str(data.get("host") or ""),
67
  url=str(data.get("url") or f"https://huggingface.co/spaces/{data['id']}"),
 
 
68
  )
69
 
70
  @property
@@ -73,15 +81,21 @@ class Project:
73
 
74
  @property
75
  def searchable_text(self) -> str:
76
- return " ".join(
77
- [
78
- self.title,
79
- self.slug.replace("-", " ").replace("_", " "),
80
- self.summary,
81
- " ".join(self.tags),
82
- " ".join(self.models),
83
- " ".join(self.datasets),
 
 
 
 
 
84
  ]
 
85
  )
86
 
87
  def to_public_dict(self) -> dict:
@@ -99,6 +113,7 @@ class Project:
99
  "last_modified": self.last_modified,
100
  "host": self.host,
101
  "url": self.url,
 
102
  }
103
 
104
  def to_snapshot_dict(self) -> dict:
@@ -116,6 +131,8 @@ class Project:
116
  "last_modified": self.last_modified,
117
  "host": self.host,
118
  "url": self.url,
 
 
119
  }
120
 
121
 
@@ -163,6 +180,99 @@ def public_project_summary(summary: str) -> str:
163
  return cleaned
164
 
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  @dataclass(frozen=True)
167
  class WhitespaceSeed:
168
  label: str
@@ -433,7 +543,9 @@ def validate_index_payload(
433
  indexed_ids = [document.get("project_id") for document in documents]
434
  if indexed_ids != project_ids:
435
  raise ValueError("project index project order does not match projects snapshot")
436
- for document in documents:
 
 
437
  vector = document.get("vector")
438
  if not isinstance(vector, list) or len(vector) != dimensions:
439
  raise ValueError("project index vector dimensions do not match embedding metadata")
 
1
  from __future__ import annotations
2
 
3
+ import ast
4
  from collections.abc import Callable, Sequence
5
  from dataclasses import dataclass
6
  from datetime import datetime, timezone
 
8
  import json
9
  import math
10
  from pathlib import Path
11
+ from pathlib import PurePosixPath
12
  import re
13
  from typing import Any
14
 
15
 
16
  TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9.+_-]*", re.IGNORECASE)
17
+ HTML_TAG_RE = re.compile(r"<[^>]+>")
18
  GENERIC_PUBLIC_TITLE_RE = re.compile(
19
  r"^(?:my\s+)?build\s+small\s+hackathon$",
20
  re.IGNORECASE,
 
26
  re.IGNORECASE,
27
  )
28
 
29
+ INDEX_SCHEMA_VERSION = 3
30
  INDEX_ALGORITHM = "llama-cpp-embedding-v1"
31
  DEFAULT_EMBEDDING_MODEL_REPO = "ggml-org/embeddinggemma-300m-qat-q8_0-GGUF"
32
  DEFAULT_EMBEDDING_MODEL_FILE = "embeddinggemma-300m-qat-Q8_0.gguf"
33
  DEFAULT_EMBEDDING_RUNTIME = "llama.cpp via llama-cpp-python"
34
+ APP_FILE_EMBEDDING_CHAR_LIMIT = 8000
35
 
36
 
37
  EmbeddingFunction = Callable[[str], Sequence[float]]
 
52
  last_modified: str
53
  host: str
54
  url: str
55
+ app_file: str = ""
56
+ app_file_embedding_text: str = ""
57
 
58
  @classmethod
59
  def from_dict(cls, data: dict) -> "Project":
 
71
  last_modified=str(data.get("last_modified") or ""),
72
  host=str(data.get("host") or ""),
73
  url=str(data.get("url") or f"https://huggingface.co/spaces/{data['id']}"),
74
+ app_file=str(data.get("app_file") or ""),
75
+ app_file_embedding_text=str(data.get("app_file_embedding_text") or ""),
76
  )
77
 
78
  @property
 
81
 
82
  @property
83
  def searchable_text(self) -> str:
84
+ return "\n".join(
85
+ part
86
+ for part in [
87
+ f"title: {self.title}",
88
+ f"slug: {self.slug.replace('-', ' ').replace('_', ' ')}",
89
+ f"summary: {self.summary}",
90
+ f"tags: {' '.join(self.tags)}",
91
+ f"models: {' '.join(self.models)}",
92
+ f"datasets: {' '.join(self.datasets)}",
93
+ f"main app file: {self.app_file}" if self.app_file else "",
94
+ f"main app file content:\n{self.app_file_embedding_text}"
95
+ if self.app_file_embedding_text
96
+ else "",
97
  ]
98
+ if part.strip()
99
  )
100
 
101
  def to_public_dict(self) -> dict:
 
113
  "last_modified": self.last_modified,
114
  "host": self.host,
115
  "url": self.url,
116
+ "app_file": self.app_file,
117
  }
118
 
119
  def to_snapshot_dict(self) -> dict:
 
131
  "last_modified": self.last_modified,
132
  "host": self.host,
133
  "url": self.url,
134
+ "app_file": self.app_file,
135
+ "app_file_embedding_text": self.app_file_embedding_text,
136
  }
137
 
138
 
 
180
  return cleaned
181
 
182
 
183
def extract_app_file_embedding_text(app_file: str, text: str) -> str:
    """Return the bounded text that represents a Space's main app file for embedding.

    Files with a ``.py`` suffix (case-insensitive) are reduced by
    ``python_app_signals``; any other file contributes its raw text. The result
    is then passed through ``bounded_embedding_text`` with
    ``APP_FILE_EMBEDDING_CHAR_LIMIT``.

    Args:
        app_file: Repository-relative path of the app file (POSIX separators).
        text: Full contents of that file.

    Returns:
        The embedding text, or ``""`` when the file name or the text is missing
        or blank.
    """
    # ``or ""`` keeps a missing (None) file name from being stringified to the
    # literal "None", matching how ``text`` is handled on the next line.
    cleaned_file = str(app_file or "").strip()
    cleaned_text = str(text or "")
    if not cleaned_file or not cleaned_text.strip():
        return ""

    if PurePosixPath(cleaned_file).suffix.lower() == ".py":
        body = python_app_signals(cleaned_text)
    else:
        body = cleaned_text
    return bounded_embedding_text(body, APP_FILE_EMBEDDING_CHAR_LIMIT)
195
+
196
+
197
def python_app_signals(source: str) -> str:
    """Reduce Python source to the names and string literals used for embedding.

    Collected signals, in ``ast.walk`` traversal order:

    * function / async function names and their regular positional parameters,
    * class names,
    * dotted call targets (via ``call_name``) and the keyword names passed to calls,
    * every string constant.

    The signals are cleaned and de-duplicated by ``ordered_normalized_text``.

    Returns:
        The newline-joined signals, or ``source`` unchanged when it cannot be
        parsed as Python.
    """
    try:
        tree = ast.parse(source)
    except (SyntaxError, ValueError, RecursionError):
        # The source comes from arbitrary public repositories. Besides plain
        # syntax errors, older Pythons raise ValueError for embedded null bytes
        # and the parser can hit the recursion limit on pathologically nested
        # code; none of those should abort the whole crawl.
        return source

    signals: list[str] = []
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            signals.append(node.name)
            signals.extend(arg.arg for arg in node.args.args)
        elif isinstance(node, ast.ClassDef):
            signals.append(node.name)
        elif isinstance(node, ast.Call):
            name = call_name(node.func)
            if name:
                signals.append(name)
            # ``keyword.arg`` is None for ``**kwargs`` expansions; skip those.
            signals.extend(keyword.arg for keyword in node.keywords if keyword.arg)
        elif isinstance(node, ast.Constant) and isinstance(node.value, str):
            signals.append(node.value)

    return ordered_normalized_text(signals)
220
+
221
+
222
def call_name(node: ast.AST) -> str:
    """Return the dotted name of a call target such as ``gr.Blocks``.

    Attribute chains are followed down to their base. A base that is a plain
    name is included; any other base (a call, a subscript, ...) is dropped, so
    only the trailing attribute names remain. Nodes that are neither names nor
    attributes yield ``""``.
    """
    parts: list[str] = []
    current = node
    # Walk from the outermost attribute inwards, collecting names right-to-left.
    while isinstance(current, ast.Attribute):
        parts.append(current.attr)
        current = current.value
    if isinstance(current, ast.Name):
        parts.append(current.id)
    return ".".join(reversed(parts))
229
+
230
+
231
def ordered_normalized_text(values: Sequence[str]) -> str:
    """Clean each value, drop empties and repeats, and join the rest with newlines.

    Values are cleaned with ``clean_embedding_signal``. The first occurrence of
    each cleaned value wins and the original ordering is kept.
    """
    # A dict keeps insertion order, so its keys act as an ordered set.
    unique: dict[str, None] = {}
    for raw in values:
        signal = clean_embedding_signal(raw)
        if signal:
            unique.setdefault(signal, None)
    return "\n".join(unique)
243
+
244
+
245
def clean_embedding_signal(value: str) -> str:
    """Normalize one signal string for embedding.

    HTML tags matched by ``HTML_TAG_RE`` are replaced with a space and all
    whitespace runs are collapsed to single spaces. Text that
    ``looks_like_style_blob`` flags is discarded entirely (returns ``""``).
    """
    without_tags = HTML_TAG_RE.sub(" ", str(value))
    collapsed = " ".join(without_tags.split())
    return "" if looks_like_style_blob(collapsed) else collapsed
251
+
252
+
253
def looks_like_style_blob(text: str) -> bool:
    """Heuristically decide whether ``text`` is a stylesheet-like blob.

    Text shorter than 80 characters is never flagged. Longer text is flagged
    when it contains at least 8 style markers (braces, semicolons,
    ``!important``, ``rgba(``, ``linear-gradient``) and those markers make up
    more than 1.5% of its length.
    """
    length = len(text)
    if length < 80:
        return False

    markers = ("{", "}", ";", "!important", "rgba(", "linear-gradient")
    marker_total = sum(text.count(marker) for marker in markers)
    if marker_total < 8:
        return False
    return marker_total / length > 0.015
265
+
266
+
267
def bounded_embedding_text(text: str, limit: int) -> str:
    """Collapse whitespace in ``text`` and cap the result at ``limit`` characters.

    Text that fits is returned whole. Longer text keeps an equal-sized head and
    tail joined by ``" ... "``, so both the start and the end of the file stay
    represented.

    Args:
        text: Text to normalize; whitespace runs become single spaces.
        limit: Maximum length of the returned string.

    Returns:
        A string of at most ``limit`` characters (``""`` when ``limit`` is not
        positive).
    """
    cleaned = " ".join(str(text).split())
    if len(cleaned) <= limit:
        return cleaned

    marker = " ... "
    edge = (limit - len(marker)) // 2
    if edge < 1:
        # No room for head + marker + tail: a head/tail split would overshoot
        # the limit, so fall back to a plain prefix.
        return cleaned[: max(limit, 0)]
    return f"{cleaned[:edge].rstrip()}{marker}{cleaned[-edge:].lstrip()}"
274
+
275
+
276
  @dataclass(frozen=True)
277
  class WhitespaceSeed:
278
  label: str
 
543
  indexed_ids = [document.get("project_id") for document in documents]
544
  if indexed_ids != project_ids:
545
  raise ValueError("project index project order does not match projects snapshot")
546
+ for project, document in zip(projects, documents, strict=True):
547
+ if document.get("text_digest") != sha256(project.searchable_text.encode("utf-8")).hexdigest():
548
+ raise ValueError("project index text digest does not match searchable project text")
549
  vector = document.get("vector")
550
  if not isinstance(vector, list) or len(vector) != dimensions:
551
  raise ValueError("project index vector dimensions do not match embedding metadata")
hackathon_advisor/llama_embedding.py CHANGED
@@ -1,8 +1,14 @@
1
  from __future__ import annotations
2
 
3
  from collections.abc import Sequence
4
- from pathlib import Path
 
5
  import os
 
 
 
 
 
6
  from typing import Any
7
 
8
  from hackathon_advisor.data import (
@@ -12,7 +18,8 @@ from hackathon_advisor.data import (
12
 
13
 
14
  TRUE_VALUES = {"1", "true", "yes", "on"}
15
- DEFAULT_N_CTX = 512
 
16
 
17
 
18
  class LlamaCppEmbedder:
@@ -74,8 +81,114 @@ class LlamaCppEmbedder:
74
  return self._model
75
 
76
 
77
- def create_llama_cpp_embedder(metadata: dict[str, Any]) -> LlamaCppEmbedder:
78
- return LlamaCppEmbedder(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  model_repo=os.environ.get(
80
  "ADVISOR_EMBEDDING_MODEL_REPO",
81
  str(metadata.get("model_repo") or DEFAULT_EMBEDDING_MODEL_REPO),
@@ -111,3 +224,38 @@ def _optional_int_env(name: str) -> int | None:
111
  if value <= 0:
112
  raise RuntimeError(f"{name} must be a positive integer.")
113
  return value
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  from collections.abc import Sequence
4
+ import atexit
5
+ import json
6
  import os
7
+ from pathlib import Path
8
+ import platform
9
+ import subprocess
10
+ import sys
11
+ import threading
12
  from typing import Any
13
 
14
  from hackathon_advisor.data import (
 
18
 
19
 
20
  TRUE_VALUES = {"1", "true", "yes", "on"}
21
+ FALSE_VALUES = {"0", "false", "no", "off"}
22
+ DEFAULT_N_CTX = 2048
23
 
24
 
25
  class LlamaCppEmbedder:
 
81
  return self._model
82
 
83
 
84
class SubprocessLlamaCppEmbedder:
    """Embed text through a llama.cpp worker running in a separate Python process.

    The worker is started lazily with
    ``python -u -m hackathon_advisor.llama_embedding --worker``. The first line
    written to its stdin is a JSON configuration object; every later line is a
    ``{"id", "text"}`` request that the worker answers with one JSON line on
    stdout carrying either ``vector`` or ``error``.
    """

    def __init__(
        self,
        *,
        model_repo: str = DEFAULT_EMBEDDING_MODEL_REPO,
        model_file: str = DEFAULT_EMBEDDING_MODEL_FILE,
        model_path: str = "",
        n_ctx: int = DEFAULT_N_CTX,
        n_batch: int | None = None,
        n_threads: int | None = None,
        n_gpu_layers: int = 0,
        verbose: bool = False,
    ) -> None:
        """Store the worker configuration; no process is started yet."""
        self.model_repo = model_repo.strip() or DEFAULT_EMBEDDING_MODEL_REPO
        self.model_file = model_file.strip() or DEFAULT_EMBEDDING_MODEL_FILE
        self.model_path = model_path.strip()
        self.n_ctx = n_ctx
        self.n_batch = n_batch or n_ctx
        self.n_threads = n_threads
        self.n_gpu_layers = n_gpu_layers
        self.verbose = verbose
        self._process: subprocess.Popen[str] | None = None
        self._request_id = 0
        # Serializes request/response pairs on the single worker pipe.
        self._lock = threading.Lock()
        # Stop the worker when the interpreter exits.
        atexit.register(self.close)

    def __call__(self, text: str) -> Sequence[float]:
        """Embed ``text``; lets an instance be used as a plain embedding function."""
        return self.embed(text)

    def embed(self, text: str) -> Sequence[float]:
        """Send ``text`` to the worker and return the vector it replies with.

        Raises:
            RuntimeError: If the worker dies, replies with something that is not
                the expected JSON object for this request, or reports an error.
        """
        with self._lock:
            process = self._ensure_process()
            self._request_id += 1
            request_id = self._request_id
            request = json.dumps({"id": request_id, "text": text}, ensure_ascii=False)
            try:
                # Popen was created with PIPE for both streams, so these are set;
                # the asserts only narrow the Optional types.
                assert process.stdin is not None
                assert process.stdout is not None
                process.stdin.write(f"{request}\n")
                process.stdin.flush()
                line = process.stdout.readline()
            except (BrokenPipeError, OSError) as error:
                self.close()
                raise RuntimeError("llama.cpp embedding worker stopped before returning a vector.") from error
            if not line:
                # EOF on stdout: the worker is gone.
                returncode = process.poll()
                self.close()
                detail = f" with exit code {returncode}" if returncode is not None else ""
                raise RuntimeError(f"llama.cpp embedding worker exited{detail}.")
            try:
                response = json.loads(line)
            except json.JSONDecodeError as error:
                # The stream can no longer be trusted to be line-aligned with
                # our requests; restart the worker on the next call.
                self.close()
                raise RuntimeError("llama.cpp embedding worker returned invalid JSON.") from error
            if not isinstance(response, dict):
                self.close()
                raise RuntimeError("llama.cpp embedding worker returned invalid JSON.")
            if response.get("id") != request_id:
                # A stale or foreign reply means later replies would be paired
                # with the wrong requests; drop the worker instead.
                self.close()
                raise RuntimeError("llama.cpp embedding worker returned an out-of-order response.")
            if response.get("error"):
                # The worker handled the request and is still in sync; keep it.
                raise RuntimeError(str(response["error"]))
            vector = response.get("vector")
            if not isinstance(vector, list):
                raise RuntimeError("llama.cpp embedding worker did not return a vector.")
            return vector

    def close(self) -> None:
        """Stop the worker, if any, and release its pipes. Safe to call repeatedly."""
        process = self._process
        self._process = None
        if process is None:
            return
        try:
            if process.poll() is None:
                process.terminate()
                try:
                    process.wait(timeout=2)
                except subprocess.TimeoutExpired:
                    process.kill()
                    process.wait(timeout=2)
        finally:
            # Close our ends of the pipes so the file descriptors are not leaked
            # every time a worker is replaced.
            for stream in (process.stdin, process.stdout):
                if stream is None:
                    continue
                try:
                    stream.close()
                except OSError:
                    # Best-effort cleanup: flushing into a dead worker's stdin
                    # raises BrokenPipeError, which is irrelevant at this point.
                    pass

    def _ensure_process(self) -> subprocess.Popen[str]:
        """Return the live worker, starting and configuring a new one if needed."""
        if self._process is not None and self._process.poll() is None:
            return self._process
        # Release the pipes of a worker that exited on its own before replacing it.
        self.close()
        self._process = subprocess.Popen(
            [sys.executable, "-u", "-m", "hackathon_advisor.llama_embedding", "--worker"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=None if self.verbose else subprocess.DEVNULL,
            text=True,
            # Run from the directory above this package so ``-m`` can import it.
            cwd=Path(__file__).resolve().parents[1],
        )
        config = json.dumps(
            {
                "model_repo": self.model_repo,
                "model_file": self.model_file,
                "model_path": self.model_path,
                "n_ctx": self.n_ctx,
                "n_batch": self.n_batch,
                "n_threads": self.n_threads,
                "n_gpu_layers": self.n_gpu_layers,
                "verbose": self.verbose,
            },
            ensure_ascii=False,
        )
        assert self._process.stdin is not None
        self._process.stdin.write(f"{config}\n")
        self._process.stdin.flush()
        return self._process
187
+
188
+
189
+ def create_llama_cpp_embedder(metadata: dict[str, Any]) -> LlamaCppEmbedder | SubprocessLlamaCppEmbedder:
190
+ embedder_cls = SubprocessLlamaCppEmbedder if _use_subprocess_embedder() else LlamaCppEmbedder
191
+ return embedder_cls(
192
  model_repo=os.environ.get(
193
  "ADVISOR_EMBEDDING_MODEL_REPO",
194
  str(metadata.get("model_repo") or DEFAULT_EMBEDDING_MODEL_REPO),
 
224
  if value <= 0:
225
  raise RuntimeError(f"{name} must be a positive integer.")
226
  return value
227
+
228
+
229
def _use_subprocess_embedder() -> bool:
    """Decide whether query embedding should run in a worker process.

    ``ADVISOR_EMBEDDING_SUBPROCESS`` wins when it holds a recognized true or
    false value. Otherwise the worker is used only on macOS (``Darwin``) when
    ``ADVISOR_MODEL_BACKEND`` is ``minicpm`` or ``minicpm-transformers``.
    """
    override = os.environ.get("ADVISOR_EMBEDDING_SUBPROCESS", "").strip().lower()
    if override in TRUE_VALUES or override in FALSE_VALUES:
        return override in TRUE_VALUES

    if platform.system() != "Darwin":
        return False
    backend = os.environ.get("ADVISOR_MODEL_BACKEND", "").strip().lower()
    return backend in {"minicpm", "minicpm-transformers"}
237
+
238
+
239
def _worker_loop() -> None:
    """Serve embedding requests over stdin/stdout until stdin is closed.

    The first stdin line is a JSON object of keyword arguments for
    ``LlamaCppEmbedder``. Each following non-blank line is a JSON request with
    ``id`` and ``text``; exactly one JSON line is printed per request, holding
    the same ``id`` plus either ``vector`` or ``error``.
    """
    config_line = sys.stdin.readline()
    if not config_line:
        return
    embedder = LlamaCppEmbedder(**json.loads(config_line))
    for line in sys.stdin:
        if not line.strip():
            continue
        request_id = None
        try:
            # Parsing and serialization live inside the try so that one bad
            # request is reported back instead of killing the worker (and the
            # model it has loaded).
            request = json.loads(line)
            if not isinstance(request, dict):
                raise ValueError("embedding request must be a JSON object")
            request_id = request.get("id")
            vector = list(embedder.embed(str(request.get("text") or "")))
            payload = json.dumps({"id": request_id, "vector": vector})
        except Exception as error:
            # Process boundary: any failure is reported to the parent, which
            # raises it as a RuntimeError on its side.
            payload = json.dumps({"id": request_id, "error": str(error)})
        # One line per request, flushed immediately: the parent blocks on readline().
        print(payload, flush=True)
255
+
256
+
257
if __name__ == "__main__":
    # The module is only runnable as the embedding worker; anything else is a usage error.
    if sys.argv[1:] != ["--worker"]:
        raise SystemExit("usage: python -m hackathon_advisor.llama_embedding --worker")
    _worker_loop()
scripts/build_project_index.py CHANGED
@@ -81,6 +81,7 @@ def build_payload(
81
  "build_source": build_source,
82
  "builder": builder,
83
  "llama_cpp_python_version": importlib.metadata.version("llama-cpp-python"),
 
84
  }
85
  if modal_app:
86
  metadata["modal_app"] = modal_app
 
81
  "build_source": build_source,
82
  "builder": builder,
83
  "llama_cpp_python_version": importlib.metadata.version("llama-cpp-python"),
84
+ "n_ctx": n_ctx,
85
  }
86
  if modal_app:
87
  metadata["modal_app"] = modal_app
scripts/crawl_hf_spaces.py CHANGED
@@ -5,11 +5,16 @@ import argparse
5
  from datetime import datetime, timezone
6
  import json
7
  from pathlib import Path
8
- import time
 
9
  from typing import Any
10
- from urllib.error import HTTPError
11
- from urllib.parse import quote
12
- from urllib.request import Request, urlopen
 
 
 
 
13
 
14
 
15
  API = "https://huggingface.co/api"
@@ -19,20 +24,13 @@ def main() -> None:
19
  parser = argparse.ArgumentParser(description="Snapshot public Spaces in a Hugging Face org.")
20
  parser.add_argument("--org", default="build-small-hackathon")
21
  parser.add_argument("--out", default="data/projects.json")
22
- parser.add_argument("--limit", type=int, default=100)
23
  args = parser.parse_args()
24
 
25
- spaces = fetch_json(f"{API}/spaces?author={quote(args.org)}&limit={args.limit}")
26
- projects = []
27
- for item in spaces:
28
- space_id = item["id"]
29
- detail = fetch_json(f"{API}/spaces/{quote(space_id, safe='/')}")
30
- projects.append(project_from_detail(detail))
31
- time.sleep(0.05)
32
 
33
  payload = {
34
  "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
35
- "source": f"{API}/spaces?author={args.org}&limit={args.limit}",
36
  "projects": sorted(projects, key=lambda project: project["id"].lower()),
37
  }
38
  output = Path(args.out)
@@ -41,38 +39,141 @@ def main() -> None:
41
  print(f"wrote {len(projects)} projects to {output}")
42
 
43
 
44
- def fetch_json(url: str) -> Any:
45
- request = Request(url, headers={"User-Agent": "hackathon-advisor-crawler/0.1"})
46
- try:
47
- with urlopen(request, timeout=30) as response:
48
- return json.loads(response.read().decode("utf-8"))
49
- except HTTPError as error:
50
- raise RuntimeError(f"failed to fetch {url}: {error.code}") from error
 
51
 
52
 
53
- def project_from_detail(detail: dict[str, Any]) -> dict[str, Any]:
54
- card = detail.get("cardData") or {}
55
- space_id = str(detail["id"])
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  title = str(card.get("title") or humanize_slug(space_id.rsplit("/", 1)[-1]))
57
  summary = str(card.get("short_description") or card.get("description") or "")
58
- tags = sorted(set(str(tag) for tag in (card.get("tags") or detail.get("tags") or [])))
59
  return {
60
  "id": space_id,
61
  "title": title,
62
  "summary": summary,
63
- "tags": tags,
64
- "models": [str(model) for model in detail.get("models") or card.get("models") or []],
65
- "datasets": [str(dataset) for dataset in detail.get("datasets") or card.get("datasets") or []],
66
- "likes": int(detail.get("likes") or 0),
67
- "sdk": str(card.get("sdk") or detail.get("sdk") or ""),
 
 
68
  "license": str(card.get("license") or ""),
69
- "created_at": str(detail.get("createdAt") or ""),
70
- "last_modified": str(detail.get("lastModified") or ""),
71
- "host": str(detail.get("host") or ""),
72
  "url": f"https://huggingface.co/spaces/{space_id}",
 
 
73
  }
74
 
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  def humanize_slug(slug: str) -> str:
77
  return " ".join(part for part in slug.replace("_", "-").split("-") if part).title()
78
 
 
5
  from datetime import datetime, timezone
6
  import json
7
  from pathlib import Path
8
+ from pathlib import PurePosixPath
9
+ import sys
10
  from typing import Any
11
+
12
+ from huggingface_hub import HfApi, hf_hub_download
13
+
14
+ ROOT = Path(__file__).resolve().parents[1]
15
+ sys.path.insert(0, str(ROOT))
16
+
17
+ from hackathon_advisor.data import extract_app_file_embedding_text
18
 
19
 
20
  API = "https://huggingface.co/api"
 
24
  parser = argparse.ArgumentParser(description="Snapshot public Spaces in a Hugging Face org.")
25
  parser.add_argument("--org", default="build-small-hackathon")
26
  parser.add_argument("--out", default="data/projects.json")
 
27
  args = parser.parse_args()
28
 
29
+ projects = crawl_projects(args.org)
 
 
 
 
 
 
30
 
31
  payload = {
32
  "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
33
+ "source": f"{API}/spaces?author={args.org}",
34
  "projects": sorted(projects, key=lambda project: project["id"].lower()),
35
  }
36
  output = Path(args.out)
 
39
  print(f"wrote {len(projects)} projects to {output}")
40
 
41
 
42
def crawl_projects(org: str) -> list[dict[str, Any]]:
    """Return one project record for every non-private Space listed for ``org``.

    Spaces are listed through ``HfApi.list_spaces`` with ``full=True`` and
    ``token=False``; each remaining Space is converted by ``project_from_space``.
    """
    api = HfApi(token=False)
    projects: list[dict[str, Any]] = []
    for space in api.list_spaces(author=org, full=True, token=False):
        if getattr(space, "private", False):
            continue
        projects.append(project_from_space(space))
    return projects
50
 
51
 
52
def project_from_space(space: Any) -> dict[str, Any]:
    """Build the snapshot record for one Space.

    Reads the Space's README frontmatter (when a ``README.md`` sibling exists)
    to find ``app_file``; when one is declared, its contents are downloaded and
    reduced with ``extract_app_file_embedding_text``. Card data takes priority
    for ``tags`` and ``sdk``; the Space object takes priority for ``models``
    and ``datasets``.

    Raises:
        RuntimeError: If the declared ``app_file`` is not among the Space's files.
    """
    card = card_data(space)
    space_id = str(space.id)
    files = sibling_names(space)

    readme = ""
    if "README.md" in files:
        readme = download_repo_text(space_id, "README.md")
    declared_app_file = str(readme_frontmatter(readme).get("app_file") or "")
    app_file = validate_app_file(declared_app_file, space_id=space_id)

    app_file_embedding_text = ""
    if app_file:
        if app_file not in files:
            raise RuntimeError(f"{space_id} README frontmatter points to missing app_file: {app_file}")
        app_source = download_repo_text(space_id, app_file)
        app_file_embedding_text = extract_app_file_embedding_text(app_file, app_source)

    # Fall back to a title derived from the repo name when the card has none.
    title = str(card.get("title") or humanize_slug(space_id.rsplit("/", 1)[-1]))
    summary = str(card.get("short_description") or card.get("description") or "")
    tags = sorted({str(tag) for tag in (card.get("tags") or getattr(space, "tags", None) or [])})
    models = [str(model) for model in getattr(space, "models", None) or card.get("models") or []]
    datasets = [str(dataset) for dataset in getattr(space, "datasets", None) or card.get("datasets") or []]

    return {
        "id": space_id,
        "title": title,
        "summary": summary,
        "tags": tags,
        "models": models,
        "datasets": datasets,
        "likes": int(getattr(space, "likes", None) or 0),
        "sdk": str(card.get("sdk") or getattr(space, "sdk", None) or ""),
        "license": str(card.get("license") or ""),
        "created_at": isoformat(getattr(space, "created_at", None)),
        "last_modified": isoformat(getattr(space, "last_modified", None)),
        "host": host_url(space),
        "url": f"https://huggingface.co/spaces/{space_id}",
        "app_file": app_file,
        "app_file_embedding_text": app_file_embedding_text,
    }
89
 
90
 
91
def card_data(space: Any) -> dict[str, Any]:
    """Return the Space's card metadata as a plain dict.

    Looks at ``space.card_data`` first and ``space.cardData`` second, using the
    first truthy value. A dict is returned as-is; an object exposing a callable
    ``to_dict`` is converted through it; anything else yields an empty dict.
    """
    payload = None
    for attribute in ("card_data", "cardData"):
        payload = getattr(space, attribute, None)
        if payload:
            break
    if not payload:
        return {}
    if isinstance(payload, dict):
        return payload
    converter = getattr(payload, "to_dict", None)
    return dict(converter()) if callable(converter) else {}
99
+
100
+
101
def sibling_names(space: Any) -> set[str]:
    """Collect the ``rfilename`` of every entry in ``space.siblings`` as strings.

    A missing, ``None`` or empty ``siblings`` attribute gives an empty set.
    """
    entries = getattr(space, "siblings", None)
    if not entries:
        return set()
    return set(map(str, (entry.rfilename for entry in entries)))
103
+
104
+
105
def download_repo_text(repo_id: str, filename: str) -> str:
    """Download one file from a Space repo and return its decoded text.

    The file is fetched anonymously (``token=False``) through
    ``hf_hub_download`` with ``repo_type="space"`` and read back from the
    returned local path.

    Raises:
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    path = hf_hub_download(
        repo_id=repo_id,
        repo_type="space",
        filename=filename,
        token=False,
        etag_timeout=30,
    )
    # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM.
    # With strict "utf-8" a BOM-prefixed README starts with "\ufeff---", which
    # readme_frontmatter does not recognise as a frontmatter opener, so the
    # declared app_file would be silently ignored.
    return Path(path).read_text(encoding="utf-8-sig")
114
+
115
+
116
def readme_frontmatter(readme: str) -> dict[str, str]:
    """Extract top-level ``key: value`` scalars from a README frontmatter block.

    The block must open with ``---`` on the first line and be closed by a
    ``---`` or ``...`` line; otherwise an empty dict is returned. Blank lines,
    comment lines, indented lines, list items and lines without a colon are
    ignored. Values are normalised by ``yaml_scalar``.
    """
    rows = iter(readme.splitlines())
    if next(rows, "").strip() != "---":
        return {}

    fields: dict[str, str] = {}
    for row in rows:
        text = row.strip()
        if text in ("---", "..."):
            # Only a properly terminated block counts.
            return fields
        if not text or ":" not in row:
            continue
        # Comments, list items and indented (nested) lines are not top-level keys.
        if text.startswith(("#", "-")) or row[:1].isspace():
            continue
        name, _, remainder = row.partition(":")
        name = name.strip()
        if name:
            fields[name] = yaml_scalar(remainder)
    # Reached the end of the README without a closing delimiter.
    return {}
137
+
138
+
139
def yaml_scalar(raw_value: str) -> str:
    """Normalise the text after ``key:`` on a frontmatter line to a plain string.

    Handles the small YAML subset the crawler needs:

    * surrounding whitespace is removed;
    * a value consisting only of a comment (``# ...``) is treated as empty;
    * a quoted scalar is unquoted, and ``" #"`` inside the quotes is kept as
      part of the value rather than mistaken for a comment;
    * for unquoted values, everything from the first ``" #"`` on is dropped.

    Escape sequences inside quotes are not interpreted.
    """
    value = raw_value.strip()
    if not value or value.startswith("#"):
        # Nothing but an inline comment follows the colon.
        return ""

    opener = value[0]
    if opener in {"'", '"'}:
        closing = value.find(opener, 1)
        if closing != -1:
            tail = value[closing + 1 :]
            # Accept the quoted span only when it is followed by nothing or by
            # a whitespace-separated comment; anything else (for example a
            # doubled '' escape) falls through to the generic handling below.
            if not tail or (tail[0].isspace() and tail.lstrip().startswith("#")):
                return value[1:closing]

    if " #" in value:
        value = value.split(" #", 1)[0].rstrip()
    if value[:1] in {"'", '"'} and value[-1:] == value[:1]:
        return value[1:-1]
    return value
148
+
149
+
150
def validate_app_file(app_file: str, *, space_id: str) -> str:
    """Check a frontmatter ``app_file`` value and return it as a POSIX path.

    Blank input yields ``""``. The value is otherwise interpreted as a POSIX
    path and returned in normalised form.

    Raises:
        RuntimeError: if the path is absolute, contains a ``..`` component, or
            ends with ``/``.
    """
    candidate = app_file.strip()
    if candidate == "":
        return ""

    posix = PurePosixPath(candidate)
    rejected = (
        posix.is_absolute(),
        ".." in posix.parts,
        candidate.endswith("/"),
    )
    if any(rejected):
        raise RuntimeError(f"{space_id} README frontmatter has an invalid app_file path: {app_file}")
    return posix.as_posix()
158
+
159
+
160
def isoformat(value: Any) -> str:
    """Render a timestamp-like value as a string.

    ``None`` becomes ``""``; objects with a callable ``isoformat`` use it;
    everything else goes through ``str``.
    """
    if value is None:
        return ""
    render = getattr(value, "isoformat", None)
    return render() if callable(render) else str(value)
167
+
168
+
169
def host_url(space: Any) -> str:
    """Return the Space's host URL, or ``""`` when none can be determined.

    ``space.host`` is used when set; otherwise the URL is built from
    ``space.subdomain`` as ``https://<subdomain>.hf.space``.
    """
    address = str(getattr(space, "host", None) or "")
    if not address:
        slug = str(getattr(space, "subdomain", None) or "")
        if slug:
            address = f"https://{slug}.hf.space"
    return address
175
+
176
+
177
def humanize_slug(slug: str) -> str:
    """Build a title-cased display name from a repo slug.

    ``_`` and ``-`` are treated alike as word separators, and empty pieces left
    by consecutive separators are dropped before joining with spaces.
    """
    pieces = [piece for piece in slug.replace("_", "-").split("-") if piece]
    return " ".join(pieces).title()
179
 
tests/test_crawl_hf_spaces.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from types import SimpleNamespace
4
+
5
+ import pytest
6
+
7
+ from scripts import crawl_hf_spaces
8
+
9
+
10
def test_readme_frontmatter_extracts_app_file() -> None:
    """A quoted ``app_file`` with a trailing comment is returned unquoted."""
    # NOTE(review): the indentation of the "- gradio" list item could not be
    # recovered from the diff view; the parser skips list items either way.
    readme = (
        "---\n"
        "title: Tiny Demo\n"
        'app_file: "src/app.py" # main entrypoint\n'
        "tags:\n"
        "  - gradio\n"
        "---\n"
        "# Tiny Demo\n"
    )

    parsed = crawl_hf_spaces.readme_frontmatter(readme)

    assert parsed["app_file"] == "src/app.py"
23
+
24
+
25
def test_validate_app_file_rejects_untrusted_paths() -> None:
    """A path containing ``..`` is rejected with the invalid-path error."""
    space_id = "build-small-hackathon/demo"
    traversal = "../app.py"

    with pytest.raises(RuntimeError, match="invalid app_file path"):
        crawl_hf_spaces.validate_app_file(traversal, space_id=space_id)
28
+
29
+
30
def test_project_from_space_downloads_frontmatter_app_file(monkeypatch) -> None:
    """The app file named in README frontmatter is fetched and summarised."""
    repo = "build-small-hackathon/demo"
    repo_files = {
        "README.md": "---\napp_file: app.py\n---\n",
        "app.py": "import gradio as gr\ngr.Textbox(label='Idea')\n",
    }
    # Keyed by (repo_id, filename) so an unexpected download raises KeyError.
    downloads = {(repo, name): text for name, text in repo_files.items()}

    monkeypatch.setattr(
        crawl_hf_spaces,
        "download_repo_text",
        lambda repo_id, filename: downloads[(repo_id, filename)],
    )

    listing = SimpleNamespace(
        id=repo,
        card_data={"title": "Demo", "short_description": "Advisor demo", "sdk": "gradio"},
        siblings=[SimpleNamespace(rfilename=name) for name in ("README.md", "app.py")],
        tags=["gradio"],
        models=[],
        datasets=[],
        likes=3,
        created_at=None,
        last_modified=None,
        host="https://example.test",
        private=False,
    )

    project = crawl_hf_spaces.project_from_space(listing)

    assert project["app_file"] == "app.py"
    embedding_text = project["app_file_embedding_text"]
    assert "gr.Textbox" in embedding_text
    assert "Idea" in embedding_text
tests/test_data.py CHANGED
@@ -3,7 +3,12 @@ from pathlib import Path
3
  from tests.helpers import load_test_index
4
  import json
5
 
6
- from hackathon_advisor.data import Project, ProjectIndex, public_project_summary, public_project_title
 
 
 
 
 
7
 
8
 
9
  def test_project_index_searches_snapshot() -> None:
@@ -54,6 +59,32 @@ def test_public_project_cards_hide_generic_submission_copy() -> None:
54
  assert public["summary"] == ""
55
 
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  def test_project_index_rejects_mismatched_snapshot(tmp_path: Path) -> None:
58
  payload = json.loads(Path("data/project_index.json").read_text(encoding="utf-8"))
59
  payload["snapshot_generated_at"] = "2000-01-01T00:00:00+00:00"
 
3
  from tests.helpers import load_test_index
4
  import json
5
 
6
+ from hackathon_advisor.data import (
7
+ Project,
8
+ ProjectIndex,
9
+ public_project_summary,
10
+ public_project_title,
11
+ )
12
 
13
 
14
  def test_project_index_searches_snapshot() -> None:
 
59
  assert public["summary"] == ""
60
 
61
 
62
def test_searchable_text_includes_main_app_file_signals() -> None:
    """``searchable_text`` exposes the app file name and its extracted signals."""
    fields = dict(
        id="build-small-hackathon/idea-canvas",
        title="Idea Canvas",
        summary="",
        tags=("gradio",),
        models=(),
        datasets=(),
        likes=0,
        sdk="gradio",
        license="",
        created_at="",
        last_modified="",
        host="",
        url="https://example.test",
        app_file="app.py",
        app_file_embedding_text="score_idea\ngr.Textbox\nProject idea",
    )

    searchable = Project(**fields).searchable_text

    for expected in ("main app file: app.py", "score_idea", "Project idea"):
        assert expected in searchable
86
+
87
+
88
  def test_project_index_rejects_mismatched_snapshot(tmp_path: Path) -> None:
89
  payload = json.loads(Path("data/project_index.json").read_text(encoding="utf-8"))
90
  payload["snapshot_generated_at"] = "2000-01-01T00:00:00+00:00"
tests/test_llama_embedding.py CHANGED
@@ -3,7 +3,12 @@ import sys
3
  from types import ModuleType
4
 
5
  from hackathon_advisor.data import DEFAULT_EMBEDDING_MODEL_FILE, DEFAULT_EMBEDDING_MODEL_REPO
6
- from hackathon_advisor.llama_embedding import DEFAULT_N_CTX, LlamaCppEmbedder, create_llama_cpp_embedder
 
 
 
 
 
7
 
8
 
9
  def test_llama_embedder_uses_q8_defaults_and_configured_context(
@@ -60,3 +65,33 @@ def test_create_llama_embedder_accepts_explicit_batch(monkeypatch) -> None:
60
  embedder = create_llama_cpp_embedder({"dimensions": 768})
61
 
62
  assert embedder.n_batch == 256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from types import ModuleType
4
 
5
  from hackathon_advisor.data import DEFAULT_EMBEDDING_MODEL_FILE, DEFAULT_EMBEDDING_MODEL_REPO
6
+ from hackathon_advisor.llama_embedding import (
7
+ DEFAULT_N_CTX,
8
+ LlamaCppEmbedder,
9
+ SubprocessLlamaCppEmbedder,
10
+ create_llama_cpp_embedder,
11
+ )
12
 
13
 
14
  def test_llama_embedder_uses_q8_defaults_and_configured_context(
 
65
  embedder = create_llama_cpp_embedder({"dimensions": 768})
66
 
67
  assert embedder.n_batch == 256
68
+
69
+
70
def test_create_llama_embedder_can_isolate_native_runtime(monkeypatch) -> None:
    """``ADVISOR_EMBEDDING_SUBPROCESS=1`` selects the subprocess embedder."""
    monkeypatch.setenv("ADVISOR_EMBEDDING_SUBPROCESS", "1")
    index_config = {"dimensions": 768}

    created = create_llama_cpp_embedder(index_config)

    assert isinstance(created, SubprocessLlamaCppEmbedder)
    created.close()
77
+
78
+
79
def test_create_llama_embedder_isolates_macos_minicpm_runtime(monkeypatch) -> None:
    """With no explicit override, Darwin plus the MiniCPM backend isolates the embedder."""
    monkeypatch.delenv("ADVISOR_EMBEDDING_SUBPROCESS", raising=False)
    monkeypatch.setenv("ADVISOR_MODEL_BACKEND", "minicpm-transformers")
    monkeypatch.setattr("hackathon_advisor.llama_embedding.platform.system", lambda: "Darwin")
    index_config = {"dimensions": 768}

    created = create_llama_cpp_embedder(index_config)

    assert isinstance(created, SubprocessLlamaCppEmbedder)
    created.close()
88
+
89
+
90
def test_create_llama_embedder_keeps_in_process_when_isolation_disabled(monkeypatch) -> None:
    """``ADVISOR_EMBEDDING_SUBPROCESS=0`` keeps the in-process embedder on Darwin + MiniCPM."""
    monkeypatch.setenv("ADVISOR_EMBEDDING_SUBPROCESS", "0")
    monkeypatch.setenv("ADVISOR_MODEL_BACKEND", "minicpm-transformers")
    monkeypatch.setattr("hackathon_advisor.llama_embedding.platform.system", lambda: "Darwin")
    index_config = {"dimensions": 768}

    created = create_llama_cpp_embedder(index_config)

    assert isinstance(created, LlamaCppEmbedder)