Spaces:
Running on Zero
Running on Zero
feat: embed app file signals in project index
Browse files- .gitattributes +1 -0
- README.md +8 -3
- data/project_index.json +0 -0
- data/projects.json +0 -0
- data/sample_trace.jsonl +4 -4
- hackathon_advisor/data.py +122 -10
- hackathon_advisor/llama_embedding.py +152 -4
- scripts/build_project_index.py +1 -0
- scripts/crawl_hf_spaces.py +133 -32
- tests/test_crawl_hf_spaces.py +61 -0
- tests/test_data.py +32 -1
- tests/test_llama_embedding.py +36 -1
.gitattributes
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
# Auto detect text files and perform LF normalization
|
| 2 |
* text=auto
|
|
|
|
|
|
| 1 |
# Auto detect text files and perform LF normalization
|
| 2 |
* text=auto
|
| 3 |
+
static/assets/parchment.png filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -58,9 +58,11 @@ python scripts/generate_sample_trace.py --projects data/projects.json --index da
|
|
| 58 |
```
|
| 59 |
|
| 60 |
The app uses `data/projects.json` and `data/project_index.json` at runtime. The index validates the snapshot timestamp,
|
| 61 |
-
source, project order, digest, embedding dimensions, and normalized vector shape before the app starts.
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
| 64 |
|
| 65 |
## Trace Artifact
|
| 66 |
|
|
@@ -194,6 +196,9 @@ ADVISOR_ASR_MODEL_ID=nvidia/nemotron-speech-streaming-en-0.6b
|
|
| 194 |
`agent_turn` wraps the engine call with `spaces.GPU` when `ADVISOR_ZERO_GPU=1`, so model loading and generation run on
|
| 195 |
the ZeroGPU allocation. The retrieval query embedder downloads the GGUF model through `huggingface_hub` unless
|
| 196 |
`ADVISOR_EMBEDDING_MODEL_PATH` points to a local file. `/api/transcribe` uses the same ZeroGPU wrapper for Nemotron ASR.
|
|
|
|
|
|
|
|
|
|
| 197 |
Local tests and CPU-only development still default to `ADVISOR_MODEL_BACKEND=rules`.
|
| 198 |
|
| 199 |
## Test
|
|
|
|
| 58 |
```
|
| 59 |
|
| 60 |
The app uses `data/projects.json` and `data/project_index.json` at runtime. The index validates the snapshot timestamp,
|
| 61 |
+
source, project order, searchable text digest, embedding dimensions, and normalized vector shape before the app starts.
|
| 62 |
+
The crawler snapshots every public Space in the org and, when README frontmatter declares `app_file`, includes that main
|
| 63 |
+
app file as the highest-signal project evidence for embedding. The canonical index is built on Modal with
|
| 64 |
+
`ggml-org/embeddinggemma-300m-qat-q8_0-GGUF` through llama.cpp; runtime search embeds the user query with the same GGUF
|
| 65 |
+
model and performs local cosine search over the checked-in vectors.
|
| 66 |
|
| 67 |
## Trace Artifact
|
| 68 |
|
|
|
|
| 196 |
`agent_turn` wraps the engine call with `spaces.GPU` when `ADVISOR_ZERO_GPU=1`, so model loading and generation run on
|
| 197 |
the ZeroGPU allocation. The retrieval query embedder downloads the GGUF model through `huggingface_hub` unless
|
| 198 |
`ADVISOR_EMBEDDING_MODEL_PATH` points to a local file. `/api/transcribe` uses the same ZeroGPU wrapper for Nemotron ASR.
|
| 199 |
+
On macOS local runs with `ADVISOR_MODEL_BACKEND=minicpm-transformers`, the app automatically runs llama.cpp query
|
| 200 |
+
embedding in a worker process so the MiniCPM PyTorch runtime and llama.cpp do not load conflicting OpenMP runtimes in
|
| 201 |
+
the same Python process.
|
| 202 |
Local tests and CPU-only development still default to `ADVISOR_MODEL_BACKEND=rules`.
|
| 203 |
|
| 204 |
## Test
|
data/project_index.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/projects.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/sample_trace.jsonl
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
{"app": "hackathon-advisor", "generated_at": "2026-06-
|
| 2 |
-
{"artifact_title": "A local-first archive cartographer for family photos", "input": "A local-first archive cartographer for family photos", "overall": 4.6, "plan_steps": 0, "response": "The ink bleeds around A local-first archive cartographer for family photos. Closest echoes: page
|
| 3 |
-
{"artifact_title": "
|
| 4 |
-
{"artifact_title": "
|
|
|
|
| 1 |
+
{"app": "hackathon-advisor", "generated_at": "2026-06-07T11:52:22+00:00", "idea_count": 2, "index": {"algorithm": "llama-cpp-embedding-v1", "index_generated_at": "2026-06-07T11:52:16+00:00", "snapshot_digest": "a442d8146fa01965567a27e43d42587de7048b93352e016151a524d2ecbf2034", "snapshot_generated_at": "2026-06-07T11:51:09+00:00"}, "schema_version": 1, "turn_count": 3, "type": "trace_manifest"}
|
| 2 |
+
{"artifact_title": "A local-first archive cartographer for family photos", "input": "A local-first archive cartographer for family photos", "overall": 4.6, "plan_steps": 0, "response": "The ink bleeds around A local-first archive cartographer for family photos. Closest echoes: page 67: Mini Fam; page 38: Family Bill Assistant; page 84: PawMap. The seal reads ECHO x4 at 4.6/10. Keep the audience, but change the mechanism or artifact so the demo proves a gap instead of joining a cluster.", "schema_version": 1, "tool_resolution": {"call": {"arguments": {"pitch": "A local-first archive cartographer for family photos", "title": "A local-first archive cartographer for family photos"}, "name": "save_idea"}, "errors": [], "status": "valid"}, "tools": [{"name": "save_idea", "summary": "Wrote idea page 'A local-first archive cartographer for family photos'."}, {"name": "search_projects", "summary": "Checked 5 closest project echoes."}, {"name": "score_idea", "summary": "Pressed a five-quadrant seal: 4.6/10."}], "turn_index": 1, "type": "agent_turn", "verdict": "ECHO x4"}
|
| 3 |
+
{"artifact_title": "Hands-on science coach", "input": "write bolder and find whitespace", "overall": 4.0, "plan_steps": 0, "response": "Gold gathers on Hands-on science coach. A lab-notebook companion that designs safe experiments from household materials. Nearest echoes are weak: Storybook, Yui Home Assisstant. The seal reads ECHO x4 at 4.0/10. The next move is to make one concrete before/after scene and cite the two weakest nearby echoes in the margin.", "schema_version": 1, "tool_resolution": {"call": {"arguments": {}, "name": "find_whitespace"}, "errors": [], "status": "valid"}, "tools": [{"name": "find_whitespace", "summary": "Ranked 4 under-explored regions."}, {"name": "save_idea", "summary": "Wrote idea page 'Hands-on science coach'."}, {"name": "score_idea", "summary": "Pressed a five-quadrant seal: 4.0/10."}], "turn_index": 2, "type": "agent_turn", "verdict": "ECHO x4"}
|
| 4 |
+
{"artifact_title": "Hands-on science coach", "input": "make a build plan", "overall": 4.0, "plan_steps": 6, "response": "The wax seal for Hands-on science coach reads 4.0/10, ECHO x4. The build path is: 1. Lock a one-sentence promise and one test input that proves what is different. 2. Compare against the nearest echoes, then sharpen the part only this idea can own. 3. Build the smallest happy path: input, nearby project citations, score, and one shareable output. 4. Add one selected-goal feature only after the core loop is smooth enough to explain without narration. 5. Collect successful advisor examples before training a tiny LoRA. 6. Write build notes from the exact decisions, screenshots, and outputs.", "schema_version": 1, "tool_resolution": {"call": {"arguments": {}, "name": "make_plan"}, "errors": [], "status": "valid"}, "tools": [{"name": "score_idea", "summary": "Pressed a five-quadrant seal: 4.0/10."}, {"name": "make_plan", "summary": "Drafted 6 build steps."}], "turn_index": 3, "type": "agent_turn", "verdict": "ECHO x4"}
|
hackathon_advisor/data.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
|
|
|
| 3 |
from collections.abc import Callable, Sequence
|
| 4 |
from dataclasses import dataclass
|
| 5 |
from datetime import datetime, timezone
|
|
@@ -7,11 +8,13 @@ from hashlib import sha256
|
|
| 7 |
import json
|
| 8 |
import math
|
| 9 |
from pathlib import Path
|
|
|
|
| 10 |
import re
|
| 11 |
from typing import Any
|
| 12 |
|
| 13 |
|
| 14 |
TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9.+_-]*", re.IGNORECASE)
|
|
|
|
| 15 |
GENERIC_PUBLIC_TITLE_RE = re.compile(
|
| 16 |
r"^(?:my\s+)?build\s+small\s+hackathon$",
|
| 17 |
re.IGNORECASE,
|
|
@@ -23,11 +26,12 @@ GENERIC_PUBLIC_SUMMARY_RE = re.compile(
|
|
| 23 |
re.IGNORECASE,
|
| 24 |
)
|
| 25 |
|
| 26 |
-
INDEX_SCHEMA_VERSION =
|
| 27 |
INDEX_ALGORITHM = "llama-cpp-embedding-v1"
|
| 28 |
DEFAULT_EMBEDDING_MODEL_REPO = "ggml-org/embeddinggemma-300m-qat-q8_0-GGUF"
|
| 29 |
DEFAULT_EMBEDDING_MODEL_FILE = "embeddinggemma-300m-qat-Q8_0.gguf"
|
| 30 |
DEFAULT_EMBEDDING_RUNTIME = "llama.cpp via llama-cpp-python"
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
EmbeddingFunction = Callable[[str], Sequence[float]]
|
|
@@ -48,6 +52,8 @@ class Project:
|
|
| 48 |
last_modified: str
|
| 49 |
host: str
|
| 50 |
url: str
|
|
|
|
|
|
|
| 51 |
|
| 52 |
@classmethod
|
| 53 |
def from_dict(cls, data: dict) -> "Project":
|
|
@@ -65,6 +71,8 @@ class Project:
|
|
| 65 |
last_modified=str(data.get("last_modified") or ""),
|
| 66 |
host=str(data.get("host") or ""),
|
| 67 |
url=str(data.get("url") or f"https://huggingface.co/spaces/{data['id']}"),
|
|
|
|
|
|
|
| 68 |
)
|
| 69 |
|
| 70 |
@property
|
|
@@ -73,15 +81,21 @@ class Project:
|
|
| 73 |
|
| 74 |
@property
|
| 75 |
def searchable_text(self) -> str:
|
| 76 |
-
return "
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
self.
|
| 81 |
-
"
|
| 82 |
-
"
|
| 83 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
]
|
|
|
|
| 85 |
)
|
| 86 |
|
| 87 |
def to_public_dict(self) -> dict:
|
|
@@ -99,6 +113,7 @@ class Project:
|
|
| 99 |
"last_modified": self.last_modified,
|
| 100 |
"host": self.host,
|
| 101 |
"url": self.url,
|
|
|
|
| 102 |
}
|
| 103 |
|
| 104 |
def to_snapshot_dict(self) -> dict:
|
|
@@ -116,6 +131,8 @@ class Project:
|
|
| 116 |
"last_modified": self.last_modified,
|
| 117 |
"host": self.host,
|
| 118 |
"url": self.url,
|
|
|
|
|
|
|
| 119 |
}
|
| 120 |
|
| 121 |
|
|
@@ -163,6 +180,99 @@ def public_project_summary(summary: str) -> str:
|
|
| 163 |
return cleaned
|
| 164 |
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
@dataclass(frozen=True)
|
| 167 |
class WhitespaceSeed:
|
| 168 |
label: str
|
|
@@ -433,7 +543,9 @@ def validate_index_payload(
|
|
| 433 |
indexed_ids = [document.get("project_id") for document in documents]
|
| 434 |
if indexed_ids != project_ids:
|
| 435 |
raise ValueError("project index project order does not match projects snapshot")
|
| 436 |
-
for document in documents:
|
|
|
|
|
|
|
| 437 |
vector = document.get("vector")
|
| 438 |
if not isinstance(vector, list) or len(vector) != dimensions:
|
| 439 |
raise ValueError("project index vector dimensions do not match embedding metadata")
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
import ast
|
| 4 |
from collections.abc import Callable, Sequence
|
| 5 |
from dataclasses import dataclass
|
| 6 |
from datetime import datetime, timezone
|
|
|
|
| 8 |
import json
|
| 9 |
import math
|
| 10 |
from pathlib import Path
|
| 11 |
+
from pathlib import PurePosixPath
|
| 12 |
import re
|
| 13 |
from typing import Any
|
| 14 |
|
| 15 |
|
| 16 |
TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9.+_-]*", re.IGNORECASE)
|
| 17 |
+
HTML_TAG_RE = re.compile(r"<[^>]+>")
|
| 18 |
GENERIC_PUBLIC_TITLE_RE = re.compile(
|
| 19 |
r"^(?:my\s+)?build\s+small\s+hackathon$",
|
| 20 |
re.IGNORECASE,
|
|
|
|
| 26 |
re.IGNORECASE,
|
| 27 |
)
|
| 28 |
|
| 29 |
+
INDEX_SCHEMA_VERSION = 3
|
| 30 |
INDEX_ALGORITHM = "llama-cpp-embedding-v1"
|
| 31 |
DEFAULT_EMBEDDING_MODEL_REPO = "ggml-org/embeddinggemma-300m-qat-q8_0-GGUF"
|
| 32 |
DEFAULT_EMBEDDING_MODEL_FILE = "embeddinggemma-300m-qat-Q8_0.gguf"
|
| 33 |
DEFAULT_EMBEDDING_RUNTIME = "llama.cpp via llama-cpp-python"
|
| 34 |
+
APP_FILE_EMBEDDING_CHAR_LIMIT = 8000
|
| 35 |
|
| 36 |
|
| 37 |
EmbeddingFunction = Callable[[str], Sequence[float]]
|
|
|
|
| 52 |
last_modified: str
|
| 53 |
host: str
|
| 54 |
url: str
|
| 55 |
+
app_file: str = ""
|
| 56 |
+
app_file_embedding_text: str = ""
|
| 57 |
|
| 58 |
@classmethod
|
| 59 |
def from_dict(cls, data: dict) -> "Project":
|
|
|
|
| 71 |
last_modified=str(data.get("last_modified") or ""),
|
| 72 |
host=str(data.get("host") or ""),
|
| 73 |
url=str(data.get("url") or f"https://huggingface.co/spaces/{data['id']}"),
|
| 74 |
+
app_file=str(data.get("app_file") or ""),
|
| 75 |
+
app_file_embedding_text=str(data.get("app_file_embedding_text") or ""),
|
| 76 |
)
|
| 77 |
|
| 78 |
@property
|
|
|
|
| 81 |
|
| 82 |
@property
def searchable_text(self) -> str:
    """Return the labelled, newline-joined text that represents this project.

    Each section is prefixed with a label (``title:``, ``slug:``, ...). The
    two app-file sections are only present when the corresponding attribute
    is non-empty, and any section that is blank after stripping is dropped.
    The index validator hashes this exact string, so its layout must not
    drift.
    """
    slug_words = self.slug.replace("-", " ").replace("_", " ")
    sections = [
        f"title: {self.title}",
        f"slug: {slug_words}",
        f"summary: {self.summary}",
        f"tags: {' '.join(self.tags)}",
        f"models: {' '.join(self.models)}",
        f"datasets: {' '.join(self.datasets)}",
    ]
    if self.app_file:
        sections.append(f"main app file: {self.app_file}")
    if self.app_file_embedding_text:
        sections.append(f"main app file content:\n{self.app_file_embedding_text}")
    return "\n".join(section for section in sections if section.strip())
|
| 100 |
|
| 101 |
def to_public_dict(self) -> dict:
|
|
|
|
| 113 |
"last_modified": self.last_modified,
|
| 114 |
"host": self.host,
|
| 115 |
"url": self.url,
|
| 116 |
+
"app_file": self.app_file,
|
| 117 |
}
|
| 118 |
|
| 119 |
def to_snapshot_dict(self) -> dict:
|
|
|
|
| 131 |
"last_modified": self.last_modified,
|
| 132 |
"host": self.host,
|
| 133 |
"url": self.url,
|
| 134 |
+
"app_file": self.app_file,
|
| 135 |
+
"app_file_embedding_text": self.app_file_embedding_text,
|
| 136 |
}
|
| 137 |
|
| 138 |
|
|
|
|
| 180 |
return cleaned
|
| 181 |
|
| 182 |
|
| 183 |
+
def extract_app_file_embedding_text(app_file: str, text: str) -> str:
    """Reduce the content of a Space's main app file to bounded embedding text.

    Args:
        app_file: Repository-relative path of the app file. Its suffix selects
            the extraction strategy.
        text: Raw content of that file.

    Returns:
        An empty string when either argument is missing or blank. Otherwise
        the extracted text, limited by ``APP_FILE_EMBEDDING_CHAR_LIMIT``:
        ``.py`` files are reduced to their signals via ``python_app_signals``;
        every other file type is passed through unchanged before bounding.
    """
    # ``or ""`` mirrors the handling of ``text``: a missing (None) app_file
    # must count as "no file" instead of becoming the literal name "None".
    cleaned_file = str(app_file or "").strip()
    cleaned_text = str(text or "")
    if not cleaned_file or not cleaned_text.strip():
        return ""

    if PurePosixPath(cleaned_file).suffix.lower() == ".py":
        body = python_app_signals(cleaned_text)
    else:
        body = cleaned_text
    return bounded_embedding_text(body, APP_FILE_EMBEDDING_CHAR_LIMIT)
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def python_app_signals(source: str) -> str:
    """Extract identifying names and string literals from Python source.

    Collected signals are: function and class names, positional parameter
    names, dotted names of called objects, keyword-argument names used in
    calls, and every string constant. The signals are cleaned and
    de-duplicated by ``ordered_normalized_text``.

    Args:
        source: Python source code.

    Returns:
        The newline-joined signals, or ``source`` unchanged when it cannot be
        parsed.
    """
    try:
        tree = ast.parse(source)
    except (SyntaxError, ValueError, RecursionError):
        # Crawled files are arbitrary user content: besides plain syntax
        # errors, NUL bytes raise ValueError on older interpreters and very
        # deep nesting can raise RecursionError. Fall back to the raw text in
        # all of these cases instead of aborting the crawl.
        return source

    signals: list[str] = []
    # NOTE: ast.walk does not promise a traversal order, so the signal order
    # is whatever the walker yields, not necessarily source order.
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            signals.append(node.name)
            signals.extend(arg.arg for arg in node.args.args)
        elif isinstance(node, ast.ClassDef):
            signals.append(node.name)
        elif isinstance(node, ast.Call):
            name = call_name(node.func)
            if name:
                signals.append(name)
            # ``**kwargs`` expansions have ``keyword.arg is None``; skip them.
            signals.extend(keyword.arg for keyword in node.keywords if keyword.arg)
        elif isinstance(node, ast.Constant) and isinstance(node.value, str):
            signals.append(node.value)

    return ordered_normalized_text(signals)
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def call_name(node: ast.AST) -> str:
    """Return the dotted name for a ``Name``/``Attribute`` chain.

    ``a.b.c`` yields ``"a.b.c"``. When the chain is rooted in something that
    is neither a name nor an attribute (for example a call result), only the
    trailing attribute names are returned. Any other node yields ``""``.
    """
    if isinstance(node, ast.Attribute):
        prefix = call_name(node.value)
        return node.attr if not prefix else f"{prefix}.{node.attr}"
    return node.id if isinstance(node, ast.Name) else ""
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
def ordered_normalized_text(values: Sequence[str]) -> str:
    """Clean each value and join the distinct non-empty results with newlines.

    Values are passed through ``clean_embedding_signal``; empty results are
    dropped and only the first occurrence of each cleaned value is kept, in
    first-seen order.
    """
    # A dict keeps insertion order, so it doubles as an ordered set here.
    unique: dict[str, None] = {}
    for raw in values:
        signal = clean_embedding_signal(raw)
        if signal:
            unique.setdefault(signal, None)
    return "\n".join(unique)
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
def clean_embedding_signal(value: str) -> str:
    """Normalise one signal: strip HTML tags, collapse whitespace, drop CSS-like blobs.

    Returns the cleaned text, or ``""`` when ``looks_like_style_blob`` flags
    the cleaned text.
    """
    without_tags = HTML_TAG_RE.sub(" ", str(value))
    collapsed = " ".join(without_tags.split())
    return "" if looks_like_style_blob(collapsed) else collapsed
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
def looks_like_style_blob(text: str) -> bool:
    """Heuristically decide whether ``text`` is a block of CSS-like styling.

    Text shorter than 80 characters is never flagged. Longer text is flagged
    when it contains at least 8 style markers (braces, semicolons,
    ``!important``, ``rgba(``, ``linear-gradient``) and those markers exceed
    a density of 0.015 per character.
    """
    length = len(text)
    if length < 80:
        return False
    markers = ("{", "}", ";", "!important", "rgba(", "linear-gradient")
    hits = sum(text.count(marker) for marker in markers)
    if hits < 8:
        return False
    return hits / length > 0.015
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def bounded_embedding_text(text: str, limit: int) -> str:
    """Collapse whitespace and cap the result at ``limit`` characters.

    Text that fits is returned after whitespace collapsing. Longer text keeps
    an equal-sized head and tail joined by ``" ... "`` so both the start and
    the end of the file contribute to the embedding.

    Args:
        text: Text to bound.
        limit: Maximum number of characters in the result.

    Returns:
        A string of at most ``limit`` characters (empty when ``limit <= 0``).
    """
    cleaned = " ".join(str(text).split())
    if len(cleaned) <= limit:
        return cleaned
    marker = " ... "
    edge = (limit - len(marker)) // 2
    if edge < 1:
        # The budget cannot hold head + marker + tail; a plain prefix is the
        # only way to honour the limit.
        return cleaned[: max(limit, 0)]
    return f"{cleaned[:edge].rstrip()}{marker}{cleaned[-edge:].lstrip()}"
|
| 274 |
+
|
| 275 |
+
|
| 276 |
@dataclass(frozen=True)
|
| 277 |
class WhitespaceSeed:
|
| 278 |
label: str
|
|
|
|
| 543 |
indexed_ids = [document.get("project_id") for document in documents]
|
| 544 |
if indexed_ids != project_ids:
|
| 545 |
raise ValueError("project index project order does not match projects snapshot")
|
| 546 |
+
for project, document in zip(projects, documents, strict=True):
|
| 547 |
+
if document.get("text_digest") != sha256(project.searchable_text.encode("utf-8")).hexdigest():
|
| 548 |
+
raise ValueError("project index text digest does not match searchable project text")
|
| 549 |
vector = document.get("vector")
|
| 550 |
if not isinstance(vector, list) or len(vector) != dimensions:
|
| 551 |
raise ValueError("project index vector dimensions do not match embedding metadata")
|
hackathon_advisor/llama_embedding.py
CHANGED
|
@@ -1,8 +1,14 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
from collections.abc import Sequence
|
| 4 |
-
|
|
|
|
| 5 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
from typing import Any
|
| 7 |
|
| 8 |
from hackathon_advisor.data import (
|
|
@@ -12,7 +18,8 @@ from hackathon_advisor.data import (
|
|
| 12 |
|
| 13 |
|
| 14 |
TRUE_VALUES = {"1", "true", "yes", "on"}
|
| 15 |
-
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
class LlamaCppEmbedder:
|
|
@@ -74,8 +81,114 @@ class LlamaCppEmbedder:
|
|
| 74 |
return self._model
|
| 75 |
|
| 76 |
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
model_repo=os.environ.get(
|
| 80 |
"ADVISOR_EMBEDDING_MODEL_REPO",
|
| 81 |
str(metadata.get("model_repo") or DEFAULT_EMBEDDING_MODEL_REPO),
|
|
@@ -111,3 +224,38 @@ def _optional_int_env(name: str) -> int | None:
|
|
| 111 |
if value <= 0:
|
| 112 |
raise RuntimeError(f"{name} must be a positive integer.")
|
| 113 |
return value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
from collections.abc import Sequence
|
| 4 |
+
import atexit
|
| 5 |
+
import json
|
| 6 |
import os
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import platform
|
| 9 |
+
import subprocess
|
| 10 |
+
import sys
|
| 11 |
+
import threading
|
| 12 |
from typing import Any
|
| 13 |
|
| 14 |
from hackathon_advisor.data import (
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
TRUE_VALUES = {"1", "true", "yes", "on"}
|
| 21 |
+
FALSE_VALUES = {"0", "false", "no", "off"}
|
| 22 |
+
DEFAULT_N_CTX = 2048
|
| 23 |
|
| 24 |
|
| 25 |
class LlamaCppEmbedder:
|
|
|
|
| 81 |
return self._model
|
| 82 |
|
| 83 |
|
| 84 |
+
class SubprocessLlamaCppEmbedder:
    """Embed text through a llama.cpp worker running in a separate Python process.

    The worker is this module started with ``--worker``. The protocol is
    line-delimited JSON over the worker's stdin/stdout: one configuration
    line after start-up, then one ``{"id", "text"}`` request per embedding,
    each answered by one ``{"id", "vector"}`` or ``{"id", "error"}`` line.

    The worker is started lazily on the first ``embed`` call and restarted
    automatically after it exits or after a protocol error.
    """

    def __init__(
        self,
        *,
        model_repo: str = DEFAULT_EMBEDDING_MODEL_REPO,
        model_file: str = DEFAULT_EMBEDDING_MODEL_FILE,
        model_path: str = "",
        n_ctx: int = DEFAULT_N_CTX,
        n_batch: int | None = None,
        n_threads: int | None = None,
        n_gpu_layers: int = 0,
        verbose: bool = False,
    ) -> None:
        """Store the worker configuration; no process is started yet.

        Blank ``model_repo``/``model_file`` fall back to the module defaults,
        and ``n_batch`` defaults to ``n_ctx``. When ``verbose`` is false the
        worker's stderr is discarded.
        """
        self.model_repo = model_repo.strip() or DEFAULT_EMBEDDING_MODEL_REPO
        self.model_file = model_file.strip() or DEFAULT_EMBEDDING_MODEL_FILE
        self.model_path = model_path.strip()
        self.n_ctx = n_ctx
        self.n_batch = n_batch or n_ctx
        self.n_threads = n_threads
        self.n_gpu_layers = n_gpu_layers
        self.verbose = verbose
        self._process: subprocess.Popen[str] | None = None
        self._request_id = 0
        # Serialises requests: the protocol has one outstanding request at a time.
        self._lock = threading.Lock()
        # Make sure the worker does not outlive the interpreter.
        atexit.register(self.close)

    def __call__(self, text: str) -> Sequence[float]:
        """Alias for :meth:`embed` so the instance can be used as a function."""
        return self.embed(text)

    def embed(self, text: str) -> Sequence[float]:
        """Return the embedding vector the worker computes for ``text``.

        Raises:
            RuntimeError: If the worker cannot be reached, exits, answers with
                something that does not follow the protocol, or reports an
                error for this request.
        """
        with self._lock:
            process = self._ensure_process()
            self._request_id += 1
            request_id = self._request_id
            request = json.dumps({"id": request_id, "text": text}, ensure_ascii=False)
            try:
                # Explicit check instead of ``assert`` so it survives ``python -O``.
                if process.stdin is None or process.stdout is None:
                    raise OSError("llama.cpp embedding worker pipes are not available")
                process.stdin.write(f"{request}\n")
                process.stdin.flush()
                line = process.stdout.readline()
            except (OSError, ValueError) as error:
                # OSError covers a broken pipe; ValueError covers writing to a
                # pipe that has already been closed.
                self.close()
                raise RuntimeError("llama.cpp embedding worker stopped before returning a vector.") from error
            if not line:
                returncode = process.poll()
                self.close()
                detail = f" with exit code {returncode}" if returncode is not None else ""
                raise RuntimeError(f"llama.cpp embedding worker exited{detail}.")
            try:
                response = json.loads(line)
            except json.JSONDecodeError as error:
                # The stream is no longer trustworthy: drop the worker so the
                # next call starts from a clean, synchronised state.
                self.close()
                raise RuntimeError("llama.cpp embedding worker returned invalid JSON.") from error
            if not isinstance(response, dict):
                self.close()
                raise RuntimeError("llama.cpp embedding worker returned invalid JSON.")
            if response.get("id") != request_id:
                # A stale or foreign line means requests and responses are out
                # of step; restarting is the only reliable way to resync.
                self.close()
                raise RuntimeError("llama.cpp embedding worker returned an out-of-order response.")
            if response.get("error"):
                # Request-level failure: the worker itself is still in sync.
                raise RuntimeError(str(response["error"]))
            vector = response.get("vector")
            if not isinstance(vector, list):
                raise RuntimeError("llama.cpp embedding worker did not return a vector.")
            return vector

    def close(self) -> None:
        """Stop the worker, if any, and release its pipes. Safe to call repeatedly."""
        process = self._process
        self._process = None
        if process is None:
            return
        try:
            if process.poll() is None:
                process.terminate()
                try:
                    process.wait(timeout=2)
                except subprocess.TimeoutExpired:
                    process.kill()
                    process.wait(timeout=2)
        finally:
            # Popen does not close its pipe objects on its own; release the
            # file descriptors explicitly so restarts do not leak them.
            for pipe in (process.stdin, process.stdout):
                if pipe is None:
                    continue
                try:
                    pipe.close()
                except OSError:
                    # Best effort: flushing into an already-dead worker fails
                    # with a broken pipe, which is irrelevant during shutdown.
                    pass

    def _ensure_process(self) -> subprocess.Popen[str]:
        """Return the running worker, starting and configuring one if needed.

        Raises:
            RuntimeError: If the configuration line cannot be delivered to a
                freshly started worker.
        """
        if self._process is not None and self._process.poll() is None:
            return self._process
        # Release whatever is left of a worker that exited on its own.
        self.close()
        process = subprocess.Popen(
            [sys.executable, "-u", "-m", "hackathon_advisor.llama_embedding", "--worker"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=None if self.verbose else subprocess.DEVNULL,
            text=True,
            # Run from the directory above this package so ``-m`` can import it.
            cwd=Path(__file__).resolve().parents[1],
        )
        self._process = process
        config = json.dumps(
            {
                "model_repo": self.model_repo,
                "model_file": self.model_file,
                "model_path": self.model_path,
                "n_ctx": self.n_ctx,
                "n_batch": self.n_batch,
                "n_threads": self.n_threads,
                "n_gpu_layers": self.n_gpu_layers,
                "verbose": self.verbose,
            },
            ensure_ascii=False,
        )
        try:
            if process.stdin is None:
                raise OSError("llama.cpp embedding worker pipes are not available")
            process.stdin.write(f"{config}\n")
            process.stdin.flush()
        except (OSError, ValueError) as error:
            # Do not keep a worker that never received its configuration.
            self.close()
            raise RuntimeError("llama.cpp embedding worker stopped before returning a vector.") from error
        return process
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def create_llama_cpp_embedder(metadata: dict[str, Any]) -> LlamaCppEmbedder | SubprocessLlamaCppEmbedder:
|
| 190 |
+
embedder_cls = SubprocessLlamaCppEmbedder if _use_subprocess_embedder() else LlamaCppEmbedder
|
| 191 |
+
return embedder_cls(
|
| 192 |
model_repo=os.environ.get(
|
| 193 |
"ADVISOR_EMBEDDING_MODEL_REPO",
|
| 194 |
str(metadata.get("model_repo") or DEFAULT_EMBEDDING_MODEL_REPO),
|
|
|
|
| 224 |
if value <= 0:
|
| 225 |
raise RuntimeError(f"{name} must be a positive integer.")
|
| 226 |
return value
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def _use_subprocess_embedder() -> bool:
    """Decide whether query embedding should run in a worker process.

    ``ADVISOR_EMBEDDING_SUBPROCESS`` wins when it holds a recognised true or
    false value. Otherwise the worker is used only on macOS (``Darwin``) when
    ``ADVISOR_MODEL_BACKEND`` is ``minicpm`` or ``minicpm-transformers``.
    """
    override = os.environ.get("ADVISOR_EMBEDDING_SUBPROCESS", "").strip().lower()
    if override in TRUE_VALUES or override in FALSE_VALUES:
        return override in TRUE_VALUES
    if platform.system() != "Darwin":
        return False
    backend = os.environ.get("ADVISOR_MODEL_BACKEND", "").strip().lower()
    return backend in {"minicpm", "minicpm-transformers"}
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def _worker_loop() -> None:
    """Serve embedding requests over stdin/stdout until stdin is closed.

    The first stdin line is a JSON object of ``LlamaCppEmbedder`` keyword
    arguments. Every following non-blank line is a JSON request with ``id``
    and ``text``; each one is answered with exactly one JSON line containing
    the same ``id`` and either ``vector`` or ``error``.
    """
    config_line = sys.stdin.readline()
    if not config_line:
        # The parent went away before sending a configuration.
        return
    embedder = LlamaCppEmbedder(**json.loads(config_line))
    for line in sys.stdin:
        if not line.strip():
            continue
        request_id = None
        try:
            request = json.loads(line)
            if not isinstance(request, dict):
                raise ValueError("embedding request must be a JSON object")
            request_id = request.get("id")
            vector = list(embedder.embed(str(request.get("text") or "")))
            # Serialise inside the ``try`` so an unserialisable vector turns
            # into an error response instead of terminating the worker.
            payload = json.dumps({"id": request_id, "vector": vector})
        except Exception as error:
            # Boundary handler: every request must receive exactly one
            # response line, otherwise the parent blocks on its read.
            payload = json.dumps({"id": request_id, "error": str(error)})
        print(payload, flush=True)
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
if __name__ == "__main__":
|
| 258 |
+
if len(sys.argv) == 2 and sys.argv[1] == "--worker":
|
| 259 |
+
_worker_loop()
|
| 260 |
+
else:
|
| 261 |
+
raise SystemExit("usage: python -m hackathon_advisor.llama_embedding --worker")
|
scripts/build_project_index.py
CHANGED
|
@@ -81,6 +81,7 @@ def build_payload(
|
|
| 81 |
"build_source": build_source,
|
| 82 |
"builder": builder,
|
| 83 |
"llama_cpp_python_version": importlib.metadata.version("llama-cpp-python"),
|
|
|
|
| 84 |
}
|
| 85 |
if modal_app:
|
| 86 |
metadata["modal_app"] = modal_app
|
|
|
|
| 81 |
"build_source": build_source,
|
| 82 |
"builder": builder,
|
| 83 |
"llama_cpp_python_version": importlib.metadata.version("llama-cpp-python"),
|
| 84 |
+
"n_ctx": n_ctx,
|
| 85 |
}
|
| 86 |
if modal_app:
|
| 87 |
metadata["modal_app"] = modal_app
|
scripts/crawl_hf_spaces.py
CHANGED
|
@@ -5,11 +5,16 @@ import argparse
|
|
| 5 |
from datetime import datetime, timezone
|
| 6 |
import json
|
| 7 |
from pathlib import Path
|
| 8 |
-
import
|
|
|
|
| 9 |
from typing import Any
|
| 10 |
-
|
| 11 |
-
from
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
API = "https://huggingface.co/api"
|
|
@@ -19,20 +24,13 @@ def main() -> None:
|
|
| 19 |
parser = argparse.ArgumentParser(description="Snapshot public Spaces in a Hugging Face org.")
|
| 20 |
parser.add_argument("--org", default="build-small-hackathon")
|
| 21 |
parser.add_argument("--out", default="data/projects.json")
|
| 22 |
-
parser.add_argument("--limit", type=int, default=100)
|
| 23 |
args = parser.parse_args()
|
| 24 |
|
| 25 |
-
|
| 26 |
-
projects = []
|
| 27 |
-
for item in spaces:
|
| 28 |
-
space_id = item["id"]
|
| 29 |
-
detail = fetch_json(f"{API}/spaces/{quote(space_id, safe='/')}")
|
| 30 |
-
projects.append(project_from_detail(detail))
|
| 31 |
-
time.sleep(0.05)
|
| 32 |
|
| 33 |
payload = {
|
| 34 |
"generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
|
| 35 |
-
"source": f"{API}/spaces?author={args.org}
|
| 36 |
"projects": sorted(projects, key=lambda project: project["id"].lower()),
|
| 37 |
}
|
| 38 |
output = Path(args.out)
|
|
@@ -41,38 +39,141 @@ def main() -> None:
|
|
| 41 |
print(f"wrote {len(projects)} projects to {output}")
|
| 42 |
|
| 43 |
|
| 44 |
-
def
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
| 51 |
|
| 52 |
|
| 53 |
-
def
|
| 54 |
-
card =
|
| 55 |
-
space_id = str(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
title = str(card.get("title") or humanize_slug(space_id.rsplit("/", 1)[-1]))
|
| 57 |
summary = str(card.get("short_description") or card.get("description") or "")
|
| 58 |
-
tags = sorted(set(str(tag) for tag in (card.get("tags") or detail.get("tags") or [])))
|
| 59 |
return {
|
| 60 |
"id": space_id,
|
| 61 |
"title": title,
|
| 62 |
"summary": summary,
|
| 63 |
-
"tags": tags,
|
| 64 |
-
"models": [str(model) for model in
|
| 65 |
-
"datasets": [
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
| 68 |
"license": str(card.get("license") or ""),
|
| 69 |
-
"created_at":
|
| 70 |
-
"last_modified":
|
| 71 |
-
"host":
|
| 72 |
"url": f"https://huggingface.co/spaces/{space_id}",
|
|
|
|
|
|
|
| 73 |
}
|
| 74 |
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
def humanize_slug(slug: str) -> str:
    """Turn a ``-``/``_`` separated slug into a title-cased, space-separated label."""
    words = [word for word in slug.replace("_", "-").split("-") if word]
    return " ".join(words).title()
|
| 78 |
|
|
|
|
| 5 |
from datetime import datetime, timezone
|
| 6 |
import json
|
| 7 |
from pathlib import Path
|
| 8 |
+
from pathlib import PurePosixPath
|
| 9 |
+
import sys
|
| 10 |
from typing import Any
|
| 11 |
+
|
| 12 |
+
from huggingface_hub import HfApi, hf_hub_download
|
| 13 |
+
|
| 14 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 15 |
+
sys.path.insert(0, str(ROOT))
|
| 16 |
+
|
| 17 |
+
from hackathon_advisor.data import extract_app_file_embedding_text
|
| 18 |
|
| 19 |
|
| 20 |
API = "https://huggingface.co/api"
|
|
|
|
| 24 |
parser = argparse.ArgumentParser(description="Snapshot public Spaces in a Hugging Face org.")
|
| 25 |
parser.add_argument("--org", default="build-small-hackathon")
|
| 26 |
parser.add_argument("--out", default="data/projects.json")
|
|
|
|
| 27 |
args = parser.parse_args()
|
| 28 |
|
| 29 |
+
projects = crawl_projects(args.org)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
payload = {
|
| 32 |
"generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
|
| 33 |
+
"source": f"{API}/spaces?author={args.org}",
|
| 34 |
"projects": sorted(projects, key=lambda project: project["id"].lower()),
|
| 35 |
}
|
| 36 |
output = Path(args.out)
|
|
|
|
| 39 |
print(f"wrote {len(projects)} projects to {output}")
|
| 40 |
|
| 41 |
|
| 42 |
+
def crawl_projects(org: str) -> list[dict[str, Any]]:
    """Return one project record for every non-private Space listed for ``org``.

    Both the client and the listing call are given ``token=False``. Each
    listed Space is converted with ``project_from_space``; entries whose
    ``private`` attribute is truthy are left out.
    """
    client = HfApi(token=False)
    records: list[dict[str, Any]] = []
    for listed in client.list_spaces(author=org, full=True, token=False):
        if getattr(listed, "private", False):
            continue
        records.append(project_from_space(listed))
    return records
|
| 50 |
|
| 51 |
|
| 52 |
+
def project_from_space(space: Any) -> dict[str, Any]:
    """Build one snapshot record from a Space listing entry.

    Card metadata is combined with attributes read off ``space``. When the
    README front matter names an ``app_file``, that file is downloaded and
    passed through ``extract_app_file_embedding_text``.

    Raises:
        RuntimeError: if the declared ``app_file`` fails validation or is not
            among the files listed for the Space.
    """
    card = card_data(space)
    space_id = str(space.id)
    repo_files = sibling_names(space)

    # Only fetch the README when the listing says the repo has one.
    readme_text = ""
    if "README.md" in repo_files:
        readme_text = download_repo_text(space_id, "README.md")
    declared_app_file = readme_frontmatter(readme_text).get("app_file") or ""
    app_file = validate_app_file(str(declared_app_file), space_id=space_id)

    app_file_embedding_text = ""
    if app_file:
        if app_file not in repo_files:
            raise RuntimeError(f"{space_id} README frontmatter points to missing app_file: {app_file}")
        app_source = download_repo_text(space_id, app_file)
        app_file_embedding_text = extract_app_file_embedding_text(app_file, app_source)

    # The slug-derived title is only used when the card has no title.
    slug = space_id.rsplit("/", 1)[-1]
    title = str(card.get("title") or humanize_slug(slug))
    summary = str(card.get("short_description") or card.get("description") or "")

    # Tags prefer the card; models and datasets prefer the listing attributes.
    raw_tags = card.get("tags") or getattr(space, "tags", None) or []
    raw_models = getattr(space, "models", None) or card.get("models") or []
    raw_datasets = getattr(space, "datasets", None) or card.get("datasets") or []

    return {
        "id": space_id,
        "title": title,
        "summary": summary,
        "tags": sorted({str(tag) for tag in raw_tags}),
        "models": [str(model) for model in raw_models],
        "datasets": [str(dataset) for dataset in raw_datasets],
        "likes": int(getattr(space, "likes", None) or 0),
        "sdk": str(card.get("sdk") or getattr(space, "sdk", None) or ""),
        "license": str(card.get("license") or ""),
        "created_at": isoformat(getattr(space, "created_at", None)),
        "last_modified": isoformat(getattr(space, "last_modified", None)),
        "host": host_url(space),
        "url": f"https://huggingface.co/spaces/{space_id}",
        "app_file": app_file,
        "app_file_embedding_text": app_file_embedding_text,
    }
|
| 89 |
|
| 90 |
|
| 91 |
+
def card_data(space: Any) -> dict[str, Any]:
    """Return the Space's card metadata as a plain dict.

    Looks at ``space.card_data`` first and ``space.cardData`` second. A dict
    is returned as-is, an object exposing a callable ``to_dict`` is converted
    through it, and anything else (including missing data) yields ``{}``.
    """
    raw = getattr(space, "card_data", None)
    if not raw:
        raw = getattr(space, "cardData", None)
    if not raw:
        raw = {}

    if isinstance(raw, dict):
        return raw

    converter = getattr(raw, "to_dict", None)
    return dict(converter()) if callable(converter) else {}
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def sibling_names(space: Any) -> set[str]:
    """Collect the ``rfilename`` of every entry in ``space.siblings`` as strings.

    A missing or empty ``siblings`` attribute produces an empty set.
    """
    entries = getattr(space, "siblings", None) or []
    return set(str(entry.rfilename) for entry in entries)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def download_repo_text(repo_id: str, filename: str) -> str:
    """Fetch ``filename`` from the Space repo ``repo_id`` and return it as text.

    The file is requested with ``token=False`` and a 30 second ETag timeout,
    then decoded as UTF-8 from the path ``hf_hub_download`` returns.
    """
    local_path = hf_hub_download(
        repo_id=repo_id,
        repo_type="space",
        filename=filename,
        token=False,
        etag_timeout=30,
    )
    with Path(local_path).open(encoding="utf-8") as handle:
        return handle.read()
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def readme_frontmatter(readme: str) -> dict[str, str]:
    """Extract top-level ``key: value`` pairs from a README's front matter.

    The block must open with ``---`` on the first line and be closed by a
    ``---`` or ``...`` line; otherwise an empty dict is returned. Blank lines,
    comment lines, indented lines, list items, and lines without a colon are
    ignored, so only unindented scalar entries are captured.
    """
    rows = readme.splitlines()
    if not rows or rows[0].strip() != "---":
        return {}

    parsed: dict[str, str] = {}
    for row in rows[1:]:
        text = row.strip()
        if text in ("---", "..."):
            # Closing fence reached: the block is complete.
            return parsed
        if not text or text.startswith("#") or ":" not in row:
            continue
        # Indented rows and list items belong to nested values; skip them.
        is_nested = row[:1].isspace() or text.startswith("-")
        if is_nested:
            continue
        name, _, remainder = row.partition(":")
        name = name.strip()
        if name:
            parsed[name] = yaml_scalar(remainder)

    # The fence was never closed, so nothing is trusted.
    return {}
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def yaml_scalar(raw_value: str) -> str:
    """Parse the text following ``key:`` on one front-matter line into a string.

    Handles the small subset of YAML this crawler needs: plain scalars,
    single- or double-quoted scalars, and trailing `` # comment`` text.

    A quoted value is read up to its closing quote *before* any comment is
    stripped, so a `` #`` inside the quotes is kept as data. A value that
    consists solely of a comment yields an empty string. Quote escapes
    (``''`` or ``\\"``) are not interpreted; such values fall back to the
    plain-scalar handling below.

    Args:
        raw_value: Everything after the first colon of the line.

    Returns:
        The scalar with surrounding whitespace, matching outer quotes, and any
        trailing comment removed; ``""`` when no value is present.
    """
    value = raw_value.strip()
    if not value or value.startswith("#"):
        # Nothing but whitespace or a comment follows the colon.
        return ""

    quote = value[0]
    if quote in {"'", '"'}:
        closing = value.find(quote, 1)
        if closing != -1:
            remainder = value[closing + 1 :]
            trailing = remainder.lstrip()
            # A comment only counts when whitespace separates it from the value.
            comment_follows = trailing.startswith("#") and trailing != remainder
            if not trailing or comment_follows:
                return value[1:closing]

    # Plain scalar (or a quoted form not recognised above).
    if " #" in value:
        value = value.split(" #", 1)[0].rstrip()
    if value[:1] in {"'", '"'} and value[-1:] == value[:1]:
        return value[1:-1]
    return value
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def validate_app_file(app_file: str, *, space_id: str) -> str:
    """Normalise a front-matter ``app_file`` value and reject unsafe paths.

    Args:
        app_file: Path as written in the README front matter.
        space_id: Space identifier, used only in the error message.

    Returns:
        ``""`` when no path was given, otherwise the POSIX form of the path.

    Raises:
        RuntimeError: if the path is absolute, contains a ``..`` component,
            or ends with ``/``.
    """
    candidate = app_file.strip()
    if candidate == "":
        return ""

    posix_path = PurePosixPath(candidate)
    leaves_repo = posix_path.is_absolute() or ".." in posix_path.parts
    names_directory = candidate.endswith("/")
    if leaves_repo or names_directory:
        raise RuntimeError(f"{space_id} README frontmatter has an invalid app_file path: {app_file}")
    return posix_path.as_posix()
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def isoformat(value: Any) -> str:
    """Render a timestamp-like value as a string.

    ``None`` becomes ``""``. Objects with a callable ``isoformat`` attribute
    are rendered through it; everything else goes through ``str``.
    """
    if value is None:
        return ""
    render = getattr(value, "isoformat", None)
    return render() if callable(render) else str(value)
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def host_url(space: Any) -> str:
    """Return the Space's host URL.

    Uses ``space.host`` when it is set; otherwise builds
    ``https://<subdomain>.hf.space`` from ``space.subdomain``. Returns ``""``
    when neither attribute has a value.
    """
    explicit = str(getattr(space, "host", None) or "")
    if explicit:
        return explicit

    subdomain = str(getattr(space, "subdomain", None) or "")
    if not subdomain:
        return ""
    return f"https://{subdomain}.hf.space"
|
| 175 |
+
|
| 176 |
+
|
| 177 |
def humanize_slug(slug: str) -> str:
    """Turn a repo slug such as ``tiny_demo-app`` into a title-cased label.

    Underscores are treated like hyphens, empty segments are dropped, and the
    remaining words are joined with single spaces before title-casing.
    """
    words = filter(None, slug.replace("_", "-").split("-"))
    return " ".join(words).title()
|
| 179 |
|
tests/test_crawl_hf_spaces.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from types import SimpleNamespace
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
|
| 7 |
+
from scripts import crawl_hf_spaces
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def test_readme_frontmatter_extracts_app_file() -> None:
    """A quoted ``app_file`` with a trailing comment is parsed to the bare path."""
    readme = """---
title: Tiny Demo
app_file: "src/app.py" # main entrypoint
tags:
- gradio
---
# Tiny Demo
"""

    parsed = crawl_hf_spaces.readme_frontmatter(readme)

    assert parsed["app_file"] == "src/app.py"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def test_validate_app_file_rejects_untrusted_paths() -> None:
    """A path containing ``..`` is refused with the invalid-path error."""
    parent_relative = "../app.py"
    expectation = pytest.raises(RuntimeError, match="invalid app_file path")
    with expectation:
        crawl_hf_spaces.validate_app_file(parent_relative, space_id="build-small-hackathon/demo")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def test_project_from_space_downloads_frontmatter_app_file(monkeypatch) -> None:
    """The app file named in the README front matter feeds the embedding text."""
    space_id = "build-small-hackathon/demo"
    # Canned file contents keyed by (repo id, filename); any other request fails.
    canned_files = {
        (space_id, "README.md"): "---\napp_file: app.py\n---\n",
        (space_id, "app.py"): "import gradio as gr\ngr.Textbox(label='Idea')\n",
    }

    def fake_download(repo_id: str, filename: str) -> str:
        return canned_files[(repo_id, filename)]

    monkeypatch.setattr(crawl_hf_spaces, "download_repo_text", fake_download)
    listed_files = [SimpleNamespace(rfilename=name) for name in ("README.md", "app.py")]
    space = SimpleNamespace(
        id=space_id,
        card_data={"title": "Demo", "short_description": "Advisor demo", "sdk": "gradio"},
        siblings=listed_files,
        tags=["gradio"],
        models=[],
        datasets=[],
        likes=3,
        created_at=None,
        last_modified=None,
        host="https://example.test",
        private=False,
    )

    record = crawl_hf_spaces.project_from_space(space)

    assert record["app_file"] == "app.py"
    embedding_text = record["app_file_embedding_text"]
    assert "gr.Textbox" in embedding_text
    assert "Idea" in embedding_text
|
tests/test_data.py
CHANGED
|
@@ -3,7 +3,12 @@ from pathlib import Path
|
|
| 3 |
from tests.helpers import load_test_index
|
| 4 |
import json
|
| 5 |
|
| 6 |
-
from hackathon_advisor.data import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
def test_project_index_searches_snapshot() -> None:
|
|
@@ -54,6 +59,32 @@ def test_public_project_cards_hide_generic_submission_copy() -> None:
|
|
| 54 |
assert public["summary"] == ""
|
| 55 |
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
def test_project_index_rejects_mismatched_snapshot(tmp_path: Path) -> None:
|
| 58 |
payload = json.loads(Path("data/project_index.json").read_text(encoding="utf-8"))
|
| 59 |
payload["snapshot_generated_at"] = "2000-01-01T00:00:00+00:00"
|
|
|
|
| 3 |
from tests.helpers import load_test_index
|
| 4 |
import json
|
| 5 |
|
| 6 |
+
from hackathon_advisor.data import (
|
| 7 |
+
Project,
|
| 8 |
+
ProjectIndex,
|
| 9 |
+
public_project_summary,
|
| 10 |
+
public_project_title,
|
| 11 |
+
)
|
| 12 |
|
| 13 |
|
| 14 |
def test_project_index_searches_snapshot() -> None:
|
|
|
|
| 59 |
assert public["summary"] == ""
|
| 60 |
|
| 61 |
|
| 62 |
+
def test_searchable_text_includes_main_app_file_signals() -> None:
    """``searchable_text`` mentions the app file name and its extracted signals."""
    idea_canvas = Project(
        id="build-small-hackathon/idea-canvas",
        title="Idea Canvas",
        summary="",
        tags=("gradio",),
        models=(),
        datasets=(),
        likes=0,
        sdk="gradio",
        license="",
        created_at="",
        last_modified="",
        host="",
        url="https://example.test",
        app_file="app.py",
        app_file_embedding_text="score_idea\ngr.Textbox\nProject idea",
    )

    text = idea_canvas.searchable_text

    for fragment in ("main app file: app.py", "score_idea", "Project idea"):
        assert fragment in text
|
| 86 |
+
|
| 87 |
+
|
| 88 |
def test_project_index_rejects_mismatched_snapshot(tmp_path: Path) -> None:
|
| 89 |
payload = json.loads(Path("data/project_index.json").read_text(encoding="utf-8"))
|
| 90 |
payload["snapshot_generated_at"] = "2000-01-01T00:00:00+00:00"
|
tests/test_llama_embedding.py
CHANGED
|
@@ -3,7 +3,12 @@ import sys
|
|
| 3 |
from types import ModuleType
|
| 4 |
|
| 5 |
from hackathon_advisor.data import DEFAULT_EMBEDDING_MODEL_FILE, DEFAULT_EMBEDDING_MODEL_REPO
|
| 6 |
-
from hackathon_advisor.llama_embedding import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
def test_llama_embedder_uses_q8_defaults_and_configured_context(
|
|
@@ -60,3 +65,33 @@ def test_create_llama_embedder_accepts_explicit_batch(monkeypatch) -> None:
|
|
| 60 |
embedder = create_llama_cpp_embedder({"dimensions": 768})
|
| 61 |
|
| 62 |
assert embedder.n_batch == 256
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from types import ModuleType
|
| 4 |
|
| 5 |
from hackathon_advisor.data import DEFAULT_EMBEDDING_MODEL_FILE, DEFAULT_EMBEDDING_MODEL_REPO
|
| 6 |
+
from hackathon_advisor.llama_embedding import (
|
| 7 |
+
DEFAULT_N_CTX,
|
| 8 |
+
LlamaCppEmbedder,
|
| 9 |
+
SubprocessLlamaCppEmbedder,
|
| 10 |
+
create_llama_cpp_embedder,
|
| 11 |
+
)
|
| 12 |
|
| 13 |
|
| 14 |
def test_llama_embedder_uses_q8_defaults_and_configured_context(
|
|
|
|
| 65 |
embedder = create_llama_cpp_embedder({"dimensions": 768})
|
| 66 |
|
| 67 |
assert embedder.n_batch == 256
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def test_create_llama_embedder_can_isolate_native_runtime(monkeypatch) -> None:
    """``ADVISOR_EMBEDDING_SUBPROCESS=1`` selects the subprocess embedder."""
    monkeypatch.setenv("ADVISOR_EMBEDDING_SUBPROCESS", "1")
    index_config = {"dimensions": 768}

    created = create_llama_cpp_embedder(index_config)

    assert isinstance(created, SubprocessLlamaCppEmbedder)
    created.close()
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def test_create_llama_embedder_isolates_macos_minicpm_runtime(monkeypatch) -> None:
    """With no explicit override, Darwin plus the MiniCPM backend uses the subprocess embedder."""
    monkeypatch.delenv("ADVISOR_EMBEDDING_SUBPROCESS", raising=False)
    monkeypatch.setenv("ADVISOR_MODEL_BACKEND", "minicpm-transformers")
    monkeypatch.setattr("hackathon_advisor.llama_embedding.platform.system", lambda: "Darwin")
    index_config = {"dimensions": 768}

    created = create_llama_cpp_embedder(index_config)

    assert isinstance(created, SubprocessLlamaCppEmbedder)
    created.close()
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def test_create_llama_embedder_keeps_in_process_when_isolation_disabled(monkeypatch) -> None:
    """``ADVISOR_EMBEDDING_SUBPROCESS=0`` keeps the in-process embedder even on Darwin with MiniCPM."""
    monkeypatch.setenv("ADVISOR_EMBEDDING_SUBPROCESS", "0")
    monkeypatch.setenv("ADVISOR_MODEL_BACKEND", "minicpm-transformers")
    monkeypatch.setattr("hackathon_advisor.llama_embedding.platform.system", lambda: "Darwin")
    index_config = {"dimensions": 768}

    created = create_llama_cpp_embedder(index_config)

    assert isinstance(created, LlamaCppEmbedder)
|