File size: 3,544 Bytes
e12a049
13fe947
 
 
 
 
 
 
 
e12a049
 
 
 
13fe947
e12a049
 
 
 
13fe947
 
 
 
 
 
e12a049
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13fe947
 
 
e12a049
 
13fe947
e12a049
 
 
 
 
 
 
 
 
 
 
 
 
13fe947
 
 
e12a049
 
 
 
 
 
13fe947
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e12a049
 
 
 
13fe947
 
e12a049
13fe947
 
e12a049
13fe947
 
 
 
e12a049
13fe947
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python3
"""Modal wiring for the project index build.

The user-facing entrypoint is `scripts/build_project_index.py --location modal`,
which calls `run_remote_build` below. The shared embedding logic lives in
`scripts.build_project_index.build_payload`; this module only owns the Modal
app/image/remote-function definitions. `modal run scripts/modal_build_project_index.py`
also works for callers who prefer the Modal CLI directly.
"""
from __future__ import annotations

import json
from pathlib import Path
import sys
from typing import Any

import modal

# Put the directory two levels above this file at the front of sys.path (once)
# so the project imports that follow can be resolved.
ROOT = Path(__file__).resolve().parents[1]
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

from hackathon_advisor.data import DEFAULT_EMBEDDING_MODEL_FILE, DEFAULT_EMBEDDING_MODEL_REPO
from hackathon_advisor.llama_embedding import DEFAULT_N_CTX

# Name under which the Modal app is registered; also forwarded to
# `build_payload` by the remote function as `modal_app`.
APP_NAME = "hackathon-advisor-llama-index"

app = modal.App(APP_NAME)

# Base layer: Debian slim with Python 3.11 plus the pinned pip dependencies.
_base_image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "huggingface-hub>=0.36,<1",
    "llama-cpp-python>=0.3.26,<1",
)
# Local packages are added with copy=True, in the same order as before:
# `hackathon_advisor` first, then `scripts`.
_image_with_advisor = _base_image.add_local_python_source("hackathon_advisor", copy=True)
image = _image_with_advisor.add_local_python_source("scripts", copy=True)


@app.function(image=image, cpu=4.0, memory=4096, timeout=1800)
def build_project_index_remote(
    project_snapshot: dict[str, Any],
    model_repo: str,
    model_file: str,
    model_path: str = "",
    n_ctx: int = DEFAULT_N_CTX,
    n_threads: int | None = None,
) -> dict[str, Any]:
    """Build the project index inside the Modal function and return the payload.

    `build_payload` is called with a file path, so the snapshot is first
    serialized to a `projects.json` inside a temporary directory. That
    directory is removed once the build call has finished.

    Args:
        project_snapshot: Parsed contents of the projects file, as a dict.
        model_repo: Forwarded unchanged to `build_payload` (presumably the
            Hugging Face repository of the embedding model -- confirm there).
        model_file: Forwarded unchanged to `build_payload`.
        model_path: Forwarded unchanged to `build_payload`; defaults to "".
        n_ctx: Forwarded unchanged to `build_payload`.
        n_threads: Forwarded unchanged to `build_payload`; may be None.

    Returns:
        The value returned by `build_payload`.
    """
    import tempfile
    from pathlib import Path

    from scripts.build_project_index import build_payload

    # Everything besides the snapshot path; the provenance fields identify
    # this module and Modal app as the builder.
    build_options = {
        "model_repo": model_repo,
        "model_file": model_file,
        "model_path": model_path,
        "n_ctx": n_ctx,
        "n_threads": n_threads,
        "build_source": "modal remote function",
        "builder": "scripts/modal_build_project_index.py",
        "modal_app": APP_NAME,
    }

    with tempfile.TemporaryDirectory() as scratch_dir:
        snapshot_file = Path(scratch_dir) / "projects.json"
        serialized = json.dumps(project_snapshot, ensure_ascii=False)
        snapshot_file.write_text(serialized, encoding="utf-8")
        payload = build_payload(snapshot_file, **build_options)
    return payload


def run_remote_build(
    projects_path: Path,
    *,
    model_repo: str = DEFAULT_EMBEDDING_MODEL_REPO,
    model_file: str = DEFAULT_EMBEDDING_MODEL_FILE,
    model_path: str = "",
    n_ctx: int = DEFAULT_N_CTX,
    n_threads: int | None = None,
) -> dict[str, Any]:
    """Run the index build on Modal and hand back the resulting payload.

    This is the entry used by `scripts/build_project_index.py --location modal`.
    That script is an ordinary Python process rather than a `modal run`
    invocation, so the call is wrapped in this module's own `app.run()`
    context.

    Args:
        projects_path: UTF-8 JSON file whose parsed content is sent to the
            remote function.
        model_repo: Passed through to `build_project_index_remote`.
        model_file: Passed through to `build_project_index_remote`.
        model_path: Passed through to `build_project_index_remote`.
        n_ctx: Passed through to `build_project_index_remote`.
        n_threads: Passed through to `build_project_index_remote`.

    Returns:
        The payload returned by the remote function.
    """
    snapshot_text = projects_path.read_text(encoding="utf-8")
    project_snapshot = json.loads(snapshot_text)

    # Positional order must match build_project_index_remote's signature.
    remote_args = (
        project_snapshot,
        model_repo,
        model_file,
        model_path,
        n_ctx,
        n_threads,
    )
    with app.run():
        payload = build_project_index_remote.remote(*remote_args)
    return payload


@app.local_entrypoint()
def main(
    projects: str = "data/projects.json",
    out: str = "data/project_index.json",
    model_repo: str = DEFAULT_EMBEDDING_MODEL_REPO,
    model_file: str = DEFAULT_EMBEDDING_MODEL_FILE,
) -> None:
    """Build the index remotely from `projects` and write the payload to `out`.

    Runs under `modal run`, which already manages the app context, so no
    `app.run()` is opened here. Only the model repo and file are forwarded;
    the remote function's other parameters keep their defaults.
    """
    from scripts.build_project_index import write_payload

    snapshot_text = Path(projects).read_text(encoding="utf-8")
    project_snapshot = json.loads(snapshot_text)
    payload = build_project_index_remote.remote(project_snapshot, model_repo, model_file)

    destination = Path(out)
    write_payload(destination, payload)