File size: 4,783 Bytes
74711df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""Backfill missing non-skill resume-section embeddings."""

from __future__ import annotations

import argparse
import hashlib

from sqlalchemy import delete, select

from src.core.config import get_settings
from src.llm.factory import build_default_llm_client
from src.llm.registry import ModelAliasRegistry
from src.storage.db import get_session
from src.storage.models import Embedding, ResumeSection
from src.storage.repositories import EmbeddingRepository


def run(
    *,
    embedding_alias: str | None,
    limit: int | None,
    resume_id: int | None,
    replace_existing: bool,
    dry_run: bool,
) -> int:
    """Runs section-embedding backfill with optional scoping and dry-run mode.

    Sections whose ``section_type`` is ``"skills"`` and sections with blank
    content are never embedded. A section that already has an embedding stored
    under the resolved target model is skipped without calling the embedding
    backend.

    Args:
        embedding_alias: Embedding alias to use; the configured
            ``embedding_model_alias`` is used when this is ``None`` or empty.
        limit: Maximum number of sections to load, applied to the section
            query itself (so already-embedded sections count toward it).
            Ignored when ``None`` or not positive.
        resume_id: When given, restricts both the optional delete and the
            backfill to sections of that resume.
        replace_existing: Delete embeddings stored under the target model
            before backfilling.
        dry_run: Roll the transaction back at the end instead of committing.

    Returns:
        ``0`` when no section failed, ``1`` otherwise.
    """
    session = get_session()
    inserted = 0
    skipped = 0
    failed = 0
    try:
        settings = get_settings()
        alias = embedding_alias or settings.embedding_model_alias
        target_model = ModelAliasRegistry(settings.model_aliases_path).get(alias).default_model
        client = build_default_llm_client()
        repo = EmbeddingRepository(session)

        if replace_existing:
            # NOTE(review): the delete is keyed on model (and resume) only, so
            # it also removes embeddings owned by "skills" sections, which the
            # loop below never recreates -- confirm this is intended.
            stale = delete(Embedding).where(Embedding.model == target_model)
            if resume_id is not None:
                section_ids = select(ResumeSection.id).where(ResumeSection.resume_id == resume_id)
                stale = stale.where(Embedding.owner_id.in_(section_ids))
            session.execute(stale)
            session.flush()

        # Plain (owner_id, model) tuples so membership tests and later
        # ``add`` calls all use the same key type.
        existing = {
            (owner_id, model)
            for owner_id, model in session.execute(select(Embedding.owner_id, Embedding.model))
        }

        stmt = select(ResumeSection).where(ResumeSection.section_type != "skills")
        if resume_id is not None:
            stmt = stmt.where(ResumeSection.resume_id == resume_id)
        stmt = stmt.order_by(ResumeSection.id)
        if limit is not None and limit > 0:
            stmt = stmt.limit(limit)

        sections = session.scalars(stmt).all()
        for section in sections:
            content = (section.content or "").strip()
            if not content:
                skipped += 1
                continue
            # Check before embedding: a rerun must not pay for a backend call
            # whose result would be thrown away.
            if (section.id, target_model) in existing:
                skipped += 1
                continue
            try:
                vectors, meta = client.embed_with_meta(texts=[content], embedding_model_alias=alias)
                if len(vectors) != 1:
                    raise ValueError(f"Expected exactly one vector, got {len(vectors)}")
                model = meta.selected_model or alias
                # The backend may report a model other than the alias default,
                # so the duplicate check is repeated with the reported name.
                if (section.id, model) in existing:
                    skipped += 1
                    continue
                repo.create(
                    owner_id=int(section.id),
                    model=model,
                    vector=[float(v) for v in vectors[0]],
                    text_hash=hashlib.sha256(content.encode("utf-8")).hexdigest(),
                )
                inserted += 1
                existing.add((section.id, model))
            except Exception as exc:
                # Best-effort per section: record the failure and keep going;
                # the non-zero return value reports it to the caller.
                failed += 1
                print(
                    f"section_id={section.id} status=error error_type={type(exc).__name__} error={exc}"
                )

        if dry_run:
            session.rollback()
        else:
            session.commit()

        print(f"inserted={inserted}")
        print(f"skipped={skipped}")
        print(f"failed={failed}")
        print(f"target_model={target_model}")
        print(f"replace_existing={replace_existing}")
        print(f"dry_run={dry_run}")
        return 0 if failed == 0 else 1
    finally:
        session.close()


def main() -> int:
    """Builds the command-line interface and hands the parsed options to ``run``.

    Returns the exit status produced by ``run``.
    """
    cli = argparse.ArgumentParser(description="Backfill non-skill resume section embeddings.")
    cli.add_argument(
        "--embedding-alias",
        default=None,
        help="Embedding alias override.",
    )
    cli.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Maximum number of sections to process.",
    )
    cli.add_argument(
        "--resume-id",
        type=int,
        default=None,
        help="Restrict processing to one resume.",
    )
    cli.add_argument(
        "--replace-existing",
        action="store_true",
        help="Delete existing embeddings for the selected model before backfilling.",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Run without committing changes.",
    )
    options = cli.parse_args()
    # The option destinations (embedding_alias, limit, resume_id,
    # replace_existing, dry_run) are exactly run()'s keyword-only parameters.
    return run(**vars(options))


# Script entry point: exit the process with the status code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())