"""Backfill missing non-skill resume-section embeddings.""" from __future__ import annotations import argparse import hashlib from sqlalchemy import delete, select from src.core.config import get_settings from src.llm.factory import build_default_llm_client from src.llm.registry import ModelAliasRegistry from src.storage.db import get_session from src.storage.models import Embedding, ResumeSection from src.storage.repositories import EmbeddingRepository def run( *, embedding_alias: str | None, limit: int | None, resume_id: int | None, replace_existing: bool, dry_run: bool, ) -> int: """Runs section-embedding backfill with optional scoping and dry-run mode.""" session = get_session() inserted = 0 skipped = 0 failed = 0 try: settings = get_settings() alias = embedding_alias or settings.embedding_model_alias target_model = ModelAliasRegistry(settings.model_aliases_path).get(alias).default_model client = build_default_llm_client() repo = EmbeddingRepository(session) if replace_existing: if resume_id is None: session.execute( delete(Embedding).where( Embedding.model == target_model, ) ) else: section_ids = select(ResumeSection.id).where(ResumeSection.resume_id == resume_id) session.execute( delete(Embedding).where( Embedding.model == target_model, Embedding.owner_id.in_(section_ids), ) ) session.flush() existing = set(session.execute(select(Embedding.owner_id, Embedding.model)).all()) stmt = select(ResumeSection).where(ResumeSection.section_type != "skills") if resume_id is not None: stmt = stmt.where(ResumeSection.resume_id == resume_id) stmt = stmt.order_by(ResumeSection.id) if limit is not None and limit > 0: stmt = stmt.limit(limit) sections = session.scalars(stmt).all() for section in sections: content = (section.content or "").strip() if not content: skipped += 1 continue try: vectors, meta = client.embed_with_meta(texts=[content], embedding_model_alias=alias) if len(vectors) != 1: raise ValueError(f"Expected exactly one vector, got {len(vectors)}") model = meta.selected_model or alias if (section.id, model) in existing: skipped += 1 continue repo.create( owner_id=int(section.id), model=model, vector=[float(v) for v in vectors[0]], text_hash=hashlib.sha256(content.encode("utf-8")).hexdigest(), ) inserted += 1 existing.add((section.id, model)) except Exception as exc: failed += 1 print( f"section_id={section.id} status=error error_type={type(exc).__name__} error={exc}" ) if dry_run: session.rollback() else: session.commit() print(f"inserted={inserted}") print(f"skipped={skipped}") print(f"failed={failed}") print(f"target_model={target_model}") print(f"replace_existing={replace_existing}") print(f"dry_run={dry_run}") return 0 if failed == 0 else 1 finally: session.close() def main() -> int: """Parses CLI arguments and runs the backfill command.""" parser = argparse.ArgumentParser(description="Backfill non-skill resume section embeddings.") parser.add_argument("--embedding-alias", default=None, help="Embedding alias override.") parser.add_argument( "--limit", type=int, default=None, help="Maximum number of sections to process." ) parser.add_argument( "--resume-id", type=int, default=None, help="Restrict processing to one resume." ) parser.add_argument( "--replace-existing", action="store_true", help="Delete existing embeddings for the selected model before backfilling.", ) parser.add_argument("--dry-run", action="store_true", help="Run without committing changes.") args = parser.parse_args() return run( embedding_alias=args.embedding_alias, limit=args.limit, resume_id=args.resume_id, replace_existing=args.replace_existing, dry_run=args.dry_run, ) if __name__ == "__main__": raise SystemExit(main())