tooktang committed on
Commit 1e1d0ce · verified · 1 Parent(s): a3adf95

Initial release: Qwen3-Reranker-4B CoreML ANE-optimized bundle + service

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ bundles/qwen3_reranker_ane_bundle_4b/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,133 @@
+ ---
+ license: apache-2.0
+ language:
+ - en
+ - zh
+ tags:
+ - qwen3
+ - reranker
+ - coreml
+ - apple-silicon
+ - ane
+ pipeline_tag: text-ranking
+ library_name: coremltools
+ base_model: Qwen/Qwen3-Reranker-4B
+ ---
+
+ # Qwen3-Reranker-4B-CoreML (ANE-Optimized)
+
+ ## English
+
+ This repository provides a pre-converted Core ML bundle derived from `Qwen3-Reranker-4B`, together with an OpenAI-style rerank API service for Apple Silicon.
+
+ ### Bundle Specs
+
+ | Item | Value |
+ | --- | --- |
+ | Base model | `Qwen/Qwen3-Reranker-4B` |
+ | Task | Text reranking |
+ | Profiles | `b1_s128` |
+ | Bundle path | `bundles/qwen3_reranker_ane_bundle_4b` |
+ | Default model id | `qwen3-reranker-4b-ane` |
+ | Package size (approx.) | `7.5 GB` |
+
+ ### Scope
+
+ - This release supports **text-only reranking**.
+ - Endpoints: `POST /rerank` and `POST /v1/rerank`.
+
+ ### Quick Start
+
+ ```bash
+ ./setup_venv.sh
+ ./run_server.sh
+ ```
+
+ Health check:
+
+ ```bash
+ curl -s http://127.0.0.1:8000/health
+ ```
+
+ Rerank request:
+
+ ```bash
+ curl -s http://127.0.0.1:8000/v1/rerank \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "query": "capital of China",
+ "documents": [
+ "The capital of China is Beijing.",
+ "Gravity is a force."
+ ],
+ "top_n": 2,
+ "return_documents": true
+ }'
+ ```
+
+ ### Notes
+
+ - Fixed-shape profile (`s128`), suited to low-power deployment.
+ - Inputs longer than the profile capacity return an explicit error.
+ - The first request incurs warm-up latency.
+ - The default compute setting is `cpu_and_ne` (ANE-preferred, not ANE-guaranteed).
+
+ ## Chinese
+
+ This repository provides a pre-converted Core ML bundle based on `Qwen3-Reranker-4B`, plus a ready-to-run text reranking service (`/v1/rerank`).
+
+ ### Bundle Specs
+
+ | Item | Value |
+ | --- | --- |
+ | Base model | `Qwen/Qwen3-Reranker-4B` |
+ | Task | Text reranking |
+ | Profile | `b1_s128` |
+ | Bundle path | `bundles/qwen3_reranker_ane_bundle_4b` |
+ | Default model id | `qwen3-reranker-4b-ane` |
+ | Package size (approx.) | `7.5 GB` |
+
+ ### Scope
+
+ - This release supports **text-only reranking** only.
+ - Endpoints: `POST /rerank` and `POST /v1/rerank`.
+
+ ### Quick Start
+
+ ```bash
+ ./setup_venv.sh
+ ./run_server.sh
+ ```
+
+ Health check:
+
+ ```bash
+ curl -s http://127.0.0.1:8000/health
+ ```
+
+ Rerank request:
+
+ ```bash
+ curl -s http://127.0.0.1:8000/v1/rerank \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "query": "capital of China",
+ "documents": [
+ "The capital of China is Beijing.",
+ "Gravity is a force."
+ ],
+ "top_n": 2,
+ "return_documents": true
+ }'
+ ```
+
+ ### Notes
+
+ - Fixed-shape profile (`s128`), geared toward low-power deployment.
+ - Inputs exceeding the profile limit return an explicit error.
+ - The first request incurs warm-up latency.
+ - The default is `cpu_and_ne`, which prefers ANE scheduling but does not guarantee 100% ANE-only execution.
+
+ ## License
+
+ Apache-2.0. Please also follow the license and usage terms of the base Qwen model.
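The curl request above can also be issued from Python with only the standard library. This is a hypothetical client sketch, not part of the repository: the payload fields mirror the request body documented in the README, and the base URL assumes the default host and port used by `run_server.sh`.

```python
import json
import urllib.request


def build_rerank_payload(query, documents, top_n=None, return_documents=False, instruction=None):
    """Build the JSON body expected by POST /v1/rerank (fields from the README)."""
    payload = {
        "query": query,
        "documents": list(documents),
        "return_documents": return_documents,
    }
    if top_n is not None:
        payload["top_n"] = top_n
    if instruction is not None:
        payload["instruction"] = instruction
    return payload


def rerank(base_url, query, documents, **kwargs):
    """POST a rerank request to a running service and return the parsed JSON response."""
    body = json.dumps(build_rerank_payload(query, documents, **kwargs)).encode("utf-8")
    req = urllib.request.Request(
        base_url.rstrip("/") + "/v1/rerank",
        data=body,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())


# Example (requires the server from the Quick Start to be running):
#   rerank("http://127.0.0.1:8000", "capital of China",
#          ["The capital of China is Beijing.", "Gravity is a force."],
#          top_n=2, return_documents=True)
```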
bundles/qwen3_reranker_ane_bundle_4b/manifest.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "format_version": 1,
+   "task": "rerank",
+   "model_name": "Qwen3-Reranker-4B",
+   "source_model_dir": "/Volumes/256G/Applications/ANE/Qwen3-Reranker-4B",
+   "tokenizer_dir": "tokenizer",
+   "hidden_size": 2560,
+   "yes_token_id": 9693,
+   "no_token_id": 2152,
+   "system_prompt": "Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".",
+   "pair_template": "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {document}",
+   "prefix_text": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n",
+   "suffix_text": "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n",
+   "created_at_utc": "2026-03-02T15:48:04.694204+00:00",
+   "profiles": [
+     {
+       "profile_id": "b1_s128",
+       "batch_size": 1,
+       "seq_len": 128,
+       "package_path": "packages/b1_s128.mlpackage",
+       "compiled_path": null,
+       "input_names": [
+         "input_ids",
+         "attention_mask"
+       ],
+       "output_name": "score"
+     }
+   ]
+ }
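The `prefix_text`, `pair_template`, and `suffix_text` fields above determine the exact string that gets scored for each query/document pair. A minimal sketch of that assembly (the constants are copied from the manifest; the runtime's own prompt-building helper is not part of this diff):

```python
# Constants copied verbatim from the bundle manifest above.
PREFIX = (
    "<|im_start|>system\nJudge whether the Document meets the requirements based on the "
    "Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\"."
    "<|im_end|>\n<|im_start|>user\n"
)
PAIR_TEMPLATE = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {document}"
SUFFIX = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"


def render_pair(query: str, document: str, instruction: str) -> str:
    """Assemble the full scoring prompt: prefix + filled pair template + suffix."""
    pair = PAIR_TEMPLATE.format(instruction=instruction, query=query, document=document)
    return PREFIX + pair + SUFFIX
```

The rendered string is then tokenized (left-padded to the profile's fixed `seq_len` of 128) before being fed to the Core ML model.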
bundles/qwen3_reranker_ane_bundle_4b/packages/b1_s128.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:954e30d62948183fd3323ac8db3b1b2be3526ef788a19028807ce39642c4450b
+ size 791454
bundles/qwen3_reranker_ane_bundle_4b/packages/b1_s128.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a76471b81324fce92ce1fe1edfbc9fe964abd77f3f97d84db70a1773f605de92
+ size 8043733440
bundles/qwen3_reranker_ane_bundle_4b/packages/b1_s128.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "fileFormatVersion": "1.0.0",
+   "itemInfoEntries": {
+     "82518FB0-9416-470C-98CD-55225EF89B96": {
+       "author": "com.apple.CoreML",
+       "description": "CoreML Model Specification",
+       "name": "model.mlmodel",
+       "path": "com.apple.CoreML/model.mlmodel"
+     },
+     "FB126581-AA8F-423B-B9F0-2EBA683BA825": {
+       "author": "com.apple.CoreML",
+       "description": "CoreML Model Weights",
+       "name": "weights",
+       "path": "com.apple.CoreML/weights"
+     }
+   },
+   "rootModelIdentifier": "82518FB0-9416-470C-98CD-55225EF89B96"
+ }
bundles/qwen3_reranker_ane_bundle_4b/tokenizer/chat_template.jinja ADDED
@@ -0,0 +1,85 @@
+ {%- if tools %}
+     {{- '<|im_start|>system\n' }}
+     {%- if messages[0].role == 'system' %}
+         {{- messages[0].content + '\n\n' }}
+     {%- endif %}
+     {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+     {%- for tool in tools %}
+         {{- "\n" }}
+         {{- tool | tojson }}
+     {%- endfor %}
+     {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+     {%- if messages[0].role == 'system' %}
+         {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+     {%- endif %}
+ {%- endif %}
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+ {%- for message in messages[::-1] %}
+     {%- set index = (messages|length - 1) - loop.index0 %}
+     {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+         {%- set ns.multi_step_tool = false %}
+         {%- set ns.last_query_index = index %}
+     {%- endif %}
+ {%- endfor %}
+ {%- for message in messages %}
+     {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+         {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+     {%- elif message.role == "assistant" %}
+         {%- set content = message.content %}
+         {%- set reasoning_content = '' %}
+         {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+             {%- set reasoning_content = message.reasoning_content %}
+         {%- else %}
+             {%- if '</think>' in message.content %}
+                 {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                 {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+             {%- endif %}
+         {%- endif %}
+         {%- if loop.index0 > ns.last_query_index %}
+             {%- if loop.last or (not loop.last and reasoning_content) %}
+                 {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+             {%- else %}
+                 {{- '<|im_start|>' + message.role + '\n' + content }}
+             {%- endif %}
+         {%- else %}
+             {{- '<|im_start|>' + message.role + '\n' + content }}
+         {%- endif %}
+         {%- if message.tool_calls %}
+             {%- for tool_call in message.tool_calls %}
+                 {%- if (loop.first and content) or (not loop.first) %}
+                     {{- '\n' }}
+                 {%- endif %}
+                 {%- if tool_call.function %}
+                     {%- set tool_call = tool_call.function %}
+                 {%- endif %}
+                 {{- '<tool_call>\n{"name": "' }}
+                 {{- tool_call.name }}
+                 {{- '", "arguments": ' }}
+                 {%- if tool_call.arguments is string %}
+                     {{- tool_call.arguments }}
+                 {%- else %}
+                     {{- tool_call.arguments | tojson }}
+                 {%- endif %}
+                 {{- '}\n</tool_call>' }}
+             {%- endfor %}
+         {%- endif %}
+         {{- '<|im_end|>\n' }}
+     {%- elif message.role == "tool" %}
+         {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+             {{- '<|im_start|>user' }}
+         {%- endif %}
+         {{- '\n<tool_response>\n' }}
+         {{- message.content }}
+         {{- '\n</tool_response>' }}
+         {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+             {{- '<|im_end|>\n' }}
+         {%- endif %}
+     {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+     {{- '<|im_start|>assistant\n' }}
+     {%- if enable_thinking is defined and enable_thinking is false %}
+         {{- '<think>\n\n</think>\n\n' }}
+     {%- endif %}
+ {%- endif %}
bundles/qwen3_reranker_ane_bundle_4b/tokenizer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+ size 11422650
bundles/qwen3_reranker_ane_bundle_4b/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "add_prefix_space": false,
+   "backend": "tokenizers",
+   "bos_token": null,
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "is_local": true,
+   "model_max_length": 131072,
+   "pad_token": "<|endoftext|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
qwen3_ane_rerank/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """Qwen3 reranker conversion + ANE serving toolkit."""
+
+ from .manifest import BundleManifest, ProfileEntry
+
+ __all__ = ["BundleManifest", "ProfileEntry"]
qwen3_ane_rerank/__main__.py ADDED
@@ -0,0 +1,5 @@
+ from .cli import main
+
+
+ if __name__ == "__main__":
+     main()
qwen3_ane_rerank/api.py ADDED
@@ -0,0 +1,84 @@
+ from __future__ import annotations
+
+ from typing import Any
+
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel, Field
+
+ from .runtime import Qwen3AneRerankRuntime
+
+
+ class RerankRequest(BaseModel):
+     query: str
+     documents: list[str]
+     model: str | None = None
+     top_n: int | None = Field(default=None, ge=1)
+     return_documents: bool = False
+     instruction: str | None = None
+     user: str | None = None
+
+
+ def create_app(runtime: Qwen3AneRerankRuntime, default_model_id: str | None = None) -> FastAPI:
+     app = FastAPI(title="Qwen3 ANE Reranker Service", version="0.1.0")
+
+     @app.get("/health")
+     def health() -> dict[str, Any]:
+         return {
+             "ok": True,
+             "task": "rerank",
+             "model": default_model_id or runtime.manifest.model_name,
+             "profiles": [
+                 {
+                     "id": p.entry.profile_id,
+                     "batch_size": p.entry.batch_size,
+                     "seq_len": p.entry.seq_len,
+                 }
+                 for p in runtime.profiles
+             ],
+         }
+
+     @app.post("/rerank")
+     @app.post("/v1/rerank")
+     def rerank(req: RerankRequest) -> dict[str, Any]:
+         try:
+             if req.query == "":
+                 raise ValueError("query must not be empty")
+             if not req.documents:
+                 raise ValueError("documents must not be empty")
+             if any(doc == "" for doc in req.documents):
+                 raise ValueError("documents must not contain empty strings")
+
+             results, prompt_tokens = runtime.rerank(
+                 query=req.query,
+                 documents=req.documents,
+                 top_n=req.top_n,
+                 instruction=req.instruction,
+             )
+
+             data = []
+             for row in results:
+                 item = {
+                     "object": "rerank_result",
+                     "index": row["index"],
+                     "relevance_score": row["relevance_score"],
+                 }
+                 if req.return_documents:
+                     item["document"] = req.documents[row["index"]]
+                 data.append(item)
+
+             model_name = req.model or default_model_id or runtime.manifest.model_name
+             return {
+                 "object": "list",
+                 "data": data,
+                 "model": model_name,
+                 "usage": {
+                     "prompt_tokens": int(prompt_tokens),
+                     "total_tokens": int(prompt_tokens),
+                 },
+             }
+         except ValueError as exc:
+             raise HTTPException(status_code=400, detail=str(exc)) from exc
+         except RuntimeError as exc:
+             raise HTTPException(status_code=500, detail=str(exc)) from exc
+
+     return app
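The handler above delegates scoring to `runtime.rerank`, whose implementation is not included in this commit chunk. A minimal sketch, assuming it scores each document independently and returns rows sorted by descending `relevance_score` with `top_n` applied, matching the result shape the handler consumes:

```python
def rank_scores(scores, top_n=None):
    """Sort per-document scores into the row shape api.py expects from
    runtime.rerank: highest relevance first, truncated to top_n."""
    rows = sorted(
        ({"index": i, "relevance_score": float(s)} for i, s in enumerate(scores)),
        key=lambda row: row["relevance_score"],
        reverse=True,
    )
    return rows if top_n is None else rows[:top_n]
```

Note that `index` refers to the position in the original `documents` list, which is why the handler can look up `req.documents[row["index"]]` when `return_documents` is set.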
qwen3_ane_rerank/cli.py ADDED
@@ -0,0 +1,174 @@
+ from __future__ import annotations
+
+ import argparse
+ from pathlib import Path
+
+ from .converter import BuildOptions, build_bundle
+ from .profiles import parse_profiles
+ from .runtime import Qwen3AneRerankRuntime
+
+
+ def _add_common_build_args(parser: argparse.ArgumentParser) -> None:
+     parser.add_argument(
+         "--profiles",
+         type=str,
+         default=None,
+         help="Shape profiles as comma list BxS (e.g. 1x128,4x128)",
+     )
+     parser.add_argument(
+         "--target",
+         type=str,
+         default="macOS14",
+         choices=["macOS14", "macOS15", "iOS17", "iOS18"],
+         help="Core ML minimum deployment target",
+     )
+     parser.add_argument(
+         "--compile-mlmodelc",
+         action=argparse.BooleanOptionalAction,
+         default=True,
+         help="Compile .mlpackage into .mlmodelc with coremlcompiler",
+     )
+     parser.add_argument(
+         "--system-prompt",
+         default=(
+             "Judge whether the Document meets the requirements based on the Query and the "
+             'Instruct provided. Note that the answer can only be "yes" or "no".'
+         ),
+         help="System prompt used in reranker prompt template",
+     )
+
+
+ def cmd_convert(args: argparse.Namespace) -> None:
+     profiles = parse_profiles(args.profiles)
+     options = BuildOptions(
+         model_dir=Path(args.model_dir),
+         bundle_dir=Path(args.bundle_dir),
+         profiles=profiles,
+         compile_mlmodelc=bool(args.compile_mlmodelc),
+         minimum_deployment_target=args.target,
+         system_prompt=args.system_prompt,
+     )
+     manifest = build_bundle(options)
+     print(f"Built bundle at: {Path(args.bundle_dir).resolve()}")
+     print(f"Model: {manifest.model_name}")
+     print(f"Task: {manifest.task}")
+     print(f"Hidden size: {manifest.hidden_size}")
+     print(f"Token ids yes/no: {manifest.yes_token_id}/{manifest.no_token_id}")
+     print("Profiles:")
+     for entry in manifest.profiles:
+         print(
+             f" - {entry.profile_id}: batch={entry.batch_size}, seq={entry.seq_len}, "
+             f"model={entry.compiled_path or entry.package_path}"
+         )
+
+
+ def cmd_serve(args: argparse.Namespace) -> None:
+     bundle_dir = Path(args.bundle_dir)
+     manifest_path = bundle_dir / "manifest.json"
+
+     if not manifest_path.exists():
+         if not args.auto_build:
+             raise SystemExit(
+                 f"Bundle not found at {bundle_dir}. Run convert first or pass --auto-build --model-dir."
+             )
+         if not args.model_dir:
+             raise SystemExit("--model-dir is required when --auto-build is enabled")
+
+         profiles = parse_profiles(args.profiles)
+         options = BuildOptions(
+             model_dir=Path(args.model_dir),
+             bundle_dir=bundle_dir,
+             profiles=profiles,
+             compile_mlmodelc=bool(args.compile_mlmodelc),
+             minimum_deployment_target=args.target,
+             system_prompt=args.system_prompt,
+         )
+         print("Bundle not found; building from source model...")
+         build_bundle(options)
+
+     runtime = Qwen3AneRerankRuntime(bundle_dir=bundle_dir, compute_units=args.compute_units)
+
+     from .api import create_app
+     import uvicorn
+
+     app = create_app(runtime=runtime, default_model_id=args.model_id)
+     uvicorn.run(
+         app,
+         host=args.host,
+         port=args.port,
+         log_level=args.log_level,
+     )
+
+
+ def build_parser() -> argparse.ArgumentParser:
+     parser = argparse.ArgumentParser(
+         prog="qwen3-ane-rerank",
+         description="Convert Qwen3-Reranker model to Core ML ANE bundle and serve /v1/rerank endpoint.",
+     )
+     subparsers = parser.add_subparsers(dest="command", required=True)
+
+     convert_parser = subparsers.add_parser(
+         "convert",
+         help="Convert local HF Qwen3-Reranker model into ANE-ready Core ML profile bundle",
+     )
+     convert_parser.add_argument("--model-dir", required=True, help="Path to source HF model directory")
+     convert_parser.add_argument(
+         "--bundle-dir",
+         required=True,
+         help="Output bundle directory (manifest + packages + tokenizer)",
+     )
+     _add_common_build_args(convert_parser)
+     convert_parser.set_defaults(func=cmd_convert)
+
+     serve_parser = subparsers.add_parser(
+         "serve",
+         help="Run /v1/rerank endpoint backed by Core ML ANE profiles",
+     )
+     serve_parser.add_argument(
+         "--bundle-dir",
+         required=True,
+         help="Bundle directory created by convert",
+     )
+     serve_parser.add_argument(
+         "--model-dir",
+         default=None,
+         help="Source HF model directory (required if --auto-build and bundle missing)",
+     )
+     serve_parser.add_argument(
+         "--auto-build",
+         action=argparse.BooleanOptionalAction,
+         default=True,
+         help="Auto-build bundle from --model-dir when manifest is missing",
+     )
+     _add_common_build_args(serve_parser)
+     serve_parser.add_argument("--host", default="127.0.0.1")
+     serve_parser.add_argument("--port", type=int, default=8000)
+     serve_parser.add_argument(
+         "--compute-units",
+         default="cpu_and_ne",
+         choices=["cpu_and_ne", "all", "cpu_only", "cpu_and_gpu"],
+         help="Core ML compute units preference",
+     )
+     serve_parser.add_argument(
+         "--model-id",
155
+ help="Model id returned in API responses",
156
+ )
157
+ serve_parser.add_argument(
158
+ "--log-level",
159
+ default="info",
160
+ choices=["critical", "error", "warning", "info", "debug", "trace"],
161
+ )
162
+ serve_parser.set_defaults(func=cmd_serve)
163
+
164
+ return parser
165
+
166
+
167
+ def main() -> None:
168
+ parser = build_parser()
169
+ args = parser.parse_args()
170
+ args.func(args)
171
+
172
+
173
+ if __name__ == "__main__":
174
+ main()
qwen3_ane_rerank/converter.py ADDED
@@ -0,0 +1,267 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from pathlib import Path
+ import shutil
+ import subprocess
+ from typing import Any
+
+ from .manifest import BundleManifest, ProfileEntry
+ from .profiles import ShapeProfile
+
+
+ DEFAULT_SYSTEM_PROMPT = (
+     "Judge whether the Document meets the requirements based on the Query and the "
+     'Instruct provided. Note that the answer can only be "yes" or "no".'
+ )
+ DEFAULT_PAIR_TEMPLATE = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {document}"
+ DEFAULT_PREFIX_TEMPLATE = "<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n"
+ DEFAULT_SUFFIX_TEXT = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+
+
+ @dataclass(slots=True)
+ class BuildOptions:
+     model_dir: Path
+     bundle_dir: Path
+     profiles: list[ShapeProfile]
+     compile_mlmodelc: bool = True
+     minimum_deployment_target: str = "macOS14"
+     system_prompt: str = DEFAULT_SYSTEM_PROMPT
+     pair_template: str = DEFAULT_PAIR_TEMPLATE
+     suffix_text: str = DEFAULT_SUFFIX_TEXT
+
+
+ def _import_conversion_deps() -> tuple[Any, Any, Any, Any, Any]:
+     try:
+         import numpy as np
+         import torch
+         import coremltools as ct
+         from transformers import AutoModelForCausalLM, AutoTokenizer
+     except Exception as exc:  # pragma: no cover - runtime dependency check
+         raise RuntimeError(
+             "Missing conversion dependencies. Install torch, transformers, coremltools, numpy."
+         ) from exc
+     return np, torch, ct, AutoModelForCausalLM, AutoTokenizer
+
+
+ def _resolve_target(ct: Any, raw: str) -> Any:
+     if raw == "macOS14":
+         return ct.target.macOS14
+     if raw == "macOS15":
+         return ct.target.macOS15
+     if raw == "iOS17":
+         return ct.target.iOS17
+     if raw == "iOS18":
+         return ct.target.iOS18
+     raise ValueError(f"Unsupported minimum deployment target: {raw}")
+
+
+ def _compile_mlpackage(package_path: Path, compiled_root: Path) -> Path:
+     compiled_root.mkdir(parents=True, exist_ok=True)
+     cmd = ["xcrun", "coremlcompiler", "compile", str(package_path), str(compiled_root)]
+     proc = subprocess.run(cmd, capture_output=True, text=True)
+     if proc.returncode != 0:
+         raise RuntimeError(
+             "coremlcompiler compile failed for "
+             f"{package_path.name}:\nSTDOUT:\n{proc.stdout}\nSTDERR:\n{proc.stderr}"
+         )
+
+     expected = compiled_root / f"{package_path.stem}.mlmodelc"
+     if expected.exists():
+         return expected
+
+     matches = sorted(compiled_root.glob("*.mlmodelc"), key=lambda p: p.stat().st_mtime)
+     if not matches:
+         raise RuntimeError(f"coremlcompiler succeeded but no .mlmodelc found under {compiled_root}")
+     return matches[-1]
+
+
+ def _ensure_tokenizer_has_pad_token(tokenizer: Any) -> None:
+     if tokenizer.pad_token_id is not None:
+         return
+     if tokenizer.eos_token is not None:
+         tokenizer.pad_token = tokenizer.eos_token
+         return
+     if tokenizer.unk_token is not None:
+         tokenizer.pad_token = tokenizer.unk_token
+         return
+     raise RuntimeError("Tokenizer has no pad/eos/unk token; cannot build fixed-shape ANE pipeline")
+
+
+ def _resolve_token_id(tokenizer: Any, token: str) -> int:
+     token_id = tokenizer.convert_tokens_to_ids(token)
+     if token_id is None:
+         raise RuntimeError(f"Unable to resolve token id for '{token}'")
+     unk_id = tokenizer.unk_token_id
+     if token_id < 0 or (unk_id is not None and token_id == unk_id):
+         raise RuntimeError(f"Token '{token}' is missing from tokenizer vocab")
+     return int(token_id)
+
+
+ def build_bundle(options: BuildOptions) -> BundleManifest:
+     np, torch, ct, AutoModelForCausalLM, AutoTokenizer = _import_conversion_deps()
+
+     model_dir = options.model_dir.resolve()
+     bundle_dir = options.bundle_dir.resolve()
+     packages_dir = bundle_dir / "packages"
+     compiled_dir = bundle_dir / "compiled"
+     tokenizer_dir = bundle_dir / "tokenizer"
+
+     bundle_dir.mkdir(parents=True, exist_ok=True)
+     packages_dir.mkdir(parents=True, exist_ok=True)
+     if options.compile_mlmodelc:
+         compiled_dir.mkdir(parents=True, exist_ok=True)
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         str(model_dir),
+         local_files_only=True,
+         trust_remote_code=False,
+         use_fast=True,
+     )
+     tokenizer.padding_side = "left"
+     _ensure_tokenizer_has_pad_token(tokenizer)
+     tokenizer.save_pretrained(str(tokenizer_dir))
+
+     model = AutoModelForCausalLM.from_pretrained(
+         str(model_dir),
+         local_files_only=True,
+         trust_remote_code=False,
+         dtype=torch.float32,
+     )
+     model = model.float().eval()
+     if hasattr(model, "config") and hasattr(model.config, "use_cache"):
+         model.config.use_cache = False
+     if hasattr(model, "config") and hasattr(model.config, "_attn_implementation"):
+         model.config._attn_implementation = "eager"
+
+     if not hasattr(model, "model") or not hasattr(model, "lm_head"):
+         raise RuntimeError("Unsupported model structure: expected .model backbone and .lm_head")
+
+     hidden_size = int(getattr(model.config, "hidden_size", 0))
+     if hidden_size <= 0:
+         raise RuntimeError("Unable to infer hidden size from model config")
+
+     yes_token_id = _resolve_token_id(tokenizer, "yes")
+     no_token_id = _resolve_token_id(tokenizer, "no")
+     score_weight = (
+         model.lm_head.weight[yes_token_id].detach().to(torch.float32)
+         - model.lm_head.weight[no_token_id].detach().to(torch.float32)
+     )
+
+     backbone = model.model
+
+     class Qwen3RerankWrapper(torch.nn.Module):
+         def __init__(self, language_backbone: Any, score_weight_vec: Any, seq_len: int):
+             super().__init__()
+             self.backbone = language_backbone
+             self.register_buffer("score_weight", score_weight_vec.view(1, -1), persistent=False)
+             causal = torch.tril(torch.ones((1, 1, seq_len, seq_len), dtype=torch.float32))
+             self.register_buffer("causal_template", causal, persistent=False)
+             self.neg_inf = -1e4
+
+         def _build_attention_bias(self, attention_mask: Any) -> Any:
+             mask = attention_mask.to(torch.float32)
+             key_valid = mask.unsqueeze(1).unsqueeze(1)
+             query_valid = mask.unsqueeze(1).unsqueeze(3)
+             allowed = self.causal_template * key_valid * query_valid
+             return (1.0 - allowed) * self.neg_inf
+
+         def forward(self, input_ids: Any, attention_mask: Any) -> Any:
+             input_ids = input_ids.to(torch.int64)
+             attention_bias = self._build_attention_bias(attention_mask)
+             outputs = self.backbone(
+                 input_ids=input_ids,
+                 attention_mask=attention_bias,
+                 return_dict=False,
+             )
+             last_hidden = outputs[0][:, -1, :]
+             logit_delta = (last_hidden * self.score_weight).sum(dim=1, keepdim=True)
+             return torch.sigmoid(logit_delta)
+
+     target = _resolve_target(ct, options.minimum_deployment_target)
+     prefix_text = DEFAULT_PREFIX_TEMPLATE.format(system_prompt=options.system_prompt)
+
+     profile_entries: list[ProfileEntry] = []
+     for profile in options.profiles:
+         profile_id = profile.profile_id
+         package_path = packages_dir / f"{profile_id}.mlpackage"
+         if package_path.exists():
+             shutil.rmtree(package_path)
+         wrapper = Qwen3RerankWrapper(backbone, score_weight, seq_len=profile.seq_len).eval()
+
+         input_ids = torch.full(
+             (profile.batch_size, profile.seq_len),
+             fill_value=int(tokenizer.pad_token_id),
+             dtype=torch.int32,
+         )
+         attention_mask = torch.ones(
+             (profile.batch_size, profile.seq_len),
+             dtype=torch.int32,
+         )
+
+         exported = torch.export.export(wrapper, (input_ids, attention_mask), strict=False)
+         exported = exported.run_decompositions({})
+
+         ct.convert(
+             exported,
+             convert_to="mlprogram",
+             minimum_deployment_target=target,
+             compute_precision=ct.precision.FLOAT16,
+             skip_model_load=True,
+             package_dir=str(package_path),
+             inputs=[
+                 ct.TensorType(
+                     name="input_ids",
+                     shape=input_ids.shape,
+                     dtype=np.int32,
+                 ),
+                 ct.TensorType(
+                     name="attention_mask",
+                     shape=attention_mask.shape,
+                     dtype=np.int32,
+                 ),
+             ],
+             outputs=[ct.TensorType(name="score")],
+         )
+
+         compiled_path: Path | None = None
+         if options.compile_mlmodelc:
+             compiled_path = _compile_mlpackage(package_path, compiled_dir)
+
+         profile_entries.append(
+             ProfileEntry(
+                 profile_id=profile_id,
+                 batch_size=profile.batch_size,
+                 seq_len=profile.seq_len,
+                 package_path=str(package_path.relative_to(bundle_dir)),
+                 compiled_path=(
+                     str(compiled_path.relative_to(bundle_dir)) if compiled_path is not None else None
+                 ),
+                 input_names=["input_ids", "attention_mask"],
+                 output_name="score",
+             )
+         )
+
+     manifest = BundleManifest.create(
+         model_name=Path(model_dir).name,
+         source_model_dir=str(model_dir),
+         tokenizer_dir=str(tokenizer_dir.relative_to(bundle_dir)),
+         hidden_size=hidden_size,
+         yes_token_id=yes_token_id,
+         no_token_id=no_token_id,
+         system_prompt=options.system_prompt,
+         pair_template=options.pair_template,
+         prefix_text=prefix_text,
+         suffix_text=options.suffix_text,
+         profiles=profile_entries,
+     )
+     manifest.save(bundle_dir)
+     return manifest
+
+
+ def build_bundle_if_missing(options: BuildOptions) -> BundleManifest:
+     bundle_dir = options.bundle_dir.resolve()
+     manifest_path = bundle_dir / "manifest.json"
+     if manifest_path.exists():
+         return BundleManifest.load(bundle_dir)
+     return build_bundle(options)
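The wrapper's `_build_attention_bias` fuses the causal lower-triangular template with key and query validity from the padding mask (padding is on the left, per `tokenizer.padding_side`), producing an additive bias of `0` where attention is allowed and `-1e4` elsewhere. An equivalent pure-Python sketch for a single sequence:

```python
NEG_INF = -1.0e4  # same additive mask value as the wrapper's self.neg_inf


def build_attention_bias(attention_mask):
    """Additive attention bias for one sequence: 0.0 where attention is
    allowed (key position is causal and both query and key are unpadded),
    NEG_INF everywhere else."""
    n = len(attention_mask)
    bias = []
    for q in range(n):
        row = []
        for k in range(n):
            causal = k <= q  # no attending to future positions
            allowed = causal and attention_mask[q] == 1 and attention_mask[k] == 1
            row.append(0.0 if allowed else NEG_INF)
        bias.append(row)
    return bias
```

Masking padded *query* rows as well as padded keys keeps the padded rows' outputs well-defined; only the final position's hidden state is read (`outputs[0][:, -1, :]`), which with left padding is always a real token.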
qwen3_ane_rerank/manifest.py ADDED
@@ -0,0 +1,109 @@
from __future__ import annotations

from dataclasses import asdict, dataclass
from datetime import datetime, timezone
import json
from pathlib import Path
from typing import Any


MANIFEST_FILENAME = "manifest.json"


@dataclass(slots=True)
class ProfileEntry:
    profile_id: str
    batch_size: int
    seq_len: int
    package_path: str
    compiled_path: str | None
    input_names: list[str]
    output_name: str


@dataclass(slots=True)
class BundleManifest:
    format_version: int
    task: str
    model_name: str
    source_model_dir: str
    tokenizer_dir: str
    hidden_size: int
    yes_token_id: int
    no_token_id: int
    system_prompt: str
    pair_template: str
    prefix_text: str
    suffix_text: str
    created_at_utc: str
    profiles: list[ProfileEntry]

    @classmethod
    def create(
        cls,
        *,
        model_name: str,
        source_model_dir: str,
        tokenizer_dir: str,
        hidden_size: int,
        yes_token_id: int,
        no_token_id: int,
        system_prompt: str,
        pair_template: str,
        prefix_text: str,
        suffix_text: str,
        profiles: list[ProfileEntry],
    ) -> "BundleManifest":
        return cls(
            format_version=1,
            task="rerank",
            model_name=model_name,
            source_model_dir=source_model_dir,
            tokenizer_dir=tokenizer_dir,
            hidden_size=hidden_size,
            yes_token_id=yes_token_id,
            no_token_id=no_token_id,
            system_prompt=system_prompt,
            pair_template=pair_template,
            prefix_text=prefix_text,
            suffix_text=suffix_text,
            created_at_utc=datetime.now(timezone.utc).isoformat(),
            profiles=profiles,
        )

    @classmethod
    def from_dict(cls, payload: dict[str, Any]) -> "BundleManifest":
        profiles = [ProfileEntry(**entry) for entry in payload["profiles"]]
        return cls(
            format_version=payload["format_version"],
            task=payload.get("task", "rerank"),
            model_name=payload["model_name"],
            source_model_dir=payload["source_model_dir"],
            tokenizer_dir=payload["tokenizer_dir"],
            hidden_size=payload["hidden_size"],
            yes_token_id=payload["yes_token_id"],
            no_token_id=payload["no_token_id"],
            system_prompt=payload["system_prompt"],
            pair_template=payload["pair_template"],
            prefix_text=payload["prefix_text"],
            suffix_text=payload["suffix_text"],
            created_at_utc=payload["created_at_utc"],
            profiles=profiles,
        )

    def to_dict(self) -> dict[str, Any]:
        payload = asdict(self)
        payload["profiles"] = [asdict(entry) for entry in self.profiles]
        return payload

    def save(self, bundle_dir: Path) -> Path:
        bundle_dir.mkdir(parents=True, exist_ok=True)
        path = bundle_dir / MANIFEST_FILENAME
        path.write_text(json.dumps(self.to_dict(), indent=2, ensure_ascii=False), encoding="utf-8")
        return path

    @classmethod
    def load(cls, bundle_dir: Path) -> "BundleManifest":
        path = bundle_dir / MANIFEST_FILENAME
        payload = json.loads(path.read_text(encoding="utf-8"))
        return cls.from_dict(payload)
qwen3_ane_rerank/profiles.py ADDED
@@ -0,0 +1,47 @@
from __future__ import annotations

from dataclasses import dataclass


DEFAULT_PROFILES = [
    (1, 128),
    (4, 128),
]


@dataclass(frozen=True, slots=True)
class ShapeProfile:
    batch_size: int
    seq_len: int

    @property
    def profile_id(self) -> str:
        return f"b{self.batch_size}_s{self.seq_len}"


def parse_profiles(raw: str | None) -> list[ShapeProfile]:
    if not raw:
        return [ShapeProfile(batch_size=b, seq_len=s) for b, s in DEFAULT_PROFILES]

    parsed: list[ShapeProfile] = []
    for item in raw.split(","):
        item = item.strip().lower()
        if not item:
            continue
        if "x" not in item:
            raise ValueError(f"Invalid profile '{item}'. Expected format BxS, e.g. 4x512")
        left, right = item.split("x", 1)
        batch_size = int(left)
        seq_len = int(right)
        if batch_size <= 0 or seq_len <= 0:
            raise ValueError(f"Invalid profile '{item}'. B and S must be positive")
        parsed.append(ShapeProfile(batch_size=batch_size, seq_len=seq_len))

    if not parsed:
        raise ValueError("No valid profiles parsed")

    unique = {(p.batch_size, p.seq_len): p for p in parsed}
    return [
        unique[key]
        for key in sorted(unique.keys(), key=lambda x: (x[1], x[0]))
    ]
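`parse_profiles` deduplicates (batch, seq) pairs and orders them by `seq_len` first, then `batch_size`. The effect of that dedup-and-sort, sketched on bare tuples:

```python
raw = [(4, 128), (1, 128), (4, 128), (1, 512)]

# Same logic as parse_profiles: dict keys drop the duplicate (4, 128),
# and the key (seq_len, batch_size) orders by sequence length first.
unique = dict.fromkeys(raw)
ordered = sorted(unique, key=lambda p: (p[1], p[0]))
print(ordered)  # [(1, 128), (4, 128), (1, 512)]
```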
qwen3_ane_rerank/runtime.py ADDED
@@ -0,0 +1,278 @@
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterable

from .manifest import BundleManifest, ProfileEntry


MAX_INPUT_TOKENS_PER_ITEM = 8192
MAX_TOTAL_TOKENS_PER_REQUEST = 300000
DEFAULT_INSTRUCTION = "Given a web search query, retrieve relevant passages that answer the query."


def _import_runtime_deps() -> tuple[Any, Any, Any]:
    try:
        import numpy as np
        import coremltools as ct
        from transformers import AutoTokenizer
    except Exception as exc:  # pragma: no cover - runtime dependency check
        raise RuntimeError(
            "Missing runtime dependencies. Install numpy, coremltools, transformers."
        ) from exc
    return np, ct, AutoTokenizer


@dataclass(slots=True)
class LoadedProfile:
    entry: ProfileEntry
    model_path: Path
    model: Any | None = None


class Qwen3AneRerankRuntime:
    def __init__(self, bundle_dir: str | Path, compute_units: str = "cpu_and_ne") -> None:
        np, ct, AutoTokenizer = _import_runtime_deps()
        self.np = np
        self.ct = ct

        self.bundle_dir = Path(bundle_dir).resolve()
        self.manifest = BundleManifest.load(self.bundle_dir)

        self.tokenizer = AutoTokenizer.from_pretrained(
            str(self.bundle_dir / self.manifest.tokenizer_dir),
            local_files_only=True,
            trust_remote_code=False,
            use_fast=True,
        )
        self.tokenizer.padding_side = "left"
        if self.tokenizer.pad_token_id is None:
            if self.tokenizer.eos_token is not None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            elif self.tokenizer.unk_token is not None:
                self.tokenizer.pad_token = self.tokenizer.unk_token
            else:
                raise RuntimeError("Tokenizer has no pad/eos/unk token")

        self.prefix_tokens = self.tokenizer.encode(
            self.manifest.prefix_text,
            add_special_tokens=False,
        )
        self.suffix_tokens = self.tokenizer.encode(
            self.manifest.suffix_text,
            add_special_tokens=False,
        )
        self.static_token_cost = len(self.prefix_tokens) + len(self.suffix_tokens)

        self.compute_units = self._resolve_compute_units(compute_units)

        self.profiles: list[LoadedProfile] = []
        for entry in self.manifest.profiles:
            package_path = self.bundle_dir / entry.package_path
            model_path = package_path
            if entry.compiled_path is not None:
                compiled_path = self.bundle_dir / entry.compiled_path
                if (compiled_path / "Manifest.json").exists():
                    model_path = compiled_path
            self.profiles.append(LoadedProfile(entry=entry, model_path=model_path))

        if not self.profiles:
            raise RuntimeError("No profiles found in manifest")

        self.max_profile_batch = max(p.entry.batch_size for p in self.profiles)
        self.max_profile_seq = max(p.entry.seq_len for p in self.profiles)

        if self.static_token_cost >= self.max_profile_seq:
            raise RuntimeError(
                "Profile seq_len is too small for reranker prompt template. "
                f"Need > {self.static_token_cost}, got {self.max_profile_seq}."
            )

    def _resolve_compute_units(self, raw: str) -> Any:
        mode = raw.strip().lower()
        cu = self.ct.ComputeUnit
        if mode == "cpu_and_ne":
            # Older coremltools releases lack CPU_AND_NE; fall back to ALL.
            return cu.CPU_AND_NE if hasattr(cu, "CPU_AND_NE") else cu.ALL
        if mode == "all":
            return cu.ALL
        if mode == "cpu_only":
            return cu.CPU_ONLY
        if mode == "cpu_and_gpu":
            return cu.CPU_AND_GPU
        raise ValueError(f"Unsupported compute unit mode: {raw}")

    def _get_model(self, profile: LoadedProfile) -> Any:
        if profile.model is None:
            profile.model = self.ct.models.MLModel(
                str(profile.model_path),
                compute_units=self.compute_units,
            )
        return profile.model

    def _select_profile(self, batch_size: int, seq_len: int) -> LoadedProfile | None:
        candidates = [
            p
            for p in self.profiles
            if p.entry.batch_size >= batch_size and p.entry.seq_len >= seq_len
        ]
        if not candidates:
            return None
        candidates.sort(key=lambda p: (p.entry.batch_size * p.entry.seq_len, p.entry.seq_len, p.entry.batch_size))
        return candidates[0]

    def _plan_chunks(self, lengths: list[int]) -> list[tuple[int, int, LoadedProfile]]:
        chunks: list[tuple[int, int, LoadedProfile]] = []
        i = 0
        n = len(lengths)
        while i < n:
            best: tuple[int, LoadedProfile] | None = None
            max_batch = min(self.max_profile_batch, n - i)
            for b in range(max_batch, 0, -1):
                max_len = max(lengths[i : i + b])
                profile = self._select_profile(batch_size=b, seq_len=max_len)
                if profile is not None:
                    best = (b, profile)
                    break
            if best is None:
                raise ValueError(
                    f"No profile can serve items starting at index {i}. Required seq_len={lengths[i]}"
                )
            b, profile = best
            chunks.append((i, i + b, profile))
            i += b
        return chunks

    def _predict_scores(self, profile: LoadedProfile, input_ids: Any, attention_mask: Any) -> Any:
        model = self._get_model(profile)
        out = model.predict(
            {
                profile.entry.input_names[0]: input_ids,
                profile.entry.input_names[1]: attention_mask,
            }
        )
        raw = out.get(profile.entry.output_name, next(iter(out.values())))
        scores = self.np.asarray(raw, dtype=self.np.float32)
        if scores.ndim == 0:
            scores = scores.reshape(1)
        elif scores.ndim == 2 and scores.shape[1] == 1:
            scores = scores[:, 0]
        elif scores.ndim > 1:
            scores = scores.reshape(scores.shape[0], -1)[:, 0]
        return scores

    def _validate_token_limits(self, token_lengths: Iterable[int]) -> None:
        lengths = list(token_lengths)
        if any(length <= 0 for length in lengths):
            raise ValueError("Input pair must not be empty")
        if any(length > MAX_INPUT_TOKENS_PER_ITEM for length in lengths):
            raise ValueError(
                f"Each pair must be <= {MAX_INPUT_TOKENS_PER_ITEM} tokens before truncation"
            )
        if sum(lengths) > MAX_TOTAL_TOKENS_PER_REQUEST:
            raise ValueError(
                f"Total tokens across request must be <= {MAX_TOTAL_TOKENS_PER_REQUEST}"
            )

    def _format_pair_text(self, query: str, document: str, instruction: str) -> str:
        if "{instruction}" not in self.manifest.pair_template:
            raise RuntimeError("Invalid pair template: missing {instruction}")
        if "{query}" not in self.manifest.pair_template:
            raise RuntimeError("Invalid pair template: missing {query}")
        if "{document}" not in self.manifest.pair_template:
            raise RuntimeError("Invalid pair template: missing {document}")
        return self.manifest.pair_template.format(
            instruction=instruction,
            query=query,
            document=document,
        )

    def _pair_token_len(self, pair_text: str) -> int:
        body_len = len(
            self.tokenizer.encode(
                pair_text,
                add_special_tokens=False,
                truncation=False,
            )
        )
        return self.static_token_cost + body_len

    def _build_pair_ids(self, pair_text: str, seq_len: int) -> list[int]:
        body_budget = seq_len - self.static_token_cost
        if body_budget <= 0:
            raise RuntimeError(f"seq_len={seq_len} is too small for reranker template")
        body_ids = self.tokenizer.encode(
            pair_text,
            add_special_tokens=False,
            truncation=True,
            max_length=body_budget,
        )
        return self.prefix_tokens + body_ids + self.suffix_tokens

    def rerank(
        self,
        query: str,
        documents: list[str],
        *,
        top_n: int | None = None,
        instruction: str | None = None,
    ) -> tuple[list[dict[str, Any]], int]:
        if not query:
            raise ValueError("query must not be empty")
        if not documents:
            raise ValueError("documents must not be empty")
        if any(doc == "" for doc in documents):
            raise ValueError("documents must not contain empty strings")

        instruction_text = instruction or DEFAULT_INSTRUCTION
        pair_texts = [self._format_pair_text(query, doc, instruction_text) for doc in documents]
        raw_lengths = [self._pair_token_len(text) for text in pair_texts]
        self._validate_token_limits(raw_lengths)

        too_long = [idx for idx, length in enumerate(raw_lengths) if length > self.max_profile_seq]
        if too_long:
            first = too_long[0]
            raise ValueError(
                f"pair at index {first} has {raw_lengths[first]} tokens, "
                f"but compiled profiles only support up to {self.max_profile_seq}. "
                "Rebuild bundle with larger seq profiles."
            )

        effective_lengths = [min(length, self.max_profile_seq) for length in raw_lengths]
        chunks = self._plan_chunks(effective_lengths)
        pad_id = int(self.tokenizer.pad_token_id)

        all_scores: list[Any] = []
        prompt_tokens = 0

        for start, end, profile in chunks:
            chunk_texts = pair_texts[start:end]
            profile_batch = profile.entry.batch_size
            seq_len = profile.entry.seq_len

            input_ids = self.np.full((profile_batch, seq_len), fill_value=pad_id, dtype=self.np.int32)
            attention_mask = self.np.zeros((profile_batch, seq_len), dtype=self.np.int32)

            for row, pair_text in enumerate(chunk_texts):
                ids = self._build_pair_ids(pair_text, seq_len=seq_len)
                tlen = len(ids)
                offset = seq_len - tlen
                input_ids[row, offset:] = self.np.asarray(ids, dtype=self.np.int32)
                attention_mask[row, offset:] = 1
                prompt_tokens += tlen

            scores = self._predict_scores(profile, input_ids, attention_mask)
            all_scores.append(scores[: len(chunk_texts)])

        merged_scores = self.np.concatenate(all_scores, axis=0).astype(self.np.float32)

        ranked = [
            {"index": idx, "relevance_score": float(score)}
            for idx, score in enumerate(merged_scores.tolist())
        ]
        ranked.sort(key=lambda item: item["relevance_score"], reverse=True)

        n_results = len(ranked) if top_n is None else max(1, min(int(top_n), len(ranked)))
        return ranked[:n_results], prompt_tokens
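`_plan_chunks` greedily packs consecutive pairs into the largest batch any compiled profile can serve, retrying with smaller batches when the chunk's longest item would overflow. A self-contained sketch with the default `(1, 128)` / `(4, 128)` profiles, where plain tuples stand in for `LoadedProfile`:

```python
PROFILES = [(1, 128), (4, 128)]  # (batch_size, seq_len), as in DEFAULT_PROFILES

def select(batch: int, seq: int):
    # Smallest fitting profile by total cells, mirroring _select_profile's sort key.
    fits = [p for p in PROFILES if p[0] >= batch and p[1] >= seq]
    return min(fits, key=lambda p: (p[0] * p[1], p[1], p[0])) if fits else None

def plan(lengths: list[int]):
    chunks, i = [], 0
    max_profile_batch = max(p[0] for p in PROFILES)
    while i < len(lengths):
        for b in range(min(max_profile_batch, len(lengths) - i), 0, -1):
            prof = select(b, max(lengths[i:i + b]))
            if prof is not None:
                chunks.append((i, i + b, prof))
                i += b
                break
        else:
            raise ValueError(f"no profile for item at index {i}")
    return chunks

print(plan([100, 90, 80, 70, 60]))
# Five items: one full b4_s128 chunk, then a single-item b1_s128 chunk.
```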
requirements-service.txt ADDED
@@ -0,0 +1,5 @@
numpy==2.4.2
coremltools==9.0
transformers==5.2.0
fastapi==0.135.1
uvicorn==0.41.0
run_server.sh ADDED
@@ -0,0 +1,74 @@
#!/usr/bin/env bash
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BUNDLE_DIR="${BUNDLE_DIR:-$ROOT_DIR/bundles/qwen3_reranker_ane_bundle_4b}"
HOST="${HOST:-127.0.0.1}"
PORT="${PORT:-8000}"
COMPUTE_UNITS="${COMPUTE_UNITS:-cpu_and_ne}"
MODEL_ID="${MODEL_ID:-qwen3-reranker-4b-ane}"

if [[ -n "${PYTHON_BIN:-}" ]]; then
  PY_BIN="$PYTHON_BIN"
elif [[ -x "$ROOT_DIR/.venv/bin/python" ]]; then
  PY_BIN="$ROOT_DIR/.venv/bin/python"
else
  PY_BIN="python3"
fi

if ! command -v "$PY_BIN" >/dev/null 2>&1; then
  echo "[ERROR] Python is not available: $PY_BIN"
  echo "Run first: python3.11 -m venv .venv && source .venv/bin/activate && python -m pip install -r requirements-service.txt"
  exit 1
fi

PY_MM="$($PY_BIN -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')"
case "$PY_MM" in
  3.10|3.11|3.12) ;;
  *)
    echo "[ERROR] Python ${PY_MM} is not supported."
    echo "coremltools cannot load its native libraries on this version (typical errors: libcoremlpython / libmilstoragepython)."
    echo "Use Python 3.11 instead:"
    echo "  python3.11 -m venv .venv"
    echo "  source .venv/bin/activate"
    echo "  python -m pip install -r requirements-service.txt"
    exit 1
    ;;
esac

if [[ ! -f "$BUNDLE_DIR/manifest.json" ]]; then
  echo "[ERROR] Bundle manifest not found: $BUNDLE_DIR/manifest.json"
  exit 1
fi

$PY_BIN - <<'PY'
import sys

errors = []
try:
    import coremltools  # noqa: F401
except Exception as e:
    errors.append(f"failed to import coremltools: {e}")

for mod in ("coremltools.libcoremlpython", "coremltools.libmilstoragepython"):
    try:
        __import__(mod)
    except Exception as e:
        errors.append(f"failed to load {mod}: {e}")

if errors:
    print("[ERROR] Core ML Python runtime libraries are unavailable:")
    for item in errors:
        print(" -", item)
    print("Make sure dependencies were installed in a Python 3.11 virtual environment.")
    sys.exit(1)
PY

cd "$ROOT_DIR"
exec "$PY_BIN" -m qwen3_ane_rerank serve \
  --bundle-dir "$BUNDLE_DIR" \
  --no-auto-build \
  --compute-units "$COMPUTE_UNITS" \
  --model-id "$MODEL_ID" \
  --host "$HOST" \
  --port "$PORT"
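Once the server is up, a rerank request can be posted to the endpoints listed in the README. The sketch below builds a request body whose field names mirror the runtime's `rerank()` signature (`query`, `documents`, `top_n`) plus the default model id; the service's exact schema may differ, so treat the payload shape as an assumption:

```shell
# Hypothetical request payload; field names follow runtime.rerank() and the
# README's model id, not a confirmed service schema.
cat > rerank_request.json <<'JSON'
{
  "model": "qwen3-reranker-4b-ane",
  "query": "what is the Apple Neural Engine?",
  "documents": ["The ANE is Apple's neural accelerator.", "Unrelated text."],
  "top_n": 1
}
JSON

# With the server running on the default host/port:
# curl -s http://127.0.0.1:8000/v1/rerank \
#   -H 'Content-Type: application/json' \
#   -d @rerank_request.json
cat rerank_request.json
```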
setup_venv.sh ADDED
@@ -0,0 +1,33 @@
#!/usr/bin/env bash
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$ROOT_DIR"

if ! command -v python3.11 >/dev/null 2>&1; then
  echo "[ERROR] python3.11 not found. Install Python 3.11 first."
  exit 1
fi

if [[ -d .venv ]]; then
  CUR_VER="$(.venv/bin/python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")' 2>/dev/null || true)"
  if [[ "$CUR_VER" != "3.11" ]]; then
    BACKUP_DIR=".venv.backup.$(date +%Y%m%d-%H%M%S)"
    echo "[INFO] Existing .venv uses Python ${CUR_VER:-unknown}; moving it to $BACKUP_DIR"
    mv .venv "$BACKUP_DIR"
  fi
fi

python3.11 -m venv .venv
source .venv/bin/activate
python -m pip install --upgrade pip
python -m pip install -r requirements-service.txt

python - <<'PY'
import sys
print("[OK] venv Python:", sys.version)
import coremltools
print("[OK] coremltools:", coremltools.__version__)
PY

echo "[DONE] Environment is ready. Run ./run_server.sh to start the service."