feat: enforce native preloading of neural model to prevent HTTP connection timeouts
Browse files
- backend/main.py +12 -0
backend/main.py
CHANGED
|
@@ -58,6 +58,18 @@ async def lifespan(app: FastAPI):
|
|
| 58 |
|
| 59 |
app.state.qdrant = _qdrant_client
|
| 60 |
app.state.qdrant_ready = _qdrant_ready
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
yield
|
| 62 |
_qdrant_client.close()
|
| 63 |
|
|
|
|
| 58 |
|
| 59 |
app.state.qdrant = _qdrant_client
|
| 60 |
app.state.qdrant_ready = _qdrant_ready
|
| 61 |
+
|
| 62 |
+
# -----------------------------------------------------
|
| 63 |
+
# CRITICAL: Pre-load the 2.3 GB Neural Cross-Encoder
|
| 64 |
+
# to prevent the HF Gateway's 60-second timeout (HTTP 500)
|
| 65 |
+
# during user requests.
|
| 66 |
+
# -----------------------------------------------------
|
| 67 |
+
import asyncio
|
| 68 |
+
from src.ml.reranker import _get_reranker
|
| 69 |
+
logger.info(f"Preloading Neural Reranker `{settings.reranker_model}`. This may take ~60 seconds to cache...")
|
| 70 |
+
await asyncio.to_thread(_get_reranker)
|
| 71 |
+
logger.info("Neural Reranker fully loaded into memory!")
|
| 72 |
+
|
| 73 |
yield
|
| 74 |
_qdrant_client.close()
|
| 75 |
|