Commit 4af4003
Parent(s): 0989643
Speed up CPU inference: halve token limits, pre-download models, fix OMP threads
- Reduce max_new_tokens across all pipeline calls (400→200, 300→150,
512→200) — on CPU each token is ~1s so this roughly halves wall time
- Pre-download MONET and sentence-transformers in Dockerfile to
eliminate first-request model download stall
- Set OMP_NUM_THREADS=4 and MKL_NUM_THREADS=4 to fix libgomp warning
and ensure proper CPU parallelism
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- Dockerfile +13 -0
- models/medgemma_agent.py +6 -6
Dockerfile
CHANGED
|
@@ -33,6 +33,15 @@ COPY --chown=user mcp_server/ mcp_server/
|
|
| 33 |
COPY --chown=user data/case_store.py data/case_store.py
|
| 34 |
COPY --chown=user guidelines/ guidelines/
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
# Runtime data directories — must be writable by user 1000
|
| 37 |
RUN mkdir -p data/uploads data/patient_chats data/lesions && \
|
| 38 |
echo '{"patients": []}' > data/patients.json && \
|
|
@@ -40,6 +49,10 @@ RUN mkdir -p data/uploads data/patient_chats data/lesions && \
|
|
| 40 |
|
| 41 |
USER user
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
EXPOSE 7860
|
| 44 |
|
| 45 |
CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
| 33 |
COPY --chown=user data/case_store.py data/case_store.py
|
| 34 |
COPY --chown=user guidelines/ guidelines/
|
| 35 |
|
| 36 |
+
# Pre-download MONET and sentence-transformers models so first request is fast
|
| 37 |
+
RUN python -c "\
|
| 38 |
+
from transformers import AutoProcessor, AutoModelForZeroShotImageClassification; \
|
| 39 |
+
AutoProcessor.from_pretrained('chanwkim/monet'); \
|
| 40 |
+
AutoModelForZeroShotImageClassification.from_pretrained('chanwkim/monet'); \
|
| 41 |
+
from sentence_transformers import SentenceTransformer; \
|
| 42 |
+
SentenceTransformer('all-MiniLM-L6-v2'); \
|
| 43 |
+
print('Models cached')"
|
| 44 |
+
|
| 45 |
# Runtime data directories — must be writable by user 1000
|
| 46 |
RUN mkdir -p data/uploads data/patient_chats data/lesions && \
|
| 47 |
echo '{"patients": []}' > data/patients.json && \
|
|
|
|
| 49 |
|
| 50 |
USER user
|
| 51 |
|
| 52 |
+
# Fix OMP_NUM_THREADS warning and ensure all CPU cores are used
|
| 53 |
+
ENV OMP_NUM_THREADS=4
|
| 54 |
+
ENV MKL_NUM_THREADS=4
|
| 55 |
+
|
| 56 |
EXPOSE 7860
|
| 57 |
|
| 58 |
CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
models/medgemma_agent.py
CHANGED
|
@@ -341,7 +341,7 @@ class MedGemmaAgent:
|
|
| 341 |
|
| 342 |
try:
|
| 343 |
time.sleep(0.2)
|
| 344 |
-
output = self.pipe(messages, max_new_tokens=
|
| 345 |
result = output[0]["generated_text"][-1]["content"]
|
| 346 |
findings['synthesis'] = result
|
| 347 |
|
|
@@ -411,7 +411,7 @@ Be concise and specific."""
|
|
| 411 |
]
|
| 412 |
|
| 413 |
try:
|
| 414 |
-
output = self.pipe(messages, max_new_tokens=
|
| 415 |
reconciliation = output[0]["generated_text"][-1]["content"]
|
| 416 |
self.last_reconciliation = reconciliation
|
| 417 |
|
|
@@ -643,7 +643,7 @@ Be specific to THIS lesion. 3-5 sentences maximum."""
|
|
| 643 |
# Generate response
|
| 644 |
start = time.time()
|
| 645 |
try:
|
| 646 |
-
output = self.pipe(messages, max_new_tokens=
|
| 647 |
response = output[0]["generated_text"][-1]["content"]
|
| 648 |
|
| 649 |
yield f"[RESPONSE]\n"
|
|
@@ -820,7 +820,7 @@ Provide your assessment:
|
|
| 820 |
|
| 821 |
try:
|
| 822 |
yield f"[THINKING]Comparing current image to previous findings...[/THINKING]\n"
|
| 823 |
-
output = self.pipe(messages, max_new_tokens=
|
| 824 |
comparison_result = output[0]["generated_text"][-1]["content"]
|
| 825 |
|
| 826 |
yield f"[RESPONSE]\n"
|
|
@@ -863,7 +863,7 @@ Provide your assessment:
|
|
| 863 |
content.append({"type": "text", "text": message})
|
| 864 |
|
| 865 |
messages = [{"role": "user", "content": content}]
|
| 866 |
-
output = self.pipe(messages, max_new_tokens=
|
| 867 |
return output[0]["generated_text"][-1]["content"]
|
| 868 |
|
| 869 |
def chat_followup(self, message: str) -> Generator[str, None, None]:
|
|
@@ -917,7 +917,7 @@ Provide a concise, informative response. If the question is outside your experti
|
|
| 917 |
yield f"[THINKING]Considering your question in context of the previous analysis...[/THINKING]\n"
|
| 918 |
time.sleep(0.2)
|
| 919 |
|
| 920 |
-
output = self.pipe(messages, max_new_tokens=
|
| 921 |
response = output[0]["generated_text"][-1]["content"]
|
| 922 |
|
| 923 |
yield f"[RESPONSE]\n"
|
|
|
|
| 341 |
|
| 342 |
try:
|
| 343 |
time.sleep(0.2)
|
| 344 |
+
output = self.pipe(messages, max_new_tokens=200)
|
| 345 |
result = output[0]["generated_text"][-1]["content"]
|
| 346 |
findings['synthesis'] = result
|
| 347 |
|
|
|
|
| 411 |
]
|
| 412 |
|
| 413 |
try:
|
| 414 |
+
output = self.pipe(messages, max_new_tokens=150)
|
| 415 |
reconciliation = output[0]["generated_text"][-1]["content"]
|
| 416 |
self.last_reconciliation = reconciliation
|
| 417 |
|
|
|
|
| 643 |
# Generate response
|
| 644 |
start = time.time()
|
| 645 |
try:
|
| 646 |
+
output = self.pipe(messages, max_new_tokens=150)
|
| 647 |
response = output[0]["generated_text"][-1]["content"]
|
| 648 |
|
| 649 |
yield f"[RESPONSE]\n"
|
|
|
|
| 820 |
|
| 821 |
try:
|
| 822 |
yield f"[THINKING]Comparing current image to previous findings...[/THINKING]\n"
|
| 823 |
+
output = self.pipe(messages, max_new_tokens=200)
|
| 824 |
comparison_result = output[0]["generated_text"][-1]["content"]
|
| 825 |
|
| 826 |
yield f"[RESPONSE]\n"
|
|
|
|
| 863 |
content.append({"type": "text", "text": message})
|
| 864 |
|
| 865 |
messages = [{"role": "user", "content": content}]
|
| 866 |
+
output = self.pipe(messages, max_new_tokens=200)
|
| 867 |
return output[0]["generated_text"][-1]["content"]
|
| 868 |
|
| 869 |
def chat_followup(self, message: str) -> Generator[str, None, None]:
|
|
|
|
| 917 |
yield f"[THINKING]Considering your question in context of the previous analysis...[/THINKING]\n"
|
| 918 |
time.sleep(0.2)
|
| 919 |
|
| 920 |
+
output = self.pipe(messages, max_new_tokens=200)
|
| 921 |
response = output[0]["generated_text"][-1]["content"]
|
| 922 |
|
| 923 |
yield f"[RESPONSE]\n"
|