cgoodmaker Claude Opus 4.6 committed on
Commit
4af4003
·
1 Parent(s): 0989643

Speed up CPU inference: halve token limits, pre-download models, fix OMP threads

Browse files

- Reduce max_new_tokens across all pipeline calls (400→200, 300→150,
250→150, 512→200) — on CPU each token is ~1s so this roughly halves wall time
- Pre-download MONET and sentence-transformers in Dockerfile to
eliminate first-request model download stall
- Set OMP_NUM_THREADS=4 and MKL_NUM_THREADS=4 to fix libgomp warning
and ensure proper CPU parallelism

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. Dockerfile +13 -0
  2. models/medgemma_agent.py +6 -6
Dockerfile CHANGED
@@ -33,6 +33,15 @@ COPY --chown=user mcp_server/ mcp_server/
33
  COPY --chown=user data/case_store.py data/case_store.py
34
  COPY --chown=user guidelines/ guidelines/
35
 
 
 
 
 
 
 
 
 
 
36
  # Runtime data directories — must be writable by user 1000
37
  RUN mkdir -p data/uploads data/patient_chats data/lesions && \
38
  echo '{"patients": []}' > data/patients.json && \
@@ -40,6 +49,10 @@ RUN mkdir -p data/uploads data/patient_chats data/lesions && \
40
 
41
  USER user
42
 
 
 
 
 
43
  EXPOSE 7860
44
 
45
  CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "7860"]
 
33
  COPY --chown=user data/case_store.py data/case_store.py
34
  COPY --chown=user guidelines/ guidelines/
35
 
36
+ # Pre-download MONET and sentence-transformers models so first request is fast
37
+ RUN python -c "\
38
+ from transformers import AutoProcessor, AutoModelForZeroShotImageClassification; \
39
+ AutoProcessor.from_pretrained('chanwkim/monet'); \
40
+ AutoModelForZeroShotImageClassification.from_pretrained('chanwkim/monet'); \
41
+ from sentence_transformers import SentenceTransformer; \
42
+ SentenceTransformer('all-MiniLM-L6-v2'); \
43
+ print('Models cached')"
44
+
45
  # Runtime data directories — must be writable by user 1000
46
  RUN mkdir -p data/uploads data/patient_chats data/lesions && \
47
  echo '{"patients": []}' > data/patients.json && \
 
49
 
50
  USER user
51
 
52
+ # Fix libgomp OMP_NUM_THREADS warning by pinning OpenMP/MKL to 4 threads
53
+ ENV OMP_NUM_THREADS=4
54
+ ENV MKL_NUM_THREADS=4
55
+
56
  EXPOSE 7860
57
 
58
  CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "7860"]
models/medgemma_agent.py CHANGED
@@ -341,7 +341,7 @@ class MedGemmaAgent:
341
 
342
  try:
343
  time.sleep(0.2)
344
- output = self.pipe(messages, max_new_tokens=400)
345
  result = output[0]["generated_text"][-1]["content"]
346
  findings['synthesis'] = result
347
 
@@ -411,7 +411,7 @@ Be concise and specific."""
411
  ]
412
 
413
  try:
414
- output = self.pipe(messages, max_new_tokens=300)
415
  reconciliation = output[0]["generated_text"][-1]["content"]
416
  self.last_reconciliation = reconciliation
417
 
@@ -643,7 +643,7 @@ Be specific to THIS lesion. 3-5 sentences maximum."""
643
  # Generate response
644
  start = time.time()
645
  try:
646
- output = self.pipe(messages, max_new_tokens=250)
647
  response = output[0]["generated_text"][-1]["content"]
648
 
649
  yield f"[RESPONSE]\n"
@@ -820,7 +820,7 @@ Provide your assessment:
820
 
821
  try:
822
  yield f"[THINKING]Comparing current image to previous findings...[/THINKING]\n"
823
- output = self.pipe(messages, max_new_tokens=400)
824
  comparison_result = output[0]["generated_text"][-1]["content"]
825
 
826
  yield f"[RESPONSE]\n"
@@ -863,7 +863,7 @@ Provide your assessment:
863
  content.append({"type": "text", "text": message})
864
 
865
  messages = [{"role": "user", "content": content}]
866
- output = self.pipe(messages, max_new_tokens=512)
867
  return output[0]["generated_text"][-1]["content"]
868
 
869
  def chat_followup(self, message: str) -> Generator[str, None, None]:
@@ -917,7 +917,7 @@ Provide a concise, informative response. If the question is outside your experti
917
  yield f"[THINKING]Considering your question in context of the previous analysis...[/THINKING]\n"
918
  time.sleep(0.2)
919
 
920
- output = self.pipe(messages, max_new_tokens=400)
921
  response = output[0]["generated_text"][-1]["content"]
922
 
923
  yield f"[RESPONSE]\n"
 
341
 
342
  try:
343
  time.sleep(0.2)
344
+ output = self.pipe(messages, max_new_tokens=200)
345
  result = output[0]["generated_text"][-1]["content"]
346
  findings['synthesis'] = result
347
 
 
411
  ]
412
 
413
  try:
414
+ output = self.pipe(messages, max_new_tokens=150)
415
  reconciliation = output[0]["generated_text"][-1]["content"]
416
  self.last_reconciliation = reconciliation
417
 
 
643
  # Generate response
644
  start = time.time()
645
  try:
646
+ output = self.pipe(messages, max_new_tokens=150)
647
  response = output[0]["generated_text"][-1]["content"]
648
 
649
  yield f"[RESPONSE]\n"
 
820
 
821
  try:
822
  yield f"[THINKING]Comparing current image to previous findings...[/THINKING]\n"
823
+ output = self.pipe(messages, max_new_tokens=200)
824
  comparison_result = output[0]["generated_text"][-1]["content"]
825
 
826
  yield f"[RESPONSE]\n"
 
863
  content.append({"type": "text", "text": message})
864
 
865
  messages = [{"role": "user", "content": content}]
866
+ output = self.pipe(messages, max_new_tokens=200)
867
  return output[0]["generated_text"][-1]["content"]
868
 
869
  def chat_followup(self, message: str) -> Generator[str, None, None]:
 
917
  yield f"[THINKING]Considering your question in context of the previous analysis...[/THINKING]\n"
918
  time.sleep(0.2)
919
 
920
+ output = self.pipe(messages, max_new_tokens=200)
921
  response = output[0]["generated_text"][-1]["content"]
922
 
923
  yield f"[RESPONSE]\n"