cgoodmaker Claude Opus 4.6 committed on
Commit
4af4003
·
1 Parent(s): 0989643

Speed up CPU inference: halve token limits, pre-download models, fix OMP threads

Browse files

- Reduce max_new_tokens across all pipeline calls (400→200, 300→150,
250→150, 512→200) — on CPU each token is ~1s so this roughly halves wall time
- Pre-download MONET and sentence-transformers in Dockerfile to
eliminate first-request model download stall
- Set OMP_NUM_THREADS=4 and MKL_NUM_THREADS=4 to fix libgomp warning
and ensure proper CPU parallelism

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. Dockerfile +13 -0
  2. models/medgemma_agent.py +6 -6
Dockerfile CHANGED
@@ -33,6 +33,15 @@ COPY --chown=user mcp_server/ mcp_server/
33
  COPY --chown=user data/case_store.py data/case_store.py
34
  COPY --chown=user guidelines/ guidelines/
35
 
 
 
 
 
 
 
 
 
 
36
  # Runtime data directories — must be writable by user 1000
37
  RUN mkdir -p data/uploads data/patient_chats data/lesions && \
38
  echo '{"patients": []}' > data/patients.json && \
@@ -40,6 +49,10 @@ RUN mkdir -p data/uploads data/patient_chats data/lesions && \
40
 
41
  USER user
42
 
 
 
 
 
43
  EXPOSE 7860
44
 
45
  CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "7860"]
 
33
  COPY --chown=user data/case_store.py data/case_store.py
34
  COPY --chown=user guidelines/ guidelines/
35
 
36
+ # Pre-download MONET and sentence-transformers models so first request is fast
37
+ RUN python -c "\
38
+ from transformers import AutoProcessor, AutoModelForZeroShotImageClassification; \
39
+ AutoProcessor.from_pretrained('chanwkim/monet'); \
40
+ AutoModelForZeroShotImageClassification.from_pretrained('chanwkim/monet'); \
41
+ from sentence_transformers import SentenceTransformer; \
42
+ SentenceTransformer('all-MiniLM-L6-v2'); \
43
+ print('Models cached')"
44
+
45
  # Runtime data directories — must be writable by user 1000
46
  RUN mkdir -p data/uploads data/patient_chats data/lesions && \
47
  echo '{"patients": []}' > data/patients.json && \
 
49
 
50
  USER user
51
 
52
+ # Fix libgomp OMP_NUM_THREADS warning by pinning OpenMP/MKL to 4 threads
53
+ ENV OMP_NUM_THREADS=4
54
+ ENV MKL_NUM_THREADS=4
55
+
56
  EXPOSE 7860
57
 
58
  CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "7860"]
models/medgemma_agent.py CHANGED
@@ -341,7 +341,7 @@ class MedGemmaAgent:
341
 
342
  try:
343
  time.sleep(0.2)
344
- output = self.pipe(messages, max_new_tokens=400)
345
  result = output[0]["generated_text"][-1]["content"]
346
  findings['synthesis'] = result
347
 
@@ -411,7 +411,7 @@ Be concise and specific."""
411
  ]
412
 
413
  try:
414
- output = self.pipe(messages, max_new_tokens=300)
415
  reconciliation = output[0]["generated_text"][-1]["content"]
416
  self.last_reconciliation = reconciliation
417
 
@@ -643,7 +643,7 @@ Be specific to THIS lesion. 3-5 sentences maximum."""
643
  # Generate response
644
  start = time.time()
645
  try:
646
- output = self.pipe(messages, max_new_tokens=250)
647
  response = output[0]["generated_text"][-1]["content"]
648
 
649
  yield f"[RESPONSE]\n"
@@ -820,7 +820,7 @@ Provide your assessment:
820
 
821
  try:
822
  yield f"[THINKING]Comparing current image to previous findings...[/THINKING]\n"
823
- output = self.pipe(messages, max_new_tokens=400)
824
  comparison_result = output[0]["generated_text"][-1]["content"]
825
 
826
  yield f"[RESPONSE]\n"
@@ -863,7 +863,7 @@ Provide your assessment:
863
  content.append({"type": "text", "text": message})
864
 
865
  messages = [{"role": "user", "content": content}]
866
- output = self.pipe(messages, max_new_tokens=512)
867
  return output[0]["generated_text"][-1]["content"]
868
 
869
  def chat_followup(self, message: str) -> Generator[str, None, None]:
@@ -917,7 +917,7 @@ Provide a concise, informative response. If the question is outside your experti
917
  yield f"[THINKING]Considering your question in context of the previous analysis...[/THINKING]\n"
918
  time.sleep(0.2)
919
 
920
- output = self.pipe(messages, max_new_tokens=400)
921
  response = output[0]["generated_text"][-1]["content"]
922
 
923
  yield f"[RESPONSE]\n"
 
341
 
342
  try:
343
  time.sleep(0.2)
344
+ output = self.pipe(messages, max_new_tokens=200)
345
  result = output[0]["generated_text"][-1]["content"]
346
  findings['synthesis'] = result
347
 
 
411
  ]
412
 
413
  try:
414
+ output = self.pipe(messages, max_new_tokens=150)
415
  reconciliation = output[0]["generated_text"][-1]["content"]
416
  self.last_reconciliation = reconciliation
417
 
 
643
  # Generate response
644
  start = time.time()
645
  try:
646
+ output = self.pipe(messages, max_new_tokens=150)
647
  response = output[0]["generated_text"][-1]["content"]
648
 
649
  yield f"[RESPONSE]\n"
 
820
 
821
  try:
822
  yield f"[THINKING]Comparing current image to previous findings...[/THINKING]\n"
823
+ output = self.pipe(messages, max_new_tokens=200)
824
  comparison_result = output[0]["generated_text"][-1]["content"]
825
 
826
  yield f"[RESPONSE]\n"
 
863
  content.append({"type": "text", "text": message})
864
 
865
  messages = [{"role": "user", "content": content}]
866
+ output = self.pipe(messages, max_new_tokens=200)
867
  return output[0]["generated_text"][-1]["content"]
868
 
869
  def chat_followup(self, message: str) -> Generator[str, None, None]:
 
917
  yield f"[THINKING]Considering your question in context of the previous analysis...[/THINKING]\n"
918
  time.sleep(0.2)
919
 
920
+ output = self.pipe(messages, max_new_tokens=200)
921
  response = output[0]["generated_text"][-1]["content"]
922
 
923
  yield f"[RESPONSE]\n"