Spaces:

IslamAbdelslam
/

Chest-xRay

Sleeping

App Files Files Community

IslamAbdelslam committed on Apr 20

Commit

41c2fc8

unverified ·

1 Parent(s): 31dfcee

update v0.2

Browse files

Files changed (3) hide show

Dockerfile +13 -7
main.py +337 -18
requirements.txt +9 -4

Dockerfile CHANGED Viewed

@@ -13,12 +13,15 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libgomp1 \
     && rm -rf /var/lib/apt/lists/*
-# Runtime limits to reduce CPU/RAM pressure on free-tier containers
 ENV PYTHONUNBUFFERED=1 \
     OMP_NUM_THREADS=1 \
     MKL_NUM_THREADS=1 \
     OPENBLAS_NUM_THREADS=1 \
     NUMEXPR_NUM_THREADS=1 \
     MALLOC_ARENA_MAX=2 \
     ATEN_CPU_CAPABILITY=default \
     MKL_SERVICE_FORCE_INTEL=1
@@ -32,15 +35,18 @@ RUN pip install --no-cache-dir torch==2.5.1 torchvision==0.20.1 --index-url http
 # Install pinned fastai
 RUN pip install --no-cache-dir fastai==2.8.7
-# Install remaining requirements
 RUN pip install --no-cache-dir -r requirements.txt
-RUN pip install gdown
-# Copy the application code and model
 COPY . .
-# Expose port
 EXPOSE 7860
-# Run the API with one worker for lower memory footprint
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "10"]

     libgomp1 \
     && rm -rf /var/lib/apt/lists/*
+# Runtime limits to reduce CPU/RAM pressure on small instances
 ENV PYTHONUNBUFFERED=1 \
+    PYTHONFAULTHANDLER=1 \
     OMP_NUM_THREADS=1 \
     MKL_NUM_THREADS=1 \
     OPENBLAS_NUM_THREADS=1 \
     NUMEXPR_NUM_THREADS=1 \
+    PREDICT_TIMEOUT_SECONDS=50 \
+    MAX_IMAGE_DIM=1024 \
     MALLOC_ARENA_MAX=2 \
     ATEN_CPU_CAPABILITY=default \
     MKL_SERVICE_FORCE_INTEL=1
 # Install pinned fastai
 RUN pip install --no-cache-dir fastai==2.8.7
+# Install remaining requirements and gdown
 RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --no-cache-dir gdown
+# Copy the application code
 COPY . .
+# Download model from Google Drive into the location expected by main.py
+RUN gdown 1ppniUVWmgfNg_wnLFwx5YA-rk6mYQkMB -O /app/export.pkl
+# Expose default app port
 EXPOSE 7860
+# Railway uses PORT at runtime; fallback to 7860 locally.
+# Exec-form CMD plus `exec` lets uvicorn replace the shell as PID 1 so it receives SIGTERM directly.
+CMD ["sh", "-c", "exec uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860} --workers 1 --timeout-keep-alive 10"]

main.py CHANGED Viewed

@@ -9,6 +9,10 @@ import shutil
 import os
 import warnings
 import asyncio
 # Suppress warnings
 warnings.filterwarnings("ignore")
@@ -26,12 +30,21 @@ try:
     import torch
     from fastai.vision.all import load_learner, PILImage
     from PIL import Image, UnidentifiedImageError
 except ImportError:
     raise RuntimeError(
-        "FastAI is not installed. Please install fastai and torch.")
 MAX_UPLOAD_MB = int(os.getenv("MAX_UPLOAD_MB", "10"))
 MAX_UPLOAD_BYTES = MAX_UPLOAD_MB * 1024 * 1024
 app = FastAPI(title="Pneumonia Detection API")
@@ -60,12 +73,34 @@ for p in possible_paths:
         model_path = p
         break
-if model_path is None:
-    raise FileNotFoundError("Could not find export.pkl.")
-print(f"Loading model from: {model_path}")
-learn = load_learner(model_path)
-learn.model.eval()
 try:
     torch.set_num_threads(1)
@@ -88,16 +123,269 @@ async def root():
         "predict": ["/predict"],
         "accepted_file_fields": ["file"],
         "max_upload_mb": MAX_UPLOAD_MB,
     }
 @app.get("/health")
 async def health():
-    return {"status": "ok"}
 @app.post("/predict")
 async def predict(request: Request, file: UploadFile | None = File(default=None)):
     incoming_file: Any = file
     if incoming_file is None:
         form = await request.form()
@@ -136,24 +424,48 @@ async def predict(request: Request, file: UploadFile | None = File(default=None)
             with Image.open(tmp_path) as raw_img:
                 raw_img.load()
                 rgb_img = raw_img.convert("RGB")
-                if max(rgb_img.size) > 2048:
-                    rgb_img.thumbnail((2048, 2048), Image.Resampling.LANCZOS)
                 rgb_img.save(normalized_path, format="JPEG", quality=95)
         except UnidentifiedImageError as e:
             raise HTTPException(
                 status_code=400, detail=f"Invalid image file: {e}")
-        img = PILImage.create(normalized_path)
         # FastAI progress bars can break in some hosted environments; disable per-call.
         async with predict_lock:
-            with learn.no_bar():
-                with torch.inference_mode():
-                    pred_label, _, probabilities = learn.predict(img)
-        vocab = [str(label).strip() for label in learn.dls.vocab]
         class_probs = {
             class_name: float(prob)
-            for class_name, prob in zip(vocab, probabilities.tolist())
         }
         def get_prob(*aliases: str) -> float:
@@ -182,6 +494,8 @@ async def predict(request: Request, file: UploadFile | None = File(default=None)
                 "chest x-ray image",
                 "chest xray image",
                 "chest_xray",
             )
             if chest_xray_prob > 0.0:
                 other_prob = max(0.0, 1.0 - chest_xray_prob)
@@ -210,9 +524,14 @@ async def predict(request: Request, file: UploadFile | None = File(default=None)
     except HTTPException:
         raise
     except Exception as e:
-        print(f"[predict] error={e}", flush=True)
         raise HTTPException(
-            status_code=500, detail=f"Error predicting: {str(e)}")
     finally:
         if normalized_path.exists():
             normalized_path.unlink()

 import os
 import warnings
 import asyncio
+import logging
+import multiprocessing as mp
+import time
+import traceback
 # Suppress warnings
 warnings.filterwarnings("ignore")
     import torch
     from fastai.vision.all import load_learner, PILImage
     from PIL import Image, UnidentifiedImageError
+    import numpy as np
 except ImportError:
     raise RuntimeError(
+        "Required ML packages are missing. Please install fastai, torch, and numpy.")
 MAX_UPLOAD_MB = int(os.getenv("MAX_UPLOAD_MB", "10"))
 MAX_UPLOAD_BYTES = MAX_UPLOAD_MB * 1024 * 1024
+PREDICT_TIMEOUT_SECONDS = float(os.getenv("PREDICT_TIMEOUT_SECONDS", "50"))  # passed as the subprocess inference timeout in /predict
+MAX_IMAGE_DIM = int(os.getenv("MAX_IMAGE_DIM", "1024"))  # uploads with a larger side are thumbnailed down to this bound
+MODEL_IMAGE_SIZE = int(os.getenv("MODEL_IMAGE_SIZE", "224"))  # square side length images are resized to before inference
+CONFIGURED_INFERENCE_START_METHOD = os.getenv("INFERENCE_START_METHOD")  # only reported by /diag in the visible code
+SAFE_INFERENCE_START_METHOD = os.getenv("SAFE_INFERENCE_START_METHOD", "spawn")  # multiprocessing start method used as the fallback
+INFERENCE_CRASH_THRESHOLD = int(os.getenv("INFERENCE_CRASH_THRESHOLD", "2"))  # consecutive crashes before switching to the safe method
+logger = logging.getLogger("uvicorn.error")  # reuse uvicorn's error logger
 app = FastAPI(title="Pneumonia Detection API")
         model_path = p
         break
+learn = None  # module-level learner; never assigned in the visible code, workers load their own
+model_load_error = None  # human-readable reason the model is unavailable, or None
+active_inference_start_method = SAFE_INFERENCE_START_METHOD  # start method currently used for inference subprocesses
+consecutive_inference_crashes = 0  # reset on success, incremented on each worker crash
+last_prediction_vocab: list[str] = []  # vocab returned by the most recent successful prediction
+last_inference_stage: str | None = None  # last progress stage reported by a worker
+last_inference_error: str | None = None  # last recorded inference failure message
+_MODEL_MEAN = torch.tensor([0.485, 0.456, 0.406],  # presumably ImageNet channel means — confirm against training setup
+                           dtype=torch.float32).view(1, 3, 1, 1)
+_MODEL_STD = torch.tensor([0.229, 0.224, 0.225],  # presumably ImageNet channel stds — confirm against training setup
+                          dtype=torch.float32).view(1, 3, 1, 1)
+def load_model() -> None:  # Records whether a model file was located; it does not load the learner itself.
+    global model_load_error
+    if model_path is None:  # no candidate path was selected by the search loop above
+        model_load_error = "Could not find export.pkl."
+        logger.error(model_load_error)
+        return
+    model_load_error = None  # clear any error recorded by an earlier call
+@app.on_event("startup")  # run the model-path check once when the application starts
+async def startup_event() -> None:
+    load_model()
 try:
     torch.set_num_threads(1)
         "predict": ["/predict"],
         "accepted_file_fields": ["file"],
         "max_upload_mb": MAX_UPLOAD_MB,
+        "predict_timeout_seconds": PREDICT_TIMEOUT_SECONDS,
+        "max_image_dim": MAX_IMAGE_DIM,
     }
 @app.get("/health")
 async def health():  # Liveness endpoint that also reports the model-path check result.
+    return {
+        "status": "ok",  # always "ok"; model availability is reported in the fields below
+        "model_loaded": model_load_error is None,
+        "model_error": model_load_error,
+    }
+@app.get("/diag")
+async def diagnostics():  # Returns a snapshot of settings, selected env vars, lock state and last inference status.
+    return {
+        "status": "ok",
+        "model_loaded": model_load_error is None,
+        "model_error": model_load_error,
+        "model_path": str(model_path) if model_path is not None else None,
+        "vocab": last_prediction_vocab,  # empty until a prediction has succeeded
+        "settings": {
+            "max_upload_mb": MAX_UPLOAD_MB,
+            "predict_timeout_seconds": PREDICT_TIMEOUT_SECONDS,
+            "max_image_dim": MAX_IMAGE_DIM,
+            "model_image_size": MODEL_IMAGE_SIZE,
+            "configured_inference_start_method": CONFIGURED_INFERENCE_START_METHOD,
+            "inference_start_method": active_inference_start_method,
+            "safe_inference_start_method": SAFE_INFERENCE_START_METHOD,
+            "inference_crash_threshold": INFERENCE_CRASH_THRESHOLD,
+            "consecutive_inference_crashes": consecutive_inference_crashes,
+        },
+        "runtime": {  # raw environment values; None when a variable is unset
+            "pythonunbuffered": os.getenv("PYTHONUNBUFFERED"),
+            "omp_num_threads": os.getenv("OMP_NUM_THREADS"),
+            "mkl_num_threads": os.getenv("MKL_NUM_THREADS"),
+            "openblas_num_threads": os.getenv("OPENBLAS_NUM_THREADS"),
+            "aten_cpu_capability": os.getenv("ATEN_CPU_CAPABILITY"),
+        },
+        "lock": {
+            "predict_lock_locked": predict_lock.locked(),  # True while a /predict call holds the lock
+        },
+        "last_inference": {
+            "stage": last_inference_stage,
+            "error": last_inference_error,
+        },
+        "versions": {
+            "torch": getattr(torch, "__version__", None),
+        },
+    }
+def _predict_from_path(image_path: Path, learner=None):  # Preprocess one image and run the model; returns (label, index, probabilities, vocab).
+    # Run all model work in one sync function so it can be moved to a worker thread.
+    active_learn = learner or learn  # fall back to the module-level learner when none is passed
+    if active_learn is None:
+        raise RuntimeError(model_load_error or "Model is not loaded")
+    with Image.open(image_path) as raw_img:
+        rgb_img = raw_img.convert("RGB")
+        resized = rgb_img.resize(  # force a square MODEL_IMAGE_SIZE x MODEL_IMAGE_SIZE input
+            (MODEL_IMAGE_SIZE, MODEL_IMAGE_SIZE), Image.Resampling.BILINEAR)
+        arr = np.asarray(resized, dtype=np.float32) / 255.0  # scale pixel values to [0, 1]
+        inputs = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0)  # HWC -> CHW, then add a batch dimension
+    device = next(active_learn.model.parameters()).device  # run on whatever device the model weights live on
+    inputs = inputs.to(device)
+    mean = _MODEL_MEAN.to(device)
+    std = _MODEL_STD.to(device)
+    inputs = (inputs - mean) / std  # per-channel normalization
+    vocab = [str(label).strip() for label in active_learn.dls.vocab]
+    with active_learn.no_bar():
+        with torch.inference_mode():
+            outputs = active_learn.model(inputs)
+            if outputs.ndim == 1:  # ensure a leading batch dimension
+                outputs = outputs.unsqueeze(0)
+            if outputs.shape[-1] == 1:  # single-logit head: treat as binary via sigmoid
+                positive_prob = torch.sigmoid(outputs)[0].flatten()
+                if len(vocab) >= 2:
+                    probabilities = torch.zeros(
+                        len(vocab), device=positive_prob.device)
+                    probabilities[0] = 1 - positive_prob[0]  # first vocab entry gets the complement
+                    probabilities[1] = positive_prob[0]  # second vocab entry gets the sigmoid output
+                else:
+                    probabilities = torch.stack(
+                        [1 - positive_prob, positive_prob], dim=0).flatten()
+            else:
+                probabilities = torch.softmax(outputs, dim=-1)[0]  # multi-class head: softmax over the first batch item
+            if len(vocab) > 0 and probabilities.numel() != len(vocab):  # align probability count with vocab length
+                if probabilities.numel() < len(vocab):
+                    padded = torch.zeros(  # zero-pad missing classes
+                        len(vocab), device=probabilities.device)
+                    padded[:probabilities.numel()] = probabilities
+                    probabilities = padded
+                else:
+                    probabilities = probabilities[:len(vocab)]  # drop outputs beyond the vocab
+            pred_index = int(torch.argmax(probabilities).item())
+            pred_label = vocab[pred_index] if pred_index < len(  # fall back to the numeric index when vocab is too short
+                vocab) else str(pred_index)
+            return pred_label, pred_index, probabilities, vocab
+class InferenceSubprocessCrash(RuntimeError):  # Raised when the inference worker process exits without delivering a payload.
+    def __init__(self, exit_code: int | None):
+        self.exit_code = exit_code  # process exit code, kept so callers can include it in error details
+        super().__init__(
+            f"Inference subprocess crashed (exit code {exit_code}).")
+def _predict_subprocess_worker(image_path: str, model_path_str: str | None, conn) -> None:  # Child-process entry point: load learner, predict, report over conn.
+    try:
+        conn.send({"status": "stage", "stage": "worker_started"})  # stage messages let the parent track progress
+        local_learn = learn  # module-level learner as seen by this process
+        if local_learn is None:
+            if not model_path_str:
+                raise RuntimeError(
+                    "Model path is missing in subprocess worker")
+            local_learn = load_learner(Path(model_path_str))  # load the exported learner inside the worker
+            local_learn.model.eval()
+        conn.send({"status": "stage", "stage": "learner_loaded"})
+        try:
+            torch.set_num_threads(1)
+            torch.set_num_interop_threads(1)
+            torch.backends.mkldnn.enabled = False
+        except RuntimeError:  # best-effort tuning; a RuntimeError from torch here is ignored
+            pass
+        conn.send({"status": "stage", "stage": "inference_preparing"})
+        pred_label, _, probabilities, vocab = _predict_from_path(
+            Path(image_path),
+            local_learn,
+        )
+        conn.send({"status": "stage", "stage": "inference_finished"})
+        payload = {
+            "ok": True,
+            "pred_label": str(pred_label),
+            "probabilities": probabilities.tolist(),  # convert the tensor to a plain list before sending
+            "vocab": vocab,
+        }
+        conn.send(payload)
+    except Exception as exc:  # report any failure to the parent as an error payload instead of raising
+        conn.send(
+            {
+                "ok": False,
+                "error_type": type(exc).__name__,
+                "error_message": str(exc),
+                "error_repr": repr(exc),
+                "traceback": traceback.format_exc(),
+            }
+        )
+    finally:
+        conn.close()  # always release the worker's end of the pipe
+def _predict_via_subprocess(image_path: Path, timeout_seconds: float, start_method: str):  # Run one prediction in a child process; returns the worker's result dict.
+    global last_inference_stage, last_inference_error
+    ctx = mp.get_context(start_method)
+    parent_conn, child_conn = ctx.Pipe(duplex=False)  # one-way pipe: the child sends, the parent receives
+    proc = ctx.Process(
+        target=_predict_subprocess_worker,
+        args=(str(image_path), str(model_path)
+              if model_path is not None else None, child_conn),
+        daemon=True,
+    )
+    try:
+        proc.start()
+        child_conn.close()  # the parent does not use the sending end
+        deadline = time.monotonic() + timeout_seconds
+        while True:
+            if parent_conn.poll(0.2):  # wait up to 0.2 s for the next message
+                try:
+                    payload = parent_conn.recv()
+                except EOFError:  # pipe closed with nothing left to read
+                    if not proc.is_alive():
+                        last_inference_error = (
+                            f"Subprocess exited before returning a payload (exit code {proc.exitcode})."
+                        )
+                        raise InferenceSubprocessCrash(proc.exitcode)
+                    raise RuntimeError(
+                        "Inference subprocess closed its pipe without returning a result."
+                    )
+                proc.join(timeout=1)  # NOTE(review): runs after every message, including stage updates — confirm intended
+                if not isinstance(payload, dict):
+                    raise RuntimeError(
+                        f"Unexpected inference payload type: {type(payload).__name__}"
+                    )
+                if payload.get("status") == "stage":
+                    last_inference_stage = str(payload.get("stage"))
+                    continue  # progress update only; keep waiting for the final payload
+                if not payload.get("ok"):  # worker reported a handled exception
+                    error_type = payload.get(
+                        "error_type") or "InferenceWorkerError"
+                    error_message = (
+                        payload.get("error_message")
+                        or payload.get("error_repr")
+                        or "Unknown inference error"
+                    )
+                    traceback_text = payload.get("traceback")
+                    if traceback_text:
+                        logger.error(
+                            "Inference worker traceback:\n%s", traceback_text)
+                    last_inference_error = f"{error_type}: {error_message}"
+                    raise RuntimeError(f"{error_type}: {error_message}")
+                return payload  # successful result from the worker
+            if not proc.is_alive():  # worker died without sending anything readable
+                last_inference_error = (
+                    f"Subprocess exited before returning a payload (exit code {proc.exitcode})."
+                )
+                raise InferenceSubprocessCrash(proc.exitcode)
+            if time.monotonic() >= deadline:
+                raise TimeoutError("Inference subprocess timed out")
+    finally:
+        if proc.is_alive():  # stop a worker that is still running on any exit path
+            proc.terminate()
+            proc.join(timeout=2)
+        parent_conn.close()
+def _record_inference_success() -> None:  # Resets the consecutive-crash counter after a successful prediction.
+    global consecutive_inference_crashes
+    consecutive_inference_crashes = 0
+def _record_inference_crash() -> bool:  # Counts a worker crash; returns True when this call switched to the safe start method.
+    global consecutive_inference_crashes, active_inference_start_method
+    consecutive_inference_crashes += 1
+    should_switch = (  # switch only when not already on the safe method and the threshold is reached
+        active_inference_start_method != SAFE_INFERENCE_START_METHOD
+        and consecutive_inference_crashes >= INFERENCE_CRASH_THRESHOLD
+    )
+    if should_switch:
+        logger.warning(
+            "Switching inference subprocess mode from %s to %s after %d consecutive crashes",
+            active_inference_start_method,
+            SAFE_INFERENCE_START_METHOD,
+            consecutive_inference_crashes,
+        )
+        active_inference_start_method = SAFE_INFERENCE_START_METHOD
+        consecutive_inference_crashes = 0  # start counting afresh under the new method
+        return True
+    return False
 @app.post("/predict")
 async def predict(request: Request, file: UploadFile | None = File(default=None)):
+    load_model()
+    if model_load_error is not None:
+        raise HTTPException(
+            status_code=503,
+            detail=model_load_error or "Model is not available.",
+        )
     incoming_file: Any = file
     if incoming_file is None:
         form = await request.form()
             with Image.open(tmp_path) as raw_img:
                 raw_img.load()
                 rgb_img = raw_img.convert("RGB")
+                if max(rgb_img.size) > MAX_IMAGE_DIM:
+                    rgb_img.thumbnail(
+                        (MAX_IMAGE_DIM, MAX_IMAGE_DIM), Image.Resampling.LANCZOS)
                 rgb_img.save(normalized_path, format="JPEG", quality=95)
         except UnidentifiedImageError as e:
             raise HTTPException(
                 status_code=400, detail=f"Invalid image file: {e}")
         # FastAI progress bars can break in some hosted environments; disable per-call.
         async with predict_lock:
+            try:
+                prediction = await asyncio.to_thread(
+                    _predict_via_subprocess,
+                    normalized_path,
+                    PREDICT_TIMEOUT_SECONDS,
+                    active_inference_start_method,
+                )
+            except TimeoutError:
+                raise HTTPException(
+                    status_code=504,
+                    detail=(
+                        "Prediction timed out before platform edge timeout. "
+                        "Try a smaller image or increase resources."
+                    ),
+                )
+            except InferenceSubprocessCrash as exc:
+                switched = _record_inference_crash()
+                detail = f"Inference worker crashed (exit code {exc.exit_code})."
+                if switched:
+                    detail += " Automatically switched to safer inference mode; retry the request."
+                raise HTTPException(status_code=503, detail=detail)
+        _record_inference_success()
+        pred_label = prediction["pred_label"]
+        probabilities = prediction["probabilities"]
+        vocab = prediction["vocab"]
+        global last_prediction_vocab
+        last_prediction_vocab = vocab
         class_probs = {
             class_name: float(prob)
+            for class_name, prob in zip(vocab, probabilities)
         }
         def get_prob(*aliases: str) -> float:
                 "chest x-ray image",
                 "chest xray image",
                 "chest_xray",
+                "other",
+                "Other",
             )
             if chest_xray_prob > 0.0:
                 other_prob = max(0.0, 1.0 - chest_xray_prob)
     except HTTPException:
         raise
     except Exception as e:
+        print(f"[predict] error={type(e).__name__}: {e!r}", flush=True)
+        error_message = str(e).strip() or repr(e)
+        global last_inference_error
+        last_inference_error = f"{type(e).__name__}: {error_message}"
         raise HTTPException(
+            status_code=500,
+            detail=f"Error predicting: {type(e).__name__}: {error_message}",
+        )
     finally:
         if normalized_path.exists():
             normalized_path.unlink()

requirements.txt CHANGED Viewed

@@ -1,4 +1,9 @@
-fastapi
-uvicorn
-python-multipart
-ipython

+fastapi==0.115.6
+uvicorn==0.34.0
+python-multipart==0.0.20
+ipython==8.31.0
+pillow==11.1.0
+torch==2.5.1
+torchvision==0.20.1
+fastai==2.8.7
+scikit-learn==1.3.2