File size: 3,552 Bytes
89c6379
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# =============================================================================
# RareDx — Hugging Face Spaces Dockerfile
# Single container: FastAPI (8080, internal) + Streamlit (8501, public)
# =============================================================================

FROM python:3.11-slim

# --------------------------------------------------------------------------
# OS packages
# Installed in a single layer; the apt package index is removed in that same
# layer so it does not persist in the image.
#   gcc, g++                  — C/C++ compilers
#   libxml2-dev, libxslt-dev  — XML/XSLT development headers
#   curl                      — HTTP client
#   supervisor                — process manager launched by CMD
# --------------------------------------------------------------------------
RUN apt-get update \
 && apt-get install --yes --no-install-recommends \
      curl \
      g++ \
      gcc \
      libxml2-dev \
      libxslt-dev \
      supervisor \
 && rm -rf /var/lib/apt/lists/*

# Relative paths in the instructions that follow resolve against /app
WORKDIR /app

# --------------------------------------------------------------------------
# Python packages
# Only the requirements file is copied at this point, so a change limited to
# application code leaves this layer cached.
# --------------------------------------------------------------------------
COPY backend/requirements.txt requirements.txt
RUN python -m pip install --no-cache-dir --requirement requirements.txt

# --------------------------------------------------------------------------
# Bake the BioLORD-2023 model into the image
# Instantiating the model once at build time means a Space restart does not
# have to fetch ~500MB again.
# HF_HOME places the Hugging Face cache under /app.
# --------------------------------------------------------------------------
ENV HF_HOME=/app/.cache/huggingface
RUN ["python", "-c", "from sentence_transformers import SentenceTransformer; print('Downloading BioLORD-2023...'); SentenceTransformer('FremyCompany/BioLORD-2023'); print('Model cached.')"]

# --------------------------------------------------------------------------
# Backend code
# Copied after the dependency and model layers so those stay cached when only
# the code changes.
# --------------------------------------------------------------------------
COPY backend /app/backend/

# --------------------------------------------------------------------------
# Bundled knowledge data — shipped inside the image, nothing fetched at runtime
#   graph_store.json  (33MB)   Orphanet+HPO knowledge graph (NetworkX JSON)
#   chromadb/         (149MB)  BioLORD disease embeddings (ChromaDB)
#   hpo_index/        (26MB)   BioLORD HPO term embeddings (numpy + JSON)
# --------------------------------------------------------------------------
COPY data/graph_store.json /app/data/
COPY data/chromadb /app/data/chromadb/
COPY data/hpo_index /app/data/hpo_index/

# --------------------------------------------------------------------------
# Process-manager configuration
# Installed at the exact path that CMD passes to supervisord.
# --------------------------------------------------------------------------
COPY ./supervisord.conf /etc/supervisor/conf.d/raredx.conf

# --------------------------------------------------------------------------
# Runtime environment
# --------------------------------------------------------------------------
# Interpreter flags: unbuffered stdout/stderr, and no .pyc files written.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Pipeline settings, intended to select the embedded ChromaDB and the local
# graph store, so that neither Neo4j nor an external ChromaDB server is needed.
# NOTE(review): CHROMA_PORT=9999 is presumably a placeholder that no server
#   listens on — confirm against the pipeline's connection logic.
# NOTE(review): no COPY in this Dockerfile creates /app/data/orphanet —
#   confirm the pipeline tolerates ORPHANET_DATA_DIR being absent.
ENV CHROMA_COLLECTION=rare_diseases \
    CHROMA_HOST=localhost \
    CHROMA_PORT=9999 \
    EMBED_MODEL=FremyCompany/BioLORD-2023 \
    ORPHANET_DATA_DIR=/app/data/orphanet

# Streamlit's listening port, declared for HF Spaces
EXPOSE 8501/tcp

# --------------------------------------------------------------------------
# Start both services via supervisord
# FastAPI: 127.0.0.1:8080 (internal — Streamlit calls it)
# Streamlit: 0.0.0.0:8501 (public — HF Spaces exposes this)
#
# --nodaemon keeps supervisord in the foreground as the container's main
# process. Without it, foreground operation depends solely on the config file
# setting nodaemon=true; if that were missing, supervisord would daemonize and
# the container would exit right after start. The flag is harmless when the
# config already sets it.
# --------------------------------------------------------------------------
CMD ["/usr/bin/supervisord", "--nodaemon", "-c", "/etc/supervisor/conf.d/raredx.conf"]