# Image for an Arabic NLP service (summarization, spelling, grammar,
# punctuation, dialect-to-MSA). All models are downloaded at build time and
# the app is served with gunicorn on port 7860.
FROM python:3.12-slim

WORKDIR /app

# System dependencies. build-essential is presumably required to compile
# Python packages that ship without prebuilt wheels - confirm before removing.
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies.
# requirements.txt is copied on its own so this layer stays cached until the
# dependency list changes.
# CPU-only PyTorch is installed first from the PyTorch CPU wheel index (saves
# ~1.5GB vs full torch with CUDA); the remaining requirements then resolve
# against the already-installed torch.
COPY requirements.txt .
RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
    pip install --no-cache-dir -r requirements.txt

# Pre-download models during build (network is available here).
# At runtime the container has NO outbound DNS, so every model must already be
# in the local cache.
#
# The inline Python below is deliberately NOT indented: the Dockerfile parser
# keeps leading whitespace on continuation lines, and `python -c` would fail
# with an IndentationError if the first statement started with spaces.

# 1. Summarization model (MBart, float16)
RUN python -c "\
from transformers import MBartForConditionalGeneration, AutoTokenizer, AutoConfig; \
import torch; \
repo = 'bayan10/summarization-model'; \
print('Downloading summarization tokenizer...'); \
AutoTokenizer.from_pretrained(repo); \
print('Downloading summarization config...'); \
AutoConfig.from_pretrained(repo); \
print('Downloading summarization model (float16)...'); \
MBartForConditionalGeneration.from_pretrained(repo, torch_dtype=torch.float16); \
print('Summarization model cached!'); \
"

# 2. Spelling model (AraSpell: AraBERT encoder-decoder + fine-tuned checkpoint)
#    The AraBERT masked-LM head is cached as well for the ContextualCorrector.
RUN python -c "\
from huggingface_hub import hf_hub_download; \
from transformers import AutoTokenizer, EncoderDecoderModel, AutoModelForMaskedLM; \
print('Downloading AraSpell checkpoint...'); \
hf_hub_download(repo_id='bayan10/AraSpell-Model', filename='last_model.pt'); \
print('Downloading AraBERT tokenizer...'); \
AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabertv02'); \
print('Downloading AraBERT encoder-decoder...'); \
EncoderDecoderModel.from_encoder_decoder_pretrained('aubmindlab/bert-base-arabertv02', 'aubmindlab/bert-base-arabertv02'); \
print('Downloading AraBERT MLM (for ContextualCorrector)...'); \
AutoModelForMaskedLM.from_pretrained('aubmindlab/bert-base-arabertv02'); \
print('Spelling model + MLM cached!'); \
"

# 3. Grammar: camel-tools MLE disambiguator data ("light" data package)
RUN camel_data -i light

# 4. Punctuation model (PuncAra-v1, an EncoderDecoderModel)
RUN python -c "\
from transformers import EncoderDecoderModel, AutoTokenizer; \
repo = 'bayan10/PuncAra-v1'; \
print('Downloading PuncAra-v1 tokenizer...'); \
AutoTokenizer.from_pretrained(repo); \
print('Downloading PuncAra-v1 model...'); \
EncoderDecoderModel.from_pretrained(repo); \
print('PuncAra-v1 cached!'); \
"

# 5. Dialect-to-MSA model (mT5, float16)
RUN python -c "\
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM; \
import torch; \
repo = 'bayan10/dialect-to-msa-model'; \
print('Downloading dialect tokenizer...'); \
AutoTokenizer.from_pretrained(repo); \
print('Downloading dialect model (float16)...'); \
AutoModelForSeq2SeqLM.from_pretrained(repo, torch_dtype=torch.float16); \
print('Dialect model cached!'); \
"

# Application code. Copied after the model downloads so that source changes do
# not invalidate the (large, slow) model layers above.
COPY src/ ./src/
COPY quran.py ./
COPY quran_master.db ./
# NOTE(review): this bakes any local .env* files into an image layer, where
# they remain readable by anyone who can pull the image. If they hold secrets,
# inject them at runtime instead and exclude them via .dockerignore.
COPY .env* ./

# Runtime configuration.
# PYTHONUNBUFFERED=1 makes Python write stdout/stderr immediately so log lines
# show up in the container logs without buffering delay.
ENV PORT=7860 \
    DEBUG=False \
    PYTHONUNBUFFERED=1

# Documents the port gunicorn binds to below (EXPOSE does not publish it).
EXPOSE 7860

# NOTE(review): no USER is set, so the process runs as the base image's
# default user (root, unless the hosting platform overrides it). Switching to
# a non-root user also requires relocating the model caches written above.

# Start the app with gunicorn (single worker to minimize RAM).
# Timeout 300s covers the full pipeline (spelling ~50s + grammar ~8s +
# punctuation ~30s + cold start).
# Exec form performs no variable expansion, so the port is spelled out here and
# must be kept in sync with PORT/EXPOSE.
CMD ["gunicorn", "--chdir", "src", "app:app", "--bind", "0.0.0.0:7860", "--timeout", "300", "--workers", "1"]