---
# Docker Compose stack for the RAG terminal application:
# an Ollama sidecar for local LLM inference plus the Gradio-based RAG app.
#
# NOTE(review): `version:` is obsolete under the Compose Specification and is
# ignored (with a warning) by Compose v2 — safe to keep for older tooling,
# confirm before removing.
version: '3.8'

services:
  # Ollama service for local LLM inference
  ollama:
    image: ollama/ollama:latest
    container_name: rag-ollama
    volumes:
      # Persist downloaded models across container recreations
      - ollama_data:/root/.ollama
    ports:
      - "11434:11434"
    environment:
      # Bind on all interfaces so the rag-app container can reach it
      - OLLAMA_HOST=0.0.0.0
    healthcheck:
      # NOTE(review): this assumes curl is present inside ollama/ollama —
      # verify for the pinned image version, or use a CLI/TCP-based check.
      test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"]
      interval: 30s
      timeout: 10s
      retries: 5
    restart: unless-stopped
    # Uncomment for GPU support
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: 1
    #           capabilities: [gpu]

  # RAG Application
  rag-app:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: rag-terminal
    ports:
      - "7860:7860"
    volumes:
      # Persist embeddings and vector store
      - ./rag_data:/app/rag_data
      - ./.embedding_cache:/app/.embedding_cache
      # Mount documents directory (read-only)
      - ~/Documents/Books:/app/documents:ro
    environment:
      # Ollama configuration — reach the sidecar via its service name
      - OLLAMA_BASE_URL=http://ollama:11434
      - OLLAMA_MODEL=${OLLAMA_MODEL:-smollm2:360m}
      - OLLAMA_MODEL_CLOUD=${OLLAMA_MODEL_CLOUD:-gpt-oss:20b-cloud}
      - OLLAMA_API_KEY=${OLLAMA_API_KEY}
      # Embedding configuration
      - EMBEDDING_METHOD=${EMBEDDING_METHOD:-huggingface}
      - HF_TOKEN=${HF_TOKEN}
      - OPENAI_API_KEY=${OPENAI_API_KEY}
      - VOYAGE_API_KEY=${VOYAGE_API_KEY}
      - HF_EMBEDDING_MODEL=${HF_EMBEDDING_MODEL:-BAAI/bge-base-en-v1.5}
      # Document configuration
      - SAMPLE_DOCUMENT_PATH=${SAMPLE_DOCUMENT_PATH:-/app/documents/Atomic_Habits_James_Clear.pdf}
      - SAVE_DIR=/app/rag_data
      - CHUNK_SIZE=${CHUNK_SIZE:-1000}
      - CHUNK_OVERLAP=${CHUNK_OVERLAP:-200}
      - TOP_K=${TOP_K:-5}
      # Token limits
      - TOKEN_LIMIT_BASE=${TOKEN_LIMIT_BASE:-512}
      - TOKEN_LIMIT_PER_SOURCE=${TOKEN_LIMIT_PER_SOURCE:-200}
      - TOKEN_LIMIT_MAX=${TOKEN_LIMIT_MAX:-2048}
      # Gradio configuration — bind on all interfaces so the published
      # port mapping works from outside the container
      - GRADIO_SERVER_NAME=0.0.0.0
      - GRADIO_SERVER_PORT=7860
      - GRADIO_SHARE=${GRADIO_SHARE:-false}
    depends_on:
      # Wait for Ollama's healthcheck to pass before starting the app
      ollama:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:7860/"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Allow time for model/embedding startup before counting failures
      start_period: 60s
    restart: unless-stopped

volumes:
  ollama_data:
    driver: local

networks:
  default:
    name: rag-network