SmartCertify-ML / Dockerfile
Harsh Yadav
fix: remove CNN from build (OOM), inline tabular training, ELA heuristic fallback for image
165fd8b
FROM python:3.11-slim

WORKDIR /app

# OS-level prerequisites: a compiler toolchain (build-essential) and git.
# Presumably needed by pip packages that build from source or install from
# git URLs — confirm against requirements.txt.
# The apt package index is deleted in the same layer so it never ships in the image.
RUN apt-get update \
    && apt-get install --yes --no-install-recommends \
        build-essential \
        git \
    && rm -rf /var/lib/apt/lists/*
# Only the dependency manifest is copied here, so the pip layers below stay
# cached when application source changes but requirements.txt does not.
COPY requirements.txt ./

# CPU-only PyTorch wheels are served from PyTorch's own index, so they are
# installed first in a dedicated step, before everything else.
RUN pip install \
        --no-cache-dir \
        --timeout 300 \
        --index-url https://download.pytorch.org/whl/cpu \
        torch==2.2.2 \
        torchvision==0.17.2

# Remaining dependencies from the default index.
RUN pip install \
        --no-cache-dir \
        --timeout 300 \
        --retries 5 \
        --requirement requirements.txt
# Bring the rest of the build context into /app.
COPY . ./

# Create the output directories used by the build steps below, and make sure
# every app sub-directory carries an __init__.py marker (touch leaves existing
# files' contents alone and creates the missing ones).
RUN mkdir -p \
        data \
        logs \
        plots \
        saved_models \
    && touch \
        app/__init__.py \
        app/api/__init__.py \
        app/api/routes/__init__.py \
        app/models/__init__.py \
        app/data/__init__.py \
        app/utils/__init__.py \
        app/config/__init__.py
# ─────────────────────────────────────────────────────────────────────────────
# BUILD STEP 1: Generate 25,000 synthetic CSV rows (no network needed)
# NOTE(review): presumably writes data/synthetic_certificates.csv, which the
# training step reads — confirm in app/data/generate_synthetic.py.
# ─────────────────────────────────────────────────────────────────────────────
RUN ["python", "-m", "app.data.generate_synthetic"]
# ─────────────────────────────────────────────────────────────────────────────
# BUILD STEP 2: Pre-download NLP models from HuggingFace
# Each model is instantiated once purely for the side effect of fetching it
# at build time; the resulting objects are discarded.
# The first Python statement must stay on the `python -c "` line: continuation
# lines are joined verbatim, and leading whitespace before the first statement
# would be an IndentationError.
# ─────────────────────────────────────────────────────────────────────────────
RUN python -c "import sentence_transformers, transformers; \
    print('Downloading all-MiniLM-L6-v2...'); \
    sentence_transformers.SentenceTransformer('all-MiniLM-L6-v2'); \
    print('Downloading DistilBERT zero-shot...'); \
    transformers.pipeline('zero-shot-classification', model='typeform/distilbert-base-uncased-mnli'); \
    print('NLP models downloaded.')"
# ─────────────────────────────────────────────────────────────────────────────
# BUILD STEP 3: Train tabular models ONLY (fraud + trust + anomaly + similarity)
# CNN skipped at build time — image analysis uses ELA heuristics at runtime.
# This avoids OOM from holding thousands of PIL images in memory.
#
# What the inline script does, in order:
#   1. Caps loky at 2 CPUs (unless already configured) before joblib is imported.
#   2. Reads data/synthetic_certificates.csv.
#   3. Fraud: label-encodes `label`, fits RandomForest / XGBoost / LightGBM on a
#      stratified 80% split, saves the models, the feature list and the label map.
#   4. Trust: fits a GradientBoostingRegressor on `trust_score` (NaN -> 0.5).
#   5. Anomaly: fits StandardScaler + IsolationForest on the full fraud feature matrix.
#   6. Similarity / chat: loads each NLP model once and records its name and metadata.
# The held-out 20% splits are never evaluated, so they are discarded on unpacking.
# sentence_transformers / transformers are imported only after the tabular
# training is finished, as in the original ordering.
# The whole script is one logical Python line (statements joined by ';'), so it
# cannot contain compound statements, and the first statement must sit on the
# `python -c "` line with no leading whitespace.
# ─────────────────────────────────────────────────────────────────────────────
RUN python -c "import os; \
    os.environ.setdefault('LOKY_MAX_CPU_COUNT', '2'); \
    import joblib; \
    import pandas as pd; \
    from pathlib import Path; \
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, IsolationForest; \
    from sklearn.model_selection import train_test_split; \
    from sklearn.preprocessing import LabelEncoder, StandardScaler; \
    import xgboost as xgb; \
    import lightgbm as lgb; \
    out_dir = Path('saved_models'); \
    df = pd.read_csv('data/synthetic_certificates.csv'); \
    print(f'Training on {len(df)} rows...'); \
    trust_cols = [ \
        'total_certificates_issued', \
        'fraud_rate_historical', \
        'avg_metadata_completeness', \
        'domain_age_days', \
        'verification_success_rate']; \
    fraud_cols = [ \
        'issuer_reputation_score', \
        'template_match_score', \
        'metadata_completeness_score', \
        'domain_verification_status', \
        'previous_verification_count', \
        'cert_age_days', \
        'issuer_cert_count', \
        'has_expiry', \
        'name_length', \
        'course_name_length'] + trust_cols; \
    encoder = LabelEncoder(); \
    targets = encoder.fit_transform(df['label']); \
    label_map = dict(zip(encoder.classes_, range(len(encoder.classes_)))); \
    fraud_X = df[fraud_cols].fillna(0); \
    X_fit, _, y_fit, _ = train_test_split( \
        fraud_X, targets, test_size=0.2, random_state=42, stratify=targets); \
    print(' Training RandomForest...'); \
    forest = RandomForestClassifier( \
        n_estimators=200, max_depth=12, n_jobs=-1, random_state=42).fit(X_fit, y_fit); \
    print(' Training XGBoost...'); \
    xgb_model = xgb.XGBClassifier( \
        n_estimators=200, max_depth=6, learning_rate=0.1, \
        eval_metric='mlogloss', random_state=42, verbosity=0).fit(X_fit, y_fit); \
    print(' Training LightGBM...'); \
    lgb_model = lgb.LGBMClassifier( \
        n_estimators=200, max_depth=8, learning_rate=0.1, \
        random_state=42, verbose=-1).fit(X_fit, y_fit); \
    [joblib.dump(artifact, out_dir / fname) for fname, artifact in ( \
        ('fraud_rf.pkl', forest), \
        ('fraud_xgb.pkl', xgb_model), \
        ('fraud_lgb.pkl', lgb_model), \
        ('fraud_features.pkl', fraud_cols), \
        ('fraud_label_map.pkl', label_map))]; \
    print(' Fraud models saved.'); \
    trust_X = df[trust_cols].fillna(0); \
    trust_y = df['trust_score'].fillna(0.5); \
    tX_fit, _, ty_fit, _ = train_test_split(trust_X, trust_y, test_size=0.2, random_state=42); \
    print(' Training trust model...'); \
    trust_model = GradientBoostingRegressor( \
        n_estimators=200, max_depth=5, learning_rate=0.05, random_state=42).fit(tX_fit, ty_fit); \
    joblib.dump(trust_model, out_dir / 'trust_model.pkl'); \
    joblib.dump(trust_cols, out_dir / 'trust_features.pkl'); \
    print(' Trust model saved.'); \
    scaler = StandardScaler(); \
    scaled_X = scaler.fit_transform(fraud_X); \
    print(' Training anomaly model...'); \
    detector = IsolationForest( \
        contamination=0.1, n_estimators=200, random_state=42, n_jobs=-1).fit(scaled_X); \
    joblib.dump(detector, out_dir / 'anomaly_model.pkl'); \
    joblib.dump(scaler, out_dir / 'anomaly_scaler.pkl'); \
    joblib.dump(fraud_cols, out_dir / 'anomaly_features.pkl'); \
    print(' Anomaly model saved.'); \
    from sentence_transformers import SentenceTransformer; \
    print(' Setting up similarity model...'); \
    SentenceTransformer('all-MiniLM-L6-v2'); \
    (out_dir / 'similarity_model_name.txt').write_text('all-MiniLM-L6-v2'); \
    joblib.dump({'model_name':'all-MiniLM-L6-v2','embedding_dim':384}, out_dir / 'similarity_meta.pkl'); \
    print(' Similarity model saved.'); \
    from transformers import pipeline as hf_pipeline; \
    print(' Setting up chat model...'); \
    hf_pipeline('zero-shot-classification', model='typeform/distilbert-base-uncased-mnli', device=-1); \
    (out_dir / 'chat_model_name.txt').write_text('typeform/distilbert-base-uncased-mnli'); \
    print('All models trained and saved!')"
# ─────────────────────────────────────────────────────────────────────────────
# BUILD STEP 4: Verify core model files exist — image model is optional
# Fails the build (AssertionError -> non-zero exit) if any required artifact is
# missing from saved_models/, then prints every file there with its size in KB.
# fraud_label_map.pkl is required as well: the fraud classifiers are fit on
# LabelEncoder-encoded targets, so that map is the only saved record of which
# integer class corresponds to which label.
# ─────────────────────────────────────────────────────────────────────────────
RUN python -c "\
import os; from pathlib import Path; \
required = ['saved_models/fraud_rf.pkl','saved_models/fraud_xgb.pkl', \
'saved_models/fraud_lgb.pkl','saved_models/fraud_features.pkl', \
'saved_models/fraud_label_map.pkl', \
'saved_models/trust_model.pkl','saved_models/trust_features.pkl', \
'saved_models/anomaly_model.pkl','saved_models/anomaly_scaler.pkl', \
'saved_models/anomaly_features.pkl']; \
missing = [f for f in required if not os.path.exists(f)]; \
assert not missing, f'Build failed - missing: {missing}'; \
files = list(Path('saved_models').iterdir()); \
print(f'Build OK — {len(files)} model files:'); \
[print(f' {f.name}: {f.stat().st_size/1024:.1f} KB') for f in sorted(files)] \
"
# Set offline mode for runtime — models are already cached in the image.
# HF_HUB_OFFLINE is the switch honoured by huggingface_hub itself; without it,
# Hub-backed loaders can still attempt network calls at startup even though
# TRANSFORMERS_OFFLINE is set. The two original variables are kept unchanged.
ENV HF_HUB_OFFLINE=1 \
    TRANSFORMERS_OFFLINE=1 \
    HF_DATASETS_OFFLINE=1

# NOTE(review): there is no USER instruction, so the service runs as root.
# The models were downloaded as root during the build, so presumably the cache
# sits under root's home; switching to a non-root user would also require
# relocating that cache (e.g. via HF_HOME) — confirm before changing.

# Port the uvicorn server below listens on (documentation only; does not publish it).
EXPOSE 7860

# Probe the app's /health endpoint; urlopen raises on connection errors and
# HTTP error statuses, which makes python exit non-zero and marks the probe failed.
HEALTHCHECK --interval=30s --timeout=10s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health')" || exit 1

# Exec form so uvicorn is PID 1 and receives stop signals directly.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]