# Image that serves app.main:app with uvicorn on port 7860.
#
# All model artefacts are produced at build time so the container can start
# with the Hugging Face libraries in offline mode:
#   1. pre-download the NLP models into the Hugging Face cache
#   2. generate the synthetic training CSV
#   3. train and save the tabular models into saved_models/
#   4. verify that the required model files exist
#
# Layer order goes from least to most frequently changing: OS packages,
# Python dependencies, model downloads, then the application source.
FROM python:3.11-slim

WORKDIR /app

# System deps
# NOTE(review): build-essential and git are presumably needed by packages in
# requirements.txt (source builds / VCS installs) — confirm; if not, drop them.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        git \
    && rm -rf /var/lib/apt/lists/*

# Unprivileged account used for every build step after the package installs
# and for the running service. UID/GID 1000 is a stable numeric id, and /app is
# handed over so the later steps can create files in it. Creating the user
# *before* the downloads/training means the cache and artefacts are owned by it
# from the start, so no recursive chown layer is needed.
RUN groupadd --gid 1000 appuser \
    && useradd --uid 1000 --gid appuser --create-home appuser \
    && chown appuser:appuser /app

# Copy only the dependency manifest first so the pip layers stay cached until
# requirements.txt changes.
COPY requirements.txt .

# Install CPU-only PyTorch first (separate index), then everything else
RUN pip install --no-cache-dir --timeout 300 --retries 5 \
        torch==2.2.2 torchvision==0.17.2 \
        --index-url https://download.pytorch.org/whl/cpu

RUN pip install --no-cache-dir --timeout 300 --retries 5 -r requirements.txt

# Everything below runs as the unprivileged user; the Hugging Face cache
# therefore lands in that user's home, which is the same home used at runtime.
USER appuser

# ─────────────────────────────────────────────────────────────────────────────
# BUILD STEP 1: Pre-download NLP models from HuggingFace
# Runs before the application source is copied: it only needs the installed
# packages, so source edits do not invalidate this (large) download layer.
# ─────────────────────────────────────────────────────────────────────────────
RUN python -c "\
from sentence_transformers import SentenceTransformer; \
from transformers import pipeline; \
print('Downloading all-MiniLM-L6-v2...'); \
SentenceTransformer('all-MiniLM-L6-v2'); \
print('Downloading DistilBERT zero-shot...'); \
pipeline('zero-shot-classification', model='typeform/distilbert-base-uncased-mnli'); \
print('NLP models downloaded.') \
"

# Application source, owned by the runtime user.
COPY --chown=appuser:appuser . .

# Create dirs and __init__ files
RUN mkdir -p saved_models plots logs data && \
    touch app/__init__.py \
          app/api/__init__.py \
          app/api/routes/__init__.py \
          app/models/__init__.py \
          app/data/__init__.py \
          app/utils/__init__.py \
          app/config/__init__.py

# ─────────────────────────────────────────────────────────────────────────────
# BUILD STEP 2: Generate 25,000 synthetic CSV rows (no network needed)
# ─────────────────────────────────────────────────────────────────────────────
RUN python -m app.data.generate_synthetic

# ─────────────────────────────────────────────────────────────────────────────
# BUILD STEP 3: Train tabular models ONLY (fraud + trust + anomaly + similarity)
# CNN skipped at build time — image analysis uses ELA heuristics at runtime.
# This avoids OOM from holding thousands of PIL images in memory.
#
# Reads data/synthetic_certificates.csv and writes the fitted estimators,
# feature lists and label map to saved_models/ with joblib. The two NLP models
# are instantiated again here and their names recorded next to the other
# artefacts. The script is a single-line `python -c` program, so every
# statement must stay a simple statement joined with ';'.
# ─────────────────────────────────────────────────────────────────────────────
RUN python -c "\
import os; \
os.environ.setdefault('LOKY_MAX_CPU_COUNT', '2'); \
import joblib, pandas as pd; \
from pathlib import Path; \
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, IsolationForest; \
from sklearn.model_selection import train_test_split; \
from sklearn.preprocessing import LabelEncoder, StandardScaler; \
import xgboost as xgb; \
import lightgbm as lgb; \
\
SAVE_DIR = Path('saved_models'); \
df = pd.read_csv('data/synthetic_certificates.csv'); \
print(f'Training on {len(df)} rows...'); \
\
FRAUD_FEATS = ['issuer_reputation_score','template_match_score','metadata_completeness_score', \
    'domain_verification_status','previous_verification_count','cert_age_days', \
    'issuer_cert_count','has_expiry','name_length','course_name_length', \
    'total_certificates_issued','fraud_rate_historical','avg_metadata_completeness', \
    'domain_age_days','verification_success_rate']; \
TRUST_FEATS = ['total_certificates_issued','fraud_rate_historical','avg_metadata_completeness', \
    'domain_age_days','verification_success_rate']; \
\
le = LabelEncoder(); \
y = le.fit_transform(df['label']); \
label_map = {l:i for i,l in enumerate(le.classes_)}; \
X = df[FRAUD_FEATS].fillna(0); \
Xtr,Xte,ytr,yte = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y); \
\
print(' Training RandomForest...'); \
rf = RandomForestClassifier(n_estimators=200,max_depth=12,n_jobs=-1,random_state=42); \
rf.fit(Xtr,ytr); \
print(' Training XGBoost...'); \
xm = xgb.XGBClassifier(n_estimators=200,max_depth=6,learning_rate=0.1, \
    eval_metric='mlogloss',random_state=42,verbosity=0); \
xm.fit(Xtr,ytr); \
print(' Training LightGBM...'); \
lm = lgb.LGBMClassifier(n_estimators=200,max_depth=8,learning_rate=0.1, \
    random_state=42,verbose=-1); \
lm.fit(Xtr,ytr); \
joblib.dump(rf, SAVE_DIR/'fraud_rf.pkl'); \
joblib.dump(xm, SAVE_DIR/'fraud_xgb.pkl'); \
joblib.dump(lm, SAVE_DIR/'fraud_lgb.pkl'); \
joblib.dump(FRAUD_FEATS, SAVE_DIR/'fraud_features.pkl'); \
joblib.dump(label_map, SAVE_DIR/'fraud_label_map.pkl'); \
print(' Fraud models saved.'); \
\
Xt = df[TRUST_FEATS].fillna(0); yt = df['trust_score'].fillna(0.5); \
Xtr2,Xte2,ytr2,yte2 = train_test_split(Xt,yt,test_size=0.2,random_state=42); \
print(' Training trust model...'); \
tm = GradientBoostingRegressor(n_estimators=200,max_depth=5,learning_rate=0.05,random_state=42); \
tm.fit(Xtr2,ytr2); \
joblib.dump(tm, SAVE_DIR/'trust_model.pkl'); \
joblib.dump(TRUST_FEATS, SAVE_DIR/'trust_features.pkl'); \
print(' Trust model saved.'); \
\
sc = StandardScaler(); Xs = sc.fit_transform(X); \
print(' Training anomaly model...'); \
am = IsolationForest(contamination=0.1,n_estimators=200,random_state=42,n_jobs=-1); \
am.fit(Xs); \
joblib.dump(am, SAVE_DIR/'anomaly_model.pkl'); \
joblib.dump(sc, SAVE_DIR/'anomaly_scaler.pkl'); \
joblib.dump(FRAUD_FEATS, SAVE_DIR/'anomaly_features.pkl'); \
print(' Anomaly model saved.'); \
\
from sentence_transformers import SentenceTransformer; \
print(' Setting up similarity model...'); \
sim = SentenceTransformer('all-MiniLM-L6-v2'); \
(SAVE_DIR/'similarity_model_name.txt').write_text('all-MiniLM-L6-v2'); \
joblib.dump({'model_name':'all-MiniLM-L6-v2','embedding_dim':384}, SAVE_DIR/'similarity_meta.pkl'); \
print(' Similarity model saved.'); \
\
from transformers import pipeline as hf_pipeline; \
print(' Setting up chat model...'); \
clf = hf_pipeline('zero-shot-classification',model='typeform/distilbert-base-uncased-mnli',device=-1); \
(SAVE_DIR/'chat_model_name.txt').write_text('typeform/distilbert-base-uncased-mnli'); \
print('All models trained and saved!') \
"

# ─────────────────────────────────────────────────────────────────────────────
# BUILD STEP 4: Verify core model files exist — image model is optional
# The assert makes the build fail if any required artefact is missing;
# otherwise the name and size of every file in saved_models/ is printed.
# ─────────────────────────────────────────────────────────────────────────────
RUN python -c "\
import os; from pathlib import Path; \
required = ['saved_models/fraud_rf.pkl','saved_models/fraud_xgb.pkl', \
    'saved_models/fraud_lgb.pkl','saved_models/fraud_features.pkl', \
    'saved_models/trust_model.pkl','saved_models/trust_features.pkl', \
    'saved_models/anomaly_model.pkl','saved_models/anomaly_scaler.pkl', \
    'saved_models/anomaly_features.pkl']; \
missing = [f for f in required if not os.path.exists(f)]; \
assert not missing, f'Build failed - missing: {missing}'; \
files = list(Path('saved_models').iterdir()); \
print(f'Build OK — {len(files)} model files:'); \
[print(f' {f.name}: {f.stat().st_size/1024:.1f} KB') for f in sorted(files)] \
"

# Set offline mode for runtime — models are already cached.
# Declared after the build steps so the downloads above still had network access.
ENV TRANSFORMERS_OFFLINE=1 \
    HF_DATASETS_OFFLINE=1

EXPOSE 7860

# Probe the /health endpoint. --start-period keeps failed probes during
# start-up from counting towards the retry limit; the urlopen timeout is
# shorter than Docker's own --timeout so the probe exits on its own instead of
# being killed.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health', timeout=5)" || exit 1

# Exec form so uvicorn runs as PID 1 and receives stop signals directly.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]