Spaces:

iharshyadav
/

SmartCertify-ML

Sleeping

Harsh Yadav

fix: remove CNN from build (OOM), inline tabular training, ELA heuristic fallback for image

165fd8b about 1 month ago

8.24 kB

	# Base image: slim variant of the official CPython 3.11 image.
	FROM python:3.11-slim

	# Every relative path used by later COPY/RUN/CMD instructions resolves under /app.
	WORKDIR /app

	# System deps: compiler toolchain (build-essential) and git, installed without
	# recommended extras. The apt package lists are deleted in the same layer so
	# they never end up in the image. `set -e` aborts the layer on the first failure.
	RUN set -e; \
	    apt-get update; \
	    apt-get install -y --no-install-recommends build-essential git; \
	    rm -rf /var/lib/apt/lists/*

	# Copy only the dependency manifest first, so the install layers below stay
	# cached until requirements.txt itself changes.
	COPY requirements.txt ./

	# CPU-only PyTorch wheels live on the dedicated PyTorch index, so they are
	# installed on their own before the rest of the requirements.
	RUN pip install --no-cache-dir --timeout 300 \
	    --index-url https://download.pytorch.org/whl/cpu \
	    torch==2.2.2 \
	    torchvision==0.17.2

	# Remaining Python dependencies, resolved from the default package index.
	RUN pip install --no-cache-dir --timeout 300 --retries 5 --requirement requirements.txt

	# Bring the rest of the build context into /app.
	COPY . .

	# Create the output directories, then make sure each app package directory
	# has an __init__.py (touch creates the file only if it is missing).
	# `set -e` makes the layer fail as soon as any mkdir/touch fails.
	RUN set -e; \
	    mkdir -p saved_models plots logs data; \
	    for pkg in app app/api app/api/routes app/models app/data app/utils app/config; do \
	        touch "$pkg/__init__.py"; \
	    done

	# ─────────────────────────────────────────────────────────────────────────────
	# BUILD STEP 1: Generate 25,000 synthetic CSV rows (no network needed)
	# Runs the app.data.generate_synthetic module as a script (exec form, no shell).
	# NOTE(review): presumably this writes data/synthetic_certificates.csv, which
	# BUILD STEP 3 reads — confirm against the module.
	# ─────────────────────────────────────────────────────────────────────────────
	RUN ["python", "-m", "app.data.generate_synthetic"]

	# ─────────────────────────────────────────────────────────────────────────────
	# BUILD STEP 2: Pre-download NLP models from HuggingFace
	# Instantiating each model once at build time pulls its files into the image,
	# so they are already present when the container starts.
	# The script is joined into a single line by the trailing backslashes, hence
	# the `;` separators between statements.
	# ─────────────────────────────────────────────────────────────────────────────
	RUN python -c "\
	import sentence_transformers, transformers; \
	print('Downloading all-MiniLM-L6-v2...'); \
	sentence_transformers.SentenceTransformer('all-MiniLM-L6-v2'); \
	print('Downloading DistilBERT zero-shot...'); \
	transformers.pipeline(task='zero-shot-classification', model='typeform/distilbert-base-uncased-mnli'); \
	print('NLP models downloaded.') \
	"

	# ─────────────────────────────────────────────────────────────────────────────
	# BUILD STEP 3: Train tabular models ONLY (fraud + trust + anomaly + similarity)
	# CNN skipped at build time — image analysis uses ELA heuristics at runtime.
	# This avoids OOM from holding thousands of PIL images in memory.
	#
	# The inline script reads data/synthetic_certificates.csv and writes into
	# saved_models/:
	#   fraud_rf.pkl, fraud_xgb.pkl, fraud_lgb.pkl
	#       RandomForest / XGBoost / LightGBM classifiers fitted on the 80% train
	#       split of the 15 FRAUD_FEATS columns (NaN -> 0) against the
	#       label-encoded `label` column (stratified split, random_state=42).
	#   fraud_features.pkl, fraud_label_map.pkl
	#       the feature-name list and the {class label: encoded int} mapping.
	#   trust_model.pkl, trust_features.pkl
	#       GradientBoostingRegressor on the 5 TRUST_FEATS columns predicting
	#       `trust_score` (missing targets filled with 0.5), plus its feature list.
	#   anomaly_model.pkl, anomaly_scaler.pkl, anomaly_features.pkl
	#       IsolationForest fitted on ALL rows of the fraud features after
	#       StandardScaler, plus the scaler and the feature list.
	#   similarity_model_name.txt, similarity_meta.pkl, chat_model_name.txt
	#       names/metadata of the pre-trained NLP models; those models are only
	#       instantiated here, not trained.
	#
	# The held-out 20% splits are created but not evaluated in this step.
	# LOKY_MAX_CPU_COUNT is defaulted to '2' before the ML libraries are imported.
	# The trailing backslashes join the whole script into ONE line inside a
	# double-quoted shell string: only `;`-separated simple statements work, and
	# the Python code must not contain double quotes, `$` or backticks.
	# ─────────────────────────────────────────────────────────────────────────────
	RUN python -c "\
	import os; \
	os.environ.setdefault('LOKY_MAX_CPU_COUNT', '2'); \
	import joblib, pandas as pd; \
	from pathlib import Path; \
	from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, IsolationForest; \
	from sklearn.model_selection import train_test_split; \
	from sklearn.preprocessing import LabelEncoder, StandardScaler; \
	import xgboost as xgb; \
	import lightgbm as lgb; \
	\
	SAVE_DIR = Path('saved_models'); \
	df = pd.read_csv('data/synthetic_certificates.csv'); \
	print(f'Training on {len(df)} rows...'); \
	\
	FRAUD_FEATS = ['issuer_reputation_score','template_match_score','metadata_completeness_score', \
	'domain_verification_status','previous_verification_count','cert_age_days', \
	'issuer_cert_count','has_expiry','name_length','course_name_length', \
	'total_certificates_issued','fraud_rate_historical','avg_metadata_completeness', \
	'domain_age_days','verification_success_rate']; \
	TRUST_FEATS = ['total_certificates_issued','fraud_rate_historical','avg_metadata_completeness', \
	'domain_age_days','verification_success_rate']; \
	\
	le = LabelEncoder(); \
	y = le.fit_transform(df['label']); \
	label_map = {l:i for i,l in enumerate(le.classes_)}; \
	X = df[FRAUD_FEATS].fillna(0); \
	Xtr,Xte,ytr,yte = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y); \
	\
	print(' Training RandomForest...'); \
	rf = RandomForestClassifier(n_estimators=200,max_depth=12,n_jobs=-1,random_state=42); \
	rf.fit(Xtr,ytr); \
	print(' Training XGBoost...'); \
	xm = xgb.XGBClassifier(n_estimators=200,max_depth=6,learning_rate=0.1, \
	eval_metric='mlogloss',random_state=42,verbosity=0); \
	xm.fit(Xtr,ytr); \
	print(' Training LightGBM...'); \
	lm = lgb.LGBMClassifier(n_estimators=200,max_depth=8,learning_rate=0.1, \
	random_state=42,verbose=-1); \
	lm.fit(Xtr,ytr); \
	joblib.dump(rf, SAVE_DIR/'fraud_rf.pkl'); \
	joblib.dump(xm, SAVE_DIR/'fraud_xgb.pkl'); \
	joblib.dump(lm, SAVE_DIR/'fraud_lgb.pkl'); \
	joblib.dump(FRAUD_FEATS, SAVE_DIR/'fraud_features.pkl'); \
	joblib.dump(label_map, SAVE_DIR/'fraud_label_map.pkl'); \
	print(' Fraud models saved.'); \
	\
	Xt = df[TRUST_FEATS].fillna(0); yt = df['trust_score'].fillna(0.5); \
	Xtr2,Xte2,ytr2,yte2 = train_test_split(Xt,yt,test_size=0.2,random_state=42); \
	print(' Training trust model...'); \
	tm = GradientBoostingRegressor(n_estimators=200,max_depth=5,learning_rate=0.05,random_state=42); \
	tm.fit(Xtr2,ytr2); \
	joblib.dump(tm, SAVE_DIR/'trust_model.pkl'); \
	joblib.dump(TRUST_FEATS, SAVE_DIR/'trust_features.pkl'); \
	print(' Trust model saved.'); \
	\
	sc = StandardScaler(); Xs = sc.fit_transform(X); \
	print(' Training anomaly model...'); \
	am = IsolationForest(contamination=0.1,n_estimators=200,random_state=42,n_jobs=-1); \
	am.fit(Xs); \
	joblib.dump(am, SAVE_DIR/'anomaly_model.pkl'); \
	joblib.dump(sc, SAVE_DIR/'anomaly_scaler.pkl'); \
	joblib.dump(FRAUD_FEATS, SAVE_DIR/'anomaly_features.pkl'); \
	print(' Anomaly model saved.'); \
	\
	from sentence_transformers import SentenceTransformer; \
	print(' Setting up similarity model...'); \
	sim = SentenceTransformer('all-MiniLM-L6-v2'); \
	(SAVE_DIR/'similarity_model_name.txt').write_text('all-MiniLM-L6-v2'); \
	joblib.dump({'model_name':'all-MiniLM-L6-v2','embedding_dim':384}, SAVE_DIR/'similarity_meta.pkl'); \
	print(' Similarity model saved.'); \
	\
	from transformers import pipeline as hf_pipeline; \
	print(' Setting up chat model...'); \
	clf = hf_pipeline('zero-shot-classification',model='typeform/distilbert-base-uncased-mnli',device=-1); \
	(SAVE_DIR/'chat_model_name.txt').write_text('typeform/distilbert-base-uncased-mnli'); \
	print('All models trained and saved!') \
	"

	# ─────────────────────────────────────────────────────────────────────────────
	# BUILD STEP 4: Verify core model files exist — image model is optional
	# Fails the build (AssertionError) if any of the nine core fraud/trust/anomaly
	# artifacts is missing from saved_models/, otherwise prints every file found
	# there with its size in KB.
	# ─────────────────────────────────────────────────────────────────────────────
	RUN python -c "\
	import pathlib; \
	model_dir = pathlib.Path('saved_models'); \
	core = ['fraud_rf.pkl','fraud_xgb.pkl','fraud_lgb.pkl','fraud_features.pkl', \
	'trust_model.pkl','trust_features.pkl', \
	'anomaly_model.pkl','anomaly_scaler.pkl','anomaly_features.pkl']; \
	missing = ['saved_models/' + name for name in core if not (model_dir / name).exists()]; \
	assert not missing, f'Build failed - missing: {missing}'; \
	found = sorted(model_dir.iterdir()); \
	print(f'Build OK — {len(found)} model files:'); \
	[print(f' {p.name}: {p.stat().st_size/1024:.1f} KB') for p in found] \
	"

	# Runtime runs in offline mode — the models were already downloaded and cached
	# during the build, so the libraries should not reach out to the network.
	ENV TRANSFORMERS_OFFLINE=1 \
	    HF_DATASETS_OFFLINE=1

	# Port the uvicorn server below listens on (EXPOSE is documentation only; it
	# does not publish the port).
	EXPOSE 7860

	# Liveness probe: GET /health on the local server every 30s. urlopen raises on
	# connection errors and HTTP error statuses, which makes python exit non-zero;
	# `|| exit 1` then maps any failure to exit code 1 ("unhealthy").
	# The operator must be a real shell `||` — an escaped `\|\|` is passed to
	# python as literal arguments and the fallback never runs.
	HEALTHCHECK --interval=30s --timeout=10s --retries=3 \
	CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health')" || exit 1

	# Exec form, so uvicorn runs as PID 1 and receives stop signals directly.
	CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]