Spaces:
Sleeping
Sleeping
Harsh Yadav
fix: remove CNN from build (OOM), inline tabular training, ELA heuristic fallback for image
165fd8b | FROM python:3.11-slim | |
| WORKDIR /app | |
| # OS packages: compiler toolchain (build-essential) and git. | |
| # The apt package index is removed in the same RUN, so it is not kept in the layer. | |
| RUN apt-get update \ | |
| && apt-get install -y --no-install-recommends build-essential git \ | |
| && rm -rf /var/lib/apt/lists/* | |
| COPY requirements.txt . | |
| # Two-phase install: pinned CPU-only PyTorch wheels from the PyTorch CPU index, | |
| # then the remaining requirements from the default index. | |
| # NOTE(review): requirements.txt is not visible here - if it lists torch with a | |
| # different pin, the second install may replace the CPU build; confirm. | |
| RUN pip install --no-cache-dir --timeout 300 \ | |
| --index-url https://download.pytorch.org/whl/cpu \ | |
| torch==2.2.2 torchvision==0.17.2 | |
| RUN pip install --no-cache-dir --timeout 300 --retries 5 \ | |
| -r requirements.txt | |
| COPY . . | |
| # Make sure the output directories exist and that every app sub-package has an | |
| # __init__.py (touch creates the file only when it is missing). | |
| RUN mkdir -p data logs plots saved_models \ | |
| && touch \ | |
| app/__init__.py \ | |
| app/config/__init__.py \ | |
| app/utils/__init__.py \ | |
| app/data/__init__.py \ | |
| app/models/__init__.py \ | |
| app/api/__init__.py \ | |
| app/api/routes/__init__.py | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # BUILD STEP 1: Generate 25,000 synthetic CSV rows (no network needed) | |
| # Runs the project's app.data.generate_synthetic module as a script. | |
| # NOTE(review): BUILD STEP 3 reads data/synthetic_certificates.csv, presumably | |
| # the file written here - confirm against the generator module. | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| RUN python -m app.data.generate_synthetic | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # BUILD STEP 2: Pre-download NLP models from HuggingFace | |
| # Instantiating each model once at build time pulls its files into the image, | |
| # so later steps and the running container can load them from the local cache. | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| RUN python -c "\ | |
| import sentence_transformers, transformers; \ | |
| EMBEDDER = 'all-MiniLM-L6-v2'; \ | |
| ZERO_SHOT = 'typeform/distilbert-base-uncased-mnli'; \ | |
| print('Downloading all-MiniLM-L6-v2...'); \ | |
| sentence_transformers.SentenceTransformer(EMBEDDER); \ | |
| print('Downloading DistilBERT zero-shot...'); \ | |
| transformers.pipeline('zero-shot-classification', model=ZERO_SHOT); \ | |
| print('NLP models downloaded.') \ | |
| " | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # BUILD STEP 3: Train tabular models ONLY (fraud + trust + anomaly) and record | |
| # the names of the pretrained NLP models used for similarity and chat. | |
| # CNN skipped at build time — image analysis uses ELA heuristics at runtime. | |
| # This avoids OOM from holding thousands of PIL images in memory. | |
| # | |
| # The NLP models are referenced by name only and are NOT loaded in this step | |
| # (BUILD STEP 2 already downloaded them), which keeps peak memory limited to | |
| # the tabular training. The 20% hold-out splits are scored and logged so the | |
| # build output shows how each model performs on unseen rows. | |
| # | |
| # Artifacts written to saved_models/: | |
| # fraud_rf.pkl, fraud_xgb.pkl, fraud_lgb.pkl, fraud_features.pkl, fraud_label_map.pkl | |
| # trust_model.pkl, trust_features.pkl | |
| # anomaly_model.pkl, anomaly_scaler.pkl, anomaly_features.pkl | |
| # similarity_model_name.txt, similarity_meta.pkl, chat_model_name.txt | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| RUN python -c "\ | |
| import os; \ | |
| os.environ.setdefault('LOKY_MAX_CPU_COUNT', '2'); \ | |
| import joblib, pandas as pd; \ | |
| from pathlib import Path; \ | |
| from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, IsolationForest; \ | |
| from sklearn.model_selection import train_test_split; \ | |
| from sklearn.preprocessing import LabelEncoder, StandardScaler; \ | |
| import xgboost as xgb; \ | |
| import lightgbm as lgb; \ | |
| \ | |
| SAVE_DIR = Path('saved_models'); \ | |
| df = pd.read_csv('data/synthetic_certificates.csv'); \ | |
| print(f'Training on {len(df)} rows...'); \ | |
| \ | |
| FRAUD_FEATS = ['issuer_reputation_score','template_match_score','metadata_completeness_score', \ | |
| 'domain_verification_status','previous_verification_count','cert_age_days', \ | |
| 'issuer_cert_count','has_expiry','name_length','course_name_length', \ | |
| 'total_certificates_issued','fraud_rate_historical','avg_metadata_completeness', \ | |
| 'domain_age_days','verification_success_rate']; \ | |
| TRUST_FEATS = ['total_certificates_issued','fraud_rate_historical','avg_metadata_completeness', \ | |
| 'domain_age_days','verification_success_rate']; \ | |
| \ | |
| le = LabelEncoder(); \ | |
| y = le.fit_transform(df['label']); \ | |
| label_map = {l:i for i,l in enumerate(le.classes_)}; \ | |
| X = df[FRAUD_FEATS].fillna(0); \ | |
| Xtr,Xte,ytr,yte = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y); \ | |
| \ | |
| print(' Training RandomForest...'); \ | |
| rf = RandomForestClassifier(n_estimators=200,max_depth=12,n_jobs=-1,random_state=42); \ | |
| rf.fit(Xtr,ytr); \ | |
| print(' Training XGBoost...'); \ | |
| xm = xgb.XGBClassifier(n_estimators=200,max_depth=6,learning_rate=0.1, \ | |
| eval_metric='mlogloss',random_state=42,verbosity=0); \ | |
| xm.fit(Xtr,ytr); \ | |
| print(' Training LightGBM...'); \ | |
| lm = lgb.LGBMClassifier(n_estimators=200,max_depth=8,learning_rate=0.1, \ | |
| random_state=42,verbose=-1); \ | |
| lm.fit(Xtr,ytr); \ | |
| print(f' Holdout accuracy - RF: {rf.score(Xte,yte):.3f}, XGB: {xm.score(Xte,yte):.3f}, LGBM: {lm.score(Xte,yte):.3f}'); \ | |
| joblib.dump(rf, SAVE_DIR/'fraud_rf.pkl'); \ | |
| joblib.dump(xm, SAVE_DIR/'fraud_xgb.pkl'); \ | |
| joblib.dump(lm, SAVE_DIR/'fraud_lgb.pkl'); \ | |
| joblib.dump(FRAUD_FEATS, SAVE_DIR/'fraud_features.pkl'); \ | |
| joblib.dump(label_map, SAVE_DIR/'fraud_label_map.pkl'); \ | |
| print(' Fraud models saved.'); \ | |
| \ | |
| Xt = df[TRUST_FEATS].fillna(0); yt = df['trust_score'].fillna(0.5); \ | |
| Xtr2,Xte2,ytr2,yte2 = train_test_split(Xt,yt,test_size=0.2,random_state=42); \ | |
| print(' Training trust model...'); \ | |
| tm = GradientBoostingRegressor(n_estimators=200,max_depth=5,learning_rate=0.05,random_state=42); \ | |
| tm.fit(Xtr2,ytr2); \ | |
| print(f' Trust holdout R2: {tm.score(Xte2,yte2):.3f}'); \ | |
| joblib.dump(tm, SAVE_DIR/'trust_model.pkl'); \ | |
| joblib.dump(TRUST_FEATS, SAVE_DIR/'trust_features.pkl'); \ | |
| print(' Trust model saved.'); \ | |
| \ | |
| sc = StandardScaler(); Xs = sc.fit_transform(X); \ | |
| print(' Training anomaly model...'); \ | |
| am = IsolationForest(contamination=0.1,n_estimators=200,random_state=42,n_jobs=-1); \ | |
| am.fit(Xs); \ | |
| joblib.dump(am, SAVE_DIR/'anomaly_model.pkl'); \ | |
| joblib.dump(sc, SAVE_DIR/'anomaly_scaler.pkl'); \ | |
| joblib.dump(FRAUD_FEATS, SAVE_DIR/'anomaly_features.pkl'); \ | |
| print(' Anomaly model saved.'); \ | |
| \ | |
| print(' Setting up similarity model...'); \ | |
| (SAVE_DIR/'similarity_model_name.txt').write_text('all-MiniLM-L6-v2'); \ | |
| joblib.dump({'model_name':'all-MiniLM-L6-v2','embedding_dim':384}, SAVE_DIR/'similarity_meta.pkl'); \ | |
| print(' Similarity model saved.'); \ | |
| \ | |
| print(' Setting up chat model...'); \ | |
| (SAVE_DIR/'chat_model_name.txt').write_text('typeform/distilbert-base-uncased-mnli'); \ | |
| print('All models trained and saved!') \ | |
| " | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # BUILD STEP 4: Verify core model files exist — image model is optional | |
| # Checks every artifact that BUILD STEP 3 writes (models, feature lists, the | |
| # fraud label map and the NLP model-name files). A missing file trips the | |
| # assert, python exits non-zero and the image build fails. On success the | |
| # contents of saved_models/ are listed with their sizes. | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| RUN python -c "\ | |
| import os; from pathlib import Path; \ | |
| required = ['saved_models/fraud_rf.pkl','saved_models/fraud_xgb.pkl', \ | |
| 'saved_models/fraud_lgb.pkl','saved_models/fraud_features.pkl', \ | |
| 'saved_models/fraud_label_map.pkl', \ | |
| 'saved_models/trust_model.pkl','saved_models/trust_features.pkl', \ | |
| 'saved_models/anomaly_model.pkl','saved_models/anomaly_scaler.pkl', \ | |
| 'saved_models/anomaly_features.pkl', \ | |
| 'saved_models/similarity_meta.pkl','saved_models/similarity_model_name.txt', \ | |
| 'saved_models/chat_model_name.txt']; \ | |
| missing = [f for f in required if not os.path.exists(f)]; \ | |
| assert not missing, f'Build failed - missing: {missing}'; \ | |
| files = sorted(Path('saved_models').iterdir()); \ | |
| print(f'Build OK - {len(files)} model files:'); \ | |
| print(*(f' {f.name}: {f.stat().st_size/1024:.1f} KB' for f in files), sep=os.linesep) \ | |
| " | |
| # Runtime settings: offline mode — the models were already cached during the build. | |
| # NOTE(review): no USER instruction is visible in this file; confirm that the user the | |
| # platform runs the container as can read the Hugging Face cache populated at build time. | |
| ENV TRANSFORMERS_OFFLINE=1 \ | |
| HF_DATASETS_OFFLINE=1 | |
| EXPOSE 7860 | |
| # Container is reported unhealthy when GET /health on port 7860 raises (python exits non-zero). | |
| HEALTHCHECK --interval=30s --timeout=10s --retries=3 \ | |
| CMD python -c "from urllib.request import urlopen; urlopen('http://localhost:7860/health')" || exit 1 | |
| CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"] | |