Spaces:

iharshyadav
/

SmartCertify-ML

Sleeping

App Files Files Community

Harsh Yadav committed on 27 days ago

Commit

165fd8b

1 Parent(s): ba6d7cd

fix: remove CNN from build (OOM), inline tabular training, ELA heuristic fallback for image

Browse files

Files changed (5) hide show

Dockerfile +88 -46
app/api/routes/image_analysis.py +97 -40
app/models/model_store.py +16 -14
app/models/train_all.py +17 -4
requirements.txt +1 -1

Dockerfile CHANGED Viewed

@@ -36,7 +36,6 @@ RUN python -m app.data.generate_synthetic
 # ─────────────────────────────────────────────────────────────────────────────
 # BUILD STEP 2: Pre-download NLP models from HuggingFace
-# NOTE: NO offline env vars yet — we need network access here
 # ─────────────────────────────────────────────────────────────────────────────
 RUN python -c "\
 from sentence_transformers import SentenceTransformer; \
@@ -49,64 +48,107 @@ print('NLP models downloaded.') \
 "
 # ─────────────────────────────────────────────────────────────────────────────
-# BUILD STEP 3: Pre-download real certificate image datasets from HuggingFace
-# These will be cached in HF_HOME for use during training
 # ─────────────────────────────────────────────────────────────────────────────
 RUN python -c "\
-print('Pre-caching HF image datasets...'); \
-from app.data.load_hf_images import load_authentic_images, load_tampered_images; \
-auth = load_authentic_images(n_max=300); \
-tamp = load_tampered_images(n_max=150); \
-print(f'Cached {len(auth)} authentic + {len(tamp)} tampered images'); \
-"
-# ─────────────────────────────────────────────────────────────────────────────
-# BUILD STEP 3.5: Pre-download ResNet18 weights
-# ─────────────────────────────────────────────────────────────────────────────
-RUN python -c "\
-import torchvision.models as tv_models; \
-print('Downloading ResNet18 weights...'); \
-tv_models.resnet18(weights=tv_models.ResNet18_Weights.DEFAULT); \
-print('ResNet18 weights downloaded.') \
 "
 # ─────────────────────────────────────────────────────────────────────────────
-# BUILD STEP 4: Train all models (uses cached data — no network calls)
-# ─────────────────────────────────────────────────────────────────────────────
-RUN python -m app.models.train_all
-# ─────────────────────────────────────────────────────────────────────────────
-# BUILD STEP 5: Verify all required model files exist — fail build if missing
 # ─────────────────────────────────────────────────────────────────────────────
 RUN python -c "\
-import os; \
-from pathlib import Path; \
-required = [ \
-    'saved_models/fraud_rf.pkl', \
-    'saved_models/fraud_xgb.pkl', \
-    'saved_models/fraud_lgb.pkl', \
-    'saved_models/fraud_features.pkl', \
-    'saved_models/image_model.pt', \
-    'saved_models/image_classifier_head.pkl', \
-    'saved_models/trust_model.pkl', \
-    'saved_models/trust_features.pkl', \
-    'saved_models/anomaly_model.pkl', \
-    'saved_models/anomaly_scaler.pkl', \
-    'saved_models/anomaly_features.pkl', \
-    'saved_models/similarity_model_name.txt', \
-]; \
 missing = [f for f in required if not os.path.exists(f)]; \
 assert not missing, f'Build failed - missing: {missing}'; \
 files = list(Path('saved_models').iterdir()); \
-print(f'Build OK - {len(files)} model files saved'); \
 [print(f'  {f.name}: {f.stat().st_size/1024:.1f} KB') for f in sorted(files)] \
 "
-# ─────────────────────────────────────────────────────────────────────────────
-# NOW set offline mode — only takes effect at RUNTIME, not during build steps
-# This prevents any network calls per inference request
-# ─────────────────────────────────────────────────────────────────────────────
 ENV TRANSFORMERS_OFFLINE=1
 ENV HF_DATASETS_OFFLINE=1

 # ─────────────────────────────────────────────────────────────────────────────
 # BUILD STEP 2: Pre-download NLP models from HuggingFace
 # ─────────────────────────────────────────────────────────────────────────────
 RUN python -c "\
 from sentence_transformers import SentenceTransformer; \
 "
 # ─────────────────────────────────────────────────────────────────────────────
+# BUILD STEP 3: Train tabular models ONLY (fraud + trust + anomaly + similarity)
+# CNN skipped at build time — image analysis uses ELA heuristics at runtime.
+# This avoids OOM from holding thousands of PIL images in memory.
 # ─────────────────────────────────────────────────────────────────────────────
 RUN python -c "\
+import os; \
+os.environ.setdefault('LOKY_MAX_CPU_COUNT', '2'); \
+import joblib, pandas as pd; \
+from pathlib import Path; \
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, IsolationForest; \
+from sklearn.model_selection import train_test_split; \
+from sklearn.preprocessing import LabelEncoder, StandardScaler; \
+import xgboost as xgb; \
+import lightgbm as lgb; \
+\
+SAVE_DIR = Path('saved_models'); \
+df = pd.read_csv('data/synthetic_certificates.csv'); \
+print(f'Training on {len(df)} rows...'); \
+\
+FRAUD_FEATS = ['issuer_reputation_score','template_match_score','metadata_completeness_score', \
+    'domain_verification_status','previous_verification_count','cert_age_days', \
+    'issuer_cert_count','has_expiry','name_length','course_name_length', \
+    'total_certificates_issued','fraud_rate_historical','avg_metadata_completeness', \
+    'domain_age_days','verification_success_rate']; \
+TRUST_FEATS = ['total_certificates_issued','fraud_rate_historical','avg_metadata_completeness', \
+    'domain_age_days','verification_success_rate']; \
+\
+le = LabelEncoder(); \
+y = le.fit_transform(df['label']); \
+label_map = {l:i for i,l in enumerate(le.classes_)}; \
+X = df[FRAUD_FEATS].fillna(0); \
+Xtr,Xte,ytr,yte = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y); \
+\
+print('  Training RandomForest...'); \
+rf = RandomForestClassifier(n_estimators=200,max_depth=12,n_jobs=-1,random_state=42); \
+rf.fit(Xtr,ytr); \
+print('  Training XGBoost...'); \
+xm = xgb.XGBClassifier(n_estimators=200,max_depth=6,learning_rate=0.1, \
+    eval_metric='mlogloss',random_state=42,verbosity=0); \
+xm.fit(Xtr,ytr); \
+print('  Training LightGBM...'); \
+lm = lgb.LGBMClassifier(n_estimators=200,max_depth=8,learning_rate=0.1, \
+    random_state=42,verbose=-1); \
+lm.fit(Xtr,ytr); \
+joblib.dump(rf, SAVE_DIR/'fraud_rf.pkl'); \
+joblib.dump(xm, SAVE_DIR/'fraud_xgb.pkl'); \
+joblib.dump(lm, SAVE_DIR/'fraud_lgb.pkl'); \
+joblib.dump(FRAUD_FEATS, SAVE_DIR/'fraud_features.pkl'); \
+joblib.dump(label_map, SAVE_DIR/'fraud_label_map.pkl'); \
+print('  Fraud models saved.'); \
+\
+Xt = df[TRUST_FEATS].fillna(0); yt = df['trust_score'].fillna(0.5); \
+Xtr2,Xte2,ytr2,yte2 = train_test_split(Xt,yt,test_size=0.2,random_state=42); \
+print('  Training trust model...'); \
+tm = GradientBoostingRegressor(n_estimators=200,max_depth=5,learning_rate=0.05,random_state=42); \
+tm.fit(Xtr2,ytr2); \
+joblib.dump(tm, SAVE_DIR/'trust_model.pkl'); \
+joblib.dump(TRUST_FEATS, SAVE_DIR/'trust_features.pkl'); \
+print('  Trust model saved.'); \
+\
+sc = StandardScaler(); Xs = sc.fit_transform(X); \
+print('  Training anomaly model...'); \
+am = IsolationForest(contamination=0.1,n_estimators=200,random_state=42,n_jobs=-1); \
+am.fit(Xs); \
+joblib.dump(am, SAVE_DIR/'anomaly_model.pkl'); \
+joblib.dump(sc, SAVE_DIR/'anomaly_scaler.pkl'); \
+joblib.dump(FRAUD_FEATS, SAVE_DIR/'anomaly_features.pkl'); \
+print('  Anomaly model saved.'); \
+\
+from sentence_transformers import SentenceTransformer; \
+print('  Setting up similarity model...'); \
+sim = SentenceTransformer('all-MiniLM-L6-v2'); \
+(SAVE_DIR/'similarity_model_name.txt').write_text('all-MiniLM-L6-v2'); \
+joblib.dump({'model_name':'all-MiniLM-L6-v2','embedding_dim':384}, SAVE_DIR/'similarity_meta.pkl'); \
+print('  Similarity model saved.'); \
+\
+from transformers import pipeline as hf_pipeline; \
+print('  Setting up chat model...'); \
+clf = hf_pipeline('zero-shot-classification',model='typeform/distilbert-base-uncased-mnli',device=-1); \
+(SAVE_DIR/'chat_model_name.txt').write_text('typeform/distilbert-base-uncased-mnli'); \
+print('All models trained and saved!') \
 "
 # ─────────────────────────────────────────────────────────────────────────────
+# BUILD STEP 4: Verify core model files exist — image model is optional
 # ─────────────────────────────────────────────────────────────────────────────
 RUN python -c "\
+import os; from pathlib import Path; \
+required = ['saved_models/fraud_rf.pkl','saved_models/fraud_xgb.pkl', \
+    'saved_models/fraud_lgb.pkl','saved_models/fraud_features.pkl', \
+    'saved_models/trust_model.pkl','saved_models/trust_features.pkl', \
+    'saved_models/anomaly_model.pkl','saved_models/anomaly_scaler.pkl', \
+    'saved_models/anomaly_features.pkl']; \
 missing = [f for f in required if not os.path.exists(f)]; \
 assert not missing, f'Build failed - missing: {missing}'; \
 files = list(Path('saved_models').iterdir()); \
+print(f'Build OK — {len(files)} model files:'); \
 [print(f'  {f.name}: {f.stat().st_size/1024:.1f} KB') for f in sorted(files)] \
 "
+# Set offline mode for runtime — models are already cached
 ENV TRANSFORMERS_OFFLINE=1
 ENV HF_DATASETS_OFFLINE=1

app/api/routes/image_analysis.py CHANGED Viewed

@@ -1,7 +1,12 @@
 """
 image_analysis.py — Certificate image tampering detection.
-POST /api/ml/analyze-image — ResNet-18 CNN (fine-tuned).
-ELA stats included in analysis field for additional context.
 """
 from __future__ import annotations
@@ -10,33 +15,91 @@ import io
 import time
 from typing import Optional
-import torch
-import torchvision.transforms as transforms
 from fastapi import APIRouter, Depends
 from PIL import Image
 from pydantic import BaseModel
 from app.api.middleware.auth import verify_api_key
-from app.models.model_store import get_image_model
 from app.utils.ela import extract_ela_features, get_channel_means
 router = APIRouter()
-_TRANSFORM = transforms.Compose([
-    transforms.Resize((224, 224)),
-    transforms.ToTensor(),
-    transforms.Normalize(
-        mean=[0.485, 0.456, 0.406],
-        std=[0.229, 0.224, 0.225],
-    ),
-])
 class ImageRequest(BaseModel):
     image_base64: str
     certificate_id: Optional[str] = "unknown"
 @router.post("/analyze-image")
 async def analyze_image(
     req: ImageRequest,
@@ -46,38 +109,35 @@ async def analyze_image(
     certificate_id = req.certificate_id or "unknown"
     try:
-        # 1. Decode base64 → PIL Image
         b64 = req.image_base64
         if "," in b64:
             b64 = b64.split(",")[1]
-        b64 += "=" * (-len(b64) % 4)  # fix padding
         img_bytes = base64.b64decode(b64)
         img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
-        # 2. ResNet-18 inference
-        model = get_image_model()
-        tensor = _TRANSFORM(img).unsqueeze(0)  # (1, 3, 224, 224)
-        with torch.no_grad():
-            logits = model(tensor)
-            probs = torch.softmax(logits, dim=1)[0]  # [p_authentic, p_tampered]
-            tamper_prob = float(probs[1])
-            confidence = float(probs.max())
-        # 3. ELA stats for the analysis field (supplementary visual info)
-        ela_features, ela_arr = extract_ela_features(img)
-        channel_means = get_channel_means(ela_arr)
         return {
             "certificate_id": certificate_id,
-            "is_tampered": tamper_prob > 0.5,
-            "tamper_probability": round(tamper_prob, 4),
-            "confidence": round(confidence, 4),
             "analysis": {
-                "mean_brightness": round(float(ela_features[0]), 4),
-                "std_brightness":  round(float(ela_features[1]), 4),
-                "channel_means":   [round(x, 4) for x in channel_means],
             },
-            "method": "ResNet-18 CNN (fine-tuned on synthetic certs)",
             "latency_ms": round((time.time() - t0) * 1000, 2),
         }
@@ -87,11 +147,8 @@ async def analyze_image(
             "is_tampered": False,
             "tamper_probability": 0.0,
             "confidence": 0.0,
-            "analysis": {
-                "mean_brightness": 0.0,
-                "std_brightness": 0.0,
-                "channel_means": [0.0, 0.0, 0.0],
-            },
             "method": "error",
             "latency_ms": round((time.time() - t0) * 1000, 2),
             "error": str(e),

 """
 image_analysis.py — Certificate image tampering detection.
+POST /api/ml/analyze-image
+Strategy:
+  1. If image_model.pt exists → ResNet-18 CNN inference
+  2. Fallback → ELA (Error Level Analysis) heuristic
+     ELA is a well-established forensic technique: tampered pixels
+     have higher residual after JPEG re-compression.
 """
 from __future__ import annotations
 import time
 from typing import Optional
 from fastapi import APIRouter, Depends
 from PIL import Image
 from pydantic import BaseModel
 from app.api.middleware.auth import verify_api_key
 from app.utils.ela import extract_ela_features, get_channel_means
 router = APIRouter()
 class ImageRequest(BaseModel):
     image_base64: str
     certificate_id: Optional[str] = "unknown"
+def _ela_heuristic(img: Image.Image) -> dict:
+    """
+    ELA-based tampering detector — no CNN needed.
+
+    Computes ELA features for ``img`` (re-compression quality 90) and turns
+    three aggregate statistics into an additive suspicion score:
+      ELA mean > 8  → +0.35
+      ELA std  > 12 → +0.35
+      ELA max  > 60 → +0.30
+    The original author states the thresholds come from forensic literature;
+    that cannot be verified from this file.
+
+    Returns a dict with:
+      tamper_prob    — the score, capped at 1.0 and rounded to 4 places
+      confidence     — 0.65 + |score - 0.5| * 0.35 (so at most 0.825)
+      mean_ela       — averaged mean statistic
+      std_ela        — averaged std statistic
+      channel_means  — values from get_channel_means(), rounded to 4 places
+      method         — fixed label identifying the heuristic path
+    """
+    ela_features, ela_arr = extract_ela_features(img, quality=90)
+    channel_means = get_channel_means(ela_arr)
+    # Use all-channel stats.
+    # NOTE(review): the stride-4 slices assume the feature vector is laid out
+    # in groups of four per channel with mean/std/max at offsets 0/1/2 —
+    # confirm against extract_ela_features.
+    mean_ela = float(ela_features[0::4].mean())   # mean per channel avg
+    std_ela  = float(ela_features[1::4].mean())   # std per channel avg
+    max_ela  = float(ela_features[2::4].mean())   # max per channel avg
+    # Score: 0 → authentic, 1 → tampered
+    score = 0.0
+    if mean_ela > 8:
+        score += 0.35
+    if std_ela > 12:
+        score += 0.35
+    if max_ela > 60:
+        score += 0.30
+    # Guard against floating-point drift above 1.0 when all three rules fire.
+    score = min(score, 1.0)
+    return {
+        "tamper_prob": round(score, 4),
+        # Confidence grows with distance from the 0.5 decision boundary.
+        "confidence": round(0.65 + abs(score - 0.5) * 0.35, 4),
+        "mean_ela": round(mean_ela, 4),
+        "std_ela": round(std_ela, 4),
+        "channel_means": [round(x, 4) for x in channel_means],
+        "method": "ELA heuristic (forensic analysis)",
+    }
+def _cnn_inference(img: Image.Image) -> dict:
+    """ResNet-18 CNN inference — used only when model file exists.
+
+    Resizes ``img`` to 224x224, converts it to a normalized tensor, runs the
+    model returned by ``get_image_model()`` with gradients disabled, and
+    applies softmax over dim 1 of the logits.
+
+    Returns a dict with the same keys as the ELA heuristic path:
+    ``tamper_prob``, ``confidence``, ``mean_ela``, ``std_ela``,
+    ``channel_means`` and ``method``.
+    """
+    # Imported locally rather than at module level — presumably so the ELA
+    # path works without loading torch; confirm intent with the author.
+    import torch
+    import torchvision.transforms as transforms
+    from app.models.model_store import get_image_model
+    _TRANSFORM = transforms.Compose([
+        transforms.Resize((224, 224)),
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                             std=[0.229, 0.224, 0.225]),
+    ])
+    # ELA stats are supplementary here; they do not influence tamper_prob.
+    ela_features, ela_arr = extract_ela_features(img)
+    channel_means = get_channel_means(ela_arr)
+    # NOTE(review): get_image_model() returns None when image_model.pt is
+    # missing, in which case the call below raises; the caller is expected
+    # to catch that and fall back to the ELA heuristic — verify.
+    model = get_image_model()
+    tensor = _TRANSFORM(img).unsqueeze(0)  # add a leading batch dimension
+    with torch.no_grad():
+        logits = model(tensor)
+        probs = torch.softmax(logits, dim=1)[0]
+        # Index 1 is treated as the "tampered" class.
+        tamper_prob = float(probs[1])
+        confidence = float(probs.max())
+    return {
+        "tamper_prob": round(tamper_prob, 4),
+        "confidence": round(confidence, 4),
+        # NOTE(review): uses only the first two feature entries, whereas the
+        # ELA heuristic path averages strided entries for the same keys.
+        "mean_ela": round(float(ela_features[0]), 4),
+        "std_ela":  round(float(ela_features[1]), 4),
+        "channel_means": [round(x, 4) for x in channel_means],
+        "method": "ResNet-18 CNN (fine-tuned on synthetic certs)",
+    }
 @router.post("/analyze-image")
 async def analyze_image(
     req: ImageRequest,
     certificate_id = req.certificate_id or "unknown"
     try:
+        # Decode base64 → PIL Image
         b64 = req.image_base64
         if "," in b64:
             b64 = b64.split(",")[1]
+        b64 += "=" * (-len(b64) % 4)
         img_bytes = base64.b64decode(b64)
         img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
+        # Try CNN first, fall back to ELA heuristic
+        try:
+            from pathlib import Path
+            if (Path("saved_models") / "image_model.pt").exists():
+                result = _cnn_inference(img)
+            else:
+                result = _ela_heuristic(img)
+        except Exception:
+            result = _ela_heuristic(img)
         return {
             "certificate_id": certificate_id,
+            "is_tampered": result["tamper_prob"] > 0.5,
+            "tamper_probability": result["tamper_prob"],
+            "confidence": result["confidence"],
             "analysis": {
+                "mean_brightness": result["mean_ela"],
+                "std_brightness":  result["std_ela"],
+                "channel_means":   result["channel_means"],
             },
+            "method": result["method"],
             "latency_ms": round((time.time() - t0) * 1000, 2),
         }
             "is_tampered": False,
             "tamper_probability": 0.0,
             "confidence": 0.0,
+            "analysis": {"mean_brightness": 0.0, "std_brightness": 0.0,
+                         "channel_means": [0.0, 0.0, 0.0]},
             "method": "error",
             "latency_ms": round((time.time() - t0) * 1000, 2),
             "error": str(e),

app/models/model_store.py CHANGED Viewed

@@ -42,8 +42,12 @@ def get_fraud_models():
 # ── Image Tampering (ResNet-18 CNN) ───────────────────────────
 @lru_cache(maxsize=1)
-def get_image_model() -> nn.Module:
-    """Load ResNet-18 fine-tuned for binary tamper classification."""
     m = tv_models.resnet18(weights=None)
     m.fc = nn.Sequential(
         nn.Linear(m.fc.in_features, 256),
@@ -51,12 +55,6 @@ def get_image_model() -> nn.Module:
         nn.Dropout(0.3),
         nn.Linear(256, 2),
     )
-    state_path = MODEL_DIR / "image_model.pt"
-    if not state_path.exists():
-        raise FileNotFoundError(
-            f"image_model.pt not found at {state_path}. "
-            "Rebuild Docker image to retrain."
-        )
     state = torch.load(str(state_path), map_location=DEVICE)
     m.load_state_dict(state)
     m.eval()
@@ -117,10 +115,14 @@ def get_anomaly_models():
 def load_all_models() -> None:
     """Preload all models into lru_cache at startup."""
     print("Preloading all models into memory...")
-    get_fraud_models();     print("  ✓ fraud models (RF+XGB+LGB)")
-    get_image_model();      print("  ✓ ResNet-18 CNN")
-    get_similarity_model(); print("  ✓ sentence-transformers")
-    get_chat_model();       print("  ✓ DistilBERT zero-shot")
-    get_trust_models();     print("  ✓ trust model (GBR)")
-    get_anomaly_models();   print("  ✓ anomaly model (IsoForest)")
     print("All models ready.")

 # ── Image Tampering (ResNet-18 CNN) ───────────────────────────
 @lru_cache(maxsize=1)
+def get_image_model():
+    """Load ResNet-18 — returns None if model file not found (ELA fallback used)."""
+    import torch.nn as nn
+    state_path = MODEL_DIR / "image_model.pt"
+    if not state_path.exists():
+        return None  # image_analysis.py will use ELA heuristic
     m = tv_models.resnet18(weights=None)
     m.fc = nn.Sequential(
         nn.Linear(m.fc.in_features, 256),
         nn.Dropout(0.3),
         nn.Linear(256, 2),
     )
     state = torch.load(str(state_path), map_location=DEVICE)
     m.load_state_dict(state)
     m.eval()
 def load_all_models() -> None:
     """Preload all models into lru_cache at startup."""
     print("Preloading all models into memory...")
+    get_fraud_models();     print("  \u2713 fraud models (RF+XGB+LGB)")
+    # The image model is optional: get_image_model() returns None when the
+    # weights file is absent. It is decorated with lru_cache, so that None
+    # stays cached for the life of the process.
+    img = get_image_model()
+    if img is not None:
+        print("  \u2713 ResNet-18 CNN")
+    else:
+        print("  ~ image model not found — using ELA heuristic")
+    # The remaining loaders are called unguarded; any exception they raise
+    # propagates to the caller.
+    get_similarity_model(); print("  \u2713 sentence-transformers")
+    get_chat_model();       print("  \u2713 DistilBERT zero-shot")
+    get_trust_models();     print("  \u2713 trust model (GBR)")
+    get_anomaly_models();   print("  \u2713 anomaly model (IsoForest)")
     print("All models ready.")

app/models/train_all.py CHANGED Viewed

@@ -208,7 +208,7 @@ def train_image_model() -> None:
     print(f"  Created {len(tampered_from_real)} tampered versions of real certs")
     # ── Step 2: Generate synthetic PIL images to fill volume ──────────────────
-    N_SYNTHETIC_PER_CLASS = 1_500  # 3,000 synthetic images — fits in HF build timeout
     print(f"\n  [Phase 2] Generating {N_SYNTHETIC_PER_CLASS * 2} synthetic images...")
     all_images = []   # PIL Images
@@ -466,9 +466,22 @@ def main() -> None:
     train_fraud_model(df)
     train_trust_model(df)
     train_anomaly_model(df)
-    train_image_model()
-    train_similarity_model(df)
-    setup_chat_model()
     elapsed = time.time() - t0
     print("\n" + "=" * 60)

     print(f"  Created {len(tampered_from_real)} tampered versions of real certs")
     # ── Step 2: Generate synthetic PIL images to fill volume ──────────────────
+    N_SYNTHETIC_PER_CLASS = 800  # 1600 synthetic images — fits safely in HF build memory
     print(f"\n  [Phase 2] Generating {N_SYNTHETIC_PER_CLASS * 2} synthetic images...")
     all_images = []   # PIL Images
     train_fraud_model(df)
     train_trust_model(df)
     train_anomaly_model(df)
+    try:
+        train_image_model()
+    except Exception as e:
+        print(f"  WARNING: Image model training failed: {e}")
+        print("  Skipping image model — API will use heuristic fallback.")
+    try:
+        train_similarity_model(df)
+    except Exception as e:
+        print(f"  WARNING: Similarity model failed: {e}")
+    try:
+        setup_chat_model()
+    except Exception as e:
+        print(f"  WARNING: Chat model setup failed: {e}")
     elapsed = time.time() - t0
     print("\n" + "=" * 60)

requirements.txt CHANGED Viewed

@@ -7,7 +7,7 @@ httpx>=0.27.0
 # ── Classical ML (tabular — fraud, trust, anomaly) ───────────
 scikit-learn>=1.4.0
-xgboost>=2.0.0
 lightgbm>=4.0.0
 imbalanced-learn>=0.12.0
 joblib>=1.3.0

 # ── Classical ML (tabular — fraud, trust, anomaly) ───────────
 scikit-learn>=1.4.0
+xgboost>=2.0.0,<3.0.0
 lightgbm>=4.0.0
 imbalanced-learn>=0.12.0
 joblib>=1.3.0