---
language: en
tags:
- image-classification
- document-classification
- tensorflow
- efficientnet
- computer-vision
license: mit
pipeline_tag: image-classification
library_name: tf-keras
---

# Document Classifier

A Keras EfficientNet model for classifying real-world document images into structured categories. It ships with a full validation pipeline covering image-quality checks and AI/fake-image detection.

---

## How to use this model

```python
| # Step 1 — Install dependencies |
| # pip install huggingface_hub tensorflow opencv-python pillow |
| |
| # Step 2 — Copy and run this complete code |
| |
| from huggingface_hub import snapshot_download |
| import tensorflow as tf |
| import numpy as np |
| import cv2 |
| import json |
| from tensorflow.keras.applications.efficientnet import preprocess_input |
| |
| # Download model from Hugging Face (cached after first run) |
| local_path = snapshot_download(repo_id="shailgsits/document-classifier") |
| |
| # Load model + class labels |
| model = tf.saved_model.load(local_path) |
| infer = model.signatures["serving_default"] |
| |
| with open(f"{local_path}/class_index.json") as f: |
| class_indices = json.load(f) |
| LABELS = {int(v): k for k, v in class_indices.items()} |
| |
| DOCUMENT_TYPE_LABELS = { |
| "1_visiting_card": "Visiting Card", |
| "2_prescription": "Prescription", |
| "3_shop_banner": "Shop Banner", |
| "4_invalid_image": "Invalid", |
| } |
| |
| def predict(image_path: str) -> dict: |
| img = cv2.imread(image_path) |
| if img is None: |
| return {"status": "ERROR", "message": "Could not read image"} |
| |
| img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) |
| resized = cv2.resize(img_rgb, (224, 224)) |
| input_arr = np.expand_dims(resized.astype(np.float32), axis=0) |
| input_arr = preprocess_input(input_arr) |
| |
| outputs = infer(tf.constant(input_arr)) |
| preds = list(outputs.values())[0].numpy()[0] |
| class_id = int(np.argmax(preds)) |
| confidence = float(np.max(preds)) |
| label = LABELS.get(class_id, "unknown") |
| friendly = DOCUMENT_TYPE_LABELS.get(label, label) |
| |
| return { |
| "status": "VALID" if confidence >= 0.75 else "LOW_CONFIDENCE", |
| "document_type": label, |
| "document_type_label": friendly, |
| "confidence": round(confidence * 100, 2), |
| "all_scores": { |
| DOCUMENT_TYPE_LABELS.get(LABELS[i], LABELS[i]): round(float(p) * 100, 2) |
| for i, p in enumerate(preds) |
| } |
| } |
| |
| # --- Run prediction --- |
| result = predict("your_image.jpg") |
| print(result) |
| |
| # Example output: |
| # { |
| # 'status': 'VALID', |
| # 'document_type': '1_visiting_card', |
| # 'document_type_label': 'Visiting Card', |
| # 'confidence': 97.43, |
| # 'all_scores': {'Visiting Card': 97.43, 'Prescription': 1.2, 'Shop Banner': 0.9, 'Invalid': 0.47} |
| # } |
```

---

## Supported Document Types

| Label | Description |
|---|---|
| `visiting_card` | Business / name cards |
| `prescription` | Medical prescriptions |
| `shop_banner` | Storefront signage, banners |
| `invalid_image` | Rejected / unrecognized documents |

---

## Files in this repo

| File | Description |
|---|---|
| `document_classifier_final.keras` | Trained Keras model (EfficientNet) |
| `class_index.json` | Class name → index mapping |

---

## Quick Test in Google Colab

```python
| !pip install huggingface_hub tensorflow pillow opencv-python requests -q |
| |
| import tensorflow as tf, numpy as np, cv2, requests, json |
| from PIL import Image |
| from io import BytesIO |
| from huggingface_hub import hf_hub_download |
| from tensorflow.keras.applications.efficientnet import preprocess_input |
| |
| # Load model + class mapping |
| model = tf.keras.models.load_model( |
| hf_hub_download("shailgsits/document-classifier", "document_classifier_final.keras") |
| ) |
| with open(hf_hub_download("shailgsits/document-classifier", "class_index.json")) as f: |
| index_to_label = {v: k.split("_", 1)[1] for k, v in json.load(f).items()} |
| |
| # Predict from any image URL |
| def predict_from_url(url: str): |
| img = np.array(Image.open(BytesIO(requests.get(url).content)).convert("RGB"))[:, :, ::-1] |
| h, w = img.shape[:2] |
| scale = min(224 / w, 224 / h) |
| nw, nh = int(w * scale), int(h * scale) |
| res = cv2.resize(img, (nw, nh)) |
| canvas = np.ones((224, 224, 3), np.uint8) * 255 |
| canvas[(224 - nh) // 2:(224 - nh) // 2 + nh, (224 - nw) // 2:(224 - nw) // 2 + nw] = res |
| input_arr = preprocess_input(np.expand_dims(canvas.astype(np.float32), 0)) |
| pred = model.predict(input_arr)[0] |
| idx = int(np.argmax(pred)) |
| return {"label": index_to_label[idx], "confidence": round(float(pred[idx]) * 100, 2)} |
| |
| # Test with a Google Drive image |
| url = "https://drive.google.com/uc?export=download&id=YOUR_FILE_ID" |
| print(predict_from_url(url)) |
| # {'label': 'visiting_card', 'confidence': 97.43} |
```

---

## Predict from local file (Colab upload)

```python
| from google.colab import files |
| uploaded = files.upload() |
| image_path = list(uploaded.keys())[0] |
| |
| img = cv2.imread(image_path) |
| h, w = img.shape[:2] |
| scale = min(224 / w, 224 / h) |
| nw, nh = int(w * scale), int(h * scale) |
| res = cv2.resize(img, (nw, nh)) |
| canvas = np.ones((224, 224, 3), np.uint8) * 255 |
| canvas[(224 - nh) // 2:(224 - nh) // 2 + nh, (224 - nw) // 2:(224 - nw) // 2 + nw] = res |
| input_arr = preprocess_input(np.expand_dims(canvas.astype(np.float32), 0)) |
| pred = model.predict(input_arr)[0] |
| idx = int(np.argmax(pred)) |
| print({"label": index_to_label[idx], "confidence": round(float(pred[idx]) * 100, 2)}) |
```

---

## Preprocessing Details

Images are resized with **letterboxing** (aspect ratio preserved, white padding) to 224×224, then passed through EfficientNet's `preprocess_input`.


---

## Validation Pipeline

Before inference, every image passes through:

| Check | Condition |
|---|---|
| Blank image | Grayscale std < 12 |
| Blurry image | Laplacian variance < 10 |
| Ruled paper | ≥5 evenly-spaced horizontal lines |
| No text detected | Fewer than 6 connected text components |
| AI metadata | EXIF/XMP contains AI tool keywords |
| Screenshot/UI | >55% near-white pixels |
| AI watermark | OCR detects generator text in bottom strip |
| Gemini sparkle | Sparkle artifact in bottom-right corner |
| AI staged background | Card/background sharpness ratio > 5.0 |
| Perspective tilt | >35% lines in 15°–45° diagonal range |
| DCT frequency | High-freq energy ratio > 0.12 |
| Texture uniformity | Patch variance CV < 0.4 and mean var < 50 |


---

## License

MIT

---

## Author

Developed and trained by **[Shailendra Singh Tiwari](https://www.linkedin.com/in/shailendra-singh-tiwari/)**