Spaces:

arpit-gour02
/

document-classification-demo

Sleeping

App Files Files Community

arpit-gour02 committed on Dec 15, 2025

Commit

14fe11e

unverified ·

1 Parent(s): 6239f77

Update app.py

Browse files

Files changed (1) hide show

app.py +122 -30

app.py CHANGED Viewed

@@ -1,72 +1,164 @@
 import gradio as gr
 import torch
-from torchvision import models, transforms
-# --- 1. CONFIGURATION ---
-MODEL_FILENAME = "resnet50_epoch_4.pth" # Loading locally now
 class_names = [
     'letter', 'form', 'email', 'handwritten', 'advertisement', 'scientific report',
     'scientific publication', 'specification', 'file folder', 'news article',
     'budget', 'invoice', 'presentation', 'questionnaire', 'resume', 'memo'
 ]
-# --- 2. LOAD MODEL LOCALLY ---
-def load_model_locally():
-    print(f"Loading {MODEL_FILENAME} from local disk...")
-    # Initialize Standard Architecture
-    model = models.resnet50(num_classes=16)
-    # Load the checkpoint locally
     checkpoint = torch.load(MODEL_FILENAME, map_location=torch.device('cpu'))
-    # Handle if it's nested in 'state_dict'
     if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
-        state_dict = checkpoint['state_dict']
     else:
-        state_dict = checkpoint
-    # --- THE FIX: RENAME KEYS ---
-    # We must still rename 'shortcut' -> 'downsample' because your file
-    # has custom names, but we are using the standard torchvision model here.
-    new_state_dict = {}
-    for key, value in state_dict.items():
-        new_key = key.replace("shortcut", "downsample")
-        new_state_dict[new_key] = value
-    # ----------------------------
-    model.load_state_dict(new_state_dict)
     model.eval()
     return model
-model = load_model_locally()
-# --- 3. PREPROCESSING ---
 transform = transforms.Compose([
     transforms.Resize((224, 224)),
     transforms.ToTensor(),
     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
 ])
-# --- 4. PREDICTION FUNCTION ---
 def predict(image):
-    if image is None:
-        return None
     image_tensor = transform(image).unsqueeze(0)
     with torch.no_grad():
         outputs = model(image_tensor)
         probabilities = torch.nn.functional.softmax(outputs[0], dim=0)
     return {class_names[i]: float(probabilities[i]) for i in range(len(class_names))}
-# --- 5. LAUNCH INTERFACE ---
 interface = gr.Interface(
     fn=predict,
     inputs=gr.Image(type="pil"),
     outputs=gr.Label(num_top_classes=3),
     title="Document Classifier (ResNet50)",
-    description="Classifies documents into 16 categories.",
     examples=[
         ["1.png"],
         ["5022.png"],

 import gradio as gr
 import torch
+import torch.nn as nn
+from torchvision import transforms
+from PIL import Image
+# ==========================================
+# 1. YOUR CUSTOM MODEL ARCHITECTURE
+# ==========================================
+class BottleneckBlock(nn.Module):
+    expansion = 4
+    def __init__(self, in_channels, mid_channels, stride=1):
+        super(BottleneckBlock, self).__init__()
+        out_channels = mid_channels * self.expansion
+        self.conv1 = nn.Conv2d(in_channels, mid_channels, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(mid_channels)
+        self.conv2 = nn.Conv2d(mid_channels, mid_channels, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(mid_channels)
+        self.conv3 = nn.Conv2d(mid_channels, out_channels, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_channels)
+        self.relu = nn.ReLU(inplace=True)
+        self.shortcut = nn.Sequential()
+        if stride != 1 or in_channels != out_channels:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(out_channels)
+            )
+    def forward(self, x):
+        identity = x
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+        out = self.conv3(out)
+        out = self.bn3(out)
+        identity = self.shortcut(identity)
+        out += identity
+        out = self.relu(out)
+        return out
+class ResNet50(nn.Module):
+    def __init__(self, num_classes=16, channels_img=3):
+        super(ResNet50, self).__init__()
+        self.in_channels = 64
+        self.conv1 = nn.Conv2d(channels_img, 64, kernel_size=7, stride=2, padding=3, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(mid_channels=64, num_blocks=3, stride=1)
+        self.layer2 = self._make_layer(mid_channels=128, num_blocks=4, stride=2)
+        self.layer3 = self._make_layer(mid_channels=256, num_blocks=6, stride=2)
+        self.layer4 = self._make_layer(mid_channels=512, num_blocks=3, stride=2)
+        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+        self.fc = nn.Linear(512 * 4, num_classes)
+    def _make_layer(self, mid_channels, num_blocks, stride):
+        layers = []
+        layers.append(BottleneckBlock(self.in_channels, mid_channels, stride))
+        self.in_channels = mid_channels * 4
+        for _ in range(num_blocks - 1):
+            layers.append(BottleneckBlock(self.in_channels, mid_channels, stride=1))
+        return nn.Sequential(*layers)
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.avgpool(x)
+        x = torch.flatten(x, 1)
+        x = self.fc(x)
+        return x
+# ==========================================
+# 2. CONFIG & LOADING
+# ==========================================
+MODEL_FILENAME = "resnet50_epoch_5.pth"
 class_names = [
     'letter', 'form', 'email', 'handwritten', 'advertisement', 'scientific report',
     'scientific publication', 'specification', 'file folder', 'news article',
     'budget', 'invoice', 'presentation', 'questionnaire', 'resume', 'memo'
 ]
+def load_model():
+    print(f"Loading {MODEL_FILENAME}...")
+    # Initialize YOUR Custom ResNet50
+    model = ResNet50(num_classes=16)
+    # Load weights (CPU is sufficient for inference)
     checkpoint = torch.load(MODEL_FILENAME, map_location=torch.device('cpu'))
+    # Handle dictionary nesting if present
     if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
+        model.load_state_dict(checkpoint['state_dict'])
     else:
+        model.load_state_dict(checkpoint)
     model.eval()
     return model
+# Load the model once at startup
+model = load_model()
+# ==========================================
+# 3. PREPROCESSING & INTERFACE
+# ==========================================
+# Standard ImageNet transforms
 transform = transforms.Compose([
     transforms.Resize((224, 224)),
     transforms.ToTensor(),
     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
 ])
 def predict(image):
+    if image is None: return None
     image_tensor = transform(image).unsqueeze(0)
     with torch.no_grad():
         outputs = model(image_tensor)
         probabilities = torch.nn.functional.softmax(outputs[0], dim=0)
     return {class_names[i]: float(probabilities[i]) for i in range(len(class_names))}
+# Gradio UI
 interface = gr.Interface(
     fn=predict,
     inputs=gr.Image(type="pil"),
     outputs=gr.Label(num_top_classes=3),
     title="Document Classifier (ResNet50)",
+    description="Custom ResNet50 trained on RVL-CDIP to classify 16 document types.",
     examples=[
         ["1.png"],
         ["5022.png"],