Spaces:

Gizachew
/

amst

Build error

App Files Files Community

Gizachew committed on Dec 28, 2024

Commit

84982b3

verified ·

1 Parent(s): 9d93df6

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -176

app.py CHANGED Viewed

@@ -1,201 +1,70 @@
 # app.py
-import os
 import torch
-import torch.nn as nn
-from torchvision import transforms
 from PIL import Image
-import numpy as np
-import gradio as gr
-import timm
-import matplotlib.pyplot as plt
-import matplotlib.patches as patches
-# Optional: If integrating OCR
-# import pytesseract
-# Define the Detection Model Architecture
-class ViTDetectionModel(nn.Module):
-    def __init__(self, num_queries=100, hidden_dim=768):
-        """
-        Initializes the ViTDetectionModel.
-        Args:
-            num_queries (int, optional): Number of detection queries. Defaults to 100.
-            hidden_dim (int, optional): Hidden dimension size. Defaults to 768.
-        """
-        super(ViTDetectionModel, self).__init__()
-        # Configure the ViT model to output features only
-        self.vit = timm.create_model(
-            'vit_base_patch16_224',
-            pretrained=False,  # Set to False since we are loading a trained model
-            num_classes=0,               # Disable classification head
-            features_only=True,          # Return feature maps
-            out_indices=(11,)            # Get the last feature map
-        )
-        self.query_embed = nn.Embedding(num_queries, hidden_dim)
-        self.fc_bbox = nn.Linear(hidden_dim, 8)  # 4 points (x, y) for quadrilateral
-        self.fc_class = nn.Linear(hidden_dim, 1)  # Binary classification
-    def forward(self, x):
-        """
-        Forward pass of the detection model.
-        Args:
-            x (Tensor): Input images [batch, 3, H, W].
-        Returns:
-            Tuple[Tensor, Tensor]: Predicted bounding boxes and class scores.
-        """
-        # Retrieve the feature map
-        features = self.vit(x)[0]  # [batch, hidden_dim, H*W]
-        if features.dim() == 3:
-            batch_size, hidden_dim, num_patches = features.shape
-            grid_size = int(np.sqrt(num_patches))
-            if grid_size * grid_size != num_patches:
-                raise ValueError(f"Number of patches {num_patches} is not a perfect square.")
-            H, W = grid_size, grid_size
-            features = features.view(batch_size, hidden_dim, H, W)
-        elif features.dim() == 4:
-            batch_size, hidden_dim, H, W = features.shape
-        else:
-            raise ValueError(f"Unexpected feature dimensions: {features.dim()}, expected 3 or 4.")
-        # Flatten the spatial dimensions
-        features = features.flatten(2).transpose(1, 2)  # [batch, H*W, hidden_dim]
-        # Prepare query embeddings
-        queries = self.query_embed.weight.unsqueeze(0).repeat(batch_size, 1, 1)  # [batch, num_queries, hidden_dim]
-        # Compute attention weights
-        attn = torch.matmul(features, queries.transpose(-1, -2))  # [batch, H*W, num_queries]
-        attn = torch.softmax(attn, dim=1)  # Softmax over patches
-        # Aggregate features based on attention
-        output = torch.matmul(attn.transpose(-1, -2), features)  # [batch, num_queries, hidden_dim]
-        # Predict bounding boxes and classes
-        bboxes = self.fc_bbox(output)  # [batch, num_queries, 8]
-        classes = self.fc_class(output)  # [batch, num_queries, 1]
-        return bboxes, classes
-# Function to Load the Trained Model
-def load_model(model_path, device):
-    """
-    Loads the trained detection model.
-    Args:
-        model_path (str): Path to the saved model state dictionary.
-        device (torch.device): Device to load the model on.
-    Returns:
-        nn.Module: Loaded detection model.
-    """
-    model = ViTDetectionModel(num_queries=100, hidden_dim=768).to(device)
-    model.load_state_dict(torch.load(model_path, map_location=device))
-    model.eval()
-    return model
-# Function to Perform Text Detection on an Image
-def detect_text(image, model, device, max_boxes=100, confidence_threshold=0.5):
     """
-    Detects text in the input image using the detection model.
-    Args:
-        image (PIL Image): Input image.
-        model (nn.Module): Trained detection model.
-        device (torch.device): Device to run the model on.
-        max_boxes (int, optional): Maximum number of bounding boxes to return. Defaults to 100.
-        confidence_threshold (float, optional): Threshold to filter detections. Defaults to 0.5.
-    Returns:
-        PIL Image: Image with detected bounding boxes drawn.
     """
-    # Define transformation
-    transform = transforms.Compose([
-        transforms.Resize((224, 224)),
-        transforms.ToTensor(),
-    ])
     # Preprocess the image
-    input_tensor = transform(image).unsqueeze(0).to(device)  # [1, 3, 224, 224]
-    # Perform detection
     with torch.no_grad():
-        pred_bboxes, pred_classes = model(input_tensor)  # [1, num_queries, 8], [1, num_queries, 1]
-    # Process predictions
-    pred_bboxes = pred_bboxes.squeeze(0)  # [num_queries, 8]
-    pred_classes = pred_classes.squeeze(0)  # [num_queries, 1]
-    pred_classes_sigmoid = torch.sigmoid(pred_classes)
-    high_conf_indices = (pred_classes_sigmoid > confidence_threshold).squeeze(1).nonzero(as_tuple=False).squeeze(1)
-    selected_indices = high_conf_indices[:max_boxes]
-    selected_bboxes = pred_bboxes[selected_indices]  # [selected, 8]
-    # Denormalize bounding boxes to original image size
-    width, height = image.size
-    scale_x = width / 224
-    scale_y = height / 224
-    boxes = selected_bboxes.cpu().numpy() * np.array([scale_x, scale_y] * 4)  # [selected, 8]
-    # Draw bounding boxes on the image
-    fig, ax = plt.subplots(1, figsize=(12, 12))
-    ax.imshow(image)
-    for box in boxes:
-        polygon = patches.Polygon(box.reshape(-1, 2), linewidth=2, edgecolor='r', facecolor='none')
-        ax.add_patch(polygon)
-    plt.axis('off')
-    # Convert Matplotlib figure to PIL Image
-    fig.canvas.draw()
-    img_with_boxes = Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
-    plt.close(fig)
-    return img_with_boxes
-# Optional: If integrating OCR with pytesseract
-# def detect_and_recognize_text(image, model, device, max_boxes=100, confidence_threshold=0.5):
-#     # Similar to detect_text but includes OCR steps
-#     pass
-# Initialize the model
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-model_path = "finetuned_recog_model.pth"  # Ensure this path matches where the model is stored
-model = load_model(model_path, device)
-print("Model loaded successfully.")
-# Define the Gradio Interface Function
-def gradio_detect(image):
     """
-    Gradio interface function for text detection.
-    Args:
-        image (PIL Image): Uploaded image.
-    Returns:
-        PIL Image: Image with detected bounding boxes.
     """
-    result_image = detect_text(image, model, device)
-    return result_image
-# Create Gradio Interface
 iface = gr.Interface(
-    fn=gradio_detect,
-    inputs=gr.Image(type="pil"),
-    outputs=gr.Image(type="pil"),
-    title="Text Detection with ViT",
-    description="Upload an image, and the model will detect and highlight text regions.",
-    examples=[
-        # You can add URLs or paths to example images here
-        # "https://example.com/image1.jpg",
-        # "https://example.com/image2.jpg",
-    ],
-    allow_flagging="never"
 )
-# Launch the Gradio App (Optional for local testing)
-# if __name__ == "__main__":
-#     iface.launch()

 # app.py
+import gradio as gr
 import torch
 from PIL import Image
+from model import load_model
+from utils import preprocess_image, decode_predictions
+import os
+# Paths to the fine-tuned recognition weights and the Ethiopic font (relative paths resolve against the working directory)
+MODEL_PATH = "saved_models/finetuned/finetuned_recog_model.pth"
+FONT_PATH = "fonts/NotoSansEthiopic-Regular.ttf"  # Update the path to your font
+# Fail fast at import time if the model file is missing
+if not os.path.exists(MODEL_PATH):
+    raise FileNotFoundError(f"Model file not found at {MODEL_PATH}. Please provide the correct path.")
+# Fail fast at import time if the font file is missing
+if not os.path.exists(FONT_PATH):
+    raise FileNotFoundError(f"Font file not found at {FONT_PATH}. Please provide the correct path.")
+# Load the model onto the GPU when CUDA is available, otherwise onto the CPU
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model = load_model(MODEL_PATH, device=device)
+# Build the font properties (size 15) used to render Amharic text in plot titles
+from matplotlib import font_manager as fm
+import matplotlib.pyplot as plt
+ethiopic_font = fm.FontProperties(fname=FONT_PATH, size=15)
+def recognize_text(image: Image.Image):
     """
+    Run the recognition model on one PIL image and return the first decoded result.
     """
     # Preprocess the image
+    input_tensor = preprocess_image(image).unsqueeze(0).to(device)  # add a leading batch dim; presumably [1, 3, 224, 224] - confirm against utils.preprocess_image
+    # Perform inference with gradient tracking disabled
     with torch.no_grad():
+        log_probs = model(input_tensor)  # presumably [H*W, 1, vocab_size] - confirm against model.py
+    # Decode predictions; a single image was passed in, so only element 0 is returned
+    recognized_texts = decode_predictions(log_probs)
+    return recognized_texts[0]
+def display_image_with_text(image: Image.Image, recognized_text: str):
     """
+    Plot the image with the recognized text as its title (rendered with ethiopic_font) and return the pyplot module.
     """
+    plt.figure(figsize=(6,6))  # opens a new 6x6 figure; it is never closed in this function
+    plt.imshow(image)
+    plt.axis('off')
+    plt.title(f"Recognized Text: {recognized_text}", fontproperties=ethiopic_font)
+    plt.show()  # NOTE(review): likely a no-op or a warning on a headless server - confirm it is needed
+    return plt  # returns the matplotlib.pyplot module itself, not the Figure created above
+# Define the Gradio interface: one PIL image in, the recognized text out
 iface = gr.Interface(
+    fn=recognize_text,
+    inputs=gr.Image(type="pil"),  # top-level component; the legacy gr.inputs namespace is deprecated in Gradio 3 and removed in Gradio 4
+    outputs=gr.Textbox(),  # top-level component; replaces the legacy gr.outputs.Textbox()
+    title="Amharic Text Recognition",
+    description="Upload an image containing Amharic text, and the model will recognize and display the text."
 )
+# Launch the Gradio app only when this file is run as a script
+if __name__ == "__main__":
+    iface.launch()