daneigh committed
Commit bddb462 · verified · 1 Parent(s): f2f67b5

Upload 6 files

Files changed (6)
  1. app.py +43 -0
  2. best_multimodal_v3.pth +3 -0
  3. flask_app.py +36 -0
  4. requirements.txt +10 -0
  5. test_mode.py +434 -0
  6. util.py +65 -0
app.py ADDED
@@ -0,0 +1,43 @@
+ import gradio as gr
+ import sys
+ import os
+ from flask import Flask, request, jsonify
+ import threading
+
+ # Import the inference function directly
+ from test_mode import run_inference
+
+ def classify_meme(image):
+     try:
+         # Convert a PIL image to bytes if needed
+         import io
+         if hasattr(image, 'save'):
+             img_bytes = io.BytesIO()
+             image.save(img_bytes, format='PNG')
+             img_bytes = img_bytes.getvalue()
+         else:
+             img_bytes = image
+
+         result = run_inference(img_bytes)
+
+         if "error" in result:
+             return f"Error: {result['error']}"
+
+         prediction = result['prediction']
+         confidence = max(result['probabilities'][0]) * 100
+
+         return f"Classification: {prediction}\nConfidence: {confidence:.1f}%"
+     except Exception as e:
+         return f"Error: {str(e)}"
+
+ # Simple Gradio interface with the API enabled
+ iface = gr.Interface(
+     fn=classify_meme,
+     inputs=gr.Image(type="pil"),
+     outputs="text",
+     title="MemeSenseX Backend",
+     description="Meme content classifier"
+ )
+
+ # Launch locally (share=False keeps it off a public gradio.live URL)
+ iface.launch(share=False)
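Because gr.Interface also exposes a programmatic API alongside the web UI, the running app can be called from Python. A minimal sketch, assuming a recent gradio_client, an instance running locally on Gradio's default port, and a placeholder test file meme.png:

    from gradio_client import Client, handle_file

    # Point the client at the locally running app (the URL is a placeholder)
    client = Client("http://127.0.0.1:7860/")

    # "/predict" is the default api_name for a single gr.Interface
    result = client.predict(handle_file("meme.png"), api_name="/predict")
    print(result)  # the "Classification: ... / Confidence: ..." string from classify_meme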
best_multimodal_v3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:40016216a9127daf17477fcd088970536ccc2ba905135d7563f07f8f94eb8f5d
+ size 516666381
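Note that this entry is a Git LFS pointer, not the weights themselves: the actual checkpoint is about 517 MB (size 516666381 bytes, content-addressed by the sha256 oid above) and is fetched by Git LFS, e.g. via git lfs pull after cloning the repository.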
flask_app.py ADDED
@@ -0,0 +1,36 @@
+ from flask import Flask, request, jsonify
+ from test_mode import run_inference
+ from flask_cors import CORS
+
+ app = Flask(__name__)
+ CORS(app)
+
+
+ # Endpoint that accepts an uploaded image and returns the inference result
+ @app.route('/process_predict', methods=['POST'])
+ def process_predict():
+     # Check that the POST request has the file part
+     if 'image' not in request.files:
+         return jsonify({"error": "No image file provided"}), 400
+
+     # Read the upload as raw bytes
+     image = request.files['image']
+     image_bytes = image.read()
+
+     # Run the multimodal classifier
+     result = run_inference(image_bytes)
+
+     print(f"Processed result: {result}")
+
+     if "error" in result:
+         return jsonify(result), 500
+
+     return jsonify({
+         "status": "success",
+         "message": "Image processed successfully",
+         "data": result
+     }), 200
+
+
+ if __name__ == '__main__':
+     app.run(debug=True, port=5001)
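For completeness, a minimal client for the endpoint above; a sketch that assumes the Flask app is running locally on port 5001 and that meme.png is a placeholder test image:

    import requests

    # The multipart field name must match request.files['image'] above
    with open("meme.png", "rb") as f:
        resp = requests.post(
            "http://127.0.0.1:5001/process_predict",
            files={"image": f},
        )
    print(resp.status_code, resp.json())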
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ torch
+ torchvision
+ transformers
+ easyocr
+ Pillow
+ opencv-python-headless
+ flask
+ flask-cors
+ gradio
+ matplotlib
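All of these are left unpinned, so pip install -r requirements.txt installs the latest release of each package; that keeps the Space easy to rebuild but makes installs non-reproducible, with torch and its dependents dominating the download.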
test_mode.py ADDED
@@ -0,0 +1,434 @@
+ import torch
+ import torch.nn as nn
+ from torchvision import models, transforms
+ import torch.nn.functional as F
+ import math
+ from transformers import AutoModel, AutoTokenizer
+ from PIL import Image
+ import matplotlib.pyplot as plt
+ import easyocr
+ import numpy as np
+ import re
+ import os
+ import io
+ import cv2
+
+
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+ MODEL_PATH = os.path.join(BASE_DIR, "best_multimodal_v3.pth")
+
+ # =========================
+ # 1. Text Preprocessing
+ # =========================
+ def preprocess_text(text):
+     emoji_pattern = re.compile(
+         "["
+         "\U0001F600-\U0001F64F"  # emoticons
+         "\U0001F300-\U0001F5FF"  # symbols & pictographs
+         "\U0001F680-\U0001F6FF"  # transport & map symbols
+         "\U0001F1E0-\U0001F1FF"  # flags
+         "\U00002700-\U000027BF"  # dingbats
+         "\U0001F900-\U0001F9FF"  # supplemental symbols
+         "\U00002600-\U000026FF"  # misc symbols
+         "\U00002B00-\U00002BFF"  # arrows, etc.
+         "\U0001FA70-\U0001FAFF"  # extended symbols
+         "]+",
+         flags=re.UNICODE
+     )
+     # Remove emojis
+     text = emoji_pattern.sub(r'', text)
+     # Lowercase and strip
+     text = text.lower().strip()
+     # Keep letters (including accented ones) and spaces
+     text = re.sub(r'[^a-zñáéíóúü\s]', '', text)
+     # Normalize whitespace
+     text = re.sub(r'\s+', ' ', text)
+
+     return text
+
+ # =========================
+ # 2. OCR Extraction
+ # =========================
+ def ocr_extract_text(image, confidence_threshold=0.6):
+     reader = easyocr.Reader(['en', 'tl'], gpu=torch.cuda.is_available())  # note: rebuilt on every call; could be hoisted to module scope
+     # # Earlier single-pass version, kept for reference:
+     # image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+     # # image = cv2.GaussianBlur(image, (5, 5), 0)
+
+     # result = reader.readtext(image, detail=1, paragraph=False, width_ths=0.7, height_ths=0.7)
+
+     # # Extract text and confidence scores
+     # texts = []
+     # confidences = []
+
+     # for detection in result:
+     #     bbox, text, confidence = detection
+     #     texts.append(text)
+     #     confidences.append(confidence)
+     # final_text = " ".join(texts)
+     # preprocess_txt = preprocess_text(final_text)
+     # avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
+     # return final_text, preprocess_txt, avg_confidence
+
+     # Convert to grayscale
+     gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+
+     # First pass: OCR on raw grayscale
+     result = reader.readtext(gray, detail=1, paragraph=False, width_ths=0.7, height_ths=0.7)
+     texts, confidences = [], []
+
+     for detection in result:
+         if len(detection) == 3:
+             _, text, conf = detection
+         else:
+             text, conf = detection
+
+         if isinstance(text, list):
+             text = " ".join([str(t) for t in text if isinstance(t, str)])
+
+         texts.append(text)
+         try:
+             confidences.append(float(conf))
+         except (ValueError, TypeError):
+             confidences.append(0.0)
+
+     final_text = " ".join(texts)
+     avg_conf = sum(confidences) / len(confidences) if confidences else 0.0
+
+     # If confidence is low, retry on a Gaussian-blurred image
+     if avg_conf < confidence_threshold:
+         texts, confidences = [], []
+         gauss_img = cv2.GaussianBlur(gray, (5, 5), 0)
+         result = reader.readtext(gauss_img, detail=1, paragraph=False, width_ths=0.7, height_ths=0.7)
+
+         for detection in result:
+             if len(detection) == 3:
+                 _, text, conf = detection
+             else:
+                 text, conf = detection
+
+             if isinstance(text, list):
+                 text = " ".join([str(t) for t in text if isinstance(t, str)])
+
+             texts.append(text)
+             try:
+                 confidences.append(float(conf))
+             except (ValueError, TypeError):
+                 confidences.append(0.0)
+
+         final_text_gauss = " ".join(texts)
+         avg_conf_gauss = sum(confidences) / len(confidences) if confidences else 0.0
+
+         # Keep the version with the higher confidence
+         if avg_conf_gauss > avg_conf:
+             final_text, avg_conf = final_text_gauss, avg_conf_gauss
+
+     if not final_text:
+         return "", "", 0.0
+
+     preprocess_txt = preprocess_text(final_text)
+     return final_text, preprocess_txt, avg_conf
+
+
+ # =========================
+ # 3. Image Preprocessing
+ # =========================
+ def resize_normalize_image(image, target_size=(224, 224)):
+
+     preprocess_image = transforms.Compose([
+         transforms.Resize(target_size, interpolation=transforms.InterpolationMode.BILINEAR),
+         transforms.ToTensor(),
+         transforms.Normalize(
+             mean=[0.485, 0.456, 0.406],
+             std=[0.229, 0.224, 0.225]
+         )
+     ])
+
+     image_tensor = preprocess_image(image).unsqueeze(0)  # add batch dimension
+     return image_tensor
+
+
+ # =========================
+ # 4. Model Definitions
+ # =========================
+ class CrossAttentionModule(nn.Module):
+     def __init__(self, query_dim, key_value_dim, hidden_dim=256, num_heads=8, dropout=0.1):
+         super(CrossAttentionModule, self).__init__()
+
+         self.hidden_dim = hidden_dim
+         self.num_heads = num_heads
+         self.head_dim = hidden_dim // num_heads
+         self.scale = math.sqrt(self.head_dim)  # √d_k
+
+         assert hidden_dim % num_heads == 0, "hidden_dim must be divisible by num_heads"
+
+         # Query projection for H (e.g., image features)
+         self.query_proj = nn.Linear(query_dim, hidden_dim)
+
+         # Key and Value projections for G (e.g., text features)
+         self.key_proj = nn.Linear(key_value_dim, hidden_dim)
+         self.value_proj = nn.Linear(key_value_dim, hidden_dim)
+
+         # Output projection W^O
+         self.out_proj = nn.Linear(hidden_dim, query_dim)
+
+         # Layer normalization
+         self.norm1 = nn.LayerNorm(query_dim)
+         self.norm2 = nn.LayerNorm(query_dim)
+
+         # MLP for the final transformation
+         self.mlp = nn.Sequential(
+             nn.Linear(query_dim, query_dim * 4),
+             nn.ReLU(),
+             nn.Dropout(dropout),
+             nn.Linear(query_dim * 4, query_dim),
+             nn.Dropout(dropout)
+         )
+
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, H, G):
+         """
+         Args:
+             H: Query features [batch_size, seq_len_h, query_dim] (e.g., image patches)
+             G: Key/Value features [batch_size, seq_len_g, key_value_dim] (e.g., text tokens)
+         """
+         batch_size, seq_len_h, _ = H.shape
+         seq_len_g = G.shape[1]
+
+         # Step 1: Project to Q, K, V
+         Q = self.query_proj(H)  # W_i^Q H
+         K = self.key_proj(G)    # W_i^K G
+         V = self.value_proj(G)  # W_i^V G
+
+         # Step 2: Reshape for multi-head attention
+         Q = Q.view(batch_size, seq_len_h, self.num_heads, self.head_dim).transpose(1, 2)
+         K = K.view(batch_size, seq_len_g, self.num_heads, self.head_dim).transpose(1, 2)
+         V = V.view(batch_size, seq_len_g, self.num_heads, self.head_dim).transpose(1, 2)
+
+         # Step 3: Compute attention: ATT_i(H, G) = softmax((W_i^Q H)(W_i^K G)^T / √d_k)(W_i^V G)
+         attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
+         attention_weights = F.softmax(attention_scores, dim=-1)
+         attention_weights = self.dropout(attention_weights)
+         attention_output = torch.matmul(attention_weights, V)
+
+         # Step 4: Concatenate heads and apply the output projection
+         attention_output = attention_output.transpose(1, 2).contiguous().view(
+             batch_size, seq_len_h, self.hidden_dim
+         )
+
+         # MATT(H, G) = [ATT_1; ...; ATT_h] W^O
+         matt_output = self.out_proj(attention_output)
+
+         # Step 5: Z = LN(H + MATT(H, G))
+         Z = self.norm1(H + matt_output)
+
+         # Step 6: TIM(H, G) = LN(Z + MLP(Z))
+         mlp_output = self.mlp(Z)
+         tim_output = self.norm2(Z + mlp_output)
+
+         return tim_output
+
+ class MultimodalClassifier(nn.Module):
+     def __init__(self, num_classes=2, model_name='jcblaise/roberta-tagalog-base'):
+         super(MultimodalClassifier, self).__init__()
+
+         # Image encoder (ResNet-18, keep spatial features)
+         resnet = models.resnet18(pretrained=True)
+         modules = list(resnet.children())[:-2]  # keep up to the last conv block (before avgpool)
+         self.image_encoder = nn.Sequential(*modules)  # output: (B, 512, 7, 7)
+
+         # Text encoder
+         self.text_encoder = AutoModel.from_pretrained(model_name)
+
+         # Cross-attention using the paper formula
+         # Image attends to text
+         self.img_to_text_attention = CrossAttentionModule(
+             query_dim=512,
+             key_value_dim=self.text_encoder.config.hidden_size,
+             hidden_dim=256,
+             num_heads=8
+         )
+
+         # Text attends to image
+         self.text_to_img_attention = CrossAttentionModule(
+             query_dim=self.text_encoder.config.hidden_size,
+             key_value_dim=512,
+             hidden_dim=256,
+             num_heads=8
+         )
+
+         # Fusion & classifier
+         self.fusion = nn.Sequential(
+             nn.Linear(512 + self.text_encoder.config.hidden_size, 512),
+             nn.ReLU(),
+             nn.Dropout(0.3),
+             nn.Linear(512, 128),
+             nn.ReLU(),
+             nn.Dropout(0.3),
+             nn.Linear(128, num_classes)
+         )
+
+     def forward(self, images, input_ids, attention_mask):
+         # Extract image features
+         batch_size = images.size(0)
+         img_feats = self.image_encoder(images)  # (B, 512, 7, 7)
+         img_feats = img_feats.flatten(2).permute(0, 2, 1)  # (B, 49, 512)
+
+         # Extract text features
+         text_outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
+         txt_feats = text_outputs.last_hidden_state  # (B, seq_len, H)
+
+         # Cross-attention following the paper formula:
+         # TIM(img_feats, txt_feats) and TIM(txt_feats, img_feats)
+         attended_img = self.img_to_text_attention(img_feats, txt_feats)
+         attended_txt = self.text_to_img_attention(txt_feats, img_feats)
+
+         # Pool the attended outputs
+         img_repr = attended_img.mean(dim=1)  # (B, 512)
+         txt_repr = attended_txt[:, 0, :]  # CLS token (B, hidden_size)
+
+         # Fusion
+         fused = torch.cat([img_repr, txt_repr], dim=1)
+         return self.fusion(fused)
+
+ # =========================
+ # 5. Load Model & Tokenizer
+ # =========================
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ model = MultimodalClassifier(num_classes=2)
+ model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
+ model.to(device)
+ model.eval()
+
+ tokenizer = AutoTokenizer.from_pretrained("jcblaise/roberta-tagalog-base")
+
+ # =========================
+ # 6. Inference Function
+ # =========================
+ def run_inference(image_input):
+     # Accept raw bytes, a file path, or a PIL image
+     if isinstance(image_input, (bytes, bytearray)):
+         pil_img = Image.open(io.BytesIO(image_input)).convert("RGB")
+     elif isinstance(image_input, str):
+         pil_img = Image.open(image_input).convert("RGB")
+     elif isinstance(image_input, Image.Image):
+         pil_img = image_input.convert("RGB")
+     else:
+         raise TypeError(f"Unsupported input type: {type(image_input)}")
+
+     # OCR
+     np_image = np.array(pil_img)
+     raw_text, clean_text, confidence = ocr_extract_text(np_image)
+
+     if clean_text == "":
+         return {
+             "error": "This is not a meme. Upload a valid meme image with text.",
+         }
+
+     elif len(clean_text.split()) < 3:
+         return {
+             "error": "Insufficient text detected in the meme. Please upload a meme with more text. The minimum is 3 words.",
+             "clean_text": clean_text,
+             "raw_text": raw_text,
+             "confidence": confidence
+         }
+
+     # Image
+     img_tensor = resize_normalize_image(pil_img).to(device)
+
+     # Tokenize text
+     encoding = tokenizer(
+         clean_text, return_tensors='pt',
+         padding=True, truncation=True, max_length=128
+     )
+     input_ids = encoding['input_ids'].to(device)
+     attention_mask = encoding['attention_mask'].to(device)
+
+
+     # Forward pass
+     with torch.no_grad():
+         logits = model(img_tensor, input_ids, attention_mask)
+         probs = torch.softmax(logits, dim=1)
+         pred_class = torch.argmax(probs, dim=1).item()
+         pred_class = 'sexual' if pred_class == 1 else 'non-sexual'
+
+     return {
+         'original_size': pil_img.size,
+         'prediction': pred_class,
+         'probabilities': probs.cpu().numpy().tolist(),
+         'raw_text': raw_text,
+         'clean_text': clean_text,
+         'confidence': confidence
+     }
+
+
+ # =========================
+ # 7. Run as main
+ # =========================
+ # if __name__ == "__main__":
+ #     # Example: load an image from a path
+ #     IMAGE_PATH = "backend/OIP (1).jfif"
+
+ #     # test_dimension_sensitivity(IMAGE_PATH)
+
+ #     result = run_inference(IMAGE_PATH)
+
+ #     # Print results
+ #     print("Original Image Size:", result['original_size'])
+ #     print("Prediction:", result['prediction'])
+ #     print("Probabilities:", result['probabilities'])
+ #     print("Raw OCR Text:", result['raw_text'])
+ #     print("Clean OCR Text:", result['clean_text'])
+ #     print("OCR Confidence:", result['confidence'])
+
+
+ #     # Preprocess image
+ #     pil_img = Image.open(IMAGE_PATH).convert("RGB")
+ #     img_tensor = resize_normalize_image(pil_img).to(device)
+
+ #     # -----------------------------
+ #     # Generate ResNet heatmap
+ #     # -----------------------------
+ #     features = {}
+ #     def hook_fn(module, input, output):
+ #         features['value'] = output.detach()
+
+ #     last_conv = model.image_encoder[-1]
+ #     hook_handle = last_conv.register_forward_hook(hook_fn)
+
+ #     with torch.no_grad():
+ #         _ = model(img_tensor,
+ #                   input_ids=torch.zeros(1, 1, dtype=torch.long, device=device),
+ #                   attention_mask=torch.ones(1, 1, dtype=torch.long, device=device))
+
+ #     hook_handle.remove()
+
+ #     feat_tensor = features['value']
+ #     heatmap_grid = feat_tensor[0].mean(dim=0).cpu().numpy()
+ #     heatmap_grid = (heatmap_grid - heatmap_grid.min()) / (heatmap_grid.max() - heatmap_grid.min())
+ #     heatmap_resized = np.uint8(255 * heatmap_grid)
+ #     heatmap_resized = Image.fromarray(heatmap_resized).resize(pil_img.size, Image.NEAREST)
+ #     heatmap_resized = np.array(heatmap_resized)
+
+ #     probs = result['probabilities'][0]
+ #     prob_text = f"non-sexual: {probs[0]:.2f}, sexual: {probs[1]:.2f}"
+
+ #     # -----------------------------
+ #     # Plot everything in one figure
+ #     # -----------------------------
+ #     fig, ax = plt.subplots(figsize=(6, 6))
+
+ #     ax.imshow(pil_img)  # original image
+ #     ax.imshow(heatmap_resized, cmap='jet', alpha=0.4, interpolation='nearest')  # overlay heatmap
+ #     ax.axis('off')
+ #     ax.set_title(f"{result['prediction']} ({prob_text})", fontsize=14, color='blue')
+
+ #     # Add a colorbar
+ #     sm = plt.cm.ScalarMappable(cmap='jet', norm=plt.Normalize(vmin=0, vmax=1))
+ #     sm.set_array([])
+ #     cbar = fig.colorbar(sm, ax=ax, fraction=0.046, pad=0.04)
+ #     cbar.set_label('Feature Intensity')
+
+ #     plt.show()
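For reference, the ASCII formulas in the CrossAttentionModule comments correspond to standard scaled dot-product cross-attention, with H supplying the queries and G the keys and values:

    \mathrm{ATT}_i(H, G) = \mathrm{softmax}\!\left( \frac{(W_i^Q H)(W_i^K G)^{\top}}{\sqrt{d_k}} \right) (W_i^V G)
    \mathrm{MATT}(H, G) = [\mathrm{ATT}_1; \dots; \mathrm{ATT}_h]\, W^O
    Z = \mathrm{LN}(H + \mathrm{MATT}(H, G)), \qquad \mathrm{TIM}(H, G) = \mathrm{LN}(Z + \mathrm{MLP}(Z))

And a minimal end-to-end sketch of the inference entry point (sample_meme.jpg is a placeholder; note that importing test_mode loads the full checkpoint at import time):

    from test_mode import run_inference

    # run_inference accepts a file path, raw bytes, or a PIL image
    result = run_inference("sample_meme.jpg")
    if "error" in result:
        print(result["error"])
    else:
        print(result["prediction"], result["probabilities"], result["clean_text"])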
util.py ADDED
@@ -0,0 +1,65 @@
+ import easyocr
+ from PIL import Image
+ import re
+ from torchvision import transforms
+ import matplotlib.pyplot as plt
+
+ # data = {"image_path": "", "text": "", "preprocess_image": ""}
+
+ # Preprocess the text extracted from the meme
+ def preprocess_text(text):
+     emoji_pattern = re.compile(
+         "["
+         "\U0001F600-\U0001F64F"  # emoticons
+         "\U0001F300-\U0001F5FF"  # symbols & pictographs
+         "\U0001F680-\U0001F6FF"  # transport & map symbols
+         "\U0001F1E0-\U0001F1FF"  # flags
+         "\U00002700-\U000027BF"  # dingbats
+         "\U0001F900-\U0001F9FF"  # supplemental symbols
+         "\U00002600-\U000026FF"  # miscellaneous symbols
+         "\U00002B00-\U00002BFF"  # arrows, etc.
+         "\U0001FA70-\U0001FAFF"  # extended symbols
+         "]+",
+         flags=re.UNICODE
+     )
+     text = emoji_pattern.sub(r'', text)      # remove emojis
+     text = text.lower().strip()              # lowercase and trim
+     text = re.sub(r'[^a-z0-9\s]', '', text)  # keep alphanumerics and spaces
+     text = re.sub(r'\s+', ' ', text)         # normalize whitespace
+     text = re.sub(r'\b\w\b', '', text)       # drop single-character tokens
+     text = re.sub(r'[^\w\s]', '', text)      # (redundant after the filters above)
+     return text
+
+ # Extract and preprocess text from an image using OCR
+ def ocr_extract_text(image_path):
+     reader = easyocr.Reader(['en', 'tl'], gpu=True)  # falls back to CPU with a warning if CUDA is unavailable
+     result = reader.readtext(image_path, detail=0)
+     final_text = " ".join(result)
+     preprocess_txt = preprocess_text(final_text)
+     return final_text, preprocess_txt
+
+ # Resize and normalize an image for model input
+ def resize_normalize_image(image, target_size=(224, 224)):
+     preprocess_image = transforms.Compose([
+         transforms.Resize(target_size),
+         transforms.ToTensor(),
+         transforms.Normalize(
+             mean=[0.485, 0.456, 0.406],
+             std=[0.229, 0.224, 0.225]
+         )
+     ])
+     image = Image.open(image).convert('RGB')
+
+     image = preprocess_image(image)
+     image = image.unsqueeze(0)  # add batch dimension
+     return image
+
+ # if __name__ == "__main__":
+ #     input_image = "backend/test_image.jpg"
+ #     data["image_path"] = input_image
+ #     data["text"] = ocr_extract_text(input_image)
+ #     data["preprocess_image"] = resize_normalize_image(input_image)
+ #     print(data)
+
+
+
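A quick sanity check of the legacy cleaner above (the input string is a made-up example):

    from util import preprocess_text

    # Emojis and punctuation are stripped and the text is lowercased;
    # single-character tokens would also be dropped
    print(preprocess_text("Grabe 😂 ang LAKAS mo!!!"))  # -> "grabe ang lakas mo"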