Spaces:

vsaez
/

object-detection-app

Running

App Files Files Community

Víctor Sáez committed on Jul 20

Commit

2e9147d

1 Parent(s): 6ecfb14

Add multilanguage support

Browse files

Files changed (2) hide show

app.py +321 -35
requirements.txt +0 -0

app.py CHANGED Viewed

@@ -3,72 +3,358 @@ import torch
 from PIL import Image, ImageDraw, ImageFont
 from transformers import DetrImageProcessor, DetrForObjectDetection
 from pathlib import Path
-# Load DETR model and processor from Hugging Face
-model_name = "facebook/detr-resnet-50"
-processor = DetrImageProcessor.from_pretrained(model_name)
-model = DetrForObjectDetection.from_pretrained(model_name)
 # Load font
 font_path = Path("assets/fonts/arial.ttf")
 if not font_path.exists():
-    # If the font file does not exist, use the default PIL font
     print(f"Font file {font_path} not found. Using default font.")
     font = ImageFont.load_default()
 else:
-    font = ImageFont.truetype(str(font_path), size=100)
-print(f"CUDA is available: {torch.cuda.is_available()}")
-# Main function: takes an image and returns it with boxes and labels
-def detect_objects(image):
     inputs = processor(images=image, return_tensors="pt")
     outputs = model(**inputs)
-    # Convert model output to usable detection results
     target_sizes = torch.tensor([image.size[::-1]])
     results = processor.post_process_object_detection(
-        outputs, threshold=0.9, target_sizes=target_sizes
     )[0]
-    # Draw bounding boxes and labels on a copy of the image
     image_with_boxes = image.copy()
     draw = ImageDraw.Draw(image_with_boxes)
     for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
         box = [round(x, 2) for x in box.tolist()]
-        draw.rectangle(box, outline="red", width=3)
         # Prepare label text
-        label_text = f"{model.config.id2label[label.item()]}: {round(score.item(), 2)}"
-        # Measure text size
-        text_bbox = draw.textbbox((0, 0), label_text, font=font)
-        text_width = text_bbox[2] - text_bbox[0]
-        text_height = text_bbox[3] - text_bbox[1]
-        # Set background rectangle for text
-        text_background = [
-            box[0], box[1] - text_height,
-                    box[0] + text_width, box[1]
         ]
-        draw.rectangle(text_background, fill="black")  # Background
-        draw.text((box[0], box[1] - text_height), label_text, fill="white", font=font)
-    return image_with_boxes
-with gr.Blocks() as app:
-    with gr.Row():
-        gr.Markdown("## Object Detection App\nUpload an image to detect objects using Facebook's DETR model.")
-    with gr.Row():
-        input_image = gr.Image(type="pil", label="Input Image")
-        output_image = gr.Image(label="Detected Objects")
-    with gr.Row():
-        button = gr.Button("Detect Objects")
-    button.click(fn=detect_objects, inputs=input_image, outputs=output_image)
 if __name__ == "__main__":
-    app.launch()

 from PIL import Image, ImageDraw, ImageFont
 from transformers import DetrImageProcessor, DetrForObjectDetection
 from pathlib import Path
+import transformers
# Single-slot cache holding the most recently loaded detection model, its
# processor, and the checkpoint id they were loaded from.
current_model = None
current_processor = None
current_model_name = None

# Selectable models: internal model key -> Hugging Face Hub checkpoint id.
available_models = {
    # DETR Models
    "DETR ResNet-50": "facebook/detr-resnet-50",
    "DETR ResNet-101": "facebook/detr-resnet-101",
    "DETR DC5": "facebook/detr-resnet-50-dc5",
    "DETR ResNet-50 Face Only": "esraakh/detr_fine_tune_face_detection_final",
}


def load_model(model_key):
    """Return the ``(model, processor)`` pair for ``model_key``, loading on demand.

    The pair is cached in the module-level ``current_*`` globals; a new
    checkpoint is only loaded when ``model_key`` resolves to a different
    checkpoint id than the cached one.

    Args:
        model_key: A key of ``available_models``.

    Returns:
        Tuple ``(model, processor)`` for the requested checkpoint.

    Raises:
        KeyError: If ``model_key`` is not a key of ``available_models``.
    """
    global current_model, current_processor, current_model_name

    model_name = available_models[model_key]

    # Only load if it's a different model
    if current_model_name != model_name:
        print(f"Loading model: {model_name}")
        # Load into locals first and publish all three globals together, so a
        # failure while loading the model cannot leave a new processor paired
        # with the previously cached model and name.
        processor = DetrImageProcessor.from_pretrained(model_name)
        model = DetrForObjectDetection.from_pretrained(model_name)
        current_model = model
        current_processor = processor
        current_model_name = model_name
        print(f"Model loaded: {model_name}")
        print(f"Available labels: {list(current_model.config.id2label.values())}")

    return current_model, current_processor
 # Load font
 font_path = Path("assets/fonts/arial.ttf")
 if not font_path.exists():
     print(f"Font file {font_path} not found. Using default font.")
     font = ImageFont.load_default()
 else:
+    font = ImageFont.truetype(str(font_path), size=100)  # Reduced font size
# UI strings per interface language: language name -> {string key -> text}.
# "English" is the reference table used for fallbacks.
translations = {
    "English": {
        "title": "## Enhanced Object Detection App\nUpload an image to detect objects using various DETR models.",
        "input_label": "Input Image",
        "output_label": "Detected Objects",
        "dropdown_label": "Label Language",
        "dropdown_detection_model_label": "Detection Model",
        "threshold_label": "Detection Threshold",
        "button": "Detect Objects",
        "info_label": "Detection Info",
        "model_fast": "General Objects (fast)",
        "model_precision": "General Objects (high precision)",
        "model_small": "Small Objects/Details (slow)",
        "model_faces": "Face Detection (people only)"
    },
    "Spanish": {
        "title": "## Aplicación Mejorada de Detección de Objetos\nSube una imagen para detectar objetos usando varios modelos DETR.",
        "input_label": "Imagen de entrada",
        "output_label": "Objetos detectados",
        "dropdown_label": "Idioma de las etiquetas",
        "dropdown_detection_model_label": "Modelo de detección",
        "threshold_label": "Umbral de detección",
        "button": "Detectar objetos",
        "info_label": "Información de detección",
        "model_fast": "Objetos generales (rápido)",
        "model_precision": "Objetos generales (precisión alta)",
        "model_small": "Objetos pequeños/detalles (lento)",
        "model_faces": "Detección de caras (solo personas)"
    },
    "French": {
        "title": "## Application Améliorée de Détection d'Objets\nTéléchargez une image pour détecter des objets avec divers modèles DETR.",
        "input_label": "Image d'entrée",
        "output_label": "Objets détectés",
        "dropdown_label": "Langue des étiquettes",
        "dropdown_detection_model_label": "Modèle de détection",
        "threshold_label": "Seuil de détection",
        "button": "Détecter les objets",
        "info_label": "Information de détection",
        "model_fast": "Objets généraux (rapide)",
        "model_precision": "Objets généraux (haute précision)",
        "model_small": "Petits objets/détails (lent)",
        "model_faces": "Détection de visages (personnes uniquement)"
    }
}


def t(language, key):
    """Return the UI string for ``key`` in ``language``.

    Lookup order:
      1. the table for ``language`` (the English table if the language is unknown);
      2. the English table, so a key missing from one language still shows
         readable text instead of the raw key;
      3. ``key`` itself when no table defines it.

    Args:
        language: A language name such as "English", "Spanish" or "French".
        key: A string key such as "button" or "title".

    Returns:
        The translated text, the English text, or ``key`` as a last resort.
    """
    english = translations["English"]
    localized = translations.get(language, english)
    if key in localized:
        return localized[key]
    return english.get(key, key)
# Model key (as used in ``available_models``) -> translation key of its
# user-facing description. Kept in one place so the forward and reverse
# lookups below cannot drift apart.
_MODEL_TRANSLATION_KEYS = {
    "DETR ResNet-50": "model_fast",
    "DETR ResNet-101": "model_precision",
    "DETR DC5": "model_small",
    "DETR ResNet-50 Face Only": "model_faces",
}


def get_translated_model_choices(language):
    """Return the model dropdown choices translated into ``language``.

    One entry is produced per key of ``available_models``, in that dict's
    order. Models without an entry in ``_MODEL_TRANSLATION_KEYS`` are shown
    under their raw model key.
    """
    return [
        t(language, _MODEL_TRANSLATION_KEYS[model_key])
        if model_key in _MODEL_TRANSLATION_KEYS
        else model_key  # Fallback to original name
        for model_key in available_models
    ]


def get_model_key_from_translation(translated_name, language):
    """Map a translated dropdown entry back to its ``available_models`` key.

    Args:
        translated_name: The text currently selected in the model dropdown.
        language: The language the dropdown text is expected to be in.

    Returns:
        The matching model key; ``translated_name`` itself if it already is a
        model key; otherwise the default "DETR ResNet-50".
    """
    # Reverse lookup through the translation of each known model.
    for model_key, translation_key in _MODEL_TRANSLATION_KEYS.items():
        if t(language, translation_key) == translated_name:
            return model_key

    # If not found, try direct match
    if translated_name in available_models:
        return translated_name

    # Default fallback
    return "DETR ResNet-50"
# UI language name -> language code used as the target suffix of the
# Helsinki-NLP "opus-mt-en-<code>" model ids.
_LANGUAGE_CODES = {
    "Spanish": "es",
    "French": "fr",
    "English": "en",
}


def get_helsinki_model(language_label):
    """Return the Helsinki-NLP model id for English -> ``language_label``.

    Returns ``None`` when the language is unknown or is English itself, i.e.
    when no translation is needed or possible.
    """
    target = _LANGUAGE_CODES.get(language_label)
    if not target or target == "en":
        return None
    return f"Helsinki-NLP/opus-mt-en-{target}"


# Cache of finished translations, keyed by "<language>_<label>".
translation_cache = {}

# Cache of translation pipelines, keyed by model id, so each translation
# model is instantiated once instead of once per uncached label.
_translator_cache = {}


def _get_translator(model_name):
    """Return the (cached) translation pipeline for ``model_name``."""
    translator = _translator_cache.get(model_name)
    if translator is None:
        translator = transformers.pipeline("translation", model=model_name)
        _translator_cache[model_name] = translator
    return translator


def translate_label(language_label, label):
    """Translate an English ``label`` into ``language_label``.

    Args:
        language_label: Target language name ("Spanish", "French", ...).
        label: The English label text to translate.

    Returns:
        The translated label. The original ``label`` is returned unchanged
        when the target is English or unknown, or when translation fails
        (failures are printed, not raised, and are not cached so a later
        call can retry).
    """
    # Check cache first
    cache_key = f"{language_label}_{label}"
    if cache_key in translation_cache:
        return translation_cache[cache_key]

    model_name = get_helsinki_model(language_label)
    if not model_name:
        return label

    try:
        translator = _get_translator(model_name)
        result = translator(label, max_length=40)
        translated = result[0]['translation_text']
    except Exception as e:
        # Best-effort by design: a failed download or inference must not
        # break detection, so fall back to the untranslated label.
        print(f"Translation error (429 or other): {e}")
        return label  # Return original if translation fails

    # Cache the result
    translation_cache[cache_key] = translated
    return translated
# Outline colour of a bounding box per confidence band.
_CONFIDENCE_COLORS = {
    'high': 'red',  # > 0.8
    'medium': 'orange',  # 0.5-0.8
    'low': 'yellow'  # <= 0.5
}


def _confidence_color(confidence):
    """Return the box outline colour for a detection confidence."""
    if confidence > 0.8:
        return _CONFIDENCE_COLORS['high']
    if confidence > 0.5:
        return _CONFIDENCE_COLORS['medium']
    return _CONFIDENCE_COLORS['low']


def _measure_text(draw, text):
    """Return ``(width, height)`` of ``text`` drawn with the module-level font."""
    try:
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
    except (AttributeError, ValueError):
        # Fallback for older PIL versions: textbbox may be missing entirely
        # or may reject non-TrueType fonts.
        return draw.textsize(text, font=font)
    return right - left, bottom - top


def detect_objects(image, language_selector, translated_model_selector, threshold):
    """Run object detection on ``image`` and draw the labelled boxes.

    Args:
        image: PIL image to analyse, or ``None`` when nothing was uploaded.
        language_selector: Language name used to translate the labels and to
            interpret ``translated_model_selector``.
        translated_model_selector: Model dropdown text in the current language.
        threshold: Minimum confidence for a detection to be kept.

    Returns:
        Tuple ``(annotated_image, info_text)``. ``annotated_image`` is a copy
        of the input with boxes and labels drawn on it; it is ``None`` when no
        image was provided.
    """
    # The button can be clicked before any image is uploaded.
    if image is None:
        return None, "No image provided. Please upload an image first."

    # Get the actual model key from the translated name
    model_selector = get_model_key_from_translation(translated_model_selector, language_selector)
    print(f"Processing image. Language: {language_selector}, Model: {model_selector}, Threshold: {threshold}")

    # Load the selected model
    model, processor = load_model(model_selector)

    # NOTE(review): the UI is expected to deliver RGB already; this guards
    # direct callers passing RGBA/greyscale/palette images — confirm the
    # processor requires 3-channel input.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Process the image; inference only, so no autograd bookkeeping is needed.
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # Convert model output to usable detection results with custom threshold
    target_sizes = torch.tensor([image.size[::-1]])  # PIL size is (w, h); reversed to (h, w)
    results = processor.post_process_object_detection(
        outputs, threshold=threshold, target_sizes=target_sizes
    )[0]

    # Create a copy of the image for drawing
    image_with_boxes = image.copy()
    draw = ImageDraw.Draw(image_with_boxes)

    # Detection info, assembled as parts and joined once at the end.
    info_parts = [
        f"Detected {len(results['scores'])} objects with threshold {threshold}\n",
        f"Model: {translated_model_selector} ({model_selector})\n\n",
    ]

    detected_objects = []
    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        confidence = score.item()
        box = [round(x, 2) for x in box.tolist()]

        # Draw bounding box, coloured by confidence band
        draw.rectangle(box, outline=_confidence_color(confidence), width=3)

        # Prepare label text
        label_text = model.config.id2label[label.item()]
        translated_label = translate_label(language_selector, label_text)
        display_text = f"{translated_label}: {round(confidence, 3)}"

        # Store detection info
        detected_objects.append({
            'label': label_text,
            'translated': translated_label,
            'confidence': confidence,
            'box': box
        })

        # Place the label just above the box, clamped to the image's top edge
        # so labels of boxes near the top stay visible.
        text_width, text_height = _measure_text(draw, display_text)
        label_top = max(box[1] - text_height - 4, 0)
        text_bg = [
            box[0], label_top,
            box[0] + text_width + 4, label_top + text_height + 4
        ]
        draw.rectangle(text_bg, fill="black")
        draw.text((box[0] + 2, label_top + 2), display_text, fill="white", font=font)

    # Create detailed detection info, most confident detection first
    if detected_objects:
        info_parts.append("Objects found:\n")
        info_parts.extend(
            f"- {obj['translated']} ({obj['label']}): {obj['confidence']:.3f}\n"
            for obj in sorted(detected_objects, key=lambda x: x['confidence'], reverse=True)
        )
    else:
        info_parts.append("No objects detected. Try lowering the threshold.")

    return image_with_boxes, "".join(info_parts)
def build_app():
    """Build the Gradio Blocks interface and wire up its events.

    The UI starts in English. Changing the language dropdown re-labels every
    component and resets the model dropdown to the translated "fast" option;
    the button runs ``detect_objects`` on the current inputs.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    def initial(key):
        # All components are first rendered with their English text.
        return t("English", key)

    with gr.Blocks(theme=gr.themes.Soft()) as app:
        # Header
        with gr.Row():
            title = gr.Markdown(initial("title"))

        # Controls: language, model, threshold
        with gr.Row():
            with gr.Column(scale=1):
                language_selector = gr.Dropdown(
                    choices=["English", "Spanish", "French"],
                    value="English",
                    label=initial("dropdown_label"),
                )
            with gr.Column(scale=1):
                model_selector = gr.Dropdown(
                    choices=get_translated_model_choices("English"),
                    value=initial("model_fast"),  # Default to translated "fast" option
                    label=initial("dropdown_detection_model_label"),
                )
            with gr.Column(scale=1):
                threshold_slider = gr.Slider(
                    minimum=0.1,
                    maximum=0.95,
                    value=0.5,  # Lowered default threshold
                    step=0.05,
                    label=initial("threshold_label"),
                )

        # Input on the left, results on the right
        with gr.Row():
            with gr.Column(scale=1):
                input_image = gr.Image(type="pil", label=initial("input_label"))
                button = gr.Button(initial("button"), variant="primary")
            with gr.Column(scale=1):
                output_image = gr.Image(label=initial("output_label"))
                detection_info = gr.Textbox(
                    label=initial("info_label"),
                    lines=10,
                    max_lines=15,
                )

        def update_interface(selected_language):
            """Return one update per component for the newly selected language."""
            def localized(key):
                return t(selected_language, key)

            model_update = gr.update(
                choices=get_translated_model_choices(selected_language),
                value=localized("model_fast"),
                label=localized("dropdown_detection_model_label"),
            )
            # Order must match the ``outputs`` list of the change event below.
            return [
                gr.update(value=localized("title")),
                gr.update(label=localized("dropdown_label")),
                model_update,
                gr.update(label=localized("threshold_label")),
                gr.update(label=localized("input_label")),
                gr.update(value=localized("button")),
                gr.update(label=localized("output_label")),
                gr.update(label=localized("info_label")),
            ]

        # Connect language change event
        relabelled_components = [
            title, language_selector, model_selector, threshold_slider,
            input_image, button, output_image, detection_info,
        ]
        language_selector.change(
            fn=update_interface,
            inputs=language_selector,
            outputs=relabelled_components,
            queue=False,
        )

        # Connect detection button click event
        detection_inputs = [input_image, language_selector, model_selector, threshold_slider]
        button.click(
            fn=detect_objects,
            inputs=detection_inputs,
            outputs=[output_image, detection_info],
        )

    return app
# Load the default model as soon as the module is executed or imported, so it
# is already cached when the first detection request arrives.
load_model(model_key="DETR ResNet-50")

# Script entry point: build the UI and start serving it.
if __name__ == "__main__":
    app = build_app()
    app.launch()

requirements.txt CHANGED Viewed

Binary files a/requirements.txt and b/requirements.txt differ