Spaces:

fmeres
/

florence-2-document-analyzer

Sleeping

Felipe Meres committed on Sep 24, 2025

Commit

b14c740

1 Parent(s): 9a8f848

Major compatibility fix: Downgrade to Gradio 3.50.2

- Downgrade Gradio: 4.28.0 -> 3.50.2 (stable, HF-compatible version)
- Rewrite app.py for Gradio 3.x syntax (gr.Interface instead of gr.Blocks)
- Simplify interface: single image upload, simplified processing
- Remove complex gallery/multi-image support for stability
- Update README SDK version to match
- Focus on core Florence-2 functionality with stable Gradio

Files changed (3) hide show

app.py +58 -231
app_backup.py +383 -0
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -2,14 +2,10 @@ import gradio as gr
 import torch
 from PIL import Image, ImageDraw, ImageFont
 import numpy as np
-import io
-import base64
 from pathlib import Path
-import tempfile
 import os
-from typing import List, Tuple, Dict, Any, Optional
-import json
 import time
 # Import configuration
 from config import *
@@ -108,32 +104,6 @@ class Florence2Analyzer:
         except Exception as e:
             return {"error": f"Analysis failed: {str(e)}", "success": False}
-def convert_pdf_to_images(pdf_file) -> List[Image.Image]:
-    """Convert PDF pages to PIL Images"""
-    if not PDF_AVAILABLE:
-        raise ValueError("PDF processing not available. Please install pdf2image.")
-    try:
-        # Handle different input types
-        if hasattr(pdf_file, 'read'):
-            # File-like object
-            pdf_bytes = pdf_file.read()
-            images = convert_from_bytes(pdf_bytes, dpi=PDF_DPI, fmt='RGB')
-        elif isinstance(pdf_file, str) and os.path.exists(pdf_file):
-            # File path
-            images = convert_from_path(pdf_file, dpi=PDF_DPI, fmt='RGB')
-        else:
-            raise ValueError("Invalid PDF input format")
-        # Limit number of pages
-        if len(images) > MAX_PDF_PAGES:
-            print(f"Warning: PDF has {len(images)} pages, processing only first {MAX_PDF_PAGES}")
-            images = images[:MAX_PDF_PAGES]
-        return images
-    except Exception as e:
-        raise ValueError(f"Failed to convert PDF: {str(e)}")
 def draw_bounding_boxes(image: Image.Image, results: Dict[str, Any]) -> Image.Image:
     """Draw bounding boxes and labels on image"""
     if not results.get("success", False):
@@ -146,12 +116,9 @@ def draw_bounding_boxes(image: Image.Image, results: Dict[str, Any]) -> Image.Im
     try:
         # Load a font
         try:
-            font = ImageFont.truetype("arial.ttf", FONT_SIZE)
         except:
-            try:
-                font = ImageFont.truetype("DejaVuSans.ttf", FONT_SIZE)
-            except:
-                font = ImageFont.load_default()
         parsed_results = results.get("parsed_results", {})
@@ -167,217 +134,77 @@ def draw_bounding_boxes(image: Image.Image, results: Dict[str, Any]) -> Image.Im
                 # Draw bounding box
                 draw.rectangle([x1, y1, x2, y2], outline=color, width=BBOX_WIDTH)
-                # Prepare label text (truncate if too long)
-                display_label = label if len(label) <= 30 else f"{label[:27]}..."
-                # Draw label background
-                text_bbox = draw.textbbox((x1, y1), display_label, font=font)
-                text_width = text_bbox[2] - text_bbox[0]
-                text_height = text_bbox[3] - text_bbox[1]
-                # Ensure label fits within image bounds
-                label_x = min(x1, image.width - text_width - 5)
-                label_y = max(y1 - text_height - 5, 5)
-                # Draw background rectangle
-                draw.rectangle([label_x - 2, label_y - 2, label_x + text_width + 2, label_y + text_height + 2],
-                             fill=color)
                 # Draw label text
-                draw.text((label_x, label_y), display_label, fill="white", font=font)
-        # Handle OCR results
-        elif "quad_boxes" in parsed_results and "labels" in parsed_results:
-            quad_boxes = parsed_results["quad_boxes"]
-            labels = parsed_results["labels"]
-            for i, (quad, label) in enumerate(zip(quad_boxes, labels)):
-                color = BBOX_COLORS[i % len(BBOX_COLORS)]
-                # Draw quadrilateral for OCR results
-                if len(quad) >= 8:  # quad should have 8 coordinates (4 points)
-                    points = [(quad[j], quad[j+1]) for j in range(0, 8, 2)]
-                    draw.polygon(points, outline=color, width=BBOX_WIDTH)
-                    # Draw label near first point
-                    x, y = points[0]
-                    display_label = label if len(label) <= 20 else f"{label[:17]}..."
-                    text_bbox = draw.textbbox((x, y), display_label, font=font)
-                    draw.rectangle([text_bbox[0]-2, text_bbox[1]-2, text_bbox[2]+2, text_bbox[3]+2],
-                                 fill=color)
-                    draw.text((x, y), display_label, fill="white", font=font)
     except Exception as e:
         print(f"Error drawing annotations: {e}")
     return annotated_image
-def process_uploaded_file(file, task_type: str) -> Tuple[List[Image.Image], List[Image.Image], str]:
-    """Process uploaded file (image or PDF) and return original and annotated versions"""
     if file is None:
-        return [], [], "No file uploaded."
-    analyzer = Florence2Analyzer()
-    original_images = []
-    annotated_images = []
-    status_message = ""
     try:
-        # Determine file type
-        file_extension = Path(file.name).suffix.lower()
-        if file_extension == '.pdf':
-            if not PDF_AVAILABLE:
-                return [], [], "PDF processing not available. Please install pdf2image."
-            # Convert PDF to images
-            status_message += f"Converting PDF to images...\n"
-            pdf_images = convert_pdf_to_images(file)
-            status_message += f"Successfully converted {len(pdf_images)} pages.\n"
-            for i, img in enumerate(pdf_images):
-                status_message += f"Processing page {i+1}...\n"
-                # Analyze with Florence-2
-                results = analyzer.analyze_image(img, task_type)
-                if results.get("success", False):
-                    annotated_img = draw_bounding_boxes(img, results)
-                    original_images.append(img)
-                    annotated_images.append(annotated_img)
-                    status_message += f"Page {i+1} analyzed successfully.\n"
-                else:
-                    status_message += f"Page {i+1} analysis failed: {results.get('error', 'Unknown error')}\n"
-                    original_images.append(img)
-                    annotated_images.append(img)  # Fallback to original
-        elif file_extension in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
-            # Process single image
-            status_message += "Processing image...\n"
             img = Image.open(file).convert('RGB')
-            results = analyzer.analyze_image(img, task_type)
-            if results.get("success", False):
-                annotated_img = draw_bounding_boxes(img, results)
-                original_images.append(img)
-                annotated_images.append(annotated_img)
-                status_message += "Image analyzed successfully.\n"
-                # Add detailed results to status
-                if "parsed_results" in results:
-                    parsed = results["parsed_results"]
-                    if task_type == "detailed_caption" and isinstance(parsed, dict):
-                        caption = parsed.get("detailed_caption", "No caption generated")
-                        status_message += f"Caption: {caption}\n"
-                    elif "labels" in parsed:
-                        labels = parsed["labels"]
-                        status_message += f"Detected objects: {', '.join(labels[:5])}{'...' if len(labels) > 5 else ''}\n"
-            else:
-                status_message += f"Analysis failed: {results.get('error', 'Unknown error')}\n"
-                original_images.append(img)
-                annotated_images.append(img)
         else:
-            return [], [], f"Unsupported file type: {file_extension}. Please upload PNG, JPG, JPEG, or PDF files."
     except Exception as e:
-        return [], [], f"Error processing file: {str(e)}"
-    return original_images, annotated_images, status_message
-def create_gallery_content(original_images: List[Image.Image], annotated_images: List[Image.Image]) -> List[Tuple[Image.Image, str]]:
-    """Create content for Gradio gallery showing both original and annotated versions"""
-    gallery_content = []
-    for i, (orig, anno) in enumerate(zip(original_images, annotated_images)):
-        # Add original image
-        gallery_content.append((orig, f"Page/Image {i+1} - Original"))
-        # Add annotated image
-        gallery_content.append((anno, f"Page/Image {i+1} - Analyzed"))
-    return gallery_content
 # Create Gradio interface
-def create_interface():
-    with gr.Blocks(title="Florence-2 Document & Image Analyzer", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("""
-        # 📄 Florence-2 Document & Image Analyzer
-        Upload images (PNG, JPG, JPEG) or PDF documents to analyze them with Microsoft's Florence-2 vision model.
-        The model can detect objects, generate captions, perform OCR, and more!
-        """)
-        with gr.Row():
-            with gr.Column(scale=1):
-                file_upload = gr.File(
-                    label="Upload Image or PDF",
-                    file_types=[".png", ".jpg", ".jpeg", ".pdf"],
-                    type="filepath"
-                )
-                task_type = gr.Dropdown(
-                    choices=[(config["description"], task_name) for task_name, config in FLORENCE_TASKS.items()],
-                    value="object_detection",
-                    label="Analysis Type",
-                    info="Choose what type of analysis to perform"
-                )
-                analyze_btn = gr.Button("🔍 Analyze", variant="primary")
-                status_text = gr.Textbox(
-                    label="Status",
-                    lines=8,
-                    interactive=False,
-                    placeholder="Upload a file and click Analyze to see results..."
-                )
-            with gr.Column(scale=2):
-                gallery = gr.Gallery(
-                    label="Results (Original vs Analyzed)",
-                    show_label=True,
-                    elem_id="gallery",
-                    columns=2,
-                    rows=2,
-                    object_fit="contain",
-                    height="auto"
-                )
-        # Event handler
-        def process_and_display(file, task):
-            if file is None:
-                return [], "Please upload a file first."
-            original_imgs, annotated_imgs, status = process_uploaded_file(file, task)
-            gallery_content = create_gallery_content(original_imgs, annotated_imgs)
-            return gallery_content, status
-        analyze_btn.click(
-            fn=process_and_display,
-            inputs=[file_upload, task_type],
-            outputs=[gallery, status_text]
-        )
-        # Example section
-        gr.Markdown("""
-        ## 💡 Tips for Best Results
-        - **Images**: Upload clear, high-resolution images for better analysis
-        - **PDFs**: Multi-page PDFs will be processed page by page
-        - **Object Detection**: Great for identifying and locating objects in images
-        - **Detailed Caption**: Provides comprehensive descriptions of image content
-        - **OCR**: Perfect for extracting text from documents and images
-        - **Dense Captioning**: Provides detailed captions for different regions
-        ## 🎯 Supported Formats
-        - **Images**: PNG, JPG, JPEG, BMP, TIFF
-        - **Documents**: PDF (converted to images automatically)
-        """)
-    return demo
-# Launch the application
 if __name__ == "__main__":
-    demo = create_interface()
     demo.launch()

 import torch
 from PIL import Image, ImageDraw, ImageFont
 import numpy as np
 from pathlib import Path
 import os
 import time
+from typing import List, Dict, Any
 # Import configuration
 from config import *
         except Exception as e:
             return {"error": f"Analysis failed: {str(e)}", "success": False}
 def draw_bounding_boxes(image: Image.Image, results: Dict[str, Any]) -> Image.Image:
     """Draw bounding boxes and labels on image"""
     if not results.get("success", False):
     try:
         # Load a font
         try:
+            font = ImageFont.load_default()
         except:
+            font = None
         parsed_results = results.get("parsed_results", {})
                 # Draw bounding box
                 draw.rectangle([x1, y1, x2, y2], outline=color, width=BBOX_WIDTH)
                 # Draw label text
+                if font:
+                    draw.text((x1, max(y1-20, 0)), label[:30], fill=color, font=font)
     except Exception as e:
         print(f"Error drawing annotations: {e}")
     return annotated_image
+def process_image(file, task_type):
+    """Process uploaded file and return result"""
     if file is None:
+        return None, "Please upload a file first."
     try:
+        # Load image
+        if isinstance(file, str):
             img = Image.open(file).convert('RGB')
         else:
+            img = Image.open(file.name).convert('RGB')
+        # Analyze with Florence-2
+        analyzer = Florence2Analyzer()
+        results = analyzer.analyze_image(img, task_type)
+        if results.get("success", False):
+            annotated_img = draw_bounding_boxes(img, results)
+            status = "Image analyzed successfully!"
+            # Add results info
+            if "parsed_results" in results:
+                parsed = results["parsed_results"]
+                if task_type == "detailed_caption" and isinstance(parsed, dict):
+                    caption = parsed.get("detailed_caption", "No caption generated")
+                    status += f"\n\nCaption: {caption}"
+                elif "labels" in parsed:
+                    labels = parsed["labels"]
+                    status += f"\n\nDetected objects: {', '.join(labels[:5])}"
+            return annotated_img, status
+        else:
+            return img, f"Analysis failed: {results.get('error', 'Unknown error')}"
     except Exception as e:
+        return None, f"Error processing file: {str(e)}"
+# Task choices
+# Identifiers offered in the "Analysis Type" dropdown; the selected string is
+# passed to process_image as task_type.
+# NOTE(review): presumably these must match the task keys defined in config — confirm.
+task_choices = [
+    "object_detection",
+    "detailed_caption",
+    "dense_captioning",
+    "ocr",
+    "region_proposal"
+]
 # Create Gradio interface
+# The two inputs (file upload, task dropdown) map positionally onto
+# process_image(file, task_type); its (image, status) return value maps onto
+# the two outputs in the same order.
+demo = gr.Interface(
+    fn=process_image,
+    inputs=[
+        gr.File(label="Upload Image", file_types=["image"]),
+        gr.Dropdown(choices=task_choices, value="object_detection", label="Analysis Type")
+    ],
+    outputs=[
+        gr.Image(label="Analyzed Image"),
+        gr.Textbox(label="Status", lines=5)
+    ],
+    title="📄 Florence-2 Document & Image Analyzer",
+    description="Upload images to analyze them with Microsoft's Florence-2 vision model. The model can detect objects, generate captions, perform OCR, and more!",
+    theme="soft",
+    allow_flagging="never"
+)
 if __name__ == "__main__":
     demo.launch()

app_backup.py ADDED Viewed

	@@ -0,0 +1,383 @@

+import gradio as gr
+import torch
+from PIL import Image, ImageDraw, ImageFont
+import numpy as np
+import io
+import base64
+from pathlib import Path
+import tempfile
+import os
+from typing import List, Tuple, Dict, Any, Optional
+import json
+import time
+# Import configuration
+from config import *
+# PDF processing
+try:
+    from pdf2image import convert_from_path, convert_from_bytes
+    PDF_AVAILABLE = True
+except ImportError:
+    PDF_AVAILABLE = False
+    print("Warning: pdf2image not available. PDF processing will be disabled.")
+# Florence-2 model imports
+try:
+    from transformers import AutoProcessor, AutoModelForCausalLM
+    FLORENCE_AVAILABLE = True
+except ImportError:
+    FLORENCE_AVAILABLE = False
+    print("Warning: transformers not available. Florence-2 processing will be disabled.")
+class Florence2Analyzer:
+    def __init__(self):
+        self.model = None
+        self.processor = None
+        self.device = "cpu" if FORCE_CPU else ("cuda" if torch.cuda.is_available() else "cpu")
+        self._load_model()
+    def _load_model(self):
+        """Load Florence-2 model and processor"""
+        if not FLORENCE_AVAILABLE:
+            print("Florence-2 not available - transformers library not found")
+            return
+        try:
+            print(f"Loading Florence-2 model: {FLORENCE_MODEL_ID}")
+            start_time = time.time()
+            self.model = AutoModelForCausalLM.from_pretrained(
+                FLORENCE_MODEL_ID,
+                torch_dtype=torch.float16 if (torch.cuda.is_available() and not FORCE_CPU) else torch.float32,
+                trust_remote_code=True
+            ).to(self.device)
+            self.processor = AutoProcessor.from_pretrained(FLORENCE_MODEL_ID, trust_remote_code=True)
+            load_time = time.time() - start_time
+            print(f"Florence-2 model loaded successfully on {self.device} in {load_time:.2f} seconds")
+        except Exception as e:
+            print(f"Error loading Florence-2 model: {e}")
+            self.model = None
+            self.processor = None
+    def analyze_image(self, image: Image.Image, task_type: str = "detailed_caption") -> Dict[str, Any]:
+        """Analyze image with Florence-2 model"""
+        if not self.model or not self.processor:
+            return {"error": ERROR_MESSAGES["model_not_loaded"], "success": False}
+        try:
+            # Get task configuration
+            task_config = FLORENCE_TASKS.get(task_type, FLORENCE_TASKS["detailed_caption"])
+            task_prompt = task_config["prompt"]
+            # Resize image if too large
+            if image.size[0] > MAX_IMAGE_SIZE[0] or image.size[1] > MAX_IMAGE_SIZE[1]:
+                image.thumbnail(MAX_IMAGE_SIZE, Image.Resampling.LANCZOS)
+                print(f"Resized image to {image.size}")
+            # Process image
+            inputs = self.processor(text=task_prompt, images=image, return_tensors="pt").to(self.device)
+            # Generate
+            generated_ids = self.model.generate(
+                input_ids=inputs["input_ids"],
+                pixel_values=inputs["pixel_values"],
+                max_new_tokens=task_config["max_tokens"],
+                num_beams=3,
+                do_sample=False
+            )
+            # Decode response
+            generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+            parsed_answer = self.processor.post_process_generation(
+                generated_text,
+                task=task_prompt,
+                image_size=(image.width, image.height)
+            )
+            return {
+                "task_type": task_type,
+                "raw_text": generated_text,
+                "parsed_results": parsed_answer,
+                "success": True,
+                "processing_time": time.time()
+            }
+        except Exception as e:
+            return {"error": f"Analysis failed: {str(e)}", "success": False}
+def convert_pdf_to_images(pdf_file) -> List[Image.Image]:
+    """Convert PDF pages to PIL Images"""
+    if not PDF_AVAILABLE:
+        raise ValueError("PDF processing not available. Please install pdf2image.")
+    try:
+        # Handle different input types
+        if hasattr(pdf_file, 'read'):
+            # File-like object
+            pdf_bytes = pdf_file.read()
+            images = convert_from_bytes(pdf_bytes, dpi=PDF_DPI, fmt='RGB')
+        elif isinstance(pdf_file, str) and os.path.exists(pdf_file):
+            # File path
+            images = convert_from_path(pdf_file, dpi=PDF_DPI, fmt='RGB')
+        else:
+            raise ValueError("Invalid PDF input format")
+        # Limit number of pages
+        if len(images) > MAX_PDF_PAGES:
+            print(f"Warning: PDF has {len(images)} pages, processing only first {MAX_PDF_PAGES}")
+            images = images[:MAX_PDF_PAGES]
+        return images
+    except Exception as e:
+        raise ValueError(f"Failed to convert PDF: {str(e)}")
+def draw_bounding_boxes(image: Image.Image, results: Dict[str, Any]) -> Image.Image:
+    """Draw bounding boxes and labels on image"""
+    if not results.get("success", False):
+        return image
+    # Create a copy to draw on
+    annotated_image = image.copy()
+    draw = ImageDraw.Draw(annotated_image)
+    try:
+        # Load a font
+        try:
+            font = ImageFont.truetype("arial.ttf", FONT_SIZE)
+        except:
+            try:
+                font = ImageFont.truetype("DejaVuSans.ttf", FONT_SIZE)
+            except:
+                font = ImageFont.load_default()
+        parsed_results = results.get("parsed_results", {})
+        # Handle object detection and dense captioning results
+        if "bboxes" in parsed_results and "labels" in parsed_results:
+            bboxes = parsed_results["bboxes"]
+            labels = parsed_results["labels"]
+            for i, (bbox, label) in enumerate(zip(bboxes, labels)):
+                color = BBOX_COLORS[i % len(BBOX_COLORS)]
+                x1, y1, x2, y2 = bbox
+                # Draw bounding box
+                draw.rectangle([x1, y1, x2, y2], outline=color, width=BBOX_WIDTH)
+                # Prepare label text (truncate if too long)
+                display_label = label if len(label) <= 30 else f"{label[:27]}..."
+                # Draw label background
+                text_bbox = draw.textbbox((x1, y1), display_label, font=font)
+                text_width = text_bbox[2] - text_bbox[0]
+                text_height = text_bbox[3] - text_bbox[1]
+                # Ensure label fits within image bounds
+                label_x = min(x1, image.width - text_width - 5)
+                label_y = max(y1 - text_height - 5, 5)
+                # Draw background rectangle
+                draw.rectangle([label_x - 2, label_y - 2, label_x + text_width + 2, label_y + text_height + 2],
+                             fill=color)
+                # Draw label text
+                draw.text((label_x, label_y), display_label, fill="white", font=font)
+        # Handle OCR results
+        elif "quad_boxes" in parsed_results and "labels" in parsed_results:
+            quad_boxes = parsed_results["quad_boxes"]
+            labels = parsed_results["labels"]
+            for i, (quad, label) in enumerate(zip(quad_boxes, labels)):
+                color = BBOX_COLORS[i % len(BBOX_COLORS)]
+                # Draw quadrilateral for OCR results
+                if len(quad) >= 8:  # quad should have 8 coordinates (4 points)
+                    points = [(quad[j], quad[j+1]) for j in range(0, 8, 2)]
+                    draw.polygon(points, outline=color, width=BBOX_WIDTH)
+                    # Draw label near first point
+                    x, y = points[0]
+                    display_label = label if len(label) <= 20 else f"{label[:17]}..."
+                    text_bbox = draw.textbbox((x, y), display_label, font=font)
+                    draw.rectangle([text_bbox[0]-2, text_bbox[1]-2, text_bbox[2]+2, text_bbox[3]+2],
+                                 fill=color)
+                    draw.text((x, y), display_label, fill="white", font=font)
+    except Exception as e:
+        print(f"Error drawing annotations: {e}")
+    return annotated_image
+def process_uploaded_file(file, task_type: str) -> Tuple[List[Image.Image], List[Image.Image], str]:
+    """Process uploaded file (image or PDF) and return original and annotated versions"""
+    if file is None:
+        return [], [], "No file uploaded."
+    analyzer = Florence2Analyzer()
+    original_images = []
+    annotated_images = []
+    status_message = ""
+    try:
+        # Determine file type
+        file_extension = Path(file.name).suffix.lower()
+        if file_extension == '.pdf':
+            if not PDF_AVAILABLE:
+                return [], [], "PDF processing not available. Please install pdf2image."
+            # Convert PDF to images
+            status_message += f"Converting PDF to images...\n"
+            pdf_images = convert_pdf_to_images(file)
+            status_message += f"Successfully converted {len(pdf_images)} pages.\n"
+            for i, img in enumerate(pdf_images):
+                status_message += f"Processing page {i+1}...\n"
+                # Analyze with Florence-2
+                results = analyzer.analyze_image(img, task_type)
+                if results.get("success", False):
+                    annotated_img = draw_bounding_boxes(img, results)
+                    original_images.append(img)
+                    annotated_images.append(annotated_img)
+                    status_message += f"Page {i+1} analyzed successfully.\n"
+                else:
+                    status_message += f"Page {i+1} analysis failed: {results.get('error', 'Unknown error')}\n"
+                    original_images.append(img)
+                    annotated_images.append(img)  # Fallback to original
+        elif file_extension in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
+            # Process single image
+            status_message += "Processing image...\n"
+            img = Image.open(file).convert('RGB')
+            results = analyzer.analyze_image(img, task_type)
+            if results.get("success", False):
+                annotated_img = draw_bounding_boxes(img, results)
+                original_images.append(img)
+                annotated_images.append(annotated_img)
+                status_message += "Image analyzed successfully.\n"
+                # Add detailed results to status
+                if "parsed_results" in results:
+                    parsed = results["parsed_results"]
+                    if task_type == "detailed_caption" and isinstance(parsed, dict):
+                        caption = parsed.get("detailed_caption", "No caption generated")
+                        status_message += f"Caption: {caption}\n"
+                    elif "labels" in parsed:
+                        labels = parsed["labels"]
+                        status_message += f"Detected objects: {', '.join(labels[:5])}{'...' if len(labels) > 5 else ''}\n"
+            else:
+                status_message += f"Analysis failed: {results.get('error', 'Unknown error')}\n"
+                original_images.append(img)
+                annotated_images.append(img)
+        else:
+            return [], [], f"Unsupported file type: {file_extension}. Please upload PNG, JPG, JPEG, or PDF files."
+    except Exception as e:
+        return [], [], f"Error processing file: {str(e)}"
+    return original_images, annotated_images, status_message
+def create_gallery_content(original_images: List[Image.Image], annotated_images: List[Image.Image]) -> List[Tuple[Image.Image, str]]:
+    """Create content for Gradio gallery showing both original and annotated versions"""
+    gallery_content = []
+    for i, (orig, anno) in enumerate(zip(original_images, annotated_images)):
+        # Add original image
+        gallery_content.append((orig, f"Page/Image {i+1} - Original"))
+        # Add annotated image
+        gallery_content.append((anno, f"Page/Image {i+1} - Analyzed"))
+    return gallery_content
+# Create Gradio interface
+def create_interface():
+    """Build and return the Gradio Blocks app (it is not launched here).
+
+    Layout: a left column with the file upload, task dropdown, Analyze button
+    and status box, and a wider right column with the results gallery. The
+    button click runs ``process_uploaded_file`` and fills gallery and status.
+    """
+    with gr.Blocks(title="Florence-2 Document & Image Analyzer", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("""
+        # 📄 Florence-2 Document & Image Analyzer
+        Upload images (PNG, JPG, JPEG) or PDF documents to analyze them with Microsoft's Florence-2 vision model.
+        The model can detect objects, generate captions, perform OCR, and more!
+        """)
+        with gr.Row():
+            with gr.Column(scale=1):
+                # NOTE(review): the upload filter allows PNG/JPG/JPEG/PDF only, while the
+                # tips text below and process_uploaded_file also mention BMP and TIFF.
+                file_upload = gr.File(
+                    label="Upload Image or PDF",
+                    file_types=[".png", ".jpg", ".jpeg", ".pdf"],
+                    type="filepath"
+                )
+                # Each choice is a (description shown to the user, task key) pair.
+                # NOTE(review): assumes "object_detection" is a key of FLORENCE_TASKS — confirm.
+                task_type = gr.Dropdown(
+                    choices=[(config["description"], task_name) for task_name, config in FLORENCE_TASKS.items()],
+                    value="object_detection",
+                    label="Analysis Type",
+                    info="Choose what type of analysis to perform"
+                )
+                analyze_btn = gr.Button("🔍 Analyze", variant="primary")
+                status_text = gr.Textbox(
+                    label="Status",
+                    lines=8,
+                    interactive=False,
+                    placeholder="Upload a file and click Analyze to see results..."
+                )
+            with gr.Column(scale=2):
+                gallery = gr.Gallery(
+                    label="Results (Original vs Analyzed)",
+                    show_label=True,
+                    elem_id="gallery",
+                    columns=2,
+                    rows=2,
+                    object_fit="contain",
+                    height="auto"
+                )
+        # Event handler
+        def process_and_display(file, task):
+            """Run the analysis and return ``(gallery_items, status_text)``."""
+            if file is None:
+                return [], "Please upload a file first."
+            original_imgs, annotated_imgs, status = process_uploaded_file(file, task)
+            gallery_content = create_gallery_content(original_imgs, annotated_imgs)
+            return gallery_content, status
+        # Inputs and outputs map positionally onto process_and_display's
+        # parameters and return tuple.
+        analyze_btn.click(
+            fn=process_and_display,
+            inputs=[file_upload, task_type],
+            outputs=[gallery, status_text]
+        )
+        # Example section
+        gr.Markdown("""
+        ## 💡 Tips for Best Results
+        - **Images**: Upload clear, high-resolution images for better analysis
+        - **PDFs**: Multi-page PDFs will be processed page by page
+        - **Object Detection**: Great for identifying and locating objects in images
+        - **Detailed Caption**: Provides comprehensive descriptions of image content
+        - **OCR**: Perfect for extracting text from documents and images
+        - **Dense Captioning**: Provides detailed captions for different regions
+        ## 🎯 Supported Formats
+        - **Images**: PNG, JPG, JPEG, BMP, TIFF
+        - **Documents**: PDF (converted to images automatically)
+        """)
+    return demo
+# Launch the application
+# Runs only when this file is executed as a script, not when it is imported;
+# the built app is bound to the module-level name `demo` before launching.
+if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch()

requirements.txt CHANGED Viewed

@@ -1,5 +1,5 @@
 # Core dependencies
-gradio==4.28.0
 torch>=2.0.0
 torchvision>=0.15.0
 transformers>=4.35.0

 # Core dependencies
+gradio==3.50.2
 torch>=2.0.0
 torchvision>=0.15.0
 transformers>=4.35.0