File size: 13,923 Bytes
1ddb064
20e8d5d
 
 
 
 
 
1ddb064
 
 
 
 
 
 
 
 
 
20e8d5d
 
 
 
1ddb064
 
 
 
 
 
 
1f8f715
63cf06b
1f8f715
1ddb064
 
1f8f715
63cf06b
 
1f8f715
63cf06b
1f8f715
1ddb064
 
8ec6e89
1ddb064
 
 
8ec6e89
 
1ddb064
1f8f715
8ec6e89
 
 
 
1ddb064
1f8f715
1ddb064
 
 
 
1f8f715
1ddb064
63cf06b
1f8f715
63cf06b
1ddb064
63cf06b
 
1ddb064
63cf06b
1ddb064
 
 
63cf06b
 
 
 
 
 
 
1ddb064
63cf06b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ddb064
20e8d5d
 
 
 
 
 
 
 
63cf06b
20e8d5d
 
 
 
 
 
 
 
 
 
63cf06b
20e8d5d
 
1ddb064
20e8d5d
 
 
1ddb064
 
 
 
63cf06b
1ddb064
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63cf06b
1ddb064
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63cf06b
1ddb064
 
63cf06b
1ddb064
 
 
63cf06b
1ddb064
 
 
 
63cf06b
1ddb064
 
 
1f8f715
1ddb064
 
 
 
63cf06b
1ddb064
 
1f8f715
1ddb064
1f8f715
1ddb064
 
1f8f715
1ddb064
 
 
 
63cf06b
1ddb064
 
 
1f8f715
1ddb064
 
63cf06b
1ddb064
 
1f8f715
1ddb064
 
1f8f715
1ddb064
 
 
1f8f715
1ddb064
 
 
1f8f715
1ddb064
 
1f8f715
1ddb064
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f8f715
1ddb064
 
 
1f8f715
 
1ddb064
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f8f715
b14c740
1ddb064
 
 
 
 
 
 
 
 
 
 
 
 
 
20e8d5d
1ddb064
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
import gradio as gr
import torch
from PIL import Image, ImageDraw, ImageFont
import numpy as np
from pathlib import Path
import os
import time
from typing import Dict, Any, Tuple, Optional, List
import tempfile
import io

# PDF processing
# pdf2image is optional: PDF_AVAILABLE gates every PDF code path below,
# so the app still works for plain images when it is not installed.
try:
    from pdf2image import convert_from_bytes, convert_from_path
    PDF_AVAILABLE = True
except ImportError:
    PDF_AVAILABLE = False

# Import configuration
# NOTE(review): star-import supplies FLORENCE_MODEL_ID, FORCE_CPU,
# FLORENCE_TASKS, MAX_IMAGE_SIZE, PDF_DPI, MAX_PDF_PAGES, BBOX_COLORS,
# BBOX_WIDTH, SHARE_LINK, SERVER_PORT — confirm against config.py.
from config import *

# Global variables to store model (similar to Streamlit's session state)
model_cache = {
    'model': None,      # transformers model instance once loaded
    'processor': None,  # matching AutoProcessor
    'device': None,     # "cuda" or "cpu"
    'loaded': False     # True after a successful load_florence_model()
}

def load_florence_model():
    """Return the cached Florence-2 ``(model, processor, device)``, loading on demand.

    The loaded objects are stashed in the module-level ``model_cache`` so
    every call after the first is free.  On any failure ``(None, None,
    None)`` is returned and the error is printed.
    """
    if model_cache['loaded']:
        return model_cache['model'], model_cache['processor'], model_cache['device']

    try:
        from transformers import AutoProcessor, AutoModelForCausalLM

        # Honour the FORCE_CPU override; otherwise prefer CUDA when present.
        if FORCE_CPU:
            device = "cpu"
        else:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        print(f"Loading Florence-2 model on {device}...")

        # fp16 only when actually running on CUDA without the CPU override.
        use_half = torch.cuda.is_available() and not FORCE_CPU
        model = AutoModelForCausalLM.from_pretrained(
            FLORENCE_MODEL_ID,
            torch_dtype=torch.float16 if use_half else torch.float32,
            trust_remote_code=True,
            attn_implementation="eager"  # Use eager attention for compatibility
        )
        model = model.to(device)

        # Fix for transformers compatibility issue
        if hasattr(model, 'config'):
            model.config.use_cache = False

        processor = AutoProcessor.from_pretrained(FLORENCE_MODEL_ID, trust_remote_code=True)

        model_cache.update(model=model, processor=processor, device=device, loaded=True)

        print(f"βœ… Model loaded successfully on {device}")
        return model, processor, device

    except Exception as e:
        print(f"Failed to load Florence-2 model: {e}")
        return None, None, None

def analyze_image(image: Image.Image, task_type: str) -> Dict[str, Any]:
    """Run one Florence-2 task over *image* and return its parsed output.

    Returns ``{"parsed_results": ..., "success": True}`` on success or
    ``{"error": ..., "success": False}`` on failure.  NOTE: an oversized
    image is thumbnailed *in place*, mutating the caller's object — this
    keeps the returned bbox coordinates consistent with the image that is
    later annotated, so do not "fix" by copying.
    """
    model, processor, device = load_florence_model()
    if not model or not processor:
        return {"error": "Model not loaded", "success": False}

    try:
        # Unknown task types silently fall back to detailed captioning.
        task_config = FLORENCE_TASKS.get(task_type, FLORENCE_TASKS["detailed_caption"])
        task_prompt = task_config["prompt"]

        # Shrink in place when either dimension exceeds the configured cap.
        width, height = image.size
        if width > MAX_IMAGE_SIZE[0] or height > MAX_IMAGE_SIZE[1]:
            image.thumbnail(MAX_IMAGE_SIZE, Image.Resampling.LANCZOS)

        inputs = processor(text=task_prompt, images=image, return_tensors="pt").to(device)

        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=task_config["max_tokens"],
            num_beams=3,
            do_sample=False,
        )

        raw_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
        parsed = processor.post_process_generation(
            raw_text,
            task=task_prompt,
            image_size=(image.width, image.height),
        )

        return {"parsed_results": parsed, "success": True}

    except Exception as e:
        return {"error": f"Analysis failed: {str(e)}", "success": False}

def draw_bounding_boxes(image: Image.Image, results: Dict[str, Any]) -> Image.Image:
    """Return a copy of *image* annotated with any boxes/labels in *results*.

    Falls back to the untouched input when the analysis failed, and to the
    unannotated copy when drawing raises for any reason (best-effort).
    """
    if not results.get("success", False):
        return image

    canvas = image.copy()
    draw = ImageDraw.Draw(canvas)

    try:
        font = ImageFont.load_default()
        parsed = results.get("parsed_results", {})

        if "bboxes" in parsed and "labels" in parsed:
            for idx, (box, label) in enumerate(zip(parsed["bboxes"], parsed["labels"])):
                # Cycle through the configured palette per detection.
                color = BBOX_COLORS[idx % len(BBOX_COLORS)]
                left, top, right, bottom = box
                draw.rectangle([left, top, right, bottom], outline=color, width=BBOX_WIDTH)
                # Truncated label just above the box, clamped to the image edge.
                draw.text((left, max(top - 20, 0)), label[:30], fill=color, font=font)

    except Exception as e:
        print(f"Error drawing annotations: {e}")

    return canvas

def process_pdf(pdf_file) -> List[Image.Image]:
    """Render a PDF (path or file-like object) into a list of PIL page images.

    At most ``MAX_PDF_PAGES`` pages are returned.  Raises ``ValueError``
    when pdf2image is unavailable or conversion fails.
    """
    if not PDF_AVAILABLE:
        raise ValueError("PDF processing not available. Please install pdf2image.")

    try:
        # A file-like object exposes .read(); anything else is treated as a path.
        if hasattr(pdf_file, 'read'):
            pages = convert_from_bytes(pdf_file.read(), dpi=PDF_DPI)
        else:
            pages = convert_from_path(pdf_file, dpi=PDF_DPI)

        # Cap the number of pages returned.
        return pages[:MAX_PDF_PAGES]
    except Exception as e:
        raise ValueError(f"Failed to process PDF: {str(e)}")

def format_results_text(results: Dict[str, Any], task_type: str) -> str:
    """Render an ``analyze_image()`` result dict as human-readable markdown.

    Each known task gets a tailored summary; anything unrecognised (or a
    parsed payload missing the expected keys) falls through to a generic
    success message.
    """
    if not results.get("success", False):
        return f"❌ Analysis failed: {results.get('error', 'Unknown error')}"

    parsed = results.get("parsed_results", {})

    if task_type == "detailed_caption":
        if isinstance(parsed, dict) and "detailed_caption" in parsed:
            return f"πŸ“ **Caption:** {parsed['detailed_caption']}"
        if isinstance(parsed, str):
            return f"πŸ“ **Caption:** {parsed}"

    elif task_type == "object_detection":
        if "labels" in parsed and parsed["labels"]:
            labels = parsed["labels"]
            # Show the first ten labels; summarise the remainder.
            shown = ', '.join(labels[:10])
            hidden = len(labels) - 10
            if hidden > 0:
                shown += f" ...and {hidden} more"
            return f"🎯 **Detected Objects ({len(labels)}):** {shown}"

    elif task_type == "ocr":
        if "text" in parsed:
            extracted = parsed.get("text", "")
            if extracted:
                return f"πŸ”€ **Extracted Text:**\n{extracted}"
            return "πŸ”€ **OCR Result:** No text detected in the image"

    elif task_type == "dense_captioning":
        if "labels" in parsed and parsed["labels"]:
            # One bullet per region caption, first five only.
            bullets = [f"β€’ {cap}" for cap in parsed["labels"][:5]]
            return "πŸ“‹ **Region Captions:**\n" + '\n'.join(bullets)

    return "βœ… Analysis completed successfully!"

def process_uploaded_file(file_path: str) -> Tuple[Image.Image, str]:
    """Load the first displayable image from an uploaded image or PDF path.

    Returns ``(image, status_message)``; ``image`` is ``None`` when nothing
    could be loaded, with the reason carried in the status string.
    """
    if file_path is None:
        return None, "Please upload a file first."

    try:
        suffix = Path(file_path).suffix.lower()

        if suffix == '.pdf':
            if not PDF_AVAILABLE:
                return None, "PDF processing not available. Please upload an image instead."

            pages = process_pdf(file_path)
            if not pages:
                return None, "No images found in PDF."

            # Only the first page is displayed for now.
            return pages[0], f"βœ… PDF processed successfully. Showing page 1 of {len(pages)}."

        if suffix in ('.png', '.jpg', '.jpeg'):
            loaded = Image.open(file_path).convert("RGB")
            return loaded, "βœ… Image loaded successfully."

        return None, "Unsupported file format. Please upload PNG, JPG, JPEG, or PDF files."

    except Exception as e:
        return None, f"❌ Error processing file: {str(e)}"

def process_image(image: Image.Image, task_type: str) -> Tuple[Image.Image, str, str]:
    """Analyze *image* with the selected task and package the UI outputs.

    Returns ``(annotated_image, results_markdown, status_markdown)``; all
    three slots hold placeholder text when no image was supplied.
    """
    if image is None:
        return None, "Please upload an image first.", ""

    # Florence-2 expects RGB input.
    if image.mode != "RGB":
        image = image.convert("RGB")

    results = analyze_image(image, task_type)
    annotated = draw_bounding_boxes(image, results)
    summary = format_results_text(results, task_type)

    if results.get("success", False):
        status = f"βœ… Analysis completed successfully using Florence-2 on {model_cache.get('device', 'unknown device')}"
    else:
        status = f"❌ Analysis failed: {results.get('error', 'Unknown error')}"

    return annotated, summary, status

def create_interface():
    """Build and return the Gradio Blocks UI for the analyzer.

    Wires the file upload to an image preview and the analyze button to the
    Florence-2 pipeline (``process_image``).  Nothing is launched here; the
    caller invokes ``.launch()`` on the returned Blocks object.
    """

    # Custom CSS for better styling
    # NOTE(review): .analysis-results is defined but no component below sets
    # elem_classes to use it — possibly vestigial; confirm before removing.
    custom_css = """
    .gradio-container {
        font-family: 'Arial', sans-serif;
    }
    .analysis-results {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 1rem 0;
    }
    """

    with gr.Blocks(title="Florence-2 Document & Image Analyzer", css=custom_css, theme=gr.themes.Soft()) as demo:

        gr.Markdown("""
        # πŸ“„ Florence-2 Document & Image Analyzer

        Upload images to analyze them with Microsoft's Florence-2 vision model.

        **Note:** The model will be loaded automatically on first use (~5GB download, takes 2-3 minutes).
        """)

        # Left column: inputs (file upload, preview, task picker, trigger).
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload Image or PDF",
                    file_types=[".png", ".jpg", ".jpeg", ".pdf"],
                    type="filepath"
                )

                # Read-only preview populated by handle_file_upload below.
                image_input = gr.Image(
                    type="pil",
                    label="Current Image",
                    height=400,
                    interactive=False
                )

                # (display label, internal task key) pairs; keys must match
                # the task_type strings understood by process_image.
                task_dropdown = gr.Dropdown(
                    choices=[
                        ("Object Detection", "object_detection"),
                        ("Detailed Caption", "detailed_caption"),
                        ("OCR (Text Extraction)", "ocr"),
                        ("Dense Captioning", "dense_captioning")
                    ],
                    value="object_detection",
                    label="Analysis Type",
                    info="Choose the type of analysis to perform"
                )

                analyze_btn = gr.Button("πŸ” Analyze Image", variant="primary", size="lg")

            # Right column: outputs (annotated image, markdown results, status).
            with gr.Column():
                annotated_output = gr.Image(
                    label="Analysis Results",
                    height=400
                )

                results_text = gr.Markdown(
                    label="Analysis Details",
                    value="Upload an image and click 'Analyze Image' to get started!"
                )

                status_text = gr.Markdown(
                    value="ℹ️ Ready to analyze images"
                )

        # Event handlers
        def handle_file_upload(file_path):
            # Convert the uploaded path into a preview image + status line.
            if file_path is None:
                return None, "Please upload a file first."
            image, status = process_uploaded_file(file_path)
            return image, status

        def handle_analyze(image, task_type):
            # Thin wrapper so the click handler signature matches the outputs.
            return process_image(image, task_type)

        file_input.change(
            fn=handle_file_upload,
            inputs=[file_input],
            outputs=[image_input, status_text],
            show_progress=True
        )

        analyze_btn.click(
            fn=handle_analyze,
            inputs=[image_input, task_dropdown],
            outputs=[annotated_output, results_text, status_text],
            show_progress=True
        )

        # Information sections
        with gr.Row():
            with gr.Column():
                gr.Markdown("""
                ## ℹ️ About Florence-2

                **Florence-2** is Microsoft's foundation vision model capable of:

                - **🎯 Object Detection**: Identifies and locates objects with bounding boxes
                - **πŸ“ Detailed Caption**: Generates comprehensive descriptions of image content
                - **πŸ”€ OCR**: Extracts and locates text in images
                - **πŸ“‹ Dense Captioning**: Provides detailed captions for different regions

                The model downloads automatically on first use (~5GB) and is cached for subsequent uses.
                """)

            with gr.Column():
                gr.Markdown("""
                ## ⚑ Performance Notes

                - **First run**: Model download may take 2-3 minutes
                - **GPU**: Faster inference when available
                - **CPU**: Works but slower processing
                - **Model size**: ~5GB (cached after first download)
                - **Supported formats**: PNG, JPG, JPEG, PDF
                """)

        # Usage instructions
        gr.Markdown("""
        ## πŸ“‹ How to Use

        1. **Upload a file**: Click "Upload Image or PDF" and choose your file
        2. **Select analysis type**: Choose from the dropdown menu
        3. **Click Analyze**: The image will appear and you can analyze it
        4. **View results**: See the annotated image and detailed analysis

        **Good examples to try:**
        - Photos with objects (cars, people, animals)
        - Screenshots with text for OCR
        - Documents or diagrams for analysis
        - Multi-object scenes for detection
        """)

    return demo

def main():
    """Entry point: build the interface and start the Gradio server."""
    app = create_interface()

    # SHARE_LINK / SERVER_PORT come from the star-imported config module.
    launch_options = {
        "share": SHARE_LINK,
        "server_port": SERVER_PORT,
        "show_error": True,
        "quiet": False,
    }
    app.launch(**launch_options)

# Launch only when executed as a script (not when imported as a module).
if __name__ == "__main__":
    main()