Felipe Meres committed on
Commit
b14c740
·
1 Parent(s): 9a8f848

Major compatibility fix: Downgrade to Gradio 3.50.2

Browse files

- Downgrade Gradio: 4.28.0 -> 3.50.2 (stable, HF-compatible version)
- Rewrite app.py for Gradio 3.x syntax (gr.Interface instead of gr.Blocks)
- Simplify interface: single image upload, simplified processing
- Remove complex gallery/multi-image support for stability
- Update README SDK version to match
- Focus on core Florence-2 functionality with stable Gradio

Files changed (3) hide show
  1. app.py +58 -231
  2. app_backup.py +383 -0
  3. requirements.txt +1 -1
app.py CHANGED
@@ -2,14 +2,10 @@ import gradio as gr
2
  import torch
3
  from PIL import Image, ImageDraw, ImageFont
4
  import numpy as np
5
- import io
6
- import base64
7
  from pathlib import Path
8
- import tempfile
9
  import os
10
- from typing import List, Tuple, Dict, Any, Optional
11
- import json
12
  import time
 
13
 
14
  # Import configuration
15
  from config import *
@@ -108,32 +104,6 @@ class Florence2Analyzer:
108
  except Exception as e:
109
  return {"error": f"Analysis failed: {str(e)}", "success": False}
110
 
111
- def convert_pdf_to_images(pdf_file) -> List[Image.Image]:
112
- """Convert PDF pages to PIL Images"""
113
- if not PDF_AVAILABLE:
114
- raise ValueError("PDF processing not available. Please install pdf2image.")
115
-
116
- try:
117
- # Handle different input types
118
- if hasattr(pdf_file, 'read'):
119
- # File-like object
120
- pdf_bytes = pdf_file.read()
121
- images = convert_from_bytes(pdf_bytes, dpi=PDF_DPI, fmt='RGB')
122
- elif isinstance(pdf_file, str) and os.path.exists(pdf_file):
123
- # File path
124
- images = convert_from_path(pdf_file, dpi=PDF_DPI, fmt='RGB')
125
- else:
126
- raise ValueError("Invalid PDF input format")
127
-
128
- # Limit number of pages
129
- if len(images) > MAX_PDF_PAGES:
130
- print(f"Warning: PDF has {len(images)} pages, processing only first {MAX_PDF_PAGES}")
131
- images = images[:MAX_PDF_PAGES]
132
-
133
- return images
134
- except Exception as e:
135
- raise ValueError(f"Failed to convert PDF: {str(e)}")
136
-
137
  def draw_bounding_boxes(image: Image.Image, results: Dict[str, Any]) -> Image.Image:
138
  """Draw bounding boxes and labels on image"""
139
  if not results.get("success", False):
@@ -146,12 +116,9 @@ def draw_bounding_boxes(image: Image.Image, results: Dict[str, Any]) -> Image.Im
146
  try:
147
  # Load a font
148
  try:
149
- font = ImageFont.truetype("arial.ttf", FONT_SIZE)
150
  except:
151
- try:
152
- font = ImageFont.truetype("DejaVuSans.ttf", FONT_SIZE)
153
- except:
154
- font = ImageFont.load_default()
155
 
156
  parsed_results = results.get("parsed_results", {})
157
 
@@ -167,217 +134,77 @@ def draw_bounding_boxes(image: Image.Image, results: Dict[str, Any]) -> Image.Im
167
  # Draw bounding box
168
  draw.rectangle([x1, y1, x2, y2], outline=color, width=BBOX_WIDTH)
169
 
170
- # Prepare label text (truncate if too long)
171
- display_label = label if len(label) <= 30 else f"{label[:27]}..."
172
-
173
- # Draw label background
174
- text_bbox = draw.textbbox((x1, y1), display_label, font=font)
175
- text_width = text_bbox[2] - text_bbox[0]
176
- text_height = text_bbox[3] - text_bbox[1]
177
-
178
- # Ensure label fits within image bounds
179
- label_x = min(x1, image.width - text_width - 5)
180
- label_y = max(y1 - text_height - 5, 5)
181
-
182
- # Draw background rectangle
183
- draw.rectangle([label_x - 2, label_y - 2, label_x + text_width + 2, label_y + text_height + 2],
184
- fill=color)
185
-
186
  # Draw label text
187
- draw.text((label_x, label_y), display_label, fill="white", font=font)
188
-
189
- # Handle OCR results
190
- elif "quad_boxes" in parsed_results and "labels" in parsed_results:
191
- quad_boxes = parsed_results["quad_boxes"]
192
- labels = parsed_results["labels"]
193
-
194
- for i, (quad, label) in enumerate(zip(quad_boxes, labels)):
195
- color = BBOX_COLORS[i % len(BBOX_COLORS)]
196
-
197
- # Draw quadrilateral for OCR results
198
- if len(quad) >= 8: # quad should have 8 coordinates (4 points)
199
- points = [(quad[j], quad[j+1]) for j in range(0, 8, 2)]
200
- draw.polygon(points, outline=color, width=BBOX_WIDTH)
201
-
202
- # Draw label near first point
203
- x, y = points[0]
204
- display_label = label if len(label) <= 20 else f"{label[:17]}..."
205
-
206
- text_bbox = draw.textbbox((x, y), display_label, font=font)
207
- draw.rectangle([text_bbox[0]-2, text_bbox[1]-2, text_bbox[2]+2, text_bbox[3]+2],
208
- fill=color)
209
- draw.text((x, y), display_label, fill="white", font=font)
210
 
211
  except Exception as e:
212
  print(f"Error drawing annotations: {e}")
213
 
214
  return annotated_image
215
 
216
- def process_uploaded_file(file, task_type: str) -> Tuple[List[Image.Image], List[Image.Image], str]:
217
- """Process uploaded file (image or PDF) and return original and annotated versions"""
218
  if file is None:
219
- return [], [], "No file uploaded."
220
-
221
- analyzer = Florence2Analyzer()
222
- original_images = []
223
- annotated_images = []
224
- status_message = ""
225
 
226
  try:
227
- # Determine file type
228
- file_extension = Path(file.name).suffix.lower()
229
-
230
- if file_extension == '.pdf':
231
- if not PDF_AVAILABLE:
232
- return [], [], "PDF processing not available. Please install pdf2image."
233
-
234
- # Convert PDF to images
235
- status_message += f"Converting PDF to images...\n"
236
- pdf_images = convert_pdf_to_images(file)
237
- status_message += f"Successfully converted {len(pdf_images)} pages.\n"
238
-
239
- for i, img in enumerate(pdf_images):
240
- status_message += f"Processing page {i+1}...\n"
241
-
242
- # Analyze with Florence-2
243
- results = analyzer.analyze_image(img, task_type)
244
-
245
- if results.get("success", False):
246
- annotated_img = draw_bounding_boxes(img, results)
247
- original_images.append(img)
248
- annotated_images.append(annotated_img)
249
- status_message += f"Page {i+1} analyzed successfully.\n"
250
- else:
251
- status_message += f"Page {i+1} analysis failed: {results.get('error', 'Unknown error')}\n"
252
- original_images.append(img)
253
- annotated_images.append(img) # Fallback to original
254
-
255
- elif file_extension in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
256
- # Process single image
257
- status_message += "Processing image...\n"
258
-
259
  img = Image.open(file).convert('RGB')
260
- results = analyzer.analyze_image(img, task_type)
261
-
262
- if results.get("success", False):
263
- annotated_img = draw_bounding_boxes(img, results)
264
- original_images.append(img)
265
- annotated_images.append(annotated_img)
266
- status_message += "Image analyzed successfully.\n"
267
-
268
- # Add detailed results to status
269
- if "parsed_results" in results:
270
- parsed = results["parsed_results"]
271
- if task_type == "detailed_caption" and isinstance(parsed, dict):
272
- caption = parsed.get("detailed_caption", "No caption generated")
273
- status_message += f"Caption: {caption}\n"
274
- elif "labels" in parsed:
275
- labels = parsed["labels"]
276
- status_message += f"Detected objects: {', '.join(labels[:5])}{'...' if len(labels) > 5 else ''}\n"
277
- else:
278
- status_message += f"Analysis failed: {results.get('error', 'Unknown error')}\n"
279
- original_images.append(img)
280
- annotated_images.append(img)
281
  else:
282
- return [], [], f"Unsupported file type: {file_extension}. Please upload PNG, JPG, JPEG, or PDF files."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
 
284
  except Exception as e:
285
- return [], [], f"Error processing file: {str(e)}"
286
-
287
- return original_images, annotated_images, status_message
288
 
289
- def create_gallery_content(original_images: List[Image.Image], annotated_images: List[Image.Image]) -> List[Tuple[Image.Image, str]]:
290
- """Create content for Gradio gallery showing both original and annotated versions"""
291
- gallery_content = []
292
-
293
- for i, (orig, anno) in enumerate(zip(original_images, annotated_images)):
294
- # Add original image
295
- gallery_content.append((orig, f"Page/Image {i+1} - Original"))
296
- # Add annotated image
297
- gallery_content.append((anno, f"Page/Image {i+1} - Analyzed"))
298
-
299
- return gallery_content
300
 
301
  # Create Gradio interface
302
- def create_interface():
303
- with gr.Blocks(title="Florence-2 Document & Image Analyzer", theme=gr.themes.Soft()) as demo:
304
- gr.Markdown("""
305
- # πŸ“„ Florence-2 Document & Image Analyzer
306
-
307
- Upload images (PNG, JPG, JPEG) or PDF documents to analyze them with Microsoft's Florence-2 vision model.
308
- The model can detect objects, generate captions, perform OCR, and more!
309
- """)
310
-
311
- with gr.Row():
312
- with gr.Column(scale=1):
313
- file_upload = gr.File(
314
- label="Upload Image or PDF",
315
- file_types=[".png", ".jpg", ".jpeg", ".pdf"],
316
- type="filepath"
317
- )
318
-
319
- task_type = gr.Dropdown(
320
- choices=[(config["description"], task_name) for task_name, config in FLORENCE_TASKS.items()],
321
- value="object_detection",
322
- label="Analysis Type",
323
- info="Choose what type of analysis to perform"
324
- )
325
-
326
- analyze_btn = gr.Button("πŸ” Analyze", variant="primary")
327
-
328
- status_text = gr.Textbox(
329
- label="Status",
330
- lines=8,
331
- interactive=False,
332
- placeholder="Upload a file and click Analyze to see results..."
333
- )
334
-
335
- with gr.Column(scale=2):
336
- gallery = gr.Gallery(
337
- label="Results (Original vs Analyzed)",
338
- show_label=True,
339
- elem_id="gallery",
340
- columns=2,
341
- rows=2,
342
- object_fit="contain",
343
- height="auto"
344
- )
345
-
346
- # Event handler
347
- def process_and_display(file, task):
348
- if file is None:
349
- return [], "Please upload a file first."
350
-
351
- original_imgs, annotated_imgs, status = process_uploaded_file(file, task)
352
- gallery_content = create_gallery_content(original_imgs, annotated_imgs)
353
-
354
- return gallery_content, status
355
-
356
- analyze_btn.click(
357
- fn=process_and_display,
358
- inputs=[file_upload, task_type],
359
- outputs=[gallery, status_text]
360
- )
361
-
362
- # Example section
363
- gr.Markdown("""
364
- ## πŸ’‘ Tips for Best Results
365
-
366
- - **Images**: Upload clear, high-resolution images for better analysis
367
- - **PDFs**: Multi-page PDFs will be processed page by page
368
- - **Object Detection**: Great for identifying and locating objects in images
369
- - **Detailed Caption**: Provides comprehensive descriptions of image content
370
- - **OCR**: Perfect for extracting text from documents and images
371
- - **Dense Captioning**: Provides detailed captions for different regions
372
-
373
- ## 🎯 Supported Formats
374
- - **Images**: PNG, JPG, JPEG, BMP, TIFF
375
- - **Documents**: PDF (converted to images automatically)
376
- """)
377
-
378
- return demo
379
-
380
- # Launch the application
381
  if __name__ == "__main__":
382
- demo = create_interface()
383
  demo.launch()
 
2
  import torch
3
  from PIL import Image, ImageDraw, ImageFont
4
  import numpy as np
 
 
5
  from pathlib import Path
 
6
  import os
 
 
7
  import time
8
+ from typing import List, Dict, Any
9
 
10
  # Import configuration
11
  from config import *
 
104
  except Exception as e:
105
  return {"error": f"Analysis failed: {str(e)}", "success": False}
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  def draw_bounding_boxes(image: Image.Image, results: Dict[str, Any]) -> Image.Image:
108
  """Draw bounding boxes and labels on image"""
109
  if not results.get("success", False):
 
116
  try:
117
  # Load a font
118
  try:
119
+ font = ImageFont.load_default()
120
  except:
121
+ font = None
 
 
 
122
 
123
  parsed_results = results.get("parsed_results", {})
124
 
 
134
  # Draw bounding box
135
  draw.rectangle([x1, y1, x2, y2], outline=color, width=BBOX_WIDTH)
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  # Draw label text
138
+ if font:
139
+ draw.text((x1, max(y1-20, 0)), label[:30], fill=color, font=font)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
  except Exception as e:
142
  print(f"Error drawing annotations: {e}")
143
 
144
  return annotated_image
145
 
146
_cached_analyzer = None


def _get_cached_analyzer():
    """Return a reusable Florence2Analyzer, loading the model at most once."""
    global _cached_analyzer
    if _cached_analyzer is not None:
        return _cached_analyzer

    analyzer = Florence2Analyzer()
    # Keep the instance only when loading succeeded, so a failed load is
    # retried on the next request instead of being cached forever.
    if analyzer.model is not None and analyzer.processor is not None:
        _cached_analyzer = analyzer
    return analyzer


def process_image(file, task_type):
    """Analyze one uploaded image and return ``(image, status_text)``.

    Args:
        file: ``None``, a filesystem path (``str`` or ``os.PathLike``), or an
            object exposing the path as ``.name``.
        task_type: Task identifier forwarded to ``Florence2Analyzer.analyze_image``.

    Returns:
        A 2-tuple. The first item is the annotated image on success, the
        unannotated image when analysis fails, or ``None`` when nothing was
        uploaded or an exception occurred. The second item is a status string.
    """
    if file is None:
        return None, "Please upload a file first."

    try:
        # Load image
        image_path = file if isinstance(file, (str, os.PathLike)) else file.name
        img = Image.open(image_path).convert('RGB')

        # Analyze with Florence-2. The analyzer is shared between requests
        # because constructing one loads the model weights.
        analyzer = _get_cached_analyzer()
        results = analyzer.analyze_image(img, task_type)

        if not results.get("success", False):
            return img, f"Analysis failed: {results.get('error', 'Unknown error')}"

        annotated_img = draw_bounding_boxes(img, results)
        status = "Image analyzed successfully!"

        # Add results info
        if "parsed_results" in results:
            parsed = results["parsed_results"]
            if task_type == "detailed_caption" and isinstance(parsed, dict):
                caption = parsed.get("detailed_caption", "No caption generated")
                status += f"\n\nCaption: {caption}"
            elif "labels" in parsed:
                labels = parsed["labels"]
                # Only the first five labels are listed to keep the status short.
                status += f"\n\nDetected objects: {', '.join(labels[:5])}"

        return annotated_img, status

    except Exception as e:
        # UI boundary: report the problem in the status box rather than raising.
        return None, f"Error processing file: {str(e)}"
 
 
182
 
183
# Task choices
# Task identifiers offered in the dropdown; the selected string is passed to
# process_image as its task_type argument.
task_choices = [
    "object_detection",
    "detailed_caption",
    "dense_captioning",
    "ocr",
    "region_proposal"
]

# Create Gradio interface
# Two inputs (file, task) feed process_image; its two return values fill the
# image and textbox outputs in the same order.
demo = gr.Interface(
    fn=process_image,
    inputs=[
        gr.File(label="Upload Image", file_types=["image"]),
        gr.Dropdown(choices=task_choices, value="object_detection", label="Analysis Type")
    ],
    outputs=[
        gr.Image(label="Analyzed Image"),
        gr.Textbox(label="Status", lines=5)
    ],
    title="πŸ“„ Florence-2 Document & Image Analyzer",
    description="Upload images to analyze them with Microsoft's Florence-2 vision model. The model can detect objects, generate captions, perform OCR, and more!",
    theme="soft",
    allow_flagging="never"
)
208
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  if __name__ == "__main__":
 
210
  demo.launch()
app_backup.py ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from PIL import Image, ImageDraw, ImageFont
4
+ import numpy as np
5
+ import io
6
+ import base64
7
+ from pathlib import Path
8
+ import tempfile
9
+ import os
10
+ from typing import List, Tuple, Dict, Any, Optional
11
+ import json
12
+ import time
13
+
14
+ # Import configuration
15
+ from config import *
16
+
17
+ # PDF processing
18
+ try:
19
+ from pdf2image import convert_from_path, convert_from_bytes
20
+ PDF_AVAILABLE = True
21
+ except ImportError:
22
+ PDF_AVAILABLE = False
23
+ print("Warning: pdf2image not available. PDF processing will be disabled.")
24
+
25
+ # Florence-2 model imports
26
+ try:
27
+ from transformers import AutoProcessor, AutoModelForCausalLM
28
+ FLORENCE_AVAILABLE = True
29
+ except ImportError:
30
+ FLORENCE_AVAILABLE = False
31
+ print("Warning: transformers not available. Florence-2 processing will be disabled.")
32
+
33
class Florence2Analyzer:
    """Load a Florence-2 model and run single-image analysis tasks with it.

    The model and processor are loaded in ``__init__``. When loading fails, or
    ``transformers`` could not be imported, both stay ``None`` and
    ``analyze_image`` returns an error dict instead of raising.
    """

    def __init__(self):
        self.model = None
        self.processor = None
        self.device = "cpu" if FORCE_CPU else ("cuda" if torch.cuda.is_available() else "cpu")
        # Half precision only when actually running on CUDA; inputs must be
        # cast to this same dtype before generation.
        self.dtype = torch.float16 if self.device == "cuda" else torch.float32
        self._load_model()

    def _load_model(self):
        """Load Florence-2 model and processor.

        Failures are printed and leave ``model``/``processor`` as ``None``.
        """
        if not FLORENCE_AVAILABLE:
            print("Florence-2 not available - transformers library not found")
            return

        try:
            print(f"Loading Florence-2 model: {FLORENCE_MODEL_ID}")
            start_time = time.time()

            self.model = AutoModelForCausalLM.from_pretrained(
                FLORENCE_MODEL_ID,
                torch_dtype=self.dtype,
                trust_remote_code=True
            ).to(self.device)

            self.processor = AutoProcessor.from_pretrained(FLORENCE_MODEL_ID, trust_remote_code=True)

            load_time = time.time() - start_time
            print(f"Florence-2 model loaded successfully on {self.device} in {load_time:.2f} seconds")
        except Exception as e:
            print(f"Error loading Florence-2 model: {e}")
            # Reset both so a half-loaded state is never used.
            self.model = None
            self.processor = None

    def analyze_image(self, image: Image.Image, task_type: str = "detailed_caption") -> Dict[str, Any]:
        """Analyze image with Florence-2 model.

        Args:
            image: Image to analyze. If it exceeds ``MAX_IMAGE_SIZE`` it is
                shrunk in place (``Image.thumbnail`` mutates its receiver), so
                the caller's object is resized too.
            task_type: Key into ``FLORENCE_TASKS``; unknown keys fall back to
                the ``"detailed_caption"`` entry.

        Returns:
            On success, a dict with ``task_type``, ``raw_text``,
            ``parsed_results``, ``success=True`` and ``processing_time``
            (elapsed seconds for this call). On failure, a dict with ``error``
            and ``success=False``.
        """
        if self.model is None or self.processor is None:
            return {"error": ERROR_MESSAGES["model_not_loaded"], "success": False}

        started_at = time.time()
        try:
            # Get task configuration
            task_config = FLORENCE_TASKS.get(task_type, FLORENCE_TASKS["detailed_caption"])
            task_prompt = task_config["prompt"]

            # Resize image if too large
            if image.size[0] > MAX_IMAGE_SIZE[0] or image.size[1] > MAX_IMAGE_SIZE[1]:
                image.thumbnail(MAX_IMAGE_SIZE, Image.Resampling.LANCZOS)
                print(f"Resized image to {image.size}")

            # Process image. Floating-point inputs are cast to the model's
            # dtype; otherwise float32 pixel values meet float16 weights on GPU.
            inputs = self.processor(
                text=task_prompt, images=image, return_tensors="pt"
            ).to(self.device, self.dtype)

            # Generate
            generated_ids = self.model.generate(
                input_ids=inputs["input_ids"],
                pixel_values=inputs["pixel_values"],
                max_new_tokens=task_config["max_tokens"],
                num_beams=3,
                do_sample=False
            )

            # Decode response; special tokens are kept because the
            # post-processing step parses them.
            generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
            parsed_answer = self.processor.post_process_generation(
                generated_text,
                task=task_prompt,
                image_size=(image.width, image.height)
            )

            return {
                "task_type": task_type,
                "raw_text": generated_text,
                "parsed_results": parsed_answer,
                "success": True,
                "processing_time": time.time() - started_at
            }

        except Exception as e:
            return {"error": f"Analysis failed: {str(e)}", "success": False}
110
+
111
def convert_pdf_to_images(pdf_file) -> List[Image.Image]:
    """Convert PDF pages to PIL Images.

    Args:
        pdf_file: Either an object with a ``read()`` method returning the PDF
            bytes, or a path (``str`` or ``os.PathLike``) to an existing file.

    Returns:
        The rendered pages, truncated to the first ``MAX_PDF_PAGES``.

    Raises:
        ValueError: If pdf2image is unavailable, the input is neither a
            readable object nor an existing path, or conversion fails. The
            original exception is attached as ``__cause__``.
    """
    if not PDF_AVAILABLE:
        raise ValueError("PDF processing not available. Please install pdf2image.")

    try:
        # Handle different input types
        if hasattr(pdf_file, 'read'):
            # File-like object
            pdf_bytes = pdf_file.read()
            images = convert_from_bytes(pdf_bytes, dpi=PDF_DPI, fmt='RGB')
        elif isinstance(pdf_file, (str, os.PathLike)) and os.path.exists(pdf_file):
            # File path
            images = convert_from_path(pdf_file, dpi=PDF_DPI, fmt='RGB')
        else:
            raise ValueError("Invalid PDF input format")
    except Exception as e:
        # Normalize every failure to ValueError but keep the cause for debugging.
        raise ValueError(f"Failed to convert PDF: {str(e)}") from e

    # Limit number of pages
    if len(images) > MAX_PDF_PAGES:
        print(f"Warning: PDF has {len(images)} pages, processing only first {MAX_PDF_PAGES}")
        images = images[:MAX_PDF_PAGES]

    return images
136
+
137
def _load_annotation_font():
    """Return the first loadable TrueType font, else PIL's built-in default."""
    for font_name in ("arial.ttf", "DejaVuSans.ttf"):
        try:
            return ImageFont.truetype(font_name, FONT_SIZE)
        except (OSError, ImportError):
            # Font file missing or FreeType support unavailable; try the next one.
            continue
    return ImageFont.load_default()


def _find_annotation_payload(parsed_results):
    """Return the dict holding ``bboxes``/``quad_boxes``, or ``{}`` if none.

    Accepts both a flat result dict and one nested a level down.
    NOTE(review): the Florence-2 model card shows post-processed output keyed
    by the task prompt (e.g. ``{"<OD>": {...}}``) - confirm against the
    prompts configured in FLORENCE_TASKS.
    """
    if not isinstance(parsed_results, dict):
        return {}
    if "bboxes" in parsed_results or "quad_boxes" in parsed_results:
        return parsed_results
    for value in parsed_results.values():
        if isinstance(value, dict) and ("bboxes" in value or "quad_boxes" in value):
            return value
    return {}


def draw_bounding_boxes(image: Image.Image, results: Dict[str, Any]) -> Image.Image:
    """Draw bounding boxes and labels on image.

    Args:
        image: Source image; it is not modified.
        results: Analysis result dict. Nothing is drawn unless
            ``results["success"]`` is truthy.

    Returns:
        ``image`` itself when the analysis was unsuccessful, otherwise an
        annotated copy. Drawing is best effort: if an error occurs part-way,
        it is printed and the partially annotated copy is returned.
    """
    if not results.get("success", False):
        return image

    # Create a copy to draw on
    annotated_image = image.copy()
    draw = ImageDraw.Draw(annotated_image)

    try:
        font = _load_annotation_font()
        payload = _find_annotation_payload(results.get("parsed_results", {}))

        # Handle object detection and dense captioning results
        if "bboxes" in payload and "labels" in payload:
            for i, (bbox, label) in enumerate(zip(payload["bboxes"], payload["labels"])):
                color = BBOX_COLORS[i % len(BBOX_COLORS)]
                x1, y1, x2, y2 = bbox

                # Draw bounding box
                draw.rectangle([x1, y1, x2, y2], outline=color, width=BBOX_WIDTH)

                # Prepare label text (truncate if too long)
                display_label = label if len(label) <= 30 else f"{label[:27]}..."

                # Measure the label so its background can be sized
                text_bbox = draw.textbbox((x1, y1), display_label, font=font)
                text_width = text_bbox[2] - text_bbox[0]
                text_height = text_bbox[3] - text_bbox[1]

                # Ensure label fits within image bounds; clamp at 0 so a label
                # wider than the image is not pushed off the left edge.
                label_x = max(min(x1, image.width - text_width - 5), 0)
                label_y = max(y1 - text_height - 5, 5)

                # Draw background rectangle
                draw.rectangle(
                    [label_x - 2, label_y - 2, label_x + text_width + 2, label_y + text_height + 2],
                    fill=color,
                )

                # Draw label text
                draw.text((label_x, label_y), display_label, fill="white", font=font)

        # Handle OCR results
        elif "quad_boxes" in payload and "labels" in payload:
            for i, (quad, label) in enumerate(zip(payload["quad_boxes"], payload["labels"])):
                color = BBOX_COLORS[i % len(BBOX_COLORS)]

                # Draw quadrilateral for OCR results
                if len(quad) >= 8:  # quad should have 8 coordinates (4 points)
                    points = [(quad[j], quad[j + 1]) for j in range(0, 8, 2)]
                    draw.polygon(points, outline=color, width=BBOX_WIDTH)

                    # Draw label near first point
                    x, y = points[0]
                    display_label = label if len(label) <= 20 else f"{label[:17]}..."

                    text_bbox = draw.textbbox((x, y), display_label, font=font)
                    draw.rectangle(
                        [text_bbox[0] - 2, text_bbox[1] - 2, text_bbox[2] + 2, text_bbox[3] + 2],
                        fill=color,
                    )
                    draw.text((x, y), display_label, fill="white", font=font)

    except Exception as e:
        # Annotation is cosmetic; never let it break the request.
        print(f"Error drawing annotations: {e}")

    return annotated_image
215
+
216
_shared_analyzer = None


def _get_shared_analyzer():
    """Return a reusable Florence2Analyzer, loading the model at most once."""
    global _shared_analyzer
    if _shared_analyzer is not None:
        return _shared_analyzer

    analyzer = Florence2Analyzer()
    # Keep the instance only when loading succeeded, so a failed load is
    # retried on the next request instead of being cached forever.
    if analyzer.model is not None and analyzer.processor is not None:
        _shared_analyzer = analyzer
    return analyzer


def process_uploaded_file(file, task_type: str) -> Tuple[List[Image.Image], List[Image.Image], str]:
    """Process uploaded file (image or PDF) and return original and annotated versions.

    Args:
        file: ``None``, a filesystem path (``str`` or ``os.PathLike``), or an
            object exposing the path as ``.name``.
        task_type: Task identifier forwarded to ``Florence2Analyzer.analyze_image``.

    Returns:
        ``(original_images, annotated_images, status_message)``. The two lists
        are index-aligned; when a page or image fails analysis, the original
        is used as its "annotated" entry. On an unsupported extension or an
        unexpected exception both lists are empty and the message explains why.
    """
    if file is None:
        return [], [], "No file uploaded."

    # Constructing an analyzer loads the model, so one instance is shared.
    analyzer = _get_shared_analyzer()
    original_images = []
    annotated_images = []
    status_message = ""

    try:
        # A path-typed upload arrives as a plain string, which has no ``.name``.
        file_path = os.fspath(file) if isinstance(file, (str, os.PathLike)) else file.name

        # Determine file type
        file_extension = Path(file_path).suffix.lower()

        if file_extension == '.pdf':
            if not PDF_AVAILABLE:
                return [], [], "PDF processing not available. Please install pdf2image."

            # Convert PDF to images
            status_message += f"Converting PDF to images...\n"
            pdf_images = convert_pdf_to_images(file_path)
            status_message += f"Successfully converted {len(pdf_images)} pages.\n"

            for i, img in enumerate(pdf_images):
                status_message += f"Processing page {i+1}...\n"

                # Analyze with Florence-2
                results = analyzer.analyze_image(img, task_type)

                if results.get("success", False):
                    annotated_img = draw_bounding_boxes(img, results)
                    original_images.append(img)
                    annotated_images.append(annotated_img)
                    status_message += f"Page {i+1} analyzed successfully.\n"
                else:
                    status_message += f"Page {i+1} analysis failed: {results.get('error', 'Unknown error')}\n"
                    original_images.append(img)
                    annotated_images.append(img)  # Fallback to original

        elif file_extension in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
            # Process single image
            status_message += "Processing image...\n"

            img = Image.open(file_path).convert('RGB')
            results = analyzer.analyze_image(img, task_type)

            if results.get("success", False):
                annotated_img = draw_bounding_boxes(img, results)
                original_images.append(img)
                annotated_images.append(annotated_img)
                status_message += "Image analyzed successfully.\n"

                # Add detailed results to status
                if "parsed_results" in results:
                    parsed = results["parsed_results"]
                    if task_type == "detailed_caption" and isinstance(parsed, dict):
                        caption = parsed.get("detailed_caption", "No caption generated")
                        status_message += f"Caption: {caption}\n"
                    elif "labels" in parsed:
                        labels = parsed["labels"]
                        status_message += f"Detected objects: {', '.join(labels[:5])}{'...' if len(labels) > 5 else ''}\n"
            else:
                status_message += f"Analysis failed: {results.get('error', 'Unknown error')}\n"
                original_images.append(img)
                annotated_images.append(img)
        else:
            return [], [], f"Unsupported file type: {file_extension}. Please upload PNG, JPG, JPEG, or PDF files."

    except Exception as e:
        # UI boundary: report the problem in the status box rather than raising.
        return [], [], f"Error processing file: {str(e)}"

    return original_images, annotated_images, status_message
288
+
289
def create_gallery_content(original_images: List[Image.Image], annotated_images: List[Image.Image]) -> List[Tuple[Image.Image, str]]:
    """Interleave originals and annotated images as captioned gallery entries.

    For each index-aligned pair the result holds the original followed by the
    annotated image, each paired with a 1-based "Page/Image N" caption. Pairing
    stops at the shorter of the two lists.
    """
    return [
        entry
        for number, (original, annotated) in enumerate(
            zip(original_images, annotated_images), start=1
        )
        for entry in (
            (original, f"Page/Image {number} - Original"),
            (annotated, f"Page/Image {number} - Analyzed"),
        )
    ]
300
+
301
+ # Create Gradio interface
302
def create_interface():
    """Build and return the Gradio Blocks app (it is not launched here).

    Layout: a header, then a row with a control column (file upload, task
    dropdown, analyze button, status box) and a wider gallery column, followed
    by a static tips section. Clicking the button runs ``process_and_display``.
    """
    with gr.Blocks(title="Florence-2 Document & Image Analyzer", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # πŸ“„ Florence-2 Document & Image Analyzer

        Upload images (PNG, JPG, JPEG) or PDF documents to analyze them with Microsoft's Florence-2 vision model.
        The model can detect objects, generate captions, perform OCR, and more!
        """)

        with gr.Row():
            with gr.Column(scale=1):
                # type="filepath": the handler receives the upload as a path.
                file_upload = gr.File(
                    label="Upload Image or PDF",
                    file_types=[".png", ".jpg", ".jpeg", ".pdf"],
                    type="filepath"
                )

                # Each choice is a (description, task_name) pair built from
                # FLORENCE_TASKS.
                task_type = gr.Dropdown(
                    choices=[(config["description"], task_name) for task_name, config in FLORENCE_TASKS.items()],
                    value="object_detection",
                    label="Analysis Type",
                    info="Choose what type of analysis to perform"
                )

                analyze_btn = gr.Button("πŸ” Analyze", variant="primary")

                status_text = gr.Textbox(
                    label="Status",
                    lines=8,
                    interactive=False,
                    placeholder="Upload a file and click Analyze to see results..."
                )

            with gr.Column(scale=2):
                gallery = gr.Gallery(
                    label="Results (Original vs Analyzed)",
                    show_label=True,
                    elem_id="gallery",
                    columns=2,
                    rows=2,
                    object_fit="contain",
                    height="auto"
                )

        # Event handler
        def process_and_display(file, task):
            """Return ``(gallery_items, status)`` for the uploaded file and task."""
            if file is None:
                return [], "Please upload a file first."

            original_imgs, annotated_imgs, status = process_uploaded_file(file, task)
            gallery_content = create_gallery_content(original_imgs, annotated_imgs)

            return gallery_content, status

        # Output order matches the handler's return tuple: gallery, then status.
        analyze_btn.click(
            fn=process_and_display,
            inputs=[file_upload, task_type],
            outputs=[gallery, status_text]
        )

        # Example section
        gr.Markdown("""
        ## πŸ’‘ Tips for Best Results

        - **Images**: Upload clear, high-resolution images for better analysis
        - **PDFs**: Multi-page PDFs will be processed page by page
        - **Object Detection**: Great for identifying and locating objects in images
        - **Detailed Caption**: Provides comprehensive descriptions of image content
        - **OCR**: Perfect for extracting text from documents and images
        - **Dense Captioning**: Provides detailed captions for different regions

        ## 🎯 Supported Formats
        - **Images**: PNG, JPG, JPEG, BMP, TIFF
        - **Documents**: PDF (converted to images automatically)
        """)

    return demo
379
+
380
+ # Launch the application
381
+ if __name__ == "__main__":
382
+ demo = create_interface()
383
+ demo.launch()
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
  # Core dependencies
2
- gradio==4.28.0
3
  torch>=2.0.0
4
  torchvision>=0.15.0
5
  transformers>=4.35.0
 
1
  # Core dependencies
2
+ gradio==3.50.2
3
  torch>=2.0.0
4
  torchvision>=0.15.0
5
  transformers>=4.35.0