Felipe Meres commited on
Commit
1ddb064
Β·
1 Parent(s): 1f8f715

Convert Florence-2 space from Streamlit to Gradio

Browse files

Major improvements:
- βœ… Updated to Gradio 4.44.0+ for better HF Spaces compatibility
- βœ… Enhanced PDF processing with multi-page support
- βœ… Improved file upload handling for images and PDFs
- βœ… Better responsive UI with two-column layout
- βœ… Progressive loading and status indicators
- βœ… Custom styling with Gradio Soft theme
- βœ… Enhanced error handling and user feedback
- βœ… Mobile-friendly responsive design

Technical changes:
- Replaced Streamlit session state with global model cache
- Added comprehensive PDF processing with pdf2image
- Implemented Gradio's modern component patterns
- Updated dependencies for optimal HF Spaces performance
- Maintained all Florence-2 model functionality

Ready for production deployment on Hugging Face Spaces.

Files changed (3) hide show
  1. README.md +4 -4
  2. app.py +289 -99
  3. requirements.txt +1 -1
README.md CHANGED
@@ -3,7 +3,7 @@ title: Florence-2 Document & Image Analyzer
3
  emoji: πŸ“„
4
  colorFrom: blue
5
  colorTo: purple
6
- sdk: streamlit
7
 
8
  app_file: app.py
9
  pinned: false
@@ -63,13 +63,13 @@ Upload any document or image to see Florence-2 in action:
63
  - **Technical diagrams**: Component identification and labeling
64
  # Florence-2 Document & Image Analyzer
65
 
66
- This Space uses Streamlit to provide an interactive interface for Microsoft's Florence-2 vision model.
67
 
68
  ## Features
69
  - Object Detection with bounding boxes
70
- - Detailed image captioning
71
  - OCR text extraction
72
- - Interactive Streamlit interface
73
  - Model caching for performance
74
 
75
  Upload an image and select an analysis type to get started!
 
3
  emoji: πŸ“„
4
  colorFrom: blue
5
  colorTo: purple
6
+ sdk: gradio
7
 
8
  app_file: app.py
9
  pinned: false
 
63
  - **Technical diagrams**: Component identification and labeling
64
  # Florence-2 Document & Image Analyzer
65
 
66
+ This Space uses Gradio to provide an interactive interface for Microsoft's Florence-2 vision model.
67
 
68
  ## Features
69
  - Object Detection with bounding boxes
70
+ - Detailed image captioning
71
  - OCR text extraction
72
+ - Interactive Gradio interface
73
  - Model caching for performance
74
 
75
  Upload an image and select an analysis type to get started!
app.py CHANGED
@@ -1,54 +1,69 @@
1
- import streamlit as st
2
  import torch
3
  from PIL import Image, ImageDraw, ImageFont
4
  import numpy as np
5
  from pathlib import Path
6
  import os
7
  import time
8
- from typing import Dict, Any
 
 
 
 
 
 
 
 
 
9
 
10
  # Import configuration
11
  from config import *
12
 
13
- # Initialize session state for model
14
- if 'model_loaded' not in st.session_state:
15
- st.session_state.model_loaded = False
16
- st.session_state.model = None
17
- st.session_state.processor = None
18
- st.session_state.device = None
 
19
 
20
  def load_florence_model():
21
  """Load Florence-2 model and processor on-demand"""
22
- if st.session_state.model_loaded:
23
- return st.session_state.model, st.session_state.processor, st.session_state.device
24
 
25
  try:
26
  from transformers import AutoProcessor, AutoModelForCausalLM
27
 
28
  device = "cpu" if FORCE_CPU else ("cuda" if torch.cuda.is_available() else "cpu")
29
 
30
- with st.spinner(f"Loading Florence-2 model on {device}... This may take a few minutes."):
31
- model = AutoModelForCausalLM.from_pretrained(
32
- FLORENCE_MODEL_ID,
33
- torch_dtype=torch.float16 if (torch.cuda.is_available() and not FORCE_CPU) else torch.float32,
34
- trust_remote_code=True
35
- ).to(device)
 
36
 
37
- processor = AutoProcessor.from_pretrained(FLORENCE_MODEL_ID, trust_remote_code=True)
38
 
39
- st.session_state.model = model
40
- st.session_state.processor = processor
41
- st.session_state.device = device
42
- st.session_state.model_loaded = True
43
 
 
44
  return model, processor, device
45
 
46
  except Exception as e:
47
- st.error(f"Failed to load Florence-2 model: {e}")
48
  return None, None, None
49
 
50
- def analyze_image(image, task_type, model, processor, device):
51
  """Analyze image with Florence-2 model"""
 
 
 
52
  if not model or not processor:
53
  return {"error": "Model not loaded", "success": False}
54
 
@@ -56,6 +71,7 @@ def analyze_image(image, task_type, model, processor, device):
56
  task_config = FLORENCE_TASKS.get(task_type, FLORENCE_TASKS["detailed_caption"])
57
  task_prompt = task_config["prompt"]
58
 
 
59
  if image.size[0] > MAX_IMAGE_SIZE[0] or image.size[1] > MAX_IMAGE_SIZE[1]:
60
  image.thumbnail(MAX_IMAGE_SIZE, Image.Resampling.LANCZOS)
61
 
@@ -84,7 +100,7 @@ def analyze_image(image, task_type, model, processor, device):
84
  except Exception as e:
85
  return {"error": f"Analysis failed: {str(e)}", "success": False}
86
 
87
- def draw_bounding_boxes(image, results):
88
  """Draw bounding boxes and labels on image"""
89
  if not results.get("success", False):
90
  return image
@@ -107,106 +123,280 @@ def draw_bounding_boxes(image, results):
107
  draw.text((x1, max(y1-20, 0)), label[:30], fill=color, font=font)
108
 
109
  except Exception as e:
110
- st.error(f"Error drawing annotations: {e}")
111
 
112
  return annotated_image
113
 
114
- def main():
115
- st.set_page_config(
116
- page_title="Florence-2 Document & Image Analyzer",
117
- page_icon="πŸ“„",
118
- layout="wide"
119
- )
120
 
121
- st.title("πŸ“„ Florence-2 Document & Image Analyzer")
122
- st.markdown("Upload images to analyze them with Microsoft's Florence-2 vision model.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
- # Show model status
125
- if st.session_state.model_loaded:
126
- st.success(f"βœ… Florence-2 model loaded on {st.session_state.device}")
127
- else:
128
- st.info("ℹ️ Model will be loaded when you upload an image (first time may take 2-3 minutes)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
- uploaded_file = st.file_uploader("Choose an image", type=['png', 'jpg', 'jpeg'])
 
131
 
132
- task_choices = {
133
- "Object Detection": "object_detection",
134
- "Detailed Caption": "detailed_caption",
135
- "OCR": "ocr"
136
- }
137
 
138
- selected_task = st.selectbox("Analysis Type", options=list(task_choices.keys()))
 
 
 
139
 
140
- if uploaded_file is not None:
141
- col1, col2 = st.columns(2)
 
142
 
143
- with col1:
144
- st.subheader("Original Image")
145
- image = Image.open(uploaded_file).convert("RGB")
146
- st.image(image, use_column_width=True)
147
 
148
- if st.button("πŸ” Analyze Image", type="primary"):
149
- # Load model on-demand
150
- model, processor, device = load_florence_model()
151
 
152
- if model is None:
153
- st.error("❌ Failed to load model. Please try refreshing the page.")
154
- return
155
 
156
- st.success(f"βœ… Model loaded successfully on {device}")
 
157
 
158
- with st.spinner("Analyzing image..."):
159
- task_type = task_choices[selected_task]
160
- results = analyze_image(image, task_type, model, processor, device)
 
161
 
162
- if results.get("success", False):
163
- annotated_image = draw_bounding_boxes(image, results)
 
164
 
165
- with col2:
166
- st.subheader("Analysis Results")
167
- st.image(annotated_image, use_column_width=True)
168
 
169
- # Show results
170
- with st.expander("πŸ“‹ Analysis Details", expanded=True):
171
- parsed = results.get("parsed_results", {})
172
- if task_type == "detailed_caption" and isinstance(parsed, dict):
173
- caption = parsed.get("detailed_caption", "")
174
- st.write(f"**Caption:** {caption}")
175
- elif "labels" in parsed and parsed["labels"]:
176
- labels = parsed["labels"]
177
- st.write(f"**Detected Objects ({len(labels)}):** {', '.join(labels[:10])}")
178
- if len(labels) > 10:
179
- st.write(f"*...and {len(labels) - 10} more objects*")
180
- else:
181
- st.write("βœ… Analysis completed successfully!")
182
 
183
- st.balloons()
184
- else:
185
- st.error(f"❌ Analysis failed: {results.get('error', 'Unknown error')}")
186
 
 
 
 
187
  else:
188
- st.info("πŸ‘† Please upload an image to get started!")
 
 
189
 
190
- # Add helpful information
191
- with st.expander("ℹ️ About Florence-2"):
192
- st.markdown("""
193
- **Florence-2** is Microsoft's foundation vision model capable of:
194
 
195
- - **🎯 Object Detection**: Identifies and locates objects with bounding boxes
196
- - **πŸ“ Detailed Caption**: Generates comprehensive descriptions of image content
197
- - **πŸ”€ OCR**: Extracts and locates text in images
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
- The model downloads automatically on first use (~5GB) and is cached for subsequent uses.
 
 
200
  """)
201
 
202
- # Performance info
203
- with st.expander("⚑ Performance Notes"):
204
- st.markdown("""
205
- - **First run**: Model download may take 2-3 minutes
206
- - **GPU**: Faster inference when available
207
- - **CPU**: Works but slower processing
208
- - **Model size**: ~5GB (cached after first download)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  """)
210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  if __name__ == "__main__":
212
- main()
 
1
+ import gradio as gr
2
  import torch
3
  from PIL import Image, ImageDraw, ImageFont
4
  import numpy as np
5
  from pathlib import Path
6
  import os
7
  import time
8
+ from typing import Dict, Any, Tuple, Optional, List
9
+ import tempfile
10
+ import io
11
+
12
+ # PDF processing
13
+ try:
14
+ from pdf2image import convert_from_bytes, convert_from_path
15
+ PDF_AVAILABLE = True
16
+ except ImportError:
17
+ PDF_AVAILABLE = False
18
 
19
  # Import configuration
20
  from config import *
21
 
22
+ # Global variables to store model (similar to Streamlit's session state)
23
+ model_cache = {
24
+ 'model': None,
25
+ 'processor': None,
26
+ 'device': None,
27
+ 'loaded': False
28
+ }
29
 
30
  def load_florence_model():
31
  """Load Florence-2 model and processor on-demand"""
32
+ if model_cache['loaded']:
33
+ return model_cache['model'], model_cache['processor'], model_cache['device']
34
 
35
  try:
36
  from transformers import AutoProcessor, AutoModelForCausalLM
37
 
38
  device = "cpu" if FORCE_CPU else ("cuda" if torch.cuda.is_available() else "cpu")
39
 
40
+ print(f"Loading Florence-2 model on {device}...")
41
+
42
+ model = AutoModelForCausalLM.from_pretrained(
43
+ FLORENCE_MODEL_ID,
44
+ torch_dtype=torch.float16 if (torch.cuda.is_available() and not FORCE_CPU) else torch.float32,
45
+ trust_remote_code=True
46
+ ).to(device)
47
 
48
+ processor = AutoProcessor.from_pretrained(FLORENCE_MODEL_ID, trust_remote_code=True)
49
 
50
+ model_cache['model'] = model
51
+ model_cache['processor'] = processor
52
+ model_cache['device'] = device
53
+ model_cache['loaded'] = True
54
 
55
+ print(f"βœ… Model loaded successfully on {device}")
56
  return model, processor, device
57
 
58
  except Exception as e:
59
+ print(f"Failed to load Florence-2 model: {e}")
60
  return None, None, None
61
 
62
+ def analyze_image(image: Image.Image, task_type: str) -> Dict[str, Any]:
63
  """Analyze image with Florence-2 model"""
64
+ # Load model if not already loaded
65
+ model, processor, device = load_florence_model()
66
+
67
  if not model or not processor:
68
  return {"error": "Model not loaded", "success": False}
69
 
 
71
  task_config = FLORENCE_TASKS.get(task_type, FLORENCE_TASKS["detailed_caption"])
72
  task_prompt = task_config["prompt"]
73
 
74
+ # Resize image if too large
75
  if image.size[0] > MAX_IMAGE_SIZE[0] or image.size[1] > MAX_IMAGE_SIZE[1]:
76
  image.thumbnail(MAX_IMAGE_SIZE, Image.Resampling.LANCZOS)
77
 
 
100
  except Exception as e:
101
  return {"error": f"Analysis failed: {str(e)}", "success": False}
102
 
103
+ def draw_bounding_boxes(image: Image.Image, results: Dict[str, Any]) -> Image.Image:
104
  """Draw bounding boxes and labels on image"""
105
  if not results.get("success", False):
106
  return image
 
123
  draw.text((x1, max(y1-20, 0)), label[:30], fill=color, font=font)
124
 
125
  except Exception as e:
126
+ print(f"Error drawing annotations: {e}")
127
 
128
  return annotated_image
129
 
130
+ def process_pdf(pdf_file) -> List[Image.Image]:
131
+ """Convert PDF to images"""
132
+ if not PDF_AVAILABLE:
133
+ raise ValueError("PDF processing not available. Please install pdf2image.")
 
 
134
 
135
+ try:
136
+ # Convert PDF to images
137
+ if hasattr(pdf_file, 'read'):
138
+ # File object
139
+ pdf_bytes = pdf_file.read()
140
+ images = convert_from_bytes(pdf_bytes, dpi=PDF_DPI)
141
+ else:
142
+ # File path
143
+ images = convert_from_path(pdf_file, dpi=PDF_DPI)
144
+
145
+ # Limit number of pages
146
+ if len(images) > MAX_PDF_PAGES:
147
+ images = images[:MAX_PDF_PAGES]
148
+
149
+ return images
150
+ except Exception as e:
151
+ raise ValueError(f"Failed to process PDF: {str(e)}")
152
 
153
+ def format_results_text(results: Dict[str, Any], task_type: str) -> str:
154
+ """Format analysis results as text"""
155
+ if not results.get("success", False):
156
+ return f"❌ Analysis failed: {results.get('error', 'Unknown error')}"
157
+
158
+ parsed = results.get("parsed_results", {})
159
+
160
+ if task_type == "detailed_caption":
161
+ if isinstance(parsed, dict) and "detailed_caption" in parsed:
162
+ return f"πŸ“ **Caption:** {parsed['detailed_caption']}"
163
+ elif isinstance(parsed, str):
164
+ return f"πŸ“ **Caption:** {parsed}"
165
+
166
+ elif task_type == "object_detection":
167
+ if "labels" in parsed and parsed["labels"]:
168
+ labels = parsed["labels"]
169
+ bbox_count = len(labels)
170
+ labels_text = ', '.join(labels[:10])
171
+ if len(labels) > 10:
172
+ labels_text += f" ...and {len(labels) - 10} more"
173
+ return f"🎯 **Detected Objects ({bbox_count}):** {labels_text}"
174
+
175
+ elif task_type == "ocr":
176
+ if "text" in parsed:
177
+ ocr_text = parsed.get("text", "")
178
+ if ocr_text:
179
+ return f"πŸ”€ **Extracted Text:**\n{ocr_text}"
180
+ else:
181
+ return "πŸ”€ **OCR Result:** No text detected in the image"
182
+
183
+ elif task_type == "dense_captioning":
184
+ if "labels" in parsed and parsed["labels"]:
185
+ captions = parsed["labels"]
186
+ return f"πŸ“‹ **Region Captions:**\n" + '\n'.join([f"β€’ {cap}" for cap in captions[:5]])
187
+
188
+ return "βœ… Analysis completed successfully!"
189
+
190
+ def process_uploaded_file(file_path: str) -> Tuple[Image.Image, str]:
191
+ """Process uploaded file (image or PDF) and return first image"""
192
+ if file_path is None:
193
+ return None, "Please upload a file first."
194
 
195
+ try:
196
+ file_extension = Path(file_path).suffix.lower()
197
 
198
+ if file_extension == '.pdf':
199
+ if not PDF_AVAILABLE:
200
+ return None, "PDF processing not available. Please upload an image instead."
 
 
201
 
202
+ # Convert PDF to images
203
+ images = process_pdf(file_path)
204
+ if not images:
205
+ return None, "No images found in PDF."
206
 
207
+ # Use the first page for now
208
+ image = images[0]
209
+ status = f"βœ… PDF processed successfully. Showing page 1 of {len(images)}."
210
 
211
+ elif file_extension in ['.png', '.jpg', '.jpeg']:
212
+ # Load image
213
+ image = Image.open(file_path).convert("RGB")
214
+ status = "βœ… Image loaded successfully."
215
 
216
+ else:
217
+ return None, "Unsupported file format. Please upload PNG, JPG, JPEG, or PDF files."
 
218
 
219
+ return image, status
 
 
220
 
221
+ except Exception as e:
222
+ return None, f"❌ Error processing file: {str(e)}"
223
 
224
+ def process_image(image: Image.Image, task_type: str) -> Tuple[Image.Image, str, str]:
225
+ """Process uploaded image and return results"""
226
+ if image is None:
227
+ return None, "Please upload an image first.", ""
228
 
229
+ # Convert to RGB if needed
230
+ if image.mode != "RGB":
231
+ image = image.convert("RGB")
232
 
233
+ # Analyze the image
234
+ results = analyze_image(image, task_type)
 
235
 
236
+ # Create annotated image
237
+ annotated_image = draw_bounding_boxes(image, results)
 
 
 
 
 
 
 
 
 
 
 
238
 
239
+ # Format results text
240
+ results_text = format_results_text(results, task_type)
 
241
 
242
+ # Create status message
243
+ if results.get("success", False):
244
+ status = f"βœ… Analysis completed successfully using Florence-2 on {model_cache.get('device', 'unknown device')}"
245
  else:
246
+ status = f"❌ Analysis failed: {results.get('error', 'Unknown error')}"
247
+
248
+ return annotated_image, results_text, status
249
 
250
+ def create_interface():
251
+ """Create the Gradio interface"""
 
 
252
 
253
+ # Custom CSS for better styling
254
+ custom_css = """
255
+ .gradio-container {
256
+ font-family: 'Arial', sans-serif;
257
+ }
258
+ .analysis-results {
259
+ background-color: #f0f2f6;
260
+ padding: 1rem;
261
+ border-radius: 0.5rem;
262
+ margin: 1rem 0;
263
+ }
264
+ """
265
+
266
+ with gr.Blocks(title="Florence-2 Document & Image Analyzer", css=custom_css, theme=gr.themes.Soft()) as demo:
267
+
268
+ gr.Markdown("""
269
+ # πŸ“„ Florence-2 Document & Image Analyzer
270
 
271
+ Upload images to analyze them with Microsoft's Florence-2 vision model.
272
+
273
+ **Note:** The model will be loaded automatically on first use (~5GB download, takes 2-3 minutes).
274
  """)
275
 
276
+ with gr.Row():
277
+ with gr.Column():
278
+ file_input = gr.File(
279
+ label="Upload Image or PDF",
280
+ file_types=[".png", ".jpg", ".jpeg", ".pdf"],
281
+ type="filepath"
282
+ )
283
+
284
+ image_input = gr.Image(
285
+ type="pil",
286
+ label="Current Image",
287
+ height=400,
288
+ interactive=False
289
+ )
290
+
291
+ task_dropdown = gr.Dropdown(
292
+ choices=[
293
+ ("Object Detection", "object_detection"),
294
+ ("Detailed Caption", "detailed_caption"),
295
+ ("OCR (Text Extraction)", "ocr"),
296
+ ("Dense Captioning", "dense_captioning")
297
+ ],
298
+ value="object_detection",
299
+ label="Analysis Type",
300
+ info="Choose the type of analysis to perform"
301
+ )
302
+
303
+ analyze_btn = gr.Button("πŸ” Analyze Image", variant="primary", size="lg")
304
+
305
+ with gr.Column():
306
+ annotated_output = gr.Image(
307
+ label="Analysis Results",
308
+ height=400
309
+ )
310
+
311
+ results_text = gr.Markdown(
312
+ label="Analysis Details",
313
+ value="Upload an image and click 'Analyze Image' to get started!"
314
+ )
315
+
316
+ status_text = gr.Markdown(
317
+ value="ℹ️ Ready to analyze images"
318
+ )
319
+
320
+ # Event handlers
321
+ def handle_file_upload(file_path):
322
+ if file_path is None:
323
+ return None, "Please upload a file first."
324
+ image, status = process_uploaded_file(file_path)
325
+ return image, status
326
+
327
+ def handle_analyze(image, task_type):
328
+ return process_image(image, task_type)
329
+
330
+ file_input.change(
331
+ fn=handle_file_upload,
332
+ inputs=[file_input],
333
+ outputs=[image_input, status_text],
334
+ show_progress=True
335
+ )
336
+
337
+ analyze_btn.click(
338
+ fn=handle_analyze,
339
+ inputs=[image_input, task_dropdown],
340
+ outputs=[annotated_output, results_text, status_text],
341
+ show_progress=True
342
+ )
343
+
344
+ # Information sections
345
+ with gr.Row():
346
+ with gr.Column():
347
+ gr.Markdown("""
348
+ ## ℹ️ About Florence-2
349
+
350
+ **Florence-2** is Microsoft's foundation vision model capable of:
351
+
352
+ - **🎯 Object Detection**: Identifies and locates objects with bounding boxes
353
+ - **πŸ“ Detailed Caption**: Generates comprehensive descriptions of image content
354
+ - **πŸ”€ OCR**: Extracts and locates text in images
355
+ - **πŸ“‹ Dense Captioning**: Provides detailed captions for different regions
356
+
357
+ The model downloads automatically on first use (~5GB) and is cached for subsequent uses.
358
+ """)
359
+
360
+ with gr.Column():
361
+ gr.Markdown("""
362
+ ## ⚑ Performance Notes
363
+
364
+ - **First run**: Model download may take 2-3 minutes
365
+ - **GPU**: Faster inference when available
366
+ - **CPU**: Works but slower processing
367
+ - **Model size**: ~5GB (cached after first download)
368
+ - **Supported formats**: PNG, JPG, JPEG, PDF
369
+ """)
370
+
371
+ # Usage instructions
372
+ gr.Markdown("""
373
+ ## πŸ“‹ How to Use
374
+
375
+ 1. **Upload a file**: Click "Upload Image or PDF" and choose your file
376
+ 2. **Select analysis type**: Choose from the dropdown menu
377
+ 3. **Click Analyze**: The image will appear and you can analyze it
378
+ 4. **View results**: See the annotated image and detailed analysis
379
+
380
+ **Good examples to try:**
381
+ - Photos with objects (cars, people, animals)
382
+ - Screenshots with text for OCR
383
+ - Documents or diagrams for analysis
384
+ - Multi-object scenes for detection
385
  """)
386
 
387
+ return demo
388
+
389
+ def main():
390
+ """Main function to launch the Gradio app"""
391
+ demo = create_interface()
392
+
393
+ # Launch the app
394
+ demo.launch(
395
+ share=SHARE_LINK,
396
+ server_port=SERVER_PORT,
397
+ show_error=True,
398
+ quiet=False
399
+ )
400
+
401
  if __name__ == "__main__":
402
+ main()
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
  # Core dependencies - minimal versions that work
2
- streamlit==1.28.1
3
  torch>=2.0.0
4
  torchvision>=0.15.0
5
  transformers>=4.35.0
 
1
  # Core dependencies - minimal versions that work
2
+ gradio>=4.44.0,<5.0.0
3
  torch>=2.0.0
4
  torchvision>=0.15.0
5
  transformers>=4.35.0