um41r committed
Commit 700b060 · verified · Parent: fcaff5d

Update app.py

Files changed (1): app.py (+92, -133)
app.py CHANGED
@@ -3,41 +3,35 @@ import sys
 import gradio as gr
 import numpy as np
 from PIL import Image
-import onnxruntime as ort
-from huggingface_hub import hf_hub_download
-import torch
-from typing import Dict, List, Optional
-import json
 import warnings
 import logging
+import json
 
-# Suppress warnings
+# Suppress warnings immediately
 warnings.filterwarnings("ignore")
 logging.getLogger("transformers").setLevel(logging.ERROR)
 
 # Configuration
 MODEL_REPO = "onnx-community/Florence-2-base"
-
-# Use non-merged decoder to avoid the subgraph output issue
-# decoder_model_merged has a bug with outer scope values
 ONNX_FILES = {
     "vision_encoder": "vision_encoder_fp16.onnx",
     "embed_tokens": "embed_tokens_fp16.onnx",
     "encoder_model": "encoder_model_fp16.onnx",
-    "decoder_model": "decoder_model_fp16.onnx",  # Changed from merged to standard
-    "decoder_with_past_model": "decoder_with_past_model_fp16.onnx"  # For efficient generation
+    "decoder_model": "decoder_model_fp16.onnx",
+    "decoder_with_past_model": "decoder_with_past_model_fp16.onnx"
 }
 
-# Global variables for models
+# Global variables - will be initialized lazily
 sessions = {}
 processor = None
 tokenizer = None
+ort = None  # Will import lazily
 
 def download_models():
     """Download ONNX models from HuggingFace Hub"""
-    print("📥 Downloading ONNX models (FP16)...")
-    model_paths = {}
+    from huggingface_hub import hf_hub_download
 
+    print("📥 Downloading ONNX models (FP16)...")
     os.makedirs("./models/onnx", exist_ok=True)
 
     for name, filename in ONNX_FILES.items():
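The commit's main structural change is already visible in this first hunk: every heavy dependency (`onnxruntime`, `transformers`, `huggingface_hub`) moves from module scope into the function that uses it, so the Space's UI can start before any ML tooling is imported. A minimal sketch of the pattern; `probe_runtime` is an illustrative name, not part of the commit:

def probe_runtime():
    # Imported on first call only; Python then caches the module in
    # sys.modules, so repeat calls pay no import cost.
    import onnxruntime as ort
    return ort.get_available_providers()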
@@ -48,34 +42,35 @@ def download_models():
             local_dir="./models",
             local_dir_use_symlinks=False
         )
-        model_paths[name] = path
         size_mb = os.path.getsize(path) / (1024 * 1024)
         print(f" ✓ {name}: {size_mb:.1f}MB")
 
-    return model_paths
+    print("✅ All models downloaded!")
 
 def init_models():
     """Initialize ONNX Runtime sessions"""
-    global sessions, processor, tokenizer
+    global sessions, processor, tokenizer, ort
 
-    # Download models if not present
+    # Lazy import onnxruntime to avoid build issues
+    import onnxruntime as ort_module
+    ort = ort_module
+
+    # Lazy import transformers
+    from transformers import AutoProcessor, AutoTokenizer, BartTokenizerFast
+    import transformers
+
+    print(f"Transformers version: {transformers.__version__}")
+    print(f"ONNX Runtime version: {ort.__version__}")
+
+    # Check if models exist
     if not all(os.path.exists(f"./models/onnx/{f}") for f in ONNX_FILES.values()):
         download_models()
     else:
         print("✅ Models already cached")
 
-    # FORCE CPU-ONLY execution
+    # CPU-only providers
     providers = ['CPUExecutionProvider']
-    print(f"Using providers: {providers} (CPU-only mode)")
-
-    # Import transformers
-    try:
-        from transformers import AutoProcessor, AutoTokenizer, BartTokenizerFast
-        import transformers
-        print(f"Transformers version: {transformers.__version__}")
-    except ImportError as e:
-        print(f"Error importing transformers: {e}")
-        raise
+    print(f"Using providers: {providers}")
 
     # Load processor and tokenizer
     print("📥 Loading processor and tokenizer...")
@@ -87,7 +82,7 @@ def init_models():
             use_fast=False
         )
     except Exception as e:
-        print(f"Error loading processor: {e}")
+        print(f"Error with use_fast=False: {e}")
         processor = AutoProcessor.from_pretrained(
             "microsoft/Florence-2-base",
             trust_remote_code=True
@@ -110,8 +105,6 @@ def init_models():
 
         sess_options = ort.SessionOptions()
         sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
-        sess_options.enable_cpu_mem_arena = True
-        sess_options.enable_mem_pattern = True
 
         try:
             sessions[name] = ort.InferenceSession(
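Removing `enable_cpu_mem_arena` and `enable_mem_pattern` is behavior-neutral, since both options default to enabled in ONNX Runtime. On a CPU-only Space the thread-count settings are usually the knobs worth setting instead; the values below are illustrative, not part of the commit:

import os
import onnxruntime as ort

sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
# Illustrative CPU tuning (not in the commit): parallelism within an
# operator, and a single stream for running the graph itself.
sess_options.intra_op_num_threads = os.cpu_count() or 1
sess_options.inter_op_num_threads = 1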
@@ -126,106 +119,94 @@ def init_models():
 
     print("✅ All models loaded successfully!")
 
-def generate_caption(image: Image.Image, task: str = "<MORE_DETAILED_CAPTION>", max_new_tokens: int = 256):
-    """Generate caption using Florence-2 ONNX models with separate decoder"""
+def generate_caption(image, task="<MORE_DETAILED_CAPTION>", max_new_tokens=256):
+    """Generate caption using Florence-2 ONNX models"""
 
-    # Prepare inputs using processor
+    # Prepare inputs
     inputs = processor(text=task, images=image, return_tensors="np")
-
-    # Get shapes
     batch_size = 1
 
-    # 1. Run Vision Encoder
+    # 1. Vision Encoder
     pixel_values = inputs["pixel_values"].astype(np.float16)
     vision_outputs = sessions["vision_encoder"].run(None, {"pixel_values": pixel_values})
     image_features = vision_outputs[0]
 
-    # 2. Run Embed Tokens
+    # 2. Embed Tokens
     input_ids = inputs["input_ids"].astype(np.int64)
     embed_outputs = sessions["embed_tokens"].run(None, {"input_ids": input_ids})
     text_embeds = embed_outputs[0]
 
-    # 3. Concatenate image and text embeddings for encoder
+    # 3. Concatenate for encoder
    combined_embeds = np.concatenate([image_features, text_embeds], axis=1)
-
-    # Create attention mask for combined sequence
     vision_seq_len = image_features.shape[1]
     text_seq_len = text_embeds.shape[1]
     combined_seq_len = vision_seq_len + text_seq_len
-
     encoder_attention_mask = np.ones((batch_size, combined_seq_len), dtype=np.int64)
 
-    # 4. Run Encoder
+    # 4. Encoder
     encoder_outputs = sessions["encoder_model"].run(None, {
         "inputs_embeds": combined_embeds.astype(np.float16),
         "attention_mask": encoder_attention_mask
     })
     encoder_hidden_states = encoder_outputs[0]
 
-    # 5. Generation with separate decoder models
-    # Use decoder_model for first step, decoder_with_past_model for subsequent steps
+    # 5. Generation
     generated_ids = input_ids.copy()
-    past_key_values = None
+    use_past = False
 
     for i in range(max_new_tokens):
-        if past_key_values is None:
-            # First iteration - use decoder_model (no past)
+        if not use_past:
+            # First step - use decoder_model
             decoder_inputs = {
-                "input_ids": generated_ids,  # Full sequence for first step
+                "input_ids": generated_ids,
                 "encoder_hidden_states": encoder_hidden_states.astype(np.float16),
                 "encoder_attention_mask": encoder_attention_mask.astype(np.int64)
             }
-
             decoder_outputs = sessions["decoder_model"].run(None, decoder_inputs)
             logits = decoder_outputs[0]
-
-            # Extract past key values for next iteration (if provided by model)
-            if len(decoder_outputs) > 1:
-                past_key_values = decoder_outputs[1:]
+            use_past = True  # Switch to past model for next iteration
         else:
-            # Subsequent iterations - use decoder_with_past_model
-            # Only feed the last token
+            # Subsequent steps - use decoder_with_past_model
             decoder_inputs = {
-                "input_ids": generated_ids[:, -1:],  # Last token only
                "encoder_hidden_states": encoder_hidden_states.astype(np.float16),
-                "encoder_attention_mask": encoder_attention_mask.astype(np.int64),
-                "past_key_values": past_key_values
+                "input_ids": generated_ids[:, -1:],
+                "encoder_attention_mask": encoder_attention_mask.astype(np.int64)
             }
-
             decoder_outputs = sessions["decoder_with_past_model"].run(None, decoder_inputs)
             logits = decoder_outputs[0]
-
-            # Update past key values
-            if len(decoder_outputs) > 1:
-                past_key_values = decoder_outputs[1:]
 
-        # Get next token (greedy decoding) - use last position
+        # Greedy decoding
         next_token_logits = logits[:, -1, :]
         next_token_id = np.argmax(next_token_logits, axis=-1, keepdims=True)
-
-        # Append to generated sequence
        generated_ids = np.concatenate([generated_ids, next_token_id], axis=1)
 
-        # Check for EOS token (2 is typically EOS for Florence-2)
+        # Check for EOS (token 2)
         if next_token_id[0, 0] == 2:
             break
 
-    # Decode output
+    # Decode
     generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
 
-    # Post-process based on task
+    # Post-process
     try:
         result = processor.post_process_generation(generated_text, task, image.size)
-    except Exception as e:
+    except:
         result = {task: generated_text}
 
     return result
 
 def analyze_image(image, task_type):
-    """Main analysis function for Gradio"""
+    """Main analysis function"""
     if image is None:
         return "Please upload an image first."
 
+    # Initialize models on first use if not already done
+    if not sessions:
+        try:
+            init_models()
+        except Exception as e:
+            return f"Initialization error: {str(e)}"
+
     try:
         if isinstance(image, np.ndarray):
             image = Image.fromarray(image)
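One thing to flag in the rewritten loop: the `decoder_with_past_model` branch no longer receives any past key/values (and the old code's single `"past_key_values"` entry would not have matched an ONNX input name either), so depending on how the graph was exported, the run will either fail with a missing-input error or degenerate into decoding from only the last token. In Optimum-style exports, which onnx-community repositories generally follow, the cache travels as individually named tensors (`past_key_values.0.decoder.key`, ...) and comes back as matching `present.*` outputs. A hedged sketch of the usual wiring for the `else:` branch above, reusing the loop's variables; verify the exact names with `sess.get_inputs()` and `sess.get_outputs()`:

sess = sessions["decoder_with_past_model"]
output_names = [o.name for o in sess.get_outputs()]

# Normally seeded from the first decoder_model step using the same
# present -> past_key_values renaming shown below; shown empty here
# only to keep the sketch self-contained.
past = {}

decoder_inputs = {
    "input_ids": generated_ids[:, -1:],
    "encoder_hidden_states": encoder_hidden_states.astype(np.float16),
    "encoder_attention_mask": encoder_attention_mask.astype(np.int64),
    **past,
}
outputs = sess.run(None, decoder_inputs)
logits = outputs[0]
past = {
    name.replace("present", "past_key_values", 1): value
    for name, value in zip(output_names, outputs)
    if name.startswith("present.")
}

On the same theme, `tokenizer.eos_token_id` is a safer stop condition than the hardcoded `2`, although `2` is indeed `</s>` in the BART-style vocabulary Florence-2 uses.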
@@ -248,99 +229,85 @@ def analyze_image(image, task_type):
 
     except Exception as e:
         import traceback
-        error_msg = f"Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
-        return error_msg
+        return f"Error: {str(e)}\n\n{traceback.format_exc()}"
 
 def create_prompt_from_analysis(image, analysis_result, prompt_type):
-    """Convert analysis to AI generation prompts"""
+    """Convert analysis to AI prompts"""
+    if not analysis_result or analysis_result == "Please upload an image first.":
+        return "Please analyze an image first."
+
     try:
-        if isinstance(analysis_result, str):
-            try:
-                analysis = json.loads(analysis_result)
-            except:
-                analysis = {"description": analysis_result}
-        else:
-            analysis = analysis_result
+        # Parse analysis
+        try:
+            analysis = json.loads(analysis_result)
+        except:
+            analysis = {"description": analysis_result}
 
+        # Extract description
         if isinstance(analysis, dict):
-            if "<MORE_DETAILED_CAPTION>" in analysis:
-                description = analysis["<MORE_DETAILED_CAPTION>"]
-            elif "<CAPTION>" in analysis:
-                description = analysis["<CAPTION>"]
-            elif "<OCR>" in analysis:
-                description = analysis["<OCR>"]
-            else:
-                description = str(analysis)
+            description = (analysis.get("<MORE_DETAILED_CAPTION>") or
+                           analysis.get("<CAPTION>") or
+                           analysis.get("<OCR>") or
+                           str(analysis))
         else:
             description = str(analysis)
 
+        # Generate prompts
         if prompt_type == "Midjourney":
-            prompt = f"""Midjourney Prompt:
+            return f"""Midjourney Prompt:
 A highly detailed photograph of {description}, cinematic lighting, 8k resolution, sharp focus, professional photography, trending on ArtStation --ar 16:9 --v 6.0"""
 
         elif prompt_type == "Stable Diffusion":
-            prompt = f"""Positive Prompt:
+            return f"""Positive Prompt:
 {description}, masterpiece, best quality, highly detailed, 8k, sharp focus, professional photography, cinematic lighting, vibrant colors
 
 Negative Prompt:
 low quality, blurry, distorted, deformed, ugly, duplicate, watermark, signature, text, cropped, worst quality, jpeg artifacts"""
 
         elif prompt_type == "DALL-E":
-            prompt = f"""DALL-E Prompt:
+            return f"""DALL-E Prompt:
 A professional, high-quality image showing {description}. The image should be photorealistic with excellent composition, lighting, and detail. Suitable for commercial use."""
 
         elif prompt_type == "Master Creator Prompt":
-            prompt = f"""MASTER PROMPT ANALYSIS
+            keywords = ', '.join(str(description).split()[:20])
+            return f"""MASTER PROMPT ANALYSIS
 ========================
 
-SOURCE ANALYSIS:
-{description}
+SOURCE: {description}
 
 TECHNICAL BREAKDOWN:
-- Subject Matter: [Identify main subjects]
-- Composition: [Rule of thirds, symmetry, framing]
-- Lighting: [Natural/artificial, direction, quality]
-- Color Palette: [Dominant colors, contrast]
-- Style/Genre: [Photographic style]
-- Mood/Atmosphere: [Emotional tone]
-- Technical Aspects: [Camera angle, lens choice]
+- Subject Matter: Main subjects identified
+- Composition: Rule of thirds, symmetry, framing
+- Lighting: Natural/artificial, direction, quality
+- Color Palette: Dominant colors, contrast
+- Style: Photographic style and genre
+- Mood: Emotional tone and atmosphere
 
 RECREATION GUIDE:
-To recreate: {description}
+Focus on: {description}
 
-KEYWORDS:
-{', '.join(str(description).split()[:20])}, masterpiece, detailed, professional
+KEYWORDS: {keywords}, masterpiece, detailed, professional
 
 VARIATIONS:
 1. Golden hour lighting
-2. Black and white
+2. Black and white conversion
 3. Dramatic shadows
-4. Bird's eye view
-5. Cinematic teal/orange grading"""
-
-        return prompt
+4. Bird's eye view perspective
+5. Cinematic color grading"""
 
     except Exception as e:
         return f"Error creating prompt: {str(e)}"
 
-# Initialize
-print("🚀 Initializing Florence-2 ONNX Space...")
-try:
-    import transformers
-    print(f"Transformers version: {transformers.__version__}")
-    print(f"ONNX Runtime version: {ort.__version__}")
-except ImportError as e:
-    print(f"Import error: {e}")
-
-init_models()
-
 # Gradio Interface
+print("🚀 Starting Gradio app...")
+
 with gr.Blocks(title="Florence-2 Vision Analyzer") as demo:
     gr.Markdown("""
     # 🎨 Florence-2 Vision Analyzer & Prompt Generator
-    ### Powered by ONNX Runtime (FP16) - CPU Mode
+    ### Powered by ONNX Runtime (FP16)
 
     Upload an image to analyze it and generate AI-ready prompts!
+    *Models will download on first use (~550MB)*
     """)
 
     with gr.Row():
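A parsing detail in `create_prompt_from_analysis`: `generate_caption` returns a Python dict, so if the analysis textbox ends up holding that dict's repr (single quotes rather than JSON), `json.loads` always raises and execution lands in the `{"description": ...}` fallback, never reaching the `<MORE_DETAILED_CAPTION>` lookup. A hedged sketch of a parser that accepts both notations; `parse_analysis` is a hypothetical helper, not part of the commit:

import ast
import json

def parse_analysis(text: str) -> dict:
    # Try strict JSON first, then Python literal syntax (which handles the
    # single-quoted repr of a dict); fall back to a plain description.
    for parser in (json.loads, ast.literal_eval):
        try:
            result = parser(text)
            if isinstance(result, dict):
                return result
        except (ValueError, SyntaxError):
            continue
    return {"description": text}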
@@ -357,7 +324,7 @@ with gr.Blocks(title="Florence-2 Vision Analyzer") as demo:
         analysis_output = gr.Textbox(
             label="Analysis Result",
             lines=10,
-            placeholder="Analysis will appear here..."
+            placeholder="Click 'Analyze Image' to process..."
         )
 
     gr.Markdown("---")
@@ -390,14 +357,6 @@ with gr.Blocks(title="Florence-2 Vision Analyzer") as demo:
         outputs=prompt_output,
         api_visibility="public"
     )
-
-    gr.Examples(
-        examples=[
-            [{"path": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"}],
-        ],
-        inputs=input_image,
-        label="Try Example Image"
-    )
 
 if __name__ == "__main__":
     demo.launch(