Spaces:

markbaggett
/

descriptive-metadata-generation

Sleeping

App Files Files Community

markpbaggett committed on Sep 12, 2025

Commit

f5d8b8a

1 Parent(s): 1216cbc

Fix.

Browse files

Files changed (2) hide show

app.py +168 -74
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -1,39 +1,115 @@
 import gradio as gr
 import torch
 from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
-from qwen_vl_utils import process_vision_info
-import json
 from PIL import Image
 # Global variables to store model and processor
 model = None
 processor = None
 tokenizer = None
 def load_model():
-    """Load the Qwen2.5-VL model and processor"""
     global model, processor, tokenizer
     if model is None:
-        print("Loading Qwen2.5-VL-7B-Instruct model...")
-        # Load model - using smaller 7B version for Spaces compatibility
-        model = Qwen2VLForConditionalGeneration.from_pretrained(
-            "Qwen/Qwen2.5-VL-7B-Instruct",
-            torch_dtype=torch.bfloat16,
-            device_map="auto"
-        )
-        # Load processor and tokenizer
-        processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
-        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
-        print("Model loaded successfully!")
     return model, processor, tokenizer
 def generate_metadata(image, metadata_type):
-    """Generate metadata for the uploaded image"""
     if image is None:
         return "Please upload an image first."
@@ -54,7 +130,7 @@ def generate_metadata(image, metadata_type):
         prompt = prompts.get(metadata_type, prompts["Basic Description"])
-        # Prepare the conversation format expected by Qwen2.5-VL
         messages = [
             {
                 "role": "user",
@@ -68,39 +144,70 @@ def generate_metadata(image, metadata_type):
             }
         ]
-        # Process the input
-        text = processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
-        )
-        image_inputs, video_inputs = process_vision_info(messages)
-        inputs = processor(
-            text=[text],
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt",
-        )
-        inputs = inputs.to(model.device)
-        # Generate response
-        with torch.no_grad():
-            generated_ids = model.generate(
-                **inputs,
-                max_new_tokens=512,
-                temperature=0.7,
-                do_sample=True,
-                top_p=0.9,
-                pad_token_id=tokenizer.pad_token_id
             )
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-        output_text = processor.batch_decode(
-            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )[0]
-        return output_text.strip()
     except Exception as e:
         return f"Error generating metadata: {str(e)}"
@@ -108,7 +215,6 @@ def generate_metadata(image, metadata_type):
 def create_interface():
     """Create the Gradio interface"""
-    # Custom CSS for better styling
     css = """
     .metadata-container {
         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
@@ -169,25 +275,6 @@ def create_interface():
                     elem_classes=["output-text"]
                 )
-        # Example images section
-        gr.HTML("<h3 style='text-align: center; margin-top: 30px;'>Try these example images:</h3>")
-        with gr.Row():
-            example_images = [
-                ["examples/landscape.jpg", "Scene & Context"],
-                ["examples/portrait.jpg", "Objects & People"],
-                ["examples/food.jpg", "Basic Description"],
-                ["examples/architecture.jpg", "Technical Analysis"]
-            ]
-            gr.Examples(
-                examples=example_images,
-                inputs=[image_input, metadata_type],
-                outputs=output_text,
-                fn=generate_metadata,
-                cache_examples=False
-            )
         # Event handlers
         generate_btn.click(
             fn=generate_metadata,
@@ -196,7 +283,7 @@ def create_interface():
             show_progress=True
         )
-        # Auto-generate on image upload with basic description
         image_input.change(
             fn=lambda img: generate_metadata(img, "Basic Description") if img else "",
             inputs=[image_input],
@@ -207,21 +294,28 @@ def create_interface():
         gr.HTML("""
         <div style="text-align: center; padding: 20px; margin-top: 30px; border-top: 1px solid #eee;">
             <p style="color: #666;">
-                This Space uses Qwen2.5-VL-7B-Instruct for intelligent image analysis and metadata generation.
                 <br>Perfect for content management, SEO optimization, and accessibility improvements.
             </p>
         </div>
         """)
     return interface
-# Initialize the model when the app starts (optional - can be lazy loaded)
 def initialize_app():
     """Initialize the application"""
     print("Starting Image Metadata Generator...")
     print("Model will be loaded on first use to save resources.")
-    # Create and launch interface
     interface = create_interface()
     return interface

 import gradio as gr
 import torch
 from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from PIL import Image
+import json
+# Try to import qwen_vl_utils, fallback if not available
+try:
+    from qwen_vl_utils import process_vision_info
+    QWEN_UTILS_AVAILABLE = True
+except ImportError:
+    print("Warning: qwen_vl_utils not available, using fallback processing")
+    QWEN_UTILS_AVAILABLE = False
 # Global variables to store model and processor
 model = None
 processor = None
 tokenizer = None
def process_vision_info_fallback(messages):
    """Collect image and video payloads from user chat messages.

    Stand-in used when ``qwen_vl_utils.process_vision_info`` cannot be
    imported. Payloads are passed through untouched (no loading, resizing
    or frame extraction is performed here).

    Args:
        messages: Iterable of chat message dicts. Only messages whose
            ``role`` is ``"user"`` and whose ``content`` is a list of part
            dicts are inspected; plain-string content is ignored.

    Returns:
        A ``(image_inputs, video_inputs)`` tuple. Each element is a list of
        payloads, or ``None`` when that modality is absent, so the result
        can be handed to the processor as ``images=`` / ``videos=`` without
        passing an empty list.
    """
    image_inputs = []
    video_inputs = []
    for message in messages or []:
        if message.get("role") != "user":
            continue
        parts = message.get("content")
        # Chat content may be a bare string; only structured parts can
        # carry media, so anything that is not a list of parts is skipped.
        if not isinstance(parts, (list, tuple)):
            continue
        for part in parts:
            if not isinstance(part, dict):
                continue
            kind = part.get("type")
            # A part that declares a media type but has no payload is
            # ignored rather than raising KeyError.
            if kind == "image" and part.get("image") is not None:
                image_inputs.append(part["image"])
            elif kind == "video" and part.get("video") is not None:
                video_inputs.append(part["video"])
    # NOTE(review): the upstream helper is understood to report a missing
    # modality as None rather than []; confirm against qwen-vl-utils.
    return image_inputs or None, video_inputs or None
 def load_model():
     """Load the vision-language model, processor and tokenizer once.

     The three objects are cached in the module globals ``model``,
     ``processor`` and ``tokenizer``; later calls return the cached objects
     without touching the hub again.

     Loading strategies are tried in order until one succeeds:

     1. Qwen2.5-VL-7B-Instruct, bfloat16, automatic device placement.
     2. The same checkpoint forced onto the CPU in float16.
     3. The older Qwen2-VL-7B-Instruct checkpoint, float16.

     Every strategy loads its own processor and tokenizer, and the globals
     are only assigned after all three objects loaded, so a failed attempt
     never leaves a model paired with a missing or mismatched processor.

     Returns:
         The ``(model, processor, tokenizer)`` tuple.

     Raises:
         RuntimeError: If every strategy fails. The last underlying error
             is attached as ``__cause__``.
     """
     global model, processor, tokenizer
     if model is not None:
         return model, processor, tokenizer

     # Local import so the block does not rely on a new file-level import.
     import importlib.util

     # FlashAttention needs both a CUDA device and the optional flash_attn
     # package; requesting it without the package makes the primary load
     # fail and pushes the app onto the much slower CPU fallback.
     flash_ok = (
         torch.cuda.is_available()
         and importlib.util.find_spec("flash_attn") is not None
     )

     # NOTE(review): Qwen2.5-VL checkpoints are normally loaded through the
     # dedicated Qwen2.5-VL model class in recent transformers releases;
     # confirm that Qwen2VLForConditionalGeneration is intended here.
     strategies = [
         (
             "Loading Qwen2.5-VL-7B-Instruct model...",
             "Model loaded successfully!",
             "Qwen/Qwen2.5-VL-7B-Instruct",
             {
                 "torch_dtype": torch.bfloat16,
                 "device_map": "auto",
                 "attn_implementation": "flash_attention_2" if flash_ok else "eager",
                 "low_cpu_mem_usage": True,
             },
         ),
         (
             "Trying alternative loading method...",
             "Model loaded with fallback method!",
             "Qwen/Qwen2.5-VL-7B-Instruct",
             {
                 "torch_dtype": torch.float16,
                 "device_map": "cpu",
                 "low_cpu_mem_usage": True,
             },
         ),
         (
             "Trying smaller Qwen2-VL model...",
             "Loaded Qwen2-VL (older version) successfully!",
             "Qwen/Qwen2-VL-7B-Instruct",
             {
                 "torch_dtype": torch.float16,
                 "device_map": "auto",
             },
         ),
     ]

     last_error = None
     for start_msg, done_msg, model_id, model_kwargs in strategies:
         print(start_msg)
         try:
             loaded = _load_bundle(model_id, **model_kwargs)
         except Exception as exc:  # boundary: any failure moves to the next strategy
             print(f"Error loading {model_id}: {exc}")
             last_error = exc
             continue
         # Publish to the globals only after the whole bundle loaded.
         model, processor, tokenizer = loaded
         print(done_msg)
         return model, processor, tokenizer

     raise RuntimeError(
         f"All model loading attempts failed. Last error: {last_error}"
     ) from last_error


 def _load_bundle(model_id, **model_kwargs):
     """Load processor, tokenizer and model for ``model_id`` and return them.

     Returns a ``(model, processor, tokenizer)`` tuple; any exception raised
     by the underlying ``from_pretrained`` calls propagates to the caller.
     """
     print("Loading processor...")
     loaded_processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
     print("Loading tokenizer...")
     loaded_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
     print("Loading model... This may take a few minutes...")
     loaded_model = Qwen2VLForConditionalGeneration.from_pretrained(
         model_id,
         trust_remote_code=True,
         **model_kwargs,
     )
     return loaded_model, loaded_processor, loaded_tokenizer
 def generate_metadata(image, metadata_type):
+    """Generate metadata for the uploaded image with improved error handling"""
     if image is None:
         return "Please upload an image first."
         prompt = prompts.get(metadata_type, prompts["Basic Description"])
+        # Prepare the conversation format
         messages = [
             {
                 "role": "user",
             }
         ]
+        # Process the input with error handling
+        try:
+            text = processor.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
             )
+            # Use appropriate vision processing
+            if QWEN_UTILS_AVAILABLE:
+                image_inputs, video_inputs = process_vision_info(messages)
+            else:
+                image_inputs, video_inputs = process_vision_info_fallback(messages)
+            inputs = processor(
+                text=[text],
+                images=image_inputs,
+                videos=video_inputs,
+                padding=True,
+                return_tensors="pt",
+            )
+            # Move to device
+            inputs = inputs.to(model.device)
+        except Exception as e:
+            print(f"Error in input processing: {e}")
+            # Fallback to simpler processing
+            try:
+                inputs = processor(
+                    text=prompt,
+                    images=image,
+                    return_tensors="pt",
+                    padding=True
+                )
+                inputs = inputs.to(model.device)
+            except Exception as e2:
+                return f"Error processing input: {str(e2)}"
+        # Generate response with conservative parameters
+        try:
+            with torch.no_grad():
+                generated_ids = model.generate(
+                    **inputs,
+                    max_new_tokens=384,  # Reduced from 512
+                    temperature=0.7,
+                    do_sample=True,
+                    top_p=0.9,
+                    pad_token_id=tokenizer.pad_token_id,
+                    eos_token_id=tokenizer.eos_token_id,
+                )
+            # Extract and decode the response
+            generated_ids_trimmed = [
+                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+            ]
+            output_text = processor.batch_decode(
+                generated_ids_trimmed,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False
+            )[0]
+            return output_text.strip()
+        except Exception as e:
+            return f"Error during generation: {str(e)}"
     except Exception as e:
         return f"Error generating metadata: {str(e)}"
 def create_interface():
     """Create the Gradio interface"""
     css = """
     .metadata-container {
         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                     elem_classes=["output-text"]
                 )
         # Event handlers
         generate_btn.click(
             fn=generate_metadata,
             show_progress=True
         )
+        # Auto-generate on image upload
         image_input.change(
             fn=lambda img: generate_metadata(img, "Basic Description") if img else "",
             inputs=[image_input],
         gr.HTML("""
         <div style="text-align: center; padding: 20px; margin-top: 30px; border-top: 1px solid #eee;">
             <p style="color: #666;">
+                This Space uses Qwen2.5-VL for intelligent image analysis and metadata generation.
                 <br>Perfect for content management, SEO optimization, and accessibility improvements.
             </p>
+            <p style="color: #888; font-size: 0.9em; margin-top: 10px;">
+                <strong>Note:</strong> First generation may take 1-2 minutes while the model loads. Subsequent generations will be much faster.
+            </p>
         </div>
         """)
     return interface
 def initialize_app():
     """Print startup diagnostics and build the Gradio interface.

     No model weights are loaded here; the startup message states that the
     model is loaded on first use. The PyTorch version and CUDA
     availability (plus the name of device 0 when CUDA is present) are
     printed to help debug the hosting environment.

     Returns:
         The interface object produced by ``create_interface()``.
     """
     print("Starting Image Metadata Generator...")
     print("Model will be loaded on first use to save resources.")

     # Environment report for debugging.
     cuda_ready = torch.cuda.is_available()
     print(f"PyTorch version: {torch.__version__}")
     print(f"CUDA available: {cuda_ready}")
     if cuda_ready:
         print(f"CUDA device: {torch.cuda.get_device_name(0)}")

     return create_interface()

requirements.txt CHANGED Viewed

@@ -7,3 +7,5 @@ qwen-vl-utils
 torchvision
 numpy
 requests

 torchvision
 numpy
 requests
+flash-attn>=2.0.0
+einops