File size: 12,538 Bytes
1216cbc
 
 
 
f5d8b8a
 
 
 
 
 
 
 
 
1216cbc
 
 
 
 
 
f5d8b8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1216cbc
f5d8b8a
1216cbc
 
 
f5d8b8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c79cf18
 
f5d8b8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1216cbc
 
 
 
f5d8b8a
1216cbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f5d8b8a
1216cbc
 
 
 
 
 
 
 
 
 
 
 
 
f5d8b8a
 
 
 
1216cbc
f5d8b8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1216cbc
f5d8b8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1216cbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f5d8b8a
1216cbc
 
 
 
 
 
 
 
 
 
f5d8b8a
1216cbc
 
f5d8b8a
 
 
1216cbc
 
 
 
 
 
 
 
 
 
f5d8b8a
 
 
 
 
 
1216cbc
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
import gradio as gr
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from PIL import Image
import json

# qwen_vl_utils is optional: when it cannot be imported, the flag below is
# False and callers are expected to use process_vision_info_fallback().
try:
    from qwen_vl_utils import process_vision_info
except ImportError:
    QWEN_UTILS_AVAILABLE = False
    print("Warning: qwen_vl_utils not available, using fallback processing")
else:
    QWEN_UTILS_AVAILABLE = True

# Filled in by load_model() on first use; None means "not loaded yet".
model = processor = tokenizer = None

def process_vision_info_fallback(messages):
    """Collect image and video payloads from chat-style messages.

    Used in place of ``qwen_vl_utils.process_vision_info`` when that package
    is not installed. Only messages whose ``role`` is ``"user"`` are scanned.
    Content that is not a list (for example a plain string) carries no vision
    entries and is skipped, as are entries that are not dicts.

    Args:
        messages: Iterable of message dicts with ``role`` and ``content``
            keys. ``None`` is treated like an empty conversation.

    Returns:
        A tuple ``(image_inputs, video_inputs)``. Each element is the list of
        payloads found, in order, or ``None`` when no entry of that kind
        exists. ``None`` (rather than an empty list) mirrors what
        ``qwen_vl_utils.process_vision_info`` reports for an absent modality.

    Raises:
        KeyError: If an entry declares ``type`` ``"image"``/``"video"`` but
            lacks the matching payload key.
    """
    image_inputs = []
    video_inputs = []

    for message in messages or ():
        if not isinstance(message, dict) or message.get("role") != "user":
            continue

        content = message.get("content")
        if not isinstance(content, (list, tuple)):
            # String (or missing) content cannot hold image/video entries.
            continue

        for part in content:
            if not isinstance(part, dict):
                continue
            part_type = part.get("type")
            if part_type == "image":
                image_inputs.append(part["image"])
            elif part_type == "video":
                video_inputs.append(part["video"])

    # An empty list is not the same signal as "no input of this kind".
    return image_inputs or None, video_inputs or None

def load_model():
    """Load and cache the Qwen vision-language model, processor and tokenizer.

    The result is stored in the module-level ``model``, ``processor`` and
    ``tokenizer`` globals, so later calls return the cached objects. Loading
    strategies are tried in this order:

    1. ``Qwen/Qwen2.5-VL-7B-Instruct`` in bfloat16, ``device_map="auto"``,
       eager attention.
    2. The same checkpoint in float16, forced onto the CPU.
    3. ``Qwen/Qwen2-VL-7B-Instruct`` in float16, ``device_map="auto"``.

    Each strategy makes sure the processor and tokenizer for its own
    checkpoint are loaded, and the globals are only assigned once all three
    pieces exist, so a failed attempt never leaves a cached model paired with
    a missing processor or tokenizer.

    Returns:
        Tuple ``(model, processor, tokenizer)``.

    Raises:
        RuntimeError: If every strategy fails. The last underlying error is
            chained as ``__cause__``.
    """
    global model, processor, tokenizer

    if model is not None and processor is not None and tokenizer is not None:
        return model, processor, tokenizer

    try:
        # Qwen2.5-VL checkpoints have a dedicated architecture class.
        from transformers import Qwen2_5_VLForConditionalGeneration as qwen25_cls
    except ImportError:
        # Older transformers releases do not ship it; keep the class the file
        # already imports so behaviour there is unchanged.
        qwen25_cls = Qwen2VLForConditionalGeneration

    primary_id = "Qwen/Qwen2.5-VL-7B-Instruct"
    legacy_id = "Qwen/Qwen2-VL-7B-Instruct"

    # (checkpoint id, model class, from_pretrained kwargs, success message)
    strategies = (
        (
            primary_id,
            qwen25_cls,
            {
                "torch_dtype": torch.bfloat16,
                "device_map": "auto",
                "trust_remote_code": True,
                # Use eager attention (more compatible)
                "attn_implementation": "eager",
                "low_cpu_mem_usage": True,
            },
            "Model loaded successfully!",
        ),
        (
            primary_id,
            qwen25_cls,
            {
                "torch_dtype": torch.float16,
                "device_map": "cpu",
                "trust_remote_code": True,
                "low_cpu_mem_usage": True,
            },
            "Model loaded with fallback method!",
        ),
        (
            legacy_id,
            Qwen2VLForConditionalGeneration,
            {
                "torch_dtype": torch.float16,
                "device_map": "auto",
                "trust_remote_code": True,
            },
            "Loaded Qwen2-VL (older version) successfully!",
        ),
    )
    # Messages printed when strategy N fails and another one follows.
    failure_notes = (
        ("Error loading main model: {}", "Trying alternative loading method..."),
        ("Fallback loading also failed: {}", "Trying smaller Qwen2-VL model..."),
    )

    print("Loading Qwen2.5-VL-7B-Instruct model...")

    # Processor/tokenizer pairs already loaded in this call, keyed by
    # checkpoint id, so strategy 2 does not reload what strategy 1 obtained.
    text_components = {}
    last_error = None

    for attempt, (model_id, model_cls, model_kwargs, success_message) in enumerate(strategies):
        try:
            if model_id not in text_components:
                print("Loading processor...")
                loaded_processor = AutoProcessor.from_pretrained(
                    model_id, trust_remote_code=True
                )
                print("Loading tokenizer...")
                loaded_tokenizer = AutoTokenizer.from_pretrained(
                    model_id, trust_remote_code=True
                )
                text_components[model_id] = (loaded_processor, loaded_tokenizer)

            print("Loading model... This may take a few minutes...")
            loaded_model = model_cls.from_pretrained(model_id, **model_kwargs)
        except Exception as exc:
            last_error = exc
            if attempt < len(failure_notes):
                failed_message, next_message = failure_notes[attempt]
                print(failed_message.format(exc))
                print(next_message)
            continue

        # Publish all three pieces together, only after a complete success.
        processor, tokenizer = text_components[model_id]
        model = loaded_model
        print(success_message)
        return model, processor, tokenizer

    raise RuntimeError(
        f"All model loading attempts failed. Last error: {last_error}"
    ) from last_error

def generate_metadata(image, metadata_type):
    """Produce a text description of ``image`` for the chosen metadata type.

    Args:
        image: The image to analyse; ``None`` means nothing was uploaded.
        metadata_type: Key selecting one of the built-in prompts. Unknown
            keys use the "Basic Description" prompt.

    Returns:
        The generated text with surrounding whitespace stripped. When no
        image is given, or when any step fails, a human-readable message is
        returned instead; errors are reported in the string, not raised.
    """
    if image is None:
        return "Please upload an image first."

    try:
        # Lazy load: load_model() is only called once an image is supplied.
        vl_model, vl_processor, vl_tokenizer = load_model()

        # One instruction per supported metadata type.
        prompt_by_type = {
            "Basic Description": "Describe this image in detail, including what you see, the setting, colors, and overall composition.",
            "Technical Analysis": "Analyze this image from a technical perspective. Describe the lighting, composition, camera angle, depth of field, and any photographic techniques used.",
            "Objects & People": "List all the objects, people, animals, and items you can identify in this image. Be comprehensive and specific.",
            "Scene & Context": "Describe the scene, setting, location, time of day, weather conditions, and any contextual information you can infer from this image.",
            "Artistic Analysis": "Analyze this image from an artistic perspective, discussing the style, mood, aesthetic qualities, visual elements, and artistic techniques used.",
            "SEO Keywords": "Generate relevant SEO keywords and tags that would help categorize and find this image in a database or search system.",
            "JSON Metadata": "Create a comprehensive JSON metadata object for this image including description, objects, colors, setting, mood, and technical details."
        }
        instruction = prompt_by_type.get(metadata_type, prompt_by_type["Basic Description"])

        # Single-turn conversation: the image followed by the instruction.
        image_part = {"type": "image", "image": image}
        text_part = {"type": "text", "text": instruction}
        conversation = [{"role": "user", "content": [image_part, text_part]}]

        try:
            chat_text = vl_processor.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=True
            )

            # Prefer the qwen_vl_utils helper when it was importable.
            extract_vision = (
                process_vision_info if QWEN_UTILS_AVAILABLE else process_vision_info_fallback
            )
            images, videos = extract_vision(conversation)

            batch = vl_processor(
                text=[chat_text],
                images=images,
                videos=videos,
                padding=True,
                return_tensors="pt",
            )
            batch = batch.to(vl_model.device)
        except Exception as primary_error:
            print(f"Error in input processing: {primary_error}")
            # Second chance: hand the raw prompt and image to the processor.
            try:
                batch = vl_processor(
                    text=instruction,
                    images=image,
                    return_tensors="pt",
                    padding=True
                )
                batch = batch.to(vl_model.device)
            except Exception as fallback_error:
                return f"Error processing input: {fallback_error}"

        try:
            sampling_options = {
                "max_new_tokens": 384,  # Reduced from 512
                "temperature": 0.7,
                "do_sample": True,
                "top_p": 0.9,
                "pad_token_id": vl_tokenizer.pad_token_id,
                "eos_token_id": vl_tokenizer.eos_token_id,
            }
            with torch.no_grad():
                output_ids = vl_model.generate(**batch, **sampling_options)

            # Drop the prompt tokens so only the newly generated part is decoded.
            new_token_ids = []
            for prompt_ids, full_ids in zip(batch.input_ids, output_ids):
                new_token_ids.append(full_ids[len(prompt_ids):])

            decoded = vl_processor.batch_decode(
                new_token_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            return decoded[0].strip()
        except Exception as generation_error:
            return f"Error during generation: {generation_error}"

    except Exception as unexpected_error:
        return f"Error generating metadata: {unexpected_error}"

def create_interface():
    """Build the Gradio Blocks UI for the metadata generator.

    The layout is a header, a two-column row (image upload, metadata-type
    dropdown and button on the left; result textbox on the right) and a
    footer. Clicking the button runs ``generate_metadata`` with the selected
    type; changing the image runs it with "Basic Description".

    Returns:
        The ``gr.Blocks`` instance; it is not launched here.
    """
    custom_css = """
    .metadata-container {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border-radius: 15px;
        padding: 20px;
        margin: 10px 0;
    }
    .output-text {
        background-color: #f8f9fa;
        border-radius: 10px;
        padding: 15px;
        border-left: 4px solid #667eea;
    }
    """

    header_html = """
        <div style="text-align: center; padding: 20px;">
            <h1 style="color: #333; margin-bottom: 10px;">🖼️ Image Metadata Generator</h1>
            <h3 style="color: #666; font-weight: normal;">Powered by Qwen2.5-VL</h3>
            <p style="color: #888;">Upload an image and generate comprehensive metadata using AI vision</p>
        </div>
        """

    footer_html = """
        <div style="text-align: center; padding: 20px; margin-top: 30px; border-top: 1px solid #eee;">
            <p style="color: #666;">
                This Space uses Qwen2.5-VL for intelligent image analysis and metadata generation.
                <br>Perfect for content management, SEO optimization, and accessibility improvements.
            </p>
            <p style="color: #888; font-size: 0.9em; margin-top: 10px;">
                <strong>Note:</strong> First generation may take 1-2 minutes while the model loads. Subsequent generations will be much faster.
            </p>
        </div>
        """

    # Entries offered in the dropdown, in display order.
    metadata_choices = [
        "Basic Description",
        "Technical Analysis",
        "Objects & People",
        "Scene & Context",
        "Artistic Analysis",
        "SEO Keywords",
        "JSON Metadata",
    ]

    with gr.Blocks(css=custom_css, title="Image Metadata Generator with Qwen2.5-VL") as demo:
        gr.HTML(header_html)

        with gr.Row():
            # Left column: inputs.
            with gr.Column(scale=1):
                picture = gr.Image(type="pil", label="Upload Image", height=400)
                type_selector = gr.Dropdown(
                    choices=metadata_choices,
                    value="Basic Description",
                    label="Metadata Type",
                )
                run_button = gr.Button("Generate Metadata", variant="primary", size="lg")

            # Right column: generated text.
            with gr.Column(scale=1):
                result_box = gr.Textbox(
                    label="Generated Metadata",
                    lines=20,
                    max_lines=25,
                    elem_classes=["output-text"],
                )

        # Explicit generation with the selected metadata type.
        run_button.click(
            fn=generate_metadata,
            inputs=[picture, type_selector],
            outputs=result_box,
            show_progress=True,
        )

        # Uploading (or clearing) the image triggers a basic description;
        # a falsy image clears the output instead.
        picture.change(
            fn=lambda img: generate_metadata(img, "Basic Description") if img else "",
            inputs=[picture],
            outputs=result_box,
            show_progress=True,
        )

        gr.HTML(footer_html)

    return demo

def initialize_app():
    """Print startup diagnostics and build the Gradio interface.

    The model is not loaded here; only the PyTorch version and CUDA status
    are reported before the UI is constructed.

    Returns:
        The object returned by ``create_interface()``.
    """
    for banner in (
        "Starting Image Metadata Generator...",
        "Model will be loaded on first use to save resources.",
    ):
        print(banner)

    # Environment details that help when debugging a deployment.
    print(f"PyTorch version: {torch.__version__}")
    cuda_ready = torch.cuda.is_available()
    print(f"CUDA available: {cuda_ready}")
    if cuda_ready:
        print(f"CUDA device: {torch.cuda.get_device_name(0)}")

    return create_interface()

if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on port 7860 bound to
    # 0.0.0.0, with Gradio's share option turned off.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    app = initialize_app()
    app.launch(**launch_options)