Malaji71 committed on
Commit
482876d
·
verified ·
1 Parent(s): 40b615f

Delete optimizer.py

Browse files
Files changed (1) hide show
  1. optimizer.py +0 -443
optimizer.py DELETED
@@ -1,443 +0,0 @@
1
- """
2
- Ultra Supreme Optimizer - Main optimization engine for image analysis
3
- VERSIÓN FLORENCE-2 - Usa Florence-2 en lugar de CLIP Interrogator
4
- """
5
-
6
- # IMPORTANT: spaces must be imported BEFORE torch or any CUDA-using library
7
- import spaces
8
- import gc
9
- import logging
10
- import re
11
- from datetime import datetime
12
- from typing import Tuple, Dict, Any, Optional
13
-
14
- import torch
15
- import numpy as np
16
- from PIL import Image
17
- from transformers import AutoProcessor, AutoModelForCausalLM
18
-
19
- from analyzer import UltraSupremeAnalyzer
20
-
21
- logger = logging.getLogger(__name__)
22
-
23
-
24
- class UltraSupremeOptimizer:
25
- """Main optimizer class for ultra supreme image analysis"""
26
-
27
    def __init__(self):
        """Create an optimizer with no model loaded.

        Florence-2 weights are loaded lazily by ``initialize_model`` so that
        constructing this object stays cheap (no download / GPU work here).
        """
        # Hugging Face processor/model handles; populated by initialize_model().
        self.processor = None
        self.model = None
        # Multi-model analysis helper that consumes the Florence-2 captions.
        self.analyzer = UltraSupremeAnalyzer()
        # Count of prompt generations served by this instance.
        self.usage_count = 0
        # Preferred compute device name: "cuda", "mps", or "cpu".
        self.device = self._get_device()
        # Guards against reloading the model on repeated calls.
        self.is_initialized = False
34
-
35
- @staticmethod
36
- def _get_device() -> str:
37
- """Determine the best available device for computation"""
38
- if torch.cuda.is_available():
39
- return "cuda"
40
- elif torch.backends.mps.is_available():
41
- return "mps"
42
- else:
43
- return "cpu"
44
-
45
- def initialize_model(self) -> bool:
46
- """Initialize Florence-2 model"""
47
- if self.is_initialized:
48
- return True
49
-
50
- try:
51
- logger.info("Loading Florence-2 model...")
52
-
53
- # Load Florence-2 base model (you can also use 'microsoft/Florence-2-large' for better quality)
54
- model_id = "microsoft/Florence-2-base"
55
-
56
- self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
57
- self.model = AutoModelForCausalLM.from_pretrained(
58
- model_id,
59
- trust_remote_code=True,
60
- torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
61
- )
62
-
63
- # Keep model on CPU initially
64
- self.model = self.model.to("cpu")
65
- self.model.eval()
66
-
67
- self.is_initialized = True
68
-
69
- # Clean up memory after initialization
70
- gc.collect()
71
-
72
- logger.info("Florence-2 model initialized successfully")
73
- return True
74
-
75
- except Exception as e:
76
- logger.error(f"Model initialization error: {e}")
77
- return False
78
-
79
- def optimize_image(self, image: Any) -> Optional[Image.Image]:
80
- """Optimize image for processing"""
81
- if image is None:
82
- return None
83
-
84
- try:
85
- # Convert to PIL Image if necessary
86
- if isinstance(image, np.ndarray):
87
- image = Image.fromarray(image)
88
- elif not isinstance(image, Image.Image):
89
- image = Image.open(image)
90
-
91
- # Convert to RGB if necessary
92
- if image.mode != 'RGB':
93
- image = image.convert('RGB')
94
-
95
- # Florence-2 handles various sizes well, but let's be reasonable
96
- max_size = 1024
97
- if image.size[0] > max_size or image.size[1] > max_size:
98
- image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
99
-
100
- return image
101
-
102
- except Exception as e:
103
- logger.error(f"Image optimization error: {e}")
104
- return None
105
-
106
- def apply_flux_rules(self, base_prompt: str) -> str:
107
- """Aplica las reglas de Flux a un prompt base"""
108
-
109
- # Limpiar el prompt de elementos no deseados
110
- cleanup_patterns = [
111
- r',\s*trending on artstation',
112
- r',\s*trending on [^,]+',
113
- r',\s*\d+k\s*',
114
- r',\s*\d+k resolution',
115
- r',\s*artstation',
116
- r',\s*concept art',
117
- r',\s*digital art',
118
- r',\s*by greg rutkowski',
119
- ]
120
-
121
- cleaned_prompt = base_prompt
122
- for pattern in cleanup_patterns:
123
- cleaned_prompt = re.sub(pattern, '', cleaned_prompt, flags=re.IGNORECASE)
124
-
125
- # Detectar el tipo de imagen para añadir configuración de cámara apropiada
126
- camera_config = ""
127
- if any(word in base_prompt.lower() for word in ['portrait', 'person', 'man', 'woman', 'face']):
128
- camera_config = ", Shot on Hasselblad X2D 100C, 90mm f/2.5 lens at f/2.8, professional portrait photography"
129
- elif any(word in base_prompt.lower() for word in ['landscape', 'mountain', 'nature', 'outdoor']):
130
- camera_config = ", Shot on Phase One XT, 40mm f/4 lens at f/8, epic landscape photography"
131
- elif any(word in base_prompt.lower() for word in ['street', 'urban', 'city']):
132
- camera_config = ", Shot on Leica M11, 35mm f/1.4 lens at f/2.8, documentary street photography"
133
- else:
134
- camera_config = ", Shot on Phase One XF IQ4, 80mm f/2.8 lens at f/4, professional photography"
135
-
136
- # Añadir mejoras de iluminación si no están presentes
137
- if 'lighting' not in cleaned_prompt.lower():
138
- if 'dramatic' in cleaned_prompt.lower():
139
- cleaned_prompt += ", dramatic cinematic lighting"
140
- elif 'portrait' in cleaned_prompt.lower():
141
- cleaned_prompt += ", professional studio lighting with subtle rim light"
142
- else:
143
- cleaned_prompt += ", masterful natural lighting"
144
-
145
- # Construir el prompt final
146
- final_prompt = cleaned_prompt + camera_config
147
-
148
- # Asegurar que empiece con mayúscula
149
- final_prompt = final_prompt[0].upper() + final_prompt[1:] if final_prompt else final_prompt
150
-
151
- # Limpiar espacios y comas duplicadas
152
- final_prompt = re.sub(r'\s+', ' ', final_prompt)
153
- final_prompt = re.sub(r',\s*,+', ',', final_prompt)
154
-
155
- return final_prompt
156
-
157
- @spaces.GPU(duration=60)
158
- def run_florence_inference(self, image: Image.Image) -> Tuple[str, str, str]:
159
- """Run Florence-2 inference on GPU"""
160
- try:
161
- # Move model to GPU
162
- self.model = self.model.to("cuda")
163
- logger.info("Florence-2 model moved to GPU")
164
-
165
- # Task prompts for different types of analysis
166
- tasks = {
167
- "detailed_caption": "<DETAILED_CAPTION>",
168
- "more_detailed_caption": "<MORE_DETAILED_CAPTION>",
169
- "caption": "<CAPTION>",
170
- "dense_region_caption": "<DENSE_REGION_CAPTION>"
171
- }
172
-
173
- results = {}
174
-
175
- # Run different captioning tasks
176
- for task_name, task_prompt in tasks.items():
177
- try:
178
- inputs = self.processor(text=task_prompt, images=image, return_tensors="pt")
179
- inputs = {k: v.to("cuda") for k, v in inputs.items()}
180
-
181
- with torch.cuda.amp.autocast(dtype=torch.float16):
182
- generated_ids = self.model.generate(
183
- input_ids=inputs["input_ids"],
184
- pixel_values=inputs["pixel_values"],
185
- max_new_tokens=1024,
186
- num_beams=3,
187
- do_sample=False
188
- )
189
-
190
- generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
191
- parsed = self.processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
192
-
193
- # Extract the caption from the parsed result
194
- if task_prompt in parsed:
195
- results[task_name] = parsed[task_prompt]
196
- else:
197
- # Sometimes the result is directly in the parsed output
198
- results[task_name] = str(parsed) if parsed else ""
199
-
200
- except Exception as e:
201
- logger.warning(f"Error in {task_name}: {e}")
202
- results[task_name] = ""
203
-
204
- # Extract results
205
- detailed_caption = results.get("detailed_caption", "")
206
- more_detailed = results.get("more_detailed_caption", "")
207
- caption = results.get("caption", "")
208
-
209
- # Combine for a comprehensive description
210
- if more_detailed:
211
- full_prompt = more_detailed
212
- elif detailed_caption:
213
- full_prompt = detailed_caption
214
- else:
215
- full_prompt = caption
216
-
217
- # Use different levels as our three outputs
218
- clip_fast = caption if caption else "A photograph"
219
- clip_classic = detailed_caption if detailed_caption else full_prompt
220
- clip_best = more_detailed if more_detailed else full_prompt
221
-
222
- logger.info(f"Florence-2 captions generated successfully")
223
-
224
- return full_prompt, clip_fast, clip_classic
225
-
226
- except Exception as e:
227
- logger.error(f"Florence-2 inference error: {e}")
228
- # Move model back to CPU to free GPU memory
229
- self.model = self.model.to("cpu")
230
- raise e
231
- finally:
232
- # Always move model back to CPU after inference
233
- self.model = self.model.to("cpu")
234
- torch.cuda.empty_cache()
235
-
236
    def generate_ultra_supreme_prompt(self, image: Any) -> Tuple[str, str, int, Dict[str, int]]:
        """
        Generate an ultra supreme prompt from an image using Florence-2.

        Pipeline: lazy model initialization -> image normalization ->
        Florence-2 captioning (with a plain-text fallback on failure) ->
        multi-model analysis -> Flux rule application -> scoring and
        report generation.

        Returns:
            Tuple of (prompt, analysis_info, score, breakdown). On failure
            the prompt/info strings carry an error message and score is 0.
        """
        try:
            # Lazily load Florence-2 on first use.
            if not self.is_initialized:
                if not self.initialize_model():
                    return "❌ Model initialization failed.", "Please refresh and try again.", 0, {}

            # Validate input
            if image is None:
                return "❌ Please upload an image.", "No image provided.", 0, {}

            self.usage_count += 1

            # Optimize image
            image = self.optimize_image(image)
            if image is None:
                return "❌ Image processing failed.", "Invalid image format.", 0, {}

            start_time = datetime.now()

            logger.info("ULTRA SUPREME ANALYSIS - Starting with Florence-2")

            # Run Florence-2 inference; degrade to generic captions rather
            # than failing the whole request if the GPU call blows up.
            try:
                full_prompt, caption_fast, caption_detailed = self.run_florence_inference(image)
            except Exception as e:
                logger.error(f"Florence-2 failed: {e}")
                # Basic textual fallback
                full_prompt = "A photograph"
                caption_fast = "image"
                caption_detailed = "detailed image"

            logger.info(f"Florence-2 caption: {full_prompt[:100]}...")

            # Run the ultra supreme analysis across multiple models.
            logger.info("Running multi-model ultra supreme analysis...")
            ultra_analysis = self.analyzer.ultra_supreme_analysis(
                image, caption_fast, caption_detailed, full_prompt
            )

            # Build an enriched prompt from the full analysis.
            enhanced_prompt_parts = []

            # The Florence base caption always leads.
            enhanced_prompt_parts.append(full_prompt)

            # Add demographic info when confidence is high enough (>0.7).
            if ultra_analysis["demographic"]["gender"] and ultra_analysis["demographic"]["gender_confidence"] > 0.7:
                gender = ultra_analysis["demographic"]["gender"]
                age_cat = ultra_analysis["demographic"]["age_category"]
                if age_cat:
                    enhanced_prompt_parts.append(f"{age_cat} {gender}")

            # Add the dominant emotional state (>0.6 confidence).
            if ultra_analysis["emotional_state"]["primary_emotion"] and ultra_analysis["emotional_state"]["emotion_confidence"] > 0.6:
                emotion = ultra_analysis["emotional_state"]["primary_emotion"]
                enhanced_prompt_parts.append(f"{emotion} expression")

            # Add pose information when available.
            if ultra_analysis["pose_composition"]["posture"]:
                enhanced_prompt_parts.append(ultra_analysis["pose_composition"]["posture"][0])

            # Combine the parts and apply the Flux rules.
            combined_prompt = ", ".join(enhanced_prompt_parts)
            optimized_prompt = self.apply_flux_rules(combined_prompt)

            # Prefer the analyzer's enriched prompt when it is richer
            # (longer) than our own combination.
            analyzer_prompt = self.analyzer.build_ultra_supreme_prompt(ultra_analysis, [full_prompt])
            if len(analyzer_prompt) > len(optimized_prompt):
                optimized_prompt = self.apply_flux_rules(analyzer_prompt)

            # Score the final prompt via the analyzer.
            score, breakdown = self.analyzer.calculate_ultra_supreme_score(optimized_prompt, ultra_analysis)

            end_time = datetime.now()
            duration = (end_time - start_time).total_seconds()

            # Memory cleanup
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # Build the human-readable multi-model analysis report.
            analysis_info = self._generate_ultra_analysis_report(
                ultra_analysis, score, breakdown, duration, "Florence-2"
            )

            return optimized_prompt, analysis_info, score, breakdown

        except Exception as e:
            logger.error(f"Ultra supreme generation error: {e}", exc_info=True)
            return f"❌ Error: {str(e)}", "Please try with a different image.", 0, {}
334
-
335
    def _generate_ultra_analysis_report(self, analysis: Dict[str, Any],
                                       score: int, breakdown: Dict[str, int],
                                       duration: float, caption_model: str = "Florence-2") -> str:
        """Generate the human-readable multi-model analysis report.

        Builds a Markdown summary covering processing stats, the score
        breakdown, and each analysis section (demographics, emotion, face,
        pose, environment, intelligence metrics).

        NOTE(review): assumes `analysis` follows the analyzer's schema
        (keys such as "demographic", "emotional_state", "facial_ultra",
        "pose_composition", "environmental", "intelligence_metrics") --
        nested keys are accessed without .get() and raise KeyError when
        missing; confirm against UltraSupremeAnalyzer.
        """

        # NOTE(review): reports current CUDA availability, not necessarily
        # the device the inference actually ran on.
        device_used = "cuda" if torch.cuda.is_available() else "cpu"
        gpu_status = "⚡ ZeroGPU" if device_used == "cuda" else "💻 CPU"

        # Demographic info (only when a face produced an age estimate).
        demo_info = ""
        if analysis["demographic"]["age_category"]:
            age = analysis["demographic"]["age_category"].replace("_", " ").title()
            gender = analysis["demographic"]["gender"] or "person"
            confidence = analysis["demographic"]["age_confidence"]
            demo_info = f"**Detected:** {age} {gender} (confidence: {confidence:.0%})"

        # Emotion info
        emotion_info = ""
        if analysis["emotional_state"]["primary_emotion"]:
            emotion = analysis["emotional_state"]["primary_emotion"]
            confidence = analysis["emotional_state"]["emotion_confidence"]
            emotion_info = f"**Primary Emotion:** {emotion} ({confidence:.0%})"

            # Add the top-3 emotion distribution if available.
            if analysis["emotional_state"]["emotion_distribution"]:
                top_emotions = sorted(
                    analysis["emotional_state"]["emotion_distribution"].items(),
                    key=lambda x: x[1], reverse=True
                )[:3]
                emotion_details = ", ".join([f"{e[0]}: {e[1]:.0%}" for e in top_emotions])
                emotion_info += f"\n**Emotion Distribution:** {emotion_details}"

        # Face analysis info (up to 5 notable facial features).
        face_info = f"**Faces Detected:** {analysis['facial_ultra']['face_count']}"
        if analysis['facial_ultra']['face_count'] > 0:
            features = []
            for feature_type in ['eyes', 'mouth', 'facial_hair', 'skin']:
                if analysis['facial_ultra'].get(feature_type):
                    features.extend(analysis['facial_ultra'][feature_type])
            if features:
                face_info += f"\n**Facial Features:** {', '.join(features[:5])}"

        # Pose info (only when body-pose detection produced confidence > 0).
        pose_info = ""
        if analysis["pose_composition"].get("pose_confidence", 0) > 0:
            confidence = analysis["pose_composition"]["pose_confidence"]
            pose_info = f"**Pose Analysis:** Body detected ({confidence:.0%} confidence)"
            if analysis["pose_composition"]["posture"]:
                pose_info += f"\n**Posture:** {', '.join(analysis['pose_composition']['posture'])}"

        # Environment info (setting type plus lighting notes when present).
        env_info = ""
        if analysis["environmental"]["setting_type"]:
            env_info = f"**Setting:** {analysis['environmental']['setting_type'].replace('_', ' ').title()}"
            if analysis["environmental"]["lighting_analysis"]:
                env_info += f"\n**Lighting:** {', '.join(analysis['environmental']['lighting_analysis'])}"

        # Intelligence metrics
        metrics = analysis["intelligence_metrics"]

        # Caption preview, truncated to 150 characters.
        caption_info = analysis.get("clip_best", "")[:150] + "..." if len(analysis.get("clip_best", "")) > 150 else analysis.get("clip_best", "")

        analysis_info = f"""**🚀 ULTRA SUPREME MULTI-MODEL ANALYSIS COMPLETE**
**Processing:** {gpu_status} • {duration:.1f}s • {caption_model} + Multi-Model Pipeline
**Ultra Score:** {score}/100 • Models: {caption_model} + DeepFace + MediaPipe + Transformers

**📊 BREAKDOWN:**
• Prompt Quality: {breakdown.get('prompt_quality', 0)}/25
• Analysis Depth: {breakdown.get('analysis_depth', 0)}/25
• Model Confidence: {breakdown.get('model_confidence', 0)}/25
• Feature Richness: {breakdown.get('feature_richness', 0)}/25

**📝 VISION-LANGUAGE ANALYSIS:**
**{caption_model} Caption:** {caption_info}

**🧠 DEEP ANALYSIS RESULTS:**

**👤 DEMOGRAPHICS & IDENTITY:**
{demo_info or "No face detected for demographic analysis"}

**😊 EMOTIONAL ANALYSIS:**
{emotion_info or "No emotional data available"}

**👁️ FACIAL ANALYSIS:**
{face_info}

**🚶 POSE & BODY LANGUAGE:**
{pose_info or "No pose data available"}

**🏞️ ENVIRONMENT & SCENE:**
{env_info or "No environmental data detected"}

**📊 INTELLIGENCE METRICS:**
• **Total Features Detected:** {metrics['total_features_detected']}
• **Analysis Depth Score:** {metrics['analysis_depth_score']}/100
• **Model Confidence Average:** {metrics['model_confidence_average']:.0%}
• **Technical Optimization:** {metrics['technical_optimization_score']}/100

**✨ MULTI-MODEL ADVANTAGES:**
✅ {caption_model}: State-of-the-art vision-language understanding
✅ DeepFace: Accurate age, gender, emotion detection
✅ MediaPipe: Body pose and gesture analysis
✅ Transformers: Advanced emotion classification
✅ OpenCV: Robust face detection

**🔬 Powered by Pariente AI Research • Ultra Supreme Intelligence Engine**"""

        return analysis_info