# Scraped from a Hugging Face Spaces file view (status: Running; file size: 11,242 bytes).
# Blame revisions: 1cd8cba 8de791b 1cd8cba 8de791b 439fb23 fd05be2
import gradio as gr
import torch
import clip
import numpy as np
import random
import os
from PIL import Image
from ultralytics import YOLO # Needed for both person and fashion detection
from gtts import gTTS
import uuid
import time
import tempfile
def _crop_to_best_person(img, yolo_person_model, conf_threshold):
    """Return *img* cropped to its most confident person box, else *img* itself.

    The full image is returned when the model reports no class-0 detection or
    when the chosen box has no area after being clamped to the image bounds.
    """
    detections = yolo_person_model(img, verbose=False, conf=conf_threshold)[0].boxes
    boxes = detections.xyxy.cpu().numpy()
    classes = detections.cls.cpu().numpy()
    confidences = detections.conf.cpu().numpy()

    # Class 0 is "person" in the standard YOLOv8 label set.
    person_indices = np.where(classes == 0)[0]
    if len(person_indices) == 0:
        print("No person detected by yolo_person_model. Analyzing full image.")
        return img

    best_idx = person_indices[np.argmax(confidences[person_indices])]
    x1, y1, x2, y2 = map(int, boxes[best_idx])
    # Clamp to the image so the crop never reaches outside the picture.
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(img.width, x2), min(img.height, y2)
    if x1 < x2 and y1 < y2:
        cropped = img.crop((x1, y1, x2, y2))
        print(f"Person detected and cropped: Box {x1, y1, x2, y2}")
        return cropped

    print("Warning: Invalid person bounding box after clipping. Using full image.")
    return img


def _detect_top_fashion_item(image, yolo_fashion_model, fashion_classes, conf_threshold):
    """Return ``(item_name, confidence)`` for the most confident fashion detection.

    ``item_name`` is ``None`` when nothing is detected, when the detected class
    id is missing from *fashion_classes*, or when the model call fails.
    """
    # Deliberately best-effort: a failing fashion model must not abort the
    # analysis, because the CLIP stage can still name an item on its own.
    try:
        detections = yolo_fashion_model(image, verbose=False, conf=conf_threshold)[0].boxes
        class_ids = detections.cls.cpu().numpy().astype(int)
        confidences = detections.conf.cpu().numpy()
        if len(class_ids) == 0:
            print("No fashion items detected above threshold by yolo_fashion_model.")
            return None, 0.0

        best_idx = np.argmax(confidences)
        class_id = class_ids[best_idx]
        confidence = confidences[best_idx]
        if class_id in fashion_classes:
            item_name = fashion_classes[class_id]
            print(f"Fashion model detected: '{item_name}' "
                  f"with confidence {confidence:.2f}")
            return item_name, confidence

        print(f"Warning: Detected fashion class ID {class_id} not in FASHION_CLASSES map.")
        return None, confidence
    except Exception as e:
        print(f"Error during YOLO fashion model analysis: {e}")
        return None, 0.0


def _render_result_html(heading, category_label, score_label, score_str, error_note=None):
    """Build the results ``<div>``; *error_note* adds a trailing error paragraph."""
    lines = [
        "<div class='results-container'>",
        f"<h2 class='result-category'>{heading}: {category_label.upper()}</h2>",
        f"<p class='result-score'>{score_label}: {score_str}</p>",
    ]
    if error_note is not None:
        lines.append(
            "<p class='result-error' style='color: #FFAAAA; font-size: 0.9em;'>"
            f"{error_note}</p>"
        )
    lines.append("</div>")
    return "\n" + "\n".join(lines) + "\n"


def analyze_outfit(input_img, yolo_person_model, yolo_fashion_model, clip_model, clip_preprocess,
                   all_prompts, style_prompts_end_index, FASHION_CLASSES, CATEGORY_LABEL_MAP,
                   response_templates, YOLO_PERSON_CONF_THRESHOLD, YOLO_FASHION_CONF_THRESHOLD,
                   YOLO_FASHION_HIGH_CONF_THRESHOLD, DEVICE):
    """Rate an outfit photo and produce an HTML summary plus a spoken response.

    Pipeline: crop to the most confident detected person, look for a fashion
    item with the fashion YOLO model, score the style with CLIP, pick the item
    name to mention, then fill a response template and synthesize it with gTTS.

    Args:
        input_img: Image file path (str), an image object exposing
            ``convert("RGB")``, or ``None``.
        yolo_person_model: Callable detector used for the person crop.
        yolo_fashion_model: Callable detector used for fashion items.
        clip_model: CLIP model called as ``clip_model(image, text)``.
        clip_preprocess: Image transform applied before the CLIP model.
        all_prompts: Text prompts passed to ``clip.tokenize``; the first
            ``style_prompts_end_index`` entries are treated as style prompts.
        style_prompts_end_index: End index of the style prompts in
            ``all_prompts``.
        FASHION_CLASSES: Mapping from fashion-model class id to item name.
        CATEGORY_LABEL_MAP: Mapping from category key (``'drippy'``, ``'mid'``,
            ``'not_drippy'``) to its display label.
        response_templates: Mapping from category key to a sequence of
            templates, each optionally containing ``{item}``.
        YOLO_PERSON_CONF_THRESHOLD: ``conf`` passed to the person model.
        YOLO_FASHION_CONF_THRESHOLD: ``conf`` passed to the fashion model and
            the minimum confidence for using its item name.
        YOLO_FASHION_HIGH_CONF_THRESHOLD: Confidence at which the fashion item
            is logged as a highly confident pick.
        DEVICE: Device the CLIP tensors are moved to.

    Returns:
        A ``(html, tts_path, text)`` tuple. ``tts_path`` is ``None`` whenever
        no audio file was produced (missing/unreadable image, CLIP failure, or
        response/TTS failure).

    NOTE(review): this function reads the free names ``style_prompts`` and
    ``get_top_clip_clothing``, which are neither parameters nor defined in the
    visible part of the file - confirm they exist at module level.
    """
    import html  # local import keeps the block self-contained for escaping

    drippy_min_score = 0.41    # minimum mean probability to rate as "drippy"
    clip_item_min_prob = 0.05  # minimum CLIP probability to name its item

    if input_img is None:
        return ("<p style='color: #FF5555; text-align: center;'>Please upload an image.</p>",
                None, "Error: No image provided.")

    if isinstance(input_img, str):
        try:
            # The context manager closes the file handle; convert() forces the
            # pixel data to load, so decode errors are reported here as well.
            with Image.open(input_img) as opened:
                img = opened.convert("RGB")
        except Exception as e:
            # The message can contain the user-supplied path, so escape it
            # before embedding it in HTML.
            return (f"<p style='color: #FF5555;'>Error loading image: {html.escape(str(e))}</p>",
                    None, "Image loading error")
    else:
        # convert() already returns a new image, so no extra copy is needed.
        img = input_img.convert("RGB")

    # 1) Person detection: narrow the analysis to the most confident person.
    cropped_img = _crop_to_best_person(img, yolo_person_model, YOLO_PERSON_CONF_THRESHOLD)

    # 2) Fashion detection always runs, on the crop or on the full image.
    detected_fashion_item_name, detected_fashion_item_conf = _detect_top_fashion_item(
        cropped_img, yolo_fashion_model, FASHION_CLASSES, YOLO_FASHION_CONF_THRESHOLD)

    # 3) CLIP analysis on the same (cropped or full) image.
    try:
        image_tensor = clip_preprocess(cropped_img).unsqueeze(0).to(DEVICE)
        text_tokens = clip.tokenize(all_prompts).to(DEVICE)
        with torch.no_grad():
            logits, _ = clip_model(image_tensor, text_tokens)
            all_probs = logits.softmax(dim=-1).cpu().numpy()[0]

        # Style prompts are laid out as [drippy | mid | not_drippy] at the
        # start of all_prompts; each category score is its mean probability.
        drip_len = len(style_prompts['drippy'])
        mid_len = len(style_prompts['mid'])
        drip_score = np.mean(all_probs[0:drip_len])
        mid_score = np.mean(all_probs[drip_len:drip_len + mid_len])
        not_score = np.mean(all_probs[drip_len + mid_len:style_prompts_end_index])

        if drip_score > drippy_min_score and drip_score > mid_score and drip_score > not_score:
            category_key, final_score, score_label = 'drippy', drip_score, "Drip Score"
        elif mid_score > not_score:
            category_key, final_score, score_label = 'mid', mid_score, "Mid Score"
        else:
            category_key, final_score, score_label = 'not_drippy', not_score, "Trash Score"

        category_label = CATEGORY_LABEL_MAP[category_key]
        percentage_score = max(0, final_score * 100)
        percentage_score_str = f"{percentage_score:.0f}%"
        print(f"Style analysis: Category={category_label}, Score = {score_label}={percentage_score_str} (Raw Score: {final_score:.4f})")

        top_3_clip_items = get_top_clip_clothing(all_probs, n=3)
        if top_3_clip_items:
            detected_items_str = ", ".join(
                f"{item[0]} ({item[1]*100:.1f}%)" for item in top_3_clip_items)
            print(f"I think I detected: {detected_items_str}")
            # Only the most probable item feeds the response below.
            clip_detected_item, clip_detected_item_prob = top_3_clip_items[0]
        else:
            print("I couldn't confidently identify specific clothing items via CLIP.")
            clip_detected_item, clip_detected_item_prob = "piece", 0.0
    except Exception as e:
        print(f"Error during CLIP analysis: {e}")
        return ("<p style='color: #FF5555;'>Error during CLIP analysis.</p>",
                None, f"Analysis Error: {e}")

    # 4) Choose the item name to mention: fashion model first, then CLIP,
    #    then a random generic term.
    if detected_fashion_item_name and detected_fashion_item_conf >= YOLO_FASHION_HIGH_CONF_THRESHOLD:
        final_clothing_item = detected_fashion_item_name
        print(f"Using highly confident fashion model item: '{final_clothing_item}'")
    elif detected_fashion_item_name and detected_fashion_item_conf >= YOLO_FASHION_CONF_THRESHOLD:
        final_clothing_item = detected_fashion_item_name
        print(f"Using medium confidence fashion model item: '{final_clothing_item}'")
    elif clip_detected_item and clip_detected_item_prob > clip_item_min_prob:
        final_clothing_item = clip_detected_item
        print(f"Using CLIP detected item: '{final_clothing_item}'")
    else:
        final_clothing_item = random.choice(["fit", "look", "style", "vibe"])
        print(f"Using generic fallback item: '{final_clothing_item}'")

    # 5) Build the text response and synthesize speech.
    try:
        chosen_template = random.choice(response_templates[category_key])
        if '{item}' in chosen_template:
            response_text = chosen_template.format(item=final_clothing_item)
        else:
            response_text = chosen_template

        tts_path = os.path.join(tempfile.gettempdir(), f"drip_{uuid.uuid4().hex}.mp3")
        gTTS(text=response_text, lang='en', tld='com', slow=False).save(tts_path)
        print(f"Generated TTS response: '{response_text}' saved to {tts_path}")

        category_html = _render_result_html(
            "RATING", category_label, score_label, percentage_score_str)
        return category_html, tts_path, response_text
    except Exception as e:
        print(f"Error during response/TTS generation: {e}")
        # Still report the rating, but flag that the audio/response failed.
        category_html = _render_result_html(
            "Result", category_label, score_label, percentage_score_str,
            error_note="Error generating audio/full response.")
        return category_html, None, f"Analysis complete ({category_label}), but error generating audio/response."