import gradio as gr import torch import clip import numpy as np import random import os from PIL import Image from ultralytics import YOLO # Needed for both person and fashion detection from gtts import gTTS import uuid import time import tempfile def analyze_outfit(input_img, yolo_person_model, yolo_fashion_model, clip_model, clip_preprocess, all_prompts, style_prompts_end_index, FASHION_CLASSES, CATEGORY_LABEL_MAP, response_templates, YOLO_PERSON_CONF_THRESHOLD, YOLO_FASHION_CONF_THRESHOLD, YOLO_FASHION_HIGH_CONF_THRESHOLD, DEVICE): # Handle both file paths and PIL Images if isinstance(input_img, str): try: input_img = Image.open(input_img) except Exception as e: return (f"

Error loading image: {str(e)}

", None, "Image loading error") # Existing code continues... if input_img is None: return ("

Please upload an image.

", None, "Error: No image provided.") img = input_img.convert("RGB").copy() #def analyze_outfit(image): #if image is None: #return ("

Please upload an image.

", None, "Error: No image provided.") #image = image.convert("RGB").copy() #print(f"[DEBUG] image_path type: {type(image_path)} | value: {image_path}") # 1) YOLO Person Detection person_results = yolo_person_model(img, verbose=False, conf=YOLO_PERSON_CONF_THRESHOLD) boxes = person_results[0].boxes.xyxy.cpu().numpy() classes = person_results[0].boxes.cls.cpu().numpy() confidences = person_results[0].boxes.conf.cpu().numpy() # Filter for persons (class 0 in standard YOLOv8) person_indices = np.where(classes == 0)[0] cropped_img = img # Default to full image if no person found person_detected = False if len(person_indices) > 0: # Find the person detection with the highest confidence max_conf_person_idx = person_indices[np.argmax(confidences[person_indices])] x1, y1, x2, y2 = map(int, boxes[max_conf_person_idx]) # Ensure coordinates are valid and within image bounds x1, y1 = max(0, x1), max(0, y1) x2, y2 = min(img.width, x2), min(img.height, y2) if x1 < x2 and y1 < y2: # Check if the box has valid dimensions cropped_img = img.crop((x1, y1, x2, y2)) print(f"Person detected and cropped: Box {x1, y1, x2, y2}") person_detected = True else: print("Warning: Invalid person bounding box after clipping. Using full image.") cropped_img = img else: print("No person detected by yolo_person_model. Analyzing full image.") # 2) YOLO Fashion Model Detection (run on the cropped image if person was found) detected_fashion_item_name = None detected_fashion_item_conf = 0.0 if person_detected or True: # Or always run on the (potentially full) image? Let's always run for now. try: fashion_results = yolo_fashion_model(cropped_img, verbose=False, conf=YOLO_FASHION_CONF_THRESHOLD) fashion_boxes = fashion_results[0].boxes.xyxy.cpu().numpy() fashion_classes = fashion_results[0].boxes.cls.cpu().numpy().astype(int) fashion_confidences = fashion_results[0].boxes.conf.cpu().numpy() if len(fashion_classes) > 0: # Find the detection with the highest confidence best_fashion_idx = np.argmax(fashion_confidences) detected_class_id = fashion_classes[best_fashion_idx] detected_fashion_item_conf = fashion_confidences[best_fashion_idx] if detected_class_id in FASHION_CLASSES: detected_fashion_item_name = FASHION_CLASSES[detected_class_id] print(f"Fashion model detected: '{detected_fashion_item_name}' " f"with confidence {detected_fashion_item_conf:.2f}") else: print(f"Warning: Detected fashion class ID {detected_class_id} not in FASHION_CLASSES map.") else: print("No fashion items detected above threshold by yolo_fashion_model.") except Exception as e: print(f"Error during YOLO fashion model analysis: {e}") # Continue without fashion model input # 3) CLIP Analysis (always run on the cropped/full image) clip_detected_item = "look" # Default fallback item name clip_detected_item_prob = 0.0 category_key = 'mid' # Default category final_score_str = "N/A" try: image_tensor = clip_preprocess(cropped_img).unsqueeze(0).to(DEVICE) text_tokens = clip.tokenize(all_prompts).to(DEVICE) with torch.no_grad(): logits, _ = clip_model(image_tensor, text_tokens) all_probs = logits.softmax(dim=-1).cpu().numpy()[0] # Calculate style scores drip_len = len(style_prompts['drippy']) mid_len = len(style_prompts['mid']) drip_score = np.mean(all_probs[0 : drip_len]) mid_score = np.mean(all_probs[drip_len : drip_len + mid_len]) not_score = np.mean(all_probs[drip_len + mid_len : style_prompts_end_index]) # Determine overall style category AND DEFINE score_label score_label = "Style Score" # Initialize with a default/fallback if drip_score > 0.41 and drip_score > mid_score and drip_score > not_score: category_key = 'drippy' final_score = drip_score score_label = "Drip Score" # <<< DEFINE score_label elif mid_score > not_score: # Check mid_score > not_score explicitly category_key = 'mid' final_score = mid_score score_label = "Mid Score" # <<< DEFINE score_label else: category_key = 'not_drippy' final_score = not_score score_label = "Trash Score" # <<< DEFINE score_label # Or maybe "Rating Score" category_label = CATEGORY_LABEL_MAP[category_key] # final_score_str = f"{final_score:.2f}" # You might not need this raw score string anymore percentage_score = max(0, final_score * 100) percentage_score_str = f"{percentage_score:.0f}%" # Formats as integer (e.g., "3%", "15%", "0%") # Now score_label is defined before being used here print(f"Style analysis: Category={category_label}, Score = {score_label}={percentage_score_str} (Raw Score: {final_score:.4f})") # Get top clothing item from CLIP top_3_clip_items = get_top_clip_clothing(all_probs, n=3) # <<< Ask for top 3 items if top_3_clip_items: # Print the top 3 detected items detected_items_str = ", ".join([f"{item[0]} ({item[1]*100:.1f}%)" for item in top_3_clip_items]) # Show item and probability print(f"I think I detected: {detected_items_str}") # Still use the single *most* probable item for response generation logic later clip_detected_item, clip_detected_item_prob = top_3_clip_items[0] # Optional: You can keep or remove the print for the single top item below if the top-3 print is sufficient # print(f"Top clothing item identified by CLIP (for response): '{clip_detected_item}' " # f"with probability {clip_detected_item_prob:.2f}") else: print("I couldn't confidently identify specific clothing items via CLIP.") clip_detected_item = "piece" # Use a different fallback if CLIP fails clip_detected_item_prob = 0.0 # Ensure prob is defined except Exception as e: print(f"Error during CLIP analysis: {e}") # Use defaults, maybe return error message? return ("

Error during CLIP analysis.

", None, f"Analysis Error: {e}") # 4) Determine the Final Item to Mention in Response final_clothing_item = "style" # Ultimate fallback generic term generic_response_needed = False if detected_fashion_item_name and detected_fashion_item_conf >= YOLO_FASHION_HIGH_CONF_THRESHOLD: # Priority 1: High-confidence fashion model detection final_clothing_item = detected_fashion_item_name print(f"Using highly confident fashion model item: '{final_clothing_item}'") elif detected_fashion_item_name and detected_fashion_item_conf >= YOLO_FASHION_CONF_THRESHOLD: # Priority 2: Medium-confidence fashion model detection (still prefer over CLIP) final_clothing_item = detected_fashion_item_name print(f"Using medium confidence fashion model item: '{final_clothing_item}'") elif clip_detected_item and clip_detected_item_prob > 0.05: # Check if CLIP prob is somewhat reasonable # Priority 3: CLIP detection (if fashion model didn't provide a strong candidate) final_clothing_item = clip_detected_item print(f"Using CLIP detected item: '{final_clothing_item}'") else: # Priority 4: Generic response needed (no confident detection from either model) final_clothing_item = random.choice(["fit", "look", "style", "vibe"]) # Randomize generic term generic_response_needed = True print(f"Using generic fallback item: '{final_clothing_item}'") # 5) Generate Response and TTS try: response_pool = response_templates[category_key] # Choose a random template from the entire response pool chosen_template = random.choice(response_pool) # Format the response, substituting the item name if needed response_text = chosen_template.format(item=final_clothing_item) if '{item}' in chosen_template else chosen_template tts_path = os.path.join(tempfile.gettempdir(), f"drip_{uuid.uuid4().hex}.mp3") tts = gTTS(text=response_text, lang='en', tld='com', slow=False) tts.save(tts_path) print(f"Generated TTS response: '{response_text}' saved to {tts_path}") # --- Updated HTML Output --- category_html = f"""

RATING: {category_label.upper()}

{score_label}: {percentage_score_str}

""" return category_html, tts_path, response_text except Exception as e: print(f"Error during response/TTS generation: {e}") percentage_score = max(0, final_score * 100) percentage_score_str = f"{percentage_score:.0f}%" category_html = f"""

Result: {category_label.upper()}

{score_label}: {percentage_score_str}

Error generating audio/full response.

""" # Still provide category info, but indicate TTS/response error return category_html, None, f"Analysis complete ({category_label}), but error generating audio/response."