import gradio as gr import torch import clip import numpy as np import random import os from PIL import Image from ultralytics import YOLO from gtts import gTTS import uuid import time import tempfile from huggingface_hub import hf_hub_download HF_TOKEN = os.environ.get("HF_TOKEN") DEVICE = "cuda" if torch.cuda.is_available() else "cpu" YOLO_PERSON_MODEL_PATH = hf_hub_download( repo_id="dzmu/dripai-models", filename="yolov8n.pt", token=HF_TOKEN, repo_type="model" ) YOLO_FASHION_MODEL_PATH = hf_hub_download( repo_id="dzmu/dripai-models", filename="best.pt", token=HF_TOKEN, repo_type="model" ) CLIP_MODEL_NAME = "ViT-B/32" # Confidence Thresholds YOLO_PERSON_CONF_THRESHOLD = 0.4 YOLO_FASHION_CONF_THRESHOLD = 0.4 YOLO_FASHION_HIGH_CONF_THRESHOLD = 0.6 FASHION_CLASSES = { 0: 'long sleeve top', 1: 'skirt', 2: 'trousers', 3: 'short sleeve top', 4: 'long sleeve outwear', 5: 'short sleeve dress', 6: 'shorts', 7: 'vest dress', 8: 'sling dress', 9: 'vest', 10: 'long sleeve dress', 11: 'sling', 12: 'short sleeve outwear' } print(f"Defined {len(FASHION_CLASSES)} fashion categories for {YOLO_FASHION_MODEL_PATH}") print(f"Using device: {DEVICE}") try: clip_model, clip_preprocess = clip.load(CLIP_MODEL_NAME, device=DEVICE) print(f"CLIP model ({CLIP_MODEL_NAME}) loaded successfully.") except Exception as e: print(f"Error loading CLIP model: {e}") try: yolo_person_model = YOLO(YOLO_PERSON_MODEL_PATH) print(f"YOLO person detection model ({YOLO_PERSON_MODEL_PATH}) loaded successfully.") except Exception as e: print(f"Error loading YOLO person model: {e}") try: yolo_fashion_model = YOLO(YOLO_FASHION_MODEL_PATH) # No .to(DEVICE) needed here print(f"YOLO fashion detection model ({YOLO_FASHION_MODEL_PATH}) loaded successfully.") except Exception as e: print(f"Error loading YOLO fashion model: {e}") style_prompts = { 'drippy': [ "High-fashion runway look with designer labels", "Coordinated color palette with premium fabrics", "Trend-forward streetwear with luxury accessories", "Tailored silhouettes with intentional layering", "Seasonal trend elements executed flawlessly", "Cohesive outfit with statement pieces", "Celebrity red carpet-level styling", "Bold pattern mixing that works harmoniously" "Well-put-together casual outfit", "Confident personal style", "Good color coordination", "Flattering fit and proportions", "Appropriate for the occasion", "Clean and well-maintained clothing", "Interesting texture combinations", "Balanced silhouette", "Thoughtful accessory choices", "Modern but not overly trendy" ], 'mid': [ "Basic wardrobe staples without flair", "Safe color combinations with minimal accessories", "Mass-market fast fashion pieces", "Functional over fashionable aesthetic", "Trend-adjacent but poorly executed", "Mismatched proportions in clothing", "Overly casual for the occasion", "Decade-outdated trend revival" ], 'not_drippy': [ "Severely mismatched color schemes", "Pilled/stained/faded clothing items", "Ill-fitting garments in multiple pieces", "Clashing patterns without cohesion", "Inappropriate footwear for outfit context", "Overly literal costume-like styling", "Multiple competing trends in one outfit", "Wrinkled/unkempt fabric presentation" ] } style_prompts['drippy'] += [ "coordinated color scheme", "perfect garment proportions", "current fashion trends", "high-quality materials", "complementary accessories" ] style_prompts['not_drippy'] += [ "clashing colors", "poor fit proportions", "outdated trends", "cheap-looking fabrics", "missing accessories" ] clothing_prompts = [ "t-shirt", "dress shirt", "blouse", "hoodie", "jacket", "sweater", "coat", "dress", "skirt", "pants", "jeans", "trousers", "shorts", "sneakers", "boots", "heels", "sandals", "cap", "hat", "scarf", "gloves", "bag", "accessory", "tank-top", "haircut" ] all_prompts = [] for cat_prompts in style_prompts.values(): all_prompts.extend(cat_prompts) style_prompts_end_index = len(all_prompts) all_prompts.extend(clothing_prompts) response_templates = { 'drippy': [ "You're Drippy, bruh – fire {item}!", "{item} goes crazy, on god!", "Certified drippy with that {item}.", "Your {item} just walked a Paris runway.", "That {item}? Straight from the future.", "You just turned a sidewalk into a runway." ], 'mid': [ "Drop the {item} and you might get a text back.", "It's alright, but I'd upgrade the {item}.", "Mid fit alert. {item} is holding you back.", "We can do better come on now", "I don't think you want it enough", "You're teetering on drip... fix the {item}.", "You're in the gray zone. That {item} ain't helping." ], 'not_drippy': [ "Bro thought that {item} was tuff!", "Oh hell nah! Burn that {item}!", "Crimes against fashion, especially that {item}! Also… maybe get a haircut.", "Never walk out the house again with that {item}.", "Your drip is trash, try again.", "This ain't it chief. The overall style needs work.", "Didn't need an AI to tell you to go back to the wardrobe", "Someone call the fashion police. That {item} needs arresting.", "Your outfit just gave me a 404 error.", "That {item} made my GPU overheat in shame." ] } CATEGORY_LABEL_MAP = { "drippy": "drippy", "mid": "mid", "not_drippy": "trash" } def format_detected_items(item_list): if not item_list: return "" return "
Detected items: " + ", ".join(item_list) + "
" def get_top_clip_clothing(probs, n=1): """Gets the top N clothing items based on CLIP probabilities.""" clothing_probs_start_index = style_prompts_end_index clothing_probs = probs[clothing_probs_start_index:] actual_n = min(n, len(clothing_prompts)) if actual_n <= 0: return [] top_indices_in_slice = np.argsort(clothing_probs)[-actual_n:] top_global_indices = [idx + clothing_probs_start_index for idx in top_indices_in_slice] top_items_with_probs = [ (clothing_prompts[i], clothing_probs[i]) for i in reversed(top_indices_in_slice) ] return top_items_with_probs def wrapped_analyze(input_img): return analyze_outfit( input_img, yolo_person_model, yolo_fashion_model, clip_model, clip_preprocess, all_prompts, style_prompts_end_index, FASHION_CLASSES, CATEGORY_LABEL_MAP, response_templates, YOLO_PERSON_CONF_THRESHOLD, YOLO_FASHION_CONF_THRESHOLD, YOLO_FASHION_HIGH_CONF_THRESHOLD, DEVICE ) def analyze_outfit(input_img): if isinstance(input_img, str): try: input_img = Image.open(input_img) except Exception as e: return (f"Error loading image: {str(e)}
", None, "Image loading error") if input_img is None: return ("Please upload an image.
", None, "Error: No image provided.") img = input_img.convert("RGB").copy() person_results = yolo_person_model(img, verbose=False, conf=YOLO_PERSON_CONF_THRESHOLD) boxes = person_results[0].boxes.xyxy.cpu().numpy() classes = person_results[0].boxes.cls.cpu().numpy() confidences = person_results[0].boxes.conf.cpu().numpy() person_indices = np.where(classes == 0)[0] cropped_img = img person_detected = False if len(person_indices) > 0: max_conf_person_idx = person_indices[np.argmax(confidences[person_indices])] x1, y1, x2, y2 = map(int, boxes[max_conf_person_idx]) x1, y1 = max(0, x1), max(0, y1) x2, y2 = min(img.width, x2), min(img.height, y2) if x1 < x2 and y1 < y2: cropped_img = img.crop((x1, y1, x2, y2)) print(f"Person detected and cropped: Box {x1, y1, x2, y2}") person_detected = True else: print("Warning: Invalid person bounding box after clipping. Using full image.") cropped_img = img else: print("No person detected by yolo_person_model. Analyzing full image.") detected_fashion_item_name = None detected_fashion_item_conf = 0.0 if person_detected or True: try: fashion_results = yolo_fashion_model(cropped_img, verbose=False, conf=YOLO_FASHION_CONF_THRESHOLD) fashion_boxes = fashion_results[0].boxes.xyxy.cpu().numpy() fashion_classes = fashion_results[0].boxes.cls.cpu().numpy().astype(int) fashion_confidences = fashion_results[0].boxes.conf.cpu().numpy() if len(fashion_classes) > 0: best_fashion_idx = np.argmax(fashion_confidences) detected_class_id = fashion_classes[best_fashion_idx] detected_fashion_item_conf = fashion_confidences[best_fashion_idx] if detected_class_id in FASHION_CLASSES: detected_fashion_item_name = FASHION_CLASSES[detected_class_id] print(f"Fashion model detected: '{detected_fashion_item_name}' " f"with confidence {detected_fashion_item_conf:.2f}") else: print(f"Warning: Detected fashion class ID {detected_class_id} not in FASHION_CLASSES map.") else: print("No fashion items detected above threshold by yolo_fashion_model.") except Exception as e: print(f"Error during YOLO fashion model analysis: {e}") # Continue without fashion model input clip_detected_item = "look" # Default fallback item name clip_detected_item_prob = 0.0 category_key = 'mid' # Default category final_score_str = "N/A" final_score = 0.0 try: image_tensor = clip_preprocess(cropped_img).unsqueeze(0).to(DEVICE) text_tokens = clip.tokenize(all_prompts).to(DEVICE) with torch.no_grad(): logits, _ = clip_model(image_tensor, text_tokens) all_probs = logits.softmax(dim=-1).cpu().numpy()[0] drip_len = len(style_prompts['drippy']) mid_len = len(style_prompts['mid']) score_label = "Style Score" # Initialize with a default/fallback category_label = CATEGORY_LABEL_MAP[category_key] percentage_score = max(0, final_score * 100) percentage_score_str = f"{percentage_score:.0f}%" # Calculate average probabilities for each category drip_score = np.mean(all_probs[0 : drip_len]) mid_score = np.mean(all_probs[drip_len : drip_len + mid_len]) not_score = np.mean(all_probs[drip_len + mid_len : style_prompts_end_index]) raw_weighted_score = (drip_score * 1) + (mid_score * 0.5) + (not_score * 0.1) final_score = raw_weighted_score * 100 + 10 final_score = min(max(final_score, 0), 100) if final_score >= 50: category_key = 'drippy' score_label = "Drip Score" elif final_score >= 20: category_key = 'mid' score_label = "Mid Score" else: category_key = 'not_drippy' score_label = "Trash Score" category_label = CATEGORY_LABEL_MAP[category_key] percentage_score_str = f"{final_score:.0f}%" print(f"Style analysis: Category={category_label}, Score = {score_label}={percentage_score_str} (Raw Score: {final_score:.4f})") # Get top clothing item from CLIP top_3_clip_items = get_top_clip_clothing(all_probs, n=3) if top_3_clip_items: detected_items_str = ", ".join([f"{item[0]} ({item[1]*100:.1f}%)" for item in top_3_clip_items]) # Show item and probability print(f"I think I detected: {detected_items_str}") clip_detected_item, clip_detected_item_prob = top_3_clip_items[0] else: print("I couldn't confidently identify specific clothing items via CLIP.") clip_detected_item = "piece" # Use a different fallback if CLIP fails clip_detected_item_prob = 0.0 # Ensure prob is defined except Exception as e: print(f"Error during CLIP analysis: {e}") return ("Error during CLIP analysis.
", None, f"Analysis Error: {e}") # Determine the Final Item to Mention in Response final_clothing_item = "style" # Ultimate fallback generic term generic_response_needed = False if detected_fashion_item_name and detected_fashion_item_conf >= YOLO_FASHION_HIGH_CONF_THRESHOLD: # Priority 1: High-confidence fashion model detection final_clothing_item = detected_fashion_item_name print(f"Using highly confident fashion model item: '{final_clothing_item}'") elif detected_fashion_item_name and detected_fashion_item_conf >= YOLO_FASHION_CONF_THRESHOLD: # Priority 2: Medium-confidence fashion model detection (still prefer over CLIP) final_clothing_item = detected_fashion_item_name print(f"Using medium confidence fashion model item: '{final_clothing_item}'") elif clip_detected_item and clip_detected_item_prob > 0.05: # Check if CLIP prob is somewhat reasonable # Priority 3: CLIP detection (if fashion model didn't provide a strong candidate) final_clothing_item = clip_detected_item print(f"Using CLIP detected item: '{final_clothing_item}'") else: # Priority 4: Generic response needed (no confident detection from either model) final_clothing_item = random.choice(["fit", "look", "style", "vibe"]) # Randomize generic term generic_response_needed = True print(f"Using generic fallback item: '{final_clothing_item}'") try: response_pool = response_templates[category_key] chosen_template = random.choice(response_pool) # Format the response, substituting the item name if needed response_text = chosen_template.format(item=final_clothing_item) if '{item}' in chosen_template else chosen_template tts_path = os.path.join(tempfile.gettempdir(), f"drip_{uuid.uuid4().hex}.mp3") tts = gTTS(text=response_text, lang='en', tld='com', slow=False) tts.save(tts_path) print(f"Generated TTS response: '{response_text}' saved to {tts_path}") # --- Updated HTML Output --- category_html = f"""Overall Fit Rating: {percentage_score_str}/100%
Detected Item: {final_clothing_item.title()}
Overall Fit Rating: {percentage_score_str}/100
Detected Item: {final_clothing_item.title()}
Error generating audio/full response.