dzmu committed on
Commit
aad2489
·
verified ·
1 Parent(s): 6640bd7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -120
app.py CHANGED
@@ -5,7 +5,7 @@ import numpy as np
5
  import random
6
  import os
7
  from PIL import Image
8
- from ultralytics import YOLO
9
  from gtts import gTTS
10
  import uuid
11
  import time
@@ -14,7 +14,7 @@ import tempfile
14
  # --- Configuration ---
15
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
16
  YOLO_PERSON_MODEL_PATH = 'yolov8n.pt' # Standard YOLOv8 for person detection
17
- YOLO_FASHION_MODEL_PATH = 'best.pt' # Your trained fashion model
18
  CLIP_MODEL_NAME = "ViT-B/32"
19
 
20
  # --- Load Models ---
@@ -24,7 +24,7 @@ try:
24
  print(f"CLIP model ({CLIP_MODEL_NAME}) loaded successfully.")
25
  except Exception as e:
26
  print(f"Error loading CLIP model: {e}")
27
- # Handle error appropriately, maybe exit or use a fallback
28
 
29
  try:
30
  yolo_person_model = YOLO(YOLO_PERSON_MODEL_PATH).to(DEVICE)
@@ -33,154 +33,136 @@ except Exception as e:
33
  print(f"Error loading YOLO person model: {e}")
34
  # Handle error
35
 
36
- try:
37
- fashion_model = YOLO(YOLO_FASHION_MODEL_PATH).to(DEVICE)
38
- print(f"YOLO fashion model ({YOLO_FASHION_MODEL_PATH}) loaded successfully.")
39
- # It's crucial that fashion_model.names is populated correctly after loading.
40
- # If it's not, you might need to load names from a corresponding .yaml file.
41
- if not hasattr(fashion_model, 'names') or not fashion_model.names:
42
- print("Warning: Fashion model names not found. Detection might not work correctly.")
43
- # Example: Manually assign if needed (replace with your actual class names)
44
- # fashion_model.names = {0: 't-shirt', 1: 'jeans', 2: 'sneakers', ...}
45
- except Exception as e:
46
- print(f"Error loading YOLO fashion model: {e}")
47
- # Handle error
48
 
49
  # --- Prompts and Responses ---
50
  style_prompts = {
51
  'drippy': [
52
- "avant-garde streetwear",
53
- "high-fashion designer outfit",
54
- "trendsetting urban attire",
55
- "luxury sneakers and chic accessories",
56
- "cutting-edge, bold style"
57
  ],
58
  'mid': [
59
- "casual everyday outfit",
60
- "modern minimalistic attire",
61
- "comfortable yet stylish look",
62
- "simple, relaxed streetwear",
63
- "balanced, practical fashion"
64
  ],
65
  'not_drippy': [
66
- "disheveled outfit",
67
- "poorly coordinated fashion",
68
- "unfashionable, outdated attire",
69
- "tacky, mismatched ensemble",
70
- "sloppy, uninspired look"
71
  ]
72
  }
73
 
74
- # Only style prompts are needed for CLIP now
75
- clip_style_texts = []
76
- for category in style_prompts:
77
- clip_style_texts.extend(style_prompts[category])
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  response_templates = {
80
  'drippy': [
81
- "You're Drippy, bruh – fire {item}!",
82
- "{item} goes crazy, on god!",
83
- "Certified drippy with that {item}."
84
  ],
85
  'mid': [
86
- "Drop the {item} and you might get a text back.",
87
- "It's alright, but I'd upgrade the {item}.",
88
  "Mid fit alert. That {item} is holding you back."
89
  ],
90
  'not_drippy': [
91
- "Bro thought that {item} was tuff!",
92
- "Oh hell nah! Burn that {item}!",
93
  "Crimes against fashion, especially that {item}! Also… maybe get a haircut.",
94
  "Never walk out the house again with that {item}."
95
  ]
96
  }
97
 
98
- # Map internal category keys to user-facing labels
99
- CATEGORY_LABEL_MAP = {
100
- "drippy": "drippy",
101
- "mid": "mid",
102
- "not_drippy": "trash"
103
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  # --- Core Logic ---
106
  def analyze_outfit(input_img: Image.Image):
107
  if input_img is None:
108
  return "Please upload an image.", None, "Error: No image provided."
109
 
110
- img = input_img.copy() # Work on a copy
111
 
112
- # 1) YOLO Person Detection
113
- person_results = yolo_person_model(img, verbose=False) # verbose=False suppresses console output
114
  boxes = person_results[0].boxes.xyxy.cpu().numpy()
115
  classes = person_results[0].boxes.cls.cpu().numpy()
116
  confidences = person_results[0].boxes.conf.cpu().numpy()
117
 
118
- # Find the most confident 'person' detection (class ID 0 for COCO)
119
  person_indices = np.where(classes == 0)[0]
120
- cropped_img = img # Default to full image if no person found
121
-
122
  if len(person_indices) > 0:
123
  max_conf_person_idx = person_indices[np.argmax(confidences[person_indices])]
124
  x1, y1, x2, y2 = map(int, boxes[max_conf_person_idx])
125
- # Ensure crop coordinates are valid
126
  x1, y1 = max(0, x1), max(0, y1)
127
  x2, y2 = min(img.width, x2), min(img.height, y2)
128
  if x1 < x2 and y1 < y2:
129
  cropped_img = img.crop((x1, y1, x2, y2))
 
130
  else:
131
  print("Warning: Invalid person bounding box after clipping. Using full image.")
132
  cropped_img = img
133
- print(f"Person detected and cropped: Box {x1, y1, x2, y2}")
134
  else:
135
  print("No person detected by yolo_person_model. Analyzing full image.")
136
- # Decide if you want to proceed without a person or return an error
137
- # return "Could not detect a person in the image.", None, "Error: Person not found."
138
 
139
- # 2) YOLO Fashion Detection (on the cropped image)
140
- detected_clothing_item = "fit" # Default item if no clothing detected
141
- try:
142
- fashion_results = fashion_model(cropped_img, conf=0.1, verbose=False)
143
- if len(fashion_results[0].boxes) > 0:
144
- fashion_boxes = fashion_results[0].boxes.xyxy.cpu().numpy()
145
- fashion_classes = fashion_results[0].boxes.cls.cpu().numpy()
146
- fashion_confidences = fashion_results[0].boxes.conf.cpu().numpy()
147
- fashion_names = fashion_results[0].names # Dictionary mapping class index to name
148
-
149
- # Get the most confident clothing detection
150
- max_conf_fashion_idx = np.argmax(fashion_confidences)
151
- detected_class_id = int(fashion_classes[max_conf_fashion_idx])
152
-
153
- if fashion_names and detected_class_id in fashion_names:
154
- detected_clothing_item = fashion_names[detected_class_id]
155
- print(f"Most confident clothing item detected: {detected_clothing_item} (Conf: {fashion_confidences[max_conf_fashion_idx]:.2f})")
156
- else:
157
- print(f"Warning: Detected clothing class ID {detected_class_id} not found in fashion model names.")
158
- detected_clothing_item = "clothing item" # Fallback if name mapping fails
159
- else:
160
- print("No clothing items detected by fashion_model on the cropped image.")
161
- detected_clothing_item = "style" # Fallback if nothing specific is found
162
- except Exception as e:
163
- print(f"Error during fashion detection: {e}")
164
- detected_clothing_item = "outfit" # General fallback on error
165
 
166
- # 3) CLIP Style Analysis (on the cropped image)
 
167
  try:
168
  image_tensor = clip_preprocess(cropped_img).unsqueeze(0).to(DEVICE)
169
- text_tokens = clip.tokenize(clip_style_texts).to(DEVICE)
 
170
 
171
  with torch.no_grad():
172
  logits, _ = clip_model(image_tensor, text_tokens)
173
- # Probabilities ONLY for the style prompts
174
- style_probs = logits.softmax(dim=-1).cpu().numpy()[0]
175
 
176
- # Calculate average scores for each style category
177
  drip_len = len(style_prompts['drippy'])
178
  mid_len = len(style_prompts['mid'])
179
- # not_len = len(style_prompts['not_drippy']) # Length of the last section
180
 
181
- drip_score = np.mean(style_probs[0 : drip_len])
182
- mid_score = np.mean(style_probs[drip_len : drip_len + mid_len])
183
- not_score = np.mean(style_probs[drip_len + mid_len :]) # Rest are 'not_drippy'
184
 
185
  # Determine the category based on highest average score
186
  if drip_score > mid_score and drip_score > not_score:
@@ -194,72 +176,66 @@ def analyze_outfit(input_img: Image.Image):
194
  final_score = not_score
195
 
196
  category_label = CATEGORY_LABEL_MAP[category_key]
197
- final_score_str = f"{final_score:.2f}" # Format score
198
  print(f"Style analysis: Category={category_label}, Score={final_score_str}")
199
 
 
 
 
 
 
 
 
 
 
 
200
  except Exception as e:
201
- print(f"Error during CLIP analysis: {e}")
202
- # Handle CLIP error - maybe return a default message
203
- return "Error during style analysis.", None, f"Analysis Error: {e}"
204
 
205
- # 4) Generate Response and TTS
206
  try:
207
- # Select a random response template for the determined category
208
  response_text = random.choice(response_templates[category_key]).format(item=detected_clothing_item)
209
 
210
- # Generate TTS audio
211
  tts_path = os.path.join(tempfile.gettempdir(), f"drip_{uuid.uuid4().hex}.mp3")
212
- tts = gTTS(text=response_text, lang='en', tld='com', slow=False) # Use tld='com' for a standard voice
213
  tts.save(tts_path)
214
  print(f"Generated TTS response: '{response_text}' saved to {tts_path}")
215
 
216
- # Output HTML for category + numeric score
217
  category_html = f"""
218
  <div style='text-align: center; padding: 15px; border: 1px solid #eee; border-radius: 8px;'>
219
  <h2 style='color: #333; margin-bottom: 5px;'>Your fit is {category_label.upper()}!</h2>
220
  <p style='font-size: 1.1em; color: #555; margin-top: 0;'>Style Score: {final_score_str}</p>
221
  </div>
222
  """
223
-
224
  return category_html, tts_path, response_text
225
 
226
  except Exception as e:
227
  print(f"Error during response/TTS generation: {e}")
228
- # Fallback if TTS or formatting fails
229
  category_html = f"<h2>Result: {category_label} (Score: {final_score_str})</h2>"
230
  return category_html, None, f"Analysis complete ({category_label}), but error generating audio/response."
231
 
232
 
233
- # --- Gradio Interface ---
234
  with gr.Blocks(css=".gradio-container { max-width: 800px !important; margin: auto !important; } footer { display: none !important; }") as demo:
235
  gr.Markdown("<h1 style='text-align: center; margin-bottom: 20px;'>💧 DripAI: Rate Your Fit 💧</h1>")
236
-
237
  with gr.Row():
238
  with gr.Column(scale=1):
239
  input_image = gr.Image(
240
- type='pil',
241
- label="Upload, Paste, or Use Webcam for your Outfit Photo",
242
- # Explicitly define sources for better UI clarity
243
- sources=['upload', 'webcam', 'clipboard'],
244
- height=400
245
  )
246
  analyze_button = gr.Button("Analyze Outfit", variant="primary", size="lg")
247
-
248
  with gr.Column(scale=1):
249
  gr.Markdown("### Analysis Result:")
250
- category_html = gr.HTML(label="Category & Score") # Displays HTML output
251
  audio_output = gr.Audio(autoplay=True, label="Audio Feedback", streaming=False)
252
- response_box = gr.Textbox(lines=4, label="Text Feedback", interactive=False) # Make textbox read-only
253
-
254
  analyze_button.click(
255
- fn=analyze_outfit,
256
- inputs=[input_image],
257
- outputs=[category_html, audio_output, response_box],
258
- # show_progress="full" # Optional: Show progress bar during processing
259
  )
260
-
261
  gr.Markdown("<p style='text-align: center; color: grey; font-size: 0.9em;'>Upload an image of your outfit and click 'Analyze Outfit'. DripAI will rate your style and identify a key clothing item.</p>")
262
 
263
  # --- Launch App ---
264
  if __name__ == "__main__":
265
- demo.launch(debug=True) # Enable debug for more detailed logs
 
5
  import random
6
  import os
7
  from PIL import Image
8
+ from ultralytics import YOLO # Still needed for person detection
9
  from gtts import gTTS
10
  import uuid
11
  import time
 
14
  # --- Configuration ---
15
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
16
  YOLO_PERSON_MODEL_PATH = 'yolov8n.pt' # Standard YOLOv8 for person detection
17
+ # YOLO_FASHION_MODEL_PATH = 'best.pt' # REMOVED - Not using fashion model anymore
18
  CLIP_MODEL_NAME = "ViT-B/32"
19
 
20
  # --- Load Models ---
 
24
  print(f"CLIP model ({CLIP_MODEL_NAME}) loaded successfully.")
25
  except Exception as e:
26
  print(f"Error loading CLIP model: {e}")
27
+ # Handle error
28
 
29
  try:
30
  yolo_person_model = YOLO(YOLO_PERSON_MODEL_PATH).to(DEVICE)
 
33
  print(f"Error loading YOLO person model: {e}")
34
  # Handle error
35
 
36
+ # REMOVED Fashion Model Loading
37
+ # try:
38
+ # fashion_model = YOLO(YOLO_FASHION_MODEL_PATH).to(DEVICE)
39
+ # print(f"YOLO fashion model ({YOLO_FASHION_MODEL_PATH}) loaded successfully.")
40
+ # if not hasattr(fashion_model, 'names') or not fashion_model.names:
41
+ # print("Warning: Fashion model names not found.")
42
+ # except Exception as e:
43
+ # print(f"Error loading YOLO fashion model: {e}")
 
 
 
 
44
 
45
# --- Prompts and Responses ---
# CLIP text prompts describing each style tier; the category whose prompts
# score highest on average decides the verdict.
style_prompts = {
    'drippy': [
        "avant-garde streetwear",
        "high-fashion designer outfit",
        "trendsetting urban attire",
        "luxury sneakers and chic accessories",
        "cutting-edge, bold style",
    ],
    'mid': [
        "casual everyday outfit",
        "modern minimalistic attire",
        "comfortable yet stylish look",
        "simple, relaxed streetwear",
        "balanced, practical fashion",
    ],
    'not_drippy': [
        "disheveled outfit",
        "poorly coordinated fashion",
        "unfashionable, outdated attire",
        "tacky, mismatched ensemble",
        "sloppy, uninspired look",
    ],
}

# Clothing labels scored by CLIP; the best-matching one is slotted into the
# response template as {item}.
clothing_prompts = [
    "t-shirt", "dress shirt", "blouse", "hoodie", "jacket", "sweater", "coat",
    "dress", "skirt", "pants", "jeans", "trousers", "shorts",
    "sneakers", "boots", "heels", "sandals",
    "cap", "hat", "scarf", "gloves", "bag", "accessory", "tank-top", "haircut"
]

# One flat prompt list for a single CLIP pass: style prompts first (dict
# insertion order), clothing prompts after. style_prompts_end_index marks
# where the style slice ends and the clothing slice begins.
all_prompts = [prompt for prompts in style_prompts.values() for prompt in prompts]
style_prompts_end_index = len(all_prompts)
all_prompts += clothing_prompts
print(f"Total prompts for CLIP: {len(all_prompts)}")
77
 
78
# Spoken/text feedback templates per style tier; each contains an {item}
# placeholder filled with the CLIP-detected clothing label.
response_templates = {
    'drippy': [
        "You're Drippy, bruh – fire {item}!",
        "{item} goes crazy, on god!",
        "Certified drippy with that {item}.",
    ],
    'mid': [
        "Drop the {item} and you might get a text back.",
        "It's alright, but I'd upgrade the {item}.",
        "Mid fit alert. That {item} is holding you back.",
    ],
    'not_drippy': [
        "Bro thought that {item} was tuff!",
        "Oh hell nah! Burn that {item}!",
        "Crimes against fashion, especially that {item}! Also… maybe get a haircut.",
        "Never walk out the house again with that {item}.",
    ],
}

# Internal category key -> label shown to the user.
CATEGORY_LABEL_MAP = {
    "drippy": "drippy",
    "mid": "mid",
    "not_drippy": "trash",
}
94
+
95
+ # --- REINSTATED: Function to get top clothing items based on CLIP probabilities ---
96
+ def get_top_clothing(probs, n=3):
97
+ """Gets the top N clothing items based on CLIP probabilities."""
98
+ # Calculate the start index of clothing probabilities in the combined 'probs' array
99
+ clothing_probs_start_index = style_prompts_end_index
100
+ clothing_probs = probs[clothing_probs_start_index:]
101
+
102
+ # Ensure we don't request more items than available prompts
103
+ actual_n = min(n, len(clothing_prompts))
104
+ if actual_n <= 0:
105
+ return ["item"] # Return default if no clothing prompts
106
+
107
+ # Get indices of top N probabilities within the clothing_probs slice
108
+ top_indices_in_slice = np.argsort(clothing_probs)[-actual_n:]
109
+
110
+ # Return the corresponding clothing prompt names in descending order of probability
111
+ return [clothing_prompts[i] for i in reversed(top_indices_in_slice)]
112
+
113
 
114
  # --- Core Logic ---
115
  def analyze_outfit(input_img: Image.Image):
116
  if input_img is None:
117
  return "Please upload an image.", None, "Error: No image provided."
118
 
119
+ img = input_img.copy()
120
 
121
+ # 1) YOLO Person Detection (Same as before)
122
+ person_results = yolo_person_model(img, verbose=False)
123
  boxes = person_results[0].boxes.xyxy.cpu().numpy()
124
  classes = person_results[0].boxes.cls.cpu().numpy()
125
  confidences = person_results[0].boxes.conf.cpu().numpy()
126
 
 
127
  person_indices = np.where(classes == 0)[0]
128
+ cropped_img = img
 
129
  if len(person_indices) > 0:
130
  max_conf_person_idx = person_indices[np.argmax(confidences[person_indices])]
131
  x1, y1, x2, y2 = map(int, boxes[max_conf_person_idx])
 
132
  x1, y1 = max(0, x1), max(0, y1)
133
  x2, y2 = min(img.width, x2), min(img.height, y2)
134
  if x1 < x2 and y1 < y2:
135
  cropped_img = img.crop((x1, y1, x2, y2))
136
+ print(f"Person detected and cropped: Box {x1, y1, x2, y2}")
137
  else:
138
  print("Warning: Invalid person bounding box after clipping. Using full image.")
139
  cropped_img = img
 
140
  else:
141
  print("No person detected by yolo_person_model. Analyzing full image.")
142
+ # Decide if you want to proceed or return an error
 
143
 
144
+ # --- REMOVED: YOLO Fashion Detection ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
+ # 2) CLIP Analysis (Using ALL prompts - Style + Clothing)
147
+ detected_clothing_item = "look" # Default if something goes wrong
148
  try:
149
  image_tensor = clip_preprocess(cropped_img).unsqueeze(0).to(DEVICE)
150
+ # --- Use all_prompts for tokenization ---
151
+ text_tokens = clip.tokenize(all_prompts).to(DEVICE)
152
 
153
  with torch.no_grad():
154
  logits, _ = clip_model(image_tensor, text_tokens)
155
+ # --- Probabilities for ALL prompts ---
156
+ all_probs = logits.softmax(dim=-1).cpu().numpy()[0]
157
 
158
+ # Calculate average scores for each style category based on their slices in all_probs
159
  drip_len = len(style_prompts['drippy'])
160
  mid_len = len(style_prompts['mid'])
161
+ # not_len = len(style_prompts['not_drippy']) # Calculated implicitly below
162
 
163
+ drip_score = np.mean(all_probs[0 : drip_len])
164
+ mid_score = np.mean(all_probs[drip_len : drip_len + mid_len])
165
+ not_score = np.mean(all_probs[drip_len + mid_len : style_prompts_end_index]) # Scores up to end of style prompts
166
 
167
  # Determine the category based on highest average score
168
  if drip_score > mid_score and drip_score > not_score:
 
176
  final_score = not_score
177
 
178
  category_label = CATEGORY_LABEL_MAP[category_key]
179
+ final_score_str = f"{final_score:.2f}"
180
  print(f"Style analysis: Category={category_label}, Score={final_score_str}")
181
 
182
+ # --- REINSTATED: Get clothing item using CLIP probs ---
183
+ clothing_items_detected_by_clip = get_top_clothing(all_probs, n=1) # Get top 1 item
184
+ if clothing_items_detected_by_clip:
185
+ detected_clothing_item = clothing_items_detected_by_clip[0]
186
+ print(f"Top clothing item identified by CLIP: {detected_clothing_item}")
187
+ else:
188
+ print("Warning: CLIP did not identify a top clothing item.")
189
+ detected_clothing_item = "fit" # Fallback if get_top_clothing fails
190
+
191
+
192
  except Exception as e:
193
+ print(f"Error during CLIP analysis or clothing selection: {e}")
194
+ return "Error during analysis.", None, f"Analysis Error: {e}"
 
195
 
196
+ # 3) Generate Response and TTS (Same as before, but uses item from CLIP)
197
  try:
 
198
  response_text = random.choice(response_templates[category_key]).format(item=detected_clothing_item)
199
 
 
200
  tts_path = os.path.join(tempfile.gettempdir(), f"drip_{uuid.uuid4().hex}.mp3")
201
+ tts = gTTS(text=response_text, lang='en', tld='com', slow=False)
202
  tts.save(tts_path)
203
  print(f"Generated TTS response: '{response_text}' saved to {tts_path}")
204
 
 
205
  category_html = f"""
206
  <div style='text-align: center; padding: 15px; border: 1px solid #eee; border-radius: 8px;'>
207
  <h2 style='color: #333; margin-bottom: 5px;'>Your fit is {category_label.upper()}!</h2>
208
  <p style='font-size: 1.1em; color: #555; margin-top: 0;'>Style Score: {final_score_str}</p>
209
  </div>
210
  """
 
211
  return category_html, tts_path, response_text
212
 
213
  except Exception as e:
214
  print(f"Error during response/TTS generation: {e}")
 
215
  category_html = f"<h2>Result: {category_label} (Score: {final_score_str})</h2>"
216
  return category_html, None, f"Analysis complete ({category_label}), but error generating audio/response."
217
 
218
 
219
+ # --- Gradio Interface (Unchanged) ---
220
  with gr.Blocks(css=".gradio-container { max-width: 800px !important; margin: auto !important; } footer { display: none !important; }") as demo:
221
  gr.Markdown("<h1 style='text-align: center; margin-bottom: 20px;'>💧 DripAI: Rate Your Fit 💧</h1>")
 
222
  with gr.Row():
223
  with gr.Column(scale=1):
224
  input_image = gr.Image(
225
+ type='pil', label="Upload, Paste, or Use Webcam for your Outfit Photo",
226
+ sources=['upload', 'webcam', 'clipboard'], height=400
 
 
 
227
  )
228
  analyze_button = gr.Button("Analyze Outfit", variant="primary", size="lg")
 
229
  with gr.Column(scale=1):
230
  gr.Markdown("### Analysis Result:")
231
+ category_html = gr.HTML(label="Category & Score")
232
  audio_output = gr.Audio(autoplay=True, label="Audio Feedback", streaming=False)
233
+ response_box = gr.Textbox(lines=4, label="Text Feedback", interactive=False)
 
234
  analyze_button.click(
235
+ fn=analyze_outfit, inputs=[input_image], outputs=[category_html, audio_output, response_box]
 
 
 
236
  )
 
237
  gr.Markdown("<p style='text-align: center; color: grey; font-size: 0.9em;'>Upload an image of your outfit and click 'Analyze Outfit'. DripAI will rate your style and identify a key clothing item.</p>")
238
 
239
  # --- Launch App ---
240
  if __name__ == "__main__":
241
+ demo.launch(debug=True) # Assumes debug is helpful on HF too, might remove later