Update app.py

app.py CHANGED

@@ -8,131 +8,258 @@ from PIL import Image
 from ultralytics import YOLO
 from gtts import gTTS
 import uuid
+import time
 import tempfile

-…
+# --- Configuration ---
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+YOLO_PERSON_MODEL_PATH = 'yolov8n.pt' # Standard YOLOv8 for person detection
+YOLO_FASHION_MODEL_PATH = 'best.pt' # Your trained fashion model
+CLIP_MODEL_NAME = "ViT-B/32"

+# --- Load Models ---
+print(f"Using device: {DEVICE}")
+try:
+    clip_model, clip_preprocess = clip.load(CLIP_MODEL_NAME, device=DEVICE)
+    print(f"CLIP model ({CLIP_MODEL_NAME}) loaded successfully.")
+except Exception as e:
+    print(f"Error loading CLIP model: {e}")
+    # Handle error appropriately, maybe exit or use a fallback
+
+try:
+    yolo_person_model = YOLO(YOLO_PERSON_MODEL_PATH).to(DEVICE)
+    print(f"YOLO person detection model ({YOLO_PERSON_MODEL_PATH}) loaded successfully.")
+except Exception as e:
+    print(f"Error loading YOLO person model: {e}")
+    # Handle error
+
+try:
+    fashion_model = YOLO(YOLO_FASHION_MODEL_PATH).to(DEVICE)
+    print(f"YOLO fashion model ({YOLO_FASHION_MODEL_PATH}) loaded successfully.")
+    # It's crucial that fashion_model.names is populated correctly after loading.
+    # If it's not, you might need to load names from a corresponding .yaml file.
+    if not hasattr(fashion_model, 'names') or not fashion_model.names:
+        print("Warning: Fashion model names not found. Detection might not work correctly.")
+        # Example: Manually assign if needed (replace with your actual class names)
+        # fashion_model.names = {0: 't-shirt', 1: 'jeans', 2: 'sneakers', ...}
+except Exception as e:
+    print(f"Error loading YOLO fashion model: {e}")
+    # Handle error
+
+# --- Prompts and Responses ---
 style_prompts = {
-    …
-        "avant-garde streetwear",
-        "…
+    'drippy': [
+        "avant-garde streetwear",
+        "high-fashion designer outfit",
+        "trendsetting urban attire",
+        "luxury sneakers and chic accessories",
+        "cutting-edge, bold style"
     ],
-    …
-        "casual everyday outfit",
-        "…
+    'mid': [
+        "casual everyday outfit",
+        "modern minimalistic attire",
+        "comfortable yet stylish look",
+        "simple, relaxed streetwear",
+        "balanced, practical fashion"
     ],
-    …
-        "disheveled outfit",
-        "…
+    'not_drippy': [
+        "disheveled outfit",
+        "poorly coordinated fashion",
+        "unfashionable, outdated attire",
+        "tacky, mismatched ensemble",
+        "sloppy, uninspired look"
     ]
 }

-…
-]
+# Only style prompts are needed for CLIP now
+clip_style_texts = []
+for category in style_prompts:
+    clip_style_texts.extend(style_prompts[category])

 response_templates = {
-    …
-        "You're Drippy, bruh – fire {item}!",
+    'drippy': [
+        "You're Drippy, bruh – fire {item}!",
+        "{item} goes crazy, on god!",
         "Certified drippy with that {item}."
     ],
-    …
-        "Drop the {item} and you might get a text back.",
+    'mid': [
+        "Drop the {item} and you might get a text back.",
+        "It's alright, but I'd upgrade the {item}.",
         "Mid fit alert. That {item} is holding you back."
     ],
-    …
-        "Bro thought that {item} was tuff!",
+    'not_drippy': [
+        "Bro thought that {item} was tuff!",
+        "Oh hell nah! Burn that {item}!",
         "Crimes against fashion, especially that {item}! Also… maybe get a haircut.",
         "Never walk out the house again with that {item}."
     ]
 }

-…
+# Map internal category keys to user-facing labels
+CATEGORY_LABEL_MAP = {
+    "drippy": "drippy",
+    "mid": "mid",
+    "not_drippy": "trash"
+}

-…
+# --- Core Logic ---
+def analyze_outfit(input_img: Image.Image):
+    if input_img is None:
+        return "Please upload an image.", None, "Error: No image provided."

-…
-    results = yolo_model(img)
-    boxes = results[0].boxes.xyxy.cpu().numpy()
-    classes = results[0].boxes.cls.cpu().numpy()
-    confidences = results[0].boxes.conf.cpu().numpy()
+    img = input_img.copy() # Work on a copy
+
+    # 1) YOLO Person Detection
+    person_results = yolo_person_model(img, verbose=False) # verbose=False suppresses console output
+    boxes = person_results[0].boxes.xyxy.cpu().numpy()
+    classes = person_results[0].boxes.cls.cpu().numpy()
+    confidences = person_results[0].boxes.conf.cpu().numpy()

+    # Find the most confident 'person' detection (class ID 0 for COCO)
     person_indices = np.where(classes == 0)[0]
-    …
+    cropped_img = img # Default to full image if no person found
+
     if len(person_indices) > 0:
-        …
-        x1, y1, x2, y2 = map(int, boxes[…
-        …
-        text_tokens = clip.tokenize([str(p) for p in all_prompts]).to(device)
-        with torch.no_grad():
-            logits, _ = clip_model(image_tensor, text_tokens)
-        probs = logits.softmax(dim=-1).cpu().numpy()[0]
-
-        drip_len = len(style_prompts["drippy"])
-        mid_len = len(style_prompts["mid"])
-        not_len = len(style_prompts["not_drippy"])
-
-        drip_score = np.mean(probs[:drip_len])
-        mid_score = np.mean(probs[drip_len:drip_len + mid_len])
-        not_score = np.mean(probs[drip_len + mid_len:])
-
-        if drip_score > mid_score and drip_score > not_score:
-            cat = "drippy"
-            final_score = drip_score
-        elif mid_score > not_score:
-            cat = "mid"
-            final_score = mid_score
+        max_conf_person_idx = person_indices[np.argmax(confidences[person_indices])]
+        x1, y1, x2, y2 = map(int, boxes[max_conf_person_idx])
+        # Ensure crop coordinates are valid
+        x1, y1 = max(0, x1), max(0, y1)
+        x2, y2 = min(img.width, x2), min(img.height, y2)
+        if x1 < x2 and y1 < y2:
+            cropped_img = img.crop((x1, y1, x2, y2))
+        else:
+            print("Warning: Invalid person bounding box after clipping. Using full image.")
+            cropped_img = img
+        print(f"Person detected and cropped: Box {x1, y1, x2, y2}")
     else:
-        …
+        print("No person detected by yolo_person_model. Analyzing full image.")
+        # Decide if you want to proceed without a person or return an error
+        # return "Could not detect a person in the image.", None, "Error: Person not found."
+
+    # 2) YOLO Fashion Detection (on the cropped image)
+    detected_clothing_item = "fit" # Default item if no clothing detected
+    try:
+        fashion_results = fashion_model(cropped_img, verbose=False)
+        if len(fashion_results[0].boxes) > 0:
+            fashion_boxes = fashion_results[0].boxes.xyxy.cpu().numpy()
+            fashion_classes = fashion_results[0].boxes.cls.cpu().numpy()
+            fashion_confidences = fashion_results[0].boxes.conf.cpu().numpy()
+            fashion_names = fashion_results[0].names # Dictionary mapping class index to name
+
+            # Get the most confident clothing detection
+            max_conf_fashion_idx = np.argmax(fashion_confidences)
+            detected_class_id = int(fashion_classes[max_conf_fashion_idx])
+
+            if fashion_names and detected_class_id in fashion_names:
+                detected_clothing_item = fashion_names[detected_class_id]
+                print(f"Most confident clothing item detected: {detected_clothing_item} (Conf: {fashion_confidences[max_conf_fashion_idx]:.2f})")
+            else:
+                print(f"Warning: Detected clothing class ID {detected_class_id} not found in fashion model names.")
+                detected_clothing_item = "clothing item" # Fallback if name mapping fails
+        else:
+            print("No clothing items detected by fashion_model on the cropped image.")
+            detected_clothing_item = "style" # Fallback if nothing specific is found
+    except Exception as e:
+        print(f"Error during fashion detection: {e}")
+        detected_clothing_item = "outfit" # General fallback on error
+
+    # 3) CLIP Style Analysis (on the cropped image)
+    try:
+        image_tensor = clip_preprocess(cropped_img).unsqueeze(0).to(DEVICE)
+        text_tokens = clip.tokenize(clip_style_texts).to(DEVICE)
+
+        with torch.no_grad():
+            logits, _ = clip_model(image_tensor, text_tokens)
+            # Probabilities ONLY for the style prompts
+            style_probs = logits.softmax(dim=-1).cpu().numpy()[0]
+
+        # Calculate average scores for each style category
+        drip_len = len(style_prompts['drippy'])
+        mid_len = len(style_prompts['mid'])
+        # not_len = len(style_prompts['not_drippy']) # Length of the last section
+
+        drip_score = np.mean(style_probs[0 : drip_len])
+        mid_score = np.mean(style_probs[drip_len : drip_len + mid_len])
+        not_score = np.mean(style_probs[drip_len + mid_len :]) # Rest are 'not_drippy'
+
+        # Determine the category based on highest average score
+        if drip_score > mid_score and drip_score > not_score:
+            category_key = 'drippy'
+            final_score = drip_score
+        elif mid_score > not_score:
+            category_key = 'mid'
+            final_score = mid_score
+        else:
+            category_key = 'not_drippy'
+            final_score = not_score
+
+        category_label = CATEGORY_LABEL_MAP[category_key]
+        final_score_str = f"{final_score:.2f}" # Format score
+        print(f"Style analysis: Category={category_label}, Score={final_score_str}")
+
+    except Exception as e:
+        print(f"Error during CLIP analysis: {e}")
+        # Handle CLIP error - maybe return a default message
+        return "Error during style analysis.", None, f"Analysis Error: {e}"
+
+    # 4) Generate Response and TTS
+    try:
+        # Select a random response template for the determined category
+        response_text = random.choice(response_templates[category_key]).format(item=detected_clothing_item)
+
+        # Generate TTS audio
+        tts_path = os.path.join(tempfile.gettempdir(), f"drip_{uuid.uuid4().hex}.mp3")
+        tts = gTTS(text=response_text, lang='en', tld='com', slow=False) # Use tld='com' for a standard voice
+        tts.save(tts_path)
+        print(f"Generated TTS response: '{response_text}' saved to {tts_path}")
+
+        # Output HTML for category + numeric score
+        category_html = f"""
+        <div style='text-align: center; padding: 15px; border: 1px solid #eee; border-radius: 8px;'>
+            <h2 style='color: #333; margin-bottom: 5px;'>Your fit is {category_label.upper()}!</h2>
+            <p style='font-size: 1.1em; color: #555; margin-top: 0;'>Style Score: {final_score_str}</p>
+        </div>
+        """
+
+        return category_html, tts_path, response_text
+
+    except Exception as e:
+        print(f"Error during response/TTS generation: {e}")
+        # Fallback if TTS or formatting fails
+        category_html = f"<h2>Result: {category_label} (Score: {final_score_str})</h2>"
+        return category_html, None, f"Analysis complete ({category_label}), but error generating audio/response."
+
+
+# --- Gradio Interface ---
+with gr.Blocks(css=".gradio-container { max-width: 800px !important; margin: auto !important; } footer { display: none !important; }") as demo:
+    gr.Markdown("<h1 style='text-align: center; margin-bottom: 20px;'>💧 DripAI: Rate Your Fit 💧</h1>")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            input_image = gr.Image(
+                type='pil',
+                label="Upload, Paste, or Use Webcam for your Outfit Photo",
+                # Explicitly define sources for better UI clarity
+                sources=['upload', 'webcam', 'clipboard'],
+                height=400
+            )
+            analyze_button = gr.Button("Analyze Outfit", variant="primary", size="lg")
+
+        with gr.Column(scale=1):
+            gr.Markdown("### Analysis Result:")
+            category_html = gr.HTML(label="Category & Score") # Displays HTML output
+            audio_output = gr.Audio(autoplay=True, label="Audio Feedback", streaming=False)
+            response_box = gr.Textbox(lines=4, label="Text Feedback", interactive=False) # Make textbox read-only
+
+    analyze_button.click(
+        fn=analyze_outfit,
+        inputs=[input_image],
+        outputs=[category_html, audio_output, response_box],
+        # show_progress="full" # Optional: Show progress bar during processing
+    )
+
+    gr.Markdown("<p style='text-align: center; color: grey; font-size: 0.9em;'>Upload an image of your outfit and click 'Analyze Outfit'. DripAI will rate your style and identify a key clothing item.</p>")

+# --- Launch App ---
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(debug=True) # Enable debug for more detailed logs
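
Note: the hunk starts at line 8, so app.py's first seven lines never appear in the view above. Judging from the names the new code uses (gr, torch, clip, np, os, random, plus the "from PIL import Image" shown as hunk context), lines 1-7 are presumably the remaining imports. A plausible reconstruction, an assumption rather than part of this commit, would be:

# Assumed preamble for app.py lines 1-7 (not shown in this diff); inferred
# from usage: gr.Blocks, torch.no_grad, clip.load/tokenize, np.argmax,
# os.path.join, random.choice, and the Image.Image type hint.
import os
import random
import torch
import clip  # OpenAI CLIP package: pip install git+https://github.com/openai/CLIP.git
import numpy as np
import gradio as gr
from PIL import Image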
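
To sanity-check analyze_outfit outside the Gradio UI, a minimal sketch (assuming the models above have loaded, and using a hypothetical sample photo "outfit.jpg"):

# Quick smoke test for analyze_outfit(); "outfit.jpg" is a hypothetical path.
from PIL import Image

img = Image.open("outfit.jpg").convert("RGB")
html, audio_path, text = analyze_outfit(img)
print(text)        # a response template with the detected clothing item filled in
print(audio_path)  # path of the gTTS mp3 written to the system temp directory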