Spaces:

Futuretop
/

CaricatureGenerator-4.0

Runtime error

App Files Files Community

Update app.py

by Futuretop - opened May 19, 2025

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+44

-153

Files changed (1) hide show

app.py +44 -153

app.py CHANGED Viewed

@@ -1,154 +1,45 @@
-from PIL import Image
-from transformers import BlipProcessor, BlipForConditionalGeneration
-import numpy as np
-import cv2
-from deepface import DeepFace
 import gradio as gr
-# Load BLIP model
-# The processor and the captioning model are both loaded from the same
-# "Salesforce/blip-image-captioning-base" checkpoint, once at import time.
-processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-# Clothing extractor
-def extract_clothing(text):
-    """Find known clothing vocabulary in a piece of text.
-
-    Each vocabulary list is matched against the lower-cased ``text`` by
-    substring search.
-
-    Args:
-        text: The text to scan (``analyze_image`` passes the BLIP caption).
-
-    Returns:
-        A ``(found_colors, found_patterns, found_items)`` tuple of lists; each
-        list keeps the order of its vocabulary list.
-    """
-    colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'brown', 'gray', 'pink', 'orange']
-    patterns = ['striped', 'checkered', 'plaid', 'polka-dot', 'solid', 'patterned', 'floral']
-    items = ['jacket', 'coat', 'dress', 'shirt', 't-shirt', 'jeans', 'pants', 'shorts',
-             'suit', 'sneakers', 'hat', 'scarf', 'uniform']
-    # NOTE(review): `in` on a string is a substring test, not a word match, so
-    # "shirt" is also reported for "t-shirt" and "hat" for a word like "that".
-    found_colors = [c for c in colors if c in text.lower()]
-    found_patterns = [p for p in patterns if p in text.lower()]
-    found_items = [i for i in items if i in text.lower()]
-    return found_colors, found_patterns, found_items
-# Main function
-def analyze_image(image_pil):
-    """Build a numbered, multi-sentence text description of an image.
-
-    Captions the image with the module-level BLIP ``processor``/``model``,
-    detects and analyses faces with DeepFace, scans the caption for clothing
-    words via ``extract_clothing`` and assembles the findings into sentences.
-
-    Args:
-        image_pil: Input image; must provide ``convert("RGB")`` (the Gradio
-            interface in this file is configured with ``type="pil"``).
-
-    Returns:
-        A newline-joined string in which every sentence is prefixed "N. ".
-    """
-    image_pil = image_pil.convert("RGB")
-    image_np = np.array(image_pil)
-    # Caption generation
-    inputs = processor(image_pil, return_tensors="pt")
-    out = model.generate(**inputs)
-    caption = processor.decode(out[0], skip_special_tokens=True)
-    # Convert to BGR for DeepFace
-    image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
-    # Face detection using DeepFace with RetinaFace backend
-    # Best effort: any detection error is printed and treated as "no faces".
-    try:
-        faces = DeepFace.extract_faces(img_path=image_bgr, detector_backend="retinaface", enforce_detection=False)
-        print(f"DeepFace detected {len(faces)} face(s)")
-    except Exception as e:
-        print("DeepFace error:", e)
-        faces = []
-    face_infos = []
-    for face_data in faces:
-        face_crop = face_data["face"]
-        try:
-            analysis = DeepFace.analyze(face_crop, actions=['age', 'gender', 'emotion'], enforce_detection=False)
-            age = analysis[0]['age']
-            # NOTE(review): this value is later used as a key into
-            # gender_counts ({"Man", "Woman"}); assumes it is one of those two
-            # strings - confirm against the installed DeepFace version.
-            gender = analysis[0]['gender']
-            emotion = analysis[0]['dominant_emotion']
-            # Bucket the estimated age: <13 child, <20 teen, <60 adult, else senior.
-            if age < 13:
-                age_group = "child"
-            elif age < 20:
-                age_group = "teen"
-            elif age < 60:
-                age_group = "adult"
-            else:
-                age_group = "senior"
-            face_infos.append({
-                "age": age,
-                "gender": gender,
-                "age_group": age_group,
-                "emotion": emotion
-            })
-        # NOTE(review): a face whose analysis fails is dropped silently, so
-        # num_faces below can be smaller than the count printed above.
-        except Exception:
-            continue
-    # Summary stats
-    num_faces = len(face_infos)
-    gender_counts = {"Man": 0, "Woman": 0}
-    age_summary = {}
-    emotion_summary = {}
-    for face in face_infos:
-        gender = face['gender']
-        age_group = face['age_group']
-        emotion = face['emotion']
-        # Raises if `gender` is not exactly "Man" or "Woman" (see note above).
-        gender_counts[gender] += 1
-        age_summary[age_group] = age_summary.get(age_group, 0) + 1
-        emotion_summary[emotion] = emotion_summary.get(emotion, 0) + 1
-    # Clothing info from caption
-    colors, patterns, items = extract_clothing(caption)
-    # Generate 15 sentences
-    # NOTE(review): the code below actually appends 14 sentences when at least
-    # one face was analysed and 8 otherwise, not 15.
-    sentences = []
-    sentences.append(f"According to the BLIP model, the scene can be described as: \"{caption}\".")
-    sentences.append(f"The image contains {num_faces} visible face(s) detected using DeepFace (RetinaFace backend).")
-    gender_desc = []
-    if gender_counts["Man"] > 0:
-        gender_desc.append(f"{gender_counts['Man']} male(s)")
-    if gender_counts["Woman"] > 0:
-        gender_desc.append(f"{gender_counts['Woman']} female(s)")
-    if gender_desc:
-        sentences.append("Gender distribution shows " + " and ".join(gender_desc) + ".")
-    else:
-        sentences.append("Gender analysis was inconclusive.")
-    if age_summary:
-        age_list = [f"{count} {group}(s)" for group, count in age_summary.items()]
-        sentences.append("Age groups represented include " + ", ".join(age_list) + ".")
-    else:
-        sentences.append("No conclusive age groupings found.")
-    if emotion_summary:
-        emo_list = [f"{count} showing {emo}" for emo, count in emotion_summary.items()]
-        sentences.append("Facial expressions include " + ", ".join(emo_list) + ".")
-    else:
-        sentences.append("Emotion detection yielded limited results.")
-    if colors or patterns or items:
-        cloth_parts = []
-        if colors:
-            cloth_parts.append(f"colors like {', '.join(colors)}")
-        if patterns:
-            cloth_parts.append(f"patterns such as {', '.join(patterns)}")
-        if items:
-            cloth_parts.append(f"items like {', '.join(items)}")
-        sentences.append("The clothing observed includes " + " and ".join(cloth_parts) + ".")
-    else:
-        sentences.append("Clothing details were not clearly identified.")
-    # The sentences in this branch are fixed text: they are emitted whenever a
-    # face was analysed and are not derived from any measurement of the image.
-    if num_faces > 0:
-        sentences.append("Faces are distributed naturally across the image.")
-        sentences.append("Differences in face size suggest variation in distance from the camera.")
-        sentences.append("Hairstyles appear diverse, from short to tied-back styles.")
-        sentences.append("Lighting emphasizes certain facial features and expressions.")
-        sentences.append("Some individuals face the camera while others look away.")
-        sentences.append("Mood diversity is reflected in the variety of facial expressions.")
-        sentences.append("The clothing style appears casual or semi-formal.")
-    else:
-        sentences.append("No visible faces were found to analyze further visual characteristics.")
-    sentences.append("Overall, the image integrates facial, emotional, and clothing features into a cohesive scene.")
-    # Number the sentences starting at 1 and put one on each line.
-    return "\n".join([f"{i+1}. {s}" for i, s in enumerate(sentences)])
-# Gradio Interface
-# Single-function UI: a PIL image goes in, analyze_image's numbered text
-# comes out in a textbox.
-demo = gr.Interface(
-    fn=analyze_image,
-    inputs=gr.Image(type="pil"),
-    outputs=gr.Textbox(label="📝 15-Sentence Detailed Description"),
-    title="🖼️ Image Analysis with BLIP + DeepFace",
-    description ="Upload an image to get a detailed 15-sentence description of facial features, age, gender, clothing, and more."
-)
-# Starts the web app as soon as the module is executed.
-demo.launch()

 import gradio as gr
+import os
+import subprocess
+import sys
+import torch
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForCausalLM
+# Best-effort install of flash-attn at startup (failures are not fatal).
+# FLASH_ATTENTION_SKIP_CUDA_BUILD is layered on top of the inherited
+# environment: a bare dict passed as `env` would replace the child's whole
+# environment (PATH, HOME, proxy settings, ...). pip is run through the
+# current interpreter with an argument list, so no shell is involved and the
+# package is installed for the Python that is about to import it.
+subprocess.run(
+    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
+    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+    check=False,
+)
+# Use the GPU when torch can see one, otherwise fall back to the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# trust_remote_code=True allows transformers to run the model repository's
+# own Python code when loading the model and the processor.
+florence_model = AutoModelForCausalLM.from_pretrained(
+    'microsoft/Florence-2-base', trust_remote_code=True
+).to(device).eval()
+florence_processor = AutoProcessor.from_pretrained(
+    'microsoft/Florence-2-base', trust_remote_code=True
+)
+def generate_caption(image):
+    """Return a detailed caption for *image* produced by the Florence model.
+
+    Args:
+        image: A ``PIL.Image.Image`` or any array accepted by
+            ``Image.fromarray``.
+
+    Returns:
+        The text stored under the ``<MORE_DETAILED_CAPTION>`` key of the
+        processor's post-processed output.
+
+    Raises:
+        gr.Error: If no image was supplied.
+    """
+    task = "<MORE_DETAILED_CAPTION>"
+    # Submitting the form without an image delivers None; report that to the
+    # user instead of failing inside Image.fromarray.
+    if image is None:
+        raise gr.Error("Please upload an image first.")
+    if not isinstance(image, Image.Image):
+        image = Image.fromarray(image)
+    # Normalise grayscale / palette / alpha images to 3-channel RGB.
+    if image.mode != "RGB":
+        image = image.convert("RGB")
+    inputs = florence_processor(text=task, images=image, return_tensors="pt").to(device)
+    # Captioning is inference only, so no autograd graph is needed.
+    with torch.no_grad():
+        generated_ids = florence_model.generate(
+            input_ids=inputs["input_ids"],
+            pixel_values=inputs["pixel_values"],
+            max_new_tokens=1024,
+            early_stopping=False,
+            do_sample=False,
+            num_beams=3,
+        )
+    # Special tokens are kept in the decoded text that is handed to
+    # post_process_generation.
+    generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+    parsed_answer = florence_processor.post_process_generation(
+        generated_text,
+        task=task,
+        image_size=(image.width, image.height),
+    )
+    caption = parsed_answer[task]
+    print("\n\nGeneration completed!:" + caption)
+    return caption
+# Gradio UI: one image input wired to generate_caption, one three-line
+# textbox with a copy button for the resulting text.
+io = gr.Interface(
+    fn=generate_caption,
+    inputs=[gr.Image(label="Input Image")],
+    outputs=[
+        gr.Textbox(label="Output Prompt", lines=3, show_copy_button=True),
+    ],
+    theme="Yntec/HaleyCH_Theme_Orange",
+    description="⚠ Sorry for the inconvenience. The space are currently running on the CPU, which might affect performance. We appreciate your understanding.",
+)
+# Start the app with Gradio's debug flag switched on.
+io.launch(debug=True)