Spaces:

ayajoharji
/

Color_PaletteExtraction_and_ImageCaptioning

Sleeping

App Files Files Community

ayajoharji committed on Sep 30, 2024

Commit

f148e92

verified ·

1 Parent(s): 3951de9

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -69

app.py CHANGED Viewed

@@ -13,26 +13,37 @@ from PIL import Image, ImageDraw
 import requests
 from io import BytesIO
 # Download example images
 def download_example_images():
     # Removed ("-") side of the diff: downloads each URL below, saves it in the
     # current directory as example<idx>.jpg, and returns a list of one-element
     # lists holding the saved file names.
     image_urls = [
         # URL format: ("Image Description", "Image URL")
-        ("Sunset over Mountains", "https://images.unsplash.com/photo-1501785888041-af3ef285b470"),
-        ("Forest Path", "https://images.unsplash.com/photo-1502082553048-f009c37129b9"),
-        ("City Skyline", "https://images.unsplash.com/photo-1498598453737-8913e843c47b"),
-        ("Beach and Ocean", "https://images.unsplash.com/photo-1507525428034-b723cf961d3e"),
-        ("Desert Dunes", "https://images.unsplash.com/photo-1501594907352-04cda38ebc29"),
     ]
     example_images = []
     # idx starts at 1 so the files are example1.jpg, example2.jpg, ...;
     # description is unpacked but never used in the loop body.
     for idx, (description, url) in enumerate(image_urls, start=1):
         # There is no try/except here, so any exception raised by the request,
         # the image decode, or the save propagates to the caller.
-        response = requests.get(url)
-        if response.status_code == 200:
-            img = Image.open(BytesIO(response.content))
-            img.save(f'example{idx}.jpg')
-            example_images.append([f'example{idx}.jpg'])
-        else:
-            print(f"Failed to download image from {url}")
     return example_images
 # Download example images and prepare examples list
@@ -44,7 +55,7 @@ def load_image(image):
     image_np = np.array(image.convert('RGB'))
     # Resize the image for better processing
-    resized_image = image.resize((300, 300), resample=Image.LANCZOS)
     resized_image_np = np.array(resized_image)
     return resized_image_np
@@ -58,7 +69,7 @@ def extract_colors(image, k=8):
     # Ensure data type is float64
     pixels = pixels.astype(np.float64)
     # Apply K-means clustering to find dominant colors
-    kmeans = KMeans(n_clusters=k, random_state=0, n_init=10, max_iter=300)
     kmeans.fit(pixels)
     # Convert normalized colors back to 0-255 scale
     colors = (kmeans.cluster_centers_ * 255).astype(int)
@@ -67,15 +78,15 @@ def extract_colors(image, k=8):
 # Create an Image for the Color Palette
 def create_palette_image(colors):
     # Removed ("-") side of the diff: draws one 100-pixel-wide, 100-pixel-high
     # swatch per entry in `colors`, left to right, and returns the PIL image.
     num_colors = len(colors)
-    palette_height = 100
-    palette_width = 100 * num_colors
     palette_image = Image.new("RGB", (palette_width, palette_height))
     draw = ImageDraw.Draw(palette_image)
     for i, color in enumerate(colors):
         # Ensure color values are within the valid range and integers
         color = tuple(np.clip(color, 0, 255).astype(int))
         # Swatch i spans x = i * 100 .. (i + 1) * 100 over the full height.
-        draw.rectangle([i * 100, 0, (i + 1) * 100, palette_height], fill=color)
     return palette_image
@@ -91,68 +102,58 @@ def display_palette(colors):
 # Generate Image Caption Using Hugging Face BLIP
 def generate_caption(image):
     # Removed ("-") side of the diff: returns a decoded caption string for
     # `image`. The processor and model are stored as attributes on the function
     # object itself, so from_pretrained only runs on the first call.
-    # Load models only once
-    if 'processor' not in generate_caption.__dict__:
-        generate_caption.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-        generate_caption.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-    processor = generate_caption.processor
-    model = generate_caption.model
-    inputs = processor(images=image, return_tensors="pt")
-    output = model.generate(**inputs)
     # Only the first generated sequence (output[0]) is decoded.
-    caption = processor.decode(output[0], skip_special_tokens=True)
     return caption
 # Translate Caption to Arabic Using mBART
 def translate_to_arabic(text):
     # Removed ("-") side of the diff: translates `text` with the mBART-50
     # many-to-many checkpoint and returns the first decoded string. Tokenizer
     # and model are cached as attributes on the function object after the
     # first call.
-    # Load models only once
-    if 'tokenizer' not in translate_to_arabic.__dict__:
-        translate_to_arabic.tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
-        translate_to_arabic.model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
-    tokenizer = translate_to_arabic.tokenizer
-    model = translate_to_arabic.model
     # Source language is set to "en_XX" on the cached tokenizer before encoding.
-    tokenizer.src_lang = "en_XX"
-    encoded = tokenizer(text, return_tensors="pt")
     # forced_bos_token_id is set to the id of the "ar_AR" language code;
     # presumably this is what selects Arabic as the output language — confirm
     # against the mBART-50 documentation.
-    generated_tokens = model.generate(
         **encoded,
-        forced_bos_token_id=tokenizer.lang_code_to_id["ar_AR"]
     )
-    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
     return translated_text
 # Gradio Interface Function (Combining Elements)
 def process_image(image):
     # Removed ("-") side of the diff: runs the whole pipeline for one input
     # image and returns a 4-tuple of (bilingual caption string, comma-joined
     # palette string, palette PIL image, resized PIL image). There is no error
     # handling in this version, so any exception propagates to the caller.
-    # Ensure input is a PIL Image
-    if isinstance(image, np.ndarray):
-        image = Image.fromarray(image)
-    # Convert to RGB format for PIL processing
-    image_rgb = image.convert("RGB")
-    # Load and resize the entire image
-    resized_image_np = load_image(image_rgb)
-    # Convert resized image to PIL Image for Gradio output
-    resized_image_pil = Image.fromarray(resized_image_np)
     # The caption is generated from the full-size RGB image, not the resized
     # array used for color extraction.
-    # Generate caption using BLIP model
-    caption = generate_caption(image_rgb)
-    # Translate caption to Arabic
-    caption_arabic = translate_to_arabic(caption)
-    # Extract dominant colors from the entire image
-    colors = extract_colors(resized_image_np, k=8)
     # display_palette is defined outside the visible hunks; its result is
     # joined with ", " below, so it presumably yields strings — confirm.
-    color_palette = display_palette(colors)
-    # Create palette image
-    palette_image = create_palette_image(colors)
-    # Combine English and Arabic captions
-    bilingual_caption = f"English: {caption}\nArabic: {caption_arabic}"
-    return bilingual_caption, ", ".join(color_palette), palette_image, resized_image_pil
 # Create Gradio Interface using Blocks and add a submit button
 with gr.Blocks(css=".gradio-container { height: 1000px !important; }") as demo:

 import requests
 from io import BytesIO
+# Load models globally at startup
 # These are module-level statements, so all four from_pretrained calls run
 # once when the module is imported. generate_caption uses blip_processor and
 # blip_model; translate_to_arabic uses mbart_tokenizer and mbart_model.
+print("Loading models...")
+blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+mbart_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
+mbart_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
+print("Models loaded successfully.")
 # Download example images
 def download_example_images():
     # Downloads each URL below, saves it in the current directory as
     # example<idx>.jpg, and returns a list of one-element lists holding the
     # saved file names. A failed download is printed and skipped, so the
     # result can contain fewer entries than image_urls.
     image_urls = [
         # URL format: ("Image Description", "Image URL")
+        ("Sunset over Mountains", "https://images.unsplash.com/photo-1501785888041-af3ef285b470?w=512"),
+        ("Forest Path", "https://images.unsplash.com/photo-1502082553048-f009c37129b9?w=512"),
+        ("City Skyline", "https://images.unsplash.com/photo-1498598453737-8913e843c47b?w=512"),
+        ("Beach and Ocean", "https://images.unsplash.com/photo-1507525428034-b723cf961d3e?w=512"),
+        ("Desert Dunes", "https://images.unsplash.com/photo-1501594907352-04cda38ebc29?w=512"),
     ]
     example_images = []
     # idx starts at 1 so the files are example1.jpg, example2.jpg, ...;
     # description is unpacked but never used in the loop body.
     for idx, (description, url) in enumerate(image_urls, start=1):
         # The try covers the request, the decode and the save, so any failure
         # for one URL is reported and the loop continues with the next one.
         # NOTE(review): requests.get is called without a timeout; a stalled
         # connection could block the caller indefinitely — consider passing
         # timeout=... and confirm against the requests documentation.
+        try:
+            response = requests.get(url)
+            if response.status_code == 200:
+                img = Image.open(BytesIO(response.content))
+                img.save(f'example{idx}.jpg')
+                example_images.append([f'example{idx}.jpg'])
+            else:
+                print(f"Failed to download image from {url}")
+        except Exception as e:
+            print(f"Exception occurred while downloading image: {e}")
     return example_images
 # Download example images and prepare examples list
     image_np = np.array(image.convert('RGB'))
     # Resize the image for better processing
+    resized_image = image.resize((224, 224), resample=Image.LANCZOS)
     resized_image_np = np.array(resized_image)
     return resized_image_np
     # Ensure data type is float64
     pixels = pixels.astype(np.float64)
     # Apply K-means clustering to find dominant colors
+    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10, max_iter=300)
     kmeans.fit(pixels)
     # Convert normalized colors back to 0-255 scale
     colors = (kmeans.cluster_centers_ * 255).astype(int)
 # Create an Image for the Color Palette
 def create_palette_image(colors):
     # Draws one 50-pixel-wide, 50-pixel-high swatch per entry in `colors`,
     # left to right, and returns the resulting RGB PIL image.
     num_colors = len(colors)
+    palette_height = 50
+    palette_width = 50 * num_colors
     palette_image = Image.new("RGB", (palette_width, palette_height))
     draw = ImageDraw.Draw(palette_image)
     for i, color in enumerate(colors):
         # Ensure color values are within the valid range and integers
         color = tuple(np.clip(color, 0, 255).astype(int))
         # Swatch i spans x = i * 50 .. (i + 1) * 50 over the full height.
         # NOTE(review): consecutive swatches share the x = (i + 1) * 50 column;
         # if Pillow treats the box as inclusive of both corners, each swatch
         # overwrites the last column of the previous one — confirm.
+        draw.rectangle([i * 50, 0, (i + 1) * 50, palette_height], fill=color)
     return palette_image
 # Generate Image Caption Using Hugging Face BLIP
 def generate_caption(image):
     # Returns a decoded caption string for `image`, using the module-level
     # blip_processor and blip_model rather than loading anything here.
     # Only the first generated sequence (output[0]) is decoded.
+    inputs = blip_processor(images=image, return_tensors="pt")
+    output = blip_model.generate(**inputs)
+    caption = blip_processor.decode(output[0], skip_special_tokens=True)
     return caption
 # Translate Caption to Arabic Using mBART
 def translate_to_arabic(text):
     # Translates `text` using the module-level mbart_tokenizer and mbart_model
     # and returns the first decoded string.
     # The assignment below mutates the shared tokenizer object: its src_lang
     # is set to "en_XX" on every call.
+    mbart_tokenizer.src_lang = "en_XX"
+    encoded = mbart_tokenizer(text, return_tensors="pt")
     # forced_bos_token_id is set to the id of the "ar_AR" language code;
     # presumably this is what selects Arabic as the output language — confirm
     # against the mBART-50 documentation.
+    generated_tokens = mbart_model.generate(
         **encoded,
+        forced_bos_token_id=mbart_tokenizer.lang_code_to_id["ar_AR"]
     )
+    translated_text = mbart_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
     return translated_text
 # Gradio Interface Function (Combining Elements)
 def process_image(image):
     # Runs the whole pipeline for one input image. On success returns a
     # 4-tuple of (bilingual caption string, comma-joined palette string,
     # palette PIL image, resized PIL image). On any exception it prints the
     # error and returns ("An error occurred during processing.", "", None, None).
     # A non-ndarray input without a .convert method (for example None) raises
     # inside the try and therefore takes the error path.
     # NOTE(review): the broad except hides the traceback; only str(e) is
     # printed — consider logging the full exception.
+    try:
+        # Ensure input is a PIL Image
+        if isinstance(image, np.ndarray):
+            image = Image.fromarray(image)
+        # Convert to RGB format for PIL processing
+        image_rgb = image.convert("RGB")
+        # Load and resize the entire image
+        resized_image_np = load_image(image_rgb)
+        # Convert resized image to PIL Image for Gradio output
+        resized_image_pil = Image.fromarray(resized_image_np)
         # The caption is generated from the full-size RGB image, not the
         # resized array used for color extraction.
+        # Generate caption using BLIP model
+        caption = generate_caption(image_rgb)
+        # Translate caption to Arabic
+        caption_arabic = translate_to_arabic(caption)
+        # Extract dominant colors from the entire image
+        colors = extract_colors(resized_image_np, k=8)
         # display_palette is defined outside the visible hunks; its result is
         # joined with ", " below, so it presumably yields strings — confirm.
+        color_palette = display_palette(colors)
+        # Create palette image
+        palette_image = create_palette_image(colors)
+        # Combine English and Arabic captions
+        bilingual_caption = f"English: {caption}\nArabic: {caption_arabic}"
+        return bilingual_caption, ", ".join(color_palette), palette_image, resized_image_pil
+    except Exception as e:
+        print(f"Error during processing: {e}")
+        return "An error occurred during processing.", "", None, None
 # Create Gradio Interface using Blocks and add a submit button
 with gr.Blocks(css=".gradio-container { height: 1000px !important; }") as demo: