Spaces:

ProfRom
/

TestSpace3

Sleeping

App Files Files Community

ProfRom commited on 22 days ago

Commit

02b1c71

verified ·

1 Parent(s): 7004d4e

Hess - Unit 8 Assignment

Browse files

Files changed (2) hide show

app.py +192 -74
requirements.txt +6 -4

app.py CHANGED Viewed

@@ -1,89 +1,207 @@
-import torch
-from diffusers import StableDiffusionPipeline
 import gradio as gr
-# -------------------------------------------------------
-# 1. LOAD PRETRAINED TEXT-TO-IMAGE MODEL
-# -------------------------------------------------------
-model_id = "runwayml/stable-diffusion-v1-5"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 dtype = torch.float16 if device == "cuda" else torch.float32
-pipe = StableDiffusionPipeline.from_pretrained(
-    model_id,
     torch_dtype=dtype,
-    safety_checker=None,
-    use_safetensors=True
 )
-pipe = pipe.to(device)
-# -------------------------------------------------------
-# 2. CORE PREDICTION FUNCTION
-# -------------------------------------------------------
-def generate_image(prompt: str,
-                   num_inference_steps: int = 25,
-                   guidance_scale: float = 7.5):
-    if not prompt or prompt.strip() == "":
-        prompt = "A friendly robot reading a book in a cozy library, digital art"
-    if device == "cuda":
-        with torch.autocast(device_type="cuda"):
-            result = pipe(
-                prompt,
-                num_inference_steps=num_inference_steps,
-                guidance_scale=guidance_scale
-            )
-    else:
-        result = pipe(
-            prompt,
-            num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale
         )
-    return result.images[0]
-# -------------------------------------------------------
-# 3. GRADIO UI
-# -------------------------------------------------------
-prompt_input = gr.Textbox(
-    label="Enter your image prompt",
-    lines=2,
-    placeholder="e.g., 'A watercolor painting of a sunrise over mountains'"
-)
-steps_slider = gr.Slider(
-    minimum=10,
-    maximum=40,
-    value=25,
-    step=1,
-    label="Number of inference steps"
-)
-guidance_slider = gr.Slider(
-    minimum=1.0,
-    maximum=15.0,
-    value=7.5,
-    step=0.5,
-    label="Guidance scale"
-)
-image_output = gr.Image(label="Generated image")
-demo = gr.Interface(
-    fn=generate_image,
-    inputs=[prompt_input, steps_slider, guidance_slider],
-    outputs=image_output,
-    title="Multimodal Text-to-Image Generator",
-    description="Enter a prompt to generate an image using a pretrained text-to-image model."
-)
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+import torch
+from diffusers import AutoPipelineForText2Image
+from transformers import BlipProcessor, BlipForConditionalGeneration
+from PIL import Image
+"""
+Multimodal Space for Assignment 8:
+- Text to Image using stabilityai/sd-turbo
+- Image to Text (captioning) using Salesforce BLIP
+"""
+# model configuration: Hugging Face Hub ids of the two pretrained models loaded below
+TEXT_TO_IMAGE_MODEL_ID = "stabilityai/sd-turbo"
+CAPTION_MODEL_ID = "Salesforce/blip-image-captioning-base"
+# device and dtype setup: use CUDA when available, otherwise fall back to CPU
 device = "cuda" if torch.cuda.is_available() else "cpu"
 dtype = torch.float16 if device == "cuda" else torch.float32  # half precision only on CUDA
+# load text to image pipeline (SD-Turbo) once at import time, with the dtype chosen above
+text2img_pipe = AutoPipelineForText2Image.from_pretrained(
+    TEXT_TO_IMAGE_MODEL_ID,
     torch_dtype=dtype,
 )
+text2img_pipe = text2img_pipe.to(device)
+# memory optimization: attention slicing is enabled only if this pipeline object exposes it
+if hasattr(text2img_pipe, "enable_attention_slicing"):
+    text2img_pipe.enable_attention_slicing()
+# load image captioning model (BLIP): processor plus generation model, model moved to `device`
+caption_processor = BlipProcessor.from_pretrained(CAPTION_MODEL_ID)
+caption_model = BlipForConditionalGeneration.from_pretrained(
+    CAPTION_MODEL_ID
+).to(device)  # no dtype argument is passed here, unlike the text-to-image pipeline
+# text to image generation function
+def generate_image(prompt, steps, guidance, seed):
+    """
+    Generate an image from a text prompt using the SD-Turbo pipeline.
+
+    Args:
+        prompt: Text description of the image. When empty or whitespace
+            only, a built-in default prompt is used instead.
+        steps: Number of inference steps; converted with int().
+        guidance: Guidance scale; converted with float().
+        seed: Optional seed (text or number). A blank, non-integer, or
+            out-of-range value is ignored and the run is left unseeded.
+
+    Returns:
+        The first image returned by the text-to-image pipeline.
+    """
+    if not prompt or not prompt.strip():
+        prompt = "a watercolor painting of a quiet cabin in the forest at sunrise"
+    # optional seeding for reproducibility
+    generator = None
+    if seed is not None and str(seed).strip() != "":
+        try:
+            seed_int = int(seed)
+            generator = torch.Generator(device=device).manual_seed(seed_int)
+        except (ValueError, OverflowError, RuntimeError):
+            # The seed comes from a free-text box, so treat every bad value
+            # the same way and fall back to an unseeded (random) run:
+            #   ValueError    - text that is not an integer
+            #   OverflowError - non-finite numeric input passed to int()
+            #   RuntimeError  - integer outside the range manual_seed accepts
+            generator = None
+    # the UI exposes a slider for steps even though SD-Turbo is noted to be
+    # designed for a low number of steps (1–4); slider values may arrive as
+    # floats, so normalize both numeric inputs before calling the pipeline
+    steps = int(steps)
+    guidance = float(guidance)
+    result = text2img_pipe(
+        prompt=prompt,
+        num_inference_steps=steps,
+        guidance_scale=guidance,
+        generator=generator,
+    ).images[0]
+    return result
+# image to text captioning function
+def caption_image(image, max_length, num_beams):
+    """
+    Produce a text caption for an uploaded image with the BLIP model.
+
+    Args:
+        image: A PIL image or an array convertible with Image.fromarray;
+            None means nothing was uploaded.
+        max_length: Maximum caption length passed to generation (cast to int).
+        num_beams: Beam search width passed to generation (cast to int).
+
+    Returns:
+        The decoded caption with surrounding whitespace removed, or a
+        prompt to upload an image when none was given.
+    """
+    # guard clause: nothing to caption yet
+    if image is None:
+        return "Please upload an image first."
+    # accept array input as well as PIL input
+    pil_image = image if isinstance(image, Image.Image) else Image.fromarray(image)
+    encoded = caption_processor(images=pil_image, return_tensors="pt")
+    encoded = encoded.to(device)
+    generation_options = {
+        "max_length": int(max_length),
+        "num_beams": int(num_beams),
+    }
+    token_ids = caption_model.generate(**encoded, **generation_options)
+    # only the first generated sequence is decoded
+    decoded = caption_processor.decode(token_ids[0], skip_special_tokens=True)
+    return decoded.strip()
+# build Gradio UI: one Blocks app with a Markdown header and two tabs,
+# "Text to Image" (wired to generate_image) and "Image to Text (Captioning)" (wired to caption_image)
+with gr.Blocks() as assignment8:
+    gr.Markdown(
+        """
+        # Multimodal Assignment 8: Text to Image and Image Captioning
+        This Space demonstrates two multimodal capabilities:
+        1. Text to Image generation using Stability AI's SD-Turbo model
+        2. Image to Text captioning using a BLIP image captioning model
+        """
+    )
+    with gr.Tab("Text to Image"):  # tab 1: input controls in the first column, output image in the second
+        with gr.Row():
+            with gr.Column(scale=1):
+                prompt_in = gr.Textbox(
+                    label="Prompt",
+                    lines=5,  # taller so full prompts are visible
+                    placeholder="Describe the image you want the model to generate.",
+                )
+                steps_in = gr.Slider(
+                    minimum=1,
+                    maximum=8,
+                    value=4,
+                    step=1,
+                    label="Number of inference steps",
+                )
+                guidance_in = gr.Slider(
+                    minimum=0.0,
+                    maximum=3.0,
+                    value=0.0,
+                    step=0.1,
+                    label="Guidance scale",
+                )
+                seed_in = gr.Textbox(  # seed is collected as text; generate_image parses it
+                    label="Seed (optional, integer)",
+                    placeholder="Ex. 42. Leave blank for random seed.",
+                )
+                generate_button = gr.Button("Generate Image")
+            with gr.Column(scale=1):
+                image_out = gr.Image(label="Generated Image")
+        gr.Examples(  # each example row is [prompt, steps, guidance, seed], matching `inputs` below
+            examples=[
+                [
+                    "a focused student working at a computer late at night",
+                    4,
+                    3.0,
+                    "",
+                ],
+                [
+                    "a golden retriever playing in a field of flowers",
+                    4,
+                    0.0,
+                    "42",
+                ],
+                [
+                    "a martial artist performing a high kick, dynamic motion",
+                    5,
+                    1.0,
+                    "",
+                ],
+            ],
+            inputs=[prompt_in, steps_in, guidance_in, seed_in],
+            outputs=image_out,
+            fn=generate_image,
+            cache_examples=False,  # caching of example outputs is turned off
         )
+        generate_button.click(  # clicking the button runs generate_image on the four inputs
+            fn=generate_image,
+            inputs=[prompt_in, steps_in, guidance_in, seed_in],
+            outputs=image_out,
+        )
+    with gr.Tab("Image to Text (Captioning)"):  # tab 2: image upload and sliders, caption text output
+        with gr.Row():
+            with gr.Column(scale=1):
+                image_in = gr.Image(
+                    type="pil",  # requests PIL images from Gradio for caption_image
+                    label="Upload an image",
+                )
+                max_length_in = gr.Slider(
+                    minimum=16,
+                    maximum=64,
+                    value=32,
+                    step=4,
+                    label="Max caption length",
+                )
+                num_beams_in = gr.Slider(
+                    minimum=1,
+                    maximum=6,
+                    value=4,
+                    step=1,
+                    label="Beam search width",
+                )
+                caption_button = gr.Button("Generate Caption")
+            with gr.Column(scale=1):
+                caption_out = gr.Textbox(
+                    label="Generated Caption",
+                    lines=4,
+                )
+        caption_button.click(  # clicking the button runs caption_image on the image and both sliders
+            fn=caption_image,
+            inputs=[image_in, max_length_in, num_beams_in],
+            outputs=caption_out,
+        )
 if __name__ == "__main__":  # launch the app only when this file is run as a script
+    assignment8.launch()

requirements.txt CHANGED Viewed

@@ -1,6 +1,8 @@
 gradio>=4.0.0
-diffusers>=0.30.0
-transformers>=4.40.0
-accelerate>=0.30.0
 torch
-safetensors

 gradio>=4.0.0
 torch
+torchvision
+Pillow
+diffusers
+transformers
+accelerate
+safetensors