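"""Caption an input image with BLIP, then use the caption to drive two
Stable Diffusion pipelines: an image-variation pass on the input image
and a text-to-image room-redesign prompt (openjourney). Runs on CPU."""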
import os

import torch
from PIL import Image
from torchvision import transforms
from transformers import BlipProcessor, BlipForConditionalGeneration
from diffusers import (
    StableDiffusionPipeline,
    StableDiffusionImageVariationPipeline,
)

def generate_image_caption(image_path):
    # Image-variation pipeline (Stable Diffusion conditioned on CLIP
    # image embeddings).
    device = torch.device("cpu")
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # only takes effect on CUDA
    sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained(
        "lambdalabs/sd-image-variations-diffusers", revision="v2.0"
    )
    sd_pipe = sd_pipe.to(device)
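    # Everything below is pinned to CPU. A minimal sketch for using a GPU
    # when one is available would swap in:
    #   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")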
    # Preprocessing for the image-variation pipeline: 224x224 bicubic
    # resize, then map pixel values from [0, 1] to [-1, 1].
    img_transforms = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Resize(
                (224, 224),
                interpolation=transforms.InterpolationMode.BICUBIC,
                antialias=False,
            ),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
        ]
    )
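    # NOTE: the upstream model card's example reportedly normalizes with the
    # CLIP image-encoder statistics rather than plain 0.5s; the values above
    # follow the original script and may not match the encoder's training
    # preprocessing.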
    # Image-to-image: generate a variation of the input image.
    with Image.open(image_path) as img:
        img_tensor = img_transforms(img.convert("RGB")).to(device).unsqueeze(0)
    out = sd_pipe(img_tensor, guidance_scale=3)
    out["images"][0].save("img1.jpg")
    # BLIP image captioning.
    raw_image = Image.open(image_path).convert("RGB")
    processor = BlipProcessor.from_pretrained(
        "Salesforce/blip-image-captioning-large"
    )
    model = BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-large"
    ).to(device)

    # Conditional image captioning: completes the given text prefix.
    # (Unused below; kept from the BLIP model card example.)
    text = "a photography of"
    inputs = processor(raw_image, text, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    conditional_caption = processor.decode(out[0], skip_special_tokens=True)

    # Unconditional image captioning; this caption drives the prompt below.
    inputs = processor(raw_image, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
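    # Both generate() calls above use default generation settings; standard
    # kwargs such as max_new_tokens or num_beams could be passed instead, e.g.
    #   out = model.generate(**inputs, max_new_tokens=40, num_beams=3)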
    # Text-to-image: build a redesign prompt from the BLIP caption.
    model_id = "prompthero/openjourney"
    pipe = StableDiffusionPipeline.from_pretrained(
        model_id, torch_dtype=torch.float32
    )
    pipe = pipe.to(device)

    room = "Living Room"
    ai_intervention = "High"
    mode = "Redesign"
    design = "Modern"
    prompt = (
        f"Give me a realistic and complete image of {caption} "
        f"which room type: {room}, AI Intervention: {ai_intervention}, "
        f"Mode: {mode} and Design style: {design}"
    )
    image = pipe(prompt).images[0]
    image.save("result3.jpg")
    return caption
generate_image_caption("C:\Master\First.jpg")