primerz committed
Commit eb833d8 · verified · 1 Parent(s): e456dc4

Update utils.py

Files changed (1): utils.py +14 -21
utils.py CHANGED
@@ -1,12 +1,15 @@
-import cv2
-import numpy as np
 from PIL import Image
 from transformers import BlipProcessor, BlipForConditionalGeneration
 import torch
 from config import Config
 
+# Simple global caching for the captioner
+captioner_processor = None
+captioner_model = None
+
 def resize_image_to_1mp(image):
     """Resizes image to approx 1MP (e.g., 1024x1024) preserving aspect ratio."""
+    image = image.convert("RGB")
     w, h = image.size
     target_pixels = 1024 * 1024
     aspect_ratio = w / h
@@ -15,35 +18,25 @@ def resize_image_to_1mp(image):
     new_h = int((target_pixels / aspect_ratio) ** 0.5)
     new_w = int(new_h * aspect_ratio)
 
-    # Ensure divisibility by 8 (vae requirement), usually 32 for safety
+    # Ensure divisibility by 32 for efficiency
     new_w = (new_w // 32) * 32
     new_h = (new_h // 32) * 32
 
+    if new_w == 0 or new_h == 0:
+        new_w, new_h = 1024, 1024  # Fallback
+
     return image.resize((new_w, new_h), Image.LANCZOS)
 
-# Simple caching for captioner
-captioner_processor = None
-captioner_model = None
-
 def get_caption(image):
+    """Generates a caption for the image if one isn't provided."""
     global captioner_processor, captioner_model
 
     if captioner_model is None:
-        print("Loading Captioner...")
-        captioner_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-        captioner_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(Config.DEVICE)
+        print("Loading Captioner (BLIP)...")
+        captioner_processor = BlipProcessor.from_pretrained(Config.CAPTIONER_REPO)
+        captioner_model = BlipForConditionalGeneration.from_pretrained(Config.CAPTIONER_REPO).to(Config.DEVICE)
 
     inputs = captioner_processor(image, return_tensors="pt").to(Config.DEVICE)
     out = captioner_model.generate(**inputs)
     caption = captioner_processor.decode(out[0], skip_special_tokens=True)
-    return caption
-
-def prepare_control_images(image, zoe_detector, lineart_detector):
-    """Generates the conditioning maps from the input image."""
-    # 1. Zoe Depth Map
-    depth_map = zoe_detector(image, detect_resolution=1024, image_resolution=1024)
-
-    # 2. LineArt Map
-    lineart_map = lineart_detector(image, detect_resolution=1024, image_resolution=1024)
-
-    return depth_map, lineart_map
+    return caption
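
For reference, below is a minimal sketch of the Config entries the updated utils.py depends on, plus a usage example. This is not part of the commit: utils.py only references Config.DEVICE and Config.CAPTIONER_REPO, and the repo id shown here is an assumption that mirrors the hard-coded "Salesforce/blip-image-captioning-base" string this commit removes, so it presumably points at the same model.

# config.py (hypothetical sketch)
import torch

class Config:
    # Device selection is a guess at a typical setup
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    # Assumed to match the hard-coded repo id removed by this commit
    CAPTIONER_REPO = "Salesforce/blip-image-captioning-base"

# Example usage of the updated helpers
from PIL import Image
from utils import resize_image_to_1mp, get_caption

img = Image.open("photo.png")   # any mode; convert("RGB") now normalizes RGBA/L inputs
img = resize_image_to_1mp(img)  # ~1MP output, both sides rounded down to multiples of 32
print(get_caption(img))         # first call lazily loads BLIP; later calls reuse the cached model

Rounding both sides to multiples of 32 comfortably satisfies the divisible-by-8 VAE requirement the old comment mentioned, and the new zero-size fallback guards against extreme aspect ratios collapsing a dimension to 0.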