Prashanthsrn committed on
Commit
818bff5
·
verified ·
1 Parent(s): 502f248

Update image_to_text.py

Browse files
Files changed (1) hide show
  1. image_to_text.py +12 -4
image_to_text.py CHANGED
@@ -1,11 +1,16 @@
1
  from PIL import Image
2
  import torch
 
3
  from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
4
 
5
  def preprocess_image(image, target_size=(224, 224)):
6
  # Resize the image to a fixed size
7
  image = image.resize(target_size, Image.LANCZOS)
8
- return image
 
 
 
 
9
 
10
  def generate_caption(image):
11
  # Load pre-trained model and tokenizer
@@ -21,10 +26,13 @@ def generate_caption(image):
21
  image = Image.open(image)
22
 
23
  image = image.convert('RGB')
24
- image = preprocess_image(image)
25
 
26
- # Convert image to pixel values
27
- pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
 
 
 
28
  pixel_values = pixel_values.to(device)
29
 
30
  # Generate caption
 
1
  from PIL import Image
2
  import torch
3
+ import numpy as np
4
  from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
5
 
6
  def preprocess_image(image, target_size=(224, 224)):
7
  # Resize the image to a fixed size
8
  image = image.resize(target_size, Image.LANCZOS)
9
+ # Convert to numpy array and normalize
10
+ image_array = np.array(image) / 255.0
11
+ # Transpose to (channels, height, width) format
12
+ image_array = np.transpose(image_array, (2, 0, 1))
13
+ return image_array
14
 
15
  def generate_caption(image):
16
  # Load pre-trained model and tokenizer
 
26
  image = Image.open(image)
27
 
28
  image = image.convert('RGB')
29
+ image_array = preprocess_image(image)
30
 
31
+ # Create a batch with a single image
32
+ batch = np.expand_dims(image_array, axis=0)
33
+
34
+ # Convert to tensor
35
+ pixel_values = torch.tensor(batch).float()
36
  pixel_values = pixel_values.to(device)
37
 
38
  # Generate caption