Prashanthsrn committed on
Commit
ae8437f
·
verified ·
1 Parent(s): 8fecbd2

Update image_to_text.py

Browse files
Files changed (1) hide show
  1. image_to_text.py +12 -6
image_to_text.py CHANGED
@@ -2,6 +2,11 @@ from PIL import Image
2
  import torch
3
  from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
4
 
 
 
 
 
 
5
  def generate_caption(image):
6
  # Load pre-trained model and tokenizer
7
  model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
@@ -12,13 +17,14 @@ def generate_caption(image):
12
  model.to(device)
13
 
14
  # Prepare image
15
- if isinstance(image, Image.Image):
16
- image = image.convert('RGB')
17
- else:
18
- image = Image.open(image).convert('RGB')
 
19
 
20
- # Add padding=True to handle images of different sizes
21
- pixel_values = feature_extractor(images=[image], return_tensors="pt", padding=True).pixel_values
22
  pixel_values = pixel_values.to(device)
23
 
24
  # Generate caption
 
2
  import torch
3
  from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
4
 
5
def preprocess_image(image, target_size=(224, 224)):
    """Resize a PIL image to a fixed ``target_size``.

    Uses Lanczos resampling, the highest-quality filter PIL offers for
    downscaling, so the captioning model receives a uniformly sized input.

    Args:
        image: a ``PIL.Image.Image`` instance to resize.
        target_size: ``(width, height)`` tuple; defaults to the 224x224
            input size expected by ViT-based encoders.

    Returns:
        The resized ``PIL.Image.Image``.
    """
    resized = image.resize(target_size, Image.LANCZOS)
    return resized
9
+
10
  def generate_caption(image):
11
  # Load pre-trained model and tokenizer
12
  model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 
17
  model.to(device)
18
 
19
  # Prepare image
20
+ if not isinstance(image, Image.Image):
21
+ image = Image.open(image)
22
+
23
+ image = image.convert('RGB')
24
+ image = preprocess_image(image)
25
 
26
+ # Convert image to pixel values
27
+ pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
28
  pixel_values = pixel_values.to(device)
29
 
30
  # Generate caption