Spaces: Runtime error
Create app.py
app.py ADDED
import os
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

import torch
from torchvision import models, transforms
from PIL import Image
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import streamlit as st

# Load the pre-trained image feature extraction model
# (the classifier head is kept, so the "features" are the 1000-dim ImageNet logits)
resnet = models.resnet50(pretrained=True)
resnet.eval()

# Load the pre-trained language model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

# Project the 1000-dim image vector into GPT-2's embedding space so it can be fed
# to the decoder as a prefix embedding. NOTE: this projection is randomly initialised
# and untrained, so the generated text will not describe the image reliably.
feature_projection = torch.nn.Linear(1000, model.config.n_embd)

# Preprocess the image (accepts a file path or a file-like object such as a Streamlit upload)
def preprocess_image(image_file):
    image = Image.open(image_file).convert("RGB")
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # add the batch dimension
    return input_batch

# Extract image features
def extract_image_features(image_file):
    input_batch = preprocess_image(image_file)
    with torch.no_grad():
        output = resnet(input_batch)
    image_features = output.squeeze(0)
    return image_features

# Generate a caption by feeding the projected image vector to GPT-2 as a prefix embedding.
# generate() accepts inputs_embeds for decoder-only models in recent transformers releases.
def generate_caption(image_features):
    with torch.no_grad():
        prefix_embeds = feature_projection(image_features).unsqueeze(0).unsqueeze(0)  # shape (1, 1, n_embd)
        output_ids = model.generate(
            inputs_embeds=prefix_embeds,
            max_length=50,
            eos_token_id=model.config.eos_token_id,
            pad_token_id=model.config.eos_token_id,
            no_repeat_ngram_size=2,
            num_return_sequences=1,
        )
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption

# Streamlit app
st.title("Image Captioning with GPT-2")
uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Display the uploaded image
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Rewind the upload buffer so preprocess_image can re-open it from the start
    uploaded_file.seek(0)

    # Generate a caption for the uploaded image
    image_features = extract_image_features(uploaded_file)
    caption = generate_caption(image_features)
    st.write("Generated Caption:", caption)
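
A plain GPT-2 checkpoint has never been trained to attend to image features, so even once the generate() call above runs, the captions are effectively noise (and the original input_ids/input_embeds keyword mix-up is a likely source of the Space's runtime error). A minimal sketch of one way to get meaningful captions while keeping a GPT-2 decoder, assuming the community checkpoint nlpconnect/vit-gpt2-image-captioning (a ViT encoder paired with a GPT-2 decoder fine-tuned for captioning) is an acceptable substitute:

from PIL import Image
from transformers import pipeline

# The image-to-text pipeline bundles the image processor, the encoder-decoder
# model and the tokenizer behind a single call.
captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

def caption_image(image: Image.Image) -> str:
    # The pipeline returns a list of dicts such as [{"generated_text": "..."}].
    return captioner(image)[0]["generated_text"]

In the Streamlit block this would replace extract_image_features and generate_caption with a single caption_image(image) call on the already-opened PIL image.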
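
Separately, Streamlit reruns the whole script on every interaction, so the ResNet and GPT-2 weights above are reloaded on each upload. A small sketch of how the loading could be wrapped in cached loader functions, assuming a Streamlit release that provides st.cache_resource:

import streamlit as st
from torchvision import models
from transformers import GPT2LMHeadModel, GPT2Tokenizer

@st.cache_resource  # executed once per process; later reruns reuse the returned objects
def load_feature_extractor():
    resnet = models.resnet50(pretrained=True)
    resnet.eval()
    return resnet

@st.cache_resource
def load_language_model():
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    model.eval()
    return tokenizer, model

resnet = load_feature_extractor()
tokenizer, model = load_language_model()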