Instructions to use getZuma/image-captioning with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use getZuma/image-captioning with Transformers:
```python
# Use a pipeline as a high-level helper
# Warning: Pipeline type "image-to-text" is no longer supported in transformers v5.
# You must load the model directly (see below) or downgrade to v4.x with:
# 'pip install "transformers<5.0.0"'
from transformers import pipeline

pipe = pipeline("image-to-text", model="getZuma/image-captioning")
```

```python
# Load model directly
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering

processor = AutoProcessor.from_pretrained("getZuma/image-captioning")
model = AutoModelForVisualQuestionAnswering.from_pretrained("getZuma/image-captioning")
```
- Notebooks
- Google Colab
- Kaggle
Commit ·
db1d42e
1
Parent(s): 98f4fce
rollback to previous code
Browse files
- handler.py +9 -8
handler.py
CHANGED
|
@@ -21,9 +21,10 @@ class EndpointHandler():
|
|
| 21 |
|
| 22 |
|
| 23 |
print(" $$$$ Model Loading $$$$")
|
| 24 |
-
self.processor = Blip2Processor.from_pretrained("
|
| 25 |
-
self.model = Blip2ForConditionalGeneration.from_pretrained("
|
| 26 |
print(" $$$$ model loaded $$$$")
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
|
|
@@ -63,13 +64,13 @@ class EndpointHandler():
|
|
| 63 |
# img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
|
| 64 |
# raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
|
| 65 |
|
| 66 |
-
|
| 67 |
-
|
| 68 |
print("@@@@@@ generated_text @@@@@@@")
|
| 69 |
print(generated_text)
|
|
|
|
|
|
|
| 70 |
|
|
|
|
| 71 |
|
| 72 |
-
|
| 73 |
-
# captions = processor.decode(out[0], skip_special_tokens=True)
|
| 74 |
-
|
| 75 |
-
return {"captions": generated_text}
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
print(" $$$$ Model Loading $$$$")
|
| 24 |
+
self.processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
|
| 25 |
+
self.model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16, device_map="auto")
|
| 26 |
print(" $$$$ model loaded $$$$")
|
| 27 |
+
print(self.model.eval())
|
| 28 |
|
| 29 |
|
| 30 |
|
|
|
|
| 64 |
# img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
|
| 65 |
# raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
|
| 66 |
|
| 67 |
+
inputs = self.processor(raw_image, prompt, return_tensors="pt").to("cuda", torch.float16)
|
| 68 |
+
|
| 69 |
print("@@@@@@ generated_text @@@@@@@")
|
| 70 |
print(generated_text)
|
| 71 |
+
out = self.model.generate(**inputs)
|
| 72 |
+
captions = processor.decode(out[0], skip_special_tokens=True)
|
| 73 |
|
| 74 |
+
print("captions", captions)
|
| 75 |
|
| 76 |
+
return {"captions": captions}
|
|
|
|
|
|
|
|
|