| """ |
| CaptionAgent - Image captioning using TinyCLIP |
| UPDATED MODULE - Replaced BLIP with TinyCLIP for efficient captioning |
| """ |
|
|
| import os |
| os.environ["TRANSFORMERS_NO_TF"] = "1" |
| os.environ["HF_HUB_DISABLE_TF"] = "1" |
| os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" |
|
|
| from PIL import Image |
| import torch |
| import open_clip |
|
|
class CaptionAgent:
    """Pick the best-matching scene description for a video frame.

    The agent embeds the frame and a fixed list of candidate captions
    (``scene_labels``) with a CLIP-style model and returns the caption whose
    text embedding scores highest against the image.

    Two backends are supported:

    * OpenCLIP (``open_clip``) -- tried first; ``self.tokenizer`` is set.
    * Hugging Face ``transformers`` CLIP -- used when OpenCLIP fails to load;
      ``self.tokenizer`` is ``None`` and ``self.preprocess`` is a
      ``CLIPProcessor`` that handles both text and images.

    NOTE(review): the log line below and the module header mention TinyCLIP,
    but the checkpoint requested here is OpenCLIP 'ViT-B-32' /
    'laion400m_e32' -- confirm which model is intended.
    """

    # Caption returned when a frame is unusable or inference fails.
    FALLBACK_CAPTION = "a scene with various objects"

    def __init__(self):
        """Load the model (OpenCLIP first, then transformers CLIP).

        Raises:
            Exception: whatever the transformers fallback raises if it also
                fails to load; only the OpenCLIP failure is handled here.
        """
        print("[CaptionAgent] Loading TinyCLIP model...")
        try:
            self.model, _, self.preprocess = open_clip.create_model_and_transforms(
                'ViT-B-32',
                pretrained='laion400m_e32'
            )
            self.tokenizer = open_clip.get_tokenizer('ViT-B-32')
            self.model.eval()
            # This is the primary path, not a fallback.
            print("[CaptionAgent] Using OpenCLIP")
        except Exception as e:
            print(f"[CaptionAgent] OpenCLIP failed: {e}")

            # Imported lazily so transformers is only required when the
            # OpenCLIP path is unavailable.
            from transformers import CLIPProcessor, CLIPModel
            self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
            self.preprocess = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
            self.tokenizer = None
            self.model.eval()
            print("[CaptionAgent] Using standard CLIP (fallback)")

        # (labels, normalised text features) for the OpenCLIP path; filled on
        # first use so the prompts are encoded once instead of once per frame.
        self._label_cache = None

        self.scene_labels = [
            "a person sitting at a desk",
            "a person working on a computer",
            "an empty room",
            "a person standing",
            "multiple people in a room",
            "a workspace with electronics",
            "a bedroom",
            "a kitchen",
            "a living room",
            "outdoor scenery",
            "a person reading",
            "a person using a phone",
            "furniture and objects",
            "indoor environment",
            "people interacting"
        ]

    def describe(self, frame_bgr):
        """Return the scene label that best matches ``frame_bgr``.

        Args:
            frame_bgr: image array indexed as ``[row, col, channel]`` whose
                last axis is reversed before use (presumably an OpenCV BGR
                frame -- confirm against the caller).

        Returns:
            One of ``self.scene_labels``, or ``FALLBACK_CAPTION`` when the
            frame is missing/empty or anything goes wrong during inference.
        """
        # A failed capture typically yields None or a zero-sized array; treat
        # that as "nothing to describe" rather than letting indexing raise.
        if frame_bgr is None or getattr(frame_bgr, "size", 0) == 0:
            print("[CaptionAgent] Error during inference: empty or missing frame")
            return self.FALLBACK_CAPTION

        try:
            # Reverse the channel axis (BGR -> RGB) inside the try so a
            # malformed frame also degrades to the fallback caption.
            image = Image.fromarray(frame_bgr[:, :, ::-1])

            if self.tokenizer is not None:
                top_idx = self._open_clip_label_index(image)
            else:
                top_idx = self._hf_clip_label_index(image)

            return self.scene_labels[top_idx]

        except Exception as e:
            print(f"[CaptionAgent] Error during inference: {e}")
            return self.FALLBACK_CAPTION

    def _label_features(self):
        """Return unit-normalised text features for the current labels (cached)."""
        labels = tuple(self.scene_labels)
        cached = getattr(self, "_label_cache", None)
        # Re-encode only if the label list was changed since the last call.
        if cached is not None and cached[0] == labels:
            return cached[1]

        text_tokens = self.tokenizer(self.scene_labels)
        with torch.no_grad():
            text_features = self.model.encode_text(text_tokens)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        self._label_cache = (labels, text_features)
        return text_features

    def _open_clip_label_index(self, image):
        """Score ``image`` against the labels with OpenCLIP; return the top index."""
        image_input = self.preprocess(image).unsqueeze(0)
        text_features = self._label_features()

        with torch.no_grad():
            image_features = self.model.encode_image(image_input)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

        return similarity.argmax().item()

    def _hf_clip_label_index(self, image):
        """Score ``image`` against the labels with transformers CLIP; return the top index."""
        inputs = self.preprocess(
            text=self.scene_labels,
            images=image,
            return_tensors="pt",
            padding=True
        )

        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = outputs.logits_per_image.softmax(dim=1)

        return probs.argmax().item()
|
|