File size: 3,865 Bytes
dd0f882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1bad0b9
 
 
 
 
9c1199b
1bad0b9
dd0f882
 
1bad0b9
 
77b0b67
1bad0b9
dd0f882
1bad0b9
 
dd0f882
 
 
 
 
 
1bad0b9
dd0f882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# -*- coding: utf-8 -*-
"""project_model.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1oopkA5yIlfizFuhXOPmTK7MUNh3Qasa3
"""

# project_module.py

import torch, cv2, time, os
import numpy as np
from PIL import Image
from ultralytics import YOLO
from transformers import pipeline, DPTFeatureExtractor, DPTForDepthEstimation
from TTS.api import TTS


from huggingface_hub import login
import os

# Authenticate with the Hugging Face Hub using the token stored in the
# environment. Fail fast with an actionable message instead of an opaque
# KeyError when the variable is unset or empty.
_hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
if not _hf_token:
    raise RuntimeError(
        "HUGGING_FACE_HUB_TOKEN environment variable is not set; "
        "export a valid Hugging Face access token before importing this module."
    )
login(token=_hf_token)

# Load models
#
# All heavyweight models are loaded once at import time and shared by
# process_inputs().

device = "cuda" if torch.cuda.is_available() else "cpu"  # Enable GPU when present
# transformers pipelines take a device *index* (-1 = CPU, 0 = first GPU).
# Derive it from the single availability check above instead of re-querying
# CUDA separately for every pipeline.
pipeline_device = 0 if device == "cuda" else -1

yolo_model = YOLO("yolov9c.pt")  # YOLOv9 object detector
depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large").to(device).eval()  # MiDaS/DPT depth estimator
depth_feat = DPTFeatureExtractor.from_pretrained("Intel/dpt-large")
whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=pipeline_device,
)  # Whisper speech-to-text
# Gemma-3-4B multimodal model; bfloat16 only on GPU to save memory.
gemma_pipe = pipeline(
    "image-text-to-text",
    model="google/gemma-3-4b-it",
    device=pipeline_device,
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
)
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")  # Text-to-Speech (TTS)

# Function to process image and audio
def process_inputs(image: Image.Image, audio_path: str):
    # Convert PIL image to OpenCV format
    rgb_image = np.array(image)
    cv2_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
    pil_image = image

    # YOLO Detection
    yolo_results = yolo_model.predict(cv2_image)[0]
    boxes = yolo_results.boxes
    class_names = yolo_model.names

    # MiDaS Depth
    depth_inputs = depth_feat(images=pil_image, return_tensors="pt").to(device)
    with torch.no_grad():
        depth_output = depth_model(**depth_inputs)
    depth_map = depth_output.predicted_depth.squeeze().cpu().numpy()
    depth_map_resized = cv2.resize(depth_map, (rgb_image.shape[1], rgb_image.shape[0]))

    # Visual Context
    shared_visual_context = []
    for box in boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        label = class_names[int(box.cls[0])]
        conf = float(box.conf[0])
        depth_crop = depth_map_resized[y1:y2, x1:x2]
        avg_depth = float(depth_crop.mean()) if depth_crop.size > 0 else None
        x_center = (x1 + x2) / 2
        pos = "left" if x_center < rgb_image.shape[1] / 3 else "right" if x_center > 2 * rgb_image.shape[1] / 3 else "center"
        shared_visual_context.append({
            "label": label,
            "confidence": conf,
            "avg_depth": avg_depth,
            "position": pos
        })

    # Build Context Text
    def build_context_description(context):
        descriptions = []
        for obj in context:
            d = f"{obj['avg_depth']:.1f} units" if obj["avg_depth"] else "unknown"
            s = obj.get("position", "unknown")
            c = obj.get("confidence", 0.0)
            descriptions.append(f"a {obj['label']} ({c:.2f} confidence) is at {d} on the {s}")
        return "In the image, " + ", ".join(descriptions) + "."

    context_text = build_context_description(shared_visual_context)

    # Transcribe audio
    transcription = whisper_pipe(audio_path)["text"]
    vqa_prompt = context_text + " " + transcription

    # GEMMA answer
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": pil_image},
            {"type": "text", "text": vqa_prompt}
        ]
    }]
    gemma_output = gemma_pipe(text=messages, max_new_tokens=200)
    answer = gemma_output[0]["generated_text"][-1]["content"]

    # Generate speech
    output_audio_path = "response.wav"
    tts.tts_to_file(text=answer, file_path=output_audio_path)

    return answer, output_audio_path