Upload project_model.py
project_model.py
ADDED  (+101 -0)
@@ -0,0 +1,101 @@
# -*- coding: utf-8 -*-
"""project_model.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1oopkA5yIlfizFuhXOPmTK7MUNh3Qasa3
"""

# project_module.py

import torch, cv2, time, os
import numpy as np
from PIL import Image
from ultralytics import YOLO
from transformers import pipeline, DPTFeatureExtractor, DPTForDepthEstimation
from TTS.api import TTS

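# NOTE: these imports assume the usual PyPI packages are installed
# (package names are an assumption about the environment, not part of the upload):
#   pip install torch opencv-python numpy pillow ultralytics transformers TTS
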
# Load models

device = "cuda" if torch.cuda.is_available() else "cpu"
yolo_model = YOLO("yolov8n.pt")  # object detection
depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large").to(device).eval()  # monocular depth estimation
depth_feat = DPTFeatureExtractor.from_pretrained("Intel/dpt-large")
whisper_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=0 if torch.cuda.is_available() else -1)  # speech-to-text
gemma_pipe = pipeline(
    "image-text-to-text",
    model="google/gemma-3-4b-it",
    device=0 if torch.cuda.is_available() else -1,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
)  # multimodal (image + text) question answering
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")  # text-to-speech

# Function to process an image and an audio question
def process_inputs(image: Image.Image, audio_path: str):
    # Convert PIL image to OpenCV (BGR) format
    rgb_image = np.array(image)
    cv2_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
    pil_image = image

    # YOLO detection
    yolo_results = yolo_model.predict(cv2_image)[0]
    boxes = yolo_results.boxes
    class_names = yolo_model.names

    # DPT depth estimation
    depth_inputs = depth_feat(images=pil_image, return_tensors="pt").to(device)
    with torch.no_grad():
        depth_output = depth_model(**depth_inputs)
    depth_map = depth_output.predicted_depth.squeeze().cpu().numpy()
    depth_map_resized = cv2.resize(depth_map, (rgb_image.shape[1], rgb_image.shape[0]))

    # Visual context: label, confidence, average depth, and rough position per detection
    shared_visual_context = []
    for box in boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        label = class_names[int(box.cls[0])]
        conf = float(box.conf[0])
        depth_crop = depth_map_resized[y1:y2, x1:x2]
        avg_depth = float(depth_crop.mean()) if depth_crop.size > 0 else None
        x_center = (x1 + x2) / 2
        pos = "left" if x_center < rgb_image.shape[1] / 3 else "right" if x_center > 2 * rgb_image.shape[1] / 3 else "center"
        shared_visual_context.append({
            "label": label,
            "confidence": conf,
            "avg_depth": avg_depth,
            "position": pos
        })

    # Build context text
    def build_context_description(context):
        descriptions = []
        for obj in context:
            d = f"{obj['avg_depth']:.1f} units" if obj["avg_depth"] is not None else "unknown"
            s = obj.get("position", "unknown")
            c = obj.get("confidence", 0.0)
            descriptions.append(f"a {obj['label']} ({c:.2f} confidence) is at {d} on the {s}")
        return "In the image, " + ", ".join(descriptions) + "."

    context_text = build_context_description(shared_visual_context)

    # Transcribe audio
    transcription = whisper_pipe(audio_path)["text"]
    vqa_prompt = context_text + " " + transcription

    # Gemma answer
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": pil_image},
            {"type": "text", "text": vqa_prompt}
        ]
    }]
    gemma_output = gemma_pipe(text=messages, max_new_tokens=200)
    answer = gemma_output[0]["generated_text"][-1]["content"]

    # Generate speech
    output_audio_path = "response.wav"
    tts.tts_to_file(text=answer, file_path=output_audio_path)

    return answer, output_audio_path
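A minimal usage sketch (not part of the uploaded file; the image and audio file names below are placeholders): process_inputs takes a PIL image and the path of a recorded question, and returns Gemma's text answer together with the path of the synthesized spoken reply. Note that all five models are loaded at module import time, so the first import downloads and initializes the weights.

    # Hypothetical driver script; "street_scene.jpg" and "question.wav" are placeholders.
    from PIL import Image
    import project_model

    img = Image.open("street_scene.jpg").convert("RGB")
    answer, wav_path = project_model.process_inputs(img, "question.wav")
    print(answer)    # text answer from Gemma, grounded in the YOLO/DPT context
    print(wav_path)  # "response.wav", the spoken version produced by the TTS model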