Update project_model.py #1
opened by saa231

project_model.py  CHANGED  (+123 -48)
@@ -9,66 +9,121 @@ Original file is located at

 # project_module.py

-
+# Import libraries for ML, CV, NLP, audio, and TTS
+import torch, cv2, os
 import numpy as np
 from PIL import Image
 from ultralytics import YOLO
 from transformers import pipeline, DPTFeatureExtractor, DPTForDepthEstimation
 from TTS.api import TTS
-
-
 from huggingface_hub import login
-import os

-#
+# Authenticate to Hugging Face using environment token
 login(token=os.environ["HUGGING_FACE_HUB_TOKEN"])

-#
+# Set device for computation (GPU if available)
+device = "cuda" if torch.cuda.is_available() else "cpu"

-
+# Load all models
+yolo_model = YOLO("yolov9c.pt")  # YOLOv9 for object detection
+depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large").to(device).eval()  # MiDaS for depth
+depth_feat = DPTFeatureExtractor.from_pretrained("Intel/dpt-large")  # Feature extractor for depth model

-
-
-
-
-
+# Whisper for audio transcription
+whisper_pipe = pipeline(
+    "automatic-speech-recognition",
+    model="openai/whisper-small",
+    device=0 if torch.cuda.is_available() else -1
+)
+
+# GEMMA for image+text to text QA
 gemma_pipe = pipeline(
     "image-text-to-text",
     model="google/gemma-3-4b-it",
     device=0 if torch.cuda.is_available() else -1,
     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
 )
-tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC") # Load Text-to-Speech (TTS)

-#
-
-
-
+# Text-to-speech
+tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
+
+# -------------------------------
+# Session Management Class
+# -------------------------------
+
+class VisualQAState:
+    """
+    Stores the current image context and chat history for follow-up questions.
+    """
+    def __init__(self):
+        self.current_image: Image.Image = None
+        self.visual_context: str = ""
+        self.message_history = []
+
+    def reset(self, image: Image.Image, visual_context: str):
+        """
+        Called when a new image is uploaded.
+        Resets context and starts new message history.
+        """
+        self.current_image = image
+        self.visual_context = visual_context
+        self.message_history = [{
+            "role": "user",
+            "content": [
+                {"type": "image", "image": self.current_image},
+                {"type": "text", "text": self.visual_context}
+            ]
+        }]
+
+    def add_question(self, question: str):
+        """
+        Adds a follow-up text message to the chat.
+        """
+        self.message_history.append({
+            "role": "user",
+            "content": [{"type": "text", "text": question}]
+        })
+
+# -------------------------------
+# Generate Context from Image
+# -------------------------------
+
+def generate_visual_context(pil_image: Image.Image) -> str:
+    """
+    Processes the image to extract object labels, depth info, and locations.
+    Builds a natural language context description for use in prompting.
+    """
+    # Convert to OpenCV and RGB formats
+    rgb_image = np.array(pil_image)
     cv2_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
-    pil_image = image

-    # YOLO
+    # Object detection using YOLO
     yolo_results = yolo_model.predict(cv2_image)[0]
     boxes = yolo_results.boxes
     class_names = yolo_model.names

-    # MiDaS
+    # Depth estimation using MiDaS
     depth_inputs = depth_feat(images=pil_image, return_tensors="pt").to(device)
     with torch.no_grad():
         depth_output = depth_model(**depth_inputs)
     depth_map = depth_output.predicted_depth.squeeze().cpu().numpy()
     depth_map_resized = cv2.resize(depth_map, (rgb_image.shape[1], rgb_image.shape[0]))

-    #
+    # Extract contextual information for each object
     shared_visual_context = []
     for box in boxes:
         x1, y1, x2, y2 = map(int, box.xyxy[0])
         label = class_names[int(box.cls[0])]
         conf = float(box.conf[0])
+
+        # Compute average depth of object
         depth_crop = depth_map_resized[y1:y2, x1:x2]
         avg_depth = float(depth_crop.mean()) if depth_crop.size > 0 else None
+
+        # Determine object horizontal position
         x_center = (x1 + x2) / 2
         pos = "left" if x_center < rgb_image.shape[1] / 3 else "right" if x_center > 2 * rgb_image.shape[1] / 3 else "center"
+
         shared_visual_context.append({
             "label": label,
             "confidence": conf,
@@ -76,35 +131,55 @@ def process_inputs(image: Image.Image, audio_path: str):
             "position": pos
         })

-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Convert context to a readable sentence
+    descriptions = []
+    for obj in shared_visual_context:
+        d = f"{obj['avg_depth']:.1f} units" if obj["avg_depth"] else "unknown"
+        s = obj.get("position", "unknown")
+        c = obj.get("confidence", 0.0)
+        descriptions.append(f"a {obj['label']} ({c:.2f} confidence) is at {d} on the {s}")
+
+    return "In the image, " + ", ".join(descriptions) + "."
+
+# -------------------------------
+# Main Multimodal Processing Function
+# -------------------------------
+
+def process_inputs(
+    session: VisualQAState,
+    image: Image.Image = None,
+    question: str = "",
+    audio_path: str = None,
+    enable_tts: bool = True
+):
+    """
+    Handles a new image upload or a follow-up question.
+    Combines image context, audio transcription, and text input to generate a GEMMA-based answer.
+    Optionally outputs audio using TTS.
+    """
+
+    # If new image is provided, reset session and build new context
+    if image:
+        visual_context = generate_visual_context(image)
+        session.reset(image, visual_context)
+
+    # If user gave an audio clip, transcribe it and append to question
+    if audio_path:
+        audio_text = whisper_pipe(audio_path)["text"]
+        question += " " + audio_text
+
+    # Append question to conversation history
+    session.add_question(question)
+
+    # Generate response using GEMMA with full conversation history
+    gemma_output = gemma_pipe(text=session.message_history, max_new_tokens=200)
     answer = gemma_output[0]["generated_text"][-1]["content"]

-    #
+    # If TTS is enabled, synthesize answer as speech
     output_audio_path = "response.wav"
-
+    if enable_tts:
+        tts.tts_to_file(text=answer, file_path=output_audio_path)
+    else:
+        output_audio_path = None

     return answer, output_audio_path
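
A minimal usage sketch of the session-based API this change introduces, for illustration only: it assumes the file is importable as project_model and uses placeholder file paths (photo.jpg, question.wav) that are not part of this PR.

# Usage sketch (assumptions: project_model is on the import path; file paths are placeholders)
from PIL import Image
from project_model import VisualQAState, process_inputs

session = VisualQAState()

# First turn: new image plus a typed question; returns the text answer and "response.wav" when TTS is enabled
img = Image.open("photo.jpg")
answer, audio_file = process_inputs(session, image=img, question="What objects are in front of me?")
print(answer, audio_file)

# Follow-up turn: spoken question only, reusing the stored image context; with enable_tts=False, audio_file is None
answer, audio_file = process_inputs(session, audio_path="question.wav", enable_tts=False)
print(answer, audio_file)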