# project_module.py
"""Multimodal visual question answering (VQA) pipeline for visually impaired users.

Combines YOLOv9 object detection, DPT (MiDaS-style) depth estimation, Whisper
speech recognition, a Gemma vision-language model, and Tacotron2 text-to-speech.
All models are loaded once at import time.
"""

import os

import cv2
import numpy as np
import torch
from PIL import Image
from TTS.api import TTS
from huggingface_hub import login
from transformers import DPTFeatureExtractor, DPTForDepthEstimation, pipeline
from ultralytics import YOLO

# Authenticate to Hugging Face using the environment token (raises KeyError if unset).
login(token=os.environ["HUGGING_FACE_HUB_TOKEN"])

# Run on GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load all models.
yolo_model = YOLO("yolov9c.pt")  # YOLOv9 for object detection
depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large").to(device).eval()  # depth estimation
depth_feat = DPTFeatureExtractor.from_pretrained("Intel/dpt-large")  # preprocessor for the depth model

# Whisper for audio transcription.
whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=0 if torch.cuda.is_available() else -1,
)

# Gemma for image+text -> text QA.
gemma_pipe = pipeline(
    "image-text-to-text",
    model="google/gemma-3-4b-it",
    device=0 if torch.cuda.is_available() else -1,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)

# Text-to-speech.
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")


# -------------------------------
# Session Management Class
# -------------------------------
class VisualQAState:
    """Stores the current image context and chat history for follow-up questions."""

    def __init__(self):
        self.current_image: Image.Image = None    # raw uploaded image
        self.annotated_image: Image.Image = None  # image with detection boxes drawn
        self.visual_context: str = ""             # textual scene description
        self.message_history: list = []           # alternating chat messages

    def reset(self, image: Image.Image, annotated_image: Image.Image, visual_context: str):
        """Called when a new image is uploaded.

        Resets context and starts a new message history seeded with the system
        prompt and a user turn carrying the image plus its visual context.
        """
        self.current_image = image
        self.annotated_image = annotated_image
        self.visual_context = visual_context
        self.message_history = [
            {
                "role": "system",  # System prompt
                "content": (
                    "You are a helpful visual assistant designed for visually impaired users that assists users by answering their questions. "
                    'If unsure, say "I am not certain."'
                ),
            },
            {
                "role": "user",  # The user input
                "content": [
                    {"type": "image", "image": self.current_image},   # Image context
                    {"type": "text", "text": self.visual_context},    # Visual context description
                ],
            },
        ]

    def add_question(self, question: str):
        """Adds a follow-up question only if the last message was from the assistant.

        Ensures alternating user/assistant messages.

        NOTE(review): immediately after reset() the last message has role "user",
        so the FIRST question after an upload is silently dropped from the
        history; process_inputs works around this by embedding the question in
        its own prompt rather than reading the history — confirm this is intended.
        """
        if not self.message_history or self.message_history[-1]["role"] == "assistant":
            self.message_history.append({
                "role": "user",
                "content": [{"type": "text", "text": question}],
            })

    def add_answer(self, answer: str):
        """Appends the assistant's response to the conversation history."""
        self.message_history.append({
            "role": "assistant",
            "content": [{"type": "text", "text": answer}],
        })


# -------------------------------
# Generate Context from Image
# -------------------------------
def generate_visual_context(pil_image: Image.Image):
    """Detect objects, estimate their depth and position, and describe the scene.

    Returns a tuple ``(context_sentence, annotated_pil)``:
    - ``context_sentence``: one English sentence listing each detected object
      with confidence, average depth, and left/center/right position.
    - ``annotated_pil``: a PIL copy of the image with labeled bounding boxes.
    """
    # Convert to OpenCV (BGR) and keep the RGB array for sizing/position math.
    rgb_image = np.array(pil_image)
    cv2_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)

    # Object detection.
    yolo_results = yolo_model.predict(cv2_image)[0]
    boxes = yolo_results.boxes
    class_names = yolo_model.names

    # Depth estimation; resize the predicted map back to the input resolution
    # so per-box crops line up with detection coordinates.
    depth_inputs = depth_feat(images=pil_image, return_tensors="pt").to(device)
    with torch.no_grad():
        depth_output = depth_model(**depth_inputs)
    depth_map = depth_output.predicted_depth.squeeze().cpu().numpy()
    depth_map_resized = cv2.resize(depth_map, (rgb_image.shape[1], rgb_image.shape[0]))

    # Draw bounding boxes on a copy of the image.
    annotated_image = cv2_image.copy()
    for box in boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        label = class_names[int(box.cls[0])]
        conf = float(box.conf[0])
        cv2.rectangle(annotated_image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(annotated_image, f"{label} {conf:.2f}", (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

    # Extract per-object context: label, confidence, mean depth inside the box,
    # and horizontal position (image split into left/center/right thirds).
    shared_visual_context = []
    for box in boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        label = class_names[int(box.cls[0])]
        conf = float(box.conf[0])
        depth_crop = depth_map_resized[y1:y2, x1:x2]
        avg_depth = float(depth_crop.mean()) if depth_crop.size > 0 else None
        x_center = (x1 + x2) / 2
        pos = ("left" if x_center < rgb_image.shape[1] / 3
               else "right" if x_center > 2 * rgb_image.shape[1] / 3
               else "center")
        shared_visual_context.append({
            "label": label,
            "confidence": conf,
            "avg_depth": avg_depth,
            "position": pos,
        })

    descriptions = []
    for obj in shared_visual_context:
        # BUG FIX: test against None explicitly — a legitimate depth of 0.0 was
        # previously falsy and reported as "unknown".
        d = f"{obj['avg_depth']:.1f} units" if obj["avg_depth"] is not None else "unknown"
        s = obj.get("position", "unknown")
        c = obj.get("confidence", 0.0)
        descriptions.append(f"a {obj['label']} ({c:.2f} confidence) is at {d} on the {s}")

    context_sentence = "In the image, " + ", ".join(descriptions) + "."

    # Convert the annotated image back to a PIL (RGB) image.
    annotated_pil = Image.fromarray(cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB))
    return context_sentence, annotated_pil


# -------------------------------
# Main Multimodal Processing Function
# -------------------------------
# Create a global session object to persist across follow-ups.
session = VisualQAState()


def process_inputs(
    session: VisualQAState,
    image: Image.Image = None,
    question: str = "",
    audio_path: str = None,
    enable_tts: bool = True,
):
    """Answer a typed and/or spoken question about the current image.

    Parameters:
        session: conversation state; reset when a new ``image`` is supplied.
        image: optional new image to analyze (starts a fresh session context).
        question: optional typed question text.
        audio_path: optional path to an audio file; its transcription is
            appended to ``question``.
        enable_tts: when True, the answer is also synthesized to ``response.wav``.

    Returns:
        ``(answer, output_audio_path)`` where ``output_audio_path`` is None
        when TTS is disabled.
    """
    if image is not None:
        # New image: generate visual context + annotated copy and restart session.
        visual_context, annotated_image = generate_visual_context(image)
        session.reset(image, annotated_image, visual_context)

    if audio_path:
        # Transcribe the audio and merge it with any typed question.
        audio_text = whisper_pipe(audio_path)["text"].strip()
        # BUG FIX: the old `question += ' ' + audio_text` left a leading space
        # when no typed question was given.
        question = f"{question} {audio_text}".strip()

    # Record the user's new question in the history (see NOTE on add_question).
    session.add_question(question)

    # BUG FIX: read the visual context from the session — the previous code
    # referenced a local `visual_context` that is undefined on follow-up calls
    # made without a new image (NameError). Also insert the missing separator
    # before "Shared visual context:" so the question and the context are not
    # fused into one token run.
    vqa_prompt = (
        "You are a helpful visual assistant designed for visually impaired users "
        "that assists users by answering their questions. "
        "Answer the following question with the help of the shared visual context: "
        + question
        + " Shared visual context: "
        + session.visual_context
    )

    # Gemma prompt input. NOTE(review): the accumulated message_history is not
    # forwarded to the model — each call sends only the image + this prompt.
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": session.current_image},
            {"type": "text", "text": vqa_prompt},
        ],
    }]

    gemma_output = gemma_pipe(text=messages, max_new_tokens=500)

    # BUG FIX: bind a fallback first — previously `answer` was undefined (and
    # raised NameError) whenever the pipeline returned a non-list/empty result.
    answer = "No valid output from Gemma model."
    if isinstance(gemma_output, list) and len(gemma_output) > 0:
        gemma_text = gemma_output[0]["generated_text"][-1]["content"]
        if isinstance(gemma_text, str):
            answer = gemma_text

    # Save assistant's answer into the session history.
    session.add_answer(answer)

    # Optional text-to-speech output.
    output_audio_path = None
    if enable_tts:
        output_audio_path = "response.wav"
        tts.tts_to_file(text=answer, file_path=output_audio_path)

    return answer, output_audio_path