Update app.py
app.py
CHANGED
```diff
@@ -24,17 +24,19 @@ class MultimodalChatbot:
         self.conversation_history = []
 
     def encode_image_to_base64(self, image) -> str:
-        """Convert PIL Image to base64 string"""
+        """Convert PIL Image or file path to base64 string"""
         try:
             if isinstance(image, str):
                 with open(image, "rb") as img_file:
                     return base64.b64encode(img_file.read()).decode('utf-8')
-
-            buffered = io.BytesIO()
-            if image.mode == 'RGBA':
-                image = image.convert('RGB')
-            image.save(buffered, format="JPEG", quality=85)
-            return base64.b64encode(buffered.getvalue()).decode('utf-8')
+            elif isinstance(image, Image.Image):
+                buffered = io.BytesIO()
+                if image.mode == 'RGBA':
+                    image = image.convert('RGB')
+                image.save(buffered, format="JPEG", quality=85)
+                return base64.b64encode(buffered.getvalue()).decode('utf-8')
+            else:
+                raise ValueError("Invalid image input")
         except Exception as e:
             return f"Error encoding image: {str(e)}"
 
```
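The reworked encoder keeps its error-as-string contract: failures come back as a string prefixed with `"Error"` rather than an exception, which is the same prefix check callers elsewhere in the file rely on. A minimal standalone sketch of the new branching (the function name and sample inputs are illustrative, not part of the app):

```python
import base64
import io

from PIL import Image

def encode_image(image) -> str:
    """Mirror of the updated branching: path -> raw bytes, PIL image -> JPEG."""
    try:
        if isinstance(image, str):  # strings are treated as file paths
            with open(image, "rb") as img_file:
                return base64.b64encode(img_file.read()).decode("utf-8")
        elif isinstance(image, Image.Image):  # in-memory PIL image
            buffered = io.BytesIO()
            if image.mode == "RGBA":  # JPEG cannot store an alpha channel
                image = image.convert("RGB")
            image.save(buffered, format="JPEG", quality=85)
            return base64.b64encode(buffered.getvalue()).decode("utf-8")
        else:
            raise ValueError("Invalid image input")
    except Exception as e:
        return f"Error encoding image: {e}"

# Callers screen for failure by prefix rather than catching exceptions:
result = encode_image(Image.new("RGBA", (8, 8)))  # flattened to RGB first
assert not result.startswith("Error")
assert encode_image(42).startswith("Error")  # unsupported input type
```

The explicit `else` branch is the substantive fix: previously any non-string, non-PIL input fell through to `image.mode` and produced a confusing `AttributeError` message instead of a clear "Invalid image input".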
```diff
@@ -104,7 +106,7 @@ class MultimodalChatbot:
         return f"Error transcribing audio: {str(e)}"
 
     def process_video(self, video_file) -> Tuple[List[str], str]:
-        """
+        """Process video file (metadata only, no visual analysis)"""
         try:
             if isinstance(video_file, str):
                 video_path = video_file
```
```diff
@@ -117,31 +119,13 @@ class MultimodalChatbot:
             if not cap.isOpened():
                 return [], "Error: Could not open video file"
 
-            frames = []
-            frame_descriptions = []
-            frame_count = 0
             total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
             fps = cap.get(cv2.CAP_PROP_FPS)
-
-
-            while True:
-                ret, frame = cap.read()
-                if not ret or len(frames) >= 5:
-                    break
-                if frame_count % frame_interval == 0:
-                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    pil_image = Image.fromarray(rgb_frame)
-                    pil_image.thumbnail((800, 600), Image.Resampling.LANCZOS)
-                    base64_frame = self.encode_image_to_base64(pil_image)
-                    if not base64_frame.startswith("Error"):
-                        frames.append(base64_frame)
-                        timestamp = frame_count / fps if fps > 0 else frame_count
-                        frame_descriptions.append(f"Frame at {timestamp:.1f}s")
-                frame_count += 1
-
+            duration = total_frames / fps if fps > 0 else 0
             cap.release()
-
-
+
+            description = f"Video metadata: {total_frames} frames, {duration:.1f} seconds. Visual analysis not supported by the current model."
+            return [], description
         except Exception as e:
             return [], f"Error processing video: {str(e)}"
 
```
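With the frame-extraction loop gone, the replacement body reduces to two `cv2.VideoCapture` property reads plus a division guard for streams that report an fps of 0. A standalone sketch under the same `Tuple[List[str], str]` contract (the function name and sample path are placeholders):

```python
import cv2
from typing import List, Tuple

def probe_video(video_path: str) -> Tuple[List[str], str]:
    """Read frame count and fps only; no frames are decoded or returned."""
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return [], "Error: Could not open video file"
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    cap.release()
    duration = total_frames / fps if fps > 0 else 0  # guard: fps can be 0
    return [], f"Video metadata: {total_frames} frames, {duration:.1f} seconds."

frames, description = probe_video("sample.mp4")  # placeholder path
print(frames, description)  # frames is now always []
```

Returning an empty list in the first tuple slot keeps existing callers that unpack `frames, description` working unchanged.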
```diff
@@ -174,20 +158,20 @@ class MultimodalChatbot:
                 mode = image_file.mode
                 content_parts.append({
                     "type": "text",
-                    "text": f"Image uploaded: {width}x{height} pixels, mode: {mode}.
+                    "text": f"Image uploaded: {width}x{height} pixels, mode: {mode}. Visual analysis not supported by the current model. Please describe the image for further assistance."
                 })
             else:
                 content_parts.append({
                     "type": "text",
-                    "text": "Image uploaded.
+                    "text": "Image uploaded. Visual analysis not supported by the current model. Please describe the image for further assistance."
                 })
             processing_info.append("🖼️ Image received (metadata only)")
 
         if video_file is not None:
-
+            _, video_desc = self.process_video(video_file)
             content_parts.append({
                 "type": "text",
-                "text": f"Video uploaded: {video_desc}.
+                "text": f"Video uploaded: {video_desc}. Please describe the video for further assistance."
             })
             processing_info.append("🎥 Video processed (metadata only)")
 
```
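Because the current model accepts no visual input, every attachment is flattened into a plain text part. Assuming the surrounding handler assembles these parts into a standard OpenRouter-style chat message (the assembly happens outside this hunk), the resulting user turn would look roughly like this; all concrete values below are made up:

```python
# Illustrative payload only; the prompt text and image dimensions are invented.
content_parts = [
    {"type": "text", "text": "What's in this picture?"},
    {
        "type": "text",
        "text": (
            "Image uploaded: 1024x768 pixels, mode: RGB. "
            "Visual analysis not supported by the current model. "
            "Please describe the image for further assistance."
        ),
    },
]
message = {"role": "user", "content": content_parts}
```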
```diff
@@ -255,8 +239,8 @@ def create_interface():
     - **Text**: Regular text messages
     - **PDF**: Extract and analyze document content
     - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)
-    - **Images**: Upload images (metadata
-    - **Video**: Upload videos (metadata
+    - **Images**: Upload images (metadata only; visual analysis not supported)
+    - **Video**: Upload videos (metadata only; visual analysis not supported)
 
     **Setup**: Enter your OpenRouter API key below to get started
     """)
```
```diff
@@ -562,9 +546,11 @@ def create_interface():
     - Supports: WAV, MP3, M4A, FLAC, OGG formats
     - Best results with clear speech and minimal background noise
 
-    **🖼️ Image Chat**: Upload images (
+    **🖼️ Image Chat**: Upload images (metadata only; visual analysis not supported)
+    - Provide a text description of the image for further assistance
 
-    **🎥 Video Chat**: Upload videos (
+    **🎥 Video Chat**: Upload videos (metadata only; visual analysis not supported)
+    - Provide a text description of the video for further assistance
 
     **🔄 Combined Chat**: Use multiple input types together for comprehensive analysis
 
```