shukdevdattaEX committed on
Commit
5ceab5f
·
verified ·
1 Parent(s): 207b913

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -33
app.py CHANGED
@@ -24,17 +24,19 @@ class MultimodalChatbot:
24
  self.conversation_history = []
25
 
26
  def encode_image_to_base64(self, image) -> str:
27
- """Convert PIL Image to base64 string"""
28
  try:
29
  if isinstance(image, str):
30
  with open(image, "rb") as img_file:
31
  return base64.b64encode(img_file.read()).decode('utf-8')
32
- else:
33
  buffered = io.BytesIO()
34
  if image.mode == 'RGBA':
35
  image = image.convert('RGB')
36
  image.save(buffered, format="JPEG", quality=85)
37
  return base64.b64encode(buffered.getvalue()).decode('utf-8')
 
 
38
  except Exception as e:
39
  return f"Error encoding image: {str(e)}"
40
 
@@ -104,7 +106,7 @@ class MultimodalChatbot:
104
  return f"Error transcribing audio: {str(e)}"
105
 
106
  def process_video(self, video_file) -> Tuple[List[str], str]:
107
- """Extract frames from video and convert to base64"""
108
  try:
109
  if isinstance(video_file, str):
110
  video_path = video_file
@@ -117,31 +119,13 @@ class MultimodalChatbot:
117
  if not cap.isOpened():
118
  return [], "Error: Could not open video file"
119
 
120
- frames = []
121
- frame_descriptions = []
122
- frame_count = 0
123
  total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
124
  fps = cap.get(cv2.CAP_PROP_FPS)
125
- frame_interval = max(60, int(fps * 2)) if fps > 0 else 60
126
-
127
- while True:
128
- ret, frame = cap.read()
129
- if not ret or len(frames) >= 5:
130
- break
131
- if frame_count % frame_interval == 0:
132
- rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
133
- pil_image = Image.fromarray(rgb_frame)
134
- pil_image.thumbnail((800, 600), Image.Resampling.LANCZOS)
135
- base64_frame = self.encode_image_to_base64(pil_image)
136
- if not base64_frame.startswith("Error"):
137
- frames.append(base64_frame)
138
- timestamp = frame_count / fps if fps > 0 else frame_count
139
- frame_descriptions.append(f"Frame at {timestamp:.1f}s")
140
- frame_count += 1
141
-
142
  cap.release()
143
- description = f"Video processed: {len(frames)} frames extracted from {total_frames} total frames"
144
- return frames, description
 
145
  except Exception as e:
146
  return [], f"Error processing video: {str(e)}"
147
 
@@ -174,20 +158,20 @@ class MultimodalChatbot:
174
  mode = image_file.mode
175
  content_parts.append({
176
  "type": "text",
177
- "text": f"Image uploaded: {width}x{height} pixels, mode: {mode}. Note: Visual analysis not available with current model configuration."
178
  })
179
  else:
180
  content_parts.append({
181
  "type": "text",
182
- "text": "Image uploaded. Note: Visual analysis not available with current model configuration."
183
  })
184
  processing_info.append("🖼️ Image received (metadata only)")
185
 
186
  if video_file is not None:
187
- frames, video_desc = self.process_video(video_file)
188
  content_parts.append({
189
  "type": "text",
190
- "text": f"Video uploaded: {video_desc}. Note: Visual analysis not available with current model configuration."
191
  })
192
  processing_info.append("🎥 Video processed (metadata only)")
193
 
@@ -255,8 +239,8 @@ def create_interface():
255
  - **Text**: Regular text messages
256
  - **PDF**: Extract and analyze document content
257
  - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)
258
- - **Images**: Upload images (metadata analysis only due to model limitations)
259
- - **Video**: Upload videos (metadata analysis only due to model limitations)
260
 
261
  **Setup**: Enter your OpenRouter API key below to get started
262
  """)
@@ -562,9 +546,11 @@ def create_interface():
562
  - Supports: WAV, MP3, M4A, FLAC, OGG formats
563
  - Best results with clear speech and minimal background noise
564
 
565
- **๐Ÿ–ผ๏ธ Image Chat**: Upload images (currently metadata only due to model limitations)
 
566
 
567
- **🎥 Video Chat**: Upload videos (currently metadata only due to model limitations)
 
568
 
569
  **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis
570
 
 
24
  self.conversation_history = []
25
 
26
def encode_image_to_base64(self, image) -> str:
    """Convert a PIL Image object or an image file path to a base64 string.

    Accepts either a filesystem path (read raw and encoded as-is) or a
    PIL ``Image.Image`` (re-encoded as JPEG, flattening RGBA to RGB first).
    On any failure an ``"Error encoding image: ..."`` message string is
    returned instead of raising, matching the class's error convention.
    """
    try:
        # Path input: encode the file bytes directly, no re-compression.
        if isinstance(image, str):
            with open(image, "rb") as img_file:
                return base64.b64encode(img_file.read()).decode('utf-8')
        # In-memory PIL image: serialize to JPEG before encoding.
        if isinstance(image, Image.Image):
            buf = io.BytesIO()
            # JPEG has no alpha channel, so RGBA must be flattened first.
            if image.mode == 'RGBA':
                image = image.convert('RGB')
            image.save(buf, format="JPEG", quality=85)
            return base64.b64encode(buf.getvalue()).decode('utf-8')
        # Anything else is unsupported; the raise is caught below and
        # surfaced as an error string.
        raise ValueError("Invalid image input")
    except Exception as e:
        return f"Error encoding image: {str(e)}"
42
 
 
106
  return f"Error transcribing audio: {str(e)}"
107
 
108
  def process_video(self, video_file) -> Tuple[List[str], str]:
109
+ """Process video file (metadata only, no visual analysis)"""
110
  try:
111
  if isinstance(video_file, str):
112
  video_path = video_file
 
119
  if not cap.isOpened():
120
  return [], "Error: Could not open video file"
121
 
 
 
 
122
  total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
123
  fps = cap.get(cv2.CAP_PROP_FPS)
124
+ duration = total_frames / fps if fps > 0 else 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  cap.release()
126
+
127
+ description = f"Video metadata: {total_frames} frames, {duration:.1f} seconds. Visual analysis not supported by the current model."
128
+ return [], description
129
  except Exception as e:
130
  return [], f"Error processing video: {str(e)}"
131
 
 
158
  mode = image_file.mode
159
  content_parts.append({
160
  "type": "text",
161
+ "text": f"Image uploaded: {width}x{height} pixels, mode: {mode}. Visual analysis not supported by the current model. Please describe the image for further assistance."
162
  })
163
  else:
164
  content_parts.append({
165
  "type": "text",
166
+ "text": "Image uploaded. Visual analysis not supported by the current model. Please describe the image for further assistance."
167
  })
168
  processing_info.append("🖼️ Image received (metadata only)")
169
 
170
  if video_file is not None:
171
+ _, video_desc = self.process_video(video_file)
172
  content_parts.append({
173
  "type": "text",
174
+ "text": f"Video uploaded: {video_desc}. Please describe the video for further assistance."
175
  })
176
  processing_info.append("🎥 Video processed (metadata only)")
177
 
 
239
  - **Text**: Regular text messages
240
  - **PDF**: Extract and analyze document content
241
  - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)
242
+ - **Images**: Upload images (metadata only; visual analysis not supported)
243
+ - **Video**: Upload videos (metadata only; visual analysis not supported)
244
 
245
  **Setup**: Enter your OpenRouter API key below to get started
246
  """)
 
546
  - Supports: WAV, MP3, M4A, FLAC, OGG formats
547
  - Best results with clear speech and minimal background noise
548
 
549
+ **๐Ÿ–ผ๏ธ Image Chat**: Upload images (metadata only; visual analysis not supported)
550
+ - Provide a text description of the image for further assistance
551
 
552
+ **🎥 Video Chat**: Upload videos (metadata only; visual analysis not supported)
553
+ - Provide a text description of the video for further assistance
554
 
555
  **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis
556