Update tools.py
Browse files
tools.py
CHANGED
|
@@ -59,7 +59,26 @@ def use_vision_model(question: str, images: List[Image.Image]) -> str:
|
|
| 59 |
}
|
| 60 |
]
|
| 61 |
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
print(f'Model returned: {output}')
|
| 64 |
return output
|
| 65 |
|
|
@@ -327,5 +346,5 @@ def transcribe_audio(audio_file_path: str) -> str:
|
|
| 327 |
"""
|
| 328 |
model_size: str = "small"
|
| 329 |
model = whisper.load_model(model_size)
|
| 330 |
-
result = model.transcribe(
|
| 331 |
return result['text']
|
|
|
|
| 59 |
}
|
| 60 |
]
|
| 61 |
|
| 62 |
+
# Get the response and properly extract the content as a string
|
| 63 |
+
response = image_model(messages)
|
| 64 |
+
|
| 65 |
+
# Handle different response formats
|
| 66 |
+
if hasattr(response, 'content'):
|
| 67 |
+
output = response.content
|
| 68 |
+
# If content is a list, extract text from it
|
| 69 |
+
if isinstance(output, list):
|
| 70 |
+
text_parts = []
|
| 71 |
+
for item in output:
|
| 72 |
+
if isinstance(item, dict) and 'text' in item:
|
| 73 |
+
text_parts.append(item['text'])
|
| 74 |
+
elif isinstance(item, str):
|
| 75 |
+
text_parts.append(item)
|
| 76 |
+
output = ' '.join(text_parts) if text_parts else str(output)
|
| 77 |
+
elif not isinstance(output, str):
|
| 78 |
+
output = str(output)
|
| 79 |
+
else:
|
| 80 |
+
output = str(response)
|
| 81 |
+
|
| 82 |
print(f'Model returned: {output}')
|
| 83 |
return output
|
| 84 |
|
|
|
|
| 346 |
"""
|
| 347 |
model_size: str = "small"
|
| 348 |
model = whisper.load_model(model_size)
|
| 349 |
+
result = model.transcribe(audio_path)
|
| 350 |
return result['text']
|