| | from huggingface_hub import InferenceClient |
| | from llama_index.core.tools import FunctionTool |
| |
|
| | |
| |
|
def query_image(query: str, image_url: str) -> str:
    """Ask anything about an image using a Vision Language Model.

    Sends a single-turn chat completion request (text + image_url content
    parts) to the Qwen2.5-VL-72B-Instruct model via the "nebius" provider.

    Args:
        query (str): the query about the image, e.g. "how many persons are on the image?"
        image_url (str): the URL to the image

    Returns:
        str: the model's text answer, or an error description string if the
        request fails (best-effort contract: this function never raises).
    """
    client = InferenceClient(provider="nebius")
    try:
        completion = client.chat.completions.create(
            model="Qwen/Qwen2.5-VL-72B-Instruct",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": query},
                        {"type": "image_url", "image_url": {"url": image_url}},
                    ],
                }
            ],
            max_tokens=512,
        )
        # BUG FIX: the original returned the message *object*
        # (completion.choices[0].message), not a string as the annotation
        # promises. `.content` holds the model's text answer.
        return completion.choices[0].message.content
    except Exception as e:
        # Best-effort tool: surface the failure as text so an agent can
        # read it rather than crash (matches automatic_speech_recognition).
        return f"query_image failed: {e}"
| |
|
| |
|
def automatic_speech_recognition(file_url: str) -> str:
    """Transcribe an audio file to text.

    Runs OpenAI Whisper large-v3 through the "fal-ai" inference provider.

    Args:
        file_url (str): the URL to the audio file

    Returns:
        str: the transcription, or an error description string if the
        request fails (best-effort contract: this function never raises).
    """
    asr_client = InferenceClient(provider="fal-ai")
    try:
        transcript = asr_client.automatic_speech_recognition(
            file_url,
            model="openai/whisper-large-v3",
        )
    except Exception as e:
        return f"automatic_speech_recognition failed: {e}"
    return transcript
| |
|
| |
|
| |
|
| | |
| |
|
# Wrap the helpers as LlamaIndex tools so an agent can invoke them;
# from_defaults derives each tool's name/description from the function's
# name and docstring.
query_image_tool = FunctionTool.from_defaults(fn=query_image)
automatic_speech_recognition_tool = FunctionTool.from_defaults(
    fn=automatic_speech_recognition,
)