File size: 1,237 Bytes
933c2fa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import os
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor
# Model identifiers served by the Modal-hosted vLLM deployments.
EXPLANATION_MODEL = "Qwen/Qwen3-4B-Instruct-2507"  # chat model used by ModalClient.infer_llm
EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-8B"  # embedding model returned by ModalClient.embedding_config
class ModalClient:
    """Client helpers for Modal-hosted vLLM endpoints (chat + embeddings).

    Endpoint URLs and credentials are read from environment variables:
    ``MODAL_URL_LLM_INFERENCE``, ``MODAL_URL_LLM_EMBEDDING``, and
    ``VLLM_API_KEY`` (falls back to the placeholder ``"not-needed"``).
    """

    @staticmethod
    def infer_llm(
        prompts: list[str],
        max_tokens: int = 800,
        max_workers: int = 32,
    ) -> list[str]:
        """Run one chat completion per prompt, fanned out over a thread pool.

        Args:
            prompts: User-role prompt strings; one request is issued per prompt.
            max_tokens: Completion-token cap passed to each request.
            max_workers: Thread-pool size. Requests are I/O-bound, so a
                fairly large pool is safe; previously hard-coded to 32.

        Returns:
            Completion texts in the same order as ``prompts``
            (``executor.map`` preserves input order).
        """
        client = OpenAI(
            base_url=os.environ.get("MODAL_URL_LLM_INFERENCE"),
            api_key=os.environ.get("VLLM_API_KEY", "not-needed"),
        )

        def process_one(prompt: str) -> str:
            # One blocking HTTP request per prompt; concurrency comes
            # from the surrounding thread pool, not from here.
            response = client.chat.completions.create(
                model=EXPLANATION_MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
            )
            return response.choices[0].message.content

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            return list(executor.map(process_one, prompts))

    @staticmethod
    def embedding_config() -> dict[str, str | None]:
        """Return connection settings for the embedding endpoint.

        Returns:
            Dict with ``base_url`` (``None`` if the env var is unset),
            ``api_key``, and ``model`` keys, suitable for constructing an
            OpenAI-compatible embeddings client.
        """
        return {
            "base_url": os.environ.get("MODAL_URL_LLM_EMBEDDING"),
            "api_key": os.environ.get("VLLM_API_KEY", "not-needed"),
            "model": EMBEDDING_MODEL,
        }