import os
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor

# Define your models
EXPLANATION_MODEL = "Qwen/Qwen3-4B-Instruct-2507"
EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-8B"


class ModalClient:
    @staticmethod
    def infer_llm(prompts: list[str], max_tokens: int = 800) -> list[str]:
        """Run chat completions for a batch of prompts against the inference endpoint."""
        client = OpenAI(
            base_url=os.environ.get("MODAL_URL_LLM_INFERENCE"),
            api_key=os.environ.get("VLLM_API_KEY", "not-needed"),
        )

        def process_one(prompt: str) -> str:
            response = client.chat.completions.create(
                model=EXPLANATION_MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
            )
            return response.choices[0].message.content

        # Fan the prompts out across a thread pool; executor.map preserves
        # input order, so results[i] corresponds to prompts[i].
        with ThreadPoolExecutor(max_workers=32) as executor:
            results = list(executor.map(process_one, prompts))
        return results

    @staticmethod
    def embedding_config() -> dict:
        """Connection settings for an OpenAI-compatible embeddings client."""
        return {
            "base_url": os.environ.get("MODAL_URL_LLM_EMBEDDING"),
            "api_key": os.environ.get("VLLM_API_KEY", "not-needed"),
            "model": EMBEDDING_MODEL,
        }
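

# A minimal usage sketch (an assumption, not part of the original module):
# it presumes the MODAL_URL_LLM_INFERENCE, MODAL_URL_LLM_EMBEDDING, and
# VLLM_API_KEY environment variables point at a running deployment, and that
# the embedding endpoint speaks the OpenAI embeddings API.
if __name__ == "__main__":
    # Batch inference: one answer per prompt, in order.
    answers = ModalClient.infer_llm(
        ["Explain what an embedding is in one sentence."]
    )
    print(answers[0])

    # embedding_config() returns connection kwargs plus the model name;
    # the model is passed to the embeddings call itself, not the client.
    config = ModalClient.embedding_config()
    embed_client = OpenAI(base_url=config["base_url"], api_key=config["api_key"])
    response = embed_client.embeddings.create(
        model=config["model"],
        input=["hello world"],
    )
    print(len(response.data[0].embedding))  # dimensionality of the vector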