"""Clients for Modal-hosted LLM inference and embedding endpoints."""

import os
from concurrent.futures import ThreadPoolExecutor

from openai import OpenAI

# Model IDs for the Modal-hosted inference and embedding endpoints.
EXPLANATION_MODEL = "Qwen/Qwen3-4B-Instruct-2507"
EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-8B"


class ModalClient:
    """Helpers for calling OpenAI-compatible endpoints deployed on Modal."""

    @staticmethod
    def infer_llm(prompts: list[str], max_tokens: int = 800) -> list[str]:
        """Generate a chat completion for each prompt; replies come back in prompt order."""
        client = OpenAI(
            base_url=os.environ.get("MODAL_URL_LLM_INFERENCE"),
            # vLLM-style servers accept any key unless one was set at startup.
            api_key=os.environ.get("VLLM_API_KEY", "not-needed"),
        )
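        # If MODAL_URL_LLM_INFERENCE is unset, base_url is None and the client
        # silently falls back to the public OpenAI API default.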

        def process_one(prompt: str) -> str:
            response = client.chat.completions.create(
                model=EXPLANATION_MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
            )
            return response.choices[0].message.content

        # Fan the prompts out over worker threads; executor.map preserves input order.
        with ThreadPoolExecutor(max_workers=32) as executor:
            results = list(executor.map(process_one, prompts))

        return results

    @staticmethod
    def embedding_config() -> dict:
        """Connection settings for the Modal embedding endpoint (base_url, api_key, model)."""
        return {
            "base_url": os.environ.get("MODAL_URL_LLM_EMBEDDING"),
            "api_key": os.environ.get("VLLM_API_KEY", "not-needed"),
            "model": EMBEDDING_MODEL,
        }
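

# Minimal usage sketch (assumes the MODAL_URL_LLM_INFERENCE and
# MODAL_URL_LLM_EMBEDDING environment variables point at running
# OpenAI-compatible endpoints; the prompts are illustrative placeholders).
if __name__ == "__main__":
    # Batch chat completions: one reply per prompt, in order.
    replies = ModalClient.infer_llm(
        ["Explain overfitting in one sentence.", "What is a vector database?"],
        max_tokens=200,
    )
    for reply in replies:
        print(reply)

    # embedding_config() splits into OpenAI() kwargs plus the per-request model name.
    cfg = ModalClient.embedding_config()
    embed_client = OpenAI(base_url=cfg["base_url"], api_key=cfg["api_key"])
    result = embed_client.embeddings.create(
        model=cfg["model"],
        input=["Explain overfitting in one sentence."],
    )
    print(len(result.data[0].embedding))  # dimensionality of one embedding vector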