File size: 1,237 Bytes
933c2fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import os
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor

# Model identifiers served by the Modal-hosted vLLM endpoints.
EXPLANATION_MODEL = "Qwen/Qwen3-4B-Instruct-2507"  # chat model used by ModalClient.infer_llm
EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-8B"  # embedding model reported by ModalClient.embedding_config

class ModalClient:
    """Thin client for Modal-hosted vLLM endpoints (chat completions + embeddings).

    Both methods read their endpoint URLs and API key from environment
    variables; the key falls back to a placeholder since the server may not
    enforce authentication.
    """

    @staticmethod
    def infer_llm(prompts: list[str], max_tokens: int = 800) -> list[str]:
        """Run a chat completion for each prompt concurrently.

        Args:
            prompts: User messages, one request per entry.
            max_tokens: Completion-length cap passed to each request.

        Returns:
            Completion texts in the same order as ``prompts`` (``executor.map``
            preserves input order). NOTE(review): ``message.content`` can be
            ``None`` per the OpenAI SDK types — callers should tolerate that.
        """
        # Short-circuit: no prompts means no client, no thread pool.
        if not prompts:
            return []

        client = OpenAI(
            base_url=os.environ.get("MODAL_URL_LLM_INFERENCE"),
            api_key=os.environ.get("VLLM_API_KEY", "not-needed"),
        )

        def process_one(prompt: str) -> str:
            response = client.chat.completions.create(
                model=EXPLANATION_MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
            )
            return response.choices[0].message.content

        # Cap the pool at the number of prompts so short batches don't spawn
        # 32 idle threads; 32 remains the upper bound for large batches.
        with ThreadPoolExecutor(max_workers=min(32, len(prompts))) as executor:
            results = list(executor.map(process_one, prompts))

        return results

    @staticmethod
    def embedding_config() -> dict[str, str]:
        """Return connection settings for the embedding endpoint.

        Returns:
            Dict with ``base_url``, ``api_key``, and ``model`` keys, suitable
            for constructing an embeddings client. ``base_url`` is ``None``
            when ``MODAL_URL_LLM_EMBEDDING`` is unset.
        """
        return {
            "base_url": os.environ.get("MODAL_URL_LLM_EMBEDDING"),
            "api_key": os.environ.get("VLLM_API_KEY", "not-needed"),
            "model": EMBEDDING_MODEL,
        }