File size: 2,953 Bytes
a94ab76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
/**
 * Backend-based inference using FastAPI (Python) for local GPU inference
 * This allows using the full Turkish fine-tuned models with PEFT/LoRA adapters
 * 
 * Backend: backend/python_backend/main.py
 * - FastAPI server
 * - PyTorch + Transformers for model loading
 * - PEFT for LoRA adapters (Gemma models)
 * - Direct loading for Qwen/Llama merged models
 */

// Backend API configuration
// Base URL that every request in this module is prefixed with. It is read from
// `import.meta.env.VITE_BACKEND_URL`; because the fallback uses `||`, any falsy
// value (missing or empty string) selects 'http://localhost:3000' instead.
// NOTE(review): presumably the variable is injected by the bundler at build
// time — confirm before restructuring this expression.
const BACKEND_URL = import.meta.env.VITE_BACKEND_URL || 'http://localhost:3000';

/**
 * Options accepted by `generateResponse`.
 *
 * The three required strings and the four numeric settings are serialised into
 * the JSON body of the generate request; the callbacks stay on the client.
 */
interface GenerateResponseParams {
    /** Sent to the backend as `modelPath`; also written to the console log. */
    modelPath: string;
    /** Sent to the backend as `systemPrompt`. */
    systemPrompt: string;
    /** Sent to the backend as `userInput`. */
    userInput: string;
    /**
     * Accepted but currently unused: `generateResponse` has its destructuring
     * commented out pending multimodal support, so it is not sent.
     */
    image?: string;
    /** Forwarded unchanged; `generateResponse` uses 0.7 when omitted. */
    temperature?: number;
    /** Forwarded unchanged; `generateResponse` uses 512 when omitted. */
    maxTokens?: number;
    /** Forwarded unchanged; `generateResponse` uses 0.95 when omitted. */
    topP?: number;
    /** Forwarded unchanged; `generateResponse` uses 50 when omitted. */
    topK?: number;
    /** Receives the `response` field of a successful backend reply. */
    onToken?: (content: string) => void;
    /**
     * Receives status objects from `generateResponse`. Their `status` is one of
     * 'loading' (with `progress: 0`), 'generating', 'done', or 'error' (with the
     * caught value under `error`).
     */
    onProgress?: (progress: any) => void;
}

/**
 * Run a single, non-streaming inference request against the backend.
 *
 * POSTs the prompts and sampling settings as JSON to
 * `${BACKEND_URL}/api/inference/generate`, waits for the complete reply and
 * hands the generated text to `onToken` in one call.
 *
 * Progress is reported through `onProgress` as objects with a `status` field:
 * `loading` (with `progress: 0`) before the request, `generating` once a
 * successful reply has been parsed, `done` after `onToken` ran, and `error`
 * (with the caught value under `error`) on any failure.
 *
 * Note: `params.image` is part of the parameter type but is not sent yet.
 *
 * @param params - Model path, prompts, optional sampling settings and callbacks.
 * @returns A promise that resolves once the reply was delivered to `onToken`.
 * @throws Error when the HTTP status is not OK, when the reply does not report
 *   `success`, or when it carries no response text. Failures raised by `fetch`
 *   or by JSON parsing are rethrown unchanged after being reported.
 */
export async function generateResponse(params: GenerateResponseParams): Promise<void> {
    const {
        modelPath,
        systemPrompt,
        userInput,
        // image, // TODO: Implement image support for multimodal models
        temperature = 0.7,
        maxTokens = 512,
        topP = 0.95,
        topK = 50,
        onToken,
        onProgress
    } = params;

    try {
        // Notify progress: loading model
        onProgress?.({ status: 'loading', progress: 0 });
        console.log(`Starting inference with model: ${modelPath}`);

        // Call backend API
        const response = await fetch(`${BACKEND_URL}/api/inference/generate`, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                modelPath,
                systemPrompt,
                userInput,
                temperature,
                maxTokens,
                topP,
                topK,
            }),
        });

        if (!response.ok) {
            // statusText is empty over HTTP/2, so the numeric code is always included
            throw new Error(`Backend error: ${response.status} ${response.statusText}`.trim());
        }

        // Only the fields read below are described; every one is treated as optional
        // because the reply is external input.
        const result: { success?: boolean; error?: string; response?: unknown } | null =
            await response.json();

        // Optional chaining keeps a `null` JSON body from surfacing as a raw TypeError
        if (!result?.success) {
            throw new Error(result?.error || 'Inference failed');
        }

        // Never hand a non-string to onToken, whose contract is (content: string)
        if (typeof result.response !== 'string') {
            throw new Error('Backend returned no response text');
        }

        // Update progress: generating
        onProgress?.({ status: 'generating' });

        // Send the response via onToken callback
        onToken?.(result.response);

        onProgress?.({ status: 'done' });
    } catch (error) {
        console.error('Error in generateResponse:', error);
        onProgress?.({ status: 'error', error });
        throw error;
    }
}

/**
 * Probe the backend's health endpoint.
 *
 * Sends a GET request to `${BACKEND_URL}/api/inference/health` and parses the
 * reply as JSON. A reply that already has a boolean `healthy` and a string
 * `message` is returned as-is. Any other JSON (for example an error payload
 * from a non-OK response) is normalised so the declared result shape always
 * holds: `healthy` is true only if the reply said exactly `true`, and a
 * missing `message` is replaced by a note containing the HTTP status.
 *
 * @returns The health report. This function never rejects: when the request
 *   or the JSON parsing throws, it resolves to `healthy: false` with a hint
 *   that the backend could not be reached.
 */
export async function checkBackendHealth(): Promise<{ healthy: boolean; message: string }> {
    try {
        const response = await fetch(`${BACKEND_URL}/api/inference/health`);
        const result = await response.json();

        // Well-formed report: pass through untouched (extra fields included)
        if (typeof result?.healthy === 'boolean' && typeof result.message === 'string') {
            return result;
        }

        // statusText may be empty (HTTP/2), hence the trim
        const statusLine = `${response.status} ${response.statusText}`.trim();
        return {
            healthy: result?.healthy === true,
            message: typeof result?.message === 'string'
                ? result.message
                : `Backend responded with HTTP ${statusLine} but sent no health message.`,
        };
    } catch {
        return {
            healthy: false,
            message: `Cannot connect to backend at ${BACKEND_URL}. Make sure the backend is running.`,
        };
    }
}