Spaces:

snsmcy
/

UI

Sleeping

File size: 2,953 Bytes

a94ab76

/**
 * Backend-based inference using FastAPI (Python) for local GPU inference
 * This allows using the full Turkish fine-tuned models with PEFT/LoRA adapters
 * 
 * Backend: backend/python_backend/main.py
 * - FastAPI server
 * - PyTorch + Transformers for model loading
 * - PEFT for LoRA adapters (Gemma models)
 * - Direct loading for Qwen/Llama merged models
 */

// Base URL of the inference backend. Read from the `VITE_BACKEND_URL` entry of
// `import.meta.env`; falls back to http://localhost:3000 when that entry is
// unset or empty (`||` treats every falsy value as missing).
const BACKEND_URL = import.meta.env.VITE_BACKEND_URL || 'http://localhost:3000';

/**
 * Options accepted by `generateResponse`.
 *
 * The optional generation settings fall back to the defaults applied inside
 * `generateResponse` when omitted.
 */
interface GenerateResponseParams {
    /** Identifier of the model to run; forwarded to the backend unchanged. */
    modelPath: string;
    /** System prompt forwarded to the backend unchanged. */
    systemPrompt: string;
    /** User message forwarded to the backend unchanged. */
    userInput: string;
    /**
     * Optional image input. NOTE: `generateResponse` currently does not read
     * this field or send it to the backend (image support is still a TODO).
     */
    image?: string;
    /** Forwarded as `temperature`; `generateResponse` defaults it to 0.7. */
    temperature?: number;
    /** Forwarded as `maxTokens`; `generateResponse` defaults it to 512. */
    maxTokens?: number;
    /** Forwarded as `topP`; `generateResponse` defaults it to 0.95. */
    topP?: number;
    /** Forwarded as `topK`; `generateResponse` defaults it to 50. */
    topK?: number;
    /** Receives the generated text; called once with the full response. */
    onToken?: (content: string) => void;
    /**
     * Receives progress objects carrying a `status` of `'loading'`,
     * `'generating'`, `'done'` or `'error'` (the latter with an `error` field).
     */
    onProgress?: (progress: any) => void;
}

/**
 * Run one non-streaming inference request against the backend.
 *
 * POSTs the prompts and generation settings to `/api/inference/generate` and
 * hands the complete generated text to `onToken` in a single call. Progress is
 * reported through `onProgress` as `loading` -> `generating` -> `done`, or as
 * `error` (with the thrown value) when anything fails.
 *
 * `params.image` is accepted but not yet sent to the backend.
 *
 * @param params - Model identifier, prompts, optional generation settings and
 *   optional callbacks.
 * @throws Error when the backend answers with a non-2xx status, reports a
 *   falsy `success`, or returns no string `response`. Errors raised by `fetch`
 *   or by JSON parsing are logged and rethrown unchanged.
 */
export async function generateResponse(params: GenerateResponseParams): Promise<void> {
    const {
        modelPath,
        systemPrompt,
        userInput,
        // image, // TODO: Implement image support for multimodal models
        temperature = 0.7,
        maxTokens = 512,
        topP = 0.95,
        topK = 50,
        onToken,
        onProgress
    } = params;

    try {
        // Notify progress: loading model
        onProgress?.({ status: 'loading', progress: 0 });
        console.log(`Starting inference with model: ${modelPath}`);

        // Call backend API
        const response = await fetch(`${BACKEND_URL}/api/inference/generate`, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({
                modelPath,
                systemPrompt,
                userInput,
                temperature,
                maxTokens,
                topP,
                topK,
            }),
        });

        if (!response.ok) {
            throw new Error(await describeBackendFailure(response));
        }

        // Treat the payload as untrusted until its shape has been checked.
        const result: unknown = await response.json();

        if (!isJsonObject(result) || !result.success) {
            const reason =
                isJsonObject(result) && typeof result.error === 'string' && result.error !== ''
                    ? result.error
                    : 'Inference failed';
            throw new Error(reason);
        }

        if (typeof result.response !== 'string') {
            // Without this guard `onToken` would be invoked with `undefined`.
            throw new Error('Backend reported success but returned no response text');
        }

        // The request is a single blocking call, so 'generating' is only
        // reported once the full text has already arrived.
        onProgress?.({ status: 'generating' });

        // Send the response via onToken callback
        onToken?.(result.response);

        onProgress?.({ status: 'done' });
    } catch (error) {
        console.error('Error in generateResponse:', error);
        onProgress?.({ status: 'error', error });
        throw error;
    }
}

/** Narrow an unknown JSON value to a plain (non-array) object. */
function isJsonObject(value: unknown): value is Record<string, unknown> {
    return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/**
 * Build the error message for a non-2xx backend response.
 *
 * Always includes the numeric status code, because `statusText` may be empty.
 * If the body is JSON with an `error` or `detail` field (the latter is
 * what FastAPI emits for HTTP errors), that text is appended.
 */
async function describeBackendFailure(response: Response): Promise<string> {
    const statusLine = `${response.status} ${response.statusText}`.trim();
    let detail = '';

    try {
        const parsed: unknown = JSON.parse(await response.text());
        if (isJsonObject(parsed)) {
            const reason = parsed.error ?? parsed.detail;
            if (typeof reason === 'string') {
                detail = reason;
            } else if (reason != null) {
                // FastAPI validation errors carry a structured `detail` array.
                detail = JSON.stringify(reason);
            }
        }
    } catch {
        // Body missing, unreadable or not JSON: the status line must suffice.
    }

    return detail === ''
        ? `Backend error: ${statusLine}`
        : `Backend error: ${statusLine} - ${detail}`;
}

/**
 * Probe the backend's `/api/inference/health` endpoint.
 *
 * Never throws: every failure is folded into a `{ healthy: false, ... }`
 * result. The returned object always has a boolean `healthy` and a string
 * `message`, whatever the backend actually sent.
 *
 * @returns `healthy: true` only when the HTTP status is 2xx AND the body
 *   reports `healthy: true`. `message` is the backend's own message when it
 *   provides one, otherwise a description of what went wrong.
 */
export async function checkBackendHealth(): Promise<{ healthy: boolean; message: string }> {
    let response: Response;
    try {
        response = await fetch(`${BACKEND_URL}/api/inference/health`);
    } catch {
        // fetch rejected: the backend could not be reached.
        return {
            healthy: false,
            message: `Cannot connect to backend at ${BACKEND_URL}. Make sure the backend is running.`,
        };
    }

    // Parse the body even for non-2xx answers: the backend may explain the
    // failure in `message`.
    let payload: unknown;
    try {
        payload = await response.json();
    } catch {
        payload = undefined;
    }

    const report: Record<string, unknown> =
        typeof payload === 'object' && payload !== null && !Array.isArray(payload)
            ? (payload as Record<string, unknown>)
            : {};

    const healthy = response.ok && report.healthy === true;

    let message: string;
    if (typeof report.message === 'string') {
        message = report.message;
    } else if (!response.ok) {
        message = `Backend health check failed with HTTP ${response.status}.`;
    } else if (healthy) {
        message = 'Backend is healthy.';
    } else {
        message = `Backend at ${BACKEND_URL} returned an unexpected health response.`;
    }

    // Extra fields the backend may include are passed through, as before.
    return { ...report, healthy, message };
}