// MiniMind / android/app/Mind2Model.kt
// MiniMind Max2 - Efficient MoE Language Model
package com.minimind.mind2
import android.content.Context
import kotlinx.coroutines.*
import kotlinx.coroutines.channels.trySendBlocking
import kotlinx.coroutines.flow.*
import java.io.File
/**
 * MiniMind (Mind2) Model Interface.
 *
 * Kotlin wrapper around native llama.cpp inference exposed through JNI
 * (the `mind2` shared library, loaded in the companion `init`).
 *
 * Usage: obtain the process-wide singleton via [getInstance], call [load]
 * once, then [generate] / [generateStream] / [chat] / [chatStream]; call
 * [release] when done. All blocking native work runs on [Dispatchers.IO].
 */
class Mind2Model private constructor() {

    // Model state. @Volatile: these are written on Dispatchers.IO and read
    // from arbitrary caller threads; without it, readers may see stale values.
    @Volatile
    private var isLoaded = false

    @Volatile
    private var modelPath: String? = null

    /**
     * Sampling / generation parameters.
     *
     * NOTE(review): [repeatPenalty] and [stopTokens] are currently NOT
     * forwarded to the native layer — the JNI signatures ([nativeGenerate],
     * [nativeGenerateStream]) do not carry them. They are kept here for
     * interface stability until the native side accepts them.
     *
     * @param maxTokens     maximum number of tokens to generate
     * @param temperature   softmax temperature (higher = more random)
     * @param topP          nucleus-sampling probability mass cutoff
     * @param topK          top-k sampling cutoff
     * @param repeatPenalty repetition penalty (unused by native layer, see note)
     * @param stopTokens    stop sequences (unused by native layer, see note)
     */
    data class GenerationConfig(
        val maxTokens: Int = 256,
        val temperature: Float = 0.7f,
        val topP: Float = 0.9f,
        val topK: Int = 40,
        val repeatPenalty: Float = 1.1f,
        val stopTokens: List<String> = listOf("<|endoftext|>", "<|im_end|>")
    )

    /**
     * Load the model, copying it out of the APK assets (`models/<modelName>`)
     * into the app's private files dir on first use.
     *
     * The asset is copied to a temporary file and then renamed into place, so
     * an interrupted copy never leaves a truncated file that a later
     * `exists()` check would wrongly trust.
     *
     * @param context       any Context; its filesDir/assets are used
     * @param modelName     GGUF file name under `assets/models/`
     * @param contextLength native context window size in tokens
     * @param threads       inference thread count; 0 = auto
     * @return [Result.success] when the native runtime initialized, otherwise
     *         a failure carrying the cause
     */
    suspend fun load(
        context: Context,
        modelName: String = "mind2-lite.gguf",
        contextLength: Int = 2048,
        threads: Int = 0 // 0 = auto
    ): Result<Unit> = withContext(Dispatchers.IO) {
        try {
            val assetPath = "models/$modelName"
            val modelFile = File(context.filesDir, modelName)
            if (!modelFile.exists()) {
                // Copy from assets via temp file + atomic-ish rename.
                val tmpFile = File(context.filesDir, "$modelName.tmp")
                try {
                    context.assets.open(assetPath).use { input ->
                        tmpFile.outputStream().use { output ->
                            input.copyTo(output)
                        }
                    }
                    if (!tmpFile.renameTo(modelFile)) {
                        throw RuntimeException("Failed to move model into place: $modelFile")
                    }
                } finally {
                    tmpFile.delete() // no-op after a successful rename
                }
            }
            modelPath = modelFile.absolutePath
            if (nativeInit(modelFile.absolutePath, contextLength, threads)) {
                isLoaded = true
                Result.success(Unit)
            } else {
                Result.failure(RuntimeException("Failed to load model"))
            }
        } catch (e: CancellationException) {
            // Never swallow cancellation into a Result.
            throw e
        } catch (e: Exception) {
            Result.failure(e)
        }
    }

    /**
     * Generate text for [prompt] in one shot (non-streaming).
     *
     * @return the full completion, or a failure with
     *         [IllegalStateException] when the model is not loaded.
     */
    suspend fun generate(
        prompt: String,
        config: GenerationConfig = GenerationConfig()
    ): Result<String> = withContext(Dispatchers.IO) {
        if (!isLoaded) {
            return@withContext Result.failure(IllegalStateException("Model not loaded"))
        }
        try {
            Result.success(
                nativeGenerate(
                    prompt,
                    config.maxTokens,
                    config.temperature,
                    config.topP,
                    config.topK
                )
            )
        } catch (e: CancellationException) {
            // Never swallow cancellation into a Result.
            throw e
        } catch (e: Exception) {
            Result.failure(e)
        }
    }

    /**
     * Generate text for [prompt], emitting tokens as they are produced.
     *
     * The flow fails with [IllegalStateException] if the model is not loaded,
     * and completes when the native side reports completion. Cancelling the
     * collector stops the native generation via [stop].
     */
    fun generateStream(
        prompt: String,
        config: GenerationConfig = GenerationConfig()
    ): Flow<String> = callbackFlow {
        if (!isLoaded) {
            // Deliver the error to the collector instead of throwing out of
            // the builder.
            close(IllegalStateException("Model not loaded"))
            return@callbackFlow
        }
        val callback = object : TokenCallback {
            override fun onToken(token: String) {
                // Block the native thread rather than silently dropping
                // tokens when the collector is slower than generation
                // (trySend drops on a full buffer). If the channel is
                // already closed — collector went away — abort generation.
                if (trySendBlocking(token).isFailure) {
                    nativeStop()
                }
            }

            override fun onComplete() {
                channel.close()
            }
        }
        try {
            // Blocks this IO thread until generation completes; tokens are
            // delivered through the callback above.
            nativeGenerateStream(
                prompt,
                config.maxTokens,
                config.temperature,
                config.topP,
                config.topK,
                callback
            )
        } catch (e: CancellationException) {
            throw e
        } catch (e: Exception) {
            // Surface native failures to the collector instead of losing them.
            close(e)
        }
        awaitClose { stop() }
    }.flowOn(Dispatchers.IO)

    /**
     * One-shot chat turn: wraps [message] and [history] in the ChatML prompt
     * template and delegates to [generate].
     */
    suspend fun chat(
        message: String,
        history: List<ChatMessage> = emptyList(),
        config: GenerationConfig = GenerationConfig()
    ): Result<String> = generate(buildChatPrompt(message, history), config)

    /**
     * Streaming chat turn: wraps [message] and [history] in the ChatML prompt
     * template and delegates to [generateStream].
     */
    fun chatStream(
        message: String,
        history: List<ChatMessage> = emptyList(),
        config: GenerationConfig = GenerationConfig()
    ): Flow<String> = generateStream(buildChatPrompt(message, history), config)

    // Builds a ChatML-formatted prompt: fixed system message, then prior
    // turns, then the current user message, ending with an open assistant
    // turn for the model to complete.
    private fun buildChatPrompt(message: String, history: List<ChatMessage>): String =
        buildString {
            append("<|im_start|>system\n")
            append("You are Mind2, a helpful AI assistant running locally on this device.\n")
            append("<|im_end|>\n")
            for (msg in history) {
                append("<|im_start|>${msg.role}\n")
                append("${msg.content}\n")
                append("<|im_end|>\n")
            }
            append("<|im_start|>user\n")
            append("$message\n")
            append("<|im_end|>\n")
            append("<|im_start|>assistant\n")
        }

    /**
     * Request that any ongoing generation stops as soon as possible.
     */
    fun stop() {
        nativeStop()
    }

    /**
     * Release native resources and mark the model unloaded. Any in-flight
     * generation should be [stop]ped first.
     */
    fun release() {
        nativeRelease()
        isLoaded = false
        modelPath = null
    }

    /**
     * Human-readable model info string from the native layer.
     */
    fun getInfo(): String = nativeGetInfo()

    /**
     * Benchmark inference speed over [tokens] tokens.
     *
     * @return tokens per second as reported by the native layer
     * @throws IllegalStateException if the model is not loaded
     */
    suspend fun benchmark(tokens: Int = 100): Float = withContext(Dispatchers.IO) {
        // Guard like every other entry point; a native call on an unloaded
        // model can crash the process rather than throw.
        check(isLoaded) { "Model not loaded" }
        nativeBenchmark(tokens)
    }

    // ---- JNI surface (signatures must match the native `mind2` library) ----

    private external fun nativeInit(modelPath: String, nCtx: Int, nThreads: Int): Boolean
    private external fun nativeGenerate(
        prompt: String,
        maxTokens: Int,
        temperature: Float,
        topP: Float,
        topK: Int
    ): String
    private external fun nativeGenerateStream(
        prompt: String,
        maxTokens: Int,
        temperature: Float,
        topP: Float,
        topK: Int,
        callback: TokenCallback
    )
    private external fun nativeStop()
    private external fun nativeRelease()
    private external fun nativeGetInfo(): String
    private external fun nativeBenchmark(nTokens: Int): Float

    /** Callback invoked from the native thread during streaming generation. */
    interface TokenCallback {
        fun onToken(token: String)
        fun onComplete()
    }

    /** One prior conversation turn. */
    data class ChatMessage(
        val role: String, // "user" or "assistant"
        val content: String
    )

    companion object {
        init {
            System.loadLibrary("mind2")
        }

        // @Volatile is required for the double-checked locking below to be
        // correct: without it another thread may observe a partially
        // published instance.
        @Volatile
        private var instance: Mind2Model? = null

        /** Process-wide singleton accessor (thread-safe). */
        @JvmStatic
        fun getInstance(): Mind2Model {
            return instance ?: synchronized(this) {
                instance ?: Mind2Model().also { instance = it }
            }
        }
    }
}
/**
 * Convenience extension: fetch the shared [Mind2Model] singleton and load it
 * from this [Context]'s assets/files dir.
 *
 * @param modelName     GGUF file name under `assets/models/`
 * @param contextLength native context window size in tokens
 * @param threads       inference thread count; 0 = auto (new, defaulted —
 *                      existing call sites are unaffected)
 * @return the loaded singleton on success, or the load failure
 */
suspend fun Context.loadMind2Model(
    modelName: String = "mind2-lite.gguf",
    contextLength: Int = 2048,
    threads: Int = 0
): Result<Mind2Model> {
    val model = Mind2Model.getInstance()
    return model.load(this, modelName, contextLength, threads).map { model }
}