lex-interviewer-chat

Sleeping

App Files Files Community

lex-interviewer-chat / src /hooks /LLMProvider.tsx

bobber

Fix cache: remove cache_dir override so env.useBrowserCache=true actually works

ba30678 verified about 2 months ago

raw

history blame contribute delete

8.91 kB

	import {
	useRef,
	useState,
	useEffect,
	useCallback,
	type ReactNode,
	} from "react";
	import {
	pipeline,
	env,
	TextStreamer,
	InterruptableStoppingCriteria,
	type TextGenerationPipeline,
	} from "@huggingface/transformers";

	// Enable browser Cache API so ONNX files persist across sessions
	env.useBrowserCache = true;
	env.allowLocalModels = false;
	import {
	LLMContext,
	createMessageId,
	type ChatMessage,
	type LoadingStatus,
	type ThinkingMode,
	} from "./LLMContext";
	import { ThinkStreamParser, type ThinkDelta } from "../utils/think-parser";
	import { MODEL_CONFIG } from "../model-config";

	interface LLMProviderProps {
	modelId: string;
	children: ReactNode;
	onReady?: () => void;
	}

	function applyDeltas(msg: ChatMessage, deltas: ThinkDelta[]): ChatMessage {
	let { content, reasoning = "" } = msg;
	for (const delta of deltas) {
	if (delta.type === "reasoning") {
	reasoning += delta.textDelta;
	} else {
	content += delta.textDelta;
	}
	}
	return { ...msg, content, reasoning };
	}

	export function LLMProvider({ modelId, children, onReady }: LLMProviderProps) {
	const generatorRef = useRef<Promise<TextGenerationPipeline> \| null>(null);
	const stoppingCriteria = useRef(new InterruptableStoppingCriteria());

	const [status, setStatus] = useState<LoadingStatus>({ state: "idle" });
	const [messages, setMessages] = useState<ChatMessage[]>([]);
	const messagesRef = useRef<ChatMessage[]>([]);
	const [isGenerating, setIsGenerating] = useState(false);
	const isGeneratingRef = useRef(false);
	const [tps, setTps] = useState(0);
	const [thinkingMode, setThinkingMode] = useState<ThinkingMode>("enabled");
	const thinkingModeRef = useRef<ThinkingMode>("enabled");
	const [systemPrompt, setSystemPrompt] = useState(MODEL_CONFIG.defaultSystemPrompt);
	const systemPromptRef = useRef(MODEL_CONFIG.defaultSystemPrompt);

	useEffect(() => {
	messagesRef.current = messages;
	}, [messages]);

	useEffect(() => {
	isGeneratingRef.current = isGenerating;
	}, [isGenerating]);

	useEffect(() => {
	thinkingModeRef.current = thinkingMode;
	}, [thinkingMode]);

	useEffect(() => {
	systemPromptRef.current = systemPrompt;
	}, [systemPrompt]);

	const onReadyRef = useRef(onReady);
	onReadyRef.current = onReady;

	useEffect(() => {
	if (status.state === "ready") onReadyRef.current?.();
	}, [status.state]);

	useEffect(() => {
	if (generatorRef.current) return;

	generatorRef.current = (async () => {
	setStatus({ state: "loading", message: "Downloading model…" });
	try {
	const gen = await pipeline("text-generation", modelId, {
	dtype: "q4",
	device: "webgpu",
	progress_callback: (progress: Record<string, unknown>) => {
	const status = progress.status as string;
	if (status === "progress") {
	const loaded = Number(progress.loaded ?? 0);
	const total = Number(progress.total ?? 1);
	const pct = Math.round((loaded / total) * 100);
	const file = String(progress.file ?? "").split("/").pop() ?? "";
	const loadedMB = (loaded / 1048576).toFixed(0);
	const totalMB = (total / 1048576).toFixed(0);
	setStatus({
	state: "loading",
	progress: pct,
	message: `Downloading ${file}… ${loadedMB}/${totalMB} MB (${pct}%)`,
	});
	} else if (status === "ready") {
	const file = String(progress.file ?? "").split("/").pop() ?? "";
	console.log(`[cache] ${file}: loaded (cached or downloaded)`);
	} else if (status === "initiate") {
	const file = String(progress.file ?? "").split("/").pop() ?? "";
	setStatus({
	state: "loading",
	message: `Loading ${file}…`,
	});
	}
	},
	});
	setStatus({ state: "ready" });
	return gen;
	} catch (err) {
	const msg = err instanceof Error ? err.message : String(err);
	setStatus({ state: "error", error: msg });
	generatorRef.current = null;
	throw err;
	}
	})();
	}, [modelId]);

	const runGeneration = useCallback(async (chatHistory: ChatMessage[]) => {
	const generator = await generatorRef.current!;
	setIsGenerating(true);
	setTps(0);
	stoppingCriteria.current.reset();

	const parser = new ThinkStreamParser();
	let tokenCount = 0;
	let firstTokenTime = 0;
	let isFirstChunk = true;

	const assistantIdx = chatHistory.length;
	setMessages((prev) => [
	...prev,
	{ id: createMessageId(), role: "assistant", content: "", reasoning: "" },
	]);

	const streamer = new TextStreamer(generator.tokenizer, {
	skip_prompt: true,
	skip_special_tokens: false,
	callback_function: (output: string) => {
	console.log("Streamed output:", output);
	if (!output \|\| output === "<\|im_end\|>") return;

	let textToPush = output;
	if (isFirstChunk && thinkingModeRef.current === "enabled") {
	textToPush = "<think>" + output;
	}
	isFirstChunk = false;
	const deltas = parser.push(textToPush);
	if (deltas.length === 0) return;

	setMessages((prev) => {
	const updated = [...prev];
	updated[assistantIdx] = applyDeltas(updated[assistantIdx], deltas);
	return updated;
	});
	},
	token_callback_function: () => {
	tokenCount++;
	if (tokenCount === 1) {
	firstTokenTime = performance.now();
	} else {
	const elapsed = (performance.now() - firstTokenTime) / 1000;
	if (elapsed > 0) {
	setTps(Math.round(((tokenCount - 1) / elapsed) * 10) / 10);
	}
	}
	},
	});

	try {
	const currentSystemPrompt = systemPromptRef.current;
	const messagesForModel = [
	...(currentSystemPrompt
	? [{ role: "system" as const, content: currentSystemPrompt }]
	: []),
	...chatHistory.map((message) => ({
	role: message.role,
	content: message.content,
	})),
	];
	await generator(
	messagesForModel,
	{
	max_new_tokens: 4096,
	do_sample: true,
	streamer,
	stopping_criteria: stoppingCriteria.current,
	tokenizer_encode_kwargs: {
	enable_thinking: thinkingModeRef.current === "enabled",
	},
	},
	);
	} catch (err) {
	console.error("Generation error:", err);
	}

	const remaining = parser.flush();
	if (remaining.length > 0) {
	setMessages((prev) => {
	const updated = [...prev];
	updated[assistantIdx] = applyDeltas(updated[assistantIdx], remaining);
	return updated;
	});
	}

	setMessages((prev) => {
	const updated = [...prev];
	updated[assistantIdx] = {
	...updated[assistantIdx],
	content: parser.content.trim() \|\| prev[assistantIdx].content,
	reasoning: parser.reasoning.trim() \|\| prev[assistantIdx].reasoning,
	};
	return updated;
	});

	setIsGenerating(false);
	}, []);

	const send = useCallback(
	(text: string) => {
	if (!generatorRef.current \|\| isGeneratingRef.current) return;

	const userMsg: ChatMessage = {
	id: createMessageId(),
	role: "user",
	content: text,
	};

	setMessages((prev) => [...prev, userMsg]);
	runGeneration([...messagesRef.current, userMsg]);
	},
	[runGeneration],
	);

	const stop = useCallback(() => {
	stoppingCriteria.current.interrupt();
	}, []);

	const clearChat = useCallback(() => {
	if (isGeneratingRef.current) return;
	setMessages([]);
	}, []);

	const editMessage = useCallback(
	(index: number, newContent: string) => {
	if (isGeneratingRef.current) return;

	const updatedHistory = [
	...messagesRef.current.slice(0, index),
	{ ...messagesRef.current[index], content: newContent },
	];

	setMessages(updatedHistory);

	if (messagesRef.current[index]?.role === "user") {
	setTimeout(() => runGeneration(updatedHistory), 0);
	}
	},
	[runGeneration],
	);

	const retryMessage = useCallback(
	(index: number) => {
	if (isGeneratingRef.current) return;

	const history = messagesRef.current.slice(0, index);
	setMessages(history);
	setTimeout(() => runGeneration(history), 0);
	},
	[runGeneration],
	);

	return (
	<LLMContext.Provider
	value={{
	status,
	messages,
	isGenerating,
	tps,
	thinkingMode,
	setThinkingMode,
	systemPrompt,
	setSystemPrompt,
	send,
	stop,
	clearChat,
	editMessage,
	retryMessage,
	}}
	>
	{children}
	</LLMContext.Provider>
	);
	}