Spaces:
Sleeping
Sleeping
| import { streamText, smoothStream } from "ai"; | |
| import { Md5 } from "ts-md5"; | |
| import { toast } from "sonner"; | |
| import useModelProvider from "@/hooks/useAiProvider"; | |
| import { useKnowledgeStore } from "@/store/knowledge"; | |
| import { useTaskStore } from "@/store/task"; | |
| import { useSettingStore } from "@/store/setting"; | |
| import { | |
| parseDeepResearchPromptOverrides, | |
| resolveDeepResearchPromptTemplates, | |
| } from "@/constants/prompts"; | |
| import { jinaReader, localCrawler } from "@/utils/crawler"; | |
| import { fileParser } from "@/utils/parser"; | |
| import { getTextByteSize } from "@/utils/file"; | |
| import { | |
| splitText, | |
| containsXmlHtmlTags, | |
| ThinkTagStreamProcessor, | |
| } from "@/utils/text"; | |
| import { parseError } from "@/utils/error"; | |
| import { omit } from "radash"; | |
| const MAX_CHUNK_LENGTH = 10000; | |
| function smoothTextStream(type: "character" | "word" | "line") { | |
| return smoothStream({ | |
| chunking: type === "character" ? /./ : type, | |
| delayInMs: 0, | |
| }); | |
| } | |
| function handleError(error: unknown) { | |
| const errorMessage = parseError(error); | |
| toast.error(errorMessage); | |
| } | |
| function useKnowledge() { | |
| const { smoothTextStreamType } = useSettingStore(); | |
| const { createModelProvider, getModel } = useModelProvider(); | |
| const knowledgeStore = useKnowledgeStore(); | |
| function getRewritingPrompt() { | |
| try { | |
| const { deepResearchPromptOverrides } = useSettingStore.getState(); | |
| const promptOverrides = parseDeepResearchPromptOverrides( | |
| deepResearchPromptOverrides | |
| ); | |
| return resolveDeepResearchPromptTemplates(promptOverrides).rewritingPrompt; | |
| } catch { | |
| return resolveDeepResearchPromptTemplates().rewritingPrompt; | |
| } | |
| } | |
| function generateId( | |
| type: "file" | "url" | "knowledge", | |
| options?: { | |
| fileMeta?: FileMeta; | |
| url?: string; | |
| } | |
| ): string { | |
| if (type === "file" && options && options.fileMeta) { | |
| const { fileMeta } = options; | |
| const meta = `${fileMeta.name}::${fileMeta.size}::${fileMeta.type}::${fileMeta.lastModified}`; | |
| return Md5.hashStr(meta); | |
| } else if (type === "url" && options && options.url) { | |
| return Md5.hashStr( | |
| `${options.url}::${Date.now().toString().substring(0, 8)}` | |
| ); | |
| } else if (type === "knowledge") { | |
| return Md5.hashStr(`KNOWLEDGE::${Date.now()}`); | |
| } else { | |
| throw new Error("Parameter error"); | |
| } | |
| } | |
| async function getKnowledgeFromFile(file: File) { | |
| const { resources, addResource, updateResource } = useTaskStore.getState(); | |
| const fileMeta: FileMeta = { | |
| name: file.name, | |
| size: file.size, | |
| type: file.type, | |
| lastModified: file.lastModified, | |
| }; | |
| const id = generateId("file", { fileMeta }); | |
| const isExist = resources.find((item) => item.id === id); | |
| if (isExist) { | |
| return toast.message(`File already exist: ${file.name}`); | |
| } | |
| async function extractText(rid: string, title: string, text: string) { | |
| const { networkingModel } = getModel(); | |
| let content = ""; | |
| let reasoning = ""; | |
| const thinkTagStreamProcessor = new ThinkTagStreamProcessor(); | |
| const result = streamText({ | |
| model: await createModelProvider(networkingModel), | |
| prompt: text, | |
| system: getRewritingPrompt(), | |
| onFinish: () => { | |
| const currentTime = Date.now(); | |
| knowledgeStore.save({ | |
| id: rid, | |
| title, | |
| content, | |
| type: "file", | |
| fileMeta, | |
| createdAt: currentTime, | |
| updatedAt: currentTime, | |
| }); | |
| }, | |
| experimental_transform: smoothTextStream(smoothTextStreamType), | |
| onError: (err) => { | |
| updateResource(id, { status: "failed" }); | |
| handleError(err); | |
| }, | |
| }); | |
| for await (const part of result.fullStream) { | |
| if (part.type === "text-delta") { | |
| thinkTagStreamProcessor.processChunk( | |
| part.textDelta, | |
| (data) => { | |
| content += data; | |
| }, | |
| (data) => { | |
| reasoning += data; | |
| } | |
| ); | |
| } else if (part.type === "reasoning") { | |
| reasoning += part.textDelta; | |
| } | |
| } | |
| if (reasoning) console.log(reasoning); | |
| return content; | |
| } | |
| try { | |
| if (knowledgeStore.exist(id)) { | |
| const knowledge = knowledgeStore.get(id); | |
| if (knowledge) { | |
| addResource({ | |
| id, | |
| name: knowledge.title, | |
| type: knowledge.type, | |
| size: getTextByteSize(knowledge.content), | |
| status: "completed", | |
| }); | |
| } | |
| } else { | |
| addResource({ | |
| ...omit(fileMeta, ["lastModified"]), | |
| id, | |
| status: "processing", | |
| }); | |
| const text = await fileParser(file); | |
| if (text.length > MAX_CHUNK_LENGTH || !file.type.startsWith("text/")) { | |
| const chunks = splitText(text, MAX_CHUNK_LENGTH); | |
| for (const idx in chunks) { | |
| const chunk = chunks[idx]; | |
| const index = Number(idx); | |
| let rid = id; | |
| const names = fileMeta.name.split("."); | |
| const filename = `${names[0]}-${index + 1}.${names[1] || "txt"}`; | |
| if (index > 0) { | |
| rid = `${id}_${index}`; | |
| addResource({ | |
| ...omit(fileMeta, ["lastModified"]), | |
| id: rid, | |
| name: filename, | |
| size: getTextByteSize(chunk), | |
| status: "processing", | |
| }); | |
| } else { | |
| updateResource(rid, { | |
| name: filename, | |
| size: getTextByteSize(chunk), | |
| status: "processing", | |
| }); | |
| } | |
| let content = ""; | |
| if (containsXmlHtmlTags(chunk)) { | |
| content = await extractText(rid, filename, chunk); | |
| } else { | |
| content = chunk; | |
| // Save to knowledge store for non-XML/HTML content | |
| const currentTime = Date.now(); | |
| knowledgeStore.save({ | |
| id: rid, | |
| title: filename, | |
| content, | |
| type: "file", | |
| fileMeta, | |
| createdAt: currentTime, | |
| updatedAt: currentTime, | |
| }); | |
| } | |
| updateResource(rid, { | |
| name: filename, | |
| size: getTextByteSize(content), | |
| status: "completed", | |
| }); | |
| } | |
| } else { | |
| knowledgeStore.save({ | |
| id, | |
| title: fileMeta.name, | |
| content: text, | |
| type: "file", | |
| fileMeta, | |
| createdAt: Date.now(), | |
| updatedAt: Date.now(), | |
| }); | |
| updateResource(id, { | |
| size: getTextByteSize(text), | |
| status: "completed", | |
| }); | |
| } | |
| } | |
| } catch (err) { | |
| if (err instanceof Error) { | |
| updateResource(id, { status: "failed" }); | |
| toast.error(err.message); | |
| } else { | |
| toast.error("File parsing failed"); | |
| } | |
| } | |
| } | |
| async function getKnowledgeFromUrl(url: string, crawler: string) { | |
| const knowledgeStore = useKnowledgeStore.getState(); | |
| const { resources, addResource, updateResource } = useTaskStore.getState(); | |
| const id = generateId("url", { url }); | |
| const isExist = resources.find((item) => item.id === id); | |
| if (isExist) { | |
| return toast.message(`Url already loaded: ${url}`); | |
| } | |
| try { | |
| if (knowledgeStore.exist(id)) { | |
| const knowledge = knowledgeStore.get(id); | |
| if (knowledge) { | |
| addResource({ | |
| id, | |
| name: url, | |
| type: "url", | |
| size: getTextByteSize(knowledge.content), | |
| status: "completed", | |
| }); | |
| } | |
| } else { | |
| addResource({ | |
| id, | |
| name: url, | |
| type: "url", | |
| size: 0, | |
| status: "processing", | |
| }); | |
| if (crawler === "jina") { | |
| const result = await jinaReader(url); | |
| const currentTime = Date.now(); | |
| knowledgeStore.save({ | |
| id, | |
| title: result.title, | |
| content: result.content, | |
| type: "url", | |
| url, | |
| createdAt: currentTime, | |
| updatedAt: currentTime, | |
| }); | |
| updateResource(id, { | |
| size: getTextByteSize(result.content), | |
| status: "completed", | |
| }); | |
| } else if (crawler === "local") { | |
| const { networkingModel } = getModel(); | |
| const { accessPassword } = useSettingStore.getState(); | |
| const result = await localCrawler(url, accessPassword); | |
| let content = ""; | |
| const stream = streamText({ | |
| model: await createModelProvider(networkingModel), | |
| prompt: result.content, | |
| system: getRewritingPrompt(), | |
| onFinish: () => { | |
| const currentTime = Date.now(); | |
| knowledgeStore.save({ | |
| id, | |
| title: result.title, | |
| content, | |
| type: "url", | |
| url, | |
| createdAt: currentTime, | |
| updatedAt: currentTime, | |
| }); | |
| }, | |
| experimental_transform: smoothTextStream(smoothTextStreamType), | |
| onError: (err) => { | |
| updateResource(id, { status: "failed" }); | |
| handleError(err); | |
| }, | |
| }); | |
| for await (const textPart of stream.textStream) { | |
| content += textPart; | |
| } | |
| updateResource(id, { | |
| size: getTextByteSize(content), | |
| status: "completed", | |
| }); | |
| } else { | |
| throw new Error(`Unknown crawler: ${crawler}`); | |
| } | |
| } | |
| } catch (err) { | |
| if (err instanceof Error) { | |
| updateResource(id, { status: "failed" }); | |
| return toast.error(err.message); | |
| } else { | |
| toast.error("Url parsing failed"); | |
| } | |
| } | |
| } | |
| return { | |
| generateId, | |
| getKnowledgeFromFile, | |
| getKnowledgeFromUrl, | |
| }; | |
| } | |
| export default useKnowledge; | |