|
|
import { streamText, smoothStream } from "ai"; |
|
|
import { Md5 } from "ts-md5"; |
|
|
import { toast } from "sonner"; |
|
|
import useModelProvider from "@/hooks/useAiProvider"; |
|
|
import { useKnowledgeStore } from "@/store/knowledge"; |
|
|
import { useTaskStore } from "@/store/task"; |
|
|
import { useSettingStore } from "@/store/setting"; |
|
|
import { rewritingPrompt } from "@/constants/prompts"; |
|
|
import { jinaReader, localCrawler } from "@/utils/crawler"; |
|
|
import { fileParser } from "@/utils/parser"; |
|
|
import { getTextByteSize } from "@/utils/file"; |
|
|
import { |
|
|
splitText, |
|
|
containsXmlHtmlTags, |
|
|
ThinkTagStreamProcessor, |
|
|
} from "@/utils/text"; |
|
|
import { parseError } from "@/utils/error"; |
|
|
import { omit } from "radash"; |
|
|
|
|
|
const MAX_CHUNK_LENGTH = 10000; |
|
|
|
|
|
function smoothTextStream(type: "character" | "word" | "line") { |
|
|
return smoothStream({ |
|
|
chunking: type === "character" ? /./ : type, |
|
|
delayInMs: 0, |
|
|
}); |
|
|
} |
|
|
|
|
|
function handleError(error: unknown) { |
|
|
const errorMessage = parseError(error); |
|
|
toast.error(errorMessage); |
|
|
} |
|
|
|
|
|
function useKnowledge() { |
|
|
const { smoothTextStreamType } = useSettingStore(); |
|
|
const { createModelProvider, getModel } = useModelProvider(); |
|
|
const knowledgeStore = useKnowledgeStore(); |
|
|
|
|
|
function generateId( |
|
|
type: "file" | "url" | "knowledge", |
|
|
options?: { |
|
|
fileMeta?: FileMeta; |
|
|
url?: string; |
|
|
} |
|
|
): string { |
|
|
if (type === "file" && options && options.fileMeta) { |
|
|
const { fileMeta } = options; |
|
|
const meta = `${fileMeta.name}::${fileMeta.size}::${fileMeta.type}::${fileMeta.lastModified}`; |
|
|
return Md5.hashStr(meta); |
|
|
} else if (type === "url" && options && options.url) { |
|
|
return Md5.hashStr( |
|
|
`${options.url}::${Date.now().toString().substring(0, 8)}` |
|
|
); |
|
|
} else if (type === "knowledge") { |
|
|
return Md5.hashStr(`KNOWLEDGE::${Date.now()}`); |
|
|
} else { |
|
|
throw new Error("Parameter error"); |
|
|
} |
|
|
} |
|
|
|
|
|
async function getKnowledgeFromFile(file: File) { |
|
|
const { resources, addResource, updateResource } = useTaskStore.getState(); |
|
|
|
|
|
const fileMeta: FileMeta = { |
|
|
name: file.name, |
|
|
size: file.size, |
|
|
type: file.type, |
|
|
lastModified: file.lastModified, |
|
|
}; |
|
|
const id = generateId("file", { fileMeta }); |
|
|
const isExist = resources.find((item) => item.id === id); |
|
|
if (isExist) { |
|
|
return toast.message(`File already exist: ${file.name}`); |
|
|
} |
|
|
|
|
|
async function extractText(rid: string, title: string, text: string) { |
|
|
const { networkingModel } = getModel(); |
|
|
|
|
|
let content = ""; |
|
|
let reasoning = ""; |
|
|
const thinkTagStreamProcessor = new ThinkTagStreamProcessor(); |
|
|
const result = streamText({ |
|
|
model: await createModelProvider(networkingModel), |
|
|
prompt: text, |
|
|
system: rewritingPrompt, |
|
|
onFinish: () => { |
|
|
const currentTime = Date.now(); |
|
|
knowledgeStore.save({ |
|
|
id: rid, |
|
|
title, |
|
|
content, |
|
|
type: "file", |
|
|
fileMeta, |
|
|
createdAt: currentTime, |
|
|
updatedAt: currentTime, |
|
|
}); |
|
|
}, |
|
|
experimental_transform: smoothTextStream(smoothTextStreamType), |
|
|
onError: (err) => { |
|
|
updateResource(id, { status: "failed" }); |
|
|
handleError(err); |
|
|
}, |
|
|
}); |
|
|
for await (const part of result.fullStream) { |
|
|
if (part.type === "text-delta") { |
|
|
thinkTagStreamProcessor.processChunk( |
|
|
part.textDelta, |
|
|
(data) => { |
|
|
content += data; |
|
|
}, |
|
|
(data) => { |
|
|
reasoning += data; |
|
|
} |
|
|
); |
|
|
} else if (part.type === "reasoning") { |
|
|
reasoning += part.textDelta; |
|
|
} |
|
|
} |
|
|
if (reasoning) console.log(reasoning); |
|
|
return content; |
|
|
} |
|
|
|
|
|
try { |
|
|
if (knowledgeStore.exist(id)) { |
|
|
const knowledge = knowledgeStore.get(id); |
|
|
if (knowledge) { |
|
|
addResource({ |
|
|
id, |
|
|
name: knowledge.title, |
|
|
type: knowledge.type, |
|
|
size: getTextByteSize(knowledge.content), |
|
|
status: "completed", |
|
|
}); |
|
|
} |
|
|
} else { |
|
|
addResource({ |
|
|
...omit(fileMeta, ["lastModified"]), |
|
|
id, |
|
|
status: "processing", |
|
|
}); |
|
|
|
|
|
const text = await fileParser(file); |
|
|
if (text.length > MAX_CHUNK_LENGTH || !file.type.startsWith("text/")) { |
|
|
const chunks = splitText(text, MAX_CHUNK_LENGTH); |
|
|
for (const idx in chunks) { |
|
|
const chunk = chunks[idx]; |
|
|
const index = Number(idx); |
|
|
let rid = id; |
|
|
const names = fileMeta.name.split("."); |
|
|
const filename = `${names[0]}-${index + 1}.${names[1] || "txt"}`; |
|
|
|
|
|
if (index > 0) { |
|
|
rid = `${id}_${index}`; |
|
|
addResource({ |
|
|
...omit(fileMeta, ["lastModified"]), |
|
|
id: rid, |
|
|
name: filename, |
|
|
size: getTextByteSize(chunk), |
|
|
status: "processing", |
|
|
}); |
|
|
} else { |
|
|
updateResource(rid, { |
|
|
name: filename, |
|
|
size: getTextByteSize(chunk), |
|
|
status: "processing", |
|
|
}); |
|
|
} |
|
|
|
|
|
let content = ""; |
|
|
if (containsXmlHtmlTags(chunk)) { |
|
|
content = await extractText(rid, filename, chunk); |
|
|
} else { |
|
|
content = chunk; |
|
|
|
|
|
const currentTime = Date.now(); |
|
|
knowledgeStore.save({ |
|
|
id: rid, |
|
|
title: filename, |
|
|
content, |
|
|
type: "file", |
|
|
fileMeta, |
|
|
createdAt: currentTime, |
|
|
updatedAt: currentTime, |
|
|
}); |
|
|
} |
|
|
updateResource(rid, { |
|
|
name: filename, |
|
|
size: getTextByteSize(content), |
|
|
status: "completed", |
|
|
}); |
|
|
} |
|
|
} else { |
|
|
knowledgeStore.save({ |
|
|
id, |
|
|
title: fileMeta.name, |
|
|
content: text, |
|
|
type: "file", |
|
|
fileMeta, |
|
|
createdAt: Date.now(), |
|
|
updatedAt: Date.now(), |
|
|
}); |
|
|
updateResource(id, { |
|
|
size: getTextByteSize(text), |
|
|
status: "completed", |
|
|
}); |
|
|
} |
|
|
} |
|
|
} catch (err) { |
|
|
if (err instanceof Error) { |
|
|
updateResource(id, { status: "failed" }); |
|
|
toast.error(err.message); |
|
|
} else { |
|
|
toast.error("File parsing failed"); |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
async function getKnowledgeFromUrl(url: string, crawler: string) { |
|
|
const knowledgeStore = useKnowledgeStore.getState(); |
|
|
const { resources, addResource, updateResource } = useTaskStore.getState(); |
|
|
|
|
|
const id = generateId("url", { url }); |
|
|
const isExist = resources.find((item) => item.id === id); |
|
|
if (isExist) { |
|
|
return toast.message(`Url already loaded: ${url}`); |
|
|
} |
|
|
try { |
|
|
if (knowledgeStore.exist(id)) { |
|
|
const knowledge = knowledgeStore.get(id); |
|
|
if (knowledge) { |
|
|
addResource({ |
|
|
id, |
|
|
name: url, |
|
|
type: "url", |
|
|
size: getTextByteSize(knowledge.content), |
|
|
status: "completed", |
|
|
}); |
|
|
} |
|
|
} else { |
|
|
addResource({ |
|
|
id, |
|
|
name: url, |
|
|
type: "url", |
|
|
size: 0, |
|
|
status: "processing", |
|
|
}); |
|
|
if (crawler === "jina") { |
|
|
const result = await jinaReader(url); |
|
|
const currentTime = Date.now(); |
|
|
knowledgeStore.save({ |
|
|
id, |
|
|
title: result.title, |
|
|
content: result.content, |
|
|
type: "url", |
|
|
url, |
|
|
createdAt: currentTime, |
|
|
updatedAt: currentTime, |
|
|
}); |
|
|
updateResource(id, { |
|
|
size: getTextByteSize(result.content), |
|
|
status: "completed", |
|
|
}); |
|
|
} else if (crawler === "local") { |
|
|
const { networkingModel } = getModel(); |
|
|
const { accessPassword } = useSettingStore.getState(); |
|
|
const result = await localCrawler(url, accessPassword); |
|
|
let content = ""; |
|
|
const stream = streamText({ |
|
|
model: await createModelProvider(networkingModel), |
|
|
prompt: result.content, |
|
|
system: rewritingPrompt, |
|
|
onFinish: () => { |
|
|
const currentTime = Date.now(); |
|
|
knowledgeStore.save({ |
|
|
id, |
|
|
title: result.title, |
|
|
content, |
|
|
type: "url", |
|
|
url, |
|
|
createdAt: currentTime, |
|
|
updatedAt: currentTime, |
|
|
}); |
|
|
}, |
|
|
experimental_transform: smoothTextStream(smoothTextStreamType), |
|
|
onError: (err) => { |
|
|
updateResource(id, { status: "failed" }); |
|
|
handleError(err); |
|
|
}, |
|
|
}); |
|
|
for await (const textPart of stream.textStream) { |
|
|
content += textPart; |
|
|
} |
|
|
updateResource(id, { |
|
|
size: getTextByteSize(content), |
|
|
status: "completed", |
|
|
}); |
|
|
} else { |
|
|
throw new Error(`Unknown crawler: ${crawler}`); |
|
|
} |
|
|
} |
|
|
} catch (err) { |
|
|
if (err instanceof Error) { |
|
|
updateResource(id, { status: "failed" }); |
|
|
return toast.error(err.message); |
|
|
} else { |
|
|
toast.error("Url parsing failed"); |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
return { |
|
|
generateId, |
|
|
getKnowledgeFromFile, |
|
|
getKnowledgeFromUrl, |
|
|
}; |
|
|
} |
|
|
|
|
|
export default useKnowledge; |
|
|
|