// deep-researcher/src/hooks/useKnowledge.ts
import { streamText, smoothStream } from "ai";
import { Md5 } from "ts-md5";
import { toast } from "sonner";
import useModelProvider from "@/hooks/useAiProvider";
import { useKnowledgeStore } from "@/store/knowledge";
import { useTaskStore } from "@/store/task";
import { useSettingStore } from "@/store/setting";
import { rewritingPrompt } from "@/constants/prompts";
import { jinaReader, localCrawler } from "@/utils/crawler";
import { fileParser } from "@/utils/parser";
import { getTextByteSize } from "@/utils/file";
import {
splitText,
containsXmlHtmlTags,
ThinkTagStreamProcessor,
} from "@/utils/text";
import { parseError } from "@/utils/error";
import { omit } from "radash";
// Maximum number of characters per knowledge chunk; larger parsed files are
// split into pieces of at most this size before processing.
const MAX_CHUNK_LENGTH = 10000;
/**
 * Builds a smoothing transform for streamed model output.
 * "character" emits one chunk per character (via a match-any regex);
 * "word" and "line" use smoothStream's built-in chunking strategies.
 * No artificial delay is introduced between chunks.
 */
function smoothTextStream(type: "character" | "word" | "line") {
  const chunking = type === "character" ? /./ : type;
  return smoothStream({ chunking, delayInMs: 0 });
}
/** Surfaces any thrown value to the user as an error toast. */
function handleError(error: unknown) {
  toast.error(parseError(error));
}
function useKnowledge() {
const { smoothTextStreamType } = useSettingStore();
const { createModelProvider, getModel } = useModelProvider();
const knowledgeStore = useKnowledgeStore();
/**
 * Derives an MD5-based id for a knowledge entry.
 * - "file": hash of the file's metadata (name/size/type/lastModified),
 *   so the same file always maps to the same id.
 * - "url": hash of the url plus a coarse time bucket (first 8 digits of
 *   Date.now()), so the same url gets a fresh id after a while.
 * - "knowledge": hash of a timestamped marker.
 * Throws when the option required by the chosen type is missing.
 */
function generateId(
  type: "file" | "url" | "knowledge",
  options?: {
    fileMeta?: FileMeta;
    url?: string;
  }
): string {
  switch (type) {
    case "file": {
      const fileMeta = options?.fileMeta;
      if (!fileMeta) break;
      const meta = [
        fileMeta.name,
        fileMeta.size,
        fileMeta.type,
        fileMeta.lastModified,
      ].join("::");
      return Md5.hashStr(meta);
    }
    case "url": {
      const url = options?.url;
      if (!url) break;
      return Md5.hashStr(`${url}::${Date.now().toString().substring(0, 8)}`);
    }
    case "knowledge":
      return Md5.hashStr(`KNOWLEDGE::${Date.now()}`);
  }
  throw new Error("Parameter error");
}
/**
 * Imports a file into the knowledge store and registers it as a task
 * resource. Large or non-plain-text files are split into chunks; chunks
 * containing XML/HTML markup are rewritten to plain text by the
 * networking model, others are stored verbatim.
 */
async function getKnowledgeFromFile(file: File) {
  const { resources, addResource, updateResource } = useTaskStore.getState();
  const fileMeta: FileMeta = {
    name: file.name,
    size: file.size,
    type: file.type,
    lastModified: file.lastModified,
  };
  const id = generateId("file", { fileMeta });
  const isExist = resources.find((item) => item.id === id);
  if (isExist) {
    return toast.message(`File already exist: ${file.name}`);
  }
  // Set by extractText's onError so we don't overwrite a "failed"
  // status with "completed" after the stream ends.
  let streamFailed = false;
  /**
   * Rewrites `text` via the networking model and persists the result in
   * the knowledge store under `rid`. Returns the accumulated content.
   */
  async function extractText(rid: string, title: string, text: string) {
    streamFailed = false; // reset per extraction (one flag per chunk)
    const { networkingModel } = getModel();
    let content = "";
    let reasoning = "";
    const thinkTagStreamProcessor = new ThinkTagStreamProcessor();
    const result = streamText({
      model: await createModelProvider(networkingModel),
      prompt: text,
      system: rewritingPrompt,
      onFinish: () => {
        const currentTime = Date.now();
        knowledgeStore.save({
          id: rid,
          title,
          content,
          type: "file",
          fileMeta,
          createdAt: currentTime,
          updatedAt: currentTime,
        });
      },
      experimental_transform: smoothTextStream(smoothTextStreamType),
      onError: (err) => {
        streamFailed = true;
        // BUGFIX: mark the chunk's own resource (rid), not the outer id —
        // for chunk index > 0 they differ.
        updateResource(rid, { status: "failed" });
        handleError(err);
      },
    });
    for await (const part of result.fullStream) {
      if (part.type === "text-delta") {
        // Route inline <think> content to `reasoning`, the rest to `content`.
        thinkTagStreamProcessor.processChunk(
          part.textDelta,
          (data) => {
            content += data;
          },
          (data) => {
            reasoning += data;
          }
        );
      } else if (part.type === "reasoning") {
        reasoning += part.textDelta;
      }
    }
    if (reasoning) console.log(reasoning);
    return content;
  }
  try {
    if (knowledgeStore.exist(id)) {
      // Already in the knowledge store: just register the resource.
      const knowledge = knowledgeStore.get(id);
      if (knowledge) {
        addResource({
          id,
          name: knowledge.title,
          type: knowledge.type,
          size: getTextByteSize(knowledge.content),
          status: "completed",
        });
      }
    } else {
      addResource({
        ...omit(fileMeta, ["lastModified"]),
        id,
        status: "processing",
      });
      const text = await fileParser(file);
      if (text.length > MAX_CHUNK_LENGTH || !file.type.startsWith("text/")) {
        const chunks = splitText(text, MAX_CHUNK_LENGTH);
        // BUGFIX: split the name on the LAST dot so multi-dot names like
        // "report.v2.txt" keep their base and real extension (the old
        // split(".")[0]/[1] dropped everything after the first dot).
        const dotIndex = fileMeta.name.lastIndexOf(".");
        const base =
          dotIndex > 0 ? fileMeta.name.slice(0, dotIndex) : fileMeta.name;
        const ext =
          dotIndex > 0 ? fileMeta.name.slice(dotIndex + 1) : "txt";
        for (const [index, chunk] of chunks.entries()) {
          // First chunk reuses the file's resource id; later chunks get
          // derived ids and their own resource entries.
          const rid = index > 0 ? `${id}_${index}` : id;
          const filename = `${base}-${index + 1}.${ext}`;
          if (index > 0) {
            addResource({
              ...omit(fileMeta, ["lastModified"]),
              id: rid,
              name: filename,
              size: getTextByteSize(chunk),
              status: "processing",
            });
          } else {
            updateResource(rid, {
              name: filename,
              size: getTextByteSize(chunk),
              status: "processing",
            });
          }
          let content = "";
          if (containsXmlHtmlTags(chunk)) {
            // Markup-heavy chunks are rewritten to plain text by the model.
            content = await extractText(rid, filename, chunk);
          } else {
            // Plain chunks are stored verbatim.
            content = chunk;
            const currentTime = Date.now();
            knowledgeStore.save({
              id: rid,
              title: filename,
              content,
              type: "file",
              fileMeta,
              createdAt: currentTime,
              updatedAt: currentTime,
            });
          }
          // BUGFIX: don't overwrite a "failed" status set by onError.
          if (!streamFailed) {
            updateResource(rid, {
              name: filename,
              size: getTextByteSize(content),
              status: "completed",
            });
          }
        }
      } else {
        // Small plain-text file: store it whole, no rewriting needed.
        knowledgeStore.save({
          id,
          title: fileMeta.name,
          content: text,
          type: "file",
          fileMeta,
          createdAt: Date.now(),
          updatedAt: Date.now(),
        });
        updateResource(id, {
          size: getTextByteSize(text),
          status: "completed",
        });
      }
    }
  } catch (err) {
    if (err instanceof Error) {
      updateResource(id, { status: "failed" });
      toast.error(err.message);
    } else {
      toast.error("File parsing failed");
    }
  }
}
/**
 * Loads a url into the knowledge store via the chosen crawler and
 * registers it as a task resource.
 * - "jina": fetches already-clean content through the Jina reader.
 * - "local": fetches raw content with the local crawler, then rewrites
 *   it to plain text with the networking model.
 * Throws (and marks the resource failed) for unknown crawler names.
 */
async function getKnowledgeFromUrl(url: string, crawler: string) {
  const knowledgeStore = useKnowledgeStore.getState();
  const { resources, addResource, updateResource } = useTaskStore.getState();
  const id = generateId("url", { url });
  const isExist = resources.find((item) => item.id === id);
  if (isExist) {
    return toast.message(`Url already loaded: ${url}`);
  }
  try {
    if (knowledgeStore.exist(id)) {
      // Cached: just register the resource as completed.
      const knowledge = knowledgeStore.get(id);
      if (knowledge) {
        addResource({
          id,
          name: url,
          type: "url",
          size: getTextByteSize(knowledge.content),
          status: "completed",
        });
      }
      return;
    }
    addResource({
      id,
      name: url,
      type: "url",
      size: 0,
      status: "processing",
    });
    if (crawler === "jina") {
      const result = await jinaReader(url);
      const currentTime = Date.now();
      knowledgeStore.save({
        id,
        title: result.title,
        content: result.content,
        type: "url",
        url,
        createdAt: currentTime,
        updatedAt: currentTime,
      });
      updateResource(id, {
        size: getTextByteSize(result.content),
        status: "completed",
      });
    } else if (crawler === "local") {
      const { networkingModel } = getModel();
      const { accessPassword } = useSettingStore.getState();
      const result = await localCrawler(url, accessPassword);
      let content = "";
      // BUGFIX: track stream failure so the post-stream update below
      // doesn't overwrite "failed" with "completed".
      let failed = false;
      const stream = streamText({
        model: await createModelProvider(networkingModel),
        prompt: result.content,
        system: rewritingPrompt,
        onFinish: () => {
          const currentTime = Date.now();
          knowledgeStore.save({
            id,
            title: result.title,
            content,
            type: "url",
            url,
            createdAt: currentTime,
            updatedAt: currentTime,
          });
        },
        experimental_transform: smoothTextStream(smoothTextStreamType),
        onError: (err) => {
          failed = true;
          updateResource(id, { status: "failed" });
          handleError(err);
        },
      });
      for await (const textPart of stream.textStream) {
        content += textPart;
      }
      if (!failed) {
        updateResource(id, {
          size: getTextByteSize(content),
          status: "completed",
        });
      }
    } else {
      throw new Error(`Unknown crawler: ${crawler}`);
    }
  } catch (err) {
    // BUGFIX: previously non-Error throws left the resource stuck in
    // "processing"; always mark it failed.
    updateResource(id, { status: "failed" });
    if (err instanceof Error) {
      return toast.error(err.message);
    } else {
      toast.error("Url parsing failed");
    }
  }
}
return {
generateId,
getKnowledgeFromFile,
getKnowledgeFromUrl,
};
}
export default useKnowledge;