zhao1977 committed on
Commit
82bee8e
·
verified ·
1 Parent(s): c001935

Upload 9 files

Browse files
Files changed (9) hide show
  1. Dockerfile +20 -0
  2. README.md +2 -12
  3. chat.py +93 -0
  4. create_kb.py +134 -0
  5. docker +0 -0
  6. html_string.py +150 -0
  7. main.py +116 -0
  8. requirements.txt +25 -0
  9. upload_file.py +107 -0
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Official slim Python base image; adjust the version if the project needs another one.
FROM python:3.10-slim

# All following relative paths resolve against /app.
WORKDIR /app

# Install dependencies before copying the sources so this layer stays cached
# until requirements.txt itself changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the project files into the image.
COPY . /app

# The application lists these directories (os.listdir) while building its UI,
# so they must exist before the server starts.
RUN mkdir -p File/Structured File/Unstructured VectorStore

# Port the server listens on; keep in sync with the CMD below.
EXPOSE 8000

# DASHSCOPE_API_KEY is read from the environment at run time. Do NOT bake the
# key into the image; supply it when starting the container, e.g.
#   docker run -e DASHSCOPE_API_KEY=<your key> -p 8000:8000 <image>
# or through the hosting platform's secret store.

# Start the FastAPI application with uvicorn.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
README.md CHANGED
@@ -1,12 +1,2 @@
1
- ---
2
- title: Iaprag
3
- emoji: 📉
4
- colorFrom: purple
5
- colorTo: pink
6
- sdk: docker
7
- pinned: false
8
- license: apache-2.0
9
- short_description: rag
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ Run: uvicorn main:app --port 7866
2
+ Then visit 127.0.0.1:7866
 
 
 
 
 
 
 
 
 
 
chat.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from openai import OpenAI
3
+ from llama_index.core import StorageContext,load_index_from_storage,Settings
4
+ from llama_index.embeddings.dashscope import (
5
+ DashScopeEmbedding,
6
+ DashScopeTextEmbeddingModels,
7
+ DashScopeTextEmbeddingType,
8
+ )
9
+ from llama_index.postprocessor.dashscope_rerank import DashScopeRerank
10
+ from create_kb import *
11
+ DB_PATH = "VectorStore"
12
+ TMP_NAME = "tmp_abcd"
13
+ EMBED_MODEL = DashScopeEmbedding(
14
+ model_name=DashScopeTextEmbeddingModels.TEXT_EMBEDDING_V2,
15
+ text_type=DashScopeTextEmbeddingType.TEXT_TYPE_DOCUMENT,
16
+ )
17
+ # 若使用本地嵌入模型,请取消以下注释:
18
+ # from langchain_community.embeddings import ModelScopeEmbeddings
19
+ # from llama_index.embeddings.langchain import LangchainEmbedding
20
+ # embeddings = ModelScopeEmbeddings(model_id="modelscope/iic/nlp_gte_sentence-embedding_chinese-large")
21
+ # EMBED_MODEL = LangchainEmbedding(embeddings)
22
+
23
+ # 设置嵌入模型
24
+ Settings.embed_model = EMBED_MODEL
25
+
26
def get_model_response(multi_modal_input,history,model,temperature,max_tokens,history_round,db_name,similarity_threshold,chunk_cnt):
    """Answer the latest user turn with retrieval-augmented generation, streaming the reply.

    Args:
        multi_modal_input: Dict whose ``'files'`` entry lists files attached to the message.
        history: List of ``[user_text, assistant_text]`` pairs. The last pair is the
            turn being answered; its assistant slot is overwritten in place.
        model: Model name forwarded to the chat-completions endpoint.
        temperature: Sampling temperature forwarded to the endpoint.
        max_tokens: Reply length limit forwarded to the endpoint.
        history_round: Maximum number of earlier turns sent as conversation context.
        db_name: Knowledge-base directory name under ``DB_PATH``. Replaced by
            ``TMP_NAME`` when a temporary knowledge base exists or is built from
            the attached files.
        similarity_threshold: Minimum score a chunk needs to be put in the prompt.
        chunk_cnt: Number of chunks kept after reranking.

    Yields:
        ``(history, chunk_show)`` after each streamed chunk, where ``chunk_show``
        is the text of the retrieved chunks with their scores.
    """
    prompt = history[-1][0]
    tmp_files = multi_modal_input['files']
    if os.path.exists(os.path.join("File",TMP_NAME)):
        db_name = TMP_NAME
    elif tmp_files:
        create_tmp_kb(tmp_files)
        db_name = TMP_NAME
    print(f"prompt:{prompt},tmp_files:{tmp_files},db_name:{db_name}")
    try:
        dashscope_rerank = DashScopeRerank(top_n=chunk_cnt,return_documents=True)
        storage_context = StorageContext.from_defaults(
            persist_dir=os.path.join(DB_PATH,db_name)
        )
        index = load_index_from_storage(storage_context)
        print("index获取完成")
        retriever_engine = index.as_retriever(
            similarity_top_k=20,
        )
        retrieve_chunk = retriever_engine.retrieve(prompt)
        print(f"原始chunk为:{retrieve_chunk}")
        try:
            results = dashscope_rerank.postprocess_nodes(retrieve_chunk, query_str=prompt)
            print(f"rerank成功,重排后的chunk为:{results}")
        except Exception as rerank_error:
            # Reranking is best effort: fall back to the top retrieved chunks.
            print(f"异常信息:{rerank_error}")
            results = retrieve_chunk[:chunk_cnt]
            print(f"rerank失败,chunk为:{results}")
        text_parts = []
        show_parts = []
        for position, node in enumerate(results, start=1):
            if node.score >= similarity_threshold:
                text_parts.append(f"## {position}:\n {node.text}\n")
                show_parts.append(f"## {position}:\n {node.text}\nscore: {round(node.score,2)}\n")
        chunk_text = "".join(text_parts)
        chunk_show = "".join(show_parts)
        print(f"已获取chunk:{chunk_text}")
        prompt_template = f"请参考以下内容:{chunk_text},以合适的语气回答用户的问题:{prompt}。如果参考内容中有图片链接也请直接返回。"
    except Exception as e:
        # Any retrieval failure (including no knowledge base selected) degrades
        # to a plain chat request without retrieved context.
        print(f"异常信息:{e}")
        prompt_template = prompt
        chunk_show = ""
    history[-1][-1] = ""
    client = OpenAI(
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )
    messages = [{'role': 'system', 'content': 'You are a helpful assistant.'}]
    # Only completed earlier turns are context. The last history entry is the
    # turn being answered and is sent once, as the final user message below.
    prior_turns = history[:-1]
    history_round = min(len(prior_turns),history_round)
    for user_text, assistant_text in prior_turns[len(prior_turns)-history_round:]:
        messages.append({'role': 'user', 'content': user_text})
        messages.append({'role': 'assistant', 'content': assistant_text or ""})
    messages.append({'role': 'user', 'content': prompt_template})
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        stream=True
    )
    assistant_response = ""
    for chunk in completion:
        if not chunk.choices:
            continue
        # Streamed deltas may carry no text (for example the closing chunk).
        delta_text = chunk.choices[0].delta.content
        if delta_text:
            assistant_response += delta_text
        history[-1][-1] = assistant_response
        yield history,chunk_show
create_kb.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #####################################
2
+ ###### 创建知识库 #######
3
+ #####################################
4
+ import gradio as gr
5
+ import os
6
+ import shutil
7
+ from llama_index.core import VectorStoreIndex,Settings,SimpleDirectoryReader
8
+ from llama_index.embeddings.dashscope import (
9
+ DashScopeEmbedding,
10
+ DashScopeTextEmbeddingModels,
11
+ DashScopeTextEmbeddingType,
12
+ )
13
+ from llama_index.core.schema import TextNode
14
+ from upload_file import *
15
+ DB_PATH = "VectorStore"
16
+ STRUCTURED_FILE_PATH = "File/Structured"
17
+ UNSTRUCTURED_FILE_PATH = "File/Unstructured"
18
+ TMP_NAME = "tmp_abcd"
19
+ EMBED_MODEL = DashScopeEmbedding(
20
+ model_name=DashScopeTextEmbeddingModels.TEXT_EMBEDDING_V2,
21
+ text_type=DashScopeTextEmbeddingType.TEXT_TYPE_DOCUMENT,
22
+ )
23
+ # 若使用本地嵌入模型,请取消以下注释:
24
+ # from langchain_community.embeddings import ModelScopeEmbeddings
25
+ # from llama_index.embeddings.langchain import LangchainEmbedding
26
+ # embeddings = ModelScopeEmbeddings(model_id="modelscope/iic/nlp_gte_sentence-embedding_chinese-large")
27
+ # EMBED_MODEL = LangchainEmbedding(embeddings)
28
+
29
+
30
+ # 设置嵌入模型
31
+ Settings.embed_model = EMBED_MODEL
32
+ # 刷新知识库
33
def refresh_knowledge_base():
    """Return the names of the entries found directly under ``DB_PATH``."""
    kb_names = os.listdir(DB_PATH)
    return kb_names
35
+
36
+ # 创建非结构化向量数据库
37
def create_unstructured_db(db_name:str,label_name:list):
    """Build and persist a vector index from the selected unstructured categories.

    Args:
        db_name: Name of the knowledge base; becomes a directory under ``DB_PATH``.
        label_name: Category directory names under ``UNSTRUCTURED_FILE_PATH``.

    Progress and validation problems are reported to the user through
    ``gr.Info``; nothing is returned.
    """
    print(f"知识库名称为:{db_name},类目名称为:{label_name}")
    # An empty multiselect yields an empty list, so test truthiness, not None.
    if not label_name:
        gr.Info("没有选择类目")
    elif not db_name:
        gr.Info("没有命名知识库")
    # The name is used as a directory name; refuse anything that would escape DB_PATH.
    elif os.path.basename(db_name) != db_name or db_name in (".", ".."):
        gr.Info("知识库名称不能包含路径分隔符")
    elif db_name in os.listdir(DB_PATH):
        gr.Info("知识库已存在,请换个名字或删除原来知识库再创建")
    else:
        gr.Info("正在创建知识库,请等待知识库创建成功信息显示后前往RAG问答")
        documents = []
        for label in label_name:
            label_path = os.path.join(UNSTRUCTURED_FILE_PATH,label)
            documents.extend(SimpleDirectoryReader(label_path).load_data())
        index = VectorStoreIndex.from_documents(
            documents
        )
        db_path = os.path.join(DB_PATH,db_name)
        os.makedirs(db_path, exist_ok=True)
        index.storage_context.persist(db_path)
        gr.Info("知识库创建成功,可前往RAG问答进行提问")
62
+
63
+ # 创建结构化向量数据库
64
def create_structured_db(db_name:str,data_table:list):
    """Build and persist a vector index from the selected structured data tables.

    Each non-blank line of every loaded document becomes its own ``TextNode``.

    Args:
        db_name: Name of the knowledge base; becomes a directory under ``DB_PATH``.
        data_table: Table directory names under ``STRUCTURED_FILE_PATH``.

    Progress and validation problems are reported to the user through
    ``gr.Info``; nothing is returned.
    """
    print(f"知识库名称为:{db_name},数据表名称为:{data_table}")
    # An empty multiselect yields an empty list, so test truthiness, not None.
    if not data_table:
        gr.Info("没有选择数据表")
    elif not db_name:
        gr.Info("没有命名知识库")
    # The name is used as a directory name; refuse anything that would escape DB_PATH.
    elif os.path.basename(db_name) != db_name or db_name in (".", ".."):
        gr.Info("知识库名称不能包含路径分隔符")
    elif db_name in os.listdir(DB_PATH):
        gr.Info("知识库已存在,请换个名字或删除原来知识库再创建")
    else:
        gr.Info("正在创建知识库,请等待知识库创建成功信息显示后前往RAG问答")
        documents = []
        for label in data_table:
            label_path = os.path.join(STRUCTURED_FILE_PATH,label)
            documents.extend(SimpleDirectoryReader(label_path).load_data())
        nodes = []
        for doc in documents:
            for chunk in doc.get_content().split('\n'):
                # Blank lines carry no record and would only add empty nodes.
                if not chunk.strip():
                    continue
                node = TextNode(text=chunk)
                node.metadata = {'source': doc.get_doc_id(),'file_name':doc.metadata['file_name']}
                nodes.append(node)
        index = VectorStoreIndex(nodes)
        db_path = os.path.join(DB_PATH,db_name)
        os.makedirs(db_path, exist_ok=True)
        index.storage_context.persist(db_path)
        gr.Info("知识库创建成功,可前往RAG问答进行提问")
95
+
96
+
97
+ # 删除指定名称知识库
98
def delete_db(db_name:str):
    """Delete the knowledge base directory ``db_name`` under ``DB_PATH``.

    Args:
        db_name: Name of the knowledge base to remove. ``None`` or an empty
            string is ignored.

    The outcome is reported through ``gr.Info`` and ``print``.
    """
    # An empty name would make folder_path resolve to DB_PATH itself and wipe
    # every knowledge base, so treat it like "nothing selected".
    if not db_name:
        return
    folder_path = os.path.join(DB_PATH, db_name)
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
        gr.Info(f"已成功删除{db_name}知识库")
        print(f"已成功删除{db_name}知识库")
    else:
        gr.Info(f"{db_name}知识库不存在")
        print(f"{db_name}知识库不存在")
108
+
109
+ # 实时更新知识库列表
110
def update_knowledge_base():
    """Return a Gradio update whose choices are the current entries of ``DB_PATH``."""
    current_choices = os.listdir(DB_PATH)
    return gr.update(choices=current_choices)
112
+
113
+ # 临时文件创建知识库
114
def create_tmp_kb(files):
    """Build the temporary knowledge base from files attached in the chat box.

    The files are moved into ``File/<TMP_NAME>``, indexed, and the index is
    persisted under ``<DB_PATH>/<TMP_NAME>``.

    Args:
        files: Iterable of file paths to move and index.
    """
    tmp_file_dir = os.path.join("File",TMP_NAME)
    # makedirs also creates a missing parent directory and tolerates an
    # already existing target.
    os.makedirs(tmp_file_dir, exist_ok=True)
    for file in files:
        file_name = os.path.basename(file)
        shutil.move(file,os.path.join(tmp_file_dir,file_name))
    documents = SimpleDirectoryReader(tmp_file_dir).load_data()
    index = VectorStoreIndex.from_documents(
        documents
    )
    db_path = os.path.join(DB_PATH,TMP_NAME)
    os.makedirs(db_path, exist_ok=True)
    index.storage_context.persist(db_path)
128
+
129
+ # 清除tmp文件夹下内容
130
def clear_tmp():
    """Remove the temporary upload directory and the temporary vector store, if present."""
    tmp_dirs = (
        os.path.join("File",TMP_NAME),
        os.path.join(DB_PATH,TMP_NAME),
    )
    for tmp_dir in tmp_dirs:
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)
docker ADDED
File without changes
html_string.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ main_html = """<!DOCTYPE html>
2
+ <html lang="zh">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>本地RAG测试工程</title>
7
+ <link rel="stylesheet" href="https://fonts.googleapis.com/icon?family=Material+Icons">
8
+ <style>
9
+ body {
10
+ font-family: Arial, sans-serif;
11
+ background-color: #f5f5f5;
12
+ margin: 0;
13
+ padding: 0;
14
+ display: flex;
15
+ flex-direction: column;
16
+ align-items: center;
17
+ }
18
+ header {
19
+ background-color: #2196f3;
20
+ color: white;
21
+ width: 100%;
22
+ padding: 1.5em;
23
+ text-align: center;
24
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
25
+ }
26
+ main {
27
+ margin: 2em;
28
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
29
+ background-color: white;
30
+ border-radius: 8px;
31
+ overflow: hidden;
32
+ width: 90%;
33
+ max-width: 800px;
34
+ padding: 2em;
35
+ }
36
+ h1 {
37
+ color: #333;
38
+ }
39
+ p {
40
+ color: #666;
41
+ font-size: 1.1em;
42
+ }
43
+ ul {
44
+ list-style-type: none;
45
+ padding: 0;
46
+ }
47
+ ul li {
48
+ background-color: #2196f3;
49
+ margin: 0.5em 0;
50
+ padding: 1em;
51
+ border-radius: 4px;
52
+ transition: background-color 0.3s;
53
+ }
54
+ ul li a {
55
+ color: white;
56
+ text-decoration: none;
57
+ display: flex;
58
+ align-items: center;
59
+ }
60
+ ul li:hover {
61
+ background-color: #1976d2;
62
+ }
63
+ .material-icons {
64
+ margin-right: 0.5em;
65
+ }
66
+ </style>
67
+ </head>
68
+ <body>
69
+ <header>
70
+ <h1>本地RAG测试工程</h1>
71
+ </header>
72
+ <main>
73
+ <p>如果您需要基于上传的文档与模型直接对话,请直接访问<a href="/chat">RAG问答</a>,并在输入框位置上传文件,就可以开始对话了。(此次上传的数据在页面刷新后无法保留,若您希望可以持久使用、维护知识库,请创建知识库)。</p>
74
+ <p>如果您需要创建或更新知识库,请按照<a href="/upload_data">上传数据</a>、<a href="/create_knowledge_base">创建知识库</a>操作,在<a href="/chat">RAG问答</a>中的“知识库选择”位置选择您需要使用的知识库。</p>
75
+ <p>如果您需要基于已创建好的知识库进行问答,请直接访问<a href="/chat">RAG问答</a>,在“加载知识库”处选择您已创建的知识库。</p>
76
+ <ul>
77
+ <li><a href="/upload_data"><span class="material-icons">cloud_upload</span> 1. 上传数据</a></li>
78
+ <li><a href="/create_knowledge_base"><span class="material-icons">library_add</span> 2. 创建知识库</a></li>
79
+ <li><a href="/chat"><span class="material-icons">question_answer</span> 3. RAG问答</a></li>
80
+ </ul>
81
+ </main>
82
+ </body>
83
+ </html>"""
84
+
85
+ plain_html = """<!DOCTYPE html>
86
+ <html lang="zh">
87
+ <head>
88
+ <title>RAG问答</title>
89
+ <link href="https://fonts.googleapis.com/icon?family=Material+Icons" rel="stylesheet">
90
+ <style>
91
+ .links-container {
92
+ display: flex;
93
+ justify-content: center; /* 在容器中居中分布子元素 */
94
+ list-style-type: none; /* 去掉ul默认的列表样式 */
95
+ padding: 0; /* 去掉ul默认的内边距 */
96
+ margin: 0; /* 去掉ul默认的外边距 */
97
+ }
98
+ .links-container li {
99
+ margin: 0 5px; /* 每个li元素的左右留出一些空间 */
100
+ padding: 10px 15px; /* 添加内边距 */
101
+ border: 1px solid #ccc; /* 添加边框 */
102
+ border-radius: 5px; /* 添加圆角 */
103
+ background-color: #f9f9f9; /* 背景颜色 */
104
+ transition: background-color 0.3s; /* 背景颜色变化的过渡效果 */
105
+ display: flex; /* 使用flex布局 */
106
+ align-items: center; /* 垂直居中对齐 */
107
+ height: 50px; /* 设置固定高度,确保一致 */
108
+ }
109
+ .links-container li:hover {
110
+ background-color: #e0e0e0; /* 悬停时的背景颜色 */
111
+ }
112
+ .links-container a {
113
+ text-decoration: none !important; /* 去掉链接的下划线 */
114
+ color: #333; /* 链接颜色 */
115
+ font-family: Arial, sans-serif; /* 字体 */
116
+ font-size: 14px; /* 字体大小 */
117
+ display: flex; /* 使用flex布局 */
118
+ align-items: center; /* 垂直居中对齐 */
119
+ height: 100%; /* 确保链接高度与父元素一致 */
120
+ }
121
+ .material-icons {
122
+ font-size: 20px; /* 图标大小 */
123
+ margin-right: 8px; /* 图标和文字间的间距 */
124
+ text-decoration: none; /* 确保图标没有下划线 */
125
+ }
126
+
127
+ /* 深色模式样式 */
128
+ @media (prefers-color-scheme: dark) {
129
+ .links-container li {
130
+ background-color: #333; /* 深色模式下的背景颜色 */
131
+ border-color: #555; /* 深色模式下的边框颜色 */
132
+ }
133
+ .links-container li:hover {
134
+ background-color: #555; /* 深色模式下悬停时的背景颜色 */
135
+ }
136
+ .links-container a {
137
+ color: #f9f9f9; /* 深色模式下的文字颜色 */
138
+ }
139
+ }
140
+ </style>
141
+ </head>
142
+ <body>
143
+ <ul class="links-container">
144
+ <li><a href="/"><span class="material-icons">home</span> 主页</a></li>
145
+ <li><a href="/upload_data"><span class="material-icons">cloud_upload</span> 上传数据</a></li>
146
+ <li><a href="/create_knowledge_base"><span class="material-icons">library_add</span> 创建知识库</a></li>
147
+ <li><a href="/chat"><span class="material-icons">question_answer</span> RAG问答</a></li>
148
+ </ul>
149
+ </body>
150
+ </html>"""
main.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.responses import HTMLResponse
3
+ import gradio as gr
4
+ import os
5
+ from html_string import main_html,plain_html
6
+ from upload_file import *
7
+ from create_kb import *
8
+ from chat import get_model_response
9
def user(user_message, history):
    """Append the submitted text to the chat history and clear the text box.

    Returns the value for the input box (text emptied, attached files kept)
    and a new history list ending with ``[text, None]``.
    """
    print(user_message)
    cleared_input = {'text': '', 'files': user_message['files']}
    pending_turn = [user_message['text'], None]
    return cleared_input, history + [pending_turn]
12
+
13
+ #####################################
14
+ ###### gradio界面 #######
15
+ #####################################
16
+
17
def get_chat_block():
    """Build the Gradio Blocks page for the RAG chat.

    The left column holds the chatbot, the multimodal input box and a clear
    button. The right column holds the knowledge-base selector, the retrieved
    chunks, and the model and RAG parameter controls.

    Returns:
        The assembled ``gr.Blocks`` instance.
    """
    # Gradio's root element uses the class "gradio-container" (with a hyphen).
    with gr.Blocks(theme=gr.themes.Base(),css=".gradio-container { background-color: #f0f0f0; }") as chat:
        gr.HTML(plain_html)
        with gr.Row():
            with gr.Column(scale=10):
                chatbot = gr.Chatbot(label="Chatbot",height=750,avatar_images=("images/user.jpeg","images/tongyi.png"))
                with gr.Row():
                    input_message = gr.MultimodalTextbox(label="请输入",file_types=[".xlsx",".csv",".docx",".pdf",".txt"],scale=7)
                    # ClearButton takes the components to reset as one list.
                    clear_btn = gr.ClearButton([chatbot,input_message],scale=1)
            # Model and knowledge-base parameters.
            with gr.Column(scale=5):
                knowledge_base =gr.Dropdown(choices=os.listdir(DB_PATH),label="加载知识库",interactive=True,scale=2)
                with gr.Accordion(label="召回文本段",open=False):
                    chunk_text = gr.Textbox(label="召回文本段",interactive=False,scale=5,lines=10)
                with gr.Accordion(label="模型设置",open=True):
                    model =gr.Dropdown(choices=['qwen-max','qwen-plus','qwen-turbo'],label="选择模型",interactive=True,value="qwen-max",scale=2)
                    temperature = gr.Slider(maximum=2,minimum=0,interactive=True,label="温度参数",step=0.01,value=0.85,scale=2)
                    max_tokens = gr.Slider(maximum=2000,minimum=0,interactive=True,label="最大回复长度",step=50,value=1024,scale=2)
                    history_round = gr.Slider(maximum=30,minimum=1,interactive=True,label="携带上下文轮数",step=1,value=3,scale=2)
                with gr.Accordion(label="RAG参数设置",open=True):
                    chunk_cnt = gr.Slider(maximum=20,minimum=1,interactive=True,label="选择召回片段数",step=1,value=5,scale=2)
                    similarity_threshold = gr.Slider(maximum=1,minimum=0,interactive=True,label="相似度阈值",step=0.01,value=0.2,scale=2)
        # First record the user turn, then stream the model answer into it.
        input_message.submit(fn=user,inputs=[input_message,chatbot],outputs=[input_message,chatbot],queue=False).then(
            fn=get_model_response,inputs=[input_message,chatbot,model,temperature,max_tokens,history_round,knowledge_base,similarity_threshold,chunk_cnt],outputs=[chatbot,chunk_text]
        )
        # On page load: refresh the knowledge-base list and drop temporary data.
        chat.load(update_knowledge_base,[],knowledge_base)
        chat.load(clear_tmp)
    return chat
46
+
47
+
48
def get_upload_block():
    """Build the Gradio Blocks page for uploading and managing source data.

    One tab handles unstructured files grouped into categories, the other
    handles structured files grouped into data tables.

    Returns:
        The assembled ``gr.Blocks`` instance.
    """
    with gr.Blocks(theme=gr.themes.Base()) as upload:
        gr.HTML(plain_html)
        with gr.Tab("非结构化数据"):
            with gr.Accordion(label="新建类目",open=True):
                with gr.Column(scale=2):
                    # File extensions need the leading dot; a bare word is
                    # read as a file-type category instead of an extension.
                    unstructured_file = gr.Files(file_types=[".pdf",".docx",".txt"])
                    with gr.Row():
                        new_label = gr.Textbox(label="类目名称",placeholder="请输入类目名称",scale=5)
                        create_label_btn = gr.Button("新建类目",variant="primary",scale=1)
            with gr.Accordion(label="管理类目",open=False):
                with gr.Row():
                    data_label =gr.Dropdown(choices=os.listdir(UNSTRUCTURED_FILE_PATH),label="管理类目",interactive=True,scale=8,multiselect=True)
                    delete_label_btn = gr.Button("删除类目",variant="stop",scale=1)
        with gr.Tab("结构化数据"):
            with gr.Accordion(label="新建数据表",open=True):
                with gr.Column(scale=2):
                    structured_file = gr.Files(file_types=[".xlsx",".csv"])
                    with gr.Row():
                        new_label_1 = gr.Textbox(label="数据表名称",placeholder="请输入数据表名称",scale=5)
                        create_label_btn_1 = gr.Button("新建数据表",variant="primary",scale=1)
            with gr.Accordion(label="管理数据表",open=False):
                with gr.Row():
                    data_label_1 =gr.Dropdown(choices=os.listdir(STRUCTURED_FILE_PATH),label="管理数据表",interactive=True,scale=8,multiselect=True)
                    delete_data_table_btn = gr.Button("删除数据表",variant="stop",scale=1)
        # Every create/delete action refreshes the matching dropdown afterwards.
        delete_label_btn.click(delete_label,inputs=[data_label]).then(fn=update_label,outputs=[data_label])
        create_label_btn.click(fn=upload_unstructured_file,inputs=[unstructured_file,new_label]).then(fn=update_label,outputs=[data_label])
        delete_data_table_btn.click(delete_data_table,inputs=[data_label_1]).then(fn=update_datatable,outputs=[data_label_1])
        create_label_btn_1.click(fn=upload_structured_file,inputs=[structured_file,new_label_1]).then(fn=update_datatable,outputs=[data_label_1])
        upload.load(update_label,[],data_label)
        upload.load(update_datatable,[],data_label_1)
    return upload
80
+
81
+ def get_knowledge_base_block():
82
+ with gr.Blocks(theme=gr.themes.Base()) as knowledge:
83
+ gr.HTML(plain_html)
84
+ # 非结构化数据知识库
85
+ with gr.Tab("非结构化数据"):
86
+ with gr.Row():
87
+ data_label_2 =gr.Dropdown(choices=os.listdir(UNSTRUCTURED_FILE_PATH),label="选择类目",interactive=True,scale=2,multiselect=True)
88
+ knowledge_base_name = gr.Textbox(label="知识库名称",placeholder="请输入知识库名称",scale=2)
89
+ create_knowledge_base_btn = gr.Button("确认创建知识库",variant="primary",scale=1)
90
+ # 结构化数据知识库
91
+ with gr.Tab("结构化数据"):
92
+ with gr.Row():
93
+ data_label_3 =gr.Dropdown(choices=os.listdir(STRUCTURED_FILE_PATH),label="选择数据表",interactive=True,scale=2,multiselect=True)
94
+ knowledge_base_name_1 = gr.Textbox(label="知识库名称",placeholder="请输入知识库名称",scale=2)
95
+ create_knowledge_base_btn_1 = gr.Button("确认创建知识库",variant="primary",scale=1)
96
+ with gr.Row():
97
+ knowledge_base =gr.Dropdown(choices=os.listdir(DB_PATH),label="管理知识库",interactive=True,scale=4)
98
+ delete_db_btn = gr.Button("删除知识库",variant="stop",scale=1)
99
+ create_knowledge_base_btn.click(fn=create_unstructured_db,inputs=[knowledge_base_name,data_label_2]).then(update_knowledge_base,outputs=[knowledge_base])
100
+ delete_db_btn.click(delete_db,inputs=[knowledge_base]).then(update_knowledge_base,outputs=[knowledge_base])
101
+ create_knowledge_base_btn_1.click(fn=create_structured_db,inputs=[knowledge_base_name_1,data_label_3]).then(update_knowledge_base,outputs=[knowledge_base])
102
+ knowledge.load(update_knowledge_base,[],knowledge_base)
103
+ knowledge.load(update_label,[],data_label_2)
104
+ knowledge.load(update_datatable,[],data_label_3)
105
+ return knowledge
106
+
107
app = FastAPI()


@app.get("/", response_class=HTMLResponse)
def read_main():
    """Serve the static landing page."""
    return HTMLResponse(content=main_html)


# Mount the three Gradio pages on the FastAPI application.
app = gr.mount_gradio_app(app, get_chat_block(), path="/chat")
app = gr.mount_gradio_app(app, get_upload_block(), path="/upload_data")
app = gr.mount_gradio_app(app, get_knowledge_base_block(), path="/create_knowledge_base")
requirements.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==4.32.0
2
+ faiss-cpu==1.8.0.post1
3
+ dashscope==1.20.4
4
+ openai==1.55.3
5
+ httpx==0.27.0
6
+ llama-index-vector-stores-faiss==0.1.2
7
+ llama-index-embeddings-dashscope==0.1.4
8
+ llama-index-readers-file==0.1.33
9
+ matplotlib==3.9.3
10
+ docx2txt==0.8
11
+ openpyxl==3.1.5
12
+ llama-index-core==0.10.67
13
+ uvicorn==0.30.6
14
+ fastapi==0.112.0
15
+ llama-index-postprocessor-dashscope-rerank-custom==0.1.0
16
+ simplejson==3.19.3
17
+ # modelscope==1.18.0
18
+ # langchain_community==0.2.16
19
+ # transformers==4.44.2
20
+ # llama_index.embeddings.huggingface==0.2.3
21
+ # llama-index-embeddings-langchain==0.1.2
22
+ # datasets==2.21.0
23
+ # oss2==2.19.0
24
+ # sortedcontainers==2.4.0
25
+ # addict==2.4.0
upload_file.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #####################################
2
+ ####### 上传文件 #######
3
+ #####################################
4
+ import gradio as gr
5
+ import os
6
+ import shutil
7
+ import pandas as pd
8
+ STRUCTURED_FILE_PATH = "File/Structured"
9
+ UNSTRUCTURED_FILE_PATH = "File/Unstructured"
10
+ # 刷新非结构化类目
11
def refresh_label():
    """Return the category names found under ``UNSTRUCTURED_FILE_PATH``."""
    label_names = os.listdir(UNSTRUCTURED_FILE_PATH)
    return label_names
13
+
14
+ # 刷新结构化数据表
15
def refresh_data_table():
    """Return the data-table names found under ``STRUCTURED_FILE_PATH``."""
    table_names = os.listdir(STRUCTURED_FILE_PATH)
    return table_names
17
+
18
+ # 上传非结构化数据
19
def upload_unstructured_file(files,label_name):
    """Move uploaded unstructured files into a new category directory.

    Args:
        files: Uploaded file objects; each exposes its path as ``.name``.
        label_name: Name of the category to create under
            ``UNSTRUCTURED_FILE_PATH``.

    Validation problems and the outcome are reported through ``gr.Info``.
    """
    # Covers both "nothing uploaded" (None) and an empty file list.
    if not files:
        gr.Info("请上传文件")
    elif not label_name:
        gr.Info("请输入类目名称")
    # The name is used as a directory name; refuse anything that would escape
    # UNSTRUCTURED_FILE_PATH.
    elif os.path.basename(label_name) != label_name or label_name in (".", ".."):
        gr.Info("类目名称不能包含路径分隔符")
    elif label_name in os.listdir(UNSTRUCTURED_FILE_PATH):
        gr.Info(f"{label_name}类目已存在")
    else:
        label_dir = os.path.join(UNSTRUCTURED_FILE_PATH,label_name)
        try:
            os.makedirs(label_dir, exist_ok=True)
            for file in files:
                print(file)
                file_path = file.name
                file_name = os.path.basename(file_path)
                destination_file_path = os.path.join(label_dir,file_name)
                shutil.move(file_path,destination_file_path)
            gr.Info(f"文件已上传至{label_name}类目中,请前往创建知识库")
        except Exception as e:
            # Keep the user-facing message, but log the real cause.
            print(f"异常信息:{e}")
            gr.Info(f"请勿重复上传")
40
+
41
+ # 上传结构化数据
42
def upload_structured_file(files,label_name):
    """Convert uploaded spreadsheets into text files inside a new data-table directory.

    Each ``.xlsx``/``.csv`` file is moved into the table directory, read with
    pandas and rewritten as a ``.txt`` file with one ``【col:value,...】`` line
    per row. The moved spreadsheet is then deleted.

    Args:
        files: Uploaded file objects; each exposes its path as ``.name``.
        label_name: Name of the data table to create under
            ``STRUCTURED_FILE_PATH``.

    Validation problems and the outcome are reported through ``gr.Info``.
    """
    # Covers both "nothing uploaded" (None) and an empty file list.
    if not files:
        gr.Info("请上传文件")
    elif not label_name:
        gr.Info("请输入数据表名称")
    # The name is used as a directory name; refuse anything that would escape
    # STRUCTURED_FILE_PATH.
    elif os.path.basename(label_name) != label_name or label_name in (".", ".."):
        gr.Info("数据表名称不能包含路径分隔符")
    elif label_name in os.listdir(STRUCTURED_FILE_PATH):
        gr.Info(f"{label_name}数据表已存在")
    else:
        table_dir = os.path.join(STRUCTURED_FILE_PATH,label_name)
        try:
            os.makedirs(table_dir, exist_ok=True)
            for uploaded in files:
                file_path = uploaded.name
                file_name = os.path.basename(file_path)
                stem, extension = os.path.splitext(file_name)
                extension = extension.lower()
                # Reject unsupported types before touching the file, so no
                # DataFrame from an earlier iteration is reused by mistake.
                if extension not in (".xlsx", ".csv"):
                    raise ValueError(f"unsupported file type: {file_name}")
                destination_file_path = os.path.join(table_dir,file_name)
                shutil.move(file_path,destination_file_path)
                if extension == ".xlsx":
                    df = pd.read_excel(destination_file_path)
                else:
                    df = pd.read_csv(destination_file_path)
                columns = df.columns
                row_lines = []
                for _, row in df.iterrows():
                    infos = ",".join(f"{col}:{row[col]}" for col in columns)
                    row_lines.append(f"【{infos}】")
                # Explicit UTF-8 so the output does not depend on the platform
                # default encoding.
                txt_path = os.path.join(table_dir,stem+'.txt')
                with open(txt_path,"w",encoding="utf-8") as txt_file:
                    txt_file.write("\n".join(row_lines))
                os.remove(destination_file_path)
            gr.Info(f"文件已上传至{label_name}数据表中,请前往创建知识库")
        except Exception as e:
            # Keep the user-facing message, but log the real cause.
            print(f"异常信息:{e}")
            gr.Info(f"请勿重复上传")
81
+
82
+ # 实时更新结构化数据表
83
def update_datatable():
    """Return a Gradio update whose choices are the current entries of ``STRUCTURED_FILE_PATH``."""
    current_tables = os.listdir(STRUCTURED_FILE_PATH)
    return gr.update(choices=current_tables)
85
+
86
+
87
+ # 实时更新非结构化类目
88
def update_label():
    """Return a Gradio update whose choices are the current entries of ``UNSTRUCTURED_FILE_PATH``."""
    current_labels = os.listdir(UNSTRUCTURED_FILE_PATH)
    return gr.update(choices=current_labels)
90
+
91
+ # 删除类目
92
def delete_label(label_name):
    """Delete each selected category directory under ``UNSTRUCTURED_FILE_PATH``.

    ``label_name`` is an iterable of category names, or ``None`` when nothing
    is selected. A ``gr.Info`` message is shown for every directory removed.
    """
    if label_name is None:
        return
    for label in label_name:
        target_dir = os.path.join(UNSTRUCTURED_FILE_PATH,label)
        if not os.path.exists(target_dir):
            continue
        shutil.rmtree(target_dir)
        gr.Info(f"{label}类目已删除")
99
+
100
+ # 删除数据表
101
def delete_data_table(table_name):
    """Delete each selected data-table directory under ``STRUCTURED_FILE_PATH``.

    ``table_name`` is an iterable of table names, or ``None`` when nothing is
    selected. A ``gr.Info`` message is shown for every directory removed.
    """
    if table_name is None:
        return
    for table in table_name:
        target_dir = os.path.join(STRUCTURED_FILE_PATH,table)
        if not os.path.exists(target_dir):
            continue
        shutil.rmtree(target_dir)
        gr.Info(f"{table}数据表已删除")