Build error
Fishead_East committed
Commit · 056dcfe
Parent(s): 96ee390
Test the local vector database files (测试本地向量库文件)
gushiwen_vector_database/Process.ipynb CHANGED

@@ -91,15 +91,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
-   "metadata": {
+   "execution_count": 1,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-08-15T04:30:53.455501Z",
+     "start_time": "2023-08-15T04:30:09.405354Z"
+    }
+   },
    "outputs": [
     {
-     "
-     "
-     "
-
-     "
+     "ename": "NameError",
+     "evalue": "name 'get_vector' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+      "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
+      "Cell \u001B[0;32mIn[1], line 55\u001B[0m\n\u001B[1;32m 53\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;18m__name__\u001B[39m \u001B[38;5;241m==\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m__main__\u001B[39m\u001B[38;5;124m'\u001B[39m:\n\u001B[1;32m 54\u001B[0m input_text \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m李白的诗歌\u001B[39m\u001B[38;5;124m'\u001B[39m\n\u001B[0;32m---> 55\u001B[0m knowledges \u001B[38;5;241m=\u001B[39m \u001B[43mget_domain_knowledge\u001B[49m\u001B[43m(\u001B[49m\u001B[43minput_text\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m5\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[1;32m 56\u001B[0m \u001B[38;5;28mprint\u001B[39m(knowledges, \u001B[38;5;28mlen\u001B[39m(knowledges))\n",
+      "Cell \u001B[0;32mIn[1], line 22\u001B[0m, in \u001B[0;36mget_domain_knowledge\u001B[0;34m(text, n, threshold)\u001B[0m\n\u001B[1;32m 20\u001B[0m vectors \u001B[38;5;241m=\u001B[39m read_local_vectors()\n\u001B[1;32m 21\u001B[0m \u001B[38;5;66;03m# 将输入文本转化为向量 - Convert the input text into a vector\u001B[39;00m\n\u001B[0;32m---> 22\u001B[0m input_vector \u001B[38;5;241m=\u001B[39m \u001B[43mget_vector\u001B[49m(text)\n\u001B[1;32m 23\u001B[0m \u001B[38;5;66;03m# 将输入文本转化为numpy数组 - Convert the input text into a numpy array\u001B[39;00m\n\u001B[1;32m 24\u001B[0m input_vector \u001B[38;5;241m=\u001B[39m input_vector\u001B[38;5;241m.\u001B[39mdetach()\u001B[38;5;241m.\u001B[39mnumpy()\n",
+      "\u001B[0;31mNameError\u001B[0m: name 'get_vector' is not defined"
     ]
    }
   ],
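The output committed into this cell shows that get_vector is not defined in the notebook when get_domain_knowledge runs. Below is a minimal sketch of such a helper, assuming the same all-mpnet-base-v2 model and mean-pooling scheme used in generate_vectors.py; only the name and call signature of get_vector come from the traceback, and the body is an assumption rather than the repository's actual implementation.

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# Assumed model id; the repository loads the same model from a local snapshot path.
MODEL_ID = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModel.from_pretrained(MODEL_ID)

def mean_pooling(model_output, attention_mask):
    # Average the token embeddings, weighted by the attention mask
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

def get_vector(text):
    # Encode one text into a single sentence embedding of shape (1, d)
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        model_output = model(**encoded)
    embedding = mean_pooling(model_output, encoded["attention_mask"])
    return F.normalize(embedding, p=2, dim=1)

Defining (or importing) get_vector in the notebook before calling get_domain_knowledge would avoid the NameError shown above.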
gushiwen_vector_database/generate_vectors.py CHANGED

@@ -13,8 +13,8 @@ from transformers import AutoTokenizer, AutoModel
 import json
 
 # 加载模型 - Load model
-tokenizer = AutoTokenizer.from_pretrained("sentence-transformers
-model = AutoModel.from_pretrained("sentence-transformers
+tokenizer = AutoTokenizer.from_pretrained("./embedding_model/models--sentence-transformers--all-mpnet-base-v2/snapshots/bd44305fd6a1b43c16baf96765e2ecb20bca8e1d")
+model = AutoModel.from_pretrained("./embedding_model/models--sentence-transformers--all-mpnet-base-v2/snapshots/bd44305fd6a1b43c16baf96765e2ecb20bca8e1d")
 
 # 平均池化 - Average pooling
 def mean_pooling(model_output, attention_mask):
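The new from_pretrained() arguments point at a local Hugging Face cache layout (models--<org>--<name>/snapshots/<revision>) instead of a hub model id, presumably so the model can be loaded without hitting the Hub at runtime. A sketch of how such a local cache could be produced beforehand with huggingface_hub; the cache_dir value is an assumption chosen to match the paths in this diff.

from huggingface_hub import snapshot_download

# Download the model once into ./embedding_model; this creates the
# models--sentence-transformers--all-mpnet-base-v2/snapshots/<revision>
# layout that the updated from_pretrained() calls point at.
local_path = snapshot_download(
    repo_id="sentence-transformers/all-mpnet-base-v2",
    cache_dir="./embedding_model",
)
print(local_path)  # the snapshot directory to pass to from_pretrained()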
gushiwen_vector_database/search_vectors.py CHANGED

@@ -52,18 +52,22 @@ def get_domain_knowledge(text, n, threshold=0.2):
     # 取出相似度最高的前n个文本的序号 - Take out the serial number of the top n texts with the highest similarity
     knowledges_ids = similarity_sorted[:n].tolist()
     # 读取知识库 - Read the knowledge base
-
-
-
-
-
-
-
+    with open('gushiwen.json', 'r', encoding='utf8') as file:
+        file_content = file.read()
+        knowledges = json.loads(file_content)
+
+    # 去除概率小于阈值的知识 - Remove knowledge with probability less than threshold
+    knowledges_ids = [i for i in knowledges_ids if similarity[0][i] > threshold]
+    # 直接输出资料文本 - directly output the text
+    knowledges = [str(knowledge) for knowledge in knowledges]
+    # 取出相似度最高的前n个文本 - Take out the top n texts with the highest similarity
+    knowledges = [knowledges[i] for i in knowledges_ids]
     return knowledges
     return ''
 
+
 # sample:
-
-
-
-
+if __name__ == '__main__':
+    input_text = '李白的诗歌'
+    knowledges = get_domain_knowledge(input_text, 5)
+    print(knowledges, len(knowledges))
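The added lines rely on similarity and similarity_sorted, which are computed earlier in get_domain_knowledge and are not part of this hunk. A minimal sketch of how those values could be produced with numpy cosine similarity, assuming the stored vectors form an (N, d) array and the query vector is (1, d); the variable names follow this file, but the actual code above line 52 is not shown in the diff.

import numpy as np

def rank_by_cosine_similarity(input_vector, vectors):
    # input_vector: (1, d) query embedding; vectors: (N, d) stored embeddings
    input_norm = input_vector / np.linalg.norm(input_vector, axis=1, keepdims=True)
    vec_norm = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
    similarity = input_norm @ vec_norm.T             # (1, N) cosine scores
    similarity_sorted = np.argsort(-similarity[0])   # indices, best match first
    return similarity, similarity_sorted

With values shaped like this, similarity_sorted[:n].tolist() picks the indices of the n closest entries and similarity[0][i] > threshold drops weak matches, which matches how the new lines in this hunk use them.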