Fishead_East committed on
Commit
056dcfe
·
1 Parent(s): 96ee390

测试本地向量库文件

Browse files
gushiwen_vector_database/Process.ipynb CHANGED
@@ -91,15 +91,24 @@
91
  },
92
  {
93
  "cell_type": "code",
94
- "execution_count": 13,
95
- "metadata": {},
 
 
 
 
 
96
  "outputs": [
97
  {
98
- "name": "stdout",
99
- "output_type": "stream",
100
- "text": [
101
- "[[0.49482936 0.5112034 0.49520627 ... 0.45510006 0.47185707 0.53729707]]\n",
102
- "['{\\'id\\': 4387, \\'href\\': \\'/shiwenv_97dccf96451a.aspx\\', \\'title\\': \\'长歌续短歌\\', \\'author\\': \\'李贺\\', \\'dynasty\\': \\'唐代\\', \\'content\\': \\'<br/> 长歌破衣襟,短歌断白发。秦王不可见,旦夕成内热。渴饮壶中酒,饥拔陇头粟。凄凉四月阑,千里一时绿。夜峰何离离,明月落石底。徘徊沿石寻,照出高峰外。不得与之游,歌成鬓先改。 <br/> \\', \\'sons\\': {\\'译文及注释\\': {\\'content\\': \\'译文<br/>写长歌把我的衣襟磨破,吟短诗使我的白发脱落。<br/>谒见秦王没有机缘,日夜焦虑我心中烦热。<br/>喝口壶中酒,聊以解渴,拔把垅头谷,暂充饥饿。<br/>四月将尽,千里大地一片绿色,自己却贫困潦倒,不由人感到凄凉难过。<br/>夜幕中峰峦起伏重叠,明亮的月光却只向谷底照射。<br/>我来来回回沿着石崖寻觅,可它又在高峰之外不可捉摸。<br/>自己终不得与其共事,歌成而头发早已变白。<br/>注释<br/>长歌续短歌:题目从古乐府《长歌行》、《短歌行》化出。<br/>长歌二句:互文的修辞手法,长歌短歌,唱破衣襟,吟断白发。<br/>秦王:指唐宪宗。宪宗当时在秦地,所以称为秦王。<br/>旦夕:日日夜夜。内热:内心急躁而炽热。<br/>陇头:田间地头。此二句比喻诗人如饥似渴地思念唐宪宗。<br/>凄凉二句:因为困顿潦倒,看到初夏万物茂盛,更加自感凄凉。<br/>离离:重叠、罗列的样子。<br/>明月:比喻唐宪宗。这两句的意思为:夜峰罗列,月光照耀在落石下,不及他处。比喻君恩被群小阻隔。<br/>裴回:即“徘徊”,彷徨不进貌。<br/>之:代词,代指唐宪宗。<br/>鬓先改:鬓发已经变白。<br/>\\', \\'cankao\\': \\'<br/>参考资料:完善<br/><br/>1、<br/>冯浩非 徐传武.李贺诗选译.成都:巴蜀书社,1991:112-114<br/><br/>\\'}, \\'赏析\\': {\\'content\\': \\'&nbsp;&nbsp;开头二句紧扣诗题,有愁苦万分,悲歌不已的意思。“破”、“断”二字,用得很奇特,但也都入情入理。古人有“长歌当哭”的话,长歌当哭,泪洒胸怀,久而久之,那衣襟自然会破烂。杜甫有“白头搔更短,浑欲不胜簪”(《春望》)的诗句,人到烦恼之至,无计可施的时候,常常会下意识地搔爬头皮,白发越搔越稀。这首诗的“断”可能就是由杜甫诗中的“短”生发出来的。<br/>&nbsp;&nbsp;三、四句写进见“秦王”的愿望不能实现,因而内心更加郁闷,像是烈火中烧,炽热难熬。“秦王”当指唐宪宗。王琦说:“时天子居秦地,故以秦王为喻。”(《李长吉歌诗汇解》)李贺在世时,宪宗还能有所作为,曾采取削藩措施,重整朝政,史家有“中兴”之誉。李贺对这样的君主是寄托希望的。他在考进士受到排挤打击之后,幻想他能像马周受知于唐太宗那样,直接去见皇帝,以实现他的政治理想。<br/>&nbsp;&nbsp;五、六句具体描述诗人苦闷的心情与清贫的生活,与开头二句相照应、相补充。“渴饮壶中酒”,渴是“内热”的表现,饮酒的目的在于平息内热、消愁解闷;“饥拔陇头粟”,为求见“秦王”不惜忍饥挨饿,靠从地里拔粟充饥。<br/>&nbsp;&nbsp;七、八句写景。“凄凉四月阑,千里一时绿”,初夏已尽,盛夏来临,草木葱翠,生气勃勃,原是不会有凄凉之感的。然而“绿肥红瘦”,万花摇落,诗人又不禁为之唏嘘感叹。下面的“千里”句,故意用欢乐的色调映衬凄苦的情怀,颇有“春物与愁客,遇时各有违”(孟郊《春愁》)的意味,这样反复渲染,有一唱三叹之妙。诗人述怀从景物落笔,寄情于景,意味深长。<br/>&nbsp;&nbsp;后六句采用借喻、拟人等修辞手法,表面上写景物,实际上写人事。“夜峰何离离,明月落石底”。夜间的峰峦一个挨一个地排列着,黝黑而高,竟把那明朗的月亮遮得无影无踪,让诗人感到纳闷。诗人沿着那崎岖的石径四处寻觅,忽而发现它在高峰之外。峰峦阻隔,高不可攀,心中异常痛苦,因而慷慨悲歌,鬓发也在不知不觉中变得更加苍白,忧伤催人老。“夜峰”、“明月”等句喻意微婉。“明月”借喻唐宪宗,夜峰指代他身边的卿相们,意思是宪宗为一些大臣所包围,闭目塞聪,就像月亮为峰峦所阻隔,虽有明光,却不能下达。这些表明诗人深知当时朝廷的弊病,他想向宪宗陈述国事,以匡时救弊,然而“山”高“月”远,投告无门,只有暗自忧伤而已。<br/>&nbsp;&nbsp;杜牧在《李长吉歌诗叙》中评李贺的诗说:“盖《骚》之苗裔,理虽不及,辞或过之。《骚》
有感怨刺怼,言及君臣理乱,时有以激发人意。乃贺所为,得无有是?”这首诗在立意和表现方法的运用上,都与《离骚》很相似。“夜峰何离离,明月落石底”,寄托遥深。诗人把自己的意志和情绪融化在生动的比喻和深邃的意境中,含蓄隽永,优美动人,颇得《离骚》的神髓。<br/>\\', \\'cankao\\': \\'<br/>参考资料:完善<br/><br/>1、<br/>朱世英 等.唐诗鉴赏辞典.上海:上海辞书出版社,1983:1024-1025<br/><br/>\\'}}, \\'links\\': \"[\\'/shiwenv_b6325c42ea5c.aspx\\', \\'/shiwenv_70e98eb5d973.aspx\\', \\'/shiwenv_f66cbdaac064.aspx\\']\"}'] 1\n"
 
 
 
 
103
  ]
104
  }
105
  ],
 
91
  },
92
  {
93
  "cell_type": "code",
94
+ "execution_count": 1,
95
+ "metadata": {
96
+ "ExecuteTime": {
97
+ "end_time": "2023-08-15T04:30:53.455501Z",
98
+ "start_time": "2023-08-15T04:30:09.405354Z"
99
+ }
100
+ },
101
  "outputs": [
102
  {
103
+ "ename": "NameError",
104
+ "evalue": "name 'get_vector' is not defined",
105
+ "output_type": "error",
106
+ "traceback": [
107
+ "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
108
+ "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
109
+ "Cell \u001B[0;32mIn[1], line 55\u001B[0m\n\u001B[1;32m 53\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;18m__name__\u001B[39m \u001B[38;5;241m==\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m__main__\u001B[39m\u001B[38;5;124m'\u001B[39m:\n\u001B[1;32m 54\u001B[0m input_text \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m李白的诗歌\u001B[39m\u001B[38;5;124m'\u001B[39m\n\u001B[0;32m---> 55\u001B[0m knowledges \u001B[38;5;241m=\u001B[39m \u001B[43mget_domain_knowledge\u001B[49m\u001B[43m(\u001B[49m\u001B[43minput_text\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m5\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[1;32m 56\u001B[0m \u001B[38;5;28mprint\u001B[39m(knowledges, \u001B[38;5;28mlen\u001B[39m(knowledges))\n",
110
+ "Cell \u001B[0;32mIn[1], line 22\u001B[0m, in \u001B[0;36mget_domain_knowledge\u001B[0;34m(text, n, threshold)\u001B[0m\n\u001B[1;32m 20\u001B[0m vectors \u001B[38;5;241m=\u001B[39m read_local_vectors()\n\u001B[1;32m 21\u001B[0m \u001B[38;5;66;03m# 将输入文本转化为向量 - Convert the input text into a vector\u001B[39;00m\n\u001B[0;32m---> 22\u001B[0m input_vector \u001B[38;5;241m=\u001B[39m \u001B[43mget_vector\u001B[49m(text)\n\u001B[1;32m 23\u001B[0m \u001B[38;5;66;03m# 将输入文本转化为numpy数组 - Convert the input text into a numpy array\u001B[39;00m\n\u001B[1;32m 24\u001B[0m input_vector \u001B[38;5;241m=\u001B[39m input_vector\u001B[38;5;241m.\u001B[39mdetach()\u001B[38;5;241m.\u001B[39mnumpy()\n",
111
+ "\u001B[0;31mNameError\u001B[0m: name 'get_vector' is not defined"
112
  ]
113
  }
114
  ],
gushiwen_vector_database/generate_vectors.py CHANGED
@@ -13,8 +13,8 @@ from transformers import AutoTokenizer, AutoModel
13
  import json
14
 
15
  # 加载模型 - Load model
16
- tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2", cache_dir='embedding_model', model_max_length=512)
17
- model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2e", cache_dir='embedding_model')
18
 
19
  # 平均池化 - Average pooling
20
  def mean_pooling(model_output, attention_mask):
 
13
  import json
14
 
15
  # 加载模型 - Load model
16
+ tokenizer = AutoTokenizer.from_pretrained("./embedding_model/models--sentence-transformers--all-mpnet-base-v2/snapshots/bd44305fd6a1b43c16baf96765e2ecb20bca8e1d")
17
+ model = AutoModel.from_pretrained("./embedding_model/models--sentence-transformers--all-mpnet-base-v2/snapshots/bd44305fd6a1b43c16baf96765e2ecb20bca8e1d")
18
 
19
  # 平均池化 - Average pooling
20
  def mean_pooling(model_output, attention_mask):
gushiwen_vector_database/search_vectors.py CHANGED
@@ -52,18 +52,22 @@ def get_domain_knowledge(text, n, threshold=0.2):
52
  # 取出相似度最高的前n个文本的序号 - Take out the serial number of the top n texts with the highest similarity
53
  knowledges_ids = similarity_sorted[:n].tolist()
54
  # 读取知识库 - Read the knowledge base
55
- knowledges = json.load(open('gushiwen.json', 'r', encoding='utf8').readlines())
56
- # 去除概率小于阈值的知识 - Remove knowledge with probability less than threshold
57
- knowledges_ids = [i for i in knowledges_ids if similarity[0][i] > threshold]
58
- # 直接输出资料文本 - directly output the text
59
- knowledges = [str(knowledge) for knowledge in knowledges]
60
- # 取出相似度最高的前n个文本 - Take out the top n texts with the highest similarity
61
- knowledges = [knowledges[i] for i in knowledges_ids]
 
 
 
62
  return knowledges
63
  return ''
64
 
 
65
  # sample:
66
- # if __name__ == '__main__':
67
- # input_text = '这是一条测试样本'
68
- # knowledges = get_domain_knowledge(input_text, 5)
69
- # print(knowledges, len(knowledges))
 
52
  # 取出相似度最高的前n个文本的序号 - Take out the serial number of the top n texts with the highest similarity
53
  knowledges_ids = similarity_sorted[:n].tolist()
54
  # 读取知识库 - Read the knowledge base
55
+ with open('gushiwen.json', 'r', encoding='utf8') as file:
56
+ file_content = file.read()
57
+ knowledges = json.loads(file_content)
58
+
59
+ # 去除概率小于阈值的知识 - Remove knowledge with probability less than threshold
60
+ knowledges_ids = [i for i in knowledges_ids if similarity[0][i] > threshold]
61
+ # 直接输出资料文本 - directly output the text
62
+ knowledges = [str(knowledge) for knowledge in knowledges]
63
+ # 取出相似度最高的前n个文本 - Take out the top n texts with the highest similarity
64
+ knowledges = [knowledges[i] for i in knowledges_ids]
65
  return knowledges
66
  return ''
67
 
68
+
69
  # sample:
70
+ if __name__ == '__main__':
71
+ input_text = '李白的诗歌'
72
+ knowledges = get_domain_knowledge(input_text, 5)
73
+ print(knowledges, len(knowledges))