llzxx123 commited on
Commit
faf086b
·
verified ·
1 Parent(s): 9f8e56d

Upload 15 files

Browse files
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ FinKnowledgeGraph-main/checkpoints/entity_searcher/search_tree.pkl filter=lfs diff=lfs merge=lfs -text
2
+ FinKnowledgeGraph-main/image/所属概念图.png filter=lfs diff=lfs merge=lfs -text
FinKnowledgeGraph-main/README.md ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 基于金融知识图谱的问答系统
2
+
3
+ 下图示例展示了具体问答的过程,问答系统支持简单的查询股东、概念、所属行业等查询,同时支持简单的多轮对话功能和闲聊功能
4
+ ![问答示例](image/问答示例.png)
5
+
6
+
7
+
8
+ 所属概念图
9
+
10
+ ![所属概念图](image/所属概念图.png)
11
+
12
+ 持股图
13
+
14
+ ![持股图](image/持股图.png)
15
+
16
+ ## 1.准备数据
17
+ 利用结构化三元组构建金融知识图谱,数据可在[此处](https://pan.baidu.com/s/1UQfu5c1Y7BfdMS_uNGrZug )下载获得,提取码:`sae3`
18
+
19
+ 下载后将压缩包解压,并**改名为data**,放置在根目录下
20
+
21
+ ## 2.安装环境
22
+ 1. 图数据库neo4j下载<https://neo4j.com/download>,使用需要注册。
23
+
24
+ 2. 安装python第三方库
25
+ `pip install -r requirements.txt`
26
+
27
+ 若害怕第三方库版本冲突可以创建虚拟环境安装(Anaconda用户)
28
+
29
+ ```
30
+ conda create -n your_env_name python=x.x
31
+ conda activate your_env_name
32
+ pip install -r requirements.txt
33
+ ```
34
+
35
+ ## 3.运行
36
+ 结构图
37
+
38
+ ![结构图](image/流程图.png)
39
+ 1. `step1_get_data`文件用来爬取上交所相关数据,但Tushare需要一些权限,必要的数据已经放在*准备数据*环节了,可以直接使用
40
+ 2. `step2_store_to_neo4j.py`文件用来构建知识图谱,运行时需要打开neo4j,代码中实例化py2neo.Graph时需要依照自己创建的neo4j项目名称填写。图谱构建过程会比较漫长。
41
+ 3. `module/classifier.py`利用fasttext创建一个闲聊分类器
42
+ 4. `semantic_parser.py`利用Trie树的Aho-Corasick automaton(AC自动机)算法完成关键词的快速匹配
43
+ 5. `main.py`运行主函数,测试效果
44
+
45
+ ## 4.更多
46
+ 配合前端框架,搭建可进行图谱信息检索及多轮问答对话的交互系统平台。点击跳转:[基于金融知识图谱的知识计算引擎构建](https://github.com/XuekaiChen/ShowKnowledge)
FinKnowledgeGraph-main/checkpoints/classifier/model.bin ADDED
Binary file (41.2 kB). View file
 
FinKnowledgeGraph-main/checkpoints/entity_searcher/search_tree.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b399e924c919e5af9b4dd4bafa3674ec3c64a48d658f22bb449eac90ae24c772
3
+ size 1246553
FinKnowledgeGraph-main/config.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Folder holding the knowledge corpus (entity csv files).
# NOTE(review): save paths use '../' while load paths use './' -- presumably
# the training scripts run from inside module/ and main.py from the project
# root; verify against how the scripts are actually launched.
entity_corpus_path = '../data/knowledge/'

# Where the trained entity searcher (pickled Aho-Corasick tree) is written.
entity_searcher_save_path = '../checkpoints/entity_searcher/search_tree.pkl'

# Where the entity searcher is loaded from at runtime (main.py).
entity_searcher_load_path = './checkpoints/entity_searcher/search_tree.pkl'

# Training corpus for the chit-chat classifier.
classifier_corpus_path = '../data/classifier/chat.train'

# Where the trained fastText classifier model is written.
classifier_save_path = '../checkpoints/classifier/model.bin'

# Where the classifier model is loaded from at runtime (main.py).
classifier_load_path = './checkpoints/classifier/model.bin'

# Canned chit-chat replies, keyed by the classifier's intent label.
chat_responses = {
    'qa': [],
    'greet': [
        'hello,我是小A,小哥哥小姐姐有关于股票的问题可以问我哦',
        '你好,我是小A,输入股票名称或者代码查看详细信息哦',
        '你好,我是小A,可以问我股票相关的问题哦'
    ],
    'goodbye': [
        '再见',
        '不要走,继续聊会呗',
        '拜拜喽,别忘了给个小红心啊',
    ],
    'bot': [
        '没错,我就是集美貌与才智于一身的小A',
        '小A就是我,我就是小A'
    ],
    'safe': [
        '不好意思,您的问题我没太听懂,可以换一种说法嘛',
        '亲亲,这里好像没有您想要的答案'
    ]
}

# Question types and the trigger keywords that identify each of them.
question_types = {
    'concept':
        ['概念', '特征'],
    'holder':
        ['股东', '控制', '控股', '持有'],
    'industry':
        ['行业', '领域'],
}

# Dialogue history: question types and entities from the previous turn,
# used to complete partial follow-up questions.
contexts = {
    'ques_types': None,
    'entities': None
}
FinKnowledgeGraph-main/image/所属概念图.png ADDED

Git LFS Details

  • SHA256: b037df311b59c55d1684fa62464992a7c9e01e03d91d6992bbf09fbe7614c78f
  • Pointer size: 131 Bytes
  • Size of remote file: 125 kB
FinKnowledgeGraph-main/image//346/214/201/350/202/241/345/233/276.png ADDED
FinKnowledgeGraph-main/image//346/265/201/347/250/213/345/233/276.png ADDED
FinKnowledgeGraph-main/image//351/227/256/347/255/224/347/244/272/344/276/213.png ADDED
FinKnowledgeGraph-main/main.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from module.classifier import Classifier
from module.semantic_parser import SemanticParser
from module.graph_matcher import GraphMatcher
from config import classifier_load_path, entity_searcher_load_path, chat_responses, question_types
from random import choice

# Chit-chat intent classifier (fastText).
classifier = Classifier(classifier_load_path)

# Semantic parser: extracts question types and entity mentions from a query.
semantic_parser = SemanticParser(entity_searcher_load_path, question_types)

# Knowledge-graph backend that turns parsed semantics into answers.
graph_matcher = GraphMatcher()


while True:
    query = input('用户: ')
    # Typing 'stop' exits the dialogue loop immediately.
    if query == 'stop':
        break

    # Intent label and probability, used for the chit-chat branch below.
    query_intent_label, query_intent_prob = classifier.predict(query)
    response = ""

    # Knowledge QA: answer from the graph when the query yields both a
    # question type and at least one entity.
    semantics = semantic_parser.predict(query)
    if len(semantics['ques_types']) > 0 and len(semantics['entities']) > 0:
        response = graph_matcher.predict(semantics)
    elif query_intent_prob > 0.8:
        # Confident chit-chat intent: pick a canned reply for the label.
        response = choice(chat_responses[query_intent_label])

    # Fallback when neither branch produced an answer.
    if response == "":
        response = choice(chat_responses['safe'])
    print(f'机器人: {response}')

    # End the session once the user says goodbye.
    if query_intent_label == 'goodbye':
        break
FinKnowledgeGraph-main/module/classifier.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fasttext
2
+ import jieba
3
+ from FinKnowledgeGraph.config import classifier_corpus_path, classifier_save_path
4
+
5
+
6
def train_classifier(input_file_path, model_save_path):
    """Train a fastText supervised classifier and save it to disk.

    Args:
        input_file_path: training corpus with one ``__label__X text`` line
            per sample; also used here to evaluate the trained model.
        model_save_path: destination for the serialized model (.bin).
    """
    # fastText supervised training:
    # https://fasttext.cc/docs/en/supervised-tutorial.html
    model = fasttext.train_supervised(input=input_file_path, label='__label__', lr=0.5)

    # model.test returns (sample count, precision, recall); report the
    # precision and recall measured on the training corpus itself.
    _, precision, recall = model.test(input_file_path)
    print(precision)
    print(recall)

    model.save_model(model_save_path)
16
+
17
+
18
+
19
class Classifier:
    """Chit-chat intent classifier backed by a trained fastText model."""

    def __init__(self, model_load_path):
        # Path of the serialized fastText model (.bin).
        self.model_load_path = model_load_path
        self.model = self.load_model()

    def load_model(self):
        """Load and return the fastText model from ``self.model_load_path``."""
        return fasttext.load_model(self.model_load_path)

    def predict(self, query):
        """Classify *query* and return ``(label, probability)``.

        The ``__label__`` prefix that fastText prepends to class names is
        stripped from the returned label.
        """
        labels, probs = self.model.predict(query)
        return labels[0].replace('__label__', ''), probs[0]
38
+
39
+
40
if __name__ == '__main__':
    # Train the chit-chat classifier from the corpus configured in config.py.
    print('开始训练分类器...')
    train_classifier(classifier_corpus_path, classifier_save_path)
    print('分类器训练成功...')
47
+
48
+
FinKnowledgeGraph-main/module/graph_matcher.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from py2neo import Graph
2
+
3
+
4
class GraphMatcher:
    """Answer parsed questions by running Cypher queries against Neo4j."""

    def __init__(self):
        # NOTE(review): URL and credentials are hard-coded for a local dev
        # instance -- consider moving them into config.py.
        self.graph = Graph('http://localhost:7474/finance_demo/db/', auth=('neo4j', 'neo4j123'))

    def _query(self, cypher_sql, **params):
        """Run a parameterized Cypher query and return its rows as dicts."""
        return self.graph.run(cypher_sql, **params).data()

    def parse_graph(self, ques_types, entities):
        """Build an answer string for every (question type, entity) pair.

        Args:
            ques_types: question-type keys ('concept' | 'holder' | 'industry').
            entities: mapping entity name -> entity type ('股票'/'概念'/'股东').

        Returns:
            Newline-joined answer sentences; '' when nothing matched.

        Fixes over the original: entity values are passed as Cypher
        parameters instead of f-string interpolation (user-supplied names
        containing quotes no longer break the query); empty result sets are
        skipped instead of raising IndexError on rtn[0]; and a stock's
        concepts are now all listed, per the original TODO comment.
        """
        response = ""
        for each_ques_type in ques_types:
            if each_ques_type == 'concept':
                # match 股票 - 所属概念 - 概念
                for entity_name, entity_type in entities.items():
                    if entity_type == '股票':
                        # 1. Concepts a stock belongs to (all rows, not just the first).
                        rtn = self._query(
                            'MATCH (s:`股票`)-[r:所属概念]->(c:`概念`) '
                            'where s.股票名称 = $name return c.概念名称',
                            name=entity_name)
                        if rtn:
                            concepts = '、'.join(row["c.概念名称"] for row in rtn)
                            response += f'{entity_name}所属概念是{concepts}' + '\n'
                    elif entity_type == '概念':
                        # 2. Stocks under a concept (show one as a sample).
                        rtn = self._query(
                            'MATCH (s:`股票`)-[r:所属概念]->(c:`概念`) '
                            'where c.概念名称 = $name return s.股票名称',
                            name=entity_name)
                        if rtn:
                            response += f'{entity_name}概念下有{rtn[0]["s.股票名称"]}等股票' + '\n'
            elif each_ques_type == 'holder':
                # match 股东 - 持有 - 股票
                for entity_name, entity_type in entities.items():
                    if entity_type == '股票':
                        # 1. Holders of a stock.
                        rtn = self._query(
                            'MATCH (s:`股东`)-[r:持有]->(c:`股票`) '
                            'where c.股票名称 = $name return s.股东名称, r.持有量, r.占比',
                            name=entity_name)
                        if rtn:
                            response += (f'{entity_name}的股东是{rtn[0]["s.股东名称"]},'
                                         f'持有股份{rtn[0]["r.持有量"]},占比{rtn[0]["r.占比"]}%') + '\n'
                    elif entity_type == '股东':
                        # 2. Stocks a holder owns.
                        rtn = self._query(
                            'MATCH (s:`股东`)-[r:持有]->(c:`股票`) '
                            'where s.股东名称 = $name return c.股票名称, r.持有量, r.占比',
                            name=entity_name)
                        if rtn:
                            response += (f'{entity_name}下有{rtn[0]["c.股票名称"]},'
                                         f'持有股份{rtn[0]["r.持有量"]},占比{rtn[0]["r.占比"]}%') + '\n'
            elif each_ques_type == 'industry':
                # match 股票, return its industry property
                for entity_name, entity_type in entities.items():
                    if entity_type == '股票':
                        rtn = self._query(
                            'MATCH (s:`股票`) where s.股票名称 = $name return s.行业',
                            name=entity_name)
                        if rtn:
                            response += f'{entity_name}所属行业是{rtn[0]["s.行业"]}' + '\n'
        return response.strip()

    def predict(self, semantics):
        """Answer *semantics*, a dict with 'ques_types' and 'entities'."""
        return self.parse_graph(semantics['ques_types'], semantics['entities'])
FinKnowledgeGraph-main/module/semantic_parser.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from FinKnowledgeGraph.config import entity_corpus_path, entity_searcher_save_path, contexts
2
+ import ahocorasick
3
+ import pandas as pd
4
+ import os
5
+ import pickle
6
+ from tqdm import tqdm
7
+
8
def build_search_tree(input_folder_path, tree_save_path):
    """Build an Aho-Corasick automaton over stock, concept and holder names.

    Each entity name becomes a key whose value is ``(name, entity_type)``,
    e.g. tree.add_word('股票名A', ('股票名A', '股票')). The finished automaton
    is pickled to *tree_save_path*.
    https://pypi.org/project/pyahocorasick/
    """
    tree = ahocorasick.Automaton()

    # (csv file, entity type) pairs; every csv exposes a 'name' column.
    corpora = [
        ('股票信息.csv', '股票'),
        ('概念信息.csv', '概念'),
        ('股东信息.csv', '股东'),
    ]
    for csv_name, entity_type in corpora:
        frame = pd.read_csv(os.path.join(input_folder_path, csv_name), encoding='gbk')
        for _, row in tqdm(frame.iterrows()):
            word = str(row['name'])
            tree.add_word(word, (word, entity_type))

    tree.make_automaton()

    with open(tree_save_path, 'wb') as fout:
        pickle.dump(tree, fout)
35
+
36
+
37
class SemanticParser:
    """Parses a query into question types and mentioned entities.

    Keyword matching decides the question types; a pickled Aho-Corasick
    automaton (built by build_search_tree) finds entity mentions. The
    module-level ``contexts`` dict carries the previous turn's types and
    entities so partial follow-up questions can be completed.
    """

    def __init__(self, entity_model_load_path, question_types):
        # Path of the pickled Aho-Corasick automaton.
        self.entity_model_load_path = entity_model_load_path
        self.entity_model = self.load_model()
        # Mapping question type -> trigger keywords (config.question_types).
        self.question_types = question_types

    def load_model(self):
        """Unpickle and return the entity automaton."""
        with open(self.entity_model_load_path, 'rb') as fin:
            return pickle.load(fin)

    def predict_question_types(self, query):
        """Return every question type whose keyword occurs in *query*.

        Simple keyword matching; could be replaced by a classifier.
        """
        rtn_ques_types = []
        for ques_type, kws in self.question_types.items():
            if any(each_kw in query for each_kw in kws):
                rtn_ques_types.append(ques_type)
        return rtn_ques_types

    def predict(self, query):
        """Parse *query* into ``{'ques_types': [...], 'entities': {...}}``.

        When only one of the two is found, the other is inherited from the
        previous turn. FIX: when there is no previous turn, fall back to an
        empty list/dict instead of ``None`` -- the original propagated
        ``contexts``' initial ``None``, which made
        ``len(semantics['entities'])`` in main.py raise TypeError on a
        first-turn query containing a question word but no entity (or the
        reverse).
        """
        rtn = {}

        # Question types from keyword matching.
        ques_types = self.predict_question_types(query)

        # Entity mentions from the automaton: name -> entity type.
        entities = {}
        for end_index, (entity_name, entity_type) in self.entity_model.iter(query):
            entities[entity_name] = entity_type

        if ques_types and entities:
            # Both found: use them and remember them for follow-ups.
            rtn['ques_types'] = ques_types
            rtn['entities'] = entities
            contexts['ques_types'] = ques_types
            contexts['entities'] = entities
        elif ques_types:
            # Question type only: inherit entities from the dialogue
            # history (empty dict when there is no history yet).
            rtn['ques_types'] = ques_types
            contexts['ques_types'] = ques_types
            rtn['entities'] = contexts['entities'] or {}
        elif entities:
            # Entities only: inherit question types from the dialogue
            # history (empty list when there is no history yet).
            rtn['ques_types'] = contexts['ques_types'] or []
            rtn['entities'] = entities
            contexts['entities'] = entities
        else:
            # Neither found: the query does not involve the knowledge graph.
            rtn['ques_types'] = []
            rtn['entities'] = {}

        return rtn
105
+
106
+
107
if __name__ == '__main__':
    # Build and persist the entity search automaton from the knowledge corpus.
    print('开始训练实体搜索树...')
    build_search_tree(entity_corpus_path, entity_searcher_save_path)
    print('实体搜索树训练成功...')
FinKnowledgeGraph-main/requirements.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ beautifulsoup4==4.10.0
2
+ bs4==0.0.1
3
+ certifi==2021.5.30
4
+ charset-normalizer==2.0.7
5
+ fasttext==0.9.2
6
+ idna==3.2
7
+ interchange==2021.0.3
8
+ jieba==0.42.1
9
+ lxml==4.6.3
10
+ monotonic==1.6
11
+ numpy==1.19.5
12
+ packaging==21.0
13
+ pandas==1.1.5
14
+ pansi==2020.7.3
15
+ py2neo==2021.2.1
16
+ pyahocorasick==1.4.2
17
+ pybind11==2.8.0
18
+ Pygments==2.10.0
19
+ pyparsing==2.4.7
20
+ python-dateutil==2.8.2
21
+ pytz==2021.3
22
+ requests==2.26.0
23
+ simplejson==3.17.5
24
+ six==1.16.0
25
+ soupsieve==2.2.1
26
+ tqdm==4.62.3
27
+ tushare==1.2.67
28
+ urllib3==1.26.7
29
+ websocket-client==1.2.1
FinKnowledgeGraph-main/step1_get_data.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import tushare as ts
import pandas as pd

# Connect to the Tushare pro API.
# SECURITY NOTE(review): the API token is hard-coded and committed; it should
# be moved to an environment variable or config file and the exposed token
# revoked.
pro = ts.pro_api('08fbab8087eb66409ec66452b756beb05ef93388bbba7905fab1f7b5')

# Basic stock information: all stocks currently listed and trading normally.
# https://waditu.com/document/2?doc_id=25
# exchange='SSE' restricts to the Shanghai Stock Exchange;
# list_status='L' restricts to listed companies.
# Requested fields: ts_code (Tushare code), symbol (stock code),
# name (stock name), area (region), industry, fullname.
stock_basic = pro.stock_basic(exchange='SSE', list_status='L', fields='ts_code,symbol,name,area,industry,fullname')
stock_basic.to_csv('./data/knowledge/股票信息1.csv', encoding='gbk')

# Concept-stock classification: https://waditu.com/document/2?doc_id=125
concept = pro.concept()
concept.to_csv('./data/knowledge/概念信息1.csv', encoding='gbk', index=False)

# # 股票概念信息,获取概念下对应的股票(概念信息.csv 文件中共有 358 个 概念)
# concept_details = pd.DataFrame(columns=('id', 'concept_name', 'code', 'name'))
#
# for i in range(359):
#     concept_id = 'TS' + str(i)
#     # 获取该概念下的全部股票 https://waditu.com/document/2?doc_id=126
#     concept_stocks = pro.concept_detail(id=concept_id, field='concept_name,code,name')
#     concept_details = concept_details.append(concept_stocks)
#     concept_details.to_csv('./data/knowledge/股票-概念信息1.csv', encoding='gbk')
#
#
# # 股票持有股东信息
# holder_basic = []
# # TS代码、公告日期、报告期、股东名、持有量、持有占比
# stock_holders = pd.DataFrame(columns=('ts_code', 'ann_date', 'end_date', 'holder_name', 'hold_amount', 'hold_ratio'))
# # 获取时间段内股票的股东信息
# for each_code in stock_basic['ts_code'].tolist():
#     # 前十大股东:https://waditu.com/document/2?doc_id=61
#     curr_holder = pro.top10_holders(ts_code=each_code, start_date='20200101', end_date='20201231')
#     # 在这里,简单起见,只考虑第一个股东信息
#     stock_holders = stock_holders.append(curr_holder.iloc[0:1])
#     # 加入股东名称
#     # 加入时做清洗,即去除 -,比如将 新华人寿保险股份有限公司-分红-个人分红-018L-FH002深 清洗为 新华人寿保险股份有限公司
#     holder_basic.extend(curr_holder.iloc[0:1]['holder_name'].values.tolist().split('-')[0])
#     stock_holders.to_csv('./data/knowledge/股票-股东信息.csv', encoding='gbk')
#
# # 股东信息
# holder_basic_df = pd.DataFrame({
#     '股东名称': list(set(holder_basic))
# })
# holder_basic_df.to_csv('./data/knowledge/股东信息.csv', encoding='gbk', index=False)
55
+
FinKnowledgeGraph-main/step2_store_to_neo4j.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from tqdm import tqdm
import pandas as pd
from py2neo import Graph, Node, Relationship, NodeMatcher

# --------------------------- connect to Neo4j
# py2neo docs: https://py2neo.org/2021.1/
graph = Graph('http://localhost:7474/finance_demo/db/', auth=('neo4j', 'neo4j123'))
print(graph)
# Start from a clean graph: delete every node together with its relationships.
graph.run('match (n) detach delete n')

# --------------------------- create entities
# Stocks
print('创建 股票 实体...')
stock_basic = pd.read_csv('./data/knowledge/股票信息.csv', encoding='gbk')
for idx, each_row in tqdm(stock_basic.iterrows()):
    # https://py2neo.org/2021.1/data/index.html#py2neo.data.Node
    # '股票' (stock) is the node label; keyword arguments become properties.
    each_stock = Node('股票',
                      TS代码=each_row['ts_code'],
                      股票代码=each_row['symbol'],
                      股票名称=each_row['name'],
                      行业=each_row['industry'])
    try:
        # https://py2neo.org/2021.1/workflow.html#py2neo.Transaction.create
        graph.create(each_stock)
    except Exception as e:
        # Log the offending row but keep importing the rest.
        print(f'Error: {e}, data idx: {idx}, data: {each_row}')

# Concepts
print('创建 概念 实体...')
concept = pd.read_csv('./data/knowledge/概念信息.csv', encoding='gbk')
for idx, each_row in tqdm(concept.iterrows()):
    each_concept = Node('概念',
                        概念代码=each_row['code'],
                        概念名称=each_row['name'])
    graph.create(each_concept)

# Holders
print('创建 股东 实体...')
holder = pd.read_csv('./data/knowledge/股东信息.csv', encoding='gbk')
for idx, each_row in tqdm(holder.iterrows()):
    each_holder = Node('股东',
                       股东名称=each_row['name'])
    graph.create(each_holder)

# # --------------------------- 创建关系
# # 方法说明:https://py2neo.org/2021.1/matching.html#py2neo.NodeMatcher
# matcher = NodeMatcher(graph)
#
# # 股票-概念
# print('创建 股票-概念 关系...')
# stock_concept = pd.read_csv('./data/knowledge/股票-概念信息.csv', encoding='gbk')
# for idx, each_row in tqdm(stock_concept.iterrows()):
#     node1 = matcher.match("股票", TS代码=each_row['ts_code']).first()
#     node2 = matcher.match("概念", 概念代码=each_row['Cid']).first()
#     # node1 = graph.nodes.match("股票", TS代码=each_row['ts_code']).first()
#     # node2 = graph.nodes.match("概念", 概念代码=each_row['Cid']).first()
#
#     if node1 is not None and node2 is not None:
#         # 方法说明:https://py2neo.org/2021.1/data/index.html#py2neo.data.Relationship
#         # 格式:Relationship(start_node, type, end_node)
#         r = Relationship(node1, '所属概念', node2)
#         graph.create(r)
#
#
# # 股票-股东
# print('创建 股票-股东 关系...')
# stock_holder = pd.read_csv('./data/knowledge/股票-股东信息.csv', encoding='gbk')
# for idx, each_row in tqdm(stock_holder.iterrows()):
#     # first() 方法返回第一个匹配的 Node,如果找不到则返回 None
#     node1 = graph.nodes.match("股票", TS代码=each_row['ts_code']).first()
#     node2 = graph.nodes.match("股东", 股东名称=each_row['holder_name'].split('-')[0]).first()  # 取最前面的子公司
#     if node1 is not None and node2 is not None:
#         r = Relationship(node2, '持有', node1,
#                          ann_date=each_row['ann_date'],
#                          end_date=each_row['end_date'],
#                          hold_amount=each_row['hold_amount'],
#                          hold_ratio=each_row['hold_ratio'])
#         graph.create(r)

# The py2neo relationship API above kept failing across library versions, so
# the relationships are imported with raw Cypher instead.

# Stock -> concept membership
print('创建 股票-概念 关系...')
stock_concept = pd.read_csv('./data/knowledge/股票-概念信息.csv', encoding='gbk')
# Parameterized Cypher ($ts_code / $cid): values with quotes can no longer
# break the statement, unlike the previous str.format interpolation; the
# constant query text is also hoisted out of the loop.
concept_rel_sql = ('MATCH (a:`股票`),(b:`概念`) '
                   'WHERE a.`TS代码` = $ts_code AND b.`概念代码` = $cid '
                   'CREATE (a)-[r:所属概念] -> (b) RETURN r')
for idx, each_row in tqdm(stock_concept.iterrows()):
    graph.run(concept_rel_sql,
              ts_code=str(each_row['ts_code']),
              cid=str(each_row['Cid']))

# Holder -> stock holdings
print('创建 股票-股东 关系...')
stock_holder = pd.read_csv('./data/knowledge/股票-股东信息.csv', encoding='gbk')
holder_rel_sql = ('MATCH (a:`股东`),(b:`股票`) '
                  'WHERE a.`股东名称` = $holder AND b.`TS代码` = $ts_code '
                  'CREATE (a)-[r:持有{公告日期:$ann_date, 报告期:$end_date, '
                  '持有量:$amount, 占比:$ratio}] -> (b)')
for idx, each_row in tqdm(stock_holder.iterrows()):
    graph.run(holder_rel_sql,
              # keep only the parent company part of the holder name
              holder=str(each_row['holder_name'].split('-')[0]),
              ts_code=str(each_row['ts_code']),
              ann_date=str(each_row['ann_date']),
              end_date=str(each_row['end_date']),
              amount=str(each_row['hold_amount']),
              ratio=str(each_row['hold_ratio']))


# FIX: the original closing message was mojibake ('...导入成��...').
print('实体 关系 导入成功...')