Spaces:
No application file
No application file
Upload 15 files
Browse files- .gitattributes +2 -0
- FinKnowledgeGraph-main/README.md +46 -0
- FinKnowledgeGraph-main/checkpoints/classifier/model.bin +0 -0
- FinKnowledgeGraph-main/checkpoints/entity_searcher/search_tree.pkl +3 -0
- FinKnowledgeGraph-main/config.py +56 -0
- FinKnowledgeGraph-main/image/所属概念图.png +3 -0
- FinKnowledgeGraph-main/image//346/214/201/350/202/241/345/233/276.png +0 -0
- FinKnowledgeGraph-main/image//346/265/201/347/250/213/345/233/276.png +0 -0
- FinKnowledgeGraph-main/image//351/227/256/347/255/224/347/244/272/344/276/213.png +0 -0
- FinKnowledgeGraph-main/main.py +37 -0
- FinKnowledgeGraph-main/module/classifier.py +48 -0
- FinKnowledgeGraph-main/module/graph_matcher.py +57 -0
- FinKnowledgeGraph-main/module/semantic_parser.py +113 -0
- FinKnowledgeGraph-main/requirements.txt +29 -0
- FinKnowledgeGraph-main/step1_get_data.py +55 -0
- FinKnowledgeGraph-main/step2_store_to_neo4j.py +103 -0
.gitattributes
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FinKnowledgeGraph-main/checkpoints/entity_searcher/search_tree.pkl filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
FinKnowledgeGraph-main/image/所属概念图.png filter=lfs diff=lfs merge=lfs -text
|
FinKnowledgeGraph-main/README.md
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 基于金融知识图谱的问答系统
|
| 2 |
+
|
| 3 |
+
下图示例展示了具体问答的过程,问答系统支持简单的查询股东、概念、所属行业等查询,同时支持简单的多轮对话功能和闲聊功能
|
| 4 |
+

|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
所属概念图
|
| 9 |
+
|
| 10 |
+

|
| 11 |
+
|
| 12 |
+
持股图
|
| 13 |
+
|
| 14 |
+

|
| 15 |
+
|
| 16 |
+
## 1.准备数据
|
| 17 |
+
利用结构化三元组构建金融知识图谱,数据可在[此处](https://pan.baidu.com/s/1UQfu5c1Y7BfdMS_uNGrZug )下载获得,提取码:`sae3`
|
| 18 |
+
|
| 19 |
+
下载后将压缩包解压,并**改名为data**,放置在根目录下
|
| 20 |
+
|
| 21 |
+
## 2.安装环境
|
| 22 |
+
1. 图数据库neo4j下载<https://neo4j.com/download>,使用需要注册。
|
| 23 |
+
|
| 24 |
+
2. 安装python第三方库
|
| 25 |
+
`pip install -r requirements.txt`
|
| 26 |
+
|
| 27 |
+
若害怕第三方库版本冲突可以创建虚拟环境安装(Anaconda用户)
|
| 28 |
+
|
| 29 |
+
```
|
| 30 |
+
conda create -n your_env_name python=x.x
|
| 31 |
+
conda activate your_env_name
|
| 32 |
+
pip install -r requirements.txt
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
## 3.运行
|
| 36 |
+
结构图
|
| 37 |
+
|
| 38 |
+

|
| 39 |
+
1. `step1_get_data`文件用来爬取上交所相关数据,但Tushare需要一些权限,必要的数据已经放在*准备数据*环节了,可以直接使用
|
| 40 |
+
2. `step2_store_to_neo4j.py`文件用来构建知识图谱,运行时需要打开neo4j,代码中实例化py2neo.Graph时需要依照自己创建的neo4j项目名称填写。图谱构建过程会比较漫长。
|
| 41 |
+
3. `module/classifier.py`利用fasttext创建一个闲聊分类器
|
| 42 |
+
4. `semantic_parser.py`利用Trie树的Aho-Corasick automaton(AC自动机)算法完成关键词的快速匹配
|
| 43 |
+
5. `main.py`运行主函数,测试效果
|
| 44 |
+
|
| 45 |
+
## 4.更多
|
| 46 |
+
配合前端框架,搭建可进行图谱信息检索及多轮问答对话的交互系统平台。点击跳转:[基于金融知识图谱的知识计算引擎构建](https://github.com/XuekaiChen/ShowKnowledge)
|
FinKnowledgeGraph-main/checkpoints/classifier/model.bin
ADDED
|
Binary file (41.2 kB). View file
|
|
|
FinKnowledgeGraph-main/checkpoints/entity_searcher/search_tree.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b399e924c919e5af9b4dd4bafa3674ec3c64a48d658f22bb449eac90ae24c772
|
| 3 |
+
size 1246553
|
FinKnowledgeGraph-main/config.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Path to the knowledge corpus folder (CSV files with stock/concept/holder entities).
entity_corpus_path = '../data/knowledge/'

# Path where the trained entity searcher (AC automaton pickle) is saved.
# NOTE(review): save paths use '../' while load paths use './' — they resolve to
# different locations depending on the working directory (build scripts are run
# from inside module/, main.py from the project root); confirm this is intended.
entity_searcher_save_path = '../checkpoints/entity_searcher/search_tree.pkl'

# Path the entity searcher is loaded from at runtime.
entity_searcher_load_path = './checkpoints/entity_searcher/search_tree.pkl'

# Training corpus for the chit-chat intent classifier (fasttext format).
classifier_corpus_path = '../data/classifier/chat.train'

# Path where the trained classifier model is saved.
classifier_save_path = '../checkpoints/classifier/model.bin'

# Path the classifier model is loaded from at runtime.
classifier_load_path = './checkpoints/classifier/model.bin'

# Canned chit-chat responses, keyed by intent label predicted by the classifier.
chat_responses = {
    'qa': [],
    'greet': [
        'hello,我是小A,小哥哥小姐姐有关于股票的问题可以问我哦',
        '你好,我是小A,输入股票名称或者代码查看详细信息哦',
        '你好,我是小A,可以问我股票相关的问题哦'
    ],
    'goodbye': [
        '再见',
        '不要走,继续聊会呗',
        '拜拜喽,别忘了给个小红心啊',
    ],
    'bot': [
        '没错,我就是集美貌与才智于一身的小A',
        '小A就是我,我就是小A'
    ],
    # Fallback replies used when no answer could be produced.
    'safe': [
        '不好意思,您的问题我没太听懂,可以换一种说法嘛',
        '亲亲,这里好像没有您想要的答案'
    ]
}

# Keyword lists used to detect each question type in a user query.
question_types = {
    'concept':
        ['概念', '特征'],
    'holder':
        ['股东', '控制', '控股', '持有'],
    'industry':
        ['行业', '领域'],
}

# Dialogue context: question types and entities carried over from the previous
# turn, used to resolve elliptical follow-up questions. Both start as None.
contexts = {
    'ques_types': None,
    'entities': None
}
FinKnowledgeGraph-main/image/所属概念图.png
ADDED
|
Git LFS Details
|
FinKnowledgeGraph-main/image//346/214/201/350/202/241/345/233/276.png
ADDED
|
FinKnowledgeGraph-main/image//346/265/201/347/250/213/345/233/276.png
ADDED
|
FinKnowledgeGraph-main/image//351/227/256/347/255/224/347/244/272/344/276/213.png
ADDED
|
FinKnowledgeGraph-main/main.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from module.classifier import Classifier
from module.semantic_parser import SemanticParser
from module.graph_matcher import GraphMatcher
from config import classifier_load_path, entity_searcher_load_path, chat_responses, question_types
from random import choice

# Chit-chat intent classifier (fasttext model).
classifier = Classifier(classifier_load_path)

# Semantic parser: predicts the question types and the entities in a query.
semantic_parser = SemanticParser(entity_searcher_load_path, question_types)

# Graph-database query layer (Neo4j via py2neo).
graph_matcher = GraphMatcher()


# Simple console REPL. Exit by typing 'stop' or by triggering the 'goodbye'
# chit-chat intent.
while True:
    query = input('用户: ')
    if query == 'stop':
        break
    else:
        # Predicted chit-chat label and its probability.
        query_intent_label, query_intent_prob = classifier.predict(query)
        response = ""
        # Knowledge-graph QA takes priority: answer from the graph whenever
        # both a question type and at least one entity were recognized.
        semantics = semantic_parser.predict(query)
        if len(semantics['ques_types']) > 0 and len(semantics['entities']) > 0:
            response = graph_matcher.predict(semantics)
        # Otherwise fall back to chit-chat, but only when the classifier is
        # confident (probability threshold 0.8).
        elif query_intent_prob > 0.8:
            response = choice(chat_responses[query_intent_label])
        # Last resort: a canned "safe" reply.
        if response == "":
            response = choice(chat_responses['safe'])
        print(f'机器人: {response}')

        if query_intent_label == 'goodbye':
            break
FinKnowledgeGraph-main/module/classifier.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import fasttext
import jieba
# NOTE(review): main.py imports `config` while this module imports
# `FinKnowledgeGraph.config` — one of the two only works depending on how the
# package is laid out on sys.path; confirm and unify.
from FinKnowledgeGraph.config import classifier_corpus_path, classifier_save_path


def train_classifier(input_file_path, model_save_path):
    """Train a fasttext supervised classifier and save the model binary.

    Args:
        input_file_path: fasttext-format training file ('__label__X text').
        model_save_path: path the trained model binary is written to.
    """
    # fasttext supervised API:
    # https://fasttext.cc/docs/en/supervised-tutorial.html
    model = fasttext.train_supervised(input=input_file_path, label='__label__', lr=0.5)
    # model.test returns (n_samples, precision, recall); print precision and
    # recall measured on the training file itself.
    result = model.test(input_file_path)
    print(result[1])
    print(result[2])
    model.save_model(model_save_path)


class Classifier:
    """Chit-chat intent classifier backed by a trained fasttext model."""

    def __init__(self, model_load_path):
        self.model_load_path = model_load_path
        self.model = self.load_model()

    def load_model(self):
        """Load the fasttext model from disk."""
        return fasttext.load_model(self.model_load_path)

    def predict(self, query):
        """Predict the intent of *query*.

        Returns:
            Tuple of (label, probability), with the '__label__' prefix
            stripped from the label.
        """
        # fasttext.predict returns ((labels,), (probs,)) for a single input.
        # https://fasttext.cc/docs/en/supervised-tutorial.html
        query_intent = self.model.predict(query)
        return query_intent[0][0].replace('__label__', ''), query_intent[1][0]


if __name__ == '__main__':

    print('开始训练分类器...')

    train_classifier(classifier_corpus_path, classifier_save_path)

    print('分类器训练成功...')
FinKnowledgeGraph-main/module/graph_matcher.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from py2neo import Graph


class GraphMatcher:
    """Answers parsed questions by running Cypher queries against Neo4j."""

    def __init__(self):
        # Connection must match the local Neo4j instance populated by
        # step2_store_to_neo4j.py (same URL and credentials).
        self.graph = Graph('http://localhost:7474/finance_demo/db/', auth=('neo4j', 'neo4j123'))

    def parse_graph(self, ques_types, entities):
        """Build and run one Cypher query per (question type, entity) pair.

        Args:
            ques_types: list of question type keys ('concept'/'holder'/'industry').
            entities: mapping of entity name -> entity type ('股票'/'概念'/'股东').

        Returns:
            Answer text, one line per answered question; '' when nothing matched.
        """
        # NOTE(security): entity names are interpolated into Cypher directly; a
        # name containing a double quote would break or alter the query. Prefer
        # parameterized queries, e.g. graph.run(cypher, name=entity_name).
        response = ""
        for each_ques_type in ques_types:
            if each_ques_type == 'concept':
                # Pattern: (stock) -[所属概念]-> (concept)
                for entity_name, entity_type in entities.items():
                    # 1. Which concepts does this stock belong to?
                    if entity_type == '股票':
                        cypher_sql = f'MATCH (s:`股票`)-[r:所属概念]->(c:`概念`) where s.股票名称 = "{entity_name}" return c.概念名称'
                        rtn = self.graph.run(cypher_sql).data()
                        # Guard against empty results, and list every matched
                        # concept instead of only the first row.
                        if rtn:
                            concept_names = '、'.join(row['c.概念名称'] for row in rtn)
                            response += f'{entity_name}所属概念是{concept_names}' + '\n'
                    # 2. Which stocks fall under this concept?
                    elif entity_type == '概念':
                        cypher_sql = f'MATCH (s:`股票`)-[r:所属概念]->(c:`概念`) where c.概念名称 = "{entity_name}" return s.股票名称'
                        rtn = self.graph.run(cypher_sql).data()
                        if rtn:
                            response += f'{entity_name}概念下有{rtn[0]["s.股票名称"]}等股票' + '\n'
            elif each_ques_type == 'holder':
                # Pattern: (holder) -[持有]-> (stock)
                for entity_name, entity_type in entities.items():
                    # 1. Who holds this stock?
                    if entity_type == '股票':
                        cypher_sql = f'MATCH (s:`股东`)-[r:持有]->(c:`股票`) where c.股票名称 = "{entity_name}" return s.股东名称, r.持有量, r.占比'
                        rtn = self.graph.run(cypher_sql).data()
                        if rtn:
                            response += f'{entity_name}的股东是{rtn[0]["s.股东名称"]},持有股份{rtn[0]["r.持有量"]},占比{rtn[0]["r.占比"]}%' + '\n'
                    # 2. Which stocks does this holder own?
                    elif entity_type == '股东':
                        cypher_sql = f'MATCH (s:`股东`)-[r:持有]->(c:`股票`) where s.股东名称 = "{entity_name}" return c.股票名称, r.持有量, r.占比'
                        rtn = self.graph.run(cypher_sql).data()
                        if rtn:
                            response += f'{entity_name}下有{rtn[0]["c.股票名称"]},持有股份{rtn[0]["r.持有量"]},占比{rtn[0]["r.占比"]}%' + '\n'
            elif each_ques_type == 'industry':
                # Pattern: industry is a plain property on the stock node.
                for entity_name, entity_type in entities.items():
                    if entity_type == '股票':
                        cypher_sql = f'MATCH (s:`股票`) where s.股票名称="{entity_name}" return s.行业'
                        rtn = self.graph.run(cypher_sql).data()
                        if rtn:
                            response += f'{entity_name}所属行业是{rtn[0]["s.行业"]}' + '\n'
        return response.strip()

    def predict(self, semantics):
        """Answer the parsed query dict produced by SemanticParser.predict."""
        response = self.parse_graph(semantics['ques_types'], semantics['entities'])
        return response
FinKnowledgeGraph-main/module/semantic_parser.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# NOTE(review): main.py imports `config` while this module imports
# `FinKnowledgeGraph.config` — confirm and unify with the package layout.
from FinKnowledgeGraph.config import entity_corpus_path, entity_searcher_save_path, contexts
import ahocorasick
import pandas as pd
import os
import pickle
from tqdm import tqdm


def build_search_tree(input_folder_path, tree_save_path):
    """Build an Aho-Corasick automaton over entity names and pickle it.

    Reads stock, concept and holder names from the CSV corpus and adds each
    name as a key with value (name, entity_type).
    https://pypi.org/project/pyahocorasick/

    Args:
        input_folder_path: folder containing the 股票信息/概念信息/股东信息 CSVs.
        tree_save_path: path the pickled automaton is written to.
    """
    tree = ahocorasick.Automaton()

    # Stock names -> ('股票' entity type).
    stock_basic = pd.read_csv(os.path.join(input_folder_path, '股票信息.csv'), encoding='gbk')
    for idx, each_row in tqdm(stock_basic.iterrows()):
        tree.add_word(str(each_row['name']), (str(each_row['name']), '股票'))

    # Concept names -> ('概念').
    concept = pd.read_csv(os.path.join(input_folder_path, '概念信息.csv'), encoding='gbk')
    for idx, each_row in tqdm(concept.iterrows()):
        tree.add_word(str(each_row['name']), (str(each_row['name']), '概念'))

    # Holder names -> ('股东').
    # NOTE(review): step1_get_data.py writes 股东信息.csv with column '股东名称',
    # not 'name' — verify the downloadable dataset's column name matches here.
    holder = pd.read_csv(os.path.join(input_folder_path, '股东信息.csv'), encoding='gbk')
    for idx, each_row in tqdm(holder.iterrows()):
        tree.add_word(str(each_row['name']), (str(each_row['name']), '股东'))

    tree.make_automaton()

    with open(tree_save_path, 'wb') as fout:
        pickle.dump(tree, fout)


class SemanticParser:
    """Entity searcher + rule-based question-type detector."""

    def __init__(self, entity_model_load_path, question_types):
        self.entity_model_load_path = entity_model_load_path
        # Pickled Aho-Corasick automaton built by build_search_tree().
        self.entity_model = self.load_model()
        # Mapping of question type -> trigger keywords (from config).
        self.question_types = question_types

    def load_model(self):
        """Load the pickled automaton from disk."""
        with open(self.entity_model_load_path, 'rb') as fin:
            return pickle.load(fin)

    def predict_question_types(self, query):
        """Detect question types by keyword matching (could be a classifier).

        Returns:
            List of question type keys whose keywords appear in *query*.
        """
        rtn_ques_types = []
        for ques_type, kws in self.question_types.items():
            for each_kw in kws:
                if each_kw in query:
                    rtn_ques_types.append(ques_type)
                    break
        return rtn_ques_types

    def predict(self, query):
        """Parse *query* into question types and entities, using dialogue context.

        Returns:
            Dict with keys 'ques_types' (list) and 'entities' (dict); both are
            empty when the query does not involve the knowledge graph.
        """
        rtn = {}

        # Detect question types.
        ques_types = self.predict_question_types(query)

        # Detect entities via the AC automaton.
        entities = {}
        for end_index, (entity_name, entity_type) in self.entity_model.iter(query):
            entities[entity_name] = entity_type

        if len(ques_types) != 0 and len(entities) != 0:
            # Both found: answer directly and remember them as context.
            rtn['ques_types'] = ques_types
            rtn['entities'] = entities
            contexts['ques_types'] = ques_types
            contexts['entities'] = entities

        elif len(ques_types) != 0:
            # Question type but no entity: inherit entities from the previous
            # turn. Fall back to {} on the first turn, where the context is
            # still None (len(None) would crash the caller otherwise).
            rtn['ques_types'] = ques_types
            contexts['ques_types'] = ques_types
            rtn['entities'] = contexts['entities'] or {}

        elif len(entities) != 0:
            # Entity but no question type: inherit the question types from the
            # previous turn (same None-safe fallback as above).
            rtn['ques_types'] = contexts['ques_types'] or []
            rtn['entities'] = entities
            contexts['entities'] = entities
        else:
            # Neither found: the query does not involve the knowledge graph.
            rtn['ques_types'] = []
            rtn['entities'] = {}

        return rtn


if __name__ == '__main__':

    print('开始训练实体搜索树...')

    build_search_tree(entity_corpus_path, entity_searcher_save_path)

    print('实体搜索树训练成功...')
FinKnowledgeGraph-main/requirements.txt
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
beautifulsoup4==4.10.0
|
| 2 |
+
bs4==0.0.1
|
| 3 |
+
certifi==2021.5.30
|
| 4 |
+
charset-normalizer==2.0.7
|
| 5 |
+
fasttext==0.9.2
|
| 6 |
+
idna==3.2
|
| 7 |
+
interchange==2021.0.3
|
| 8 |
+
jieba==0.42.1
|
| 9 |
+
lxml==4.6.3
|
| 10 |
+
monotonic==1.6
|
| 11 |
+
numpy==1.19.5
|
| 12 |
+
packaging==21.0
|
| 13 |
+
pandas==1.1.5
|
| 14 |
+
pansi==2020.7.3
|
| 15 |
+
py2neo==2021.2.1
|
| 16 |
+
pyahocorasick==1.4.2
|
| 17 |
+
pybind11==2.8.0
|
| 18 |
+
Pygments==2.10.0
|
| 19 |
+
pyparsing==2.4.7
|
| 20 |
+
python-dateutil==2.8.2
|
| 21 |
+
pytz==2021.3
|
| 22 |
+
requests==2.26.0
|
| 23 |
+
simplejson==3.17.5
|
| 24 |
+
six==1.16.0
|
| 25 |
+
soupsieve==2.2.1
|
| 26 |
+
tqdm==4.62.3
|
| 27 |
+
tushare==1.2.67
|
| 28 |
+
urllib3==1.26.7
|
| 29 |
+
websocket-client==1.2.1
|
FinKnowledgeGraph-main/step1_get_data.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import tushare as ts
import pandas as pd

# Connect to Tushare.
# NOTE(security): the API token is hard-coded in source; move it to an
# environment variable or an untracked config file before publishing.
pro = ts.pro_api('08fbab8087eb66409ec66452b756beb05ef93388bbba7905fab1f7b5')

# Basic stock info: all currently listed stocks.
# https://waditu.com/document/2?doc_id=25
# exchange='SSE' restricts to the Shanghai Stock Exchange,
# list_status='L' to listed companies.
# Fields: ts_code (TS code), symbol (ticker), name, area, industry.
stock_basic = pro.stock_basic(exchange='SSE', list_status='L', fields='ts_code,symbol,name,area,industry,fullname')
stock_basic.to_csv('./data/knowledge/股票信息1.csv', encoding='gbk')

# Concept (theme) classification: https://waditu.com/document/2?doc_id=125
concept = pro.concept()
concept.to_csv('./data/knowledge/概念信息1.csv', encoding='gbk', index=False)

# The sections below are kept commented out: they require extra Tushare API
# permissions; the prepared dataset already contains their output.
#
# # Stock-concept details: stocks under each concept (概念信息.csv has 358 concepts).
# concept_details = pd.DataFrame(columns=('id', 'concept_name', 'code', 'name'))
#
# for i in range(359):
#     concept_id = 'TS' + str(i)
#     # All stocks under one concept: https://waditu.com/document/2?doc_id=126
#     concept_stocks = pro.concept_detail(id=concept_id, field='concept_name,code,name')
#     concept_details = concept_details.append(concept_stocks)
# concept_details.to_csv('./data/knowledge/股票-概念信息1.csv', encoding='gbk')
#
#
# # Holder info for each stock.
# holder_basic = []
# # TS code, announcement date, report period, holder name, amount held, ratio.
# stock_holders = pd.DataFrame(columns=('ts_code', 'ann_date', 'end_date', 'holder_name', 'hold_amount', 'hold_ratio'))
# # Fetch holder info for each stock within the date range.
# for each_code in stock_basic['ts_code'].tolist():
#     # Top-10 holders: https://waditu.com/document/2?doc_id=61
#     curr_holder = pro.top10_holders(ts_code=each_code, start_date='20200101', end_date='20201231')
#     # For simplicity only the largest holder is kept.
#     stock_holders = stock_holders.append(curr_holder.iloc[0:1])
#     # Clean the holder name by dropping everything after the first '-',
#     # e.g. 新华人寿保险股份有限公司-分红-个人分红-018L-FH002深 -> 新华人寿保险股份有限公司
#     holder_basic.extend(curr_holder.iloc[0:1]['holder_name'].values.tolist().split('-')[0])
# stock_holders.to_csv('./data/knowledge/股票-股东信息.csv', encoding='gbk')
#
# # Deduplicated holder entities.
# holder_basic_df = pd.DataFrame({
#     '股东名称': list(set(holder_basic))
# })
# holder_basic_df.to_csv('./data/knowledge/股东信息.csv', encoding='gbk', index=False)
FinKnowledgeGraph-main/step2_store_to_neo4j.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from tqdm import tqdm
import pandas as pd
from py2neo import Graph, Node, Relationship, NodeMatcher

# --------------------------- Connect to Neo4j
# py2neo docs: https://py2neo.org/2021.1/
graph = Graph('http://localhost:7474/finance_demo/db/', auth=('neo4j', 'neo4j123'))
print(graph)
graph.run('match (n) detach delete n')  # wipe all existing nodes and relationships

# --------------------------- Create entities
# Stocks
print('创建 股票 实体...')
stock_basic = pd.read_csv('./data/knowledge/股票信息.csv', encoding='gbk')
for idx, each_row in tqdm(stock_basic.iterrows()):
    # https://py2neo.org/2021.1/data/index.html#py2neo.data.Node
    # '股票' is the node label; keyword arguments become node properties.
    each_stock = Node('股票',
                      TS代码=each_row['ts_code'],
                      股票代码=each_row['symbol'],
                      股票名称=each_row['name'],
                      行业=each_row['industry'])
    try:
        # https://py2neo.org/2021.1/workflow.html#py2neo.Transaction.create
        graph.create(each_stock)
    except Exception as e:
        # Log and continue: one bad row must not abort the whole import.
        print(f'Error: {e}, data idx: {idx}, data: {each_row}')

# Concepts
print('创建 概念 实体...')
concept = pd.read_csv('./data/knowledge/概念信息.csv', encoding='gbk')
for idx, each_row in tqdm(concept.iterrows()):
    each_concept = Node('概念',
                        概念代码=each_row['code'],
                        概念名称=each_row['name'])
    graph.create(each_concept)

# Holders
print('创建 股东 实体...')
holder = pd.read_csv('./data/knowledge/股东信息.csv', encoding='gbk')
for idx, each_row in tqdm(holder.iterrows()):
    each_holder = Node('股东',
                       股东名称=each_row['name'])
    graph.create(each_holder)

# # --------------------------- Create relationships (py2neo object API)
# # https://py2neo.org/2021.1/matching.html#py2neo.NodeMatcher
# matcher = NodeMatcher(graph)
#
# # Stock - concept
# print('创建 股票-概念 关系...')
# stock_concept = pd.read_csv('./data/knowledge/股票-概念信息.csv', encoding='gbk')
# for idx, each_row in tqdm(stock_concept.iterrows()):
#     node1 = matcher.match("股票", TS代码=each_row['ts_code']).first()
#     node2 = matcher.match("概念", 概念代码=each_row['Cid']).first()
#     # node1 = graph.nodes.match("股票", TS代码=each_row['ts_code']).first()
#     # node2 = graph.nodes.match("概念", 概念代码=each_row['Cid']).first()
#
#     if node1 is not None and node2 is not None:
#         # https://py2neo.org/2021.1/data/index.html#py2neo.data.Relationship
#         # Signature: Relationship(start_node, type, end_node)
#         r = Relationship(node1, '所属概念', node2)
#         graph.create(r)
#
#
# # Stock - holder
# print('创建 股票-股东 关系...')
# stock_holder = pd.read_csv('./data/knowledge/股票-股东信息.csv', encoding='gbk')
# for idx, each_row in tqdm(stock_holder.iterrows()):
#     # first() returns the first matching Node, or None when nothing matches
#     node1 = graph.nodes.match("股票", TS代码=each_row['ts_code']).first()
#     node2 = graph.nodes.match("股东", 股东名称=each_row['holder_name'].split('-')[0]).first()  # take the top-level company
#     if node1 is not None and node2 is not None:
#         r = Relationship(node2, '持有', node1,
#                          ann_date=each_row['ann_date'],
#                          end_date=each_row['end_date'],
#                          hold_amount=each_row['hold_amount'],
#                          hold_ratio=each_row['hold_ratio'])
#         graph.create(r)

# The object-API block above kept raising errors with this py2neo version, so
# relationships are imported with raw Cypher instead.
# NOTE(security): values are interpolated into Cypher via str.format; a value
# containing a single quote would break the statement. Prefer query parameters
# (graph.run(cypher, code=..., cid=...)) if the data is not fully trusted.
# Stock -[所属概念]-> concept relationships
print('创建 股票-概念 关系...')
stock_concept = pd.read_csv('./data/knowledge/股票-概念信息.csv', encoding='gbk')
for idx, each_row in tqdm(stock_concept.iterrows()):
    cypher_sql = "MATCH (a:`股票`),(b:`概念`) WHERE a.`TS代码` = '{0}' AND b.`概念代码` = '{1}'" \
                 " CREATE (a)-[r:所属概念] -> (b) RETURN r"\
        .format(str(each_row['ts_code']), str(each_row['Cid']))
    graph.run(cypher_sql)

# Holder -[持有]-> stock relationships
print('创建 股票-股东 关系...')
stock_concept = pd.read_csv('./data/knowledge/股票-股东信息.csv', encoding='gbk')
for idx, each_row in tqdm(stock_concept.iterrows()):
    cypher_sql = "MATCH (a:`股东`),(b:`股票`) WHERE a.`股东名称` = '{0}' AND b.`TS代码` = '{1}' " \
                 "CREATE (a)-[r:持有{{公告日期:'{2}', 报告期:'{3}', 持有量:'{4}', 占比:'{5}'}}] -> (b)"\
        .format(str(each_row['holder_name'].split('-')[0]), str(each_row['ts_code']),
                str(each_row['ann_date']), str(each_row['end_date']),
                str(each_row['hold_amount']), str(each_row['hold_ratio']))
    graph.run(cypher_sql)


# Fixed: the original literal was mojibake ('导入成��...').
print('实体 关系 导入成功...')