# GeoLLM/output/test_knn_3/KNN_token_old.py
import json
import numpy as np
import faiss
from transformers import AutoTokenizer, AutoModel
import torch
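
# Entity-level kNN retrieval over relation triples:
#   1. build_index() embeds every head/tail entity from the training set in its
#      sentence context (mean-pooled BERT token vectors) and stores the vectors
#      in a flat FAISS index alongside a parallel metadata list;
#   2. search_entities() embeds each test entity the same way and retrieves its
#      top-k nearest training entities by L2 distance.
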
class EntityLevelRetriever:
    def __init__(self, model_name='bert-base-chinese'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()  # inference only; no dropout
        self.index = faiss.IndexFlatL2(768)  # exact search; L2 distance suits BERT embeddings, 768 = hidden size
        self.entity_db = []  # entity vectors, parallel to self.metadata
        self.metadata = []   # per-entity records: entity string, relation type, source sentence
    def _get_entity_span(self, text, entity):
        """Locate the entity in the text by exact string match."""
        start = text.find(entity)  # note: only the first occurrence is used
        if start == -1:
            return None
        return (start, start + len(entity))
    def _generate_entity_embedding(self, text, entity):
        """Generate an entity-level contextual embedding."""
        span = self._get_entity_span(text, entity)
        if span is None:
            return None
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Map character positions to token positions.
        start_token = inputs.char_to_token(span[0])
        end_token = inputs.char_to_token(span[1] - 1)
        # Use `is None` checks: token position 0 is falsy, so `not start_token` would
        # wrongly discard spans (char_to_token also returns None for truncated spans).
        if start_token is None or end_token is None:
            return None
        # Extract the entity's token embeddings and average them.
        entity_embedding = outputs.last_hidden_state[0, start_token:end_token + 1].mean(dim=0).numpy()
        return entity_embedding.astype('float32')
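
    # Note: char_to_token() requires a fast (Rust-backed) tokenizer, which
    # AutoTokenizer loads by default for bert-base-chinese. A quick sanity check
    # (hypothetical sentence, for illustration only):
    #   enc = self.tokenizer("北京是首都", return_tensors='pt')
    #   enc.char_to_token(0)  # -> 1, because token 0 is [CLS]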
    def build_index(self, train_path):
        """Build the entity index from the training triples."""
        with open(train_path, 'r', encoding='utf-8') as f:
            dataset = json.load(f)
        # Only process items 500-1000 (a slice kept for demonstration purposes).
        dataset = dataset[500:1000]
        for item in dataset:
            text = item['text']
            for triple in item['triple_list']:
                # Embed both the head and the tail entity.
                for entity in [triple[0], triple[2]]:
                    embedding = self._generate_entity_embedding(text, entity)
                    if embedding is not None:
                        self.entity_db.append(embedding)
                        self.metadata.append({
                            'entity': entity,
                            'type': triple[1],  # store the relation type
                            'context': text
                        })
        print(f"Entity count check - vectors: {len(self.entity_db)}, metadata entries: {len(self.metadata)}")
        self.index.add(np.array(self.entity_db))
        print(f"Index dim: {self.index.d}, stored vectors: {self.index.ntotal}")
    def search_entities(self, test_path, top_k=3):
        """Retrieve the nearest indexed entities for each test entity."""
        with open(test_path, 'r', encoding='utf-8') as f:
            test_data = json.load(f)
        results = []
        for item in test_data:
            text = item['text']
            entity_results = {}
            for triple in item['triple_list']:
                for entity in [triple[0], triple[2]]:
                    embedding = self._generate_entity_embedding(text, entity)
                    if embedding is None:
                        continue
                    # FAISS search: one query vector, top_k hits
                    distances, indices = self.index.search(np.array([embedding]), top_k)
                    # Collect the similar entities
                    neighbors = []
                    for j in range(top_k):  # iterate over result positions
                        i = indices[0][j]  # actual position in the metadata list
                        if 0 <= i < len(self.metadata):  # FAISS pads missing hits with -1
                            neighbor = {
                                'entity': self.metadata[i]['entity'],
                                'relation': self.metadata[i]['type'],
                                'context': self.metadata[i]['context'],
                                'distance': float(distances[0][j])  # distance for result position j
                            }
                            neighbors.append(neighbor)
                    entity_results[entity] = neighbors
            results.append({
                'text': text,
                'entity_matches': entity_results
            })
        return results
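
# Note (illustration, not part of the original): IndexFlatL2 reports squared L2
# distances, so smaller means more similar. To rank by cosine similarity instead,
# L2-normalize the vectors and use an inner-product index:
#   index = faiss.IndexFlatIP(768)
#   faiss.normalize_L2(vectors)  # in-place; expects float32 of shape (N, 768)
#   index.add(vectors)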
# Usage example
if __name__ == "__main__":
    # Initialize the retrieval system
    retriever = EntityLevelRetriever()

    # Build the training index (roughly 2-5 minutes, depending on data volume)
    print("Building training index...")
    retriever.build_index('./data/train_triples.json')

    # Run retrieval on the test set
    print("\nSearching similar entities...")
    results = retriever.search_entities('./data/test_triples.json')

    # Save the results
    with open('./data/entity_search_results.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print("Done! Results saved to entity_search_results.json")