Spaces:

yugbirla
/

GraphResearcher

Sleeping

App Files Files Community

GraphResearcher / app /graph /relation_extractor.py

yugbirla

Sync GraphRAG fusion quality cleanup and evaluation files

b7d0804 13 days ago

Raw

History Blame Contribute Delete

2.96 kB


	import itertools
	import re
	from typing import List, Dict, Any

	from app.graph.entity_extractor import split_sentences
	from app.graph.graph_quality import is_noisy_entity_name


	# Lower-case verb phrases mapped to the canonical relation label they signal.
	# Insertion order is significant: detect_relation_type() walks this mapping
	# from top to bottom and returns the label of the first phrase it finds.
	VERB_RELATION_MAP: Dict[str, str] = {
	    # Definitional phrases
	    "stands for": "STANDS_FOR",
	    "refers to": "REFERS_TO",
	    # Action verbs, third-person form listed before the bare form
	    "uses": "USES",
	    "use": "USES",
	    "retrieves": "RETRIEVES",
	    "retrieve": "RETRIEVES",
	    "generates": "GENERATES",
	    "generate": "GENERATES",
	    "provides": "PROVIDES",
	    "provide": "PROVIDES",
	    "reduces": "REDUCES",
	    "reduce": "REDUCES",
	    "improves": "IMPROVES",
	    "improve": "IMPROVES",
	    # Composition verbs
	    "contains": "CONTAINS",
	    "include": "INCLUDES",
	    "includes": "INCLUDES",
	}


	def relation_id(source_id: str, relation_type: str, target_id: str) -> str:
	    """Return the identifier string for a (source, relation, target) triple.

	    The identifier is ``<source_id>__<relation_type lower-cased>__<target_id>``
	    cut off after 160 characters.
	    """
	    max_length = 160
	    parts = (source_id, relation_type.lower(), target_id)
	    return "{}__{}__{}".format(*parts)[:max_length]


	def entity_appears_in_sentence(entity_name: str, sentence: str) -> bool:
	    """Return True when *entity_name* occurs in *sentence* as a standalone term.

	    Matching is case-insensitive and the name is treated literally (regex
	    metacharacters are escaped). The occurrence must not be glued to a word
	    character on either side, so "RAG" is not found inside "GraphRAG".

	    Args:
	        entity_name: Surface form of the entity to look for.
	        sentence: Text to search.

	    Returns:
	        True if a standalone occurrence exists, otherwise False. An empty or
	        whitespace-only name never matches.
	    """
	    # An empty name would reduce the pattern to bare boundary assertions,
	    # which match almost any sentence.
	    if not entity_name or not entity_name.strip():
	        return False

	    # Lookarounds instead of \b: \b needs a word character on one side, so it
	    # can never match next to names that start or end with punctuation
	    # (e.g. "C++", ".NET"). For names that start and end with word characters
	    # the two forms behave the same.
	    pattern = r"(?<!\w)" + re.escape(entity_name) + r"(?!\w)"
	    return re.search(pattern, sentence, flags=re.IGNORECASE) is not None


	def extract_relations_from_text(
	    text: str,
	    entities: List[Dict[str, Any]]
	) -> List[Dict[str, Any]]:
	    """Derive sentence-level co-occurrence relations between known entities.

	    The text is split with ``split_sentences``. For every sentence, the
	    entities whose name appears in it (per ``entity_appears_in_sentence``)
	    are paired up, and each pair becomes one relation whose type is whatever
	    ``detect_relation_type`` returns for that sentence.

	    Args:
	        text: Raw text to scan.
	        entities: Entity dicts. Each is read for ``"name"`` and
	            ``"entity_id"``; entries with a missing/empty name, or a name
	            rejected by ``is_noisy_entity_name``, are ignored.

	    Returns:
	        A list of relation dicts with the keys ``relation_id``,
	        ``source_entity_id``, ``target_entity_id``, ``source_name``,
	        ``target_name``, ``relation_type`` and ``evidence_sentence``.
	        Empty when there is no text or fewer than two usable entities.
	        Within a pair, source/target order follows the order of *entities*.
	    """
	    if not text or not entities or len(entities) < 2:
	        return []

	    # Filter once up front; an entity without a usable name can neither be
	    # matched in the text nor reported in a relation.
	    clean_entities = [
	        entity for entity in entities
	        if entity.get("name") and not is_noisy_entity_name(entity["name"])
	    ]
	    if len(clean_entities) < 2:
	        return []

	    # Avoid relation explosion: at most this many entities are paired per
	    # sentence (5 entities -> at most 10 pairs).
	    max_entities_per_sentence = 5

	    relations: List[Dict[str, Any]] = []

	    for sentence in split_sentences(text):
	        present_entities: List[Dict[str, Any]] = []
	        seen_entity_ids = set()

	        for entity in clean_entities:
	            if not entity_appears_in_sentence(entity["name"], sentence):
	                continue

	            # Keep only the first entry per entity_id so repeated entries
	            # neither waste slots under the cap nor yield the same relation
	            # twice for one sentence.
	            entity_id = entity["entity_id"]
	            if entity_id in seen_entity_ids:
	                continue
	            seen_entity_ids.add(entity_id)

	            present_entities.append(entity)
	            if len(present_entities) == max_entities_per_sentence:
	                break

	        if len(present_entities) < 2:
	            continue

	        # One relation type per sentence, shared by every pair found in it.
	        relation_type = detect_relation_type(sentence)

	        for source, target in itertools.combinations(present_entities, 2):
	            relations.append(
	                {
	                    "relation_id": relation_id(
	                        source["entity_id"],
	                        relation_type,
	                        target["entity_id"]
	                    ),
	                    "source_entity_id": source["entity_id"],
	                    "target_entity_id": target["entity_id"],
	                    "source_name": source["name"],
	                    "target_name": target["name"],
	                    "relation_type": relation_type,
	                    "evidence_sentence": sentence
	                }
	            )

	    return relations


	def detect_relation_type(sentence: str) -> str:
	    """Return the relation label signalled by a verb phrase in *sentence*.

	    The sentence is lower-cased and each phrase of ``VERB_RELATION_MAP`` is
	    tried in the mapping's iteration order; the label of the first phrase
	    that occurs as a whole word (or whole word sequence) is returned.

	    Args:
	        sentence: Sentence to inspect.

	    Returns:
	        The mapped relation label, or ``"RELATED_TO"`` when no phrase occurs.
	    """
	    sentence_lower = sentence.lower()

	    for phrase, relation_type in VERB_RELATION_MAP.items():
	        # Whole-word match: a plain substring test would find "uses" inside
	        # "causes" and "use" inside "because", mislabelling the sentence.
	        pattern = r"\b" + re.escape(phrase) + r"\b"
	        if re.search(pattern, sentence_lower):
	            return relation_type

	    return "RELATED_TO"