GraphResearcher / app /graph /entity_extractor.py
yugbirla's picture
Sync GraphRAG fusion quality cleanup and evaluation files
b7d0804
Raw
History Blame Contribute Delete
2.95 kB
import re
from typing import List, Dict, Any
from app.graph.graph_quality import is_noisy_entity_name
STOP_ENTITIES = {
"The", "This", "That", "These", "Those", "It", "They", "We", "You",
"Page", "Chapter", "Figure", "Table", "Example", "Answer", "Question",
"Introduction", "Conclusion", "Summary", "Overview", "Paragraph",
"What", "Why", "When", "Where", "Who", "How", "Is", "Are", "IS"
}
def normalize_entity_name(name: str) -> str:
name = re.sub(r"\s+", " ", name or "").strip()
name = name.strip(".,;:()[]{}")
return name
def make_entity_id(name: str) -> str:
cleaned = name.lower()
cleaned = re.sub(r"[^a-z0-9]+", "_", cleaned)
cleaned = cleaned.strip("_")
return cleaned[:80] or "unknown_entity"
def classify_entity(name: str) -> str:
if re.fullmatch(r"[A-Z][A-Z0-9]{1,9}", name):
return "ACRONYM"
org_markers = [
"University", "Institute", "Corporation", "Corp", "Inc", "Ltd",
"Company", "OpenAI", "Microsoft", "Google", "Amazon"
]
if any(marker.lower() in name.lower() for marker in org_markers):
return "ORGANIZATION"
if any(char.isdigit() for char in name):
return "TECHNICAL_TERM"
if "-" in name or "/" in name:
return "TECHNICAL_TERM"
return "CONCEPT"
def is_valid_entity(name: str) -> bool:
if not name:
return False
if name in STOP_ENTITIES:
return False
if is_noisy_entity_name(name):
return False
if len(name) < 2:
return False
if len(name) > 90:
return False
return True
def extract_entities_from_text(text: str) -> List[Dict[str, Any]]:
if not text:
return []
candidates = []
# Acronyms like RAG, LLM, API, OCR, BM25
for match in re.finditer(r"\b[A-Z][A-Z0-9]{1,9}\b", text):
candidates.append(match.group(0))
# Capitalized technical phrases like Retrieval-Augmented Generation
capitalized_phrase_pattern = (
r"\b[A-Z][a-zA-Z0-9]*(?:[-/][A-Z]?[a-zA-Z0-9]+)?"
r"(?:\s+[A-Z][a-zA-Z0-9]*(?:[-/][A-Z]?[a-zA-Z0-9]+)?){0,5}\b"
)
for match in re.finditer(capitalized_phrase_pattern, text):
candidates.append(match.group(0))
cleaned_entities = []
seen = set()
for candidate in candidates:
name = normalize_entity_name(candidate)
if not is_valid_entity(name):
continue
entity_id = make_entity_id(name)
if entity_id in seen:
continue
seen.add(entity_id)
cleaned_entities.append(
{
"entity_id": entity_id,
"name": name,
"entity_type": classify_entity(name)
}
)
return cleaned_entities
def split_sentences(text: str) -> List[str]:
if not text:
return []
parts = re.split(r"(?<=[.!?])\s+", text)
return [part.strip() for part in parts if len(part.strip()) > 20]