Spaces:

yugbirla
/

GraphResearcher

Sleeping

App Files Files Community

GraphResearcher / app /graph /graph_quality.py

yugbirla

Sync GraphRAG fusion quality cleanup and evaluation files

b7d0804 16 days ago

Raw

History Blame Contribute Delete

5.04 kB


	import re
	from typing import Any


	# Lower-cased names rejected outright by is_noisy_entity_name: the whole
	# normalized name is compared against this set.
	BAD_ENTITY_NAMES = {
	    "what", "why", "when", "where", "who", "how",
	    "is", "are", "was", "were", "be", "been", "being",
	    "this", "that", "these", "those", "it", "they", "them",
	    "page", "chapter", "section", "paragraph", "figure", "table",
	    "contents", "overview", "summary", "introduction", "conclusion",
	    "question", "answer", "example", "note", "notes",
	    "part", "step", "case", "item", "level", "scope",
	}


	# Superset of BAD_ENTITY_NAMES, consulted only when a name tokenizes to
	# exactly one token. Built with the set-union operator ``|`` (the previous
	# ``\|`` spelling was a syntax error).
	BAD_SINGLE_WORDS = BAD_ENTITY_NAMES | {
	    "one", "two", "three", "first", "second", "third",
	    "good", "bad", "new", "old", "main", "basic", "advanced",
	}


	def get_value(obj: Any, key: str, default=None):
	    """
	    Fetch ``key`` from either a mapping-style or an attribute-style record.

	    Dicts are read with ``dict.get``; every other object is read with
	    ``getattr``. ``default`` is returned when the key/attribute is absent.
	    """
	    if not isinstance(obj, dict):
	        return getattr(obj, key, default)

	    return obj.get(key, default)


	def normalize_name(name: str) -> str:
	    """
	    Return ``name`` as a string with whitespace runs collapsed to one space
	    and leading/trailing whitespace removed.

	    Falsy inputs (``None``, ``""``, ``0``) yield an empty string.
	    """
	    raw = str(name or "")
	    collapsed = re.sub(r"\s+", " ", raw)
	    return collapsed.strip()


	def tokenize_name(name: str):
	    """
	    Split ``name`` into lower-cased tokens made of ASCII letters, digits
	    and underscores. Everything else acts as a separator.

	    Falsy inputs produce an empty list.
	    """
	    lowered = str(name or "").lower()
	    return re.findall(r"[a-zA-Z0-9_]+", lowered)


	def is_noisy_entity_name(name: str) -> bool:
	    """
	    Decide whether ``name`` should be rejected as an entity name.

	    A name is noisy when, after whitespace normalization, it is:
	    - empty or a single character,
	    - (lower-cased) a member of BAD_ENTITY_NAMES,
	    - a single token that is a member of BAD_SINGLE_WORDS,
	    - all upper-case, at most three characters long, and not one of the
	      allow-listed acronyms,
	    - a "chapter ..." or "page ..." label of at most four tokens.
	    """
	    cleaned = normalize_name(name)

	    # Covers both the empty string and one-character names.
	    if len(cleaned) <= 1:
	        return True

	    lowered = cleaned.lower()
	    if lowered in BAD_ENTITY_NAMES:
	        return True

	    words = tokenize_name(cleaned)
	    if len(words) == 1 and words[0] in BAD_SINGLE_WORDS:
	        return True

	    # Short upper-case words such as IS, OR, TO are usually not entities,
	    # but short acronyms such as RAG, LLM, API must survive.
	    # Entries longer than three characters (e.g. "bm25") never reach this
	    # rule because of the length bound.
	    acronym_allowlist = {"rag", "llm", "api", "ocr", "sql", "nlp", "bm25", "gpt", "pdf", "mvp"}
	    is_short_upper = cleaned.isupper() and len(cleaned) <= 3
	    if is_short_upper and lowered not in acronym_allowlist:
	        return True

	    # Short structural labels like "Chapter 3" or "Page 12 of 40".
	    return len(words) <= 4 and lowered.startswith(("chapter ", "page "))


	def is_noisy_relation(relation: Any) -> bool:
	    """
	    Return True when either endpoint of ``relation`` is a noisy entity name.

	    ``relation`` may be a dict or an attribute-style object. The source is
	    read from ``source_name`` (falling back to ``source``) and the target
	    from ``target_name`` (falling back to ``target``). A missing or empty
	    endpoint counts as noisy.
	    """
	    source = get_value(relation, "source_name") or get_value(relation, "source")
	    target = get_value(relation, "target_name") or get_value(relation, "target")

	    # The former IS_A-only rule (single-token target in BAD_SINGLE_WORDS)
	    # was unreachable: is_noisy_entity_name already rejects exactly those
	    # targets, so the endpoint checks alone decide the result.
	    return is_noisy_entity_name(source) or is_noisy_entity_name(target)


	def is_low_quality_chunk_text(text: str) -> bool:
	    """
	    Flag chunks that look like tables of contents or heading/index pages.

	    Returns True when the stripped text is empty, when it contains three
	    or more dot-leader runs (five or more consecutive dots), when it
	    mentions "table of contents" together with at least one dot-leader
	    run, or when at least two heading markers occur and the chunk has
	    fewer than 90 alphabetic words of three or more letters.
	    """
	    body = str(text or "").strip()
	    if not body:
	        return True

	    lowered = body.lower()
	    leader_runs = sum(1 for _ in re.finditer(r"\.{5,}", body))

	    # Table-of-contents pages typically carry many dot leaders.
	    if leader_runs >= 3:
	        return True

	    if leader_runs >= 1 and "table of contents" in lowered:
	        return True

	    # Heading/index text rather than answer evidence.
	    markers = (
	        "chapter ",
	        "page ",
	        "................................................................",
	    )
	    present = [marker for marker in markers if marker in lowered]
	    if len(present) < 2:
	        return False

	    long_words = re.findall(r"[a-zA-Z]{3,}", body)
	    return len(long_words) < 90



	def is_meta_showcase_chunk_text(text: str) -> bool:
	    """
	    Detect chunks that talk about the project itself rather than the topic:
	    promotion copy, resume bullets, LinkedIn drafts, portfolio blurbs or
	    deployment bragging.

	    Such chunks can be keyword-rich yet make poor evidence for conceptual
	    questions like "What is RAG?". The match is a case-insensitive
	    substring test; a single phrase is enough to flag the chunk.
	    """
	    showcase_phrases = (
	        "linkedin post",
	        "linkedin post draft",
	        "copy and customise",
	        "copy and customize",
	        "i just shipped",
	        "resume bullet",
	        "portfolio",
	        "general software engineering",
	        "built vectorless rag platform",
	        "most ambitious project",
	        "deployment framework",
	        "zero external dependencies",
	    )

	    haystack = str(text or "").lower()

	    for phrase in showcase_phrases:
	        if phrase in haystack:
	            return True

	    return False



	def is_cover_or_marketing_chunk_text(text: str) -> bool:
	    """
	    Detect cover pages, marketing copy, career-pitch pages and table-like
	    project overviews.

	    These chunks frequently contain the query keyword but are weak
	    evidence. The text is flagged when at least two of the listed phrases
	    occur in it (case-insensitive substring match).

	    NOTE: "career" is itself a substring of "why it matters for your
	    career", so that one phrase already counts as two hits.
	    """
	    marketing_phrases = (
	        "master guide",
	        "what you will build",
	        "why it matters for your career",
	        "from absolute beginner",
	        "senior ai / ml / mlops engineer",
	        "production-grade rag system",
	        "no vector databases",
	        "no gpu",
	        "no paid apis",
	        "demonstrates mastery",
	        "proves you can ship",
	        "shows you understand",
	        "career",
	        "portfolio-ready",
	        "resume-worthy",
	    )

	    haystack = str(text or "").lower()
	    matched = [phrase for phrase in marketing_phrases if phrase in haystack]

	    return len(matched) >= 2