Spaces:
Sleeping
Sleeping
| import re | |
| from typing import Any | |
| BAD_ENTITY_NAMES = { | |
| "what", "why", "when", "where", "who", "how", | |
| "is", "are", "was", "were", "be", "been", "being", | |
| "this", "that", "these", "those", "it", "they", "them", | |
| "page", "chapter", "section", "paragraph", "figure", "table", | |
| "contents", "overview", "summary", "introduction", "conclusion", | |
| "question", "answer", "example", "note", "notes", | |
| "part", "step", "case", "item", "level", "scope" | |
| } | |
| BAD_SINGLE_WORDS = BAD_ENTITY_NAMES | { | |
| "one", "two", "three", "first", "second", "third", | |
| "good", "bad", "new", "old", "main", "basic", "advanced" | |
| } | |
| def get_value(obj: Any, key: str, default=None): | |
| if isinstance(obj, dict): | |
| return obj.get(key, default) | |
| return getattr(obj, key, default) | |
| def normalize_name(name: str) -> str: | |
| return re.sub(r"\s+", " ", str(name or "")).strip() | |
| def tokenize_name(name: str): | |
| return re.findall(r"[a-zA-Z0-9_]+", str(name or "").lower()) | |
| def is_noisy_entity_name(name: str) -> bool: | |
| name = normalize_name(name) | |
| if not name: | |
| return True | |
| name_lower = name.lower() | |
| tokens = tokenize_name(name) | |
| if name_lower in BAD_ENTITY_NAMES: | |
| return True | |
| if len(tokens) == 1 and tokens[0] in BAD_SINGLE_WORDS: | |
| return True | |
| if len(name) <= 1: | |
| return True | |
| # Very short uppercase words like IS, OR, TO are usually not entities. | |
| # Keep useful acronyms like RAG, LLM, API, OCR, SQL, NLP, BM25. | |
| useful_acronyms = {"rag", "llm", "api", "ocr", "sql", "nlp", "bm25", "gpt", "pdf", "mvp"} | |
| if name.isupper() and len(name) <= 3 and name_lower not in useful_acronyms: | |
| return True | |
| if name_lower.startswith("chapter ") and len(tokens) <= 4: | |
| return True | |
| if name_lower.startswith("page ") and len(tokens) <= 4: | |
| return True | |
| return False | |
| def is_noisy_relation(relation: Any) -> bool: | |
| source = get_value(relation, "source_name") or get_value(relation, "source") | |
| target = get_value(relation, "target_name") or get_value(relation, "target") | |
| relation_type = str(get_value(relation, "relation_type", "")).upper() | |
| if is_noisy_entity_name(source): | |
| return True | |
| if is_noisy_entity_name(target): | |
| return True | |
| # IS_A from rule-based extraction is noisy unless both sides look meaningful. | |
| if relation_type == "IS_A": | |
| target_tokens = tokenize_name(target) | |
| if len(target_tokens) == 1 and target_tokens[0] in BAD_SINGLE_WORDS: | |
| return True | |
| return False | |
| def is_low_quality_chunk_text(text: str) -> bool: | |
| text = str(text or "").strip() | |
| if not text: | |
| return True | |
| lower = text.lower() | |
| dot_leaders = len(re.findall(r"\.{5,}", text)) | |
| words = re.findall(r"[a-zA-Z]{3,}", text) | |
| # Table-of-content pages often contain many dot leaders. | |
| if dot_leaders >= 3: | |
| return True | |
| if "table of contents" in lower and dot_leaders >= 1: | |
| return True | |
| # Mostly heading/index text, not answer evidence. | |
| heading_markers = [ | |
| "chapter ", | |
| "page ", | |
| "................................................................" | |
| ] | |
| marker_count = sum(1 for marker in heading_markers if marker in lower) | |
| if marker_count >= 2 and len(words) < 90: | |
| return True | |
| return False | |
| def is_meta_showcase_chunk_text(text: str) -> bool: | |
| """ | |
| Filters chunks that are about project promotion, resume bullets, | |
| LinkedIn drafts, portfolio text, or deployment brag text. | |
| These chunks may contain good keywords, but they are usually not | |
| good answer evidence for conceptual questions like "What is RAG?" | |
| """ | |
| lower = str(text or "").lower() | |
| bad_phrases = [ | |
| "linkedin post", | |
| "linkedin post draft", | |
| "copy and customise", | |
| "copy and customize", | |
| "i just shipped", | |
| "resume bullet", | |
| "portfolio", | |
| "general software engineering", | |
| "built vectorless rag platform", | |
| "most ambitious project", | |
| "deployment framework", | |
| "zero external dependencies" | |
| ] | |
| return any(phrase in lower for phrase in bad_phrases) | |
| def is_cover_or_marketing_chunk_text(text: str) -> bool: | |
| """ | |
| Filters cover pages, marketing pages, career-pitch pages, | |
| and table-like project overview pages. | |
| These chunks often contain the query keyword but are weak evidence. | |
| """ | |
| lower = str(text or "").lower() | |
| bad_phrases = [ | |
| "master guide", | |
| "what you will build", | |
| "why it matters for your career", | |
| "from absolute beginner", | |
| "senior ai / ml / mlops engineer", | |
| "production-grade rag system", | |
| "no vector databases", | |
| "no gpu", | |
| "no paid apis", | |
| "demonstrates mastery", | |
| "proves you can ship", | |
| "shows you understand", | |
| "career", | |
| "portfolio-ready", | |
| "resume-worthy" | |
| ] | |
| hit_count = sum(1 for phrase in bad_phrases if phrase in lower) | |
| return hit_count >= 2 | |