Spaces:
Sleeping
Sleeping
| import re | |
| from typing import List, Dict, Any | |
| from app.graph.graph_quality import is_noisy_entity_name | |
# Names that is_valid_entity rejects outright. Membership is an exact,
# case-sensitive match, which is why both "Is" and "IS" are listed.
STOP_ENTITIES = {
    # Determiners and pronouns
    "The",
    "This",
    "That",
    "These",
    "Those",
    "It",
    "They",
    "We",
    "You",
    # Document-structure words
    "Page",
    "Chapter",
    "Figure",
    "Table",
    "Example",
    "Answer",
    "Question",
    "Introduction",
    "Conclusion",
    "Summary",
    "Overview",
    "Paragraph",
    # Question words and auxiliaries
    "What",
    "Why",
    "When",
    "Where",
    "Who",
    "How",
    "Is",
    "Are",
    "IS",
}
def normalize_entity_name(name: str) -> str:
    """Return *name* with whitespace collapsed and edge punctuation removed.

    Every run of whitespace (including newlines and tabs) becomes a single
    space, then spaces and the characters ``.,;:()[]{}`` are trimmed from
    both ends. Punctuation inside the name is left untouched.

    Args:
        name: Raw candidate text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned name, or ``""`` when nothing is left.
    """
    collapsed = re.sub(r"\s+", " ", name or "")
    # Trim spaces and punctuation in a single pass. Trimming them one after
    # the other leaves stray spaces behind: "( Foo )" would become " Foo ".
    return collapsed.strip(" .,;:()[]{}")
def make_entity_id(name: str) -> str:
    """Build a lowercase, underscore-separated identifier from *name*.

    The name is lower-cased, every run of characters outside ``a-z0-9`` is
    replaced by a single underscore, and the result is capped at 80
    characters with no leading or trailing underscore.

    Args:
        name: Entity name. ``None`` is treated like an empty string.

    Returns:
        The identifier, or ``"unknown_entity"`` when no ``a-z0-9`` character
        survives.
    """
    slug = re.sub(r"[^a-z0-9]+", "_", (name or "").lower())
    # Strip again after truncating: cutting at 80 characters can land right
    # after a separator and would otherwise leave a trailing underscore.
    slug = slug.strip("_")[:80].strip("_")
    return slug or "unknown_entity"
# Organisation markers, matched case-insensitively and only when they are not
# embedded in a longer alphabetic word. A plain substring test would label
# "Corpus" (Corp), "Principal" (Inc) or "Accompanying" (Company) as an
# organisation. Digits and punctuation may still touch a marker, so
# "Microsoft365" or "Acme Inc." continue to match.
_ORG_MARKER_PATTERN = re.compile(
    r"(?<![A-Za-z])"
    r"(?:University|Institute|Corporation|Corp|Inc|Ltd|Company"
    r"|OpenAI|Microsoft|Google|Amazon)"
    r"(?![A-Za-z])",
    re.IGNORECASE,
)


def classify_entity(name: str) -> str:
    """Assign a coarse entity type to *name*.

    Rules are tried in order and the first match wins:

    1. ``"ACRONYM"`` - the whole name is an upper-case letter followed by
       1-9 upper-case letters or digits (e.g. ``RAG``, ``BM25``).
    2. ``"ORGANIZATION"`` - the name contains an organisation marker as a
       standalone word (e.g. ``University``, ``Inc``, ``Google``).
    3. ``"TECHNICAL_TERM"`` - the name contains a digit, ``-`` or ``/``.
    4. ``"CONCEPT"`` - anything else.

    Args:
        name: Entity name to classify.

    Returns:
        One of the four type labels listed above.
    """
    if re.fullmatch(r"[A-Z][A-Z0-9]{1,9}", name):
        return "ACRONYM"
    if _ORG_MARKER_PATTERN.search(name):
        return "ORGANIZATION"
    if any(char.isdigit() for char in name):
        return "TECHNICAL_TERM"
    if "-" in name or "/" in name:
        return "TECHNICAL_TERM"
    return "CONCEPT"
def is_valid_entity(name: str) -> bool:
    """Decide whether *name* is acceptable as an entity name.

    A name is rejected when it is empty, is listed in ``STOP_ENTITIES``
    (exact match), is flagged by ``is_noisy_entity_name``, or its length
    falls outside the 2-90 character range.

    Args:
        name: Candidate entity name.

    Returns:
        ``True`` when every check passes, otherwise ``False``.
    """
    # Empty names and stop words are rejected before the noise check runs.
    if not name or name in STOP_ENTITIES:
        return False
    if is_noisy_entity_name(name):
        return False
    return 2 <= len(name) <= 90
def extract_entities_from_text(text: str) -> List[Dict[str, Any]]:
    """Collect de-duplicated entity records found in *text*.

    Two regex passes gather candidates: upper-case acronyms first, then
    capitalised phrases of up to six words, where each word may carry one
    ``-`` or ``/`` joined suffix. Each candidate is normalised, validated
    and turned into an id. Only the first candidate for a given id is kept.

    Args:
        text: Source text. An empty or ``None`` value yields no entities.

    Returns:
        A list of dicts with the keys ``entity_id``, ``name`` and
        ``entity_type``, ordered by first appearance in the candidate list
        (all acronym matches precede all phrase matches).
    """
    if not text:
        return []

    # Acronyms like RAG, LLM, API, OCR, BM25
    acronym_pattern = r"\b[A-Z][A-Z0-9]{1,9}\b"
    # Capitalized technical phrases like Retrieval-Augmented Generation
    phrase_pattern = (
        r"\b[A-Z][a-zA-Z0-9]*(?:[-/][A-Z]?[a-zA-Z0-9]+)?"
        r"(?:\s+[A-Z][a-zA-Z0-9]*(?:[-/][A-Z]?[a-zA-Z0-9]+)?){0,5}\b"
    )
    # Neither pattern has a capturing group, so findall yields whole matches.
    raw_matches = re.findall(acronym_pattern, text) + re.findall(phrase_pattern, text)

    entities = []
    emitted_ids = set()
    for raw in raw_matches:
        name = normalize_entity_name(raw)
        if is_valid_entity(name):
            entity_id = make_entity_id(name)
            if entity_id not in emitted_ids:
                emitted_ids.add(entity_id)
                record = {
                    "entity_id": entity_id,
                    "name": name,
                    "entity_type": classify_entity(name),
                }
                entities.append(record)
    return entities
def split_sentences(text: str) -> List[str]:
    """Split *text* into sentence-like pieces, dropping very short ones.

    The text is cut at whitespace that follows ``.``, ``!`` or ``?``. Each
    piece is trimmed and kept only when it is longer than 20 characters.

    Args:
        text: Source text. An empty or ``None`` value yields an empty list.

    Returns:
        The trimmed pieces in their original order.
    """
    if not text:
        return []

    sentences = []
    for piece in re.split(r"(?<=[.!?])\s+", text):
        trimmed = piece.strip()
        if len(trimmed) > 20:
            sentences.append(trimmed)
    return sentences