# name_extraction_service.py
import logging
import re
from typing import Any, Dict, List, Optional

from gliner import GLiNER

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class NameExtractor:
    """
    Service for extracting person names from text using GLiNER.

    GLiNER is a zero-shot NER model that can extract entities without
    being limited to predefined entity types. It's especially good for:
    - Multilingual name extraction (English + Arabic)
    - Flexible entity extraction
    - Lightweight and fast (~100-200ms)

    Size: ~150MB model
    Speed: ~100-200ms per query
    """

    def __init__(self, model_name: str = "urchade/gliner_small-v2.1"):
        """
        Initialize the name extraction service.

        Args:
            model_name: GLiNER model to use. Options:
                - "urchade/gliner_small-v2.1" (150MB, balanced)
                - "urchade/gliner_multi-v2.1" (multilingual, better for Arabic)
                - "urchade/gliner_large-v2.1" (larger, more accurate)
        """
        logger.info("Loading GLiNER model: %s", model_name)

        # Load the pre-trained model.
        # This downloads the model on first run (~150MB).
        self.model = GLiNER.from_pretrained(model_name)

        # Entity labels passed to GLiNER's zero-shot predictor.
        self.labels = ["person", "name", "employee"]

        logger.info("✓ GLiNER model loaded successfully")
        logger.info("Entity labels: %s", self.labels)

    def _predict(self, text: str, threshold: float) -> List[Dict[str, Any]]:
        """Run GLiNER entity prediction — shared by the public extractors."""
        return self.model.predict_entities(text, self.labels, threshold=threshold)

    def extract_names(self, text: str, threshold: float = 0.3) -> List[str]:
        """
        Extract person names from text.

        Args:
            text: Input text (e.g., "find Ahmed in IT")
            threshold: Confidence threshold (0-1). Lower = more names but
                less precise. Default 0.3 is good for most cases.

        Returns:
            List of extracted names (deduplicated, first-seen order).

        Example:
            >>> extractor.extract_names("find Ahmed Hassan in IT")
            ['Ahmed Hassan']
            >>> extractor.extract_names("connect me with Sarah from HR")
            ['Sarah']
        """
        logger.info("Extracting names from: %s", text)

        # Predict entities using GLiNER.
        entities = self._predict(text, threshold)

        # Extract just the text of the matched entities.
        names = [entity["text"] for entity in entities]

        # Remove duplicates while preserving order (dict preserves
        # insertion order in Python 3.7+).
        unique_names = list(dict.fromkeys(names))

        logger.info("✓ Found %d name(s): %s", len(unique_names), unique_names)
        return unique_names

    def extract_names_with_context(
        self,
        text: str,
        threshold: float = 0.3
    ) -> List[Dict[str, Any]]:
        """
        Extract person names with additional context (position, confidence).

        Args:
            text: Input text
            threshold: Confidence threshold (0-1)

        Returns:
            List of dictionaries with name details:
            [
                {
                    "name": "Ahmed Hassan",
                    "start": 5,
                    "end": 17,
                    "confidence": 0.95,
                    "label": "person"
                }
            ]
        """
        logger.info("Extracting names with context from: %s", text)

        # Predict entities.
        entities = self._predict(text, threshold)

        # Reshape GLiNER's entity dicts into the documented output schema.
        results = [
            {
                "name": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                "confidence": round(entity["score"], 2),
                "label": entity["label"],
            }
            for entity in entities
        ]

        logger.info("✓ Found %d name(s) with context", len(results))
        return results

    def extract_from_query(
        self,
        query: str,
        extract_divisions: bool = False,
        threshold: float = 0.3
    ) -> Dict[str, Any]:
        """
        Extract names and optionally division keywords from a query.

        Args:
            query: User query text
            extract_divisions: Whether to also extract division/department
                mentions
            threshold: Confidence threshold (0-1) forwarded to the name
                extractor. Default 0.3 preserves previous behavior.

        Returns:
            Dictionary with extracted information:
            {
                "names": ["Ahmed", "Sarah"],
                "has_names": True,
                "count": 2,
                "divisions": ["IT", "HR"]  (if extract_divisions=True)
            }
        """
        # Extract names.
        names = self.extract_names(query, threshold=threshold)

        result: Dict[str, Any] = {
            "names": names,
            "has_names": len(names) > 0,
            "count": len(names),
        }

        # Optionally extract division keywords.
        if extract_divisions:
            # Common division/department keywords.
            division_keywords = [
                "IT", "HR", "Finance", "Legal", "Accounting",
                "Marketing", "Sales", "Operations", "Engineering",
                "Security", "Facilities", "Purchasing", "Audit"
            ]

            query_upper = query.upper()
            # Match whole words only: a plain substring test would
            # wrongly detect "IT" inside "with"/"city", and mixed-case
            # keywords like "Finance" could never match the uppercased
            # query at all.
            found_divisions = [
                kw for kw in division_keywords
                if re.search(rf"\b{re.escape(kw.upper())}\b", query_upper)
            ]

            result["divisions"] = found_divisions
            result["has_divisions"] = len(found_divisions) > 0

        return result