Spaces:
Sleeping
Sleeping
File size: 5,482 Bytes
8ef276c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
# name_extraction_service.py
import logging
import re
from typing import Any, Dict, List, Optional

from gliner import GLiNER
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class NameExtractor:
    """
    Service for extracting person names from text using GLiNER.

    GLiNER is a zero-shot NER model that can extract entities without
    being limited to predefined entity types. It's especially good for:
    - Multilingual name extraction (English + Arabic)
    - Flexible entity extraction
    - Lightweight and fast (~100-200ms)

    Size: ~150MB model
    Speed: ~100-200ms per query
    """

    # Division/department keywords recognized (as whole words) by
    # extract_from_query(..., extract_divisions=True).
    _DIVISION_KEYWORDS = [
        "IT", "HR", "Finance", "Legal", "Accounting",
        "Marketing", "Sales", "Operations", "Engineering",
        "Security", "Facilities", "Purchasing", "Audit",
    ]

    def __init__(self, model_name: str = "urchade/gliner_small-v2.1"):
        """
        Initialize the name extraction service.

        Args:
            model_name: GLiNER model to use. Options:
                - "urchade/gliner_small-v2.1" (150MB, balanced)
                - "urchade/gliner_multi-v2.1" (multilingual, better for Arabic)
                - "urchade/gliner_large-v2.1" (larger, more accurate)
        """
        logger.info("Loading GLiNER model: %s", model_name)
        # Downloads the model on first run (~150MB); cached afterwards.
        self.model = GLiNER.from_pretrained(model_name)
        # Entity labels passed to GLiNER on every prediction call.
        # All three denote people, so every returned entity is treated as a name.
        self.labels = ["person", "name", "employee"]
        logger.info("✓ GLiNER model loaded successfully")
        logger.info("Entity labels: %s", self.labels)

    def extract_names(self, text: str, threshold: float = 0.3) -> List[str]:
        """
        Extract person names from text.

        Args:
            text: Input text (e.g., "find Ahmed in IT")
            threshold: Confidence threshold (0-1). Lower = more names but
                less precise. Default 0.3 is good for most cases.

        Returns:
            List of extracted names, deduplicated, in order of first appearance.

        Example:
            >>> extractor.extract_names("find Ahmed Hassan in IT")
            ['Ahmed Hassan']
            >>> extractor.extract_names("connect me with Sarah from HR")
            ['Sarah']
        """
        logger.info("Extracting names from: %s", text)
        entities = self.model.predict_entities(
            text,
            self.labels,
            threshold=threshold,
        )
        # Every configured label denotes a person, so keep every hit.
        names = [entity["text"] for entity in entities]
        # dict.fromkeys deduplicates while preserving first-seen order.
        unique_names = list(dict.fromkeys(names))
        logger.info("✓ Found %d name(s): %s", len(unique_names), unique_names)
        return unique_names

    def extract_names_with_context(
        self,
        text: str,
        threshold: float = 0.3,
    ) -> List[Dict[str, Any]]:
        """
        Extract person names with additional context (position, confidence).

        Args:
            text: Input text
            threshold: Confidence threshold (0-1)

        Returns:
            List of dictionaries with name details:
            [
                {
                    "name": "Ahmed Hassan",
                    "start": 5,
                    "end": 17,
                    "confidence": 0.95,
                    "label": "person"
                }
            ]
        """
        logger.info("Extracting names with context from: %s", text)
        entities = self.model.predict_entities(
            text,
            self.labels,
            threshold=threshold,
        )
        # Reshape GLiNER's entity dicts into this service's public schema.
        results = []
        for entity in entities:
            results.append({
                "name": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                "confidence": round(entity["score"], 2),
                "label": entity["label"],
            })
        logger.info("✓ Found %d name(s) with context", len(results))
        return results

    def extract_from_query(
        self,
        query: str,
        extract_divisions: bool = False,
        threshold: float = 0.3,
    ) -> Dict[str, Any]:
        """
        Extract names and optionally division keywords from a query.

        Args:
            query: User query text
            extract_divisions: Whether to also extract division/department
                mentions
            threshold: Confidence threshold (0-1) forwarded to extract_names.

        Returns:
            Dictionary with extracted information:
            {
                "names": ["Ahmed", "Sarah"],
                "has_names": True,
                "count": 2,
                "divisions": ["IT", "HR"] (if extract_divisions=True)
            }
        """
        names = self.extract_names(query, threshold=threshold)
        result: Dict[str, Any] = {
            "names": names,
            "has_names": len(names) > 0,
            "count": len(names),
        }
        if extract_divisions:
            # Match keywords as whole words only. A plain substring test
            # (`kw in query.upper()`) would wrongly detect e.g. "IT" inside
            # "CITY" or "AUDIT".
            words = {w.upper() for w in re.findall(r"[A-Za-z]+", query)}
            found_divisions = [
                kw for kw in self._DIVISION_KEYWORDS
                if kw.upper() in words
            ]
            result["divisions"] = found_divisions
            result["has_divisions"] = len(found_divisions) > 0
        return result
|