File size: 5,482 Bytes
8ef276c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# name_extraction_service.py
import logging
import re
from typing import List, Dict, Any, Optional

from gliner import GLiNER

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class NameExtractor:
    """
    Service for extracting person names from text using GLiNER.

    GLiNER is a zero-shot NER model that can extract entities without
    being limited to predefined entity types. It's especially good for:
    - Multilingual name extraction (English + Arabic)
    - Flexible entity extraction
    - Lightweight and fast (~100-200ms)

    Size: ~150MB model
    Speed: ~100-200ms per query
    """

    def __init__(self, model_name: str = "urchade/gliner_small-v2.1"):
        """
        Initialize the name extraction service.

        Args:
            model_name: GLiNER model to use. Options:
                - "urchade/gliner_small-v2.1" (150MB, balanced)
                - "urchade/gliner_multi-v2.1" (multilingual, better for Arabic)
                - "urchade/gliner_large-v2.1" (larger, more accurate)
        """
        logger.info("Loading GLiNER model: %s", model_name)

        # Load the pre-trained model.
        # This downloads the model on first run (~150MB).
        self.model = GLiNER.from_pretrained(model_name)

        # Entity labels passed to GLiNER on every prediction. Every matched
        # span is treated as a name regardless of which label fired.
        self.labels = ["person", "name", "employee"]

        logger.info("✓ GLiNER model loaded successfully")
        logger.info("Entity labels: %s", self.labels)

    def extract_names(self, text: str, threshold: float = 0.3) -> List[str]:
        """
        Extract person names from text.

        Args:
            text: Input text (e.g., "find Ahmed in IT")
            threshold: Confidence threshold (0-1). Lower = more names but less precise.
                      Default 0.3 is good for most cases.

        Returns:
            List of extracted names, deduplicated, in order of first appearance.

        Example:
            >>> extractor.extract_names("find Ahmed Hassan in IT")
            ['Ahmed Hassan']

            >>> extractor.extract_names("connect me with Sarah from HR")
            ['Sarah']
        """
        logger.info("Extracting names from: %s", text)

        # Predict entities using GLiNER
        entities = self.model.predict_entities(
            text,
            self.labels,
            threshold=threshold
        )

        # Extract just the text of matched entities
        names = [entity["text"] for entity in entities]

        # Remove duplicates while preserving order (dict keys keep
        # insertion order in Python 3.7+).
        unique_names = list(dict.fromkeys(names))

        logger.info("✓ Found %d name(s): %s", len(unique_names), unique_names)

        return unique_names

    def extract_names_with_context(
        self,
        text: str,
        threshold: float = 0.3
    ) -> List[Dict[str, Any]]:
        """
        Extract person names with additional context (position, confidence).

        Args:
            text: Input text
            threshold: Confidence threshold (0-1)

        Returns:
            List of dictionaries with name details:
            [
                {
                    "name": "Ahmed Hassan",
                    "start": 5,
                    "end": 17,
                    "confidence": 0.95,
                    "label": "person"
                }
            ]
        """
        logger.info("Extracting names with context from: %s", text)

        # Predict entities
        entities = self.model.predict_entities(
            text,
            self.labels,
            threshold=threshold
        )

        # Format results; confidence is rounded for readability in logs/UI.
        results = []
        for entity in entities:
            results.append({
                "name": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                "confidence": round(entity["score"], 2),
                "label": entity["label"]
            })

        logger.info("✓ Found %d name(s) with context", len(results))

        return results

    def extract_from_query(
        self,
        query: str,
        extract_divisions: bool = False,
        threshold: float = 0.3
    ) -> Dict[str, Any]:
        """
        Extract names and optionally division keywords from a query.

        Args:
            query: User query text
            extract_divisions: Whether to also extract division/department mentions
            threshold: Confidence threshold (0-1) forwarded to extract_names.
                       Defaults to 0.3 (same as extract_names).

        Returns:
            Dictionary with extracted information:
            {
                "names": ["Ahmed", "Sarah"],
                "has_names": True,
                "count": 2,
                "divisions": ["IT", "HR"] (if extract_divisions=True)
            }
        """
        # Extract names via the model
        names = self.extract_names(query, threshold=threshold)

        result: Dict[str, Any] = {
            "names": names,
            "has_names": len(names) > 0,
            "count": len(names)
        }

        # Optionally extract division keywords
        if extract_divisions:
            # Common division/department keywords
            division_keywords = [
                "IT", "HR", "Finance", "Legal", "Accounting",
                "Marketing", "Sales", "Operations", "Engineering",
                "Security", "Facilities", "Purchasing", "Audit"
            ]

            # Match whole words only: a plain substring test would wrongly
            # report "IT" inside words like "WITH" or "WAIT", and "HR"
            # inside e.g. "CHROME". Tokenize the query into alphabetic
            # words and test set membership instead.
            query_words = set(re.findall(r"[A-Z]+", query.upper()))
            found_divisions = [
                kw for kw in division_keywords
                if kw.upper() in query_words
            ]

            result["divisions"] = found_divisions
            result["has_divisions"] = len(found_divisions) > 0

        return result