Spaces:
Sleeping
Sleeping
File size: 5,482 Bytes
8ef276c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
# name_extraction_service.py
import logging
import re
from typing import Any, Dict, List, Optional

from gliner import GLiNER
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class NameExtractor:
    """
    Service for extracting person names from text using GLiNER.

    GLiNER is a zero-shot NER model that can extract entities without
    being limited to predefined entity types. It's especially good for:
    - Multilingual name extraction (English + Arabic)
    - Flexible entity extraction
    - Lightweight and fast (~100-200ms)

    Size: ~150MB model
    Speed: ~100-200ms per query
    """

    # Division/department keywords recognized (as whole words) by
    # extract_from_query(..., extract_divisions=True).
    _DIVISION_KEYWORDS = [
        "IT", "HR", "Finance", "Legal", "Accounting",
        "Marketing", "Sales", "Operations", "Engineering",
        "Security", "Facilities", "Purchasing", "Audit",
    ]

    def __init__(self, model_name: str = "urchade/gliner_small-v2.1"):
        """
        Initialize the name extraction service.

        Args:
            model_name: GLiNER model to use. Options:
                - "urchade/gliner_small-v2.1" (150MB, balanced)
                - "urchade/gliner_multi-v2.1" (multilingual, better for Arabic)
                - "urchade/gliner_large-v2.1" (larger, more accurate)
        """
        logger.info("Loading GLiNER model: %s", model_name)
        # Downloads the model on first run (~150MB); cached afterwards.
        self.model = GLiNER.from_pretrained(model_name)
        # Entity labels passed to GLiNER on every prediction call.
        # All three denote people, so every returned entity is treated as a name.
        self.labels = ["person", "name", "employee"]
        logger.info("✓ GLiNER model loaded successfully")
        logger.info("Entity labels: %s", self.labels)

    def extract_names(self, text: str, threshold: float = 0.3) -> List[str]:
        """
        Extract person names from text.

        Args:
            text: Input text (e.g., "find Ahmed in IT")
            threshold: Confidence threshold (0-1). Lower = more names but
                less precise. Default 0.3 is good for most cases.

        Returns:
            List of extracted names, deduplicated, in order of first appearance.

        Example:
            >>> extractor.extract_names("find Ahmed Hassan in IT")
            ['Ahmed Hassan']
            >>> extractor.extract_names("connect me with Sarah from HR")
            ['Sarah']
        """
        logger.info("Extracting names from: %s", text)
        entities = self.model.predict_entities(
            text,
            self.labels,
            threshold=threshold,
        )
        # Every configured label denotes a person, so keep every hit.
        names = [entity["text"] for entity in entities]
        # dict.fromkeys deduplicates while preserving first-seen order.
        unique_names = list(dict.fromkeys(names))
        logger.info("✓ Found %d name(s): %s", len(unique_names), unique_names)
        return unique_names

    def extract_names_with_context(
        self,
        text: str,
        threshold: float = 0.3,
    ) -> List[Dict[str, Any]]:
        """
        Extract person names with additional context (position, confidence).

        Args:
            text: Input text
            threshold: Confidence threshold (0-1)

        Returns:
            List of dictionaries with name details:
            [
                {
                    "name": "Ahmed Hassan",
                    "start": 5,
                    "end": 17,
                    "confidence": 0.95,
                    "label": "person"
                }
            ]
        """
        logger.info("Extracting names with context from: %s", text)
        entities = self.model.predict_entities(
            text,
            self.labels,
            threshold=threshold,
        )
        # Reshape GLiNER's entity dicts into this service's public schema.
        results = []
        for entity in entities:
            results.append({
                "name": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                "confidence": round(entity["score"], 2),
                "label": entity["label"],
            })
        logger.info("✓ Found %d name(s) with context", len(results))
        return results

    def extract_from_query(
        self,
        query: str,
        extract_divisions: bool = False,
        threshold: float = 0.3,
    ) -> Dict[str, Any]:
        """
        Extract names and optionally division keywords from a query.

        Args:
            query: User query text
            extract_divisions: Whether to also extract division/department
                mentions
            threshold: Confidence threshold (0-1) forwarded to extract_names.

        Returns:
            Dictionary with extracted information:
            {
                "names": ["Ahmed", "Sarah"],
                "has_names": True,
                "count": 2,
                "divisions": ["IT", "HR"] (if extract_divisions=True)
            }
        """
        names = self.extract_names(query, threshold=threshold)
        result: Dict[str, Any] = {
            "names": names,
            "has_names": len(names) > 0,
            "count": len(names),
        }
        if extract_divisions:
            # Match keywords as whole words only. A plain substring test
            # (`kw in query.upper()`) would wrongly detect e.g. "IT" inside
            # "CITY" or "AUDIT".
            words = {w.upper() for w in re.findall(r"[A-Za-z]+", query)}
            found_divisions = [
                kw for kw in self._DIVISION_KEYWORDS
                if kw.upper() in words
            ]
            result["divisions"] = found_divisions
            result["has_divisions"] = len(found_divisions) > 0
        return result
|