File size: 4,365 Bytes
52a0fe9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
"""
Named Entity Recognition using spaCy.
Extracts persons, organizations, dates, monetary amounts, locations, and more.
Also uses regex patterns for additional entity types.
"""
import re
from collections import Counter
from typing import List, Dict
from models.schemas import Entity, EntityResult
from config import SPACY_MODEL, NER_ENTITY_TYPES

# spaCy is optional: attempt the import and the model load once, at module
# import time, and record the outcome in SPACY_AVAILABLE / nlp.
try:
    import spacy
    nlp = spacy.load(SPACY_MODEL)
except (ImportError, OSError):
    # ImportError: the spacy package could not be imported.
    # OSError: raised while loading the model named by SPACY_MODEL.
    nlp = None
    SPACY_AVAILABLE = False
else:
    SPACY_AVAILABLE = True

# Human-readable descriptions for entity labels, grouped by where the label
# comes from. Key order of the merged mapping matches declaration order.

# Labels looked up for entities produced by the NER pipeline.
_NER_LABEL_DESCRIPTIONS: Dict[str, str] = {
    "PERSON": "Person name",
    "ORG": "Organization",
    "GPE": "Country / City / State",
    "DATE": "Date or period",
    "MONEY": "Monetary value",
    "TIME": "Time expression",
    "PERCENT": "Percentage",
    "EVENT": "Named event",
    "PRODUCT": "Product name",
    "LAW": "Law or regulation",
    "NORP": "Nationality / Group",
    "FAC": "Facility / Building",
    "LOC": "Non-GPE location",
    "WORK_OF_ART": "Title of work",
    "LANGUAGE": "Language name",
    "CARDINAL": "Number",
    "ORDINAL": "Ordinal number",
    "QUANTITY": "Measurement",
}

# Labels produced by the regex-based extraction.
_PATTERN_LABEL_DESCRIPTIONS: Dict[str, str] = {
    "EMAIL": "Email address",
    "PHONE": "Phone number",
    "URL": "Web URL",
}

# Entity label descriptions
LABEL_DESCRIPTIONS: Dict[str, str] = {
    **_NER_LABEL_DESCRIPTIONS,
    **_PATTERN_LABEL_DESCRIPTIONS,
}

# Regex patterns for additional entity types.
# Maps an entity label to an (uncompiled) pattern string. None of the patterns
# contain capturing groups, so re.findall() yields the full matched text.
REGEX_PATTERNS = {
    # local-part "@" domain "." TLD, where the TLD is two or more ASCII
    # letters. The TLD class is [A-Za-z]; a "|" inside a character class is a
    # literal pipe, not alternation, so it must not appear there.
    "EMAIL": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    # Optional country code (optional "+", 1-3 digits), then a 2-4 digit group
    # that may be parenthesised, then two groups of 3-4 digits; each group may
    # be separated by "-", "." or whitespace.
    "PHONE": r'(?:\+?\d{1,3}[-.\s]?)?\(?\d{2,4}\)?[-.\s]?\d{3,4}[-.\s]?\d{3,4}',
    # http:// or https://, a host made of word chars, "-", "." or %XX escapes,
    # followed by any run of path / query / fragment characters.
    "URL": r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[/\w\-._~:/?#\[\]@!$&\'()*+,;=%]*',
}


def _extract_regex_entities(text: str) -> List[Entity]:
    """Find pattern-based entities in *text*.

    Every pattern in ``REGEX_PATTERNS`` is applied to the text. Identical
    matches are collapsed into a single ``Entity`` whose ``count`` is the
    number of occurrences.

    Args:
        text: The input text to scan.

    Returns:
        Entities grouped by label in ``REGEX_PATTERNS`` order; within a label
        they are ordered from most to least frequent.
    """
    found: List[Entity] = []
    for label, pattern in REGEX_PATTERNS.items():
        description = LABEL_DESCRIPTIONS.get(label, label)
        # Counter of an empty match list is empty, so labels without matches
        # contribute nothing.
        occurrences = Counter(re.findall(pattern, text))
        found.extend(
            Entity(
                text=matched,
                label=label,
                label_description=description,
                count=times,
            )
            for matched, times in occurrences.most_common()
        )
    return found


def _chunk_boundaries(text: str, max_length: int) -> List[int]:
    """Return cut offsets that split *text* into chunks of at most *max_length*.

    The returned list starts with 0 and ends with ``len(text)``; each pair of
    consecutive offsets delimits one chunk. A cut is placed just after the last
    newline (or, failing that, the last space) found in the second half of the
    window so that a word is not split in two; a window with no such character
    is cut at exactly *max_length* characters.

    Raises:
        ValueError: If *max_length* is not positive.
    """
    if max_length <= 0:
        raise ValueError("max_length must be positive")

    boundaries = [0]
    total = len(text)
    start = 0
    while start < total:
        end = min(start + max_length, total)
        if end < total:
            # Only look in the second half of the window so chunks never
            # shrink below max_length // 2 (keeps the chunk count bounded).
            floor = start + max_length // 2
            split_at = text.rfind("\n", floor, end)
            if split_at == -1:
                split_at = text.rfind(" ", floor, end)
            if split_at != -1:
                end = split_at + 1
        boundaries.append(end)
        start = end
    return boundaries


def _extract_spacy_entities(text: str) -> List[Entity]:
    """Extract entities using spaCy NER.

    The text is processed in chunks of at most 100000 characters so that long
    inputs are analysed in full rather than cut off. Entities whose label is
    not in ``NER_ENTITY_TYPES`` or whose stripped text is shorter than two
    characters are skipped. Occurrences are merged case-insensitively per
    label; the first-seen spelling is kept as the entity text.

    Args:
        text: The input text to analyze.

    Returns:
        One ``Entity`` per distinct (label, lower-cased text) pair, sorted by
        descending count (ties keep first-seen order). ``positions`` holds the
        character offset of every occurrence relative to the full *text*.
        Returns an empty list when spaCy or its model is unavailable.
    """
    if not SPACY_AVAILABLE or nlp is None:
        return []

    # Process the text chunk by chunk; texts up to max_length form one chunk.
    max_length = 100000
    boundaries = _chunk_boundaries(text, max_length)

    # Collect and deduplicate entities across all chunks
    entity_map: Dict[str, Dict] = {}

    for chunk_start, chunk_end in zip(boundaries, boundaries[1:]):
        doc = nlp(text[chunk_start:chunk_end])

        for ent in doc.ents:
            if ent.label_ not in NER_ENTITY_TYPES:
                continue

            clean_text = ent.text.strip()
            if len(clean_text) < 2:
                continue

            # start_char is relative to the chunk; shift it back so positions
            # index into the original text.
            position = chunk_start + ent.start_char

            key = f"{ent.label_}:{clean_text.lower()}"
            record = entity_map.get(key)
            if record is None:
                entity_map[key] = {
                    "text": clean_text,
                    "label": ent.label_,
                    "label_description": LABEL_DESCRIPTIONS.get(ent.label_, ent.label_),
                    "count": 1,
                    "positions": [position],
                }
            else:
                record["count"] += 1
                record["positions"].append(position)

    # Convert to Entity objects and sort by count (stable, so ties keep
    # first-seen order)
    return [
        Entity(**data)
        for data in sorted(entity_map.values(), key=lambda x: x["count"], reverse=True)
    ]


def extract_entities(text: str) -> EntityResult:
    """
    Run both extractors (spaCy NER and regex patterns) over the text.

    Args:
        text: The input text to analyze.

    Returns:
        EntityResult holding the combined entity list (spaCy entities first,
        then regex entities), the summed occurrence count per label, and the
        overall number of occurrences. Whitespace-only or empty text yields
        an empty result.
    """
    # Nothing to analyze: short-circuit before invoking either extractor.
    if not text.strip():
        return EntityResult(entities=[], entity_counts={}, total_entities=0)

    # spaCy results come first, regex results are appended after them.
    combined = [*_extract_spacy_entities(text), *_extract_regex_entities(text)]

    # Tally occurrences per label and overall in a single pass.
    per_label: Dict[str, int] = {}
    grand_total = 0
    for entity in combined:
        per_label[entity.label] = per_label.get(entity.label, 0) + entity.count
        grand_total += entity.count

    return EntityResult(
        entities=combined,
        entity_counts=per_label,
        total_entities=grand_total,
    )