File size: 2,257 Bytes
d455ad5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from popular_domains import emailDomains
import jellyfish
from typing import List
from concurrent.futures import ThreadPoolExecutor
import numpy as np

class TrieNode:
    def __init__(self, char: str):
        self.char = char
        self.children = {}
        self.word_end = False

class Trie:
    def __init__(self):
        self.root = TrieNode('')

    def add(self, word: str):
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode(char)
            node = node.children[char]
        node.word_end = True

    def search(self, word: str) -> bool:
        node = self.root
        for char in word:
            if char not in node.children:
                return False
            node = node.children[char]
        return node.word_end

def suggest_email_domain(domain: str, valid_domains: List[str]) -> List[str]:
    # Build a trie with valid domains
    trie = Trie()
    for valid_domain in valid_domains:
        trie.add(valid_domain)

    # Calculate distances using a faster string distance metric
    distances = {}
    with ThreadPoolExecutor(max_workers=np.minimum(16, len(valid_domains))) as executor:
        for valid_domain, distance in zip(valid_domains, executor.map(lambda x: jellyfish.damerau_levenshtein_distance(domain, x), valid_domains)):
            if distance <= 2:
                if distance in distances:
                    if valid_domain not in distances[distance]:
                        distances[distance].append(valid_domain)
                else:
                    distances[distance] = [valid_domain]

    # Choose the most similar domains based on alphabetical order
    sorted_domains = np.array([])
    if distances:
        min_distance = min(distances.keys())
        sorted_domains = sorted(distances[min_distance])
        sorted_domains = [d for d in sorted_domains if trie.search(d)]

    # Check for phonetic similarity using Soundex
    soundex_domain = jellyfish.soundex(domain)
    phonetically_similar_domains = [d for d in valid_domains if jellyfish.soundex(d) == soundex_domain and d not in sorted_domains]

    # Combine and return the results
    return sorted_domains + phonetically_similar_domains