devusman committed on
Commit
d455ad5
·
1 Parent(s): 02c03a4
README.md CHANGED
@@ -1,19 +1,37 @@
1
- ---
2
- title: Email
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: Streamlit template space
12
- ---
13
-
14
- # Welcome to Streamlit!
15
-
16
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
17
-
18
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
19
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Email Validation Tool
2
+ This is a Streamlit application for email validation. The tool allows users to verify the validity of email addresses and analyze email-related data.
3
+
4
+ ## Features:
5
+ **Single Email Verification**: Users can enter an email address to check its validity. The tool performs syntax validation, checks MX records, establishes SMTP connection, and checks if the domain is a temporary one. The result is displayed along with key metrics like syntax validation, MX record status, and temporary domain status.
6
+
7
+ **Bulk Email Processing**: Users can upload a CSV, XLSX, or TXT file containing a list of email addresses. The tool processes each email in the file and performs validation similar to single email verification. After processing, users can download the results, including email addresses and their validation labels.
8
+
9
+ **Domain Information**: For valid email addresses, the tool provides additional domain information such as registrar, server, and country using the WHOIS database.
10
+
11
+ ## Installation
12
+ To run the Email Validation Tool locally, follow these steps:
13
+
14
+ ```
15
+ Clone the repository:
16
+ git clone https://github.com/sathish-1804/email_validation_tool.git
17
+
18
+ Change the directory to the cloned repository:
19
+ cd email_validation_tool
20
+
21
+ Install the required dependencies:
22
+ pip install -r requirements.txt
23
+ ```
24
+
25
+ To launch the Email Validation Tool, run the following command:
26
+ ```
27
+ streamlit run main.py
28
+ ```
29
+
30
+ The tool will be accessible at http://localhost:8501 in your web browser.
31
+
32
+ **Note**:
33
+ - The tool performs email validation using various methods such as syntax validation, MX record checks, and SMTP connection.
34
+ - Bulk email processing allows users to upload a file and process multiple email addresses simultaneously.
35
+ - Domain information retrieval is provided through the WHOIS database for valid email addresses.
36
+ - The tool's functionality can be customized or extended based on specific requirements.
37
+ - Ensure that the data source lists the email addresses in its first column (CSV, XLSX) or one per line (TXT), without a header row.
__pycache__/popular_domains.cpython-311.pyc ADDED
Binary file (2.11 kB). View file
 
__pycache__/popular_domains.cpython-313.pyc ADDED
Binary file (2.08 kB). View file
 
__pycache__/source_code.cpython-311.pyc ADDED
Binary file (5.72 kB). View file
 
__pycache__/source_code.cpython-313.pyc ADDED
Binary file (5.41 kB). View file
 
__pycache__/suggestion.cpython-311.pyc ADDED
Binary file (4.5 kB). View file
 
__pycache__/suggestion.cpython-313.pyc ADDED
Binary file (3.78 kB). View file
 
app.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ from tempfile import NamedTemporaryFile
3
+ import shutil
4
+ import pandas as pd
5
+ import source_code as sc
6
+ from suggestion import suggest_email_domain
7
+ import whois
8
+ from popular_domains import emailDomains
9
+ import streamlit as st
10
+ from streamlit_extras.metric_cards import style_metric_cards
11
+
12
# Browser-tab title, favicon and page layout used by the whole app.
st.set_page_config(page_title="Email verification", page_icon="✅", layout="centered")
17
+
18
def label_email(email):
    """Classify one address as "Invalid", "Unknown", "Risky" or "Valid".

    The checks run from cheapest to most expensive and stop at the first failure:
    syntax, DNS records for the domain, SMTP probe, disposable-domain lookup.
    """
    if not sc.is_valid_email(email):
        return "Invalid"

    # Only extracted once the syntax check has passed, so an "@" is expected here.
    domain = email.split('@')[1]

    if not sc.has_valid_mx_record(domain):
        return "Invalid"

    # The mailbox could not be confirmed over SMTP; that is inconclusive, not a failure.
    if not sc.verify_email(email):
        return "Unknown"

    return "Risky" if sc.is_disposable(domain) else "Valid"
28
+
29
def label_emails(input_file):
    """Dispatch an uploaded file to the processor that matches its extension.

    Args:
        input_file: Uploaded file object exposing a ``name`` attribute.

    Returns:
        Whatever the selected processor returns, or ``None`` (after showing a
        warning) when the extension is not csv, xlsx or txt. Previously the result
        was assigned to a local and silently dropped.
    """
    file_extension = input_file.name.split('.')[-1].lower()

    processors = {
        'csv': process_csv,
        'xlsx': process_xlsx,
        'txt': process_txt,
    }
    processor = processors.get(file_extension)
    if processor is None:
        st.warning("Unsupported file format. Please provide a CSV, XLSX, or TXT file.")
        return None

    return processor(input_file)
40
+
41
+
42
def process_csv(input_file):
    """Label every address found in the first column of a CSV file.

    Args:
        input_file: Path or file-like object accepted by ``pandas.read_csv``. A falsy
            value (for example ``None``) means "nothing uploaded".

    Returns:
        DataFrame with the columns ``Email`` and ``Label`` and a 1-based index. It is
        empty when no file was given or the file contains no data.
    """
    columns = ['Email', 'Label']
    if not input_file:
        return pd.DataFrame(columns=columns)

    try:
        # header=None: the first row is data, not column names.
        df = pd.read_csv(input_file, header=None)
    except pd.errors.EmptyDataError:
        # A zero-byte upload is "no addresses", not an error worth a traceback.
        return pd.DataFrame(columns=columns)

    results = []
    if not df.empty:
        for value in df.iloc[:, 0]:
            # Empty cells arrive as NaN and numeric cells as numbers; calling
            # .strip() on either used to raise AttributeError.
            if pd.isna(value):
                continue
            email = str(value).strip()
            if email:
                results.append([email, label_email(email)])

    result_df = pd.DataFrame(results, columns=columns)
    result_df.index = range(1, len(result_df) + 1)  # Starting index from 1
    return result_df
65
+
66
def process_xlsx(input_file):
    """Label every address in the first column of an Excel sheet and display the table.

    Args:
        input_file: Path or file-like object accepted by ``pandas.read_excel``.

    Returns:
        The DataFrame that was displayed (columns ``Email`` and ``Label``, 1-based
        index), so callers can reuse the results as they can with ``process_csv``.
    """
    # header=None: the first row is data, not column names.
    df = pd.read_excel(input_file, header=None)

    results = []
    if not df.empty:
        for value in df.iloc[:, 0]:
            # Empty cells arrive as NaN and numeric cells as numbers; calling
            # .strip() on either used to raise AttributeError.
            if pd.isna(value):
                continue
            email = str(value).strip()
            if email:
                results.append([email, label_email(email)])

    result_df = pd.DataFrame(results, columns=['Email', 'Label'])
    result_df.index = range(1, len(result_df) + 1)  # Starting index from 1

    # Display the results in a table
    st.dataframe(result_df)
    return result_df
80
+
81
+
82
def process_txt(input_file):
    """Label every address in a plain-text upload (one per line) and display the table.

    Args:
        input_file: Binary file-like object whose ``read()`` returns UTF-8 bytes.

    Returns:
        The DataFrame that was displayed (columns ``Email`` and ``Label``, 1-based index).
    """
    # "utf-8-sig" drops a leading byte-order mark, which would otherwise become part
    # of the first address and make it fail validation.
    input_text = input_file.read().decode("utf-8-sig").splitlines()

    results = []
    for line in input_text:
        email = line.strip()
        # Blank lines (typically a trailing newline) are not addresses.
        if not email:
            continue
        results.append([email, label_email(email)])

    result_df = pd.DataFrame(results, columns=['Email', 'Label'])
    result_df.index = range(1, len(result_df) + 1)  # Starting index from 1

    # Display the results in a table
    st.dataframe(result_df)
    return result_df
99
+
100
def _show_domain_details(domain_part):
    """Render registrar, WHOIS server and country for ``domain_part`` in an expander."""
    with st.expander("See Domain Information"):
        try:
            dm_info = whois.whois(domain_part)
            st.write("Registrar:", dm_info.registrar)
            st.write("Server:", dm_info.whois_server)
            st.write("Country:", dm_info.country)
        except Exception:
            # Best effort: a failed lookup must not hide the verification result.
            # (Was a bare ``except:``, which also swallowed KeyboardInterrupt.)
            st.error("Domain information retrieval failed.")


def _show_single_email_result(email):
    """Run every check on one address and render the outcome."""
    result = {'syntaxValidation': sc.is_valid_email(email)}
    domain_part = email.split('@')[1] if '@' in email else ''

    # Previously an address that failed syntax validation produced no output at all.
    if not result['syntaxValidation'] or not domain_part:
        st.error("Invalid email format. Please enter a valid email address.")
        return

    # A single DNS check now decides the branch and feeds the "MxRecord" metric;
    # the original queried twice.
    result['MXRecord'] = sc.has_valid_mx_record(domain_part)
    if not result['MXRecord']:
        st.warning("Not valid: MX record not found.")
        suggested_domains = suggest_email_domain(domain_part, emailDomains)
        # len() rather than truthiness so lists and arrays are handled alike.
        if len(suggested_domains) > 0:
            st.info("Suggested Domains:")
            for suggested_domain in suggested_domains:
                st.write(suggested_domain)
        else:
            st.warning("No suggested domains found.")
        return

    result['smtpConnection'] = sc.verify_email(email)
    result['is Temporary'] = sc.is_disposable(domain_part)

    is_valid = (
        result['syntaxValidation']
        and result['MXRecord']
        and result['smtpConnection']
        and not result['is Temporary']
    )

    st.markdown("**Result:**")

    col1, col2, col3 = st.columns(3)
    col1.metric(label="Syntax", value=result['syntaxValidation'])
    col2.metric(label="MxRecord", value=result['MXRecord'])
    col3.metric(label="Is Temporary", value=result['is Temporary'])

    if not result['smtpConnection']:
        st.warning("SMTP connection not established.")

    _show_domain_details(domain_part)

    if is_valid:
        st.success(f"{email} is a Valid email")
    else:
        st.error(f"{email} is a Invalid email")
        if result['is Temporary']:
            st.text("It is a disposable email")


def _handle_bulk_upload(input_file):
    """Send an uploaded file to the processor that matches its type."""
    extension = input_file.name.rsplit('.', 1)[-1].lower()

    if input_file.type == 'text/plain' or extension == 'txt':
        process_txt(input_file)  # renders its own table
    elif extension == 'xlsx':
        # Workbooks used to be handed to process_csv, which cannot parse them.
        process_xlsx(input_file)  # renders its own table
    else:
        df = process_csv(input_file)
        st.success("Processing completed. Displaying results:")
        st.dataframe(df)


def main():
    """Build the page: custom CSS, a single-address tab and a bulk-upload tab."""
    # style.css is resolved against the current working directory.
    with open('style.css') as f:
        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)

    st.title("Email Verification Tool", help="This tool verifies the validity of an email address.")
    st.info("The result may not be accurate. However, it has 90% accuracy.")

    t1, t2 = st.tabs(["Single Email", "Bulk Email Processing"])

    with t1:
        # Single email verification
        email = st.text_input("Enter an email address:")

        if st.button("Verify"):
            with st.spinner('Verifying...'):
                _show_single_email_result(email)

    with t2:
        # Bulk email processing
        st.header("Bulk Email Processing")
        input_file = st.file_uploader("Upload a CSV, XLSX, or TXT file", type=["csv", "xlsx", "txt"])
        if input_file:
            st.write("Processing...")
            _handle_bulk_upload(input_file)
201
+
202
+
203
+
204
+ if __name__ == "__main__":
205
+ main()
popular_domains.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
# Well-known mailbox providers, used as candidates when suggesting a fix for a
# mistyped domain. Every entry appears exactly once ("yandex.com" used to be
# listed twice, which could surface it twice in the suggestions).
emailDomains = np.array([
    # Global providers
    "aol.com", "att.net", "comcast.net", "facebook.com", "gmail.com", "gmx.com", "googlemail.com",
    "google.com", "hotmail.com", "hotmail.co.uk", "mac.com", "me.com", "mail.com", "msn.com",
    "live.com", "sbcglobal.net", "verizon.net", "yahoo.com", "yahoo.co.uk",
    "email.com", "fastmail.fm", "games.com", "gmx.net", "hush.com", "hushmail.com", "icloud.com",
    "iname.com", "inbox.com", "lavabit.com", "love.com", "outlook.com", "pobox.com",
    "protonmail.ch", "protonmail.com", "tutanota.de", "tutanota.com", "tutamail.com", "tuta.io",
    "keemail.me", "rocketmail.com", "safe-mail.net", "wow.com", "ygm.com",
    "ymail.com", "zoho.com", "yandex.com",
    # United States ISPs
    "bellsouth.net", "charter.net", "cox.net", "earthlink.net", "juno.com",
    # United Kingdom
    "btinternet.com", "virginmedia.com", "blueyonder.co.uk", "live.co.uk",
    "ntlworld.com", "orange.net", "sky.com", "talktalk.co.uk", "tiscali.co.uk",
    "virgin.net", "bt.com",
    # Asia
    "sina.com", "sina.cn", "qq.com", "naver.com", "hanmail.net", "daum.net", "nate.com",
    "yahoo.co.jp", "yahoo.co.kr",
    "yahoo.co.id", "yahoo.co.in", "yahoo.com.sg", "yahoo.com.ph", "163.com", "yeah.net",
    "126.com", "21cn.com", "aliyun.com", "foxmail.com",
    # France
    "hotmail.fr", "live.fr", "laposte.net", "yahoo.fr", "wanadoo.fr", "orange.fr", "gmx.fr",
    "sfr.fr", "neuf.fr", "free.fr",
    # Germany
    "gmx.de", "hotmail.de", "live.de", "online.de", "t-online.de", "web.de", "yahoo.de",
    # Italy
    "libero.it", "virgilio.it", "hotmail.it", "aol.it", "tiscali.it", "alice.it", "live.it",
    "yahoo.it",
    "email.it", "tin.it", "poste.it", "teletu.it",
    # Russia and neighbours
    "bk.ru", "inbox.ru", "list.ru", "mail.ru", "rambler.ru", "yandex.by", "yandex.kz",
    "yandex.ru", "yandex.ua", "ya.ru",
    # Belgium
    "hotmail.be", "live.be", "skynet.be", "voo.be", "tvcablenet.be", "telenet.be",
    # Argentina
    "hotmail.com.ar", "live.com.ar", "yahoo.com.ar", "fibertel.com.ar", "speedy.com.ar",
    "arnet.com.ar",
    # Mexico and Spain
    "yahoo.com.mx", "live.com.mx", "hotmail.es", "hotmail.com.mx", "prodigy.net.mx",
    # Canada
    "yahoo.ca", "hotmail.ca", "bell.net", "shaw.ca", "sympatico.ca", "rogers.com",
    # Brazil
    "yahoo.com.br", "hotmail.com.br", "outlook.com.br", "uol.com.br", "bol.com.br",
    "terra.com.br", "ig.com.br", "r7.com",
    "zipmail.com.br", "globo.com", "globomail.com", "oi.com.br",
])
requirements.txt CHANGED
@@ -1,3 +1,7 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
1
+ python-whois
2
+ dnspython
3
+ requests
4
+ streamlit
5
+ streamlit-extras
6
+ jellyfish
7
+ numpy
source_code.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import dns.resolver
3
+ import smtplib
4
+ import requests
5
+ import threading
6
+ import queue
7
+ import dns.reversename
8
+
9
# Freshness window for cached DNS answers, in seconds.
CACHE_TTL = 600

# Module-wide resolver: skips the host's resolver configuration (configure=False),
# sends every query to 8.8.8.8 and is given an in-memory answer cache.
resolver = dns.resolver.Resolver(configure=False)
resolver.cache = dns.resolver.Cache()
resolver.nameservers = ['8.8.8.8']
15
+
16
+
17
+ # def is_valid_email(email):
18
+ # # Check if "@" is present in the email
19
+ # if "@" not in email:
20
+ # return False
21
+
22
+ # local_part, domain_part = email.split('@')
23
+
24
+ # # Check for consecutive dots, hyphens, or underscores in the local part
25
+ # if re.search(r'\.{2}|-{2}|_{2}', local_part):
26
+ # return False
27
+
28
+ # # Check for consecutive dots, hyphens, or underscores in the domain part
29
+ # if re.search(r'\.{2}|-{2}|_{2}', domain_part):
30
+ # return False
31
+
32
+ # # Check for two consecutive dots, hyphens, or underscores anywhere in the email
33
+ # if re.search(r'\.\-|\-\.|\.\.|\_\-|\-\_|\_\_|\.\.|--', email):
34
+ # return False
35
+
36
+ # # Validate email syntax
37
+ # pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
38
+ # return re.match(pattern, email) is not None
39
+
40
# Compiled once at import time instead of on every call.
_EMAIL_PATTERN = re.compile(
    r'''
    \A                          # Start of string
    (?!.*[._%+-]{2})            # No two special characters in a row, anywhere
    (?![.])                     # Local part must not start with a dot
    [a-zA-Z0-9._%+-]{1,64}      # Local part: allowed characters, at most 64
    (?<![._%+-])                # Local part must not end with a special character
    @                           # "@" symbol
    (?![.-])                    # Domain must not start with a dot or a hyphen
    [a-zA-Z0-9.-]+              # Domain labels
    (?<![.-])                   # No dot or hyphen directly before the final dot
    \.[a-zA-Z]{2,}              # Top-level domain: at least two letters
    \Z                          # End of string; unlike "$", rejects a trailing newline
    ''',
    re.VERBOSE,
)


def is_valid_email(email):
    """Return True when ``email`` has an acceptable ``local@domain.tld`` shape.

    This is a syntax check only; it says nothing about whether the domain or the
    mailbox exists. Anything that is not a string is reported as invalid.
    """
    if not isinstance(email, str):
        return False
    return _EMAIL_PATTERN.match(email) is not None
55
+
56
+ # mx record validation
57
+ # Set the cache TTL (in seconds)
58
+
59
def query_dns(record_type, domain):
    """Report whether ``domain`` has at least one DNS record of ``record_type``.

    Args:
        record_type: Record type as text, e.g. ``'MX'`` or ``'A'``.
        domain: Domain name to look up.

    Returns:
        True when the lookup yields an answer; False when the domain does not exist,
        has no record of that type, the query times out, or any other error occurs.
    """
    # MX lookups use the name as given; every other type is queried with a trailing dot.
    record_name = domain if record_type == 'MX' else f'{domain}.'

    # Fail fast: at most two seconds per lookup.
    resolver.timeout = 2
    resolver.lifetime = 2

    # The former hand-written cache probe is gone: its (name, type) key never matched
    # anything the resolver stores, so it could not produce a hit.
    # NOTE(review): resolve() is expected to consult resolver.cache itself - confirm
    # against the dnspython version in use.
    try:
        resolver.resolve(record_name, record_type)
    except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer, dns.resolver.Timeout):
        # Missing domain, no record of this type, or timeout.
        return False
    except Exception:
        # Best effort: any other failure also counts as "no record". Unlike the
        # previous bare ``except:`` this lets KeyboardInterrupt/SystemExit through.
        return False
    return True
84
+
85
+
86
def has_valid_mx_record(domain):
    """Return True when ``domain`` has an MX record or, failing that, an A record.

    Both lookups run at the same time, each in its own thread, and report back
    through a dedicated queue.
    """
    def run_query(record_type, sink):
        # Thread body: perform one lookup and hand the boolean result back.
        sink.put(query_dns(record_type, domain))

    lookups = []
    for record_type in ('MX', 'A'):
        sink = queue.Queue()
        worker = threading.Thread(target=run_query, args=(record_type, sink))
        lookups.append((worker, sink))

    for worker, _ in lookups:
        worker.start()

    # Wait for both lookups before reading either result.
    for worker, _ in lookups:
        worker.join()

    mx_found, a_found = (sink.get() for _, sink in lookups)
    return mx_found or a_found
109
+
110
+
111
+ # smtp connection
112
def verify_email(email, timeout=10):
    """Ask the domain's mail servers whether they would accept mail for ``email``.

    Looks up the domain's MX records, then contacts the hosts in order of preference
    (lowest value first) and issues ``EHLO``, ``MAIL FROM:<>`` and ``RCPT TO`` without
    sending a message.

    Args:
        email: Address to probe.
        timeout: Seconds to wait on each SMTP connection. Without a limit, a host
            that never answers would block the caller indefinitely.

    Returns:
        True as soon as one server answers ``RCPT TO`` with code 250. False when the
        address has no domain part, the MX lookup fails, or no server accepts it.
    """
    parts = email.split('@')
    if len(parts) < 2:
        return False
    domain = parts[1]

    try:
        mx_records = dns.resolver.resolve(domain, 'MX')
    except dns.exception.DNSException:
        # Covers NoAnswer as before, plus NXDOMAIN, timeouts and unreachable name
        # servers, which used to escape as unhandled exceptions.
        return False

    for mx in sorted(mx_records, key=lambda record: record.preference):
        try:
            # The context manager sends QUIT and closes the socket even when a
            # command fails part-way through the conversation.
            with smtplib.SMTP(str(mx.exchange), timeout=timeout) as smtp_server:
                smtp_server.ehlo()
                smtp_server.mail('')
                code, _ = smtp_server.rcpt(str(email))
        except (smtplib.SMTPException, OSError, UnicodeError):
            # Unreachable or misbehaving host: move on to the next exchange.
            continue
        if code == 250:
            return True

    return False
136
+
137
+
138
+ # temporary domain
139
# Public lists of disposable / burner mail domains, one domain per line.
_BLACKLIST_URLS = (
    'https://raw.githubusercontent.com/andreis/disposable-email-domains/master/domains.txt',
    'https://raw.githubusercontent.com/wesbos/burner-email-providers/master/emails.txt',
)

# url -> frozenset of lower-cased domains. Filled on first successful download and
# kept for the lifetime of the process, so bulk runs do not re-download per address.
_BLACKLIST_CACHE = {}


def _load_blacklist(url, timeout=10):
    """Return the domain set behind ``url``, downloading it only on first use."""
    cached = _BLACKLIST_CACHE.get(url)
    if cached is None:
        response = requests.get(url, timeout=timeout)
        # Without this an error page would be parsed as if it were a domain list.
        response.raise_for_status()
        cached = frozenset(
            line.strip().lower()
            for line in response.text.splitlines()
            if line.strip()
        )
        _BLACKLIST_CACHE[url] = cached
    return cached


def is_disposable(domain):
    """Return True when ``domain`` appears on a known disposable-mail blocklist.

    The comparison ignores case and surrounding whitespace. A list that cannot be
    loaded is skipped (the error is printed), so a network failure yields False
    rather than an exception.
    """
    needle = domain.strip().lower()

    for blacklist_url in _BLACKLIST_URLS:
        try:
            if needle in _load_blacklist(blacklist_url):
                return True
        except Exception as e:
            print(f'Error loading blacklist {blacklist_url}: {e}')
    return False
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
- import streamlit as st
5
-
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
style.css ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .css-1xarl3l {
2
+ font-size: 1.25rem;
3
+ padding-bottom: 0.25rem;
4
+ }
5
+
6
+ /* Move block container higher */
7
+ div.block-container.css-18e3th9.egzxvld2 {
8
+ margin-top: -5em;
9
+ }
10
+
11
+ #MainMenu {visibility: hidden;}
12
+ footer {visibility: hidden;}
13
+
14
+ div.block-container.css-z5fcl4.e1g8pov64{
15
+ margin-top: -5em;
16
+ }
17
+ /* Path: static\style.css */
suggestion.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from popular_domains import emailDomains
2
+ import jellyfish
3
+ from typing import List
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ import numpy as np
6
+
7
class TrieNode:
    """One character position inside a ``Trie``."""

    def __init__(self, char: str):
        # Next character -> child node that continues the word.
        self.children = {}
        # True when a stored word ends exactly at this node.
        self.word_end = False
        # The character this node stands for ('' for the root).
        self.char = char
12
+
13
class Trie:
    """Prefix tree over strings, supporting insertion and exact-match lookup."""

    def __init__(self):
        # The root carries the empty string and never ends a word on its own
        # unless "" is added explicitly.
        self.root = TrieNode('')

    def add(self, word: str):
        """Store ``word``, creating any nodes that are missing along its path."""
        cursor = self.root
        for ch in word:
            following = cursor.children.get(ch)
            if following is None:
                following = TrieNode(ch)
                cursor.children[ch] = following
            cursor = following
        cursor.word_end = True

    def search(self, word: str) -> bool:
        """Return True only when exactly ``word`` was added (a mere prefix is not enough)."""
        cursor = self.root
        for ch in word:
            cursor = cursor.children.get(ch)
            if cursor is None:
                return False
        return cursor.word_end
32
+
33
def suggest_email_domain(domain: str, valid_domains: List[str]) -> List[str]:
    """Suggest known domains that ``domain`` was probably meant to be.

    Args:
        domain: The (possibly mistyped) domain; compared case-insensitively.
        valid_domains: Candidate domains; any iterable of strings, including a
            NumPy string array.

    Returns:
        A plain list of ``str``: first the candidates with the smallest
        Damerau-Levenshtein distance to ``domain`` (only distances of 2 or less
        count), sorted alphabetically, then every other candidate sharing its
        Soundex code, in input order. Empty when nothing matches or there are no
        candidates.
    """
    max_distance = 2

    # Plain str values, de-duplicated with the original order preserved.
    candidates = list(dict.fromkeys(str(candidate) for candidate in valid_domains))
    if not candidates:
        # The old thread pool was sized from len(valid_domains) and raised
        # ValueError for an empty list.
        return []

    query = domain.lower()

    # Keep only the group of candidates at the smallest distance seen so far.
    # The thread pool and trie filter were dropped: each distance takes
    # microseconds, and every candidate is by construction in the list.
    best_distance = max_distance + 1
    closest = []
    for candidate in candidates:
        distance = jellyfish.damerau_levenshtein_distance(query, candidate)
        if distance > max_distance:
            continue
        if distance < best_distance:
            best_distance = distance
            closest = [candidate]
        elif distance == best_distance:
            closest.append(candidate)
    closest.sort()

    # Phonetic fallback. ``closest`` is always a list now; it used to start out as
    # an empty NumPy array, and adding a list to that breaks.
    already_suggested = set(closest)
    query_code = jellyfish.soundex(query)
    sound_alike = [
        candidate
        for candidate in candidates
        if candidate not in already_suggested and jellyfish.soundex(candidate) == query_code
    ]

    return closest + sound_alike
63
+
64
+