import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download the NLTK resources needed for tokenization, stop words, and lemmatization.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))


# Domain-specific stop words for event/trade-show descriptions, plus short
# tokenizer/lemmatizer artefacts such as 'th', 'u', and 'bb'.
stop_words_2 = {
    'show', 'international', 'exhibition', 'trade', 'fair', 'global', 'conference',
    'world', 'expo', 'event', 'wellknown', 'popular', 'new', 'together', 'latest',
    'offer', 'trend', 'sector', 'exhibitor', 'th', 'one', 'like', 'also', 'held',
    'well', 'etc', 'u', 'bb', 'provide', 'provides', 'day', 'attendee', 'year',
    'best', 'top', 'management', 'brings', 'bring', 'topic', 'visitor', 'buyer',
    'brand', 'take', 'national', 'great', 'come',
}

stop_words = stop_words.union(stop_words_2)

# Collect place names (countries, capitals, regions, subregions, states, cities,
# and zones) so location words can be stripped before the BERT-based step.
list_location = []
countries = pd.read_csv('countries.csv')
for col in ['name', 'capital', 'region', 'subregion']:
    list_location.extend(set(countries[col]))
list_location.extend(set(pd.read_csv('states.csv')['name']))
list_location.extend(set(pd.read_csv('cities.csv')['name']))
list_location.extend(set(pd.read_csv('zones.csv')['Zone']))

# Lower-case the names, dropping missing values and literal 'nan' strings.
locations_removal = {x.lower() for x in list_location if not pd.isna(x)}
locations_removal.discard('nan')

# stop_words already includes stop_words_2, so a single union suffices here.
stop_words_bert = stop_words.union(locations_removal)

# Shared lemmatizer instance (created once so WordNet is not re-loaded per call).
lemmatizer = WordNetLemmatizer()

def preprocess_text(keyword):
    # First stop-word pass on the raw text (case-insensitive).
    keyword = ' '.join(w for w in word_tokenize(keyword) if w.lower() not in stop_words)
    keyword = keyword.replace('/', ' ')
    # Strip leading/trailing punctuation runs.
    keyword = re.sub(r"^[^a-zA-Z0-9]+|[^a-zA-Z0-9\)]+$", " ", keyword).strip()
    keyword = keyword.replace('_', ' ')
    keyword = keyword.replace('&', ' ').strip()
    # Drop non-ASCII characters and lower-case.
    keyword = keyword.encode('ascii', 'ignore').decode('utf-8').strip().lower()
    # Keep letters and whitespace only.
    keyword = re.sub(r'[^a-zA-Z\s]', '', keyword)
    # Second stop-word pass on the cleaned, lower-cased tokens.
    words = [word for word in word_tokenize(keyword) if word not in stop_words]
    # Lemmatize and de-duplicate (note: set() does not preserve word order).
    words = list({lemmatizer.lemmatize(word) for word in words})
    # Third pass: some lemmas (e.g. 'buyers' -> 'buyer') only become stop words
    # after lemmatization.
    words = [word for word in words if word not in stop_words]
    processed_text = ' '.join(words)
    # Drop junk tokens in which a single character repeats more than ten times in a row.
    processed_text = re.sub(r'\b\w*([a-zA-Z])\1{10,}\w*\b', '', processed_text)
    return processed_text
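
# Illustrative example only; the sample string is made up, actual output depends
# on the downloaded NLTK data, and word order varies due to set-based dedup:
#   preprocess_text("International Food & Beverage Expo")  ->  'beverage food'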

def bert_preprocess(keyword):

    # Remove all-caps abbreviations such as 'USA' or 'B.B.'.
    keyword = re.sub(r"\b[A-Z\.]{2,}\b", ' ', keyword)

    # Convert to lowercase.
    keyword = keyword.lower()

    # Tokenize and drop stop words and location names; punctuation is stripped
    # from each token before the lookup so that e.g. 'paris,' still matches 'paris'.
    keyword = ' '.join(w for w in word_tokenize(keyword)
                       if re.sub(r'[^\w\s]', '', w) not in stop_words_bert)

    # Strip leading/trailing punctuation runs, then keep letters and spaces only.
    keyword = re.sub(r"^[^a-zA-Z0-9]+|[^a-zA-Z0-9\)]+$", " ", keyword)
    keyword = re.sub(r'[^a-zA-Z\s]', ' ', keyword)

    # Lemmatize the remaining tokens; unlike preprocess_text, word order is
    # preserved here and there is no de-duplication.
    words = [lemmatizer.lemmatize(word) for word in word_tokenize(keyword)]

    # Drop junk tokens in which a single character repeats more than ten times in a row.
    processed_text = re.sub(r'\b\w*([a-zA-Z])\1{10,}\w*\b', '', ' '.join(words))

    # Collapse repeated whitespace.
    processed_text = ' '.join(processed_text.split())

    return processed_text
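
# Minimal smoke test; the sample string is made up for illustration and assumes
# 'Berlin' appears in the location CSVs loaded above.
if __name__ == '__main__':
    sample = "Berlin International Food & Beverage Expo brings together exhibitors and buyers"
    print(preprocess_text(sample))   # e.g. 'berlin beverage food' (set-based, order varies)
    print(bert_preprocess(sample))   # e.g. 'food beverage exhibitor buyer' (locations removed)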