harshithakr committed on
Commit
281f87d
·
1 Parent(s): 71b7f95

Create preprocess_fun.py

Browse files
Files changed (1) hide show
  1. preprocess_fun.py +80 -0
preprocess_fun.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ from nltk import ngrams
4
+ from nltk.corpus import wordnet
5
+ from nltk.corpus import stopwords
6
+ from nltk.tokenize import word_tokenize
7
+ from nltk.stem import WordNetLemmatizer
8
+ import nltk
9
+ nltk.download('wordnet')
10
+ nltk.download('stopwords')
11
+ nltk.download('punkt')
12
+ nltk.download('stopwords')
13
+ stop_words = set(stopwords.words('english'))
14
+
15
+
16
+ stop_words_2 = ('show','international','exhibition','trade','fair','global','conference','world',
17
+ 'expo','event','wellknown','popular','new', 'together',
18
+ 'latest','offer','trend','sector','exhibitor','th','one','like','also','held','well','etc','u','bb',
19
+ 'provide', 'provides', 'provide','day','attendee','year', 'best','top','management',
20
+ 'brings','bring','event','topic','visitor','buyer','brand','take','u','national','great','come')
21
+
22
+ stop_words = stop_words.union(stop_words_2)
23
+
24
+ list_location = []
25
+ for col in ['name','capital','region','subregion']:#countries
26
+ list_location.extend(list(set(pd.read_csv('/content/countries.csv')[col])))
27
+ list_location.extend(list(set(pd.read_csv('/content/states.csv')['name'])))
28
+ list_location.extend(list(set(pd.read_csv('/content/cities.csv')['name'])))
29
+ list_location.extend(list(set(pd.read_csv('/content/zones.csv')['Zone'])))
30
+
31
+ locations_removal = set([x.lower() for x in list_location if not pd.isna(x)])
32
+
33
+ locations_removal.discard('nan')
34
+
35
+ stop_words_bert = stop_words.union(locations_removal).union(stop_words_2)
36
+
37
+ def preprocess_text(keyword):
38
+ keyword = ' '.join([w for w in word_tokenize(keyword) if not w.lower() in stop_words])
39
+ keyword = keyword.replace('/', ' ')
40
+ keyword = re.sub(r"^[^a-zA-Z0-9]+|[^a-zA-Z0-9\)]+$", " ", keyword).strip()
41
+ keyword = keyword.replace('_', ' ')
42
+ keyword = keyword.replace('&', ' ').strip()
43
+ keyword = keyword.encode('ascii', 'ignore').decode('utf-8').strip().lower()
44
+ keyword = re.sub(r'[^a-zA-Z\s]', '', keyword)
45
+ words = word_tokenize(keyword)
46
+ words = [word for word in words if word not in stop_words]
47
+ lemmatizer = WordNetLemmatizer()
48
+ words = list(set([lemmatizer.lemmatize(word) for word in words]))
49
+ words = [word for word in words if word not in stop_words]
50
+ processed_text = ' '.join(words)
51
+ processed_text = re.sub(r'\b\w*([a-zA-Z])\1{10,}\w*\b', '', processed_text)
52
+ return processed_text
53
+
54
+ def bert_preprocess(keyword):
55
+
56
+ # Remove abbreviations
57
+ keyword = re.sub(r"\b[A-Z\.]{2,}\b", ' ', keyword)
58
+
59
+ # Convert to lowercase
60
+ keyword = keyword.lower()
61
+
62
+ # Tokenize and remove stop words
63
+ keyword = ' '.join([w for w in word_tokenize(keyword) if re.sub(r'[^\w\s]', '', w.lower()) not in stop_words_bert])
64
+
65
+ # Remove special characters, unwanted patterns, and symbols
66
+ keyword = re.sub(r"^[^a-zA-Z0-9]+|[^a-zA-Z0-9\)]+$", " ", keyword)
67
+ keyword = re.sub(r'[^a-zA-Z\s]', ' ', keyword)
68
+
69
+ # Clean up and lemmatize words
70
+ lemmatizer = WordNetLemmatizer()
71
+ words = [w for w in word_tokenize(keyword)]
72
+ words = [lemmatizer.lemmatize(word) for word in words]
73
+
74
+ # Remove repeated characters
75
+ processed_text = re.sub(r'\b\w*([a-zA-Z])\1{10,}\w*\b', '', ' '.join(words))
76
+
77
+ # Join words and remove unnecessary spaces
78
+ processed_text = ' '.join(processed_text.split())
79
+
80
+ return processed_text