import re
from string import ascii_lowercase

import numpy as np
import pandas as pd
import requests
import snowballstemmer
from bs4 import BeautifulSoup
from gensim.models import Doc2Vec
from tensorflow.keras.models import load_model

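# Pipeline: scrape product reviews from Flipkart or Amazon, embed each review
# with a pre-trained Doc2Vec model, classify the embeddings with a pre-trained
# Keras model, and keep only the reviews predicted to be deceptive.

# Browser-like User-Agent so the review pages are served instead of the sites
# blocking the default requests UA.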
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36, Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.18'
}

def getsoup(url, max_retries=5):
    """Fetch url and return parsed HTML; retry in a loop instead of recursing
    (the original recursive retry had no depth limit)."""
    for _ in range(max_retries):
        response = requests.get(url, headers=headers)
        print(url)
        print(response.status_code)
        if response.status_code == 200:
            return BeautifulSoup(response.content, features="lxml")
    raise RuntimeError("Failed to fetch %s after %d attempts" % (url, max_retries))

# Get the number of the last review page.
def getLastPageNumber(soup, site):
    lastPage = 1  # default also covers unknown sites and products with <=10 reviews
    if site == 'flipkart':
        # NOTE: Flipkart's obfuscated CSS class names are version-specific and
        # may need updating when the site's markup changes.
        review_number = int(soup.find("span", "_2_R_DZ").text.strip().replace(',', '').split()[-2])
        if review_number > 10:
            link = soup.find(attrs={"class": "_2MImiq _1Qnn1K"})
            pageNumber = link.find('span').text.strip().replace(',', '').split()
            lastPage = int(pageNumber[-1])
    elif site == 'amazon':
        review_number = int(soup.find("div", {"data-hook": "cr-filter-info-review-rating-count"}).text.strip().replace(',', '').split()[-3])
        if review_number > 10:
            # Amazon shows 10 reviews per page.
            lastPage = review_number // 10
    if lastPage > 500:
        lastPage = 2  # safety cap so a huge product does not trigger a runaway crawl
    return lastPage
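
# Worked example: a product with 137 Amazon reviews gives 137 // 10 = 13 pages;
# ceiling division (-(-137 // 10) == 14) would also cover the final, partially
# filled page.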


def geturllist(url, lastPage):
    """Build one URL per review page by swapping the trailing page number."""
    base = url[:-1]  # drop the trailing '1' of '&page=1' / '&pageNumber=1'
    return [base + str(i) for i in range(1, lastPage + 1)]
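
# For instance, a hypothetical call geturllist('https://example.com/reviews?pageNumber=1', 3)
# returns the same URL with pageNumber=1, 2, and 3.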


def getReviews(soup, site, url):
    """Pull title, author, text, rating, date, and helpful votes from one review page."""
    if site == 'flipkart':
        # NOTE: as above, these obfuscated class names matched Flipkart's
        # markup at the time of writing.
        title = [s.text for s in soup.find_all("p", '_2-N8zT')]
        author = [r.text for r in soup.find_all("p", "_2sc7ZR _2V5EHH")]
        text = [t.text for t in soup.find_all("div", 't-ZTKy')]
        # The three class variants correspond to different star-rating badge colours.
        rate = [d.text for d in soup.find_all("div", {"class": ["_3LWZlK _1BLPMq", "_3LWZlK _32lA32 _1BLPMq", "_3LWZlK _1rdVr6 _1BLPMq"]})]
        date = [d.text for d in soup.find_all(lambda tag: tag.name == 'p' and tag.get('class') == ['_2sc7ZR'])]
        help1 = [d.text for d in soup.find_all(lambda tag: tag.name == 'div' and tag.get('class') == ['_1LmwT9'])]

    elif site == 'amazon':
        title = [s.text.replace('\n', '') for s in soup.find_all(attrs={"data-hook": "review-title", "class": "a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold"})]
        n_ = len(title)

        # "a-profile-name" also matches profile names outside the review list,
        # so keep only the last n_ entries.
        author = [r.text for r in soup.find_all(attrs={"class": "a-profile-name"})]
        if len(author) > n_:
            author = author[len(author) - n_:]

        text = [t.text.replace('\n', '') for t in soup.find_all(attrs={"data-hook": "review-body", "class": "a-size-base review-text review-text-content"})]
        rate = [d.text for d in soup.find_all(attrs={"data-hook": "review-star-rating"})]
        date = [d.text for d in soup.find_all(attrs={"data-hook": "review-date"})]

        # Reviews without helpful votes have no statement element at all, so
        # pad with zeros until the column lines up with the other fields.
        help1 = [d.text.replace('\n          ', '') for d in soup.find_all(attrs={"data-hook": "helpful-vote-statement"})]
        help1 += [0] * max(0, n_ - len(help1))

    collate = {'Date': date, 'URL': [url] * len(date), 'Review_Title': title, 'Author': author,
               'Rating': rate, 'Review_text': text, 'Review_helpful': help1}
    return pd.DataFrame.from_dict(collate)

def scraper(url):
    """Scrape all review pages for a product and return the list of review texts."""
    soup = getsoup(url)
    site = url.split('.')[1]  # 'flipkart' or 'amazon' from www.<site>.<tld>
    if site == 'flipkart':
        url = url + '&page=1'
    elif site == 'amazon':
        url = url + '&pageNumber=1'
    # getLastPageNumber(soup, site) would give the real page count; it is
    # pinned to 1 here, which keeps the run to a single page.
    lastPage = 1
    frames = []
    for page_url in geturllist(url, lastPage):
        soup = getsoup(page_url)
        frames.append(getReviews(soup, site, page_url))
    # DataFrame.append was removed in pandas 2.0; concatenate instead.
    df3 = pd.concat(frames, ignore_index=True)
    print(list(df3['Review_text']))
    return list(df3['Review_text'])

arr = scraper('https://www.amazon.in/Redmi-inches-Ready-Smart-L32R8-FVIN/product-review/B0BVMLNGXR/ref=lp_90117314031_1_1?pf_rd_p=9e034799-55e2-4ab2-b0d0-eb42f95b2d05&pf_rd_r=V81TJ2VTRM0BYHQ6XX8S&sbo=RZvfv%2F%2FHxDF%2BO5021pAnSA%3D%3D&th=1')


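# Pre-trained Keras deception classifier; the zero-padding further down implies
# it expects inputs of shape (380, 512) per sample.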
loaded_model = load_model('weights.best.from_scratch1 (1).hdf5')

def preprocess_text(text):
    """Strip punctuation/digits, stem, lowercase, and remove stop words; return tokens."""
    stemmer = snowballstemmer.EnglishStemmer()
    # Replace punctuation, digits, and curly quotes with spaces, then stem each token.
    text = " ".join(stemmer.stemWords(re.sub('[!"#%\'()*+,-./:;<=>?@[\\]^_`{|}~1234567890’”“′‘\\\\]', ' ', text).split(' ')))

    # Stop list: a few function words, number words, and single letters, plus their stems.
    stop_words = set(["may", "also", "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "across", "among", "beside", "however", "yet", "within"] + list(ascii_lowercase))
    stop_words.update(stemmer.stemWords(stop_words))
    text = " ".join(filter(None, filter(lambda word: word not in stop_words, text.lower().split(' '))))

    return text.split(' ')
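
# For example, preprocess_text("Loved it!! 10/10 display quality") strips the
# digits and punctuation and returns stemmed, lowercased tokens (the exact
# stems depend on the Snowball stemmer).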


# arr = ['This is not a nice product', 'This is brilliant product', "I have had this tv for a year and a half. The tv worked smoothly and recently, it's display stopped working. I have contacted the company and they assured a repair within 8-10 days replacing the display panel. Then came the worst after sale experience I had ever. They didn't respond to my calls and whenever, I raised a grievance, I was just given different dates and extensions.", ''' In monotheistic belief systems, God is usually viewed as the supreme being, creator, and principal object of faith. In polytheistic belief systems, a god is "a spirit or being believed to have created, or for controlling some part of the universe or life, for which such a deity is often worshipped ''']

preprocessed_arr = [preprocess_text(x) for x in arr]

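# Doc2Vec model, presumably trained on an opinion(-spam) corpus per its file
# name; used here to embed each preprocessed review.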
doc2vec_model = Doc2Vec.load("doc2vec_model_opinion_corpus (1).d2v")

def vectorize_comments_(docs, d2v_model):
    """Embed each tokenised comment with the Doc2Vec model.

    The original code looked up d2v_model.docvecs['SENT_%s' % i], which returns
    vectors of the model's *training* documents, not of these freshly scraped
    comments; unseen text has to go through infer_vector instead.
    """
    return [d2v_model.infer_vector(doc) for doc in docs]
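
# Note: infer_vector is stochastic, so repeated runs can produce slightly
# different vectors unless the model's seed and inference epochs are fixed.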

textData = vectorize_comments_(preprocessed_arr, doc2vec_model)

textData_array = np.array(textData)

# The classifier expects samples of shape (380, 512); place each document
# vector in the first timestep and zero-pad the remaining positions.
num_vectors = textData_array.shape[0]
textData_3d = textData_array.reshape((num_vectors, 1, -1))

new_shape = (num_vectors, 380, 512)
X_test3_reshaped = np.zeros(new_shape, dtype=textData_3d.dtype)
X_test3_reshaped[:, :textData_3d.shape[1], :textData_3d.shape[2]] = textData_3d

# Take argmax over the raw class scores; rounding with np.rint first (as the
# original did) loses information and can pick the wrong class on near-ties.
predictions = loaded_model.predict(X_test3_reshaped)

argMax = [np.argmax(p) for p in predictions]

def returnRequirements(comments, preds):
    """Return the comments whose predicted class marks them as deceptive."""
    return [comments[i] for i, label in enumerate(preds) if label in (0, 3)]

# 3 & 0 is deceptive rest aren't
print(argMax)
print(returnRequirements(arr, argMax))