honey234 committed on
Commit
3976c3d
·
1 Parent(s): 2974804
img/advanced-search-01.png DELETED
Binary file (104 kB)
 
scraper/__init__.py DELETED
File without changes
scraper/progress.py DELETED
@@ -1,46 +0,0 @@
1
- import sys
2
-
3
-
4
class Progress:
    """Render a one-line, in-place progress indicator for tweet scraping.

    Every call writes a carriage-return-prefixed status line to stdout so
    successive calls overwrite the same terminal line instead of scrolling.
    """

    def __init__(self, current, total) -> None:
        # current: tweets scraped so far; total: target tweet count
        # (may be 0 when the caller scrapes without a fixed target).
        self.current = current
        self.total = total

    def print_progress(self, current, waiting, retry_cnt, no_tweets_limit) -> None:
        """Print the current scraping status to stdout.

        Args:
            current: Number of tweets scraped so far.
            waiting: True while waiting out a rate limit / retry pause.
            retry_cnt: Minutes already waited (out of 15) while rate limited.
            no_tweets_limit: True when scraping without a target count; only
                the raw tweet count (no progress bar) is shown in that mode.
        """
        self.current = current
        # Guard against ZeroDivisionError when total is 0 (e.g. unlimited
        # scraping): the bar is meaningless then, so treat progress as 0.
        progress = current / self.total if self.total else 0.0
        bar_length = 40
        progress_bar = (
            "["
            + "=" * int(bar_length * progress)
            + "-" * (bar_length - int(bar_length * progress))
            + "]"
        )
        if no_tweets_limit:
            if waiting:
                sys.stdout.write(
                    "\rTweets scraped : {} - waiting to access older tweets {} min on 15 min".format(
                        current, retry_cnt
                    )
                )
            else:
                sys.stdout.write(
                    "\rTweets scraped : {} ".format(
                        current
                    )
                )
        else:
            if waiting:
                sys.stdout.write(
                    "\rProgress: [{:<40}] {:.2%} {} of {} - waiting to access older tweets {} min on 15 min".format(
                        progress_bar, progress, current, self.total, retry_cnt
                    )
                )
            else:
                sys.stdout.write(
                    "\rProgress: [{:<40}] {:.2%} {} of {} ".format(
                        progress_bar, progress, current, self.total
                    )
                )
        sys.stdout.flush()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scraper/scroller.py DELETED
@@ -1,26 +0,0 @@
1
class Scroller:
    """Track and drive the vertical scroll position of a Selenium driver.

    Holds bookkeeping state (current/last offset, a scrolling flag and a
    scroll counter) used by the scraper's main loop to decide when the
    page has stopped yielding new content.
    """

    def __init__(self, driver) -> None:
        # Keep a handle to the driver and start tracking from the top.
        self.driver = driver
        self.current_position = 0
        self.last_position = driver.execute_script("return window.pageYOffset;")
        self.scrolling = True
        self.scroll_count = 0

    def reset(self) -> None:
        """Re-initialize the tracking state from the page's current offset."""
        self.current_position = 0
        self.last_position = self.driver.execute_script("return window.pageYOffset;")
        self.scroll_count = 0

    def scroll_to_top(self) -> None:
        """Scroll the browser window back to the very top of the page."""
        self.driver.execute_script("window.scrollTo(0, 0);")

    def scroll_to_bottom(self) -> None:
        """Scroll the browser window to the bottom of the document body."""
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    def update_scroll_position(self) -> None:
        """Refresh ``current_position`` from the browser's live pageYOffset."""
        self.current_position = self.driver.execute_script("return window.pageYOffset;")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scraper/tweet.py DELETED
@@ -1,277 +0,0 @@
1
- from time import sleep
2
- from selenium.common.exceptions import (
3
- NoSuchElementException,
4
- StaleElementReferenceException,
5
- )
6
- from selenium.webdriver.chrome.webdriver import WebDriver
7
- from selenium.webdriver.common.action_chains import ActionChains
8
-
9
-
10
class Tweet:
    """Extract the data of a single tweet from its Selenium card element.

    On success ``self.tweet`` holds a tuple of the scraped fields and
    ``self.error`` is False.  If a required element (user, handle,
    timestamp) is missing, or an element goes stale mid-scrape,
    ``self.error`` is set to True and ``self.tweet`` stays None — callers
    must check ``error`` before consuming the data.

    Args:
        card: The tweet's <article> card element.
        driver: The WebDriver, needed to locate the profile hover card.
        actions: Shared ActionChains used for the hover interaction.
        scrape_poster_details: When True, hover over the author name to
            additionally scrape user id, following and followers counts.
    """

    def __init__(
        self,
        card: WebDriver,
        driver: WebDriver,
        actions: ActionChains,
        scrape_poster_details=False,
    ) -> None:
        self.card = card
        self.error = False
        self.tweet = None

        # --- Required fields: missing any of these aborts the scrape ---
        try:
            self.user = card.find_element(
                "xpath", './/div[@data-testid="User-Name"]//span'
            ).text
        except NoSuchElementException:
            self.error = True
            self.user = "skip"

        try:
            self.handle = card.find_element(
                "xpath", './/span[contains(text(), "@")]'
            ).text
        except NoSuchElementException:
            self.error = True
            self.handle = "skip"

        try:
            self.date_time = card.find_element("xpath", ".//time").get_attribute(
                "datetime"
            )

            if self.date_time is not None:
                self.is_ad = False
        except NoSuchElementException:
            # Promoted tweets carry no <time> element, so treat the card
            # as an ad and flag the error.
            self.is_ad = True
            self.error = True
            self.date_time = "skip"

        if self.error:
            return

        # --- Optional fields: each falls back to a safe default ---------
        try:
            card.find_element(
                "xpath", './/*[local-name()="svg" and @data-testid="icon-verified"]'
            )

            self.verified = True
        except NoSuchElementException:
            self.verified = False

        # Tweet text is split across <span>/<a> children; concatenate them.
        self.content = ""
        contents = card.find_elements(
            "xpath",
            '(.//div[@data-testid="tweetText"])[1]/span | (.//div[@data-testid="tweetText"])[1]/a',
        )

        for content in contents:
            self.content += content.text

        try:
            self.reply_cnt = card.find_element(
                "xpath", './/button[@data-testid="reply"]//span'
            ).text

            if self.reply_cnt == "":
                self.reply_cnt = "0"
        except NoSuchElementException:
            self.reply_cnt = "0"

        try:
            self.retweet_cnt = card.find_element(
                "xpath", './/button[@data-testid="retweet"]//span'
            ).text

            if self.retweet_cnt == "":
                self.retweet_cnt = "0"
        except NoSuchElementException:
            self.retweet_cnt = "0"

        try:
            self.like_cnt = card.find_element(
                "xpath", './/button[@data-testid="like"]//span'
            ).text

            if self.like_cnt == "":
                self.like_cnt = "0"
        except NoSuchElementException:
            self.like_cnt = "0"

        try:
            self.analytics_cnt = card.find_element(
                "xpath", './/a[contains(@href, "/analytics")]//span'
            ).text

            if self.analytics_cnt == "":
                self.analytics_cnt = "0"
        except NoSuchElementException:
            self.analytics_cnt = "0"

        try:
            self.tags = card.find_elements(
                "xpath",
                './/a[contains(@href, "src=hashtag_click")]',
            )

            self.tags = [tag.text for tag in self.tags]
        except NoSuchElementException:
            self.tags = []

        try:
            self.mentions = card.find_elements(
                "xpath",
                '(.//div[@data-testid="tweetText"])[1]//a[contains(text(), "@")]',
            )

            self.mentions = [mention.text for mention in self.mentions]
        except NoSuchElementException:
            self.mentions = []

        try:
            raw_emojis = card.find_elements(
                "xpath",
                '(.//div[@data-testid="tweetText"])[1]/img[contains(@src, "emoji")]',
            )

            # Store emojis as ASCII-safe unicode escapes for CSV output.
            self.emojis = [
                emoji.get_attribute("alt").encode("unicode-escape").decode("ASCII")
                for emoji in raw_emojis
            ]
        except NoSuchElementException:
            self.emojis = []

        try:
            self.profile_img = card.find_element(
                "xpath", './/div[@data-testid="Tweet-User-Avatar"]//img'
            ).get_attribute("src")
        except NoSuchElementException:
            self.profile_img = ""

        try:
            self.tweet_link = self.card.find_element(
                "xpath",
                ".//a[contains(@href, '/status/')]",
            ).get_attribute("href")
            self.tweet_id = str(self.tweet_link.split("/")[-1])
        except NoSuchElementException:
            self.tweet_link = ""
            self.tweet_id = ""

        # --- Optional poster details via the profile hover card ---------
        self.following_cnt = "0"
        self.followers_cnt = "0"
        self.user_id = None

        if scrape_poster_details:
            el_name = card.find_element(
                "xpath", './/div[@data-testid="User-Name"]//span'
            )

            ext_hover_card = False
            ext_user_id = False
            ext_following = False
            ext_followers = False
            hover_attempt = 0

            # Retry hovering until every field is extracted, a stale
            # element aborts, or the hover attempt budget is exhausted.
            while (
                not ext_hover_card
                or not ext_user_id
                or not ext_following
                or not ext_followers
            ):
                try:
                    actions.move_to_element(el_name).perform()

                    hover_card = driver.find_element(
                        "xpath", '//div[@data-testid="hoverCardParent"]'
                    )

                    ext_hover_card = True

                    while not ext_user_id:
                        try:
                            raw_user_id = hover_card.find_element(
                                "xpath",
                                '(.//div[contains(@data-testid, "-follow")]) | (.//div[contains(@data-testid, "-unfollow")])',
                            ).get_attribute("data-testid")

                            if raw_user_id == "":
                                self.user_id = None
                            else:
                                # data-testid looks like "<id>-follow".
                                self.user_id = str(raw_user_id.split("-")[0])

                            ext_user_id = True
                        except NoSuchElementException:
                            continue
                        except StaleElementReferenceException:
                            self.error = True
                            return

                    while not ext_following:
                        try:
                            self.following_cnt = hover_card.find_element(
                                "xpath", './/a[contains(@href, "/following")]//span'
                            ).text

                            if self.following_cnt == "":
                                self.following_cnt = "0"

                            ext_following = True
                        except NoSuchElementException:
                            continue
                        except StaleElementReferenceException:
                            self.error = True
                            return

                    while not ext_followers:
                        try:
                            self.followers_cnt = hover_card.find_element(
                                "xpath",
                                './/a[contains(@href, "/verified_followers")]//span',
                            ).text

                            if self.followers_cnt == "":
                                self.followers_cnt = "0"

                            ext_followers = True
                        except NoSuchElementException:
                            continue
                        except StaleElementReferenceException:
                            self.error = True
                            return
                except NoSuchElementException:
                    if hover_attempt == 3:
                        # BUG FIX: original had a bare `self.error`
                        # expression here, so exhausting the hover retries
                        # returned without ever flagging the error.
                        self.error = True
                        return
                    hover_attempt += 1
                    sleep(0.5)
                    continue
                except StaleElementReferenceException:
                    self.error = True
                    return

            if ext_hover_card and ext_following and ext_followers:
                actions.reset_actions()

        # Final record consumed by Twitter_Scraper.save_to_csv().
        self.tweet = (
            self.user,
            self.handle,
            self.date_time,
            self.verified,
            self.content,
            self.reply_cnt,
            self.retweet_cnt,
            self.like_cnt,
            self.analytics_cnt,
            self.tags,
            self.mentions,
            self.emojis,
            self.profile_img,
            self.tweet_link,
            self.tweet_id,
            self.user_id,
            self.following_cnt,
            self.followers_cnt,
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scraper/twitter_scraper.py DELETED
@@ -1,777 +0,0 @@
1
- # import os
2
- # import sys
3
- # import pandas as pd
4
- # from scraper.progress import Progress
5
- # from scraper.scroller import Scroller
6
- # from scraper.tweet import Tweet
7
-
8
- # from datetime import datetime
9
- # from fake_headers import Headers
10
- # from time import sleep
11
-
12
- # from selenium import webdriver
13
- # from selenium.webdriver.common.keys import Keys
14
- # from selenium.common.exceptions import (
15
- # NoSuchElementException,
16
- # StaleElementReferenceException,
17
- # WebDriverException,
18
- # )
19
- # from selenium.webdriver.common.action_chains import ActionChains
20
- # from selenium.webdriver.chrome.options import Options as ChromeOptions
21
- # from selenium.webdriver.chrome.service import Service as ChromeService
22
-
23
- # from selenium.webdriver.firefox.options import Options as FirefoxOptions
24
- # from selenium.webdriver.firefox.service import Service as FirefoxService
25
-
26
- # from selenium.webdriver.support.ui import WebDriverWait
27
-
28
- # from webdriver_manager.chrome import ChromeDriverManager
29
- # from webdriver_manager.firefox import GeckoDriverManager
30
-
31
- # TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
32
-
33
-
34
- # class Twitter_Scraper:
35
- # def __init__(
36
- # self,
37
- # mail,
38
- # username,
39
- # password,
40
- # max_tweets=50,
41
- # scrape_username=None,
42
- # scrape_hashtag=None,
43
- # scrape_query=None,
44
- # scrape_poster_details=False,
45
- # scrape_latest=True,
46
- # scrape_top=False,
47
- # proxy=None,
48
- # ):
49
- # print("Initializing Twitter Scraper...")
50
- # self.mail = mail
51
- # self.username = username
52
- # self.password = password
53
- # self.interrupted = False
54
- # self.tweet_ids = set()
55
- # self.data = []
56
- # self.tweet_cards = []
57
- # self.scraper_details = {
58
- # "type": None,
59
- # "username": None,
60
- # "hashtag": None,
61
- # "query": None,
62
- # "tab": None,
63
- # "poster_details": False,
64
- # }
65
- # self.max_tweets = max_tweets
66
- # self.progress = Progress(0, max_tweets)
67
- # self.router = self.go_to_home
68
- # self.driver = self._get_driver(proxy)
69
- # self.actions = ActionChains(self.driver)
70
- # self.scroller = Scroller(self.driver)
71
- # self._config_scraper(
72
- # max_tweets,
73
- # scrape_username,
74
- # scrape_hashtag,
75
- # scrape_query,
76
- # scrape_latest,
77
- # scrape_top,
78
- # scrape_poster_details,
79
- # )
80
-
81
- # def _config_scraper(
82
- # self,
83
- # max_tweets=50,
84
- # scrape_username=None,
85
- # scrape_hashtag=None,
86
- # scrape_query=None,
87
- # scrape_latest=True,
88
- # scrape_top=False,
89
- # scrape_poster_details=False,
90
- # ):
91
- # self.tweet_ids = set()
92
- # self.data = []
93
- # self.tweet_cards = []
94
- # self.max_tweets = max_tweets
95
- # self.progress = Progress(0, max_tweets)
96
- # self.scraper_details = {
97
- # "type": None,
98
- # "username": scrape_username,
99
- # "hashtag": str(scrape_hashtag).replace("#", "")
100
- # if scrape_hashtag is not None
101
- # else None,
102
- # "query": scrape_query,
103
- # "tab": "Latest" if scrape_latest else "Top" if scrape_top else "Latest",
104
- # "poster_details": scrape_poster_details,
105
- # }
106
- # self.router = self.go_to_home
107
- # self.scroller = Scroller(self.driver)
108
-
109
- # if scrape_username is not None:
110
- # self.scraper_details["type"] = "Username"
111
- # self.router = self.go_to_profile
112
- # elif scrape_hashtag is not None:
113
- # self.scraper_details["type"] = "Hashtag"
114
- # self.router = self.go_to_hashtag
115
- # elif scrape_query is not None:
116
- # self.scraper_details["type"] = "Query"
117
- # self.router = self.go_to_search
118
- # else:
119
- # self.scraper_details["type"] = "Home"
120
- # self.router = self.go_to_home
121
- # pass
122
-
123
- # def _get_driver(
124
- # self,
125
- # proxy=None,
126
- # ):
127
- # print("Setup WebDriver...")
128
- # header = Headers().generate()["User-Agent"]
129
-
130
- # # browser_option = ChromeOptions()
131
- # browser_option = FirefoxOptions()
132
- # browser_option.add_argument("--no-sandbox")
133
- # browser_option.add_argument("--disable-dev-shm-usage")
134
- # browser_option.add_argument("--ignore-certificate-errors")
135
- # browser_option.add_argument("--disable-gpu")
136
- # browser_option.add_argument("--log-level=3")
137
- # browser_option.add_argument("--disable-notifications")
138
- # browser_option.add_argument("--disable-popup-blocking")
139
- # browser_option.add_argument("--user-agent={}".format(header))
140
- # if proxy is not None:
141
- # browser_option.add_argument("--proxy-server=%s" % proxy)
142
-
143
- # # For Hiding Browser
144
- # browser_option.add_argument("--headless")
145
-
146
- # try:
147
- # # print("Initializing ChromeDriver...")
148
- # # driver = webdriver.Chrome(
149
- # # options=browser_option,
150
- # # )
151
-
152
- # print("Initializing FirefoxDriver...")
153
- # driver = webdriver.Firefox(
154
- # options=browser_option,
155
- # )
156
-
157
- # print("WebDriver Setup Complete")
158
- # return driver
159
- # except WebDriverException:
160
- # try:
161
- # # print("Downloading ChromeDriver...")
162
- # # chromedriver_path = ChromeDriverManager().install()
163
- # # chrome_service = ChromeService(executable_path=chromedriver_path)
164
-
165
- # print("Downloading FirefoxDriver...")
166
- # firefoxdriver_path = GeckoDriverManager().install()
167
- # firefox_service = FirefoxService(executable_path=firefoxdriver_path)
168
-
169
- # # print("Initializing ChromeDriver...")
170
- # # driver = webdriver.Chrome(
171
- # # service=chrome_service,
172
- # # options=browser_option,
173
- # # )
174
-
175
- # print("Initializing FirefoxDriver...")
176
- # driver = webdriver.Firefox(
177
- # service=firefox_service,
178
- # options=browser_option,
179
- # )
180
-
181
- # print("WebDriver Setup Complete")
182
- # return driver
183
- # except Exception as e:
184
- # print(f"Error setting up WebDriver: {e}")
185
- # sys.exit(1)
186
- # pass
187
-
188
- # def login(self):
189
- # print()
190
- # print("Logging in to Twitter...")
191
-
192
- # try:
193
- # self.driver.maximize_window()
194
- # self.driver.get(TWITTER_LOGIN_URL)
195
- # sleep(3)
196
-
197
- # self._input_username()
198
- # self._input_unusual_activity()
199
- # self._input_password()
200
-
201
- # cookies = self.driver.get_cookies()
202
-
203
- # auth_token = None
204
-
205
- # for cookie in cookies:
206
- # if cookie["name"] == "auth_token":
207
- # auth_token = cookie["value"]
208
- # break
209
-
210
- # if auth_token is None:
211
- # raise ValueError(
212
- # """This may be due to the following:
213
-
214
- # - Internet connection is unstable
215
- # - Username is incorrect
216
- # - Password is incorrect
217
- # """
218
- # )
219
-
220
- # print()
221
- # print("Login Successful")
222
- # print()
223
- # except Exception as e:
224
- # print()
225
- # print(f"Login Failed: {e}")
226
- # sys.exit(1)
227
-
228
- # pass
229
-
230
- # def _input_username(self):
231
- # input_attempt = 0
232
-
233
- # while True:
234
- # try:
235
- # username = self.driver.find_element(
236
- # "xpath", "//input[@autocomplete='username']"
237
- # )
238
- # print("username", username)
239
- # username.send_keys(self.username)
240
- # username.send_keys(Keys.RETURN)
241
- # sleep(3)
242
- # break
243
- # except NoSuchElementException:
244
- # input_attempt += 1
245
- # if input_attempt >= 3:
246
- # print()
247
- # print(
248
- # """There was an error inputting the username.
249
-
250
- # It may be due to the following:
251
- # - Internet connection is unstable
252
- # - Username is incorrect
253
- # - Twitter is experiencing unusual activity"""
254
- # )
255
- # self.driver.quit()
256
- # sys.exit(1)
257
- # else:
258
- # print("Re-attempting to input username...")
259
- # sleep(2)
260
-
261
- # def _input_unusual_activity(self):
262
- # input_attempt = 0
263
-
264
- # while True:
265
- # try:
266
- # unusual_activity = self.driver.find_element(
267
- # "xpath", "//input[@data-testid='ocfEnterTextTextInput']"
268
- # )
269
- # print("unusual_activity", unusual_activity)
270
- # unusual_activity.send_keys(self.username)
271
- # unusual_activity.send_keys(Keys.RETURN)
272
- # sleep(3)
273
- # break
274
- # except NoSuchElementException:
275
- # input_attempt += 1
276
- # if input_attempt >= 3:
277
- # break
278
-
279
- # def _input_password(self):
280
- # input_attempt = 0
281
-
282
- # while True:
283
- # try:
284
- # password = self.driver.find_element(
285
- # "xpath", "//input[@autocomplete='current-password']"
286
- # )
287
- # print("password", password)
288
-
289
- # password.send_keys(self.password)
290
- # password.send_keys(Keys.RETURN)
291
- # sleep(3)
292
- # break
293
- # except NoSuchElementException:
294
- # input_attempt += 1
295
- # if input_attempt >= 3:
296
- # print()
297
- # print(
298
- # """There was an error inputting the password.
299
-
300
- # It may be due to the following:
301
- # - Internet connection is unstable
302
- # - Password is incorrect
303
- # - Twitter is experiencing unusual activity"""
304
- # )
305
- # self.driver.quit()
306
- # sys.exit(1)
307
- # else:
308
- # print("Re-attempting to input password...")
309
- # sleep(2)
310
-
311
- # def go_to_home(self):
312
- # self.driver.get("https://twitter.com/home")
313
- # sleep(3)
314
- # pass
315
-
316
- # def go_to_profile(self):
317
- # if (
318
- # self.scraper_details["username"] is None
319
- # or self.scraper_details["username"] == ""
320
- # ):
321
- # print("Username is not set.")
322
- # sys.exit(1)
323
- # else:
324
- # self.driver.get(f"https://twitter.com/{self.scraper_details['username']}")
325
- # sleep(3)
326
- # pass
327
-
328
- # def go_to_hashtag(self):
329
- # if (
330
- # self.scraper_details["hashtag"] is None
331
- # or self.scraper_details["hashtag"] == ""
332
- # ):
333
- # print("Hashtag is not set.")
334
- # sys.exit(1)
335
- # else:
336
- # url = f"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click"
337
- # if self.scraper_details["tab"] == "Latest":
338
- # url += "&f=live"
339
-
340
- # self.driver.get(url)
341
- # sleep(3)
342
- # pass
343
-
344
- # def go_to_search(self):
345
- # if self.scraper_details["query"] is None or self.scraper_details["query"] == "":
346
- # print("Query is not set.")
347
- # sys.exit(1)
348
- # else:
349
- # url = f"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query"
350
- # if self.scraper_details["tab"] == "Latest":
351
- # url += "&f=live"
352
-
353
- # self.driver.get(url)
354
- # sleep(3)
355
- # pass
356
-
357
- # def get_tweet_cards(self):
358
- # self.tweet_cards = self.driver.find_elements(
359
- # "xpath", '//article[@data-testid="tweet" and not(@disabled)]'
360
- # )
361
- # pass
362
-
363
- # def remove_hidden_cards(self):
364
- # try:
365
- # hidden_cards = self.driver.find_elements(
366
- # "xpath", '//article[@data-testid="tweet" and @disabled]'
367
- # )
368
-
369
- # for card in hidden_cards[1:-2]:
370
- # self.driver.execute_script(
371
- # "arguments[0].parentNode.parentNode.parentNode.remove();", card
372
- # )
373
- # except Exception as e:
374
- # return
375
- # pass
376
-
377
- # def scrape_tweets(
378
- # self,
379
- # max_tweets=50,
380
- # no_tweets_limit=False,
381
- # scrape_username=None,
382
- # scrape_hashtag=None,
383
- # scrape_query=None,
384
- # scrape_latest=True,
385
- # scrape_top=False,
386
- # scrape_poster_details=False,
387
- # router=None,
388
- # ):
389
- # self._config_scraper(
390
- # max_tweets,
391
- # scrape_username,
392
- # scrape_hashtag,
393
- # scrape_query,
394
- # scrape_latest,
395
- # scrape_top,
396
- # scrape_poster_details,
397
- # )
398
-
399
- # if router is None:
400
- # router = self.router
401
-
402
- # router()
403
-
404
- # if self.scraper_details["type"] == "Username":
405
- # print(
406
- # "Scraping Tweets from @{}...".format(self.scraper_details["username"])
407
- # )
408
- # elif self.scraper_details["type"] == "Hashtag":
409
- # print(
410
- # "Scraping {} Tweets from #{}...".format(
411
- # self.scraper_details["tab"], self.scraper_details["hashtag"]
412
- # )
413
- # )
414
- # elif self.scraper_details["type"] == "Query":
415
- # print(
416
- # "Scraping {} Tweets from {} search...".format(
417
- # self.scraper_details["tab"], self.scraper_details["query"]
418
- # )
419
- # )
420
- # elif self.scraper_details["type"] == "Home":
421
- # print("Scraping Tweets from Home...")
422
-
423
- # # Accept cookies to make the banner disappear
424
- # try:
425
- # accept_cookies_btn = self.driver.find_element(
426
- # "xpath", "//span[text()='Refuse non-essential cookies']/../../..")
427
- # accept_cookies_btn.click()
428
- # except NoSuchElementException:
429
- # pass
430
-
431
- # self.progress.print_progress(0, False, 0, no_tweets_limit)
432
-
433
- # refresh_count = 0
434
- # added_tweets = 0
435
- # empty_count = 0
436
- # retry_cnt = 0
437
-
438
- # while self.scroller.scrolling:
439
- # try:
440
- # self.get_tweet_cards()
441
- # added_tweets = 0
442
-
443
- # for card in self.tweet_cards[-15:]:
444
- # try:
445
- # tweet_id = str(card)
446
-
447
- # if tweet_id not in self.tweet_ids:
448
- # self.tweet_ids.add(tweet_id)
449
-
450
- # if not self.scraper_details["poster_details"]:
451
- # self.driver.execute_script(
452
- # "arguments[0].scrollIntoView();", card
453
- # )
454
-
455
- # tweet = Tweet(
456
- # card=card,
457
- # driver=self.driver,
458
- # actions=self.actions,
459
- # scrape_poster_details=self.scraper_details[
460
- # "poster_details"
461
- # ],
462
- # )
463
-
464
- # if tweet:
465
- # if not tweet.error and tweet.tweet is not None:
466
- # if not tweet.is_ad:
467
- # self.data.append(tweet.tweet)
468
- # added_tweets += 1
469
- # self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
470
-
471
- # if len(self.data) >= self.max_tweets and not no_tweets_limit:
472
- # self.scroller.scrolling = False
473
- # break
474
- # else:
475
- # continue
476
- # else:
477
- # continue
478
- # else:
479
- # continue
480
- # else:
481
- # continue
482
- # except NoSuchElementException:
483
- # continue
484
-
485
- # if len(self.data) >= self.max_tweets and not no_tweets_limit:
486
- # break
487
-
488
- # if added_tweets == 0:
489
- # # Check if there is a button "Retry" and click on it with a regular basis until a certain amount of tries
490
- # try:
491
- # while retry_cnt < 15:
492
- # retry_button = self.driver.find_element(
493
- # "xpath", "//span[text()='Retry']/../../..")
494
- # self.progress.print_progress(len(self.data), True, retry_cnt, no_tweets_limit)
495
- # sleep(58)
496
- # retry_button.click()
497
- # retry_cnt += 1
498
- # sleep(2)
499
- # # There is no Retry button so the counter is reseted
500
- # except NoSuchElementException:
501
- # retry_cnt = 0
502
- # self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
503
-
504
- # if empty_count >= 5:
505
- # if refresh_count >= 3:
506
- # print()
507
- # print("No more tweets to scrape")
508
- # break
509
- # refresh_count += 1
510
- # empty_count += 1
511
- # sleep(1)
512
- # else:
513
- # empty_count = 0
514
- # refresh_count = 0
515
- # except StaleElementReferenceException:
516
- # sleep(2)
517
- # continue
518
- # except KeyboardInterrupt:
519
- # print("\n")
520
- # print("Keyboard Interrupt")
521
- # self.interrupted = True
522
- # break
523
- # except Exception as e:
524
- # print("\n")
525
- # print(f"Error scraping tweets: {e}")
526
- # break
527
-
528
- # print("")
529
-
530
- # if len(self.data) >= self.max_tweets or no_tweets_limit:
531
- # print("Scraping Complete")
532
- # else:
533
- # print("Scraping Incomplete")
534
-
535
- # if not no_tweets_limit:
536
- # print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets))
537
-
538
- # pass
539
-
540
- # def save_to_csv(self):
541
- # print("Saving Tweets to CSV...")
542
- # now = datetime.now()
543
- # folder_path = "./tweets/"
544
-
545
- # if not os.path.exists(folder_path):
546
- # os.makedirs(folder_path)
547
- # print("Created Folder: {}".format(folder_path))
548
-
549
- # data = {
550
- # "Name": [tweet[0] for tweet in self.data],
551
- # "Handle": [tweet[1] for tweet in self.data],
552
- # "Timestamp": [tweet[2] for tweet in self.data],
553
- # "Verified": [tweet[3] for tweet in self.data],
554
- # "Content": [tweet[4] for tweet in self.data],
555
- # "Comments": [tweet[5] for tweet in self.data],
556
- # "Retweets": [tweet[6] for tweet in self.data],
557
- # "Likes": [tweet[7] for tweet in self.data],
558
- # "Analytics": [tweet[8] for tweet in self.data],
559
- # "Tags": [tweet[9] for tweet in self.data],
560
- # "Mentions": [tweet[10] for tweet in self.data],
561
- # "Emojis": [tweet[11] for tweet in self.data],
562
- # "Profile Image": [tweet[12] for tweet in self.data],
563
- # "Tweet Link": [tweet[13] for tweet in self.data],
564
- # "Tweet ID": [f"tweet_id:{tweet[14]}" for tweet in self.data],
565
- # }
566
-
567
- # if self.scraper_details["poster_details"]:
568
- # data["Tweeter ID"] = [f"user_id:{tweet[15]}" for tweet in self.data]
569
- # data["Following"] = [tweet[16] for tweet in self.data]
570
- # data["Followers"] = [tweet[17] for tweet in self.data]
571
-
572
- # df = pd.DataFrame(data)
573
-
574
- # current_time = now.strftime("%Y-%m-%d_%H-%M-%S")
575
- # file_path = f"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv"
576
- # pd.set_option("display.max_colwidth", None)
577
- # df.to_csv(file_path, index=False, encoding="utf-8")
578
-
579
- # print("CSV Saved: {}".format(file_path))
580
-
581
- # pass
582
-
583
- # def get_tweets(self):
584
- # return self.data
585
-
586
-
587
- # import os
588
- # from flask import Flask, request, jsonify
589
- # from selenium import webdriver
590
- # from selenium.webdriver.firefox.service import Service
591
- # from selenium.webdriver.firefox.options import Options
592
- # from selenium.webdriver.common.by import By
593
- # from selenium.webdriver.support.ui import WebDriverWait
594
- # from selenium.webdriver.support import expected_conditions as EC
595
- # from webdriver_manager.firefox import GeckoDriverManager
596
- # import time
597
- # import random
598
-
599
- # app = Flask(__name__)
600
-
601
- # def setup_driver():
602
- # """Set up Chrome WebDriver with appropriate options for headless browsing."""
603
- # chrome_options = Options()
604
- # # chrome_options.add_argument("--headless")
605
- # chrome_options.add_argument("--incognito")
606
- # chrome_options.add_argument("--disable-blink-features=AutomationControlled")
607
- # # chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
608
- # # chrome_options.add_experimental_option('useAutomationExtension', False)
609
- # chrome_options.add_argument("--start-maximized")
610
- # chrome_options.add_argument("--no-sandbox")
611
- # chrome_options.add_argument("--disable-dev-shm-usage")
612
- # chrome_options.add_argument("--disable-extensions")
613
- # chrome_options.add_argument("--disable-gpu")
614
- # chrome_options.binary_location = r'C:\Users\HP\.cache\selenium\firefox\win64\133.0\firefox.exe'
615
- # # chrome_options.binary_location = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
616
-
617
-
618
- # # service = Service(GeckoDriverManager().install())
619
- # service = Service(executable_path=r'C:\Users\HP\.cache\selenium\geckodriver\win64\0.35.0\geckodriver.exe')
620
-
621
- # driver = webdriver.Firefox(service=service, options=chrome_options)
622
- # return driver
623
-
624
- # def reddit_login_and_scrape(username, password, subreddit):
625
- # """
626
- # Log into Reddit and scrape posts from a specified subreddit.
627
-
628
- # Args:
629
- # username (str): Reddit username
630
- # password (str): Reddit password
631
- # subreddit (str): Name of the subreddit to scrape
632
-
633
- # Returns:
634
- # list: List of dictionaries containing scraped post information
635
- # """
636
- # driver = setup_driver()
637
- # posts = []
638
-
639
- # try:
640
- # # Navigate to Reddit login page
641
- # driver.get("https://www.reddit.com/login/")
642
-
643
- # # Wait for login form to load
644
- # WebDriverWait(driver, 10).until(
645
- # EC.presence_of_element_located((By.ID, "login-username"))
646
- # )
647
-
648
- # # Find and fill in login credentials
649
- # username_field = driver.find_element(By.ID, "login-username")
650
- # password_field = driver.find_element(By.ID, "login-password")
651
-
652
- # username_field.send_keys(username)
653
- # password_field.send_keys(password)
654
-
655
- # # Submit login form
656
- # # login_button = driver.find_element(By.XPATH, "//button[@type='button']")
657
- # # login_button.click()
658
- # # Find login button using complex selector
659
- # # login_button=driver.find_element(By.CSS_SELECTOR, 'faceplate-tracker[action="click]')
660
- # login_button = WebDriverWait(driver, 4).until(
661
- # EC.element_to_be_clickable((By.XPATH, "//*[@id='login']/auth-flow-modal/div[2]/faceplate-tracker/button"))
662
- # )
663
- # driver.execute_script("arguments[0].scrollIntoView(true);", login_button)
664
- # time.sleep(random.uniform(1, 2))
665
- # login_button.click()
666
-
667
- # # Wait for login to complete
668
- # # WebDriverWait(driver, 10).until(
669
- # # EC.presence_of_element_located((By.XPATH, "//a[@href='/submit']"))
670
- # # )
671
-
672
- # # # Add random delay to mimic human behavior
673
- # time.sleep(random.uniform(2, 4))
674
-
675
- # # # # Navigate to subreddit
676
- # # driver.get(f"https://www.reddit.com/r/{subreddit}/")
677
-
678
- # # # # Wait for posts to load
679
- # # WebDriverWait(driver, 10).until(
680
- # # EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='post-container']"))
681
- # # )
682
-
683
- # # # Find post elements
684
- # # post_elements = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='post-container']")
685
-
686
- # # # Iterate through posts
687
- # # for post in post_elements[:10]: # Limit to 10 posts
688
- # # try:
689
- # # # Extract post details
690
- # # title = post.find_element(By.CSS_SELECTOR, "h3").text
691
-
692
- # # # Try to get upvotes
693
- # # try:
694
- # # upvotes = post.find_element(By.CSS_SELECTOR, "div[id^='vote-arrows-']").text
695
- # # except:
696
- # # upvotes = "N/A"
697
-
698
- # # # Try to get link
699
- # # try:
700
- # # link = post.find_element(By.CSS_SELECTOR, "a[data-click-id='body']").get_attribute('href')
701
- # # except:
702
- # # link = "No link available"
703
-
704
- # # posts.append({
705
- # # "title": title,
706
- # # "upvotes": upvotes,
707
- # # "link": link
708
- # # })
709
-
710
- # # except Exception as post_error:
711
- # # print(f"Error processing individual post: {post_error}")
712
-
713
- # except Exception as e:
714
- # print(f"Login or scraping error: {e}")
715
- # return [{"error": str(e)}]
716
-
717
- # finally:
718
- # driver.quit()
719
-
720
- # return posts
721
-
722
- # @app.route('/scrape', methods=['POST'])
723
- # def scrape_reddit():
724
- # """
725
- # Flask endpoint for scraping Reddit posts
726
-
727
- # Expected JSON payload:
728
- # {
729
- # "username": "your_reddit_username",
730
- # "password": "your_reddit_password",
731
- # "subreddit": "technology"
732
- # }
733
- # """
734
- # # Get data from request
735
- # data = request.json
736
-
737
- # # Validate input
738
- # if not all(key in data for key in ['subreddit']):
739
- # return jsonify({
740
- # "error": "Missing required parameters. subreddit"
741
- # }), 400
742
-
743
- # try:
744
- # # Perform scraping
745
- # results = reddit_login_and_scrape(
746
- # 'Final-Difference7055',
747
- # '#CW2968honey',
748
- # data['subreddit']
749
- # )
750
-
751
- # # Check for errors
752
- # if results and 'error' in results[0]:
753
- # return jsonify({
754
- # "error": results[0]['error']
755
- # }), 500
756
-
757
- # return jsonify({
758
- # "posts": results
759
- # }), 200
760
-
761
- # except Exception as e:
762
- # return jsonify({
763
- # "error": str(e)
764
- # }), 500
765
-
766
- # @app.route('/', methods=['GET'])
767
- # def health_check():
768
- # """Simple health check endpoint"""
769
- # return jsonify({
770
- # "status": "healthy",
771
- # "message": "Reddit Scraper API is running"
772
- # }), 200
773
-
774
- # if __name__ == '__main__':
775
- # # Use environment variable for port, default to 5000
776
- # port = int(os.environ.get('PORT', 5000))
777
- # app.run(host='127.0.0.34', port=port,debug=True)