Spaces:
Runtime error
Runtime error
df
Browse files- img/advanced-search-01.png +0 -0
- scraper/__init__.py +0 -0
- scraper/progress.py +0 -46
- scraper/scroller.py +0 -26
- scraper/tweet.py +0 -277
- scraper/twitter_scraper.py +0 -777
img/advanced-search-01.png
DELETED
|
Binary file (104 kB)
|
|
|
scraper/__init__.py
DELETED
|
File without changes
|
scraper/progress.py
DELETED
|
@@ -1,46 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
class Progress:
    """Single-line console progress reporter.

    Each call to :meth:`print_progress` rewrites the current terminal line
    (the message starts with a carriage return) on ``sys.stdout``.

    Attributes are plain values supplied by the caller:
        current: count most recently passed to ``print_progress``.
        total: target count used to compute the completion ratio.
    """

    def __init__(self, current, total) -> None:
        self.current = current
        self.total = total

    def print_progress(self, current, waiting, retry_cnt, no_tweets_limit) -> None:
        """Write one progress line to stdout and flush it.

        Args:
            current: number of items scraped so far; stored on ``self.current``.
            waiting: when truthy, a "waiting to access older tweets" suffix
                with ``retry_cnt`` is appended to the message.
            retry_cnt: value shown in the waiting suffix.
            no_tweets_limit: when truthy only the running count is printed;
                otherwise a 40-column bar, percentage and "x of total" are
                printed.
        """
        self.current = current
        bar_length = 40

        # A zero (or unset) total used to raise ZeroDivisionError; report 0%.
        progress = current / self.total if self.total else 0.0

        # Clamp only the *bar fill* so an overshoot (current > total) cannot
        # produce more than 40 "=" or a negative "-" count. The printed
        # percentage still reflects the raw ratio.
        filled = int(bar_length * min(max(progress, 0.0), 1.0))

        # The surrounding brackets come from the format strings below, so the
        # bar itself must be exactly ``bar_length`` characters with none of its
        # own (the old code added a second pair, printing "[[...]]").
        progress_bar = "=" * filled + "-" * (bar_length - filled)

        if no_tweets_limit:
            if waiting:
                message = "\rTweets scraped : {} - waiting to access older tweets {} min on 15 min".format(
                    current, retry_cnt
                )
            else:
                message = "\rTweets scraped : {} ".format(current)
        elif waiting:
            message = "\rProgress: [{:<40}] {:.2%} {} of {} - waiting to access older tweets {} min on 15 min".format(
                progress_bar, progress, current, self.total, retry_cnt
            )
        else:
            message = "\rProgress: [{:<40}] {:.2%} {} of {} ".format(
                progress_bar, progress, current, self.total
            )

        sys.stdout.write(message)
        sys.stdout.flush()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scraper/scroller.py
DELETED
|
@@ -1,26 +0,0 @@
|
|
| 1 |
-
class Scroller:
    """Scroll-state holder for a browser page driven through ``driver``.

    ``driver`` only needs an ``execute_script(script)`` method. The class
    records the page's vertical offset and offers helpers that jump to the
    top or bottom of the document.

    State kept on the instance:
        current_position: offset stored by ``update_scroll_position`` (0 until
            then, and again after ``reset``).
        last_position: offset read when the instance was created or reset.
        scrolling: flag initialised to True; this class never changes it.
        scroll_count: counter initialised to 0; this class never increments it.
    """

    def __init__(self, driver) -> None:
        self.driver = driver
        self.scrolling = True
        self.scroll_count = 0
        self.current_position = 0
        self.last_position = self._page_offset()

    def _page_offset(self):
        """Return the page's current vertical scroll offset as reported by the browser."""
        return self.driver.execute_script("return window.pageYOffset;")

    def reset(self) -> None:
        """Zero the position and counter and re-read ``last_position`` from the page."""
        self.current_position = 0
        self.last_position = self._page_offset()
        self.scroll_count = 0

    def scroll_to_top(self) -> None:
        """Jump to the very top of the page."""
        self.driver.execute_script("window.scrollTo(0, 0);")

    def scroll_to_bottom(self) -> None:
        """Jump to the current bottom of the document body."""
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    def update_scroll_position(self) -> None:
        """Store the page's current vertical offset in ``current_position``."""
        self.current_position = self._page_offset()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scraper/tweet.py
DELETED
|
@@ -1,277 +0,0 @@
|
|
| 1 |
-
from time import sleep
|
| 2 |
-
from selenium.common.exceptions import (
|
| 3 |
-
NoSuchElementException,
|
| 4 |
-
StaleElementReferenceException,
|
| 5 |
-
)
|
| 6 |
-
from selenium.webdriver.chrome.webdriver import WebDriver
|
| 7 |
-
from selenium.webdriver.common.action_chains import ActionChains
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
class Tweet:
    """Data scraped eagerly from one tweet card element.

    After construction:
      * ``error`` is True when a required element (author name, handle or
        ``<time>``) was missing, or when the author's hover card could not be
        read; ``tweet`` is then None.
      * otherwise ``tweet`` is an 18-item tuple: user, handle, date_time,
        verified, content, reply_cnt, retweet_cnt, like_cnt, analytics_cnt,
        tags, mentions, emojis, profile_img, tweet_link, tweet_id, user_id,
        following_cnt, followers_cnt.
      * ``is_ad`` is always set; it is True only when the card has no
        ``<time>`` element.
    """

    # Hover-card lookup budget: one try plus three retries, 0.5 s apart.
    _HOVER_CARD_RETRIES = 3
    _HOVER_CARD_RETRY_DELAY = 0.5
    # Each field inside the hover card is polled at most this many times.
    # Unbounded polling would hang forever on a hover card that never renders
    # the element.
    _HOVER_FIELD_ATTEMPTS = 20
    _HOVER_FIELD_DELAY = 0.25

    def __init__(
        self,
        card: WebDriver,
        driver: WebDriver,
        actions: ActionChains,
        scrape_poster_details=False,
    ) -> None:
        """Scrape ``card`` and populate the instance.

        Args:
            card: element for one tweet; only ``find_element`` and
                ``find_elements`` are called on it.
            driver: browser driver, used to locate the page-level hover card.
            actions: action chain used to hover over the author's name.
            scrape_poster_details: when true, hover the author's name and read
                user id, following count and followers count from the hover
                card.
        """
        self.card = card
        self.error = False
        self.tweet = None
        # Default up front so ``is_ad`` exists even when <time> is present but
        # carries no ``datetime`` attribute (it used to be left unset then).
        self.is_ad = False

        try:
            self.user = card.find_element(
                "xpath", './/div[@data-testid="User-Name"]//span'
            ).text
        except NoSuchElementException:
            self.error = True
            self.user = "skip"

        try:
            self.handle = card.find_element(
                "xpath", './/span[contains(text(), "@")]'
            ).text
        except NoSuchElementException:
            self.error = True
            self.handle = "skip"

        try:
            self.date_time = card.find_element("xpath", ".//time").get_attribute(
                "datetime"
            )
        except NoSuchElementException:
            # A card without a <time> element is treated as an ad.
            self.is_ad = True
            self.error = True
            self.date_time = "skip"

        if self.error:
            return

        try:
            card.find_element(
                "xpath", './/*[local-name()="svg" and @data-testid="icon-verified"]'
            )
            self.verified = True
        except NoSuchElementException:
            self.verified = False

        # Concatenate the direct <span>/<a> children of the first tweet text.
        self.content = "".join(
            part.text
            for part in card.find_elements(
                "xpath",
                '(.//div[@data-testid="tweetText"])[1]/span | (.//div[@data-testid="tweetText"])[1]/a',
            )
        )

        self.reply_cnt = self._count(card, './/button[@data-testid="reply"]//span')
        self.retweet_cnt = self._count(
            card, './/button[@data-testid="retweet"]//span'
        )
        self.like_cnt = self._count(card, './/button[@data-testid="like"]//span')
        self.analytics_cnt = self._count(
            card, './/a[contains(@href, "/analytics")]//span'
        )

        # find_elements returns an empty list when nothing matches, so no
        # NoSuchElementException handling is needed for the list lookups.
        self.tags = [
            tag.text
            for tag in card.find_elements(
                "xpath",
                './/a[contains(@href, "src=hashtag_click")]',
            )
        ]

        self.mentions = [
            mention.text
            for mention in card.find_elements(
                "xpath",
                '(.//div[@data-testid="tweetText"])[1]//a[contains(text(), "@")]',
            )
        ]

        raw_emojis = card.find_elements(
            "xpath",
            '(.//div[@data-testid="tweetText"])[1]/img[contains(@src, "emoji")]',
        )
        # Emoji are stored as their unicode-escape text; images without an
        # ``alt`` attribute are skipped instead of crashing on None.
        self.emojis = [
            alt.encode("unicode-escape").decode("ASCII")
            for alt in (emoji.get_attribute("alt") for emoji in raw_emojis)
            if alt is not None
        ]

        try:
            self.profile_img = card.find_element(
                "xpath", './/div[@data-testid="Tweet-User-Avatar"]//img'
            ).get_attribute("src")
        except NoSuchElementException:
            self.profile_img = ""

        try:
            self.tweet_link = self.card.find_element(
                "xpath",
                ".//a[contains(@href, '/status/')]",
            ).get_attribute("href")
            # The id is the last path segment of the status link.
            self.tweet_id = str(self.tweet_link.split("/")[-1])
        except NoSuchElementException:
            self.tweet_link = ""
            self.tweet_id = ""

        self.following_cnt = "0"
        self.followers_cnt = "0"
        self.user_id = None

        if scrape_poster_details and not self._scrape_poster_details(
            card, driver, actions
        ):
            # ``self.error`` has been set; leave ``self.tweet`` as None.
            return

        self.tweet = (
            self.user,
            self.handle,
            self.date_time,
            self.verified,
            self.content,
            self.reply_cnt,
            self.retweet_cnt,
            self.like_cnt,
            self.analytics_cnt,
            self.tags,
            self.mentions,
            self.emojis,
            self.profile_img,
            self.tweet_link,
            self.tweet_id,
            self.user_id,
            self.following_cnt,
            self.followers_cnt,
        )

    @staticmethod
    def _count(card, xpath):
        """Return the text of the first ``xpath`` match, or "0" if missing or empty."""
        try:
            text = card.find_element("xpath", xpath).text
        except NoSuchElementException:
            return "0"
        return text if text != "" else "0"

    def _find_in_hover_card(self, hover_card, xpath):
        """Poll ``hover_card`` for ``xpath``; return the element, or None on timeout.

        StaleElementReferenceException is deliberately not caught here so the
        caller can discard the tweet.
        """
        for attempt in range(self._HOVER_FIELD_ATTEMPTS):
            try:
                return hover_card.find_element("xpath", xpath)
            except NoSuchElementException:
                if attempt + 1 < self._HOVER_FIELD_ATTEMPTS:
                    sleep(self._HOVER_FIELD_DELAY)
        return None

    def _scrape_poster_details(self, card, driver, actions) -> bool:
        """Hover the author's name and read id / following / followers.

        Returns True when the hover card was read (fields that never appeared
        keep their defaults). Returns False, with ``self.error`` set, when the
        hover card never showed up or an element went stale.
        """
        el_name = card.find_element(
            "xpath", './/div[@data-testid="User-Name"]//span'
        )

        hover_card = None
        for attempt in range(self._HOVER_CARD_RETRIES + 1):
            try:
                actions.move_to_element(el_name).perform()
                hover_card = driver.find_element(
                    "xpath", '//div[@data-testid="hoverCardParent"]'
                )
                break
            except NoSuchElementException:
                if attempt == self._HOVER_CARD_RETRIES:
                    # Was a bare ``self.error`` expression, which did nothing.
                    self.error = True
                    return False
                sleep(self._HOVER_CARD_RETRY_DELAY)
            except StaleElementReferenceException:
                self.error = True
                return False

        try:
            follow_button = self._find_in_hover_card(
                hover_card,
                '(.//div[contains(@data-testid, "-follow")]) | (.//div[contains(@data-testid, "-unfollow")])',
            )
            if follow_button is not None:
                # The data-testid has the form "<user id>-follow" / "-unfollow".
                raw_user_id = follow_button.get_attribute("data-testid")
                self.user_id = (
                    str(raw_user_id.split("-")[0]) if raw_user_id else None
                )

            following = self._find_in_hover_card(
                hover_card, './/a[contains(@href, "/following")]//span'
            )
            if following is not None:
                self.following_cnt = following.text or "0"

            followers = self._find_in_hover_card(
                hover_card,
                './/a[contains(@href, "/verified_followers")]//span',
            )
            if followers is not None:
                self.followers_cnt = followers.text or "0"
        except StaleElementReferenceException:
            self.error = True
            return False

        actions.reset_actions()
        return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scraper/twitter_scraper.py
DELETED
|
@@ -1,777 +0,0 @@
|
|
| 1 |
-
# import os
|
| 2 |
-
# import sys
|
| 3 |
-
# import pandas as pd
|
| 4 |
-
# from scraper.progress import Progress
|
| 5 |
-
# from scraper.scroller import Scroller
|
| 6 |
-
# from scraper.tweet import Tweet
|
| 7 |
-
|
| 8 |
-
# from datetime import datetime
|
| 9 |
-
# from fake_headers import Headers
|
| 10 |
-
# from time import sleep
|
| 11 |
-
|
| 12 |
-
# from selenium import webdriver
|
| 13 |
-
# from selenium.webdriver.common.keys import Keys
|
| 14 |
-
# from selenium.common.exceptions import (
|
| 15 |
-
# NoSuchElementException,
|
| 16 |
-
# StaleElementReferenceException,
|
| 17 |
-
# WebDriverException,
|
| 18 |
-
# )
|
| 19 |
-
# from selenium.webdriver.common.action_chains import ActionChains
|
| 20 |
-
# from selenium.webdriver.chrome.options import Options as ChromeOptions
|
| 21 |
-
# from selenium.webdriver.chrome.service import Service as ChromeService
|
| 22 |
-
|
| 23 |
-
# from selenium.webdriver.firefox.options import Options as FirefoxOptions
|
| 24 |
-
# from selenium.webdriver.firefox.service import Service as FirefoxService
|
| 25 |
-
|
| 26 |
-
# from selenium.webdriver.support.ui import WebDriverWait
|
| 27 |
-
|
| 28 |
-
# from webdriver_manager.chrome import ChromeDriverManager
|
| 29 |
-
# from webdriver_manager.firefox import GeckoDriverManager
|
| 30 |
-
|
| 31 |
-
# TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
# class Twitter_Scraper:
|
| 35 |
-
# def __init__(
|
| 36 |
-
# self,
|
| 37 |
-
# mail,
|
| 38 |
-
# username,
|
| 39 |
-
# password,
|
| 40 |
-
# max_tweets=50,
|
| 41 |
-
# scrape_username=None,
|
| 42 |
-
# scrape_hashtag=None,
|
| 43 |
-
# scrape_query=None,
|
| 44 |
-
# scrape_poster_details=False,
|
| 45 |
-
# scrape_latest=True,
|
| 46 |
-
# scrape_top=False,
|
| 47 |
-
# proxy=None,
|
| 48 |
-
# ):
|
| 49 |
-
# print("Initializing Twitter Scraper...")
|
| 50 |
-
# self.mail = mail
|
| 51 |
-
# self.username = username
|
| 52 |
-
# self.password = password
|
| 53 |
-
# self.interrupted = False
|
| 54 |
-
# self.tweet_ids = set()
|
| 55 |
-
# self.data = []
|
| 56 |
-
# self.tweet_cards = []
|
| 57 |
-
# self.scraper_details = {
|
| 58 |
-
# "type": None,
|
| 59 |
-
# "username": None,
|
| 60 |
-
# "hashtag": None,
|
| 61 |
-
# "query": None,
|
| 62 |
-
# "tab": None,
|
| 63 |
-
# "poster_details": False,
|
| 64 |
-
# }
|
| 65 |
-
# self.max_tweets = max_tweets
|
| 66 |
-
# self.progress = Progress(0, max_tweets)
|
| 67 |
-
# self.router = self.go_to_home
|
| 68 |
-
# self.driver = self._get_driver(proxy)
|
| 69 |
-
# self.actions = ActionChains(self.driver)
|
| 70 |
-
# self.scroller = Scroller(self.driver)
|
| 71 |
-
# self._config_scraper(
|
| 72 |
-
# max_tweets,
|
| 73 |
-
# scrape_username,
|
| 74 |
-
# scrape_hashtag,
|
| 75 |
-
# scrape_query,
|
| 76 |
-
# scrape_latest,
|
| 77 |
-
# scrape_top,
|
| 78 |
-
# scrape_poster_details,
|
| 79 |
-
# )
|
| 80 |
-
|
| 81 |
-
# def _config_scraper(
|
| 82 |
-
# self,
|
| 83 |
-
# max_tweets=50,
|
| 84 |
-
# scrape_username=None,
|
| 85 |
-
# scrape_hashtag=None,
|
| 86 |
-
# scrape_query=None,
|
| 87 |
-
# scrape_latest=True,
|
| 88 |
-
# scrape_top=False,
|
| 89 |
-
# scrape_poster_details=False,
|
| 90 |
-
# ):
|
| 91 |
-
# self.tweet_ids = set()
|
| 92 |
-
# self.data = []
|
| 93 |
-
# self.tweet_cards = []
|
| 94 |
-
# self.max_tweets = max_tweets
|
| 95 |
-
# self.progress = Progress(0, max_tweets)
|
| 96 |
-
# self.scraper_details = {
|
| 97 |
-
# "type": None,
|
| 98 |
-
# "username": scrape_username,
|
| 99 |
-
# "hashtag": str(scrape_hashtag).replace("#", "")
|
| 100 |
-
# if scrape_hashtag is not None
|
| 101 |
-
# else None,
|
| 102 |
-
# "query": scrape_query,
|
| 103 |
-
# "tab": "Latest" if scrape_latest else "Top" if scrape_top else "Latest",
|
| 104 |
-
# "poster_details": scrape_poster_details,
|
| 105 |
-
# }
|
| 106 |
-
# self.router = self.go_to_home
|
| 107 |
-
# self.scroller = Scroller(self.driver)
|
| 108 |
-
|
| 109 |
-
# if scrape_username is not None:
|
| 110 |
-
# self.scraper_details["type"] = "Username"
|
| 111 |
-
# self.router = self.go_to_profile
|
| 112 |
-
# elif scrape_hashtag is not None:
|
| 113 |
-
# self.scraper_details["type"] = "Hashtag"
|
| 114 |
-
# self.router = self.go_to_hashtag
|
| 115 |
-
# elif scrape_query is not None:
|
| 116 |
-
# self.scraper_details["type"] = "Query"
|
| 117 |
-
# self.router = self.go_to_search
|
| 118 |
-
# else:
|
| 119 |
-
# self.scraper_details["type"] = "Home"
|
| 120 |
-
# self.router = self.go_to_home
|
| 121 |
-
# pass
|
| 122 |
-
|
| 123 |
-
# def _get_driver(
|
| 124 |
-
# self,
|
| 125 |
-
# proxy=None,
|
| 126 |
-
# ):
|
| 127 |
-
# print("Setup WebDriver...")
|
| 128 |
-
# header = Headers().generate()["User-Agent"]
|
| 129 |
-
|
| 130 |
-
# # browser_option = ChromeOptions()
|
| 131 |
-
# browser_option = FirefoxOptions()
|
| 132 |
-
# browser_option.add_argument("--no-sandbox")
|
| 133 |
-
# browser_option.add_argument("--disable-dev-shm-usage")
|
| 134 |
-
# browser_option.add_argument("--ignore-certificate-errors")
|
| 135 |
-
# browser_option.add_argument("--disable-gpu")
|
| 136 |
-
# browser_option.add_argument("--log-level=3")
|
| 137 |
-
# browser_option.add_argument("--disable-notifications")
|
| 138 |
-
# browser_option.add_argument("--disable-popup-blocking")
|
| 139 |
-
# browser_option.add_argument("--user-agent={}".format(header))
|
| 140 |
-
# if proxy is not None:
|
| 141 |
-
# browser_option.add_argument("--proxy-server=%s" % proxy)
|
| 142 |
-
|
| 143 |
-
# # For Hiding Browser
|
| 144 |
-
# browser_option.add_argument("--headless")
|
| 145 |
-
|
| 146 |
-
# try:
|
| 147 |
-
# # print("Initializing ChromeDriver...")
|
| 148 |
-
# # driver = webdriver.Chrome(
|
| 149 |
-
# # options=browser_option,
|
| 150 |
-
# # )
|
| 151 |
-
|
| 152 |
-
# print("Initializing FirefoxDriver...")
|
| 153 |
-
# driver = webdriver.Firefox(
|
| 154 |
-
# options=browser_option,
|
| 155 |
-
# )
|
| 156 |
-
|
| 157 |
-
# print("WebDriver Setup Complete")
|
| 158 |
-
# return driver
|
| 159 |
-
# except WebDriverException:
|
| 160 |
-
# try:
|
| 161 |
-
# # print("Downloading ChromeDriver...")
|
| 162 |
-
# # chromedriver_path = ChromeDriverManager().install()
|
| 163 |
-
# # chrome_service = ChromeService(executable_path=chromedriver_path)
|
| 164 |
-
|
| 165 |
-
# print("Downloading FirefoxDriver...")
|
| 166 |
-
# firefoxdriver_path = GeckoDriverManager().install()
|
| 167 |
-
# firefox_service = FirefoxService(executable_path=firefoxdriver_path)
|
| 168 |
-
|
| 169 |
-
# # print("Initializing ChromeDriver...")
|
| 170 |
-
# # driver = webdriver.Chrome(
|
| 171 |
-
# # service=chrome_service,
|
| 172 |
-
# # options=browser_option,
|
| 173 |
-
# # )
|
| 174 |
-
|
| 175 |
-
# print("Initializing FirefoxDriver...")
|
| 176 |
-
# driver = webdriver.Firefox(
|
| 177 |
-
# service=firefox_service,
|
| 178 |
-
# options=browser_option,
|
| 179 |
-
# )
|
| 180 |
-
|
| 181 |
-
# print("WebDriver Setup Complete")
|
| 182 |
-
# return driver
|
| 183 |
-
# except Exception as e:
|
| 184 |
-
# print(f"Error setting up WebDriver: {e}")
|
| 185 |
-
# sys.exit(1)
|
| 186 |
-
# pass
|
| 187 |
-
|
| 188 |
-
# def login(self):
|
| 189 |
-
# print()
|
| 190 |
-
# print("Logging in to Twitter...")
|
| 191 |
-
|
| 192 |
-
# try:
|
| 193 |
-
# self.driver.maximize_window()
|
| 194 |
-
# self.driver.get(TWITTER_LOGIN_URL)
|
| 195 |
-
# sleep(3)
|
| 196 |
-
|
| 197 |
-
# self._input_username()
|
| 198 |
-
# self._input_unusual_activity()
|
| 199 |
-
# self._input_password()
|
| 200 |
-
|
| 201 |
-
# cookies = self.driver.get_cookies()
|
| 202 |
-
|
| 203 |
-
# auth_token = None
|
| 204 |
-
|
| 205 |
-
# for cookie in cookies:
|
| 206 |
-
# if cookie["name"] == "auth_token":
|
| 207 |
-
# auth_token = cookie["value"]
|
| 208 |
-
# break
|
| 209 |
-
|
| 210 |
-
# if auth_token is None:
|
| 211 |
-
# raise ValueError(
|
| 212 |
-
# """This may be due to the following:
|
| 213 |
-
|
| 214 |
-
# - Internet connection is unstable
|
| 215 |
-
# - Username is incorrect
|
| 216 |
-
# - Password is incorrect
|
| 217 |
-
# """
|
| 218 |
-
# )
|
| 219 |
-
|
| 220 |
-
# print()
|
| 221 |
-
# print("Login Successful")
|
| 222 |
-
# print()
|
| 223 |
-
# except Exception as e:
|
| 224 |
-
# print()
|
| 225 |
-
# print(f"Login Failed: {e}")
|
| 226 |
-
# sys.exit(1)
|
| 227 |
-
|
| 228 |
-
# pass
|
| 229 |
-
|
| 230 |
-
# def _input_username(self):
|
| 231 |
-
# input_attempt = 0
|
| 232 |
-
|
| 233 |
-
# while True:
|
| 234 |
-
# try:
|
| 235 |
-
# username = self.driver.find_element(
|
| 236 |
-
# "xpath", "//input[@autocomplete='username']"
|
| 237 |
-
# )
|
| 238 |
-
# print("username", username)
|
| 239 |
-
# username.send_keys(self.username)
|
| 240 |
-
# username.send_keys(Keys.RETURN)
|
| 241 |
-
# sleep(3)
|
| 242 |
-
# break
|
| 243 |
-
# except NoSuchElementException:
|
| 244 |
-
# input_attempt += 1
|
| 245 |
-
# if input_attempt >= 3:
|
| 246 |
-
# print()
|
| 247 |
-
# print(
|
| 248 |
-
# """There was an error inputting the username.
|
| 249 |
-
|
| 250 |
-
# It may be due to the following:
|
| 251 |
-
# - Internet connection is unstable
|
| 252 |
-
# - Username is incorrect
|
| 253 |
-
# - Twitter is experiencing unusual activity"""
|
| 254 |
-
# )
|
| 255 |
-
# self.driver.quit()
|
| 256 |
-
# sys.exit(1)
|
| 257 |
-
# else:
|
| 258 |
-
# print("Re-attempting to input username...")
|
| 259 |
-
# sleep(2)
|
| 260 |
-
|
| 261 |
-
# def _input_unusual_activity(self):
|
| 262 |
-
# input_attempt = 0
|
| 263 |
-
|
| 264 |
-
# while True:
|
| 265 |
-
# try:
|
| 266 |
-
# unusual_activity = self.driver.find_element(
|
| 267 |
-
# "xpath", "//input[@data-testid='ocfEnterTextTextInput']"
|
| 268 |
-
# )
|
| 269 |
-
# print("unusual_activity", unusual_activity)
|
| 270 |
-
# unusual_activity.send_keys(self.username)
|
| 271 |
-
# unusual_activity.send_keys(Keys.RETURN)
|
| 272 |
-
# sleep(3)
|
| 273 |
-
# break
|
| 274 |
-
# except NoSuchElementException:
|
| 275 |
-
# input_attempt += 1
|
| 276 |
-
# if input_attempt >= 3:
|
| 277 |
-
# break
|
| 278 |
-
|
| 279 |
-
# def _input_password(self):
|
| 280 |
-
# input_attempt = 0
|
| 281 |
-
|
| 282 |
-
# while True:
|
| 283 |
-
# try:
|
| 284 |
-
# password = self.driver.find_element(
|
| 285 |
-
# "xpath", "//input[@autocomplete='current-password']"
|
| 286 |
-
# )
|
| 287 |
-
# print("password", password)
|
| 288 |
-
|
| 289 |
-
# password.send_keys(self.password)
|
| 290 |
-
# password.send_keys(Keys.RETURN)
|
| 291 |
-
# sleep(3)
|
| 292 |
-
# break
|
| 293 |
-
# except NoSuchElementException:
|
| 294 |
-
# input_attempt += 1
|
| 295 |
-
# if input_attempt >= 3:
|
| 296 |
-
# print()
|
| 297 |
-
# print(
|
| 298 |
-
# """There was an error inputting the password.
|
| 299 |
-
|
| 300 |
-
# It may be due to the following:
|
| 301 |
-
# - Internet connection is unstable
|
| 302 |
-
# - Password is incorrect
|
| 303 |
-
# - Twitter is experiencing unusual activity"""
|
| 304 |
-
# )
|
| 305 |
-
# self.driver.quit()
|
| 306 |
-
# sys.exit(1)
|
| 307 |
-
# else:
|
| 308 |
-
# print("Re-attempting to input password...")
|
| 309 |
-
# sleep(2)
|
| 310 |
-
|
| 311 |
-
# def go_to_home(self):
|
| 312 |
-
# self.driver.get("https://twitter.com/home")
|
| 313 |
-
# sleep(3)
|
| 314 |
-
# pass
|
| 315 |
-
|
| 316 |
-
# def go_to_profile(self):
|
| 317 |
-
# if (
|
| 318 |
-
# self.scraper_details["username"] is None
|
| 319 |
-
# or self.scraper_details["username"] == ""
|
| 320 |
-
# ):
|
| 321 |
-
# print("Username is not set.")
|
| 322 |
-
# sys.exit(1)
|
| 323 |
-
# else:
|
| 324 |
-
# self.driver.get(f"https://twitter.com/{self.scraper_details['username']}")
|
| 325 |
-
# sleep(3)
|
| 326 |
-
# pass
|
| 327 |
-
|
| 328 |
-
# def go_to_hashtag(self):
|
| 329 |
-
# if (
|
| 330 |
-
# self.scraper_details["hashtag"] is None
|
| 331 |
-
# or self.scraper_details["hashtag"] == ""
|
| 332 |
-
# ):
|
| 333 |
-
# print("Hashtag is not set.")
|
| 334 |
-
# sys.exit(1)
|
| 335 |
-
# else:
|
| 336 |
-
# url = f"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click"
|
| 337 |
-
# if self.scraper_details["tab"] == "Latest":
|
| 338 |
-
# url += "&f=live"
|
| 339 |
-
|
| 340 |
-
# self.driver.get(url)
|
| 341 |
-
# sleep(3)
|
| 342 |
-
# pass
|
| 343 |
-
|
| 344 |
-
# def go_to_search(self):
|
| 345 |
-
# if self.scraper_details["query"] is None or self.scraper_details["query"] == "":
|
| 346 |
-
# print("Query is not set.")
|
| 347 |
-
# sys.exit(1)
|
| 348 |
-
# else:
|
| 349 |
-
# url = f"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query"
|
| 350 |
-
# if self.scraper_details["tab"] == "Latest":
|
| 351 |
-
# url += "&f=live"
|
| 352 |
-
|
| 353 |
-
# self.driver.get(url)
|
| 354 |
-
# sleep(3)
|
| 355 |
-
# pass
|
| 356 |
-
|
| 357 |
-
# def get_tweet_cards(self):
|
| 358 |
-
# self.tweet_cards = self.driver.find_elements(
|
| 359 |
-
# "xpath", '//article[@data-testid="tweet" and not(@disabled)]'
|
| 360 |
-
# )
|
| 361 |
-
# pass
|
| 362 |
-
|
| 363 |
-
# def remove_hidden_cards(self):
|
| 364 |
-
# try:
|
| 365 |
-
# hidden_cards = self.driver.find_elements(
|
| 366 |
-
# "xpath", '//article[@data-testid="tweet" and @disabled]'
|
| 367 |
-
# )
|
| 368 |
-
|
| 369 |
-
# for card in hidden_cards[1:-2]:
|
| 370 |
-
# self.driver.execute_script(
|
| 371 |
-
# "arguments[0].parentNode.parentNode.parentNode.remove();", card
|
| 372 |
-
# )
|
| 373 |
-
# except Exception as e:
|
| 374 |
-
# return
|
| 375 |
-
# pass
|
| 376 |
-
|
| 377 |
-
# def scrape_tweets(
|
| 378 |
-
# self,
|
| 379 |
-
# max_tweets=50,
|
| 380 |
-
# no_tweets_limit=False,
|
| 381 |
-
# scrape_username=None,
|
| 382 |
-
# scrape_hashtag=None,
|
| 383 |
-
# scrape_query=None,
|
| 384 |
-
# scrape_latest=True,
|
| 385 |
-
# scrape_top=False,
|
| 386 |
-
# scrape_poster_details=False,
|
| 387 |
-
# router=None,
|
| 388 |
-
# ):
|
| 389 |
-
# self._config_scraper(
|
| 390 |
-
# max_tweets,
|
| 391 |
-
# scrape_username,
|
| 392 |
-
# scrape_hashtag,
|
| 393 |
-
# scrape_query,
|
| 394 |
-
# scrape_latest,
|
| 395 |
-
# scrape_top,
|
| 396 |
-
# scrape_poster_details,
|
| 397 |
-
# )
|
| 398 |
-
|
| 399 |
-
# if router is None:
|
| 400 |
-
# router = self.router
|
| 401 |
-
|
| 402 |
-
# router()
|
| 403 |
-
|
| 404 |
-
# if self.scraper_details["type"] == "Username":
|
| 405 |
-
# print(
|
| 406 |
-
# "Scraping Tweets from @{}...".format(self.scraper_details["username"])
|
| 407 |
-
# )
|
| 408 |
-
# elif self.scraper_details["type"] == "Hashtag":
|
| 409 |
-
# print(
|
| 410 |
-
# "Scraping {} Tweets from #{}...".format(
|
| 411 |
-
# self.scraper_details["tab"], self.scraper_details["hashtag"]
|
| 412 |
-
# )
|
| 413 |
-
# )
|
| 414 |
-
# elif self.scraper_details["type"] == "Query":
|
| 415 |
-
# print(
|
| 416 |
-
# "Scraping {} Tweets from {} search...".format(
|
| 417 |
-
# self.scraper_details["tab"], self.scraper_details["query"]
|
| 418 |
-
# )
|
| 419 |
-
# )
|
| 420 |
-
# elif self.scraper_details["type"] == "Home":
|
| 421 |
-
# print("Scraping Tweets from Home...")
|
| 422 |
-
|
| 423 |
-
# # Refuse non-essential cookies to make the banner disappear
|
| 424 |
-
# try:
|
| 425 |
-
# accept_cookies_btn = self.driver.find_element(
|
| 426 |
-
# "xpath", "//span[text()='Refuse non-essential cookies']/../../..")
|
| 427 |
-
# accept_cookies_btn.click()
|
| 428 |
-
# except NoSuchElementException:
|
| 429 |
-
# pass
|
| 430 |
-
|
| 431 |
-
# self.progress.print_progress(0, False, 0, no_tweets_limit)
|
| 432 |
-
|
| 433 |
-
# refresh_count = 0
|
| 434 |
-
# added_tweets = 0
|
| 435 |
-
# empty_count = 0
|
| 436 |
-
# retry_cnt = 0
|
| 437 |
-
|
| 438 |
-
# while self.scroller.scrolling:
|
| 439 |
-
# try:
|
| 440 |
-
# self.get_tweet_cards()
|
| 441 |
-
# added_tweets = 0
|
| 442 |
-
|
| 443 |
-
# for card in self.tweet_cards[-15:]:
|
| 444 |
-
# try:
|
| 445 |
-
# tweet_id = str(card)
|
| 446 |
-
|
| 447 |
-
# if tweet_id not in self.tweet_ids:
|
| 448 |
-
# self.tweet_ids.add(tweet_id)
|
| 449 |
-
|
| 450 |
-
# if not self.scraper_details["poster_details"]:
|
| 451 |
-
# self.driver.execute_script(
|
| 452 |
-
# "arguments[0].scrollIntoView();", card
|
| 453 |
-
# )
|
| 454 |
-
|
| 455 |
-
# tweet = Tweet(
|
| 456 |
-
# card=card,
|
| 457 |
-
# driver=self.driver,
|
| 458 |
-
# actions=self.actions,
|
| 459 |
-
# scrape_poster_details=self.scraper_details[
|
| 460 |
-
# "poster_details"
|
| 461 |
-
# ],
|
| 462 |
-
# )
|
| 463 |
-
|
| 464 |
-
# if tweet:
|
| 465 |
-
# if not tweet.error and tweet.tweet is not None:
|
| 466 |
-
# if not tweet.is_ad:
|
| 467 |
-
# self.data.append(tweet.tweet)
|
| 468 |
-
# added_tweets += 1
|
| 469 |
-
# self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
|
| 470 |
-
|
| 471 |
-
# if len(self.data) >= self.max_tweets and not no_tweets_limit:
|
| 472 |
-
# self.scroller.scrolling = False
|
| 473 |
-
# break
|
| 474 |
-
# else:
|
| 475 |
-
# continue
|
| 476 |
-
# else:
|
| 477 |
-
# continue
|
| 478 |
-
# else:
|
| 479 |
-
# continue
|
| 480 |
-
# else:
|
| 481 |
-
# continue
|
| 482 |
-
# except NoSuchElementException:
|
| 483 |
-
# continue
|
| 484 |
-
|
| 485 |
-
# if len(self.data) >= self.max_tweets and not no_tweets_limit:
|
| 486 |
-
# break
|
| 487 |
-
|
| 488 |
-
# if added_tweets == 0:
|
| 489 |
-
# # Check whether there is a "Retry" button and click it at regular intervals, up to a maximum number of tries
|
| 490 |
-
# try:
|
| 491 |
-
# while retry_cnt < 15:
|
| 492 |
-
# retry_button = self.driver.find_element(
|
| 493 |
-
# "xpath", "//span[text()='Retry']/../../..")
|
| 494 |
-
# self.progress.print_progress(len(self.data), True, retry_cnt, no_tweets_limit)
|
| 495 |
-
# sleep(58)
|
| 496 |
-
# retry_button.click()
|
| 497 |
-
# retry_cnt += 1
|
| 498 |
-
# sleep(2)
|
| 499 |
-
# # There is no Retry button, so the counter is reset
|
| 500 |
-
# except NoSuchElementException:
|
| 501 |
-
# retry_cnt = 0
|
| 502 |
-
# self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
|
| 503 |
-
|
| 504 |
-
# if empty_count >= 5:
|
| 505 |
-
# if refresh_count >= 3:
|
| 506 |
-
# print()
|
| 507 |
-
# print("No more tweets to scrape")
|
| 508 |
-
# break
|
| 509 |
-
# refresh_count += 1
|
| 510 |
-
# empty_count += 1
|
| 511 |
-
# sleep(1)
|
| 512 |
-
# else:
|
| 513 |
-
# empty_count = 0
|
| 514 |
-
# refresh_count = 0
|
| 515 |
-
# except StaleElementReferenceException:
|
| 516 |
-
# sleep(2)
|
| 517 |
-
# continue
|
| 518 |
-
# except KeyboardInterrupt:
|
| 519 |
-
# print("\n")
|
| 520 |
-
# print("Keyboard Interrupt")
|
| 521 |
-
# self.interrupted = True
|
| 522 |
-
# break
|
| 523 |
-
# except Exception as e:
|
| 524 |
-
# print("\n")
|
| 525 |
-
# print(f"Error scraping tweets: {e}")
|
| 526 |
-
# break
|
| 527 |
-
|
| 528 |
-
# print("")
|
| 529 |
-
|
| 530 |
-
# if len(self.data) >= self.max_tweets or no_tweets_limit:
|
| 531 |
-
# print("Scraping Complete")
|
| 532 |
-
# else:
|
| 533 |
-
# print("Scraping Incomplete")
|
| 534 |
-
|
| 535 |
-
# if not no_tweets_limit:
|
| 536 |
-
# print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets))
|
| 537 |
-
|
| 538 |
-
# pass
|
| 539 |
-
|
| 540 |
-
# def save_to_csv(self):
|
| 541 |
-
# print("Saving Tweets to CSV...")
|
| 542 |
-
# now = datetime.now()
|
| 543 |
-
# folder_path = "./tweets/"
|
| 544 |
-
|
| 545 |
-
# if not os.path.exists(folder_path):
|
| 546 |
-
# os.makedirs(folder_path)
|
| 547 |
-
# print("Created Folder: {}".format(folder_path))
|
| 548 |
-
|
| 549 |
-
# data = {
|
| 550 |
-
# "Name": [tweet[0] for tweet in self.data],
|
| 551 |
-
# "Handle": [tweet[1] for tweet in self.data],
|
| 552 |
-
# "Timestamp": [tweet[2] for tweet in self.data],
|
| 553 |
-
# "Verified": [tweet[3] for tweet in self.data],
|
| 554 |
-
# "Content": [tweet[4] for tweet in self.data],
|
| 555 |
-
# "Comments": [tweet[5] for tweet in self.data],
|
| 556 |
-
# "Retweets": [tweet[6] for tweet in self.data],
|
| 557 |
-
# "Likes": [tweet[7] for tweet in self.data],
|
| 558 |
-
# "Analytics": [tweet[8] for tweet in self.data],
|
| 559 |
-
# "Tags": [tweet[9] for tweet in self.data],
|
| 560 |
-
# "Mentions": [tweet[10] for tweet in self.data],
|
| 561 |
-
# "Emojis": [tweet[11] for tweet in self.data],
|
| 562 |
-
# "Profile Image": [tweet[12] for tweet in self.data],
|
| 563 |
-
# "Tweet Link": [tweet[13] for tweet in self.data],
|
| 564 |
-
# "Tweet ID": [f"tweet_id:{tweet[14]}" for tweet in self.data],
|
| 565 |
-
# }
|
| 566 |
-
|
| 567 |
-
# if self.scraper_details["poster_details"]:
|
| 568 |
-
# data["Tweeter ID"] = [f"user_id:{tweet[15]}" for tweet in self.data]
|
| 569 |
-
# data["Following"] = [tweet[16] for tweet in self.data]
|
| 570 |
-
# data["Followers"] = [tweet[17] for tweet in self.data]
|
| 571 |
-
|
| 572 |
-
# df = pd.DataFrame(data)
|
| 573 |
-
|
| 574 |
-
# current_time = now.strftime("%Y-%m-%d_%H-%M-%S")
|
| 575 |
-
# file_path = f"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv"
|
| 576 |
-
# pd.set_option("display.max_colwidth", None)
|
| 577 |
-
# df.to_csv(file_path, index=False, encoding="utf-8")
|
| 578 |
-
|
| 579 |
-
# print("CSV Saved: {}".format(file_path))
|
| 580 |
-
|
| 581 |
-
# pass
|
| 582 |
-
|
| 583 |
-
# def get_tweets(self):
|
| 584 |
-
# return self.data
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
# import os
|
| 588 |
-
# from flask import Flask, request, jsonify
|
| 589 |
-
# from selenium import webdriver
|
| 590 |
-
# from selenium.webdriver.firefox.service import Service
|
| 591 |
-
# from selenium.webdriver.firefox.options import Options
|
| 592 |
-
# from selenium.webdriver.common.by import By
|
| 593 |
-
# from selenium.webdriver.support.ui import WebDriverWait
|
| 594 |
-
# from selenium.webdriver.support import expected_conditions as EC
|
| 595 |
-
# from webdriver_manager.firefox import GeckoDriverManager
|
| 596 |
-
# import time
|
| 597 |
-
# import random
|
| 598 |
-
|
| 599 |
-
# app = Flask(__name__)
|
| 600 |
-
|
| 601 |
-
# def setup_driver():
|
| 602 |
-
# """Set up Chrome WebDriver with appropriate options for headless browsing."""
|
| 603 |
-
# chrome_options = Options()
|
| 604 |
-
# # chrome_options.add_argument("--headless")
|
| 605 |
-
# chrome_options.add_argument("--incognito")
|
| 606 |
-
# chrome_options.add_argument("--disable-blink-features=AutomationControlled")
|
| 607 |
-
# # chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
| 608 |
-
# # chrome_options.add_experimental_option('useAutomationExtension', False)
|
| 609 |
-
# chrome_options.add_argument("--start-maximized")
|
| 610 |
-
# chrome_options.add_argument("--no-sandbox")
|
| 611 |
-
# chrome_options.add_argument("--disable-dev-shm-usage")
|
| 612 |
-
# chrome_options.add_argument("--disable-extensions")
|
| 613 |
-
# chrome_options.add_argument("--disable-gpu")
|
| 614 |
-
# chrome_options.binary_location = r'C:\Users\HP\.cache\selenium\firefox\win64\133.0\firefox.exe'
|
| 615 |
-
# # chrome_options.binary_location = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
# # service = Service(GeckoDriverManager().install())
|
| 619 |
-
# service = Service(executable_path=r'C:\Users\HP\.cache\selenium\geckodriver\win64\0.35.0\geckodriver.exe')
|
| 620 |
-
|
| 621 |
-
# driver = webdriver.Firefox(service=service, options=chrome_options)
|
| 622 |
-
# return driver
|
| 623 |
-
|
| 624 |
-
# def reddit_login_and_scrape(username, password, subreddit):
|
| 625 |
-
# """
|
| 626 |
-
# Log into Reddit and scrape posts from a specified subreddit.
|
| 627 |
-
|
| 628 |
-
# Args:
|
| 629 |
-
# username (str): Reddit username
|
| 630 |
-
# password (str): Reddit password
|
| 631 |
-
# subreddit (str): Name of the subreddit to scrape
|
| 632 |
-
|
| 633 |
-
# Returns:
|
| 634 |
-
# list: List of dictionaries containing scraped post information
|
| 635 |
-
# """
|
| 636 |
-
# driver = setup_driver()
|
| 637 |
-
# posts = []
|
| 638 |
-
|
| 639 |
-
# try:
|
| 640 |
-
# # Navigate to Reddit login page
|
| 641 |
-
# driver.get("https://www.reddit.com/login/")
|
| 642 |
-
|
| 643 |
-
# # Wait for login form to load
|
| 644 |
-
# WebDriverWait(driver, 10).until(
|
| 645 |
-
# EC.presence_of_element_located((By.ID, "login-username"))
|
| 646 |
-
# )
|
| 647 |
-
|
| 648 |
-
# # Find and fill in login credentials
|
| 649 |
-
# username_field = driver.find_element(By.ID, "login-username")
|
| 650 |
-
# password_field = driver.find_element(By.ID, "login-password")
|
| 651 |
-
|
| 652 |
-
# username_field.send_keys(username)
|
| 653 |
-
# password_field.send_keys(password)
|
| 654 |
-
|
| 655 |
-
# # Submit login form
|
| 656 |
-
# # login_button = driver.find_element(By.XPATH, "//button[@type='button']")
|
| 657 |
-
# # login_button.click()
|
| 658 |
-
# # Find login button using complex selector
|
| 659 |
-
# # login_button=driver.find_element(By.CSS_SELECTOR, 'faceplate-tracker[action="click]')
|
| 660 |
-
# login_button = WebDriverWait(driver, 4).until(
|
| 661 |
-
# EC.element_to_be_clickable((By.XPATH, "//*[@id='login']/auth-flow-modal/div[2]/faceplate-tracker/button"))
|
| 662 |
-
# )
|
| 663 |
-
# driver.execute_script("arguments[0].scrollIntoView(true);", login_button)
|
| 664 |
-
# time.sleep(random.uniform(1, 2))
|
| 665 |
-
# login_button.click()
|
| 666 |
-
|
| 667 |
-
# # Wait for login to complete
|
| 668 |
-
# # WebDriverWait(driver, 10).until(
|
| 669 |
-
# # EC.presence_of_element_located((By.XPATH, "//a[@href='/submit']"))
|
| 670 |
-
# # )
|
| 671 |
-
|
| 672 |
-
# # # Add random delay to mimic human behavior
|
| 673 |
-
# time.sleep(random.uniform(2, 4))
|
| 674 |
-
|
| 675 |
-
# # # # Navigate to subreddit
|
| 676 |
-
# # driver.get(f"https://www.reddit.com/r/{subreddit}/")
|
| 677 |
-
|
| 678 |
-
# # # # Wait for posts to load
|
| 679 |
-
# # WebDriverWait(driver, 10).until(
|
| 680 |
-
# # EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='post-container']"))
|
| 681 |
-
# # )
|
| 682 |
-
|
| 683 |
-
# # # Find post elements
|
| 684 |
-
# # post_elements = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='post-container']")
|
| 685 |
-
|
| 686 |
-
# # # Iterate through posts
|
| 687 |
-
# # for post in post_elements[:10]: # Limit to 10 posts
|
| 688 |
-
# # try:
|
| 689 |
-
# # # Extract post details
|
| 690 |
-
# # title = post.find_element(By.CSS_SELECTOR, "h3").text
|
| 691 |
-
|
| 692 |
-
# # # Try to get upvotes
|
| 693 |
-
# # try:
|
| 694 |
-
# # upvotes = post.find_element(By.CSS_SELECTOR, "div[id^='vote-arrows-']").text
|
| 695 |
-
# # except:
|
| 696 |
-
# # upvotes = "N/A"
|
| 697 |
-
|
| 698 |
-
# # # Try to get link
|
| 699 |
-
# # try:
|
| 700 |
-
# # link = post.find_element(By.CSS_SELECTOR, "a[data-click-id='body']").get_attribute('href')
|
| 701 |
-
# # except:
|
| 702 |
-
# # link = "No link available"
|
| 703 |
-
|
| 704 |
-
# # posts.append({
|
| 705 |
-
# # "title": title,
|
| 706 |
-
# # "upvotes": upvotes,
|
| 707 |
-
# # "link": link
|
| 708 |
-
# # })
|
| 709 |
-
|
| 710 |
-
# # except Exception as post_error:
|
| 711 |
-
# # print(f"Error processing individual post: {post_error}")
|
| 712 |
-
|
| 713 |
-
# except Exception as e:
|
| 714 |
-
# print(f"Login or scraping error: {e}")
|
| 715 |
-
# return [{"error": str(e)}]
|
| 716 |
-
|
| 717 |
-
# finally:
|
| 718 |
-
# driver.quit()
|
| 719 |
-
|
| 720 |
-
# return posts
|
| 721 |
-
|
| 722 |
-
# @app.route('/scrape', methods=['POST'])
|
| 723 |
-
# def scrape_reddit():
|
| 724 |
-
# """
|
| 725 |
-
# Flask endpoint for scraping Reddit posts
|
| 726 |
-
|
| 727 |
-
# Expected JSON payload:
|
| 728 |
-
# {
|
| 729 |
-
# "username": "your_reddit_username",
|
| 730 |
-
# "password": "your_reddit_password",
|
| 731 |
-
# "subreddit": "technology"
|
| 732 |
-
# }
|
| 733 |
-
# """
|
| 734 |
-
# # Get data from request
|
| 735 |
-
# data = request.json
|
| 736 |
-
|
| 737 |
-
# # Validate input
|
| 738 |
-
# if not all(key in data for key in ['subreddit']):
|
| 739 |
-
# return jsonify({
|
| 740 |
-
# "error": "Missing required parameters. subreddit"
|
| 741 |
-
# }), 400
|
| 742 |
-
|
| 743 |
-
# try:
|
| 744 |
-
# # Perform scraping
|
| 745 |
-
# results = reddit_login_and_scrape(
|
| 746 |
-
# 'Final-Difference7055',
|
| 747 |
-
# '#CW2968honey',
|
| 748 |
-
# data['subreddit']
|
| 749 |
-
# )
|
| 750 |
-
|
| 751 |
-
# # Check for errors
|
| 752 |
-
# if results and 'error' in results[0]:
|
| 753 |
-
# return jsonify({
|
| 754 |
-
# "error": results[0]['error']
|
| 755 |
-
# }), 500
|
| 756 |
-
|
| 757 |
-
# return jsonify({
|
| 758 |
-
# "posts": results
|
| 759 |
-
# }), 200
|
| 760 |
-
|
| 761 |
-
# except Exception as e:
|
| 762 |
-
# return jsonify({
|
| 763 |
-
# "error": str(e)
|
| 764 |
-
# }), 500
|
| 765 |
-
|
| 766 |
-
# @app.route('/', methods=['GET'])
|
| 767 |
-
# def health_check():
|
| 768 |
-
# """Simple health check endpoint"""
|
| 769 |
-
# return jsonify({
|
| 770 |
-
# "status": "healthy",
|
| 771 |
-
# "message": "Reddit Scraper API is running"
|
| 772 |
-
# }), 200
|
| 773 |
-
|
| 774 |
-
# if __name__ == '__main__':
|
| 775 |
-
# # Use environment variable for port, default to 5000
|
| 776 |
-
# port = int(os.environ.get('PORT', 5000))
|
| 777 |
-
# app.run(host='127.0.0.34', port=port,debug=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|