honey234 committed on
Commit
8956ebe
·
1 Parent(s): 19e6eda
Files changed (4) hide show
  1. Dockerfile +9 -9
  2. __pycache__/app.cpython-311.pyc +0 -0
  3. app.py +85 -14
  4. posts_data_0.csv +1 -26
Dockerfile CHANGED
@@ -22,16 +22,16 @@ RUN apt-get update && apt-get install -y \
22
  apt-get clean && \
23
  rm -rf /var/lib/apt/lists/*
24
 
25
- # RUN apt-get update && apt-get install -y wget unzip && \
26
- # wget https://dl.google.com/Linux/direct/google-chrome-stable_current_amd64.deb && \
27
- # apt install -y ./google-chrome-stable_current_amd64.deb && \
28
- # rm google-chrome-stable_current_amd64.deb && \
29
- # apt-get clean
30
 
31
- # Update the package list and install wget, unzip, and Firefox
32
- RUN apt-get update && apt-get install -y wget unzip \
33
- && apt-get install -y firefox-esr \
34
- && apt-get clean
35
  RUN which firefox
36
  RUN useradd -m -u 1000 user
37
  USER user
 
22
  apt-get clean && \
23
  rm -rf /var/lib/apt/lists/*
24
 
25
# Install Google Chrome (stable) from Google's .deb package. Installing the
# local file through apt-get lets apt resolve Chrome's shared-library
# dependencies; the package lists are removed in the same layer to keep the
# image small.
RUN apt-get update && apt-get install -y wget unzip && \
    wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
    apt-get install -y ./google-chrome-stable_current_amd64.deb && \
    rm google-chrome-stable_current_amd64.deb && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Previous Firefox-based setup, kept for reference:
# RUN apt-get update && apt-get install -y wget unzip \
#     && apt-get install -y firefox-esr \
#     && apt-get clean

# Fail the build early if the browser binary is not on PATH. This used to
# check for firefox, which this image no longer installs in this step.
# NOTE(review): assumes the .deb provides `google-chrome-stable` on PATH - confirm.
RUN which google-chrome-stable
36
  RUN useradd -m -u 1000 user
37
  USER user
__pycache__/app.cpython-311.pyc CHANGED
Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ
 
app.py CHANGED
@@ -4,10 +4,10 @@ import time
4
  import pandas as pd
5
  from fastapi import FastAPI, HTTPException
6
  from selenium import webdriver
7
- from selenium.webdriver.firefox.service import Service
8
  from selenium.webdriver.common.action_chains import ActionChains
9
- from selenium.webdriver.firefox.options import Options
10
- from webdriver_manager.firefox import GeckoDriverManager
11
  from webdriver_manager.core.driver_cache import DriverCacheManager
12
  from selenium.webdriver.common.by import By
13
  from fake_headers import Headers
@@ -41,13 +41,13 @@ app.add_middleware(
41
  logging.basicConfig(level=logging.INFO)
42
 
43
  def setup_chromedriver():
44
- logging.info("Setting up ChromeDriver...")
45
  custom_wdm_cache = os.path.join(os.getcwd(), 'custom_wdm_cache')
46
  os.environ['WDM_LOCAL'] = custom_wdm_cache
47
  cache_manager = DriverCacheManager(custom_wdm_cache)
48
  os.chmod(custom_wdm_cache, 0o755) # Ensure proper permissions
49
- # path = ChromeDriverManager(cache_manager=cache_manager).install()
50
- path = GeckoDriverManager(cache_manager=cache_manager).install()
51
  os.chmod(path, 0o755) # Ensure ChromeDriver is executable
52
  logging.info(f"ChromeDriver path: {path}")
53
  return path
@@ -58,18 +58,23 @@ def setup_chromedriver():
58
  header = Headers().generate()["User-Agent"]
59
  # my_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
60
 
61
- capabilities = webdriver.DesiredCapabilities.FIREFOX
62
  # proxy = None
63
  browser_option = Options()
64
- browser_option.log.level = 'trace'
65
  browser_option.add_argument("--headless") # Running in headless mode (no GUI)
66
  browser_option.add_argument("--no-sandbox")
67
  browser_option.add_argument("--disable-dev-shm-usage")
68
  browser_option.add_argument("--ignore-certificate-errors")
69
- profile = webdriver.FirefoxProfile()
70
- profile.set_preference("general.useragent.override", "Your User Agent String")
71
- browser_option.profile = profile
72
  logging.info(f"browser_version: {browser_option.browser_version}")
 
 
 
 
 
73
  # browser_option.capabilities = {
74
  # "moz:firefoxOptions": {
75
  # "args": [
@@ -81,7 +86,7 @@ logging.info(f"browser_version: {browser_option.browser_version}")
81
  # ]
82
  # }
83
  # }
84
- browser_option.binary_location = '/usr/bin/firefox'
85
  # browser_option.binary_location = r'C:\Users\HP\.cache\selenium\firefox\win64\133.0\firefox.exe'
86
  # browser_option.add_argument("--disable-gpu")
87
  # browser_option.add_argument("--log-level=3")
@@ -97,7 +102,7 @@ browser_option.add_argument("--user-agent={}".format(header))
97
  # Setup WebDriver
98
  driver_path = setup_chromedriver()
99
  service = Service(executable_path=driver_path)
100
- driver = webdriver.Firefox(service=service, options=browser_option)
101
  # actions = ActionChains(driver)
102
 
103
  def getSearchPostData(search_keyword, index, name="", forCompetitorAnalysis=False):
@@ -162,10 +167,76 @@ def getSearchPostData(search_keyword, index, name="", forCompetitorAnalysis=Fals
162
  logging.info(f"Data saved to posts_data_{index}.csv")
163
  return df
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  def get_webpage_title(url: str) -> str:
166
  try:
167
  # getSearchPostData(search_keyword="migraine", index=0)
168
-
169
  driver.get(url)
170
  time.sleep(3)
171
  title = driver.title
 
4
  import pandas as pd
5
  from fastapi import FastAPI, HTTPException
6
  from selenium import webdriver
7
+ from selenium.webdriver.chrome.service import Service
8
  from selenium.webdriver.common.action_chains import ActionChains
9
+ from selenium.webdriver.chrome.options import Options
10
+ from webdriver_manager.chrome import ChromeDriverManager
11
  from webdriver_manager.core.driver_cache import DriverCacheManager
12
  from selenium.webdriver.common.by import By
13
  from fake_headers import Headers
 
41
  logging.basicConfig(level=logging.INFO)
42
 
43
  def setup_chromedriver():
44
+ # logging.info("Setting up ChromeDriver...")
45
  custom_wdm_cache = os.path.join(os.getcwd(), 'custom_wdm_cache')
46
  os.environ['WDM_LOCAL'] = custom_wdm_cache
47
  cache_manager = DriverCacheManager(custom_wdm_cache)
48
  os.chmod(custom_wdm_cache, 0o755) # Ensure proper permissions
49
+ path = ChromeDriverManager(cache_manager=cache_manager).install()
50
+ # path = GeckoDriverManager(cache_manager=cache_manager).install()
51
  os.chmod(path, 0o755) # Ensure ChromeDriver is executable
52
  logging.info(f"ChromeDriver path: {path}")
53
  return path
 
58
  header = Headers().generate()["User-Agent"]
59
  # my_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
60
 
61
+ # capabilities = webdriver.DesiredCapabilities.CHROME
62
  # proxy = None
63
  browser_option = Options()
64
+ # browser_option..level = 'trace'
65
  browser_option.add_argument("--headless") # Running in headless mode (no GUI)
66
  browser_option.add_argument("--no-sandbox")
67
  browser_option.add_argument("--disable-dev-shm-usage")
68
  browser_option.add_argument("--ignore-certificate-errors")
69
+ # profile = webdriver.FirefoxProfile()
70
+ # profile.set_preference("general.useragent.override", "Your User Agent String")
71
+ # browser_option.profile = profile
72
  logging.info(f"browser_version: {browser_option.browser_version}")
73
+ # browser_option.set_capability(
74
+
75
+ # )
76
+ # name="",
77
+ # value=capabilities)
78
  # browser_option.capabilities = {
79
  # "moz:firefoxOptions": {
80
  # "args": [
 
86
  # ]
87
  # }
88
  # }
89
+ # browser_option.binary_location = '/usr/bin/firefox'
90
  # browser_option.binary_location = r'C:\Users\HP\.cache\selenium\firefox\win64\133.0\firefox.exe'
91
  # browser_option.add_argument("--disable-gpu")
92
  # browser_option.add_argument("--log-level=3")
 
102
  # Setup WebDriver
103
  driver_path = setup_chromedriver()
104
  service = Service(executable_path=driver_path)
105
+ driver = webdriver.Chrome(service=service, options=browser_option)
106
  # actions = ActionChains(driver)
107
 
108
  def getSearchPostData(search_keyword, index, name="", forCompetitorAnalysis=False):
 
167
  logging.info(f"Data saved to posts_data_{index}.csv")
168
  return df
169
 
170
def getPinterestSearchPostData(search_keyword, index, name="", forCompetitorAnalysis=False):
    """Scrape pin titles from a Pinterest search page and save them to CSV.

    Args:
        search_keyword: Free-text query. It is URL-encoded before being placed
            in the search URL, so spaces, ``&`` and ``#`` are safe.
        index: Suffix of the output file ``posts_data_{index}.csv``.
        name: Unused here; kept so the signature mirrors getSearchPostData.
        forCompetitorAnalysis: When true, scroll once to the bottom of the
            page and wait before collecting cards.

    Returns:
        A pandas DataFrame with one row per extracted title (columns
        ``index`` and ``title``). It is empty when no cards were found or
        when scraping failed; failures are logged, not raised.
    """
    from urllib.parse import quote_plus

    # Navigate to the search results page.
    url = f'https://www.pinterest.com/search/pins?q={quote_plus(search_keyword)}'
    driver.get(url)
    time.sleep(3)  # Consider using WebDriverWait instead of sleep for better reliability
    logging.info("Navigated to search page.")

    posts_data = []
    try:
        if forCompetitorAnalysis:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)

        post_cards = driver.find_elements(By.CSS_SELECTOR, "div[data-test-id='related-pins-title']")
        logging.info(f"Found {len(post_cards)} post cards.")

        for card in post_cards:
            try:
                title = card.find_element(By.XPATH, './/div').text
            except Exception as e:
                # One malformed card must not abort the whole scrape.
                logging.error(f"Error processing post_cards: {e}")
                continue
            logging.info(f"title: {title}")
            # Number rows by successful extractions so indices stay contiguous.
            posts_data.append({"index": len(posts_data), "title": title})
    except Exception as e:
        logging.error(f"Error in scrolling or extracting data: {e}")

    df = pd.DataFrame(posts_data)
    df.to_csv(f'posts_data_{index}.csv', index=False)
    logging.info(f"Data saved to posts_data_{index}.csv")
    return df
235
+
236
  def get_webpage_title(url: str) -> str:
237
  try:
238
  # getSearchPostData(search_keyword="migraine", index=0)
239
+ getPinterestSearchPostData(search_keyword="watercolor art",index=0)
240
  driver.get(url)
241
  time.sleep(3)
242
  title = driver.title
posts_data_0.csv CHANGED
@@ -1,26 +1 @@
1
- index,comment_count,votes_count,title,url,time
2
- 0,26,6,Migraine DBQ,https://www.reddit.com/r/VeteransBenefits/comments/1cogdv4/migraine_dbq/,2024-05-10T03:51:24.726Z
3
- 1,84,651,A cool guide to migraine symptoms!,https://www.reddit.com/r/coolguides/comments/1f7wymg/a_cool_guide_to_migraine_symptoms/,2024-09-03T10:56:10.230Z
4
- 2,321,141,What makes you sure it's a migraine?,https://www.reddit.com/r/migraine/comments/1cth4kw/what_makes_you_sure_its_a_migraine/,2024-05-16T16:33:05.274Z
5
- 3,217,100,Biggest migraine life hacks?,https://www.reddit.com/r/migraine/comments/1902t0p/biggest_migraine_life_hacks/,2024-01-06T15:49:12.132Z
6
- 4,112,40,What finally stopped your migraine?,https://www.reddit.com/r/migraine/comments/1e7wvsc/what_finally_stopped_your_migraine/,2024-07-20T14:27:22.529Z
7
- 5,228,222,ELI5: What causes Migraines?,https://www.reddit.com/r/explainlikeimfive/comments/1e84z59/eli5_what_causes_migraines/,2024-07-20T20:35:12.699Z
8
- 6,1K,2.9K,Most people use the word 'migraine' wrong.,https://www.reddit.com/r/unpopularopinion/comments/1884d1g/most_people_use_the_word_migraine_wrong/,2023-12-01T05:45:16.842Z
9
- 7,236,1.9K,Wife’s migraines reduced by 90% and I feel like a jackass,https://www.reddit.com/r/migraine/comments/1g1dxw5/wifes_migraines_reduced_by_90_and_i_feel_like_a/,2024-10-11T16:31:11.670Z
10
- 8,257,1.1K,It's just a migraine,https://www.reddit.com/r/Radiology/comments/1cisw7o/its_just_a_migraine/,2024-05-02T22:33:13.773Z
11
- 9,239,660,"The cure for migraines! I've got it now, you can all stop searching! /s",https://www.reddit.com/r/migraine/comments/1ebkkff/the_cure_for_migraines_ive_got_it_now_you_can_all/,2024-07-25T03:09:38.426Z
12
- 10,40,10,What are your migraine relief tricks?,https://www.reddit.com/r/AskReddit/comments/15oald2/what_are_your_migraine_relief_tricks/,2023-08-11T14:34:43.326Z
13
- 11,400,1.7K,Some migraine symptoms you might not expect,https://www.reddit.com/r/migraine/comments/1e8zlkv/some_migraine_symptoms_you_might_not_expect/,2024-07-21T23:20:45.826Z
14
- 12,158,3.1K,I painted this based on the visuals I get from migraines.,https://www.reddit.com/r/AbstractArt/comments/1gojgxz/i_painted_this_based_on_the_visuals_i_get_from/,2024-11-11T04:02:54.421Z
15
- 13,185,698,migraine suffers are born with defective nervous systems,https://www.reddit.com/r/migraine/comments/1bs1g6c/migraine_suffers_are_born_with_defective_nervous/,2024-03-31T04:09:32.745Z
16
- 14,911,25K,I’ve had constant migraines for the past week thanks to Matlab.,https://www.reddit.com/r/ProgrammerHumor/comments/xbmg98/ive_had_constant_migraines_for_the_past_week/,2022-09-11T16:10:36.852Z
17
- 15,65,41,ELI5: what is the difference between a Headache vs a migraine?,https://www.reddit.com/r/explainlikeimfive/comments/1ccqoaj/eli5_what_is_the_difference_between_a_headache_vs/,2024-04-25T12:28:12.099Z
18
- 16,243,6.5K,Need to share this. I got a migraine from laughing so hard.,https://www.reddit.com/r/BaldursGate3/comments/1butvvc/need_to_share_this_i_got_a_migraine_from_laughing/,2024-04-03T14:23:48.741Z
19
- 17,54,6,What type of Migraine Logs does VA accept?,https://www.reddit.com/r/VeteransBenefits/comments/1fmdy2v/what_type_of_migraine_logs_does_va_accept/,2024-09-21T21:54:44.854Z
20
- 18,231,1.9K,Raise your hand if you’re an American with a stress induced migraine today,https://www.reddit.com/r/migraine/comments/1gk64ce/raise_your_hand_if_youre_an_american_with_a/,2024-11-05T13:01:07.737Z
21
- 19,240,316,Sign a migraine is coming,https://www.reddit.com/r/migraine/comments/1b90est/sign_a_migraine_is_coming/,2024-03-07T17:33:14.935Z
22
- 20,1.1K,24K,TIL: Migraines are 3 times more common in women than in men.,https://www.reddit.com/r/todayilearned/comments/t0jzlf/til_migraines_are_3_times_more_common_in_women/,2022-02-24T20:03:31.456Z
23
- 21,16,6,Migraine or MS?,https://www.reddit.com/r/MultipleSclerosis/comments/1fu5xnr/migraine_or_ms/,2024-10-02T02:31:44.292Z
24
- 22,644,3.4K,This is the first time I’ve had someone tell me what I have isn’t a migraine. I mentioned having to grocery shop during a migraine and got this as a response.,https://www.reddit.com/r/migraine/comments/1780gdm/this_is_the_first_time_ive_had_someone_tell_me/,2023-10-14T22:12:03.962Z
25
- 23,91,46,How do you manage your migraines?,https://www.reddit.com/r/POTS/comments/1duim0f/how_do_you_manage_your_migraines/,2024-07-03T16:27:50.806Z
26
- 24,928,10K,10 minutes video call healthcare for $300 ...and just for migraine. Wow.,https://www.reddit.com/r/facepalm/comments/w6t020/10_minutes_video_call_healthcare_for_300_and_just/,2022-07-24T10:59:04.264Z
 
1
+