File size: 15,030 Bytes
fe89926
 
 
 
 
 
8956ebe
fe89926
8956ebe
 
fe89926
 
 
 
 
9c88a63
 
 
370b561
cb71789
 
fe89926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8956ebe
6c61448
 
 
 
8956ebe
 
fe89926
 
 
 
 
 
9c4ee58
 
fe89926
cb71789
8956ebe
fe89926
 
8956ebe
fe89926
 
 
1e67a24
 
cb71789
8956ebe
 
 
6999795
8956ebe
 
 
 
 
9a7c284
 
 
 
 
 
 
 
 
 
 
8956ebe
6999795
fe89926
 
 
 
9c4ee58
 
cb71789
fe89926
 
 
 
 
 
 
8956ebe
fe89926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0eacdd5
fe89926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8956ebe
 
 
 
1e67a24
8956ebe
 
 
 
 
1e67a24
8956ebe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe89926
 
370b561
8956ebe
fe89926
370b561
fe89926
 
 
 
 
 
 
 
 
 
ea42b3e
1e67a24
9c88a63
 
8592ca7
1e67a24
 
 
 
 
8592ca7
1e67a24
9c88a63
 
 
9726aad
1e67a24
 
 
 
 
 
 
 
 
 
 
9c88a63
 
 
 
1e67a24
 
ea42b3e
 
1e67a24
9c88a63
fe89926
 
 
 
 
 
 
 
 
 
 
 
0eacdd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe89926
 
 
 
0eacdd5
fe89926
 
 
 
 
 
 
 
cb71789
fe89926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
084be66
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
import asyncio
import logging
import os
import random
import time
from urllib.parse import quote_plus

import pandas as pd
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fake_headers import Headers
from pyppeteer import launch
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.driver_cache import DriverCacheManager

# from selenium_driverless import webdriver as webdriverless


# Initialize FastAPI application. Swagger UI is served at /swagger instead
# of the default /docs.
app = FastAPI(
    debug=True,
    title="NextAnalytics Server",
    consumes=["application/x-www-form-urlencoded", "multipart/form-data"],
    docs_url='/swagger'
)

# Configure CORS.
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# wide open — acceptable for development, confirm before production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Setup ChromeDriver and Selenium
# custom_wdm_cache = os.path.join(os.getcwd(), 'custom_wdm_cache')
# os.environ['WDM_LOCAL'] = custom_wdm_cache

# Setup logging (INFO level for the whole process).
logging.basicConfig(level=logging.INFO)

def setup_chromedriver():
    """Download (or reuse a cached) ChromeDriver binary and return its path.

    The driver is cached under ./custom_wdm_cache (via the WDM_LOCAL
    environment variable and a DriverCacheManager) so repeated startups do
    not re-download it.

    Returns:
        str: Filesystem path to the ChromeDriver executable.
    """
    custom_wdm_cache = os.path.join(os.getcwd(), 'custom_wdm_cache')
    os.environ['WDM_LOCAL'] = custom_wdm_cache
    # Bug fix: os.chmod raises FileNotFoundError when the cache directory
    # does not exist yet (e.g. first run in a fresh container) — create it.
    os.makedirs(custom_wdm_cache, exist_ok=True)
    os.chmod(custom_wdm_cache, 0o755)  # Ensure proper permissions
    cache_manager = DriverCacheManager(custom_wdm_cache)
    path = ChromeDriverManager(cache_manager=cache_manager).install()
    os.chmod(path, 0o755)  # Ensure ChromeDriver is executable
    logging.info(f"ChromeDriver path: {path}")
    return path

# Setup headless Chrome options.
# A randomized User-Agent is generated to reduce bot-detection hits.
# my_user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0"
header = Headers().generate()["User-Agent"]
# my_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"

# capabilities = webdriver.DesiredCapabilities.CHROME
# proxy = None
browser_option = Options()
# browser_option..level = 'trace'
browser_option.add_argument("--headless")  # Running in headless mode (no GUI)
browser_option.add_argument("--no-sandbox")
browser_option.add_argument("--disable-dev-shm-usage")
# Hides the navigator.webdriver automation flag from page scripts.
browser_option.add_argument("--disable-blink-features=AutomationControlled")

browser_option.add_argument("--ignore-certificate-errors")
# profile = webdriver.FirefoxProfile()
# profile.set_preference("general.useragent.override", "Your User Agent String")
# browser_option.profile = profile
logging.info(f"browser_version: {browser_option.browser_version}")
# browser_option.set_capability(
    
# )
#     name="",
#     value=capabilities)
# browser_option.capabilities = {
#     "moz:firefoxOptions": {
#         "args": [
#             "--headless",
#             "--no-sandbox",
#             "--disable-dev-shm-usage",
#             "--ignore-certificate-errors",
#             f"--user-agent={header}"
#         ]
#     }
# }
# browser_option.binary_location = '/usr/bin/firefox'
# browser_option.binary_location = r'C:\Users\HP\.cache\selenium\firefox\win64\133.0\firefox.exe'
# browser_option.add_argument("--disable-gpu")
# browser_option.add_argument("--log-level=3")
# browser_option.add_argument("--disable-notifications")
# browser_option.add_argument("--disable-popup-blocking")
# browser_option.add_argument(f"--user-agent={my_user_agent}")
browser_option.add_argument("--user-agent={}".format(header))

# if proxy:
#     browser_option.add_argument(f"--proxy-server={proxy}")


# Setup WebDriver.
# NOTE(review): a single module-level driver is shared by every request —
# this is not safe under concurrent FastAPI requests; confirm single-worker
# deployment or move to per-request drivers.
driver_path = setup_chromedriver()
service = Service(executable_path=driver_path)
driver = webdriver.Chrome(service=service, options=browser_option)
# actions = ActionChains(driver)

def getSearchPostData(search_keyword, index, name="", forCompetitorAnalysis=False):
    """Scrape Reddit search results for *search_keyword* and save them to CSV.

    Uses the shared module-level Selenium ``driver``. Results are written to
    ``posts_data_{index}.csv`` and also returned.

    Args:
        search_keyword: Query string for Reddit search.
        index: Suffix for the output CSV file name.
        name: Unused; retained so existing callers keep working.
        forCompetitorAnalysis: When True, scroll once to the bottom of the
            page to trigger lazy-loading of additional results.

    Returns:
        pandas.DataFrame of scraped posts (may be empty on failure).
    """
    # Bug fix: URL-encode the keyword so spaces/special characters
    # ("&", "#", ...) don't corrupt the query string.
    url = f'https://www.reddit.com/search/?q={quote_plus(search_keyword)}'
    driver.get(url)
    time.sleep(3)  # Consider using WebDriverWait instead of sleep for better reliability
    logging.info("Navigated to search page.")

    posts_data = []
    try:
        if forCompetitorAnalysis:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)
        post_cards = driver.find_elements(By.CSS_SELECTOR, 'a[data-testid="post-title-text"]')
        post_cards_1 = driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="search-counter-row"]')
        post_cards_2 = driver.find_elements(By.CSS_SELECTOR, 'faceplate-timeago')
        logging.info(f"Found {len(post_cards)} post cards.")

        # Pass 1: counter rows create the base record for each post.
        idx = 0
        for card in post_cards_1:
            try:
                votes_count = card.find_element(By.XPATH, './/faceplate-number').text
                comments_count = card.find_element(By.XPATH,
                    './/span[contains(text(), "comment") or contains(text(), "comments")]/preceding-sibling::faceplate-number'
                ).text
                posts_data.append({
                    "index": idx,
                    "comment_count": comments_count,
                    "votes_count": votes_count
                })
                idx += 1
            except Exception as e:
                logging.error(f"Error processing post_card_1: {e}")

        # Pass 2: title/url links enrich the records by position.
        # Bug fix: bounds-check idx — if Reddit renders more title links
        # than counter rows, the old code raised IndexError per extra card.
        idx = 0
        for card in post_cards:
            if idx >= len(posts_data):
                logging.warning("More title cards than counter rows; stopping enrichment.")
                break
            try:
                url = card.get_attribute("href")
                title = card.text
                posts_data[idx]["title"] = title
                posts_data[idx]["url"] = url
                idx += 1
            except Exception as e:
                logging.error(f"Error processing post_cards: {e}")

        # Pass 3: timestamps, same positional alignment and bounds check.
        idx = 0
        for card in post_cards_2:
            if idx >= len(posts_data):
                logging.warning("More timestamp cards than counter rows; stopping enrichment.")
                break
            try:
                time_element = card.find_element(By.XPATH, './time')
                post_time = time_element.get_attribute('datetime')
                posts_data[idx]["time"] = post_time
                idx += 1
            except Exception as e:
                logging.error(f"Error processing post_cards_2: {e}")
    except Exception as e:
        logging.error(f"Error in scrolling or extracting data: {e}")

    df = pd.DataFrame(posts_data)
    df.to_csv(f'posts_data_{index}.csv', index=False)
    logging.info(f"Data saved to posts_data_{index}.csv")
    return df

def getPinterestSearchPostData(search_keyword, index, name="", forCompetitorAnalysis=False):
    """Scrape Pinterest pin titles for *search_keyword* and save them to CSV.

    Uses the shared module-level Selenium ``driver``. Results are written to
    ``posts_data_{index}.csv`` and also returned.

    Args:
        search_keyword: Query string for Pinterest pin search.
        index: Suffix for the output CSV file name.
        name: Unused; retained so existing callers keep working.
        forCompetitorAnalysis: When True, scroll once to the bottom of the
            page to trigger lazy-loading of additional results.

    Returns:
        pandas.DataFrame with columns index/title (may be empty on failure).
    """
    # URL-encode the keyword so spaces/special characters form a valid URL.
    url = f'https://www.pinterest.com/search/pins?q={quote_plus(search_keyword)}'
    driver.get(url)
    driver.implicitly_wait(5)  # Consider using WebDriverWait instead for better reliability
    logging.info("Navigated to search page.")

    posts_data = []
    try:
        if forCompetitorAnalysis:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)
        post_cards = driver.find_elements(By.CSS_SELECTOR, "div[data-test-id='related-pins-title']")
        logging.info(f"Found {len(post_cards)} post cards.")

        # Bug fix: the old version only print()ed each title and never
        # populated posts_data, so it always wrote an empty CSV. Store the
        # scraped titles so the DataFrame/CSV actually contain data.
        for idx, card in enumerate(post_cards):
            try:
                title = card.find_element(By.XPATH, './/div').text
                posts_data.append({"index": idx, "title": title})
                logging.info(f"title: {title}")
            except Exception as e:
                logging.error(f"Error processing post_cards: {e}")
    except Exception as e:
        logging.error(f"Error in scrolling or extracting data: {e}")

    df = pd.DataFrame(posts_data)
    df.to_csv(f'posts_data_{index}.csv', index=False)
    logging.info(f"Data saved to posts_data_{index}.csv")
    return df

def get_webpage_title(url: str) -> str:
    """Return the <title> of *url* using the shared Selenium driver.

    Bug fix: removed a leftover debug call that ran a full Pinterest scrape
    (hard-coded keyword "watercolor art") on every invocation.

    On failure the exception message is returned instead of raising, so the
    HTTP endpoint can still produce a response.

    Args:
        url: Address to navigate to.

    Returns:
        The page title, or the error message string on failure.
    """
    try:
        driver.get(url)
        time.sleep(3)  # TODO: replace with WebDriverWait for reliability
        title = driver.title
        logging.info(f"Page title: {title}")
        return title
    except Exception as e:
        logging.error(f"Error fetching webpage title: {e}")
        return str(e)

@app.get("/")
async def home():
    """Health-check endpoint confirming the service is up."""
    greeting = {"message": "Hello"}
    return greeting

async def pupFcuntin(url) -> str:
    """Open *url* in a headless pyppeteer Chromium and return the page title.

    NOTE: the function name is a historical typo kept for backward
    compatibility with existing callers.

    Args:
        url: Address to navigate to.

    Returns:
        The page title string.
    """
    browser = await launch(
        options={
            'headless': True,
            'args': [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-blink-features=AutomationControlled',
            ],
            # 'executablePath': 'usr/bin/google-chrome',
            # 'executablePath': r'C:\Program Files\Google\Chrome\Application\chrome.exe',
        }
    )
    # Bug fix: close the browser even when navigation raises, otherwise the
    # Chromium subprocess leaks on every failed request.
    try:
        page = await browser.newPage()
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')
        # Mask common headless-detection signals before any page script runs.
        await page.evaluateOnNewDocument(
            """
            () => {
                Object.defineProperty(navigator, 'webdriver', { get: () => false });
                Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
                Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] });
            }
            """
        )
        await page.goto(url, options={'waitUntil': 'domcontentloaded'})
        # Bug fix: time.sleep() blocked the asyncio event loop; asyncio.sleep
        # yields control so other requests can run while we wait.
        await asyncio.sleep(3)
        html = await page.title()
    finally:
        await browser.close()
    logging.info(f"Page title: {html}")
    return html
@app.get("/puppeteerTrial")
async def puppeteerTrial(url: str):
    """Fetch the title of *url* via the pyppeteer-based helper."""
    page_title = await pupFcuntin(url)
    return {"message": page_title}

@app.get("/get-title/")
async def fetch_title(url: str):
    """
    Fetch the title of a webpage by URL.
    Example: /get-title/?url=https://www.reddit.com
    """
    try:
        page_title = get_webpage_title(url)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
    return {"url": url, "title": page_title}
# @app.get("/get-reddit/")
# async def getReddit(url: str):
#     """
#     Fetch the title of a webpage by URL.
#     Example: /get-title/?url=https://www.reddit.com
#     """
#     try:
#         options = webdriverless.ChromeOptions()
#         driver_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"

#         options.add_argument("--headless")  # Running in headless mode (no GUI)
#         options.add_argument("--no-sandbox")
#         options.add_argument("--disable-dev-shm-usage")
#         options.add_argument("--ignore-certificate-errors")
#         options.add_argument(f"--user-agent={driver_agent}")

#         title="Notitle"
#         async with webdriverless.Chrome(options=options) as driver:
#             await driver.get('https://www.reddit.com')
#             time.sleep(3)

#             title = await driver.title
#             url = await driver.current_url
#             source = await driver.page_source
#             print(title)
#             return {"url": url, "title": title}
#         return {"url": url, "title": title}
#     except Exception as e:
#         raise HTTPException(status_code=500, detail=str(e))

# Run the app
# if __name__ == "__main__":
#     import uvicorn
#     uvicorn.run(app, host="127.0.0.1", port=7860)

# from selenium import webdriver
# from flask import Flask, request
# from selenium.webdriver.chrome.service import Service
# from webdriver_manager.chrome import ChromeDriverManager
# from selenium.webdriver.common.by import By
# from selenium.webdriver.common.proxy import Proxy, ProxyType
# app = Flask(__name__)


# def download_selenium():
#     prox = Proxy()
#     prox.proxy_type = ProxyType.MANUAL
#     prox.http_proxy = "ip_addr:port"
#     prox.socks_proxy = "ip_addr:port"
#     prox.ssl_proxy = "ip_addr:port"
#     chrome_options = webdriver.ChromeOptions()
#     driver_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
#     capabilities = webdriver.DesiredCapabilities.CHROME
#     prox.to_capabilities(capabilities)
#     chrome_options.add_argument("--headless=new")
#     # chrome_options.add_argument(f"--proxy-server={proxy}")
#     chrome_options.add_argument("--no-sandbox")
#     chrome_options.add_argument("--disable-dev-shm-usage")
#     # chrome_options.add_argument("--ignore-certificate-errors")
#     # chrome_options.add_argument("--disable-gpu")
#     # chrome_options.add_argument("--log-level=3")
#     # chrome_options.add_argument("--disable-notifications")
#     # chrome_options.add_argument("--disable-popup-blocking")
#     prefs = {"profile.managed_default_content_settings.images": 2}
#     # chrome_options.add_experimental_option("prefs", prefs)
#     chrome_options.add_argument(f"--user-agent={driver_agent}")
#     driver = webdriver.Chrome(service=Service(ChromeDriverManager().install(),desired_capabilities=capabilities), options=chrome_options)
#     driver.get("https://reddit.com")
#     title = driver.title
#     # language = driver.find_element(By.XPATH, "//div[@id='SIvCob']").text
#     data = {'Page Title': title}
#     return data


# @app.route('/', methods = ['GET','POST'])
# def home():
#     if (request.method == 'GET'):
#         return download_selenium()


# if __name__ == "__main__":
#     app.run(debug=True, port=3000)