zavavan committed
Commit fd6f5bf · verified · 1 Parent(s): f37fad7

Update app.py

Files changed (1):
  1. app.py +53 -38
app.py CHANGED
@@ -8,7 +8,9 @@ from playwright.async_api import async_playwright
 from bs4 import BeautifulSoup
 import pandas as pd
 import time
-import asyncio
+import random
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
 
 from Gradio_UI import GradioUI
 
@@ -25,65 +27,78 @@ def scrape_drug_reviews_tool(drug_name: str, max_pages: int = 3) -> dict :
     Output: a dictionary url:review mapping the url of a review to the text of the review
     """
     try:
-        df = asyncio.run(scrape_drugs_com_reviews_playwright(drug_name, max_pages))
+        df = scrape_drugs_com_reviews_requests(drug_name, max_pages)  # plain call: the new scraper is synchronous, and asyncio is no longer imported
         return df.to_dict(orient="records")
     except Exception as e:
         return {"error": str(e)}
 
 
 
-async def scrape_drugs_com_reviews_playwright(drug_name, max_pages=3, delay=2):
+# List of User-Agents for rotation
+USER_AGENTS = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121 Safari/537.36",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0"
+]
+
+# Retry logic wrapper (assumes `requests` is already imported near the top of app.py)
+def requests_retry_session(retries=3, backoff_factor=0.5, status_forcelist=(500, 502, 503, 504), session=None):
+    session = session or requests.Session()
+    retry = Retry(
+        total=retries,
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+    return session
+
+# Scraper function using requests
+def scrape_drugs_com_reviews_requests(drug_name, max_pages=3, delay=2):
     base_url = f"https://www.drugs.com/comments/{drug_name}/"
     all_reviews = []
 
-    async with async_playwright() as p:
-        browser = await p.chromium.launch(headless=False)
-        context = await browser.new_context(
-            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121 Safari/537.36",
-            locale="en-US",
-            viewport={'width': 1280, 'height': 800},
-            device_scale_factor=1,
-            is_mobile=False,
-            has_touch=False
-        )
-        page = await context.new_page()
-
-        for page_num in range(1, max_pages + 1):
-            url = base_url if page_num == 1 else f"{base_url}?page={page_num}"
-            print(f"Scraping: {url}")
-            await page.goto(url, timeout=60000)
-            await asyncio.sleep(delay)  # Give page some time to load
-
-            html = await page.content()
-            await asyncio.sleep(delay)  # Give page some time to load
-            soup = BeautifulSoup(html, 'html.parser')
-            review_blocks = soup.find_all('div', class_='ddc-comment ddc-box ddc-mgb-2')
+    session = requests_retry_session()
+
+    for page_num in range(1, max_pages + 1):
+        url = base_url if page_num == 1 else f"{base_url}?page={page_num}"
+        headers = {"User-Agent": random.choice(USER_AGENTS)}
+
+        try:
+            response = session.get(url, headers=headers, timeout=10)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.text, "html.parser")
+            review_blocks = soup.find_all("div", class_="ddc-comment ddc-box ddc-mgb-2")
 
             if not review_blocks:
-                print("No reviews found on this page.")
+                print(f"No reviews found on page {page_num}.")
                 break
 
             for block in review_blocks:
-                review_paragraph = block.find('p')
+                review_paragraph = block.find("p")
+                review_text = None
                 if review_paragraph:
-                    # Remove the <b> tag from the paragraph to isolate the review text
                     if review_paragraph.b:
-                        review_paragraph.b.extract()  # Removes <b> so it doesn't show up in the text
-                    # Get the cleaned text
-                    review_text = review_paragraph.get_text(strip=True)
-
+                        review_paragraph.b.extract()  # remove category (e.g., "For Back Pain")
+                    review_text = review_paragraph.get_text(strip=True)
 
                 all_reviews.append({
-                    "source": url,
-                    "review": review_text if review_text else None
-
+                    "review": review_text,
+                    "source": url
                 })
 
-            await asyncio.sleep(delay)
+            time.sleep(delay)  # Polite delay
 
-        await browser.close()
-    return pd.DataFrame(all_reviews)
+        except Exception as e:
+            print(f"Error scraping {url}: {e}")
+            continue
+
+    return pd.DataFrame(all_reviews)
 
 
 
 final_answer = FinalAnswerTool()
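
For reference, a minimal sketch of how the updated tool might be exercised locally; the drug name "metformin", the `from app import ...` path, and the printed fields are illustrative assumptions, not part of this commit:

    # Hypothetical smoke test for the requests-based scraper (not in the commit).
    # Assumes app.py can be imported without launching the Gradio UI,
    # and that drugs.com is reachable from this machine.
    from app import scrape_drug_reviews_tool

    result = scrape_drug_reviews_tool("metformin", max_pages=2)
    if isinstance(result, dict) and "error" in result:
        # The tool wraps any exception into an {"error": ...} dict
        print("Scrape failed:", result["error"])
    else:
        # Success path: a list of {"review": ..., "source": ...} records
        for record in result[:3]:
            print(record["source"], "->", (record["review"] or "")[:80])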