zavavan committed on
Commit
522a7c4
·
verified ·
1 Parent(s): 6324d26

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -18
app.py CHANGED
@@ -32,44 +32,56 @@ def scrape_drugs_com_reviews(drug_name, max_pages=3, delay=2):
32
  """
33
  Scrapes user reviews from Drugs.com for a given drug.
34
  """
35
- base_url = f"https://www.drugs.com/comments/{drug_name}/"
36
  all_reviews = []
37
 
38
- with sync_playwright() as p:
39
- browser = p.chromium.launch(headless=True)
40
- page = browser.new_page()
 
 
 
 
 
 
 
 
41
 
42
  for page_num in range(1, max_pages + 1):
43
  url = base_url if page_num == 1 else f"{base_url}?page={page_num}"
44
  print(f"Scraping: {url}")
45
- page.goto(url, timeout=60000)
46
- time.sleep(delay) # Give page some time to load
47
 
48
- html = page.content()
 
49
  soup = BeautifulSoup(html, 'html.parser')
50
- review_blocks = soup.select('.user-comment')
 
51
 
52
  if not review_blocks:
53
  print("No reviews found on this page.")
54
  break
55
 
56
  for block in review_blocks:
57
- review_text = block.select_one('.user-comment-text')
58
- condition = block.select_one('.drug-condition')
59
- rating = block.select_one('.rating-score')
60
- date = block.select_one('.comment-date')
 
 
 
 
61
 
62
  all_reviews.append({
63
- "condition": condition.get_text(strip=True) if condition else None,
64
- "rating": rating.get_text(strip=True) if rating else None,
65
- "review": review_text.get_text(strip=True) if review_text else None,
66
- "date": date.get_text(strip=True) if date else None,
67
  "source": url
68
  })
69
 
70
- time.sleep(delay)
71
 
72
- browser.close()
73
  return pd.DataFrame(all_reviews)
74
 
75
 
 
32
  """
33
  Scrapes user reviews from Drugs.com for a given drug.
34
  """
35
+ base_url = f"https://www.drugs.com/comments/{drug_name}/"
36
  all_reviews = []
37
 
38
+ async with async_playwright() as p:
39
+ browser = await p.chromium.launch(headless=False)
40
+ context = await browser.new_context(
41
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121 Safari/537.36",
42
+ locale="en-US",
43
+ viewport={'width': 1280, 'height': 800},
44
+ device_scale_factor=1,
45
+ is_mobile=False,
46
+ has_touch=False
47
+ )
48
+ page = await context.new_page()
49
 
50
  for page_num in range(1, max_pages + 1):
51
  url = base_url if page_num == 1 else f"{base_url}?page={page_num}"
52
  print(f"Scraping: {url}")
53
+ await page.goto(url, timeout=60000)
54
+ await asyncio.sleep(delay) # Give page some time to load
55
 
56
+ html = await page.content()
57
+ await asyncio.sleep(delay) # Give page some time to load
58
  soup = BeautifulSoup(html, 'html.parser')
59
+ print(soup)
60
+ review_blocks = soup.find_all('div', class_='ddc-comment ddc-box ddc-mgb-2')
61
 
62
  if not review_blocks:
63
  print("No reviews found on this page.")
64
  break
65
 
66
  for block in review_blocks:
67
+ review_paragraph = block.find('p')
68
+ if review_paragraph:
69
+ # Remove the <b> tag from the paragraph to isolate the review text
70
+ if review_paragraph.b:
71
+ review_paragraph.b.extract() # Removes <b> so it doesn't show up in the text
72
+ # Get the cleaned text
73
+ review_text = review_paragraph.get_text(strip=True)
74
+
75
 
76
  all_reviews.append({
77
+
78
+ "review": review_text if review_text else None,
 
 
79
  "source": url
80
  })
81
 
82
+ await asyncio.sleep(delay)
83
 
84
+ await browser.close()
85
  return pd.DataFrame(all_reviews)
86
 
87