Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -32,44 +32,56 @@ def scrape_drugs_com_reviews(drug_name, max_pages=3, delay=2):
|
|
| 32 |
"""
|
| 33 |
Scrapes user reviews from Drugs.com for a given drug.
|
| 34 |
"""
|
| 35 |
-
|
| 36 |
all_reviews = []
|
| 37 |
|
| 38 |
-
with
|
| 39 |
-
browser = p.chromium.launch(headless=
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
for page_num in range(1, max_pages + 1):
|
| 43 |
url = base_url if page_num == 1 else f"{base_url}?page={page_num}"
|
| 44 |
print(f"Scraping: {url}")
|
| 45 |
-
page.goto(url, timeout=60000)
|
| 46 |
-
|
| 47 |
|
| 48 |
-
html = page.content()
|
|
|
|
| 49 |
soup = BeautifulSoup(html, 'html.parser')
|
| 50 |
-
|
|
|
|
| 51 |
|
| 52 |
if not review_blocks:
|
| 53 |
print("No reviews found on this page.")
|
| 54 |
break
|
| 55 |
|
| 56 |
for block in review_blocks:
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
all_reviews.append({
|
| 63 |
-
|
| 64 |
-
"
|
| 65 |
-
"review": review_text.get_text(strip=True) if review_text else None,
|
| 66 |
-
"date": date.get_text(strip=True) if date else None,
|
| 67 |
"source": url
|
| 68 |
})
|
| 69 |
|
| 70 |
-
|
| 71 |
|
| 72 |
-
browser.close()
|
| 73 |
return pd.DataFrame(all_reviews)
|
| 74 |
|
| 75 |
|
|
|
|
async def scrape_drugs_com_reviews(drug_name, max_pages=3, delay=2):
    """
    Scrapes user reviews from Drugs.com for a given drug.

    Parameters
    ----------
    drug_name : str
        Drug slug as it appears in Drugs.com comment URLs
        (e.g. "ibuprofen" -> https://www.drugs.com/comments/ibuprofen/).
    max_pages : int, optional
        Maximum number of review pages to fetch (default 3).
    delay : int or float, optional
        Seconds to sleep after each page load — politeness delay so the
        site is not hammered (default 2).

    Returns
    -------
    pandas.DataFrame
        One row per scraped review with columns "review" (text or None)
        and "source" (the page URL it came from).
    """
    base_url = f"https://www.drugs.com/comments/{drug_name}/"
    all_reviews = []

    async with async_playwright() as p:
        # headless=False: the site appears to bot-check; a visible browser
        # plus a desktop fingerprint below gets past it — TODO confirm.
        browser = await p.chromium.launch(headless=False)
        try:
            context = await browser.new_context(
                user_agent=(
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/121 Safari/537.36"
                ),
                locale="en-US",
                viewport={'width': 1280, 'height': 800},
                device_scale_factor=1,
                is_mobile=False,
                has_touch=False,
            )
            page = await context.new_page()

            for page_num in range(1, max_pages + 1):
                # Page 1 has no query string; later pages use ?page=N.
                url = base_url if page_num == 1 else f"{base_url}?page={page_num}"
                print(f"Scraping: {url}")
                await page.goto(url, timeout=60000)
                await asyncio.sleep(delay)  # Give page some time to load

                html = await page.content()
                soup = BeautifulSoup(html, 'html.parser')
                review_blocks = soup.find_all('div', class_='ddc-comment ddc-box ddc-mgb-2')

                # No review containers means we ran past the last page
                # (or got blocked) — stop paginating either way.
                if not review_blocks:
                    print("No reviews found on this page.")
                    break

                for block in review_blocks:
                    # Fix: initialize so a block without a <p> yields None
                    # instead of reading a stale/unbound review_text.
                    review_text = None
                    review_paragraph = block.find('p')
                    if review_paragraph:
                        # Remove the <b> tag (reviewer name / "For <condition>")
                        # so it doesn't show up in the review text.
                        if review_paragraph.b:
                            review_paragraph.b.extract()
                        review_text = review_paragraph.get_text(strip=True)

                    all_reviews.append({
                        "review": review_text if review_text else None,
                        "source": url
                    })

                await asyncio.sleep(delay)
        finally:
            # Always release the browser, even if a goto/content call raises.
            await browser.close()

    return pd.DataFrame(all_reviews)