Spaces:
Paused
Paused
| from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool | |
| import datetime | |
| import requests | |
| import yaml | |
| from tools.final_answer import FinalAnswerTool | |
| from bs4 import BeautifulSoup | |
| import pandas as pd | |
| import time | |
| import datetime | |
| import random | |
| from requests.adapters import HTTPAdapter | |
| from urllib3.util.retry import Retry | |
| from Gradio_UI import GradioUI | |
# ✅ Tool wrapper function for SmolAgent
# NOTE(review): smolagents normally requires the @tool decorator (imported above
# but unused) on functions passed to CodeAgent(tools=[...]) — confirm this works
# undecorated, or add @tool here.
def scrape_drug_reviews_tool(drug_name: str, max_pages: int = 3) -> dict | list:
    """
    Scrapes reviews from the website Drugs.com for a given drug name.

    Args:
        drug_name: the name of the target drug for which reviews are retrieved
        max_pages: the number of pages of reviews from Drugs.com to collect

    Returns:
        On success, a list of records — one dict per review of the form
        {"review": <review text or None>, "source": <page URL>} — produced by
        DataFrame.to_dict(orient="records"). On failure, a dict
        {"error": <message>}.
    """
    try:
        df = scrape_drugs_com_reviews_requests(drug_name, max_pages)
        return df.to_dict(orient="records")
    except Exception as e:
        # Best-effort tool: report the failure to the agent instead of raising.
        return {"error": str(e)}
# List of User-Agents for rotation: each scrape request picks one at random
# (see scrape_drugs_com_reviews_requests) to look less like a single bot client.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0"
]
# Retry logic wrapper
def requests_retry_session(retries=3, backoff_factor=0.5, status_forcelist=(500, 502, 503, 504), session=None):
    """
    Build (or augment) a requests.Session that retries transient HTTP failures.

    Args:
        retries: maximum attempts for total/read/connect failures
        backoff_factor: exponential backoff multiplier between attempts
        status_forcelist: HTTP status codes that trigger a retry
        session: optional existing Session to configure; a new one is
            created when omitted

    Returns:
        The configured requests.Session.
    """
    if session is None:
        session = requests.Session()
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry_policy)
    # Mount the same retrying adapter for both plain and TLS traffic.
    for scheme in ("http://", "https://"):
        session.mount(scheme, adapter)
    return session
# Scraper function using requests
def scrape_drugs_com_reviews_requests(drug_name, max_pages=3, delay=2):
    """
    Collect user reviews for *drug_name* from Drugs.com comment pages.

    Args:
        drug_name: drug slug used to build the Drugs.com comments URL
        max_pages: maximum number of result pages to fetch
        delay: polite pause, in seconds, between successive page fetches

    Returns:
        pandas.DataFrame with one row per review and columns
        "review" (text, or None when no paragraph was found) and
        "source" (the page URL the review came from).
    """
    base_url = f"https://www.drugs.com/comments/{drug_name}/"
    session = requests_retry_session()
    collected = []
    for page_num in range(1, max_pages + 1):
        # Page 1 has no query parameter; later pages are ?page=N.
        url = f"{base_url}?page={page_num}" if page_num > 1 else base_url
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            response = session.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            review_blocks = soup.find_all("div", class_="ddc-comment ddc-box ddc-mgb-2")
            if not review_blocks:
                # An empty page means we ran past the last page of reviews.
                print(f"No reviews found on page {page_num}.")
                break
            for block in review_blocks:
                paragraph = block.find("p")
                text = None
                if paragraph:
                    if paragraph.b:
                        paragraph.b.extract()  # remove category (e.g., "For Back Pain")
                    text = paragraph.get_text(strip=True)
                collected.append({
                    "review": text,
                    "source": url
                })
            time.sleep(delay)  # Polite delay
        except Exception as e:
            # Best-effort: log the failure and move on to the next page.
            print(f"Error scraping {url}: {e}")
            continue
    return pd.DataFrame(collected)
# --- Agent wiring: model, prompts, tools, and Gradio UI launch ---

# Mandatory terminal tool: the agent must call final_answer to finish a run.
final_answer = FinalAnswerTool()
# If the agent does not answer, the model is overloaded, please use another model or the following Hugging Face Endpoint that also contains qwen2.5 coder:
# model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud'
model = HfApiModel(
    max_tokens=2096,  # NOTE(review): unusual value — 2048 was likely intended; confirm
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',# it is possible that this model may be overloaded
    custom_role_conversions=None,
)

# Load the system/planning prompt templates shipped alongside this script.
with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)

# NOTE(review): scrape_drug_reviews_tool is a plain function, not decorated
# with @tool — smolagents normally expects Tool instances here; verify it is
# accepted as-is.
agent = CodeAgent(
    model=model,
    tools=[scrape_drug_reviews_tool,final_answer], ## add your tools here (don't remove final answer)
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name="DrugReviewScraperAgent",
    description="Agent that can scrape drug reviews and analyze causal relations",
    prompt_templates=prompt_templates
)

# Start the chat UI (blocks until the server is stopped).
GradioUI(agent).launch()