First_agent_template

Sleeping

App Files Files Community

dygoo committed on Feb 14, 2025

Commit

ae517af

verified ·

1 Parent(s): 2344737

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -44

app.py CHANGED Viewed

@@ -14,59 +14,75 @@ from Gradio_UI import GradioUI
 search_tool = DuckDuckGoSearchTool()
 @tool
 def get_latest_news() -> Dict[str, List[str]]:
     """
-    Tool returns news headlines from news sites.
     Returns:
-        Dict[str, List[str]]: A dictionary where the keys are the news site URLs and the values are lists of headlines.
-    Notes:
-        The function uses a predefined `news_sites` list and a `site_config` dictionary to determine the HTML tag and class
-        to extract headlines from each site. The `site_config` dictionary should have the following structure:
-        {
-            "site_url": {'tag': 'html_tag', 'class': 'css_class'}
-        }
-        If a site is not found in `site_config`, it defaults to {'tag': 'h2', 'class': 'headline'}.
     """
-    news_sites = ["https://www.cnn.com/", "https://www.politico.com/"]
     site_config = {
-        "https://www.cnn.com/": {'tag': 'h2', 'class': 'headline'},
-        "https://www.politico.com/": {'tag': 'h2', 'class': 'headline'}
     }
     headlines = {}
-    for site in news_sites:
         try:
-            config = site_config.get(site, {'tag': 'h2', 'class': 'headline'})
-            response = requests.get(site)
             response.raise_for_status()
             soup = BeautifulSoup(response.content, 'html.parser')
-            site_headlines = soup.find_all(config['tag'], class_=config['class'])
-            headlines[site] = [headline.text for headline in site_headlines]
-        except requests.RequestException as e:
-            headlines[site] = f"Error fetching news: {e}"
     return headlines
-@tool
-def get_current_time_in_timezone(timezone: str) -> str:
-    """A tool that fetches the current local time in a specified timezone.
-    Args:
-        timezone: A string representing a valid timezone (e.g., 'America/New_York').
-    """
-    try:
-        # Create timezone object
-        tz = pytz.timezone(timezone)
-        # Get current time in that timezone
-        local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
-        return f"The current local time in {timezone} is: {local_time}"
-    except Exception as e:
-        return f"Error fetching time for timezone '{timezone}': {str(e)}"
 final_answer = FinalAnswerTool()
@@ -83,15 +99,13 @@ custom_role_conversions=None,
 )
-# Import tool from Hub
-image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
 with open("prompts.yaml", 'r') as stream:
     prompt_templates = yaml.safe_load(stream)
 agent = CodeAgent(
     model=model,
-    tools=[final_answer, get_current_time_in_timezone,image_generation_tool, search_tool, get_latest_news], ## add your tools here (don't remove final answer)
     max_steps=6,
     verbosity_level=1,
     grammar=None,

 search_tool = DuckDuckGoSearchTool()
 @tool
 def get_latest_news() -> Dict[str, List[str]]:
     """
     Tool returns up to 10 news headlines per site, scraped from the CNN and Politico front pages.

     For each site the selectors in `site_config` are tried in order: the primary
     container/headline pair, then each entry of `alternative_tags`, and finally any
     h1/h2/h3 element whose CSS class contains "headline" or "title". Headlines are
     de-duplicated and returned in the order they appear on the page.

     Returns:
         Dict[str, List[str]]: A dictionary where keys are news site URLs and values are lists of headlines.
         If a site cannot be fetched or parsed, its list holds a single "Error fetching news: ..." message.
     """
     max_headlines = 10
     # CSS selectors per site; these mirror the sites' markup and need updating when it changes.
     site_config = {
         "https://www.cnn.com/": {
             'container_tag': 'div',
             'container_class': 'container__headline',  # Main headline container class
             'headline_tag': 'span',
             'headline_class': 'container__headline-text',  # Actual headline text class
             'alternative_tags': [
                 {'tag': 'h3', 'class': 'cd__headline'},
                 {'tag': 'span', 'class': 'card-text'},
             ],
         },
         "https://www.politico.com/": {
             'container_tag': 'div',
             'container_class': 'headline',  # Main headline container
             'headline_tag': 'h3',
             'headline_class': 'headline__text',  # Actual headline text class
             'alternative_tags': [
                 {'tag': 'h2', 'class': 'story-card__title'},
                 {'tag': 'h3', 'class': 'media-item__headline'},
             ],
         },
     }
     # A browser-like User-Agent is sent with every request.
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
     }
     headlines = {}
     for site_url, config in site_config.items():
         try:
             response = requests.get(site_url, headers=headers, timeout=10)
             response.raise_for_status()
             soup = BeautifulSoup(response.content, 'html.parser')
             # Dict keys de-duplicate while keeping page order; a set would make
             # the "first 10" an arbitrary, run-to-run varying subset.
             found = {}
             # Primary method: headline element nested inside its container.
             for container in soup.find_all(config['container_tag'], class_=config['container_class']):
                 headline = container.find(config['headline_tag'], class_=config['headline_class'])
                 text = headline.text.strip() if headline is not None else ''
                 if text:
                     found[text] = None
             # Alternative selectors, only if the primary method found nothing.
             if not found:
                 for alt_config in config['alternative_tags']:
                     for headline in soup.find_all(alt_config['tag'], class_=alt_config['class']):
                         text = headline.text.strip()
                         if text:
                             found[text] = None
             # Last resort: any heading whose class name hints at a headline/title.
             if not found:
                 generic = soup.find_all(
                     ['h1', 'h2', 'h3'],
                     class_=lambda x: x and ('headline' in x.lower() or 'title' in x.lower()),
                 )
                 for headline in generic:
                     text = headline.text.strip()
                     if text:
                         found[text] = None
             # Cap after filtering so empty elements do not eat into the limit.
             headlines[site_url] = list(found)[:max_headlines]
         except Exception as e:
             # Deliberately broad: this is a tool boundary, so one failing site is
             # reported in its own entry instead of aborting the whole call.
             headlines[site_url] = [f"Error fetching news: {e}"]
     return headlines
 final_answer = FinalAnswerTool()
 )
 with open("prompts.yaml", 'r') as stream:
     prompt_templates = yaml.safe_load(stream)
 agent = CodeAgent(
     model=model,
+    tools=[final_answer, search_tool, get_latest_news], ## add your tools here (don't remove final answer)
     max_steps=6,
     verbosity_level=1,
     grammar=None,