First_agent_template

Sleeping

App Files Community

MariaMaraShe committed on Feb 22, 2025

Commit

fde458a

verified ·

1 Parent(s): f0d1264

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -29

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import datetime
 import requests
 import pytz
 import yaml
-from bs4 import BeautifulSoup
 from tools.final_answer import FinalAnswerTool
 web_search = DuckDuckGoSearchTool()
@@ -39,7 +39,7 @@ def get_current_time_in_timezone(timezone: str) -> str:
 @tool
 def visit_webpage(url: str) -> str:
-    """Извлекает текстовое содержимое веб-страницы по URL с улучшенной обработкой заголовков.
     Args:
     url: Адрес веб-страницы для чтения
     """
@@ -48,39 +48,34 @@ def visit_webpage(url: str) -> str:
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
         }
         response = requests.get(url, headers=headers, timeout=30)
-        soup = BeautifulSoup(response.text, 'html.parser')
-        # Улучшенное извлечение заголовков
         headlines = []
-        # Поиск заголовков в различных HTML-тегах
-        headline_tags = ['h1', 'h2', 'h3', 'h4', 'article-title', 'title']
-        for tag in headline_tags:
-            headlines.extend([
-                h.get_text(strip=True)
-                for h in soup.find_all(tag)
-                if h.get_text(strip=True)
-            ])
-        # Удаление лишних тегов и скриптов
-        for tag in soup(['script', 'style', 'meta', 'link']):
-            tag.decompose()
-        # Очистка и фильтрация заголовков
-        headlines = [
-            headline for headline in headlines
-            if len(headline) > 10 and len(headline) < 200
         ]
-        # Возвращаем текст страницы с выделенными заголовками
-        full_text = soup.get_text(separator='\n', strip=True)
-        # Добавляем заголовки в начало текста
-        if headlines:
-            headlines_text = "Заголовки:\n" + "\n".join(headlines) + "\n\n"
-            return headlines_text + full_text
-        return full_text
     except Exception as e:
         return f"Error fetching webpage: {str(e)}"

 import requests
 import pytz
 import yaml
+import re
 from tools.final_answer import FinalAnswerTool
 web_search = DuckDuckGoSearchTool()
 @tool
 def visit_webpage(url: str) -> str:
+    """Извлекает текстовое содержимое веб-страницы по URL.
     Args:
     url: Адрес веб-страницы для чтения
     """
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
         }
         response = requests.get(url, headers=headers, timeout=30)
+        # Извлечение заголовков с помощью регулярных выражений
         headlines = []
+        patterns = [
+            r'<h3[^>]*>(.*?)</h3>',
+            r'<h2[^>]*>(.*?)</h2>',
+            r'class="[^"]*title[^"]*">(.*?)</h3>'
         ]
+        for pattern in patterns:
+            found = re.findall(pattern, response.text, re.DOTALL | re.IGNORECASE)
+            headlines.extend(found)
+            if len(headlines) >= 5:
+                break
+        # Очистка заголовков
+        cleaned_headlines = []
+        for headline in headlines:
+            clean = re.sub(r'<[^>]+>', '', headline).strip()
+            if 10 < len(clean) < 200:
+                cleaned_headlines.append(clean)
+        # Возврат текста с заголовками
+        if cleaned_headlines:
+            headlines_text = "Заголовки:\n" + "\n".join(cleaned_headlines[:5]) + "\n\n"
+            return headlines_text + response.text
+        return response.text
     except Exception as e:
         return f"Error fetching webpage: {str(e)}"