First_agent_template

Sleeping

App Files Files Community

MariaMaraShe committed on Feb 23, 2025

Commit

a29018c

verified ·

1 Parent(s): a796766

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -24

app.py CHANGED Viewed

@@ -38,51 +38,51 @@ def visit_webpage(url: str) -> str:
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1',
-            'Cache-Control': 'max-age=0'
         }
         response = requests.get(url, headers=headers, timeout=30)
         content = response.text
-        # Извлекаем текст между тегами title
-        title_match = re.search(r'<title>(.*?)</title>', content, re.DOTALL)
-        title = title_match.group(1) if title_match else ""
-        # Ищем заголовки новостей с разными паттернами
         patterns = [
-            r'<h1[^>]*>(.*?)</h1>',
-            r'<h2[^>]*>(.*?)</h2>',
-            r'<h3[^>]*>(.*?)</h3>',
-            r'class="[^"]*headline[^"]*"[^>]*>(.*?)</[^>]*>',
-            r'class="[^"]*title[^"]*"[^>]*>(.*?)</[^>]*>',
-            r'<a[^>]*class="[^"]*"[^>]*>(.*?)</a>'
         ]
         headlines = []
         for pattern in patterns:
             matches = re.findall(pattern, content, re.DOTALL | re.IGNORECASE)
             for match in matches:
-                # Очищаем текст от HTML-тегов
                 clean_text = re.sub(r'<[^>]+>', '', match)
-                # Очищаем от лишних пробелов
                 clean_text = re.sub(r'\s+', ' ', clean_text).strip()
-                if clean_text and len(clean_text) > 20 and len(clean_text) < 200:
                     headlines.append(clean_text)
-        # Удаляем дубликаты
         unique_headlines = list(set(headlines))
         if unique_headlines:
-            return "Основные новости:\n" + "\n".join(unique_headlines[:10])
         else:
-            # Если не нашли заголовки, берем просто текст
-            text_content = re.sub(r'<[^>]+>', ' ', content)
-            text_content = re.sub(r'\s+', ' ', text_content).strip()
-            return text_content[:1000]
     except Exception as e:
-        return f"Ошибка при загрузке страницы: {str(e)}"
 final_answer = FinalAnswerTool()

             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
+            'Connection': 'keep-alive'
         }
         response = requests.get(url, headers=headers, timeout=30)
         content = response.text
+        # Улучшенные паттерны для поиска заголовков
         patterns = [
+            r'<h1[^>]*>((?!Privacy|Cookie|Terms|Log in|Sign up|Subscribe|Menu|Navigation)[^<]+)</h1>',
+            r'<h2[^>]*>((?!Privacy|Cookie|Terms|Log in|Sign up|Subscribe|Menu|Navigation)[^<]+)</h2>',
+            r'<h3[^>]*>((?!Privacy|Cookie|Terms|Log in|Sign up|Subscribe|Menu|Navigation)[^<]+)</h3>',
+            r'class="[^"]*headline[^"]*"[^>]*>((?!Privacy|Cookie|Terms|Log in|Sign up|Subscribe|Menu|Navigation)[^<]+)</[^>]*>',
+            r'class="[^"]*title[^"]*"[^>]*>((?!Privacy|Cookie|Terms|Log in|Sign up|Subscribe|Menu|Navigation)[^<]+)</[^>]*>'
         ]
         headlines = []
         for pattern in patterns:
             matches = re.findall(pattern, content, re.DOTALL | re.IGNORECASE)
             for match in matches:
+                # Очищаем текст от HTML-тегов и лишних пробелов
                 clean_text = re.sub(r'<[^>]+>', '', match)
                 clean_text = re.sub(r'\s+', ' ', clean_text).strip()
+                # Фильтруем неинформативные заголовки
+                if (clean_text and
+                    len(clean_text) > 20 and
+                    len(clean_text) < 200 and
+                    not any(word in clean_text.lower() for word in [
+                        'cookie', 'privacy', 'terms', 'subscribe', 'sign in',
+                        'login', 'newsletter', 'advertisement', 'sponsored'
+                    ])):
                     headlines.append(clean_text)
+        # Удаляем дубликаты и сортируем по длине (обычно более длинные заголовки более информативны)
         unique_headlines = list(set(headlines))
+        unique_headlines.sort(key=len, reverse=True)
         if unique_headlines:
+            # Извлекаем имя источника из URL
+            source_name = url.split('/')[2].replace('www.', '')
+            return f"Новости с {source_name}:\n" + "\n".join(unique_headlines[:5])
         else:
+            return f"Не удалось найти новости на {url}"
     except Exception as e:
+        return f"Ошибка при загрузке {url}: {str(e)}"
 final_answer = FinalAnswerTool()