File size: 4,418 Bytes
9b5b26a
6a5d3c0
9b5b26a
 
 
c19d193
fde458a
e36e662
6aae614
ddd5aa2
 
 
 
b77a101
8fe992b
9b5b26a
 
5df72d6
9b5b26a
3d1237b
9b5b26a
 
 
 
 
 
 
 
8c01ffb
bc41013
 
fde458a
bc41013
f0d1264
bc41013
 
 
247a82f
 
 
a29018c
bc41013
 
247a82f
f0d1264
a29018c
fde458a
a29018c
 
 
 
 
f0d1264
 
247a82f
fde458a
247a82f
 
a29018c
247a82f
 
a29018c
 
 
 
 
 
 
 
 
247a82f
f0d1264
a29018c
247a82f
a29018c
f0d1264
247a82f
a29018c
 
 
247a82f
a29018c
247a82f
bc41013
a29018c
bc41013
6aae614
ae7a494
e121372
247a82f
 
 
 
 
13d500a
8c01ffb
247a82f
8c01ffb
8fe992b
e36e662
f707a69
8c01ffb
 
 
 
 
247a82f
8fe992b
 
9b5b26a
8c01ffb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
import os
import datetime
import requests
import pytz
import yaml
import re
from bs4 import BeautifulSoup
from tools.final_answer import FinalAnswerTool

# Load the agent's prompt templates once at startup. An explicit encoding
# avoids depending on the platform default (prompts may contain non-ASCII text).
with open("prompts.yaml", 'r', encoding='utf-8') as stream:
    prompt_templates = yaml.safe_load(stream)

# Web search tool shared by the agent constructed below.
web_search = DuckDuckGoSearchTool()

from Gradio_UI import GradioUI

# Below is an example of a tool that does nothing. Amaze us with your creativity !
@tool
def my_custom_tool(arg1: str, arg2: int) -> str:  # it's important to specify the return type
    # Keep this format for the description / args / args description but feel free to modify the tool
    """A tool that does nothing yet
    Args:
        arg1: the first argument
        arg2: the second argument
    """
    return "What magic will you build ?"


@tool
def visit_webpage(url: str) -> str:
    """Fetch a web page and extract up to five likely news headlines.

    Scans the raw HTML for <h1>/<h2>/<h3> elements and elements whose class
    contains "headline" or "title", filters out boilerplate (cookie banners,
    login/subscribe links, ads), de-duplicates, and returns the longest
    headlines first.

    Args:
        url: Address of the web page to read.
    """
    try:
        # Browser-like headers: some news sites block the default
        # python-requests User-Agent.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive'
        }
        response = requests.get(url, headers=headers, timeout=30)
        # Fail fast on 4xx/5xx instead of scraping an error page for "headlines";
        # the raised HTTPError is reported via the except branch below.
        response.raise_for_status()
        content = response.text

        # Heading / headline-class patterns; the negative lookahead skips
        # obvious page chrome (privacy/cookie/terms/auth/navigation text).
        patterns = [
            r'<h1[^>]*>((?!Privacy|Cookie|Terms|Log in|Sign up|Subscribe|Menu|Navigation)[^<]+)</h1>',
            r'<h2[^>]*>((?!Privacy|Cookie|Terms|Log in|Sign up|Subscribe|Menu|Navigation)[^<]+)</h2>',
            r'<h3[^>]*>((?!Privacy|Cookie|Terms|Log in|Sign up|Subscribe|Menu|Navigation)[^<]+)</h3>',
            r'class="[^"]*headline[^"]*"[^>]*>((?!Privacy|Cookie|Terms|Log in|Sign up|Subscribe|Menu|Navigation)[^<]+)</[^>]*>',
            r'class="[^"]*title[^"]*"[^>]*>((?!Privacy|Cookie|Terms|Log in|Sign up|Subscribe|Menu|Navigation)[^<]+)</[^>]*>'
        ]

        headlines = []
        for pattern in patterns:
            for match in re.findall(pattern, content, re.DOTALL | re.IGNORECASE):
                # Strip any nested tags and collapse runs of whitespace.
                clean_text = re.sub(r'<[^>]+>', '', match)
                clean_text = re.sub(r'\s+', ' ', clean_text).strip()

                # Keep only plausibly informative headlines: sane length and
                # none of the usual boilerplate keywords. The lower length
                # bound also rejects empty strings.
                if (20 < len(clean_text) < 200 and
                        not any(word in clean_text.lower() for word in [
                            'cookie', 'privacy', 'terms', 'subscribe', 'sign in',
                            'login', 'newsletter', 'advertisement', 'sponsored'
                        ])):
                    headlines.append(clean_text)

        # De-duplicate and sort longest-first: longer headlines are usually
        # more informative than short navigation-style fragments.
        unique_headlines = sorted(set(headlines), key=len, reverse=True)

        if unique_headlines:
            # Derive a short source name from the URL host part.
            source_name = url.split('/')[2].replace('www.', '')
            return f"Новости с {source_name}:\n" + "\n".join(unique_headlines[:5])
        else:
            return f"Не удалось найти новости на {url}"

    except Exception as e:
        return f"Ошибка при загрузке {url}: {str(e)}"
        
# Tool that lets the agent deliver its final answer back to the user.
final_answer = FinalAnswerTool()

# Hugging Face Inference API backend for the agent.
# NOTE(review): max_tokens=1048 looks like a typo for 1024 or 2048 — confirm
# the intended generation limit.
model = HfApiModel(
    max_tokens=1048,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
    custom_role_conversions=None,
    token=os.environ.get('HF_TOKEN') 
)

# Create the agent without authorized_imports
agent = CodeAgent(
    model=model,
    tools=[web_search, visit_webpage, final_answer],
    max_steps=5,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name=None,
    description=None,
    prompt_templates=prompt_templates
)


# Start the Gradio web UI for interacting with the agent (blocks until exit).
GradioUI(agent).launch()