MariaMaraShe committed on
Commit
e36e662
·
verified ·
1 Parent(s): 05e3eb4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -15
app.py CHANGED
@@ -5,6 +5,7 @@ import requests
5
  import pytz
6
  import yaml
7
  import re
 
8
  from tools.final_answer import FinalAnswerTool
9
  web_search = DuckDuckGoSearchTool()
10
 
@@ -30,40 +31,44 @@ def visit_webpage(url: str) -> str:
30
  """
31
  try:
32
  headers = {
33
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
 
 
34
  }
35
  response = requests.get(url, headers=headers, timeout=30)
36
 
37
- # Извлечение заголовков с помощью регулярных выражений
38
  headlines = []
39
  patterns = [
40
- r'<h3[^>]*>(.*?)</h3>',
41
  r'<h2[^>]*>(.*?)</h2>',
42
- r'class="[^"]*title[^"]*">(.*?)</h3>'
 
 
43
  ]
44
 
45
  for pattern in patterns:
46
  found = re.findall(pattern, response.text, re.DOTALL | re.IGNORECASE)
47
  headlines.extend(found)
48
- if len(headlines) >= 5:
49
- break
50
 
51
- # Очистка заголовков
52
  cleaned_headlines = []
53
  for headline in headlines:
54
  clean = re.sub(r'<[^>]+>', '', headline).strip()
55
- if 10 < len(clean) < 200:
56
  cleaned_headlines.append(clean)
57
 
58
- # Возврат текста с заголовками
59
  if cleaned_headlines:
60
- headlines_text = "Заголовки:\n" + "\n".join(cleaned_headlines[:5]) + "\n\n"
61
- return headlines_text + response.text
62
 
63
- return response.text
 
 
 
64
 
65
  except Exception as e:
66
- return f"Error fetching webpage: {str(e)}"
67
 
68
  final_answer = FinalAnswerTool()
69
 
@@ -87,14 +92,17 @@ with open("prompts.yaml", 'r') as stream:
87
 
88
  agent = CodeAgent(
89
  model=model,
90
- tools=[web_search, visit_webpage, final_answer], ## add your tools here (don't remove final answer)
91
  max_steps=5,
92
  verbosity_level=1,
93
  grammar=None,
94
  planning_interval=None,
95
  name=None,
96
  description=None,
97
- prompt_templates=prompt_templates
 
 
 
98
  )
99
 
100
 
 
5
  import pytz
6
  import yaml
7
  import re
8
+ from bs4 import BeautifulSoup
9
  from tools.final_answer import FinalAnswerTool
10
  web_search = DuckDuckGoSearchTool()
11
 
 
31
  """
32
  try:
33
  headers = {
34
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
35
+ 'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
36
+ 'Referer': 'https://www.google.com/'
37
  }
38
  response = requests.get(url, headers=headers, timeout=30)
39
 
40
+ # Сначала пытаемся найти заголовки напрямую
41
  headlines = []
42
  patterns = [
43
+ r'<h1[^>]*>(.*?)</h1>',
44
  r'<h2[^>]*>(.*?)</h2>',
45
+ r'<h3[^>]*>(.*?)</h3>',
46
+ r'class="[^"]*headline[^"]*"[^>]*>(.*?)</',
47
+ r'class="[^"]*title[^"]*"[^>]*>(.*?)</'
48
  ]
49
 
50
  for pattern in patterns:
51
  found = re.findall(pattern, response.text, re.DOTALL | re.IGNORECASE)
52
  headlines.extend(found)
 
 
53
 
54
+ # Очистка заголовков от HTML-тегов
55
  cleaned_headlines = []
56
  for headline in headlines:
57
  clean = re.sub(r'<[^>]+>', '', headline).strip()
58
+ if 10 < len(clean) < 200 and not clean.startswith('{') and not clean.startswith('.'):
59
  cleaned_headlines.append(clean)
60
 
61
+ # Возврат заголовков
62
  if cleaned_headlines:
63
+ return "Заголовки новостей:\n" + "\n".join(cleaned_headlines[:10])
 
64
 
65
+ # Если не нашли заголовки, вернем часть текста страницы
66
+ text_content = re.sub(r'<[^>]+>', ' ', response.text)
67
+ text_content = re.sub(r'\s+', ' ', text_content).strip()
68
+ return "Содержимое страницы (фрагмент):\n" + text_content[:1000]
69
 
70
  except Exception as e:
71
+ return f"Ошибка при загрузке страницы: {str(e)}"
72
 
73
  final_answer = FinalAnswerTool()
74
 
 
92
 
93
  agent = CodeAgent(
94
  model=model,
95
+ tools=[web_search, visit_webpage, final_answer],
96
  max_steps=5,
97
  verbosity_level=1,
98
  grammar=None,
99
  planning_interval=None,
100
  name=None,
101
  description=None,
102
+ prompt_templates=prompt_templates,
103
+ authorized_imports=['itertools', 're', 'queue', 'random', 'stat',
104
+ 'datetime', 'collections', 'math', 'statistics',
105
+ 'unicodedata', 'time', 'bs4']
106
  )
107
 
108