# Install required third-party libraries (notebook environment)
import sys
!pip install feedparser requests beautifulsoup4
import os
import re
import time
from datetime import datetime, timedelta

import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Define the RSS feed URL
rss_url = 'https://vecherka.su/rss/'
# Path of the CSV file where collected articles accumulate (';'-separated, utf-8-sig).
csv_file_path = 'bd.csv'
def _parse_entry_date(published_date_str, title):
    """Parse an RSS pubDate string into a datetime, or return None.

    Tries the RFC-822 style formats first (numeric then named timezone),
    then falls back to just the day/month/year fields, e.g. '21 Feb 2026'.
    Prints a diagnostic and returns None when nothing matches.
    """
    for fmt in ('%a, %d %b %Y %H:%M:%S %z', '%a, %d %b %Y %H:%M:%S %Z'):
        try:
            return datetime.strptime(published_date_str, fmt)
        except ValueError:
            pass
    # Fallback for entries where the time/timezone portion is malformed:
    # take only the date fields, e.g. ['21', 'Feb', '2026'].
    date_parts = published_date_str.split(' ')[1:4]
    if len(date_parts) == 3:
        try:
            return datetime.strptime(' '.join(date_parts), '%d %b %Y')
        except ValueError:
            pass
    print(f"Could not parse date for entry: '{title}' - '{published_date_str}'")
    return None


def _extract_image_urls(entry, limit=3):
    """Collect up to *limit* unique image URLs from a feed entry.

    Sources, in priority order: media:content elements, enclosure links,
    and <img> tags found in the entry's summary/content HTML.
    """
    image_urls = []

    def _add(url):
        # Deduplicate while preserving discovery order.
        if url and url not in image_urls:
            image_urls.append(url)

    if 'media_content' in entry:
        for media in entry.media_content:
            if len(image_urls) >= limit:
                break
            if media.get('type', '').startswith('image/'):
                _add(media.get('url'))

    if 'links' in entry:
        for link_entry in entry.links:
            if len(image_urls) >= limit:
                break
            if link_entry.get('rel') == 'enclosure' and link_entry.get('type', '').startswith('image/'):
                _add(link_entry.get('href'))

    html_content = ''
    if 'summary' in entry:
        html_content = entry.summary
    elif 'content' in entry and entry.content:
        html_content = entry.content[0].value
    if html_content and len(image_urls) < limit:
        soup = BeautifulSoup(html_content, 'html.parser')
        for img in soup.find_all('img'):
            if len(image_urls) >= limit:
                break
            _add(img.get('src'))

    return image_urls


def _fetch_full_text(news_link, title):
    """Download an article page and return its cleaned body text.

    Returns the text of the 'detail-text' div with subscription-prompt
    sentences removed; '' when the div is missing or the fetch fails;
    or None when the text contains 'Реклама' (paid placement), which
    signals the caller to skip the article entirely.
    """
    full_text = ""
    try:
        response = requests.get(news_link, timeout=10)
        response.raise_for_status()  # Surface HTTP errors as exceptions.
        article_soup = BeautifulSoup(response.text, 'html.parser')
        detail_text_div = article_soup.find('div', class_='detail-text')
        if detail_text_div:
            full_text = detail_text_div.get_text(separator=' ', strip=True)
            # Remove whole sentences containing the subscription prompt.
            full_text = re.sub(r'[^.!?]*\bподписывайтесь\b[^.!?]*[?.!]', '', full_text, flags=re.IGNORECASE)
            full_text = re.sub(r'\s+', ' ', full_text).strip()
            # 'Реклама' marks an advertisement: tell the caller to skip it.
            if re.search(r'\bРеклама\b', full_text, re.IGNORECASE):
                print(f"Skipping article '{title}' due to 'Реклама' in full text.")
                return None
        else:
            print(f"Could not find 'detail-text' div for article: '{title}'")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching content for {news_link}: {e}")
    except Exception as e:
        print(f"Error parsing content for {news_link}: {e}")
    return full_text


def check_for_new_articles():
    """Poll the RSS feed once and append fresh articles to the CSV.

    Keeps only articles published today or yesterday, skips links already
    recorded in the CSV, entries without a parseable date, and articles
    flagged as advertisements. Returns the number of articles added.
    """
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Checking for new articles...")
    # 1. Date window is recomputed on every call so the loop stays current.
    today_date = datetime.now()
    yesterday_date = today_date - timedelta(days=1)
    today_str = today_date.strftime('%d-%m-%Y')
    yesterday_str = yesterday_date.strftime('%d-%m-%Y')

    # 2. Load existing articles so duplicate links are never re-written.
    processed_links = set()
    existing_df = None
    if os.path.exists(csv_file_path):
        try:
            existing_df = pd.read_csv(csv_file_path, encoding='utf-8-sig', sep=';')
            processed_links = set(existing_df['link'].tolist())
            print(f"Loaded {len(processed_links)} existing articles from {csv_file_path}.")
        except Exception as e:
            print(f"Error loading existing CSV: {e}. Starting with an empty processed_links set.")

    # 3. Fetch and parse the RSS feed.
    feed = feedparser.parse(rss_url)
    if not feed.entries:
        print("No entries found in the RSS feed.")
        return 0

    new_articles_data = []
    articles_added_count = 0
    for entry in feed.entries:
        title = getattr(entry, 'title', 'No Title')
        news_link = getattr(entry, 'link', None)
        if not news_link or news_link in processed_links:
            continue  # Skip if no link or already processed.
        published_date_str = getattr(entry, 'published', None)
        if not published_date_str:
            print(f"Skipping entry '{title}' due to missing publication date.")
            continue
        parsed_date = _parse_entry_date(published_date_str, title)
        if parsed_date is None:
            continue  # Unparseable date: skip this entry.
        article_date_str = parsed_date.strftime('%d-%m-%Y')
        # Keep only articles published today or yesterday.
        if article_date_str not in (today_str, yesterday_str):
            continue
        image_urls = _extract_image_urls(entry)
        full_text = _fetch_full_text(news_link, title)
        if full_text is None:
            continue  # Advertisement ('Реклама') detected in the body.
        if full_text:
            new_articles_data.append({
                'title': title,
                'published': article_date_str,
                'image_urls': image_urls,
                'link': news_link,
                'full_text': full_text,
                'Status': 'Off',
                'short_text': '',  # Placeholder column, intentionally empty.
                'Constant': '',    # Placeholder column, intentionally empty.
            })
            processed_links.add(news_link)  # Guard against intra-feed duplicates.
            articles_added_count += 1

    if new_articles_data:
        new_df = pd.DataFrame(new_articles_data)
        new_df['image_urls'] = new_df['image_urls'].apply(lambda x: ', '.join(x))
        if existing_df is not None and not existing_df.empty:
            # Append without a header so the existing one is not duplicated.
            new_df.to_csv(csv_file_path, mode='a', header=False, index=False, encoding='utf-8-sig', sep=';')
        else:
            # First write: create the file with a header row.
            new_df.to_csv(csv_file_path, mode='w', header=True, index=False, encoding='utf-8-sig', sep=';')
        print(f"Added {articles_added_count} new articles to {csv_file_path}.")
    else:
        print("No new articles found to add.")
    return articles_added_count
# --- Main loop for continuous checking ---
# Polls the feed every 30 minutes; Ctrl+C exits cleanly, any other
# error is logged and retried after a short back-off.
print("Starting continuous RSS feed monitoring. Press Ctrl+C to stop.")
monitoring = True
while monitoring:
    try:
        new_count = check_for_new_articles()
        print(f"Found and added {new_count} new articles.")
        time.sleep(1800)  # 30-minute pause between polls.
    except KeyboardInterrupt:
        print("Monitoring stopped by user.")
        monitoring = False
    except Exception as e:
        print(f"An unexpected error occurred in the main loop: {e}")
        time.sleep(60)  # Brief back-off before retrying after an error.