# RSS monitor for vecherka.su: polls the feed every 30 minutes and appends
# new articles (published today or yesterday) to a semicolon-separated CSV.
#
# NOTE: the original file carried the Jupyter magic "!pip install feedparser
# requests beautifulsoup4" inline, which is a syntax error in a plain .py
# file. Install dependencies beforehand instead:
#   pip install feedparser requests beautifulsoup4 pandas
import os
import re
import sys
import time
from datetime import datetime, timedelta

import feedparser  # was missing in the original even though feedparser.parse() is called
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Feed to poll and the CSV file that accumulates results.
rss_url = 'https://vecherka.su/rss/'
csv_file_path = 'bd.csv'

# Date formats commonly seen in RSS <pubDate> fields, tried in order.
# (The original tried the '%z' format twice in a row; the duplicate is removed.)
_DATE_FORMATS = (
    '%a, %d %b %Y %H:%M:%S %z',
    '%a, %d %b %Y %H:%M:%S %Z',
)


def _parse_pub_date(published_date_str):
    """Parse an RSS publication date string.

    Tries the full RFC-822-style formats first, then falls back to the bare
    day/month/year tokens (e.g. '21 Feb 2026'). Returns a datetime, or None
    when no format matches.
    """
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(published_date_str, fmt)
        except ValueError:
            continue
    # Fallback for entries where the time/timezone part is malformed:
    # keep only tokens 1..3 of 'Wkd, DD Mon YYYY ...'.
    date_parts = published_date_str.split(' ')[1:4]
    if len(date_parts) == 3:
        try:
            return datetime.strptime(' '.join(date_parts), '%d %b %Y')
        except ValueError:
            pass
    return None


def _extract_image_urls(entry, max_images=3):
    """Collect up to ``max_images`` distinct image URLs from a feed entry.

    Sources are checked in order: media:content attachments, enclosure
    links, then <img> tags inside the summary/content HTML.
    """
    image_urls = []

    # 1) media:content attachments.
    if 'media_content' in entry:
        for media in entry.media_content:
            url = media.get('url')
            if media.get('type', '').startswith('image/') and url and url not in image_urls:
                image_urls.append(url)
                if len(image_urls) == max_images:
                    return image_urls

    # 2) <link rel="enclosure"> entries.
    if 'links' in entry:
        for link_entry in entry.links:
            href = link_entry.get('href')
            if (link_entry.get('rel') == 'enclosure'
                    and link_entry.get('type', '').startswith('image/')
                    and href and href not in image_urls):
                image_urls.append(href)
                if len(image_urls) == max_images:
                    return image_urls

    # 3) <img> tags inside the entry's HTML summary/content.
    html_content = ''
    if 'summary' in entry:
        html_content = entry.summary
    elif 'content' in entry and entry.content:
        html_content = entry.content[0].value
    if html_content:
        soup = BeautifulSoup(html_content, 'html.parser')
        for img in soup.find_all('img'):
            src = img.get('src')
            if src and src not in image_urls:
                image_urls.append(src)
                if len(image_urls) == max_images:
                    break
    return image_urls


def _fetch_full_text(news_link, title):
    """Download the article page and return its cleaned body text.

    Returns the cleaned text, '' when the body <div> is absent or the
    request fails, or None when the advertisement marker is found and the
    caller must skip the article entirely.
    """
    full_text = ""
    try:
        response = requests.get(news_link, timeout=10)
        response.raise_for_status()  # surface HTTP errors as exceptions
        article_soup = BeautifulSoup(response.text, 'html.parser')
        detail_text_div = article_soup.find('div', class_='detail-text')
        if detail_text_div:
            full_text = detail_text_div.get_text(separator=' ', strip=True)
            # Drop whole sentences containing the subscribe call-to-action.
            full_text = re.sub(r'[^.!?]*\bподписывайтесь\b[^.!?]*[?.!]', '',
                               full_text, flags=re.IGNORECASE)
            full_text = re.sub(r'\s+', ' ', full_text).strip()
            # Advertisement marker -> signal the caller to skip this article.
            if re.search(r'\bРеклама\b', full_text, re.IGNORECASE):
                print(f"Skipping article '{title}' due to 'Реклама' in full text.")
                return None
        else:
            print(f"Could not find 'detail-text' div for article: '{title}'")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching content for {news_link}: {e}")
    except Exception as e:
        print(f"Error parsing content for {news_link}: {e}")
    return full_text


def check_for_new_articles():
    """Poll the RSS feed once and append new qualifying articles to the CSV.

    Only articles published today or yesterday, not already stored, and
    with a non-empty, non-advertisement body are added. Returns the number
    of articles appended during this check.
    """
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Checking for new articles...")

    # 1. Date window (today and yesterday), recomputed on every check.
    today_date = datetime.now()
    yesterday_date = today_date - timedelta(days=1)
    today_str = today_date.strftime('%d-%m-%Y')
    yesterday_str = yesterday_date.strftime('%d-%m-%Y')

    # 2. Load previously stored links so we never append duplicates.
    processed_links = set()
    existing_df = None
    if os.path.exists(csv_file_path):
        try:
            existing_df = pd.read_csv(csv_file_path, encoding='utf-8-sig', sep=';')
            processed_links = set(existing_df['link'].tolist())
            print(f"Loaded {len(processed_links)} existing articles from {csv_file_path}.")
        except Exception as e:
            print(f"Error loading existing CSV: {e}. Starting with an empty processed_links set.")

    # 3. Fetch and parse the RSS feed.
    feed = feedparser.parse(rss_url)
    if not feed.entries:
        print("No entries found in the RSS feed.")
        return 0

    new_articles_data = []
    articles_added_count = 0

    for entry in feed.entries:
        title = getattr(entry, 'title', 'No Title')
        news_link = getattr(entry, 'link', None)
        if not news_link or news_link in processed_links:
            continue  # no link, or already stored

        published_date_str = getattr(entry, 'published', None)
        if not published_date_str:
            print(f"Skipping entry '{title}' due to missing publication date.")
            continue

        parsed_date = _parse_pub_date(published_date_str)
        if parsed_date is None:
            print(f"Could not parse date for entry: '{title}' - '{published_date_str}'")
            continue

        article_date_str = parsed_date.strftime('%d-%m-%Y')
        # Keep only articles published today or yesterday.
        if article_date_str not in (today_str, yesterday_str):
            continue

        image_urls = _extract_image_urls(entry)

        full_text = _fetch_full_text(news_link, title)
        if full_text is None:  # advertisement detected
            continue

        # short_text is intentionally stored empty (filled in downstream).
        short_text = ''

        # Articles with no usable body text are skipped.
        if full_text:
            new_articles_data.append({
                'title': title,
                'published': article_date_str,
                'image_urls': image_urls,
                'link': news_link,
                'full_text': full_text,
                'Status': 'Off',
                'short_text': short_text,
                'Constant': ''
            })
            processed_links.add(news_link)  # mark as processed immediately
            articles_added_count += 1

    # 4. Persist: append without a header when the CSV already has rows.
    if new_articles_data:
        new_df = pd.DataFrame(new_articles_data)
        new_df['image_urls'] = new_df['image_urls'].apply(lambda x: ', '.join(x))
        if existing_df is not None and not existing_df.empty:
            new_df.to_csv(csv_file_path, mode='a', header=False, index=False,
                          encoding='utf-8-sig', sep=';')
        else:
            new_df.to_csv(csv_file_path, mode='w', header=True, index=False,
                          encoding='utf-8-sig', sep=';')
        print(f"Added {articles_added_count} new articles to {csv_file_path}.")
    else:
        print("No new articles found to add.")

    return articles_added_count


# --- Main loop for continuous checking ---
print("Starting continuous RSS feed monitoring. Press Ctrl+C to stop.")
while True:
    try:
        new_count = check_for_new_articles()
        print(f"Found and added {new_count} new articles.")
        time.sleep(1800)  # wait 30 minutes between checks
    except KeyboardInterrupt:
        print("Monitoring stopped by user.")
        break
    except Exception as e:
        print(f"An unexpected error occurred in the main loop: {e}")
        time.sleep(60)  # shorter retry delay after an error