# Install required third-party libraries (notebook environment)
import sys
!pip install feedparser requests beautifulsoup4
import os
import re
import time
from datetime import datetime, timedelta

import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Define the RSS feed URL
rss_url = 'https://vecherka.su/rss/'
# Path of the CSV file where collected articles accumulate (';'-separated, utf-8-sig).
csv_file_path = 'bd.csv'
def _parse_entry_date(published_date_str, title):
    """Parse an RSS pubDate string into a datetime, or return None.

    Tries the RFC-822 style formats first (numeric then named timezone),
    then falls back to just the day/month/year fields, e.g. '21 Feb 2026'.
    Prints a diagnostic and returns None when nothing matches.
    """
    for fmt in ('%a, %d %b %Y %H:%M:%S %z', '%a, %d %b %Y %H:%M:%S %Z'):
        try:
            return datetime.strptime(published_date_str, fmt)
        except ValueError:
            pass
    # Fallback for entries where the time/timezone portion is malformed:
    # take only the date fields, e.g. ['21', 'Feb', '2026'].
    date_parts = published_date_str.split(' ')[1:4]
    if len(date_parts) == 3:
        try:
            return datetime.strptime(' '.join(date_parts), '%d %b %Y')
        except ValueError:
            pass
    print(f"Could not parse date for entry: '{title}' - '{published_date_str}'")
    return None


def _extract_image_urls(entry, limit=3):
    """Collect up to *limit* unique image URLs from a feed entry.

    Sources, in priority order: media:content elements, enclosure links,
    and <img> tags found in the entry's summary/content HTML.
    """
    image_urls = []

    def _add(url):
        # Deduplicate while preserving discovery order.
        if url and url not in image_urls:
            image_urls.append(url)

    if 'media_content' in entry:
        for media in entry.media_content:
            if len(image_urls) >= limit:
                break
            if media.get('type', '').startswith('image/'):
                _add(media.get('url'))

    if 'links' in entry:
        for link_entry in entry.links:
            if len(image_urls) >= limit:
                break
            if link_entry.get('rel') == 'enclosure' and link_entry.get('type', '').startswith('image/'):
                _add(link_entry.get('href'))

    html_content = ''
    if 'summary' in entry:
        html_content = entry.summary
    elif 'content' in entry and entry.content:
        html_content = entry.content[0].value
    if html_content and len(image_urls) < limit:
        soup = BeautifulSoup(html_content, 'html.parser')
        for img in soup.find_all('img'):
            if len(image_urls) >= limit:
                break
            _add(img.get('src'))

    return image_urls


def _fetch_full_text(news_link, title):
    """Download an article page and return its cleaned body text.

    Returns the text of the 'detail-text' div with subscription-prompt
    sentences removed; '' when the div is missing or the fetch fails;
    or None when the text contains 'Реклама' (paid placement), which
    signals the caller to skip the article entirely.
    """
    full_text = ""
    try:
        response = requests.get(news_link, timeout=10)
        response.raise_for_status()  # Surface HTTP errors as exceptions.
        article_soup = BeautifulSoup(response.text, 'html.parser')
        detail_text_div = article_soup.find('div', class_='detail-text')
        if detail_text_div:
            full_text = detail_text_div.get_text(separator=' ', strip=True)
            # Remove whole sentences containing the subscription prompt.
            full_text = re.sub(r'[^.!?]*\bподписывайтесь\b[^.!?]*[?.!]', '', full_text, flags=re.IGNORECASE)
            full_text = re.sub(r'\s+', ' ', full_text).strip()
            # 'Реклама' marks an advertisement: tell the caller to skip it.
            if re.search(r'\bРеклама\b', full_text, re.IGNORECASE):
                print(f"Skipping article '{title}' due to 'Реклама' in full text.")
                return None
        else:
            print(f"Could not find 'detail-text' div for article: '{title}'")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching content for {news_link}: {e}")
    except Exception as e:
        print(f"Error parsing content for {news_link}: {e}")
    return full_text


def check_for_new_articles():
    """Poll the RSS feed once and append fresh articles to the CSV.

    Keeps only articles published today or yesterday, skips links already
    recorded in the CSV, entries without a parseable date, and articles
    flagged as advertisements. Returns the number of articles added.
    """
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Checking for new articles...")
    # 1. Date window is recomputed on every call so the loop stays current.
    today_date = datetime.now()
    yesterday_date = today_date - timedelta(days=1)
    today_str = today_date.strftime('%d-%m-%Y')
    yesterday_str = yesterday_date.strftime('%d-%m-%Y')

    # 2. Load existing articles so duplicate links are never re-written.
    processed_links = set()
    existing_df = None
    if os.path.exists(csv_file_path):
        try:
            existing_df = pd.read_csv(csv_file_path, encoding='utf-8-sig', sep=';')
            processed_links = set(existing_df['link'].tolist())
            print(f"Loaded {len(processed_links)} existing articles from {csv_file_path}.")
        except Exception as e:
            print(f"Error loading existing CSV: {e}. Starting with an empty processed_links set.")

    # 3. Fetch and parse the RSS feed.
    feed = feedparser.parse(rss_url)
    if not feed.entries:
        print("No entries found in the RSS feed.")
        return 0

    new_articles_data = []
    articles_added_count = 0
    for entry in feed.entries:
        title = getattr(entry, 'title', 'No Title')
        news_link = getattr(entry, 'link', None)
        if not news_link or news_link in processed_links:
            continue  # Skip if no link or already processed.
        published_date_str = getattr(entry, 'published', None)
        if not published_date_str:
            print(f"Skipping entry '{title}' due to missing publication date.")
            continue
        parsed_date = _parse_entry_date(published_date_str, title)
        if parsed_date is None:
            continue  # Unparseable date: skip this entry.
        article_date_str = parsed_date.strftime('%d-%m-%Y')
        # Keep only articles published today or yesterday.
        if article_date_str not in (today_str, yesterday_str):
            continue
        image_urls = _extract_image_urls(entry)
        full_text = _fetch_full_text(news_link, title)
        if full_text is None:
            continue  # Advertisement ('Реклама') detected in the body.
        if full_text:
            new_articles_data.append({
                'title': title,
                'published': article_date_str,
                'image_urls': image_urls,
                'link': news_link,
                'full_text': full_text,
                'Status': 'Off',
                'short_text': '',  # Placeholder column, intentionally empty.
                'Constant': '',    # Placeholder column, intentionally empty.
            })
            processed_links.add(news_link)  # Guard against intra-feed duplicates.
            articles_added_count += 1

    if new_articles_data:
        new_df = pd.DataFrame(new_articles_data)
        new_df['image_urls'] = new_df['image_urls'].apply(lambda x: ', '.join(x))
        if existing_df is not None and not existing_df.empty:
            # Append without a header so the existing one is not duplicated.
            new_df.to_csv(csv_file_path, mode='a', header=False, index=False, encoding='utf-8-sig', sep=';')
        else:
            # First write: create the file with a header row.
            new_df.to_csv(csv_file_path, mode='w', header=True, index=False, encoding='utf-8-sig', sep=';')
        print(f"Added {articles_added_count} new articles to {csv_file_path}.")
    else:
        print("No new articles found to add.")
    return articles_added_count
# --- Main loop for continuous checking ---
# Polls the feed every 30 minutes; Ctrl+C exits cleanly, any other
# error is logged and retried after a short back-off.
print("Starting continuous RSS feed monitoring. Press Ctrl+C to stop.")
monitoring = True
while monitoring:
    try:
        new_count = check_for_new_articles()
        print(f"Found and added {new_count} new articles.")
        time.sleep(1800)  # 30-minute pause between polls.
    except KeyboardInterrupt:
        print("Monitoring stopped by user.")
        monitoring = False
    except Exception as e:
        print(f"An unexpected error occurred in the main loop: {e}")
        time.sleep(60)  # Brief back-off before retrying after an error.