|
|
|
|
|
import sys |
|
|
# NOTE: notebook-only shell magic, invalid in a plain .py file. Install deps beforehand:
#   pip install feedparser requests beautifulsoup4
|
|
|
|
|
import os
import re
import time
from datetime import datetime, timedelta

import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
rss_url = 'https://vecherka.su/rss/' |
|
|
csv_file_path = 'bd.csv' |
|
|
|
|
|
def check_for_new_articles(): |
|
|
print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Checking for new articles...") |
|
|
|
|
|
|
|
|
today_date = datetime.now() |
|
|
yesterday_date = today_date - timedelta(days=1) |
|
|
today_str = today_date.strftime('%d-%m-%Y') |
|
|
yesterday_str = yesterday_date.strftime('%d-%m-%Y') |
|
|
|
|
|
|
|
|
processed_links = set() |
|
|
existing_df = None |
|
|
if os.path.exists(csv_file_path): |
|
|
try: |
|
|
existing_df = pd.read_csv(csv_file_path, encoding='utf-8-sig', sep=';') |
|
|
processed_links = set(existing_df['link'].tolist()) |
|
|
print(f"Loaded {len(processed_links)} existing articles from {csv_file_path}.") |
|
|
except Exception as e: |
|
|
print(f"Error loading existing CSV: {e}. Starting with an empty processed_links set.") |
|
|
|
|
|
|
|
|
feed = feedparser.parse(rss_url) |
|
|
if not feed.entries: |
|
|
print("No entries found in the RSS feed.") |
|
|
return 0 |
|
|
|
|
|
new_articles_data = [] |
|
|
articles_added_count = 0 |
|
|
|
|
|
for entry in feed.entries: |
|
|
title = getattr(entry, 'title', 'No Title') |
|
|
news_link = getattr(entry, 'link', None) |
|
|
|
|
|
if not news_link or news_link in processed_links: |
|
|
continue |
|
|
|
|
|
published_date_str = getattr(entry, 'published', None) |
|
|
if not published_date_str: |
|
|
print(f"Skipping entry '{title}' due to missing publication date.") |
|
|
continue |
|
|
|
|
|
|
|
|
parsed_date = None |
|
|
try: |
|
|
parsed_date = datetime.strptime(published_date_str, '%a, %d %b %Y %H:%M:%S %z') |
|
|
except ValueError: |
|
|
try: |
|
|
parsed_date = datetime.strptime(published_date_str, '%a, %d %b %Y %H:%M:%S %Z') |
|
|
except ValueError: |
|
|
try: |
|
|
parsed_date = datetime.strptime(published_date_str, '%a, %d %b %Y %H:%M:%S %z') |
|
|
except ValueError: |
|
|
try: |
|
|
|
|
|
date_parts = published_date_str.split(' ')[1:4] |
|
|
if len(date_parts) == 3: |
|
|
parsed_date = datetime.strptime(' '.join(date_parts), '%d %b %Y') |
|
|
except ValueError: |
|
|
print(f"Could not parse date for entry: '{title}' - '{published_date_str}'") |
|
|
continue |
|
|
|
|
|
if parsed_date: |
|
|
article_date_str = parsed_date.strftime('%d-%m-%Y') |
|
|
|
|
|
|
|
|
if article_date_str == today_str or article_date_str == yesterday_str: |
|
|
image_urls = [] |
|
|
|
|
|
|
|
|
if 'media_content' in entry and len(image_urls) < 3: |
|
|
for media in entry.media_content: |
|
|
if media.get('type', '').startswith('image/') and media.get('url') and media.get('url') not in image_urls: |
|
|
image_urls.append(media['url']) |
|
|
if len(image_urls) == 3: break |
|
|
|
|
|
|
|
|
if 'links' in entry and len(image_urls) < 3: |
|
|
for link_entry in entry.links: |
|
|
if link_entry.get('rel') == 'enclosure' and link_entry.get('type', '').startswith('image/') and link_entry.get('href') and link_entry.get('href') not in image_urls: |
|
|
image_urls.append(link_entry['href']) |
|
|
if len(image_urls) == 3: break |
|
|
|
|
|
|
|
|
html_content = '' |
|
|
if 'summary' in entry: |
|
|
html_content = entry.summary |
|
|
elif 'content' in entry and entry.content: |
|
|
html_content = entry.content[0].value |
|
|
|
|
|
if html_content and len(image_urls) < 3: |
|
|
soup = BeautifulSoup(html_content, 'html.parser') |
|
|
img_tags = soup.find_all('img') |
|
|
for img in img_tags: |
|
|
if img.get('src') and img.get('src') not in image_urls: |
|
|
image_urls.append(img['src']) |
|
|
if len(image_urls) == 3: break |
|
|
|
|
|
|
|
|
full_text = "" |
|
|
try: |
|
|
response = requests.get(news_link, timeout=10) |
|
|
response.raise_for_status() |
|
|
article_soup = BeautifulSoup(response.text, 'html.parser') |
|
|
detail_text_div = article_soup.find('div', class_='detail-text') |
|
|
if detail_text_div: |
|
|
full_text = detail_text_div.get_text(separator=' ', strip=True) |
|
|
|
|
|
full_text = re.sub(r'[^.!?]*\bподписывайтесь\b[^.!?]*[?.!]', '', full_text, flags=re.IGNORECASE) |
|
|
full_text = re.sub(r'\s+', ' ', full_text).strip() |
|
|
|
|
|
|
|
|
if re.search(r'\bРеклама\b', full_text, re.IGNORECASE): |
|
|
print(f"Skipping article '{title}' due to 'Реклама' in full text.") |
|
|
continue |
|
|
else: |
|
|
print(f"Could not find 'detail-text' div for article: '{title}'") |
|
|
except requests.exceptions.RequestException as e: |
|
|
print(f"Error fetching content for {news_link}: {e}") |
|
|
except Exception as e: |
|
|
print(f"Error parsing content for {news_link}: {e}") |
|
|
|
|
|
|
|
|
short_text = '' |
|
|
|
|
|
|
|
|
|
|
|
if full_text: |
|
|
new_articles_data.append({ |
|
|
'title': title, |
|
|
'published': article_date_str, |
|
|
'image_urls': image_urls, |
|
|
'link': news_link, |
|
|
'full_text': full_text, |
|
|
'Status': 'Off', |
|
|
'short_text': short_text, |
|
|
'Constant': '' |
|
|
}) |
|
|
processed_links.add(news_link) |
|
|
articles_added_count += 1 |
|
|
|
|
|
if new_articles_data: |
|
|
new_df = pd.DataFrame(new_articles_data) |
|
|
new_df['image_urls'] = new_df['image_urls'].apply(lambda x: ', '.join(x)) |
|
|
|
|
|
if existing_df is not None and not existing_df.empty: |
|
|
|
|
|
new_df.to_csv(csv_file_path, mode='a', header=False, index=False, encoding='utf-8-sig', sep=';') |
|
|
else: |
|
|
|
|
|
new_df.to_csv(csv_file_path, mode='w', header=True, index=False, encoding='utf-8-sig', sep=';') |
|
|
print(f"Added {articles_added_count} new articles to {csv_file_path}.") |
|
|
else: |
|
|
print("No new articles found to add.") |
|
|
|
|
|
return articles_added_count |
|
|
|
|
|
|
|
|
print("Starting continuous RSS feed monitoring. Press Ctrl+C to stop.") |
|
|
while True: |
|
|
try: |
|
|
new_count = check_for_new_articles() |
|
|
print(f"Found and added {new_count} new articles.") |
|
|
time.sleep(1800) |
|
|
except KeyboardInterrupt: |
|
|
print("Monitoring stopped by user.") |
|
|
break |
|
|
except Exception as e: |
|
|
print(f"An unexpected error occurred in the main loop: {e}") |
|
|
time.sleep(60) |
|
|
|