# News / Parser.py
# (Hugging Face page header converted to comments so the file parses as Python:
#  uploaded by Genn9508, commit 9362797, verified)
# Required third-party packages — install once before running this script:
#   pip install feedparser requests beautifulsoup4 pandas
# (was an IPython "!pip install feedparser requests beautifulsoup4" magic,
#  which is a SyntaxError in a plain .py file)
import os
import re
import sys
import time
from datetime import datetime, timedelta

import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup
# RSS feed of the "Vecherka" news site that this script monitors.
rss_url = 'https://vecherka.su/rss/'
# CSV database (';'-separated, utf-8-sig) where scraped articles accumulate.
csv_file_path = 'bd.csv'
# strptime formats tried, in order, for RSS pubDate values.
# NOTE: the original code attempted '%a, %d %b %Y %H:%M:%S %z' twice in a
# nested try pyramid; each format is now tried exactly once.
_RSS_DATE_FORMATS = (
    '%a, %d %b %Y %H:%M:%S %z',
    '%a, %d %b %Y %H:%M:%S %Z',
)


def _parse_entry_date(published_date_str):
    """Parse an RSS pubDate string into a datetime.

    Tries the full RFC-822-style formats first, then falls back to the bare
    'DD Mon YYYY' tokens when the time/timezone part is malformed.
    Returns None when no format matches.
    """
    for fmt in _RSS_DATE_FORMATS:
        try:
            return datetime.strptime(published_date_str, fmt)
        except ValueError:
            continue
    # Fallback: pull out just the date tokens, e.g. ['21', 'Feb', '2026'].
    date_parts = published_date_str.split(' ')[1:4]
    if len(date_parts) == 3:
        try:
            return datetime.strptime(' '.join(date_parts), '%d %b %Y')
        except ValueError:
            pass
    return None


def _extract_images(entry, limit=3):
    """Collect up to `limit` unique image URLs from a feedparser entry.

    Sources are checked in priority order: media:content elements, enclosure
    links, then <img> tags inside the summary/content HTML.
    """
    image_urls = []
    if 'media_content' in entry:
        for media in entry.media_content:
            if len(image_urls) >= limit:
                break
            url = media.get('url')
            if media.get('type', '').startswith('image/') and url and url not in image_urls:
                image_urls.append(url)
    if 'links' in entry and len(image_urls) < limit:
        for link_entry in entry.links:
            if len(image_urls) >= limit:
                break
            href = link_entry.get('href')
            if (link_entry.get('rel') == 'enclosure'
                    and link_entry.get('type', '').startswith('image/')
                    and href and href not in image_urls):
                image_urls.append(href)
    html_content = ''
    if 'summary' in entry:
        html_content = entry.summary
    elif 'content' in entry and entry.content:
        html_content = entry.content[0].value
    if html_content and len(image_urls) < limit:
        soup = BeautifulSoup(html_content, 'html.parser')
        for img in soup.find_all('img'):
            if len(image_urls) >= limit:
                break
            src = img.get('src')
            if src and src not in image_urls:
                image_urls.append(src)
    return image_urls


def _fetch_full_text(news_link, title):
    """Download the article page and return its cleaned body text.

    Returns '' when the page cannot be fetched/parsed, when the expected
    'detail-text' container is missing, or when the article is an ad
    (contains the word 'Реклама') — callers treat '' as "skip this article".
    """
    try:
        response = requests.get(news_link, timeout=10)  # timeout so one slow page can't stall the cycle
        response.raise_for_status()
        article_soup = BeautifulSoup(response.text, 'html.parser')
        detail_text_div = article_soup.find('div', class_='detail-text')
        if detail_text_div is None:
            print(f"Could not find 'detail-text' div for article: '{title}'")
            return ''
        full_text = detail_text_div.get_text(separator=' ', strip=True)
        # Drop whole sentences containing the subscription call-to-action.
        full_text = re.sub(r'[^.!?]*\bподписывайтесь\b[^.!?]*[?.!]', '', full_text, flags=re.IGNORECASE)
        full_text = re.sub(r'\s+', ' ', full_text).strip()
        # Advertisements are excluded from the database entirely.
        if re.search(r'\bРеклама\b', full_text, re.IGNORECASE):
            print(f"Skipping article '{title}' due to 'Реклама' in full text.")
            return ''
        return full_text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching content for {news_link}: {e}")
    except Exception as e:
        print(f"Error parsing content for {news_link}: {e}")
    return ''


def check_for_new_articles():
    """Poll the RSS feed once and append today's/yesterday's new articles to the CSV.

    Deduplicates against links already present in `csv_file_path`, fetches the
    full text of each new article, and returns the number of articles added.
    """
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Checking for new articles...")
    # 1. Date window (today and yesterday), recomputed on every check.
    today_date = datetime.now()
    today_str = today_date.strftime('%d-%m-%Y')
    yesterday_str = (today_date - timedelta(days=1)).strftime('%d-%m-%Y')
    # 2. Load existing articles to avoid duplicates.
    processed_links = set()
    existing_df = None
    if os.path.exists(csv_file_path):
        try:
            existing_df = pd.read_csv(csv_file_path, encoding='utf-8-sig', sep=';')
            processed_links = set(existing_df['link'].tolist())
            print(f"Loaded {len(processed_links)} existing articles from {csv_file_path}.")
        except Exception as e:
            # Best-effort: a corrupt CSV should not kill the monitor loop.
            print(f"Error loading existing CSV: {e}. Starting with an empty processed_links set.")
    # 3. Fetch and parse the RSS feed.
    feed = feedparser.parse(rss_url)
    if not feed.entries:
        print("No entries found in the RSS feed.")
        return 0
    new_articles_data = []
    for entry in feed.entries:
        title = getattr(entry, 'title', 'No Title')
        news_link = getattr(entry, 'link', None)
        if not news_link or news_link in processed_links:
            continue  # no link, or already in the database
        published_date_str = getattr(entry, 'published', None)
        if not published_date_str:
            print(f"Skipping entry '{title}' due to missing publication date.")
            continue
        parsed_date = _parse_entry_date(published_date_str)
        if parsed_date is None:
            print(f"Could not parse date for entry: '{title}' - '{published_date_str}'")
            continue
        article_date_str = parsed_date.strftime('%d-%m-%Y')
        # Only keep articles published today or yesterday.
        if article_date_str not in (today_str, yesterday_str):
            continue
        image_urls = _extract_images(entry)
        full_text = _fetch_full_text(news_link, title)
        # Full text is mandatory: skip articles with no usable body (or ads).
        if not full_text:
            continue
        new_articles_data.append({
            'title': title,
            'published': article_date_str,
            'image_urls': image_urls,
            'link': news_link,
            'full_text': full_text,
            'Status': 'Off',
            'short_text': '',   # placeholder column, filled in later by other tooling
            'Constant': ''      # placeholder column kept for schema compatibility
        })
        processed_links.add(news_link)  # guard against duplicates within this feed pass
    articles_added_count = len(new_articles_data)
    if new_articles_data:
        new_df = pd.DataFrame(new_articles_data)
        new_df['image_urls'] = new_df['image_urls'].apply(', '.join)
        if existing_df is not None and not existing_df.empty:
            # Append without header to the existing file.
            new_df.to_csv(csv_file_path, mode='a', header=False, index=False, encoding='utf-8-sig', sep=';')
        else:
            # First write: create the file with a header row.
            new_df.to_csv(csv_file_path, mode='w', header=True, index=False, encoding='utf-8-sig', sep=';')
        print(f"Added {articles_added_count} new articles to {csv_file_path}.")
    else:
        print("No new articles found to add.")
    return articles_added_count
# --- Continuous monitoring entry point ---
# Polls the feed every 30 minutes; Ctrl+C exits cleanly, any other error
# triggers a short back-off before the next attempt.
print("Starting continuous RSS feed monitoring. Press Ctrl+C to stop.")
while True:
    try:
        added_now = check_for_new_articles()
        print(f"Found and added {added_now} new articles.")
        # Sleep inside the try so Ctrl+C during the wait is also caught.
        time.sleep(1800)  # 30-minute polling interval
    except KeyboardInterrupt:
        print("Monitoring stopped by user.")
        break
    except Exception as e:
        print(f"An unexpected error occurred in the main loop: {e}")
        time.sleep(60)  # brief back-off before retrying after a failure