# News / Parser.py
# (Hugging Face page header converted to comments so the file parses as Python:
#  uploaded by Genn9508, commit 9362797, verified)
# Required third-party packages — install once before running this script:
#   pip install feedparser requests beautifulsoup4 pandas
# (was an IPython "!pip install feedparser requests beautifulsoup4" magic,
#  which is a SyntaxError in a plain .py file)
import os
import re
import sys
import time
from datetime import datetime, timedelta

import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup
# RSS feed of the "Vecherka" news site that this script monitors.
rss_url = 'https://vecherka.su/rss/'
# CSV database (';'-separated, utf-8-sig) where scraped articles accumulate.
csv_file_path = 'bd.csv'
# strptime formats tried, in order, for RSS pubDate values.
# NOTE: the original code attempted '%a, %d %b %Y %H:%M:%S %z' twice in a
# nested try pyramid; each format is now tried exactly once.
_RSS_DATE_FORMATS = (
    '%a, %d %b %Y %H:%M:%S %z',
    '%a, %d %b %Y %H:%M:%S %Z',
)


def _parse_entry_date(published_date_str):
    """Parse an RSS pubDate string into a datetime.

    Tries the full RFC-822-style formats first, then falls back to the bare
    'DD Mon YYYY' tokens when the time/timezone part is malformed.
    Returns None when no format matches.
    """
    for fmt in _RSS_DATE_FORMATS:
        try:
            return datetime.strptime(published_date_str, fmt)
        except ValueError:
            continue
    # Fallback: pull out just the date tokens, e.g. ['21', 'Feb', '2026'].
    date_parts = published_date_str.split(' ')[1:4]
    if len(date_parts) == 3:
        try:
            return datetime.strptime(' '.join(date_parts), '%d %b %Y')
        except ValueError:
            pass
    return None


def _extract_images(entry, limit=3):
    """Collect up to `limit` unique image URLs from a feedparser entry.

    Sources are checked in priority order: media:content elements, enclosure
    links, then <img> tags inside the summary/content HTML.
    """
    image_urls = []
    if 'media_content' in entry:
        for media in entry.media_content:
            if len(image_urls) >= limit:
                break
            url = media.get('url')
            if media.get('type', '').startswith('image/') and url and url not in image_urls:
                image_urls.append(url)
    if 'links' in entry and len(image_urls) < limit:
        for link_entry in entry.links:
            if len(image_urls) >= limit:
                break
            href = link_entry.get('href')
            if (link_entry.get('rel') == 'enclosure'
                    and link_entry.get('type', '').startswith('image/')
                    and href and href not in image_urls):
                image_urls.append(href)
    html_content = ''
    if 'summary' in entry:
        html_content = entry.summary
    elif 'content' in entry and entry.content:
        html_content = entry.content[0].value
    if html_content and len(image_urls) < limit:
        soup = BeautifulSoup(html_content, 'html.parser')
        for img in soup.find_all('img'):
            if len(image_urls) >= limit:
                break
            src = img.get('src')
            if src and src not in image_urls:
                image_urls.append(src)
    return image_urls


def _fetch_full_text(news_link, title):
    """Download the article page and return its cleaned body text.

    Returns '' when the page cannot be fetched/parsed, when the expected
    'detail-text' container is missing, or when the article is an ad
    (contains the word 'Реклама') — callers treat '' as "skip this article".
    """
    try:
        response = requests.get(news_link, timeout=10)  # timeout so one slow page can't stall the cycle
        response.raise_for_status()
        article_soup = BeautifulSoup(response.text, 'html.parser')
        detail_text_div = article_soup.find('div', class_='detail-text')
        if detail_text_div is None:
            print(f"Could not find 'detail-text' div for article: '{title}'")
            return ''
        full_text = detail_text_div.get_text(separator=' ', strip=True)
        # Drop whole sentences containing the subscription call-to-action.
        full_text = re.sub(r'[^.!?]*\bподписывайтесь\b[^.!?]*[?.!]', '', full_text, flags=re.IGNORECASE)
        full_text = re.sub(r'\s+', ' ', full_text).strip()
        # Advertisements are excluded from the database entirely.
        if re.search(r'\bРеклама\b', full_text, re.IGNORECASE):
            print(f"Skipping article '{title}' due to 'Реклама' in full text.")
            return ''
        return full_text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching content for {news_link}: {e}")
    except Exception as e:
        print(f"Error parsing content for {news_link}: {e}")
    return ''


def check_for_new_articles():
    """Poll the RSS feed once and append today's/yesterday's new articles to the CSV.

    Deduplicates against links already present in `csv_file_path`, fetches the
    full text of each new article, and returns the number of articles added.
    """
    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Checking for new articles...")
    # 1. Date window (today and yesterday), recomputed on every check.
    today_date = datetime.now()
    today_str = today_date.strftime('%d-%m-%Y')
    yesterday_str = (today_date - timedelta(days=1)).strftime('%d-%m-%Y')
    # 2. Load existing articles to avoid duplicates.
    processed_links = set()
    existing_df = None
    if os.path.exists(csv_file_path):
        try:
            existing_df = pd.read_csv(csv_file_path, encoding='utf-8-sig', sep=';')
            processed_links = set(existing_df['link'].tolist())
            print(f"Loaded {len(processed_links)} existing articles from {csv_file_path}.")
        except Exception as e:
            # Best-effort: a corrupt CSV should not kill the monitor loop.
            print(f"Error loading existing CSV: {e}. Starting with an empty processed_links set.")
    # 3. Fetch and parse the RSS feed.
    feed = feedparser.parse(rss_url)
    if not feed.entries:
        print("No entries found in the RSS feed.")
        return 0
    new_articles_data = []
    for entry in feed.entries:
        title = getattr(entry, 'title', 'No Title')
        news_link = getattr(entry, 'link', None)
        if not news_link or news_link in processed_links:
            continue  # no link, or already in the database
        published_date_str = getattr(entry, 'published', None)
        if not published_date_str:
            print(f"Skipping entry '{title}' due to missing publication date.")
            continue
        parsed_date = _parse_entry_date(published_date_str)
        if parsed_date is None:
            print(f"Could not parse date for entry: '{title}' - '{published_date_str}'")
            continue
        article_date_str = parsed_date.strftime('%d-%m-%Y')
        # Only keep articles published today or yesterday.
        if article_date_str not in (today_str, yesterday_str):
            continue
        image_urls = _extract_images(entry)
        full_text = _fetch_full_text(news_link, title)
        # Full text is mandatory: skip articles with no usable body (or ads).
        if not full_text:
            continue
        new_articles_data.append({
            'title': title,
            'published': article_date_str,
            'image_urls': image_urls,
            'link': news_link,
            'full_text': full_text,
            'Status': 'Off',
            'short_text': '',   # placeholder column, filled in later by other tooling
            'Constant': ''      # placeholder column kept for schema compatibility
        })
        processed_links.add(news_link)  # guard against duplicates within this feed pass
    articles_added_count = len(new_articles_data)
    if new_articles_data:
        new_df = pd.DataFrame(new_articles_data)
        new_df['image_urls'] = new_df['image_urls'].apply(', '.join)
        if existing_df is not None and not existing_df.empty:
            # Append without header to the existing file.
            new_df.to_csv(csv_file_path, mode='a', header=False, index=False, encoding='utf-8-sig', sep=';')
        else:
            # First write: create the file with a header row.
            new_df.to_csv(csv_file_path, mode='w', header=True, index=False, encoding='utf-8-sig', sep=';')
        print(f"Added {articles_added_count} new articles to {csv_file_path}.")
    else:
        print("No new articles found to add.")
    return articles_added_count
# --- Continuous monitoring entry point ---
# Polls the feed every 30 minutes; Ctrl+C exits cleanly, any other error
# triggers a short back-off before the next attempt.
print("Starting continuous RSS feed monitoring. Press Ctrl+C to stop.")
while True:
    try:
        added_now = check_for_new_articles()
        print(f"Found and added {added_now} new articles.")
        # Sleep inside the try so Ctrl+C during the wait is also caught.
        time.sleep(1800)  # 30-minute polling interval
    except KeyboardInterrupt:
        print("Monitoring stopped by user.")
        break
    except Exception as e:
        print(f"An unexpected error occurred in the main loop: {e}")
        time.sleep(60)  # brief back-off before retrying after a failure