Genn9508 committed on
Commit
e9f7eee
·
verified ·
1 Parent(s): f2784ae

Upload Parser (3).py

Browse files
Files changed (1) hide show
  1. Parser (3).py +187 -0
Parser (3).py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import feedparser
2
+ import pandas as pd
3
+ from datetime import datetime, timedelta
4
+ from bs4 import BeautifulSoup
5
+ import requests
6
+ import re
7
+ import os
8
+ import time
9
+ import csv
10
+ import random # Import random module for delays
11
+
12
+ # Define the RSS feed URL
13
+ rss_url = 'https://vecherka.su/rss/'
14
+ csv_file_path = 'bd.csv'
15
+
16
+ def check_for_new_articles():
17
+ print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Checking for new articles...")
18
+
19
+ # 1. Define date range (today and yesterday) dynamically for each check
20
+ today_date = datetime.now()
21
+ yesterday_date = today_date - timedelta(days=1)
22
+ today_str = today_date.strftime('%d-%m-%Y')
23
+ yesterday_str = yesterday_date.strftime('%d-%m-%Y')
24
+
25
+ # 2. Load existing articles to avoid duplicates
26
+ processed_links = set()
27
+ existing_df = None
28
+ if os.path.exists(csv_file_path):
29
+ try:
30
+ existing_df = pd.read_csv(csv_file_path, encoding='utf-8-sig', sep=';')
31
+ processed_links = set(existing_df['link'].tolist())
32
+ print(f"Loaded {len(processed_links)} existing articles from {csv_file_path}.")
33
+ except Exception as e:
34
+ print(f"Error loading existing CSV: {e}. Starting with an empty processed_links set.")
35
+
36
+ # 3. Fetch and parse the RSS feed
37
+ feed = feedparser.parse(rss_url)
38
+ if not feed.entries:
39
+ print("No entries found in the RSS feed.")
40
+ return 0
41
+
42
+ new_articles_data = []
43
+ articles_added_count = 0
44
+
45
+ # Define a common User-Agent header to mimic a browser
46
+ headers = {
47
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
48
+ }
49
+
50
+ for entry in feed.entries:
51
+ title = getattr(entry, 'title', 'No Title')
52
+ news_link = getattr(entry, 'link', None)
53
+
54
+ if not news_link or news_link in processed_links:
55
+ continue # Skip if no link or already processed
56
+
57
+ published_date_str = getattr(entry, 'published', None)
58
+ if not published_date_str:
59
+ print(f"Skipping entry '{title}' due to missing publication date.")
60
+ continue
61
+
62
+ # Try parsing date in several common RSS formats
63
+ parsed_date = None
64
+ try:
65
+ parsed_date = datetime.strptime(published_date_str, '%a, %d %b %Y %H:%M:%S %z')
66
+ except ValueError:
67
+ try:
68
+ parsed_date = datetime.strptime(published_date_str, '%a, %d %b %Y %H:%M:%S %Z')
69
+ except ValueError:
70
+ try:
71
+ parsed_date = datetime.strptime(published_date_str, '%a, %d %b %Y %H:%M:%S %z') # Try again, some might just work
72
+ except ValueError:
73
+ try:
74
+ # Fallback for simpler date formats if time/timezone are problematic
75
+ date_parts = published_date_str.split(' ')[1:4] # e.g., ['21', 'Feb', '2026']
76
+ if len(date_parts) == 3:
77
+ parsed_date = datetime.strptime(' '.join(date_parts), '%d %b %Y')
78
+ except ValueError:
79
+ print(f"Could not parse date for entry: '{title}' - '{published_date_str}'")
80
+ continue # Skip this entry if date cannot be parsed
81
+
82
+ if parsed_date:
83
+ article_date_str = parsed_date.strftime('%d-%m-%Y')
84
+
85
+ # Filter articles published today or yesterday
86
+ if article_date_str == today_str or article_date_str == yesterday_str:
87
+ image_urls = []
88
+
89
+ # Extract images from media_content (if available and max 3 not reached)
90
+ if 'media_content' in entry and len(image_urls) < 3:
91
+ for media in entry.media_content:
92
+ if media.get('type', '').startswith('image/') and media.get('url') and media.get('url') not in image_urls:
93
+ image_urls.append(media['url'])
94
+ if len(image_urls) == 3: break
95
+
96
+ # Extract images from links with rel='enclosure' (if available and max 3 not reached)
97
+ if 'links' in entry and len(image_urls) < 3:
98
+ for link_entry in entry.links:
99
+ if link_entry.get('rel') == 'enclosure' and link_entry.get('type', '').startswith('image/') and link_entry.get('href') and link_entry.get('href') not in image_urls:
100
+ image_urls.append(link_entry['href'])
101
+ if len(image_urls) == 3: break
102
+
103
+ # Extract images from summary or content using BeautifulSoup (if available and max 3 not reached)
104
+ html_content = ''
105
+ if 'summary' in entry:
106
+ html_content = entry.summary
107
+ elif 'content' in entry and entry.content:
108
+ html_content = entry.content[0].value
109
+
110
+ if html_content and len(image_urls) < 3:
111
+ soup = BeautifulSoup(html_content, 'html.parser')
112
+ img_tags = soup.find_all('img')
113
+ for img in img_tags:
114
+ if img.get('src') and img.get('src') not in image_urls:
115
+ image_urls.append(img['src'])
116
+ if len(image_urls) == 3: break
117
+
118
+ # Fetch full text content from the article link
119
+ full_text = ""
120
+ try:
121
+ # Introduce a random delay before making the request
122
+ time.sleep(random.uniform(1, 3)) # Delay between 1 and 3 seconds
123
+ response = requests.get(news_link, headers=headers, timeout=10) # Added headers
124
+ response.raise_for_status() # Raise an exception for HTTP errors
125
+ article_soup = BeautifulSoup(response.text, 'html.parser')
126
+ detail_text_div = article_soup.find('div', class_='detail-text')
127
+ if detail_text_div:
128
+ full_text = detail_text_div.get_text(separator=' ', strip=True)
129
+ # Remove sentences containing 'подписывайтесь'
130
+ full_text = re.sub(r'[^.!?]*\bподписывайтесь\b[^.!?]*[?.!]', '', full_text, flags=re.IGNORECASE)
131
+ full_text = re.sub(r'\s+', ' ', full_text).strip()
132
+
133
+ # Check for 'Реклама' in the full text and skip if found
134
+ if re.search(r'\bРеклама\b', full_text, re.IGNORECASE):
135
+ print(f"Skipping article '{title}' due to 'Реклама' in full text.")
136
+ continue # Skip this article if 'Реклама' is found
137
+ else:
138
+ print(f"Could not find 'detail-text' div for article: '{title}'")
139
+ except requests.exceptions.RequestException as e:
140
+ print(f"Error fetching content for {news_link}: {e}")
141
+ except Exception as e:
142
+ print(f"Error parsing content for {news_link}: {e}")
143
+
144
+ # Only add if full_text is not empty or if it's acceptable without it
145
+ # For this task, we assume full_text is important, so we skip if not found or problematic
146
+ if full_text:
147
+ new_articles_data.append({
148
+ 'title': title,
149
+ 'published': article_date_str,
150
+ 'image_urls': image_urls,
151
+ 'link': news_link,
152
+ 'full_text': full_text,
153
+ 'short_text': ''
154
+ })
155
+ processed_links.add(news_link) # Add to processed links immediately
156
+ articles_added_count += 1
157
+
158
+ if new_articles_data:
159
+ new_df = pd.DataFrame(new_articles_data)
160
+ new_df['image_urls'] = new_df['image_urls'].apply(lambda x: ', '.join(x))
161
+
162
+ if existing_df is not None and not existing_df.empty:
163
+ # Append to existing file without header if it exists
164
+ new_df.to_csv(csv_file_path, mode='a', header=False, index=False, encoding='utf-8-sig', sep=';', quoting=csv.QUOTE_MINIMAL)
165
+ else:
166
+ # Write new file with header
167
+ new_df.to_csv(csv_file_path, mode='w', header=True, index=False, encoding='utf-8-sig', sep=';', quoting=csv.QUOTE_MINIMAL)
168
+ print(f"Added {articles_added_count} new articles to {csv_file_path}.")
169
+ else:
170
+ print("No new articles found to add.")
171
+
172
+ return articles_added_count
173
+
174
+ # --- Main loop for continuous checking ---
175
+ print("Starting continuous RSS feed monitoring. Press Ctrl+C to stop.")
176
+ while True:
177
+ try:
178
+ new_count = check_for_new_articles()
179
+ print(f"Found and added {new_count} new articles.")
180
+ print(f"Waiting for 30 minutes before the next check...") # Added message here
181
+ time.sleep(1800) # Wait for 30 minutes (1800 seconds)
182
+ except KeyboardInterrupt:
183
+ print("Monitoring stopped by user.")
184
+ break
185
+ except Exception as e:
186
+ print(f"An unexpected error occurred in the main loop: {e}")
187
+ time.sleep(60) # Wait for a shorter period before retrying on error