"""Scrape Scroll.in search results for a topic and save article data to CSV."""

import time
import logging
import csv
import signal
import sys
import argparse
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
from utils.webdriver_utils import create_chrome_driver

class ScrollArticleScraper:
    def __init__(self, max_workers=4, articles_per_page=10):
        self.max_workers = max_workers
        self.articles_per_page = articles_per_page
        self.base_url = "https://scroll.in/search"
        self.fetched_articles = set()
        self.articles = []
        self.is_interrupted = False
        self.last_save_time = time.time()
        self.save_interval = 300  # Save every 5 minutes

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        # Set up signal handlers
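        # SIGINT (Ctrl+C) and SIGTERM (e.g. `kill <pid>`) both trigger a graceful save-and-exit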
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, signum, frame):
        """Handle interrupt signals"""
        print("\nReceived interrupt signal. Saving progress and shutting down...")
        self.is_interrupted = True
        if self.articles:
            self.save_progress("interrupted", force=True)
        sys.exit(0)

    def create_driver(self):
        """Create and return a new Chrome driver instance"""
        # Delegates to the shared utils helper (headless Chrome, image loading disabled)
        return create_chrome_driver(headless=True, load_images=False)
    
    def get_total_pages(self, driver, search_term):
        """Get the total number of pages for the search term"""
        try:
            driver.get(f"{self.base_url}?q={search_term}&page=1")
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Scroll.in might have a different pagination structure
            pagination = soup.find('div', class_='page-numbers')
            if pagination:
                pages = pagination.find_all('a', class_='page-number')
                if pages:
                    # Get the last page number
                    last_page = max([int(page.text.strip()) for page in pages])
                    return last_page

            # Fallback to 144 pages if pagination not found
            return 144
        except Exception as e:
            self.logger.error(f"Error getting total pages: {str(e)}")
            return 144  # Default to 144 pages

    def extract_visible_articles(self, driver):
        """Parse the current search-results page and return links and dates for articles not yet seen."""
        soup = BeautifulSoup(driver.page_source, 'html.parser')
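        # Each search result is an <li> marked up with schema.org NewsArticle microdata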
        article_containers = soup.find_all('li', itemscope=True, itemtype="https://schema.org/NewsArticle")

        new_articles = []
        for container in article_containers:
            link_element = container.find('a', href=True)
            date_element = container.find('time')

            if link_element and link_element['href']:
                full_link = link_element['href'] if link_element['href'].startswith('http') else 'https://scroll.in' + link_element['href']

                # Extract date
                date = None
                if date_element:
                    date_text = date_element.get_text(strip=True)
                    date_text = date_text.replace('Â', '').replace('·', '').strip()  # strip mojibake and separator glyphs
                    try:
                        # Primary format, e.g. "Jan 05, 2021"
                        parsed_date = datetime.strptime(date_text, '%b %d, %Y')
                        date = parsed_date.strftime('%Y-%m-%d')
                    except ValueError:
                        try:
                            # Alternative format with the full month name
                            parsed_date = datetime.strptime(date_text, '%B %d, %Y')
                            date = parsed_date.strftime('%Y-%m-%d')
                        except ValueError:
                            # Keep the raw text if neither format matches
                            date = date_text

                if full_link not in self.fetched_articles:
                    self.fetched_articles.add(full_link)
                    new_articles.append({'link': full_link, 'date': date})

        return new_articles

    def scrape_topic(self, search_term, topic):
        """Scrape articles from all pages for a given search term"""
        driver = self.create_driver()
        try:
            total_pages = self.get_total_pages(driver, search_term)
            total_expected_articles = total_pages * self.articles_per_page
            self.logger.info(f"Found {total_pages} pages to scrape (approximately {total_expected_articles} articles)")

            for page in range(1, total_pages + 1):
                if self.is_interrupted:
                    break

                page_url = f"{self.base_url}?q={search_term}&page={page}"
                articles_scraped = len(self.articles)
                progress_percentage = (articles_scraped / total_expected_articles) * 100

                self.logger.info(f"Scraping page {page}/{total_pages} - Progress: {articles_scraped}/{total_expected_articles} articles ({progress_percentage:.1f}%)")

                try:
                    driver.get(page_url)
                    time.sleep(2)  # Allow page to load

                    new_articles = self.extract_visible_articles(driver)
                    if new_articles:
                        self.process_articles_batch(new_articles)
                        self.logger.info(f"Scraped {len(new_articles)}/{self.articles_per_page} articles from page {page}")
                        self.save_progress(topic)
                    else:
                        self.logger.warning(f"No articles found on page {page}")

                except Exception as e:
                    self.logger.error(f"Error scraping page {page}: {str(e)}")
                    continue

            if self.articles:
                return self.save_to_csv(self.articles, topic, final=True)
            return None

        except Exception as e:
            self.logger.error(f"Error in scrape_topic: {str(e)}")
            if self.articles:
                return self.save_to_csv(self.articles, topic, final=True)
            return None
        finally:
            driver.quit()

    def scrape_article_parallel(self, article_data):
        """Fetch a single article page and extract its title and body text."""
        url = article_data['link']  # bind before the try block so the error handler can log it
        driver = self.create_driver()
        try:
            driver.get(url)
            time.sleep(2)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Extract title
            title = None
            title_element = soup.find('h1')  # Simple h1 search
            if title_element:
                title = title_element.get_text().strip()

            # Extract description
            desc = None
            article_body = soup.find('section', class_='article-content')
            if article_body:
                paragraphs = article_body.find_all('p')
                desc = ' '.join([p.get_text().strip() for p in paragraphs])

            return {
                'title': title or 'Title not found',
                'desc': desc or 'Description not found',
                'date': article_data['date'] or 'Date not found',
                'link': url
            }

        except Exception as e:
            self.logger.error(f"Error scraping article {url}: {str(e)}")
            return None
        finally:
            driver.quit()

    def process_articles_batch(self, article_batch):
        """Process a batch of articles in parallel"""
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self.scrape_article_parallel, article_data)
                      for article_data in article_batch]

            successful_articles = 0
            for future in as_completed(futures):
                try:
                    article = future.result()
                    if article:
                        self.articles.append(article)
                        successful_articles += 1
                        self.logger.info(f"Successfully scraped: {article['title'][:50]}...")
                except Exception as e:
                    self.logger.error(f"Error processing article: {str(e)}")

            if successful_articles < len(article_batch):
                self.logger.warning(f"Only processed {successful_articles}/{len(article_batch)} articles in this batch")

    def save_progress(self, topic, force=False):
        """Save current progress to CSV"""
        current_time = time.time()
        if force or (current_time - self.last_save_time >= self.save_interval and self.articles):
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"scroll_{topic}_articles_{timestamp}_partial.csv"
            try:
                with open(filename, 'w', newline='', encoding='utf-8') as file:
                    writer = csv.DictWriter(file, fieldnames=['title', 'desc', 'date', 'link'])
                    writer.writeheader()
                    writer.writerows(self.articles)
                self.last_save_time = current_time
                print(f"\nProgress saved to {filename}: {len(self.articles)} articles")
                self.logger.info(f"Progress saved to {filename}: {len(self.articles)} articles")
            except Exception as e:
                self.logger.error(f"Error saving progress: {str(e)}")

    def save_to_csv(self, articles, topic, final=False):
        """Save articles to CSV file"""
        if not articles:
            self.logger.error("No articles to save")
            return None

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"scroll_{topic}_articles_{timestamp}_{'final' if final else 'partial'}.csv"

        try:
            with open(filename, 'w', newline='', encoding='utf-8') as file:
                writer = csv.DictWriter(file, fieldnames=['title', 'desc', 'date', 'link'])
                writer.writeheader()
                writer.writerows(articles)
                self.logger.info(f"Successfully saved {len(articles)} articles to: {filename}")
            return filename
        except Exception as e:
            self.logger.error(f"Error saving to CSV: {str(e)}")
            return None

def parse_arguments():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description='Scrape articles from Scroll.in by topic')

    parser.add_argument('topic', type=str,
                        help='Topic to scrape (e.g., "RSS", "Covid", "India")')

    parser.add_argument('-w', '--workers', type=int, default=4,
                        help='Number of worker threads (default: 4)')

    parser.add_argument('-i', '--interval', type=int, default=300,
                        help='Auto-save interval in seconds (default: 300)')

    parser.add_argument('-a', '--articles-per-page', type=int, default=10,
                        help='Expected number of articles per page (default: 10)')

    return parser.parse_args()

def main():
    # Initialize up front so the exception handlers below can reference these safely
    scraper = None
    topic = "unknown"
    try:
        # Parse command line arguments
        args = parse_arguments()
        
        # Initialize the scraper with command line arguments
        scraper = ScrollArticleScraper(
            max_workers=args.workers,
            articles_per_page=args.articles_per_page
        )
        scraper.save_interval = args.interval
        
        # Get topic from command line argument
        topic = args.topic

        print(f"\nScraping {topic}-related articles from Scroll.in...")
        print("Press Ctrl+C at any time to save progress and exit.")

        final_csv = scraper.scrape_topic(topic.lower(), topic)

        if final_csv:
            print(f"\nArticles have been saved to: {final_csv}")
            print(f"Total articles scraped: {len(scraper.articles)}")
        else:
            print("\nError saving to final CSV file")

    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Saving progress...")
        if scraper and scraper.articles:
            scraper.save_progress(topic, force=True)
        print("Saved progress and exiting.")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        if scraper and scraper.articles:
            scraper.save_progress(topic, force=True)
        print("Saved progress despite error.")

if __name__ == "__main__":
    main()
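
# Example invocation (script filename assumed):
#   python scroll_scraper.py "Covid" --workers 4 --interval 300 --articles-per-page 10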