Spaces:
Sleeping
Sleeping
| import cloudscraper | |
| from bs4 import BeautifulSoup | |
| import time | |
# Scraper configuration.
BASE_URL = "https://www.lightreading.com"
AUTHOR_URL = f"{BASE_URL}/author/iain-morris"  # author landing page to paginate
TARGET_COUNT = 100  # maximum number of article URLs to collect
DELAY = 1  # polite wait between requests (seconds)

# Create a scraper that bypasses Cloudflare protection
scraper = cloudscraper.create_scraper()
def fetch_page(url, timeout=30):
    """Fetch *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Absolute URL to retrieve.
        timeout: Seconds to wait for a response. Without a timeout a
            stalled connection would hang the scraper indefinitely.

    Returns:
        BeautifulSoup: The parsed HTML document.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
    """
    resp = scraper.get(url, timeout=timeout)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")
def extract_article_links(soup):
    """Extract absolute article URLs from a parsed author page.

    Args:
        soup: BeautifulSoup document of an author listing page.

    Returns:
        list[str]: Absolute article URLs, in page order (may contain
        duplicates across pages; the caller deduplicates).
    """
    links = []
    # Selector matches the site's article-preview title anchors.
    for a in soup.select('a.ListPreview-Title[data-testid="preview-default-title"]'):
        # .get() instead of ['href']: an anchor without an href would
        # otherwise raise KeyError and abort the whole scrape.
        href = a.get("href")
        if not href or href.startswith("/author/"):
            continue  # skip missing hrefs and author-profile links
        full_url = BASE_URL + href if href.startswith("/") else href
        links.append(full_url)
    return links
def scrape_latest_urls():
    """Scrape up to TARGET_COUNT article URLs from paginated author pages.

    Walks ``AUTHOR_URL?page=N`` pages, collecting unique article links
    until TARGET_COUNT is reached, a page yields no articles, or a page
    yields only already-seen articles.

    Returns:
        list[str]: Unique article URLs in discovery order.
    """
    urls, seen = [], set()
    page_num = 1
    while len(urls) < TARGET_COUNT:
        page_url = f"{AUTHOR_URL}?page={page_num}"
        print(f"Fetching {page_url} …")
        soup = fetch_page(page_url)
        found = extract_article_links(soup)
        if not found:
            print("No more articles found; stopping.")
            break
        added = 0  # new (unseen) URLs contributed by this page
        for u in found:
            if u not in seen:
                seen.add(u)
                urls.append(u)
                added += 1
                if len(urls) >= TARGET_COUNT:
                    break
        # Some sites serve the last page again for out-of-range page
        # numbers; without this guard that would loop forever, since
        # `found` stays non-empty while `urls` stops growing.
        if added == 0:
            print("Only duplicate articles found; stopping.")
            break
        if len(urls) >= TARGET_COUNT:
            break  # done — skip the pointless final delay
        page_num += 1
        time.sleep(DELAY)  # polite pause between page requests
    return urls
if __name__ == "__main__":
    urls = scrape_latest_urls()
    print(f"\n✅ Collected {len(urls)} article URLs:\n")
    # The original enumerated the URLs but never used the index;
    # number the output lines as evidently intended.
    for idx, url in enumerate(urls, 1):
        print(f"{idx}. {url}")