Spaces:
Sleeping
Sleeping
| import cloudscraper | |
| from bs4 import BeautifulSoup | |
| import time | |
# Scraper configuration.
BASE_URL = "https://www.lightreading.com"
AUTHOR_URL = f"{BASE_URL}/author/iain-morris"  # author landing page to paginate
TARGET_COUNT = 100  # maximum number of article URLs to collect
DELAY = 1  # polite wait between requests (seconds)

# Create a scraper that bypasses Cloudflare protection
scraper = cloudscraper.create_scraper()
def fetch_page(url, timeout=30):
    """Fetch *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Absolute URL to retrieve.
        timeout: Seconds to wait for a response. Without a timeout a
            stalled connection would hang the scraper indefinitely.

    Returns:
        BeautifulSoup: The parsed HTML document.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
    """
    resp = scraper.get(url, timeout=timeout)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")
def extract_article_links(soup):
    """Extract absolute article URLs from a parsed author page.

    Args:
        soup: BeautifulSoup document of an author listing page.

    Returns:
        list[str]: Absolute article URLs, in page order (may contain
        duplicates across pages; the caller deduplicates).
    """
    links = []
    # Selector matches the site's article-preview title anchors.
    for a in soup.select('a.ListPreview-Title[data-testid="preview-default-title"]'):
        # .get() instead of ['href']: an anchor without an href would
        # otherwise raise KeyError and abort the whole scrape.
        href = a.get("href")
        if not href or href.startswith("/author/"):
            continue  # skip missing hrefs and author-profile links
        full_url = BASE_URL + href if href.startswith("/") else href
        links.append(full_url)
    return links
def scrape_latest_urls():
    """Scrape up to TARGET_COUNT article URLs from paginated author pages.

    Walks ``AUTHOR_URL?page=N`` pages, collecting unique article links
    until TARGET_COUNT is reached, a page yields no articles, or a page
    yields only already-seen articles.

    Returns:
        list[str]: Unique article URLs in discovery order.
    """
    urls, seen = [], set()
    page_num = 1
    while len(urls) < TARGET_COUNT:
        page_url = f"{AUTHOR_URL}?page={page_num}"
        print(f"Fetching {page_url} …")
        soup = fetch_page(page_url)
        found = extract_article_links(soup)
        if not found:
            print("No more articles found; stopping.")
            break
        added = 0  # new (unseen) URLs contributed by this page
        for u in found:
            if u not in seen:
                seen.add(u)
                urls.append(u)
                added += 1
                if len(urls) >= TARGET_COUNT:
                    break
        # Some sites serve the last page again for out-of-range page
        # numbers; without this guard that would loop forever, since
        # `found` stays non-empty while `urls` stops growing.
        if added == 0:
            print("Only duplicate articles found; stopping.")
            break
        if len(urls) >= TARGET_COUNT:
            break  # done — skip the pointless final delay
        page_num += 1
        time.sleep(DELAY)  # polite pause between page requests
    return urls
if __name__ == "__main__":
    urls = scrape_latest_urls()
    print(f"\n✅ Collected {len(urls)} article URLs:\n")
    # The original enumerated the URLs but never used the index;
    # number the output lines as evidently intended.
    for idx, url in enumerate(urls, 1):
        print(f"{idx}. {url}")