Buckets:

Ranjit89
/

Assamese-Text-Dataset-bucket

Files

xet

Ranjit89/Assamese-Text-Dataset-bucket / web_scraping.py

Ranjit89

3 days ago

download

raw

7.89 kB

	"""
	Assamese paragraph scraper with resume & checkpointing.

	Behavior:
	- Reads last saved paragraph from assamese_paragraphs.txt (if any).
	- Loads the target URL using Firefox profile.
	- Scrolls the page until the last paragraph is visible (or gives up after MAX_FIND_ATTEMPTS).
	- Scrolls further until content stabilizes.
	- Appends new paragraphs every 10 scroll attempts (checkpointing).
	- Finally appends any remaining new paragraphs.
	"""

	import os
	import re
	import time
	import shutil
	import tempfile
	from selenium import webdriver
	from selenium.webdriver.firefox.options import Options
	from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
	from bs4 import BeautifulSoup

	# ------------- User settings -------------
	# Firefox profile directory; it is copied to a temp dir before use, never opened in place.
	profile_path = "/home/ranjit/.mozilla/firefox/luddqo6a.default-release"
	# Infinite-scroll news feed that is loaded and scraped.
	url = "https://m.dailyhunt.in/news/india/assamese/news?mode=pwa&action=click"

	# Text file that new paragraphs are appended to (paragraphs are separated by blank lines).
	output_file = r"Ranjit_Data/assamese_paragraphs.txt"

	# Whether Firefox should be started without a visible window.
	HEADLESS = False
	# Seconds to sleep after every scroll (and, plus one second, after the initial page load).
	WAIT_BETWEEN_SCROLLS = 5
	# Upper bound on scrolls spent looking for the last saved paragraph before giving up.
	MAX_FIND_ATTEMPTS = 800000000
	# Upper bound on scrolls performed by the scrape/checkpoint loop.
	MAX_SCROLL_ATTEMPTS = 300
	# Number of consecutive scrolls with an unchanged document.body.scrollHeight
	# after which the page is considered fully loaded.
	STABLE_REQUIRED = 10
	# New paragraphs are flushed to output_file on every CHECKPOINT_EVERY-th scroll.
	CHECKPOINT_EVERY = 10

	# -----------------------------------------

	# Matches any single character in U+0980-U+09FF (the Unicode Bengali block, which
	# is also the block used to write Assamese); used to keep only text containing
	# at least one such character.
	assamese_pattern = re.compile(r"[\u0980-\u09FF]")

	def normalize_text(s: str) -> str:
	    """Collapse each run of whitespace in *s* to one space and trim both ends.

	    ``None`` is treated as empty text and yields ``""``.
	    """
	    if s is None:
	        return ""
	    collapsed = re.sub(r'\s+', ' ', s)
	    return collapsed.strip()

	def safe_copy_profile(src_path: str) -> str:
	    """Copy a Firefox profile into a fresh temporary directory.

	    The copy lets a browser be started from the profile while the original
	    may still be in use. Lock files are skipped so the copy does not look
	    like it is already owned by a running Firefox.

	    Args:
	        src_path: Path of the existing profile directory.

	    Returns:
	        Path of the copied profile, ``<tmpdir>/profile_copy``, where
	        ``<tmpdir>`` is created with the ``fh_profile_`` prefix. The caller
	        owns the temporary directory and is responsible for deleting it.

	    Raises:
	        FileNotFoundError: If *src_path* is not an existing directory.
	    """
	    if not os.path.isdir(src_path):
	        raise FileNotFoundError(f"profile path not found: {src_path}")

	    # 'parent.lock' is the Windows lock file; 'lock' and '.parentlock' are
	    # the ones Firefox creates on Unix-like systems.
	    lock_names = {'lock', 'parent.lock', '.parentlock'}

	    def ignore_lock_files(directory, contents):
	        # Called by copytree for every directory; returns the names to skip.
	        return [f for f in contents if f in lock_names]

	    tmpdir = tempfile.mkdtemp(prefix="fh_profile_")
	    dest = os.path.join(tmpdir, "profile_copy")
	    try:
	        shutil.copytree(src_path, dest, ignore=ignore_lock_files)
	    except BaseException:
	        # Do not leave a half-copied profile behind if the copy fails
	        # or is interrupted.
	        shutil.rmtree(tmpdir, ignore_errors=True)
	        raise
	    return dest

	def start_driver_with_profile(copied_profile_path: str):
	    """Start a Firefox WebDriver that uses the given (copied) profile.

	    Args:
	        copied_profile_path: Directory of the profile copy to launch with.

	    Returns:
	        The started ``webdriver.Firefox`` instance; the caller must ``quit()`` it.
	    """
	    options = Options()
	    if HEADLESS:
	        # The ``options.headless`` setter was deprecated and later removed in
	        # Selenium 4; on those versions assigning it silently does nothing.
	        # Passing the command-line flag works on old and new releases alike.
	        options.add_argument("-headless")
	    options.profile = FirefoxProfile(copied_profile_path)
	    return webdriver.Firefox(options=options)

	def find_last_saved_paragraph(file_path: str):
	    """Return the last non-empty paragraph stored in *file_path*, normalized.

	    Paragraphs are the chunks separated by blank lines. Returns ``None`` when
	    the file does not exist or holds no non-blank paragraph.
	    """
	    if not os.path.exists(file_path):
	        return None

	    with open(file_path, "r", encoding="utf-8") as f:
	        content = f.read().strip()

	    # Walk the blank-line-separated chunks backwards and stop at the first
	    # one that still has text after normalization.
	    for raw_block in reversed(re.split(r'\n\s*\n', content)):
	        cleaned = normalize_text(raw_block)
	        if cleaned:
	            return cleaned
	    return None

	def extract_paragraphs_in_order(soup):
	    """Collect the text of paragraph and heading tags, in document order.

	    Only ``<p>`` and ``<h1>``-``<h6>`` tags are considered. A tag's text is
	    kept when, after whitespace normalization, it is non-empty and matches
	    ``assamese_pattern``.
	    """
	    wanted_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
	    normalized = (
	        normalize_text(node.get_text(separator=" ", strip=True))
	        for node in soup.find_all(wanted_tags)
	    )
	    return [text for text in normalized if text and assamese_pattern.search(text)]

	def scroll_once(driver):
	    """Scroll the page, and any inner infinite-scroll container, to the bottom.

	    Two scripts are run: the first scrolls the window itself; the second
	    looks for a scrollable container using a list of candidate selectors and,
	    if one is found, scrolls it to its own bottom as well.

	    Args:
	        driver: Object exposing ``execute_script`` (a Selenium WebDriver).
	    """
	    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
	    # The selectors are chained with JavaScript's logical OR so that the
	    # first matching element wins. (The operator must be a plain ``||``; a
	    # backslash-escaped form is a JavaScript syntax error.)
	    driver.execute_script("""
	    const sel = document.querySelector('.infinite-scroll-component') ||
	    document.querySelector('.infinite-scroll-component__outerdiv') ||
	    document.querySelector('[data-infinite-scroll]') ||
	    document.querySelector('div[style*="overflow:auto"]');
	    if (sel) { sel.scrollTop = sel.scrollHeight; }
	    """)

	def scroll_until_paragraph_found(driver, target_norm_text):
	    """Scroll until *target_norm_text* shows up in the page text.

	    Each attempt scrolls once, waits ``WAIT_BETWEEN_SCROLLS`` seconds, and
	    checks whether the target is a substring of the whitespace-normalized
	    text of the whole page.

	    Returns:
	        ``True`` if the target was found or there is no target to look for;
	        ``False`` after ``MAX_FIND_ATTEMPTS`` unsuccessful attempts.
	    """
	    print("Searching for last saved paragraph on the loaded page...")
	    if not target_norm_text:
	        print("No previous paragraph found; starting fresh.")
	        return True

	    for attempt in range(1, MAX_FIND_ATTEMPTS + 1):
	        scroll_once(driver)
	        time.sleep(WAIT_BETWEEN_SCROLLS)

	        page_text = BeautifulSoup(driver.page_source, "html.parser").get_text(separator=" ")
	        if target_norm_text in normalize_text(page_text):
	            print(f"Found last-saved paragraph on attempt #{attempt}.")
	            return True

	        # Periodic progress line so a long search is visibly alive.
	        if attempt % 10 == 0:
	            print(f"...still searching (attempt {attempt}/{MAX_FIND_ATTEMPTS})")

	    print("Warning: last saved paragraph not found after max attempts.")
	    return False

	def append_new_paragraphs_to_file(file_path: str, existing_last_norm: str, all_paragraphs) -> int:
	    """Append the paragraphs that come after the last saved one to *file_path*.

	    Args:
	        file_path: Output text file; it and its parent directory are created
	            if missing.
	        existing_last_norm: Normalized text of the last paragraph already
	            saved, or a falsy value when nothing has been saved yet.
	        all_paragraphs: Paragraphs currently on the page, in document order.

	    Returns:
	        The number of paragraphs written (0 if there was nothing new).
	    """
	    start_index = 0
	    if existing_last_norm:
	        try:
	            # NOTE(review): list.index returns the FIRST match; if the marker
	            # text occurs more than once on the page, everything after the
	            # first occurrence is treated as new - confirm this is intended.
	            start_index = all_paragraphs.index(existing_last_norm) + 1
	        except ValueError:
	            # Marker not on the page: fall back to saving everything.
	            start_index = 0

	    new_paras = all_paragraphs[start_index:]
	    if not new_paras:
	        return 0

	    # Opening in append mode creates the file but not its directory.
	    parent_dir = os.path.dirname(file_path)
	    if parent_dir:
	        os.makedirs(parent_dir, exist_ok=True)

	    with open(file_path, "a", encoding="utf-8") as fout:
	        # The file is guaranteed to exist once opened; only its size matters.
	        if os.path.getsize(file_path) > 0:
	            fout.write("\n")
	        fout.writelines(p + "\n\n" for p in new_paras)
	    return len(new_paras)

	def scroll_and_checkpoint(driver, output_file, last_saved_norm):
	    """Keep scrolling, periodically saving new paragraphs, until the page settles.

	    The loop runs for at most ``MAX_SCROLL_ATTEMPTS`` scrolls. On every
	    ``CHECKPOINT_EVERY``-th scroll the page is parsed and paragraphs after
	    the last saved one are appended to *output_file*. The loop ends early
	    once ``document.body.scrollHeight`` has stayed unchanged for
	    ``STABLE_REQUIRED`` consecutive scrolls. A final save runs after the loop.
	    """

	    def save_snapshot(marker):
	        # Parse the current DOM and append whatever follows *marker*;
	        # returns the paragraphs seen and how many of them were written.
	        soup = BeautifulSoup(driver.page_source, "html.parser")
	        found = extract_paragraphs_in_order(soup)
	        return found, append_new_paragraphs_to_file(output_file, marker, found)

	    height_script = "return document.body.scrollHeight"
	    previous_height = driver.execute_script(height_script)
	    unchanged_rounds = 0
	    total_written = 0

	    for scroll_attempts in range(1, MAX_SCROLL_ATTEMPTS + 1):
	        scroll_once(driver)
	        time.sleep(WAIT_BETWEEN_SCROLLS)

	        # Checkpoint: flush what has loaded so far.
	        if scroll_attempts % CHECKPOINT_EVERY == 0:
	            found, written = save_snapshot(last_saved_norm)
	            if written:
	                # The newest paragraph becomes the marker for the next save.
	                last_saved_norm = found[-1]
	                total_written += written
	                print(f"[Checkpoint] Scroll {scroll_attempts}: appended {written} paragraph(s).")

	        # Stability check: any growth resets the counter.
	        current_height = driver.execute_script(height_script)
	        if current_height != previous_height:
	            unchanged_rounds = 0
	            previous_height = current_height
	            continue

	        unchanged_rounds += 1
	        if unchanged_rounds >= STABLE_REQUIRED:
	            print(f"Reached stable bottom after {scroll_attempts} scrolls.")
	            break

	    # Final save for anything loaded since the last checkpoint.
	    _, written = save_snapshot(last_saved_norm)
	    total_written += written
	    print(f"Total new paragraphs appended in this run: {total_written}")

	def main():
	    """Run one scraping session, resuming after the last saved paragraph.

	    Steps: read the last saved paragraph from ``output_file``, copy the
	    Firefox profile to a temp dir, start the browser, load ``url``, scroll
	    until the last saved paragraph is found, then scroll and checkpoint new
	    paragraphs. The browser is always closed and the temporary profile copy
	    is always deleted, even on failure.

	    Raises:
	        RuntimeError: If the Firefox webdriver cannot be started.
	    """
	    last_saved = find_last_saved_paragraph(output_file)
	    if last_saved:
	        print("Last saved paragraph (preview):")
	        print(last_saved[:200] + ("..." if len(last_saved) > 200 else ""))
	    else:
	        print("No existing output file found or file empty — starting fresh.")

	    print("Copying Firefox profile (to avoid lock issues)...")
	    copied_profile = safe_copy_profile(profile_path)

	    try:
	        try:
	            driver = start_driver_with_profile(copied_profile)
	        except Exception as e:
	            raise RuntimeError("Failed to start Firefox webdriver. Check geckodriver/selenium and profile path.") from e

	        try:
	            print("Loading URL:", url)
	            driver.get(url)
	            time.sleep(WAIT_BETWEEN_SCROLLS + 1.0)

	            scroll_until_paragraph_found(driver, last_saved)
	            scroll_and_checkpoint(driver, output_file, last_saved)
	        finally:
	            # Best effort: a failure to close the browser must not mask the
	            # real outcome, but it should not vanish silently either.
	            try:
	                driver.quit()
	            except Exception as quit_error:
	                print(f"Warning: could not close the browser cleanly: {quit_error}")
	    finally:
	        # The profile copy would otherwise pile up in the temp directory on
	        # every run. Remove the enclosing temp dir only when it is the one
	        # created for the copy (recognised by its prefix); otherwise remove
	        # just the copy itself.
	        temp_parent = os.path.dirname(copied_profile)
	        if os.path.basename(temp_parent).startswith("fh_profile_"):
	            shutil.rmtree(temp_parent, ignore_errors=True)
	        else:
	            shutil.rmtree(copied_profile, ignore_errors=True)

	    print("Scraping complete.")

	# Run the scraper only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	main()

Xet Storage Details

Size: 7.89 kB
Xet hash: ff32075a258a5dbac6512e03694f0d81457de6abca6249a9b29b0314a0cd7138

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.