Ranjit89's picture
download
raw
7.89 kB
"""
Assamese paragraph scraper with resume & checkpointing.
Behavior:
- Reads last saved paragraph from assamese_paragraphs.txt (if any).
- Loads the target URL using Firefox profile.
- Scrolls the page until the last paragraph is visible (or gives up after MAX_FIND_ATTEMPTS).
- Scrolls further until content stabilizes.
- Appends new paragraphs every 10 scroll attempts (checkpointing).
- Finally appends any remaining new paragraphs.
"""
import os
import re
import time
import shutil
import tempfile
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from bs4 import BeautifulSoup
# ------------- User settings -------------
profile_path = "/home/ranjit/.mozilla/firefox/luddqo6a.default-release"
url = "https://m.dailyhunt.in/news/india/assamese/news?mode=pwa&action=click"
output_file = r"Ranjit_Data/assamese_paragraphs.txt"
HEADLESS = False
WAIT_BETWEEN_SCROLLS = 5 # Just makes each scroll slower (waits 5 seconds before the next scroll). No effect on the logic.
MAX_FIND_ATTEMPTS = 800000000 # Only affects how long it keeps searching for your last saved paragraph before giving up. Doesn’t affect checkpointing.
MAX_SCROLL_ATTEMPTS = 300 # Sets the total possible scrolls before stopping. Checkpoints will still happen at every 10th scroll (so roughly 30 checkpoints max).
STABLE_REQUIRED = 10 # Makes it more patient before deciding the page is “fully loaded.” Still unrelated to checkpointing.
CHECKPOINT_EVERY = 10 # Works exactly as intended.
# -----------------------------------------
assamese_pattern = re.compile(r"[\u0980-\u09FF]")
def normalize_text(s: str) -> str:
if s is None:
return ""
s = re.sub(r'\s+', ' ', s).strip()
return s
def safe_copy_profile(src_path: str) -> str:
if not os.path.isdir(src_path):
raise FileNotFoundError(f"profile path not found: {src_path}")
tmpdir = tempfile.mkdtemp(prefix="fh_profile_")
dest = os.path.join(tmpdir, "profile_copy")
def ignore_lock_files(directory, contents):
return [f for f in contents if f in ('lock', 'parent.lock')]
shutil.copytree(src_path, dest, ignore=ignore_lock_files)
return dest
def start_driver_with_profile(copied_profile_path: str):
options = Options()
options.headless = HEADLESS
profile_obj = FirefoxProfile(copied_profile_path)
options.profile = profile_obj
driver = webdriver.Firefox(options=options)
return driver
def find_last_saved_paragraph(file_path: str):
if not os.path.exists(file_path):
return None
with open(file_path, "r", encoding="utf-8") as f:
content = f.read().strip()
if not content:
return None
blocks = [normalize_text(b) for b in re.split(r'\n\s*\n', content) if normalize_text(b)]
if not blocks:
return None
return blocks[-1]
def extract_paragraphs_in_order(soup):
tags = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
paragraphs = []
for tag in tags:
txt = tag.get_text(separator=" ", strip=True)
txt = normalize_text(txt)
if txt and assamese_pattern.search(txt):
paragraphs.append(txt)
return paragraphs
def scroll_once(driver):
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
driver.execute_script("""
const sel = document.querySelector('.infinite-scroll-component') ||
document.querySelector('.infinite-scroll-component__outerdiv') ||
document.querySelector('[data-infinite-scroll]') ||
document.querySelector('div[style*="overflow:auto"]');
if (sel) { sel.scrollTop = sel.scrollHeight; }
""")
def scroll_until_paragraph_found(driver, target_norm_text):
print("Searching for last saved paragraph on the loaded page...")
if not target_norm_text:
print("No previous paragraph found; starting fresh.")
return True
attempt = 0
while attempt < MAX_FIND_ATTEMPTS:
attempt += 1
scroll_once(driver)
time.sleep(WAIT_BETWEEN_SCROLLS)
html = driver.page_source
if target_norm_text in normalize_text(BeautifulSoup(html, "html.parser").get_text(separator=" ")):
print(f"Found last-saved paragraph on attempt #{attempt}.")
return True
if attempt % 10 == 0:
print(f"...still searching (attempt {attempt}/{MAX_FIND_ATTEMPTS})")
print("Warning: last saved paragraph not found after max attempts.")
return False
def append_new_paragraphs_to_file(file_path: str, existing_last_norm: str, all_paragraphs):
start_index = 0
if existing_last_norm:
try:
start_index = all_paragraphs.index(existing_last_norm) + 1
except ValueError:
start_index = 0
new_paras = all_paragraphs[start_index:]
if not new_paras:
return 0
with open(file_path, "a", encoding="utf-8") as fout:
if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
fout.write("\n")
for p in new_paras:
fout.write(p + "\n\n")
return len(new_paras)
def scroll_and_checkpoint(driver, output_file, last_saved_norm):
last_height = driver.execute_script("return document.body.scrollHeight")
stable_count = 0
scroll_attempts = 0
appended_total = 0
while scroll_attempts < MAX_SCROLL_ATTEMPTS:
scroll_attempts += 1
scroll_once(driver)
time.sleep(WAIT_BETWEEN_SCROLLS)
# checkpoint
if scroll_attempts % CHECKPOINT_EVERY == 0:
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
paragraphs = extract_paragraphs_in_order(soup)
appended = append_new_paragraphs_to_file(output_file, last_saved_norm, paragraphs)
if appended:
last_saved_norm = paragraphs[-1]
appended_total += appended
print(f"[Checkpoint] Scroll {scroll_attempts}: appended {appended} paragraph(s).")
# check for stable scroll
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
stable_count += 1
if stable_count >= STABLE_REQUIRED:
print(f"Reached stable bottom after {scroll_attempts} scrolls.")
break
else:
stable_count = 0
last_height = new_height
# final append
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
paragraphs = extract_paragraphs_in_order(soup)
appended = append_new_paragraphs_to_file(output_file, last_saved_norm, paragraphs)
appended_total += appended
print(f"Total new paragraphs appended in this run: {appended_total}")
def main():
last_saved = find_last_saved_paragraph(output_file)
if last_saved:
print("Last saved paragraph (preview):")
print(last_saved[:200] + ("..." if len(last_saved) > 200 else ""))
else:
print("No existing output file found or file empty — starting fresh.")
print("Copying Firefox profile (to avoid lock issues)...")
copied_profile = safe_copy_profile(profile_path)
try:
driver = start_driver_with_profile(copied_profile)
except Exception as e:
raise RuntimeError("Failed to start Firefox webdriver. Check geckodriver/selenium and profile path.") from e
try:
print("Loading URL:", url)
driver.get(url)
time.sleep(WAIT_BETWEEN_SCROLLS + 1.0)
scroll_until_paragraph_found(driver, last_saved)
scroll_and_checkpoint(driver, output_file, last_saved)
finally:
try:
driver.quit()
except Exception:
pass
print("Scraping complete.")
if __name__ == "__main__":
main()

Xet Storage Details

Size:
7.89 kB
·
Xet hash:
ff32075a258a5dbac6512e03694f0d81457de6abca6249a9b29b0314a0cd7138

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.