#!/usr/bin/env python3
"""Scribd harvester via Selenium - drives your real Chrome session.

Opens a visible Chrome window, lets the user log in to Scribd manually,
then scrapes the saved-documents page and a set of search queries.
Results are persisted to Neo4j and mirrored to a local JSON file.
"""
import hashlib
import json
import os
import re
import time
from pathlib import Path
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from neo4j import GraphDatabase


class ScribdSeleniumHarvester:
    """Harvests saved documents and search results from Scribd into Neo4j.

    Uses a real, interactive Chrome session (manual login) so Scribd treats
    the traffic as a normal user. Collected document metadata is MERGEd into
    Neo4j (deduplicated by a content hash) and dumped to a local JSON file.
    """

    # SECURITY: these credentials were hard-coded in the original source and
    # must be considered leaked. They remain only as backward-compatible
    # fallbacks -- supply NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD via the
    # environment and rotate the password.
    NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
    NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.environ.get(
        "NEO4J_PASSWORD", "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
    )

    def __init__(self):
        """Create the output dir, launch Chrome, and open the Neo4j driver."""
        self.output_dir = Path("data/scribd_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Chrome options - suppress the most common automation fingerprints
        # so the session looks like an ordinary interactive browser.
        chrome_options = Options()
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option("useAutomationExtension", False)

        # Start browser
        print("🌐 Starter Chrome...")
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options
        )
        # Hide navigator.webdriver, which sites probe to detect Selenium.
        self.driver.execute_script(
            "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        )

        # Neo4j
        self.neo4j = GraphDatabase.driver(
            self.NEO4J_URI,
            auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
        )

        # Accumulated harvest results (list of metadata dicts).
        self.documents = []

    def login_manual(self):
        """Open Scribd's login page and block until the user confirms login.

        Returns:
            bool: True if the session appears authenticated afterwards,
            False if /saved still redirects to the login page.
        """
        print("\n" + "=" * 60)
        print("🔐 MANUEL LOGIN PÅKRÆVET")
        print("=" * 60)

        self.driver.get("https://www.scribd.com/login")

        print("""
╔══════════════════════════════════════════════════════════════════╗
║  Log ind med din Google konto i Chrome vinduet                   ║
║  Tryk ENTER her når du er logget ind...                          ║
╚══════════════════════════════════════════════════════════════════╝
        """)
        input()

        # Verify login: an unauthenticated visit to /saved bounces to /login.
        self.driver.get("https://www.scribd.com/saved")
        time.sleep(2)

        if "login" in self.driver.current_url.lower():
            print("❌ Login fejlede - prøv igen")
            return False

        print("✅ Login succesfuldt!")
        return True

    def harvest_saved(self):
        """Scroll through /saved and collect every document/book/audiobook link.

        Returns:
            list: the cumulative ``self.documents`` list.
        """
        print("\n📚 HARVESTING SAVED DOCUMENTS")
        print("-" * 40)

        self.driver.get("https://www.scribd.com/saved")
        time.sleep(3)

        # Infinite-scroll until the page height stops growing, capped at
        # max_scrolls so a pathological page cannot loop forever.
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        scroll_count = 0
        max_scrolls = 10

        while scroll_count < max_scrolls:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
            scroll_count += 1
            print(f"   📜 Scrolled {scroll_count}x...")

        # Anchors pointing at any harvestable content type.
        doc_elements = self.driver.find_elements(
            By.CSS_SELECTOR,
            'a[href*="/document/"], a[href*="/book/"], a[href*="/audiobook/"]'
        )

        seen_urls = set()
        for elem in doc_elements:
            try:
                url = elem.get_attribute('href')
                if not url or url in seen_urls:
                    continue
                seen_urls.add(url)

                # Extract content type and numeric ID from the URL.
                match = re.search(r'/(document|book|audiobook)/(\d+)', url)
                if not match:
                    continue

                doc_type = match.group(1)
                doc_id = match.group(2)

                # Prefer link text, then the title attribute, then a stub.
                title = elem.text.strip() or elem.get_attribute('title') or f"Document {doc_id}"

                doc = {
                    "id": doc_id,
                    "title": title[:200],
                    "url": url,
                    "doc_type": doc_type,
                    "source": "saved"
                }
                self.documents.append(doc)
                print(f"   📄 {title[:50]}...")
            except Exception:
                # Elements can go stale after scrolling; skip and keep going.
                continue

        print(f"\n   ✅ Found {len(self.documents)} documents")
        return self.documents

    def harvest_searches(self, queries: list):
        """Run each search query on Scribd; collect up to 10 new results each.

        Args:
            queries: plain-text search strings.
        """
        print("\n🔍 SEARCHING SCRIBD")
        print("-" * 40)

        for query in queries:
            print(f"\n   Søger: {query}")
            # quote_plus escapes every reserved character (spaces become '+'),
            # which the previous .replace(' ', '+') did not handle.
            search_url = f"https://www.scribd.com/search?query={quote_plus(query)}"
            self.driver.get(search_url)
            time.sleep(3)

            # Find results
            results = self.driver.find_elements(
                By.CSS_SELECTOR, 'a[href*="/document/"], a[href*="/book/"]'
            )

            # O(1) duplicate checks instead of re-scanning self.documents
            # for every candidate link.
            known_urls = {d['url'] for d in self.documents}
            count = 0
            for elem in results[:10]:  # Max 10 per search
                try:
                    url = elem.get_attribute('href')
                    if not url:
                        continue
                    match = re.search(r'/(document|book)/(\d+)', url)
                    if not match:
                        continue
                    if url in known_urls:
                        continue

                    doc = {
                        "id": match.group(2),
                        "title": elem.text.strip()[:200] or f"Search result {match.group(2)}",
                        "url": url,
                        "doc_type": match.group(1),
                        "source": f"search:{query}"
                    }
                    self.documents.append(doc)
                    known_urls.add(url)
                    count += 1
                except Exception:
                    # Skip stale or malformed elements rather than abort.
                    continue

            print(f"   Found {count} new documents")

    def save_to_neo4j(self):
        """MERGE every harvested document into Neo4j, keyed by content hash."""
        print("\n💾 SAVING TO NEO4J")
        print("-" * 40)

        with self.neo4j.session() as session:
            for doc in self.documents:
                # NOTE: MD5 is used as a dedup key, not for security; changing
                # the algorithm would orphan existing ScribdDocument nodes.
                content_hash = hashlib.md5(
                    f"{doc['title']}:{doc['url']}".encode()
                ).hexdigest()

                session.run("""
                    MERGE (d:ScribdDocument {contentHash: $hash})
                    ON CREATE SET
                        d.id = $id,
                        d.title = $title,
                        d.url = $url,
                        d.docType = $doc_type,
                        d.source = $source,
                        d.harvestedAt = datetime()
                    ON MATCH SET
                        d.lastSeen = datetime()
                    MERGE (s:DataSource {name: 'Scribd'})
                    MERGE (d)-[:HARVESTED_FROM]->(s)
                """,
                    hash=content_hash,
                    id=doc['id'],
                    title=doc['title'],
                    url=doc['url'],
                    doc_type=doc['doc_type'],
                    source=doc['source']
                )

        print(f"   ✅ Saved {len(self.documents)} documents to Neo4j")

    def save_local(self):
        """Write the harvested document list to a local JSON file."""
        output_file = self.output_dir / "scribd_harvest.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(self.documents, f, indent=2, ensure_ascii=False)
        print(f"   📁 Local JSON: {output_file}")

    def run(self):
        """Run the full harvest: login, saved docs, searches, persistence.

        The browser and the Neo4j driver are always closed on exit -- the
        original leaked both when login failed or a harvest step raised.
        """
        print("=" * 60)
        print("📚 SCRIBD SELENIUM HARVESTER")
        print("=" * 60)

        try:
            if not self.login_manual():
                return

            # Harvest saved documents
            self.harvest_saved()

            # Search for relevant topics
            search_queries = [
                "AI ethics business",
                "generative AI strategy",
                "cybersecurity threat intelligence",
                "digital transformation"
            ]
            self.harvest_searches(search_queries)

            # Save results
            self.save_to_neo4j()
            self.save_local()

            # Summary
            print("\n" + "=" * 60)
            print("📊 HARVEST COMPLETE")
            print("=" * 60)
            print(f"   📄 Total documents: {len(self.documents)}")
            print("=" * 60)

            input("\nTryk ENTER for at lukke browseren...")
        finally:
            self.driver.quit()
            self.neo4j.close()


if __name__ == "__main__":
    harvester = ScribdSeleniumHarvester()
    harvester.run()