#!/usr/bin/env python3
"""
Scribd Harvester via Selenium - uses your real Chrome session
"""
import json
import time
import hashlib
import re
from pathlib import Path
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from neo4j import GraphDatabase

class ScribdSeleniumHarvester:
    NEO4J_URI = "neo4j+s://054eff27.databases.neo4j.io"
    NEO4J_USER = "neo4j"
    NEO4J_PASSWORD = "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
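    # NOTE: credentials are hardcoded for this one-off script; in a real
    # deployment you would more likely read them from environment variables.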

    def __init__(self):
        self.output_dir = Path("data/scribd_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Chrome options - use your existing Chrome session
        chrome_options = Options()
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option("useAutomationExtension", False)

        # Start browser
        print("Starting Chrome...")
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options
        )
        self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
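        # Redefining navigator.webdriver as undefined hides the most common
        # Selenium fingerprint from client-side bot-detection scripts.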

        # Neo4j connection
        self.neo4j = GraphDatabase.driver(
            self.NEO4J_URI,
            auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
        )
        self.documents = []

    def login_manual(self):
        """Open Scribd and wait for a manual login"""
        print("\n" + "=" * 60)
        print("MANUAL LOGIN REQUIRED")
        print("=" * 60)
        self.driver.get("https://www.scribd.com/login")
        print("""
        ╔══════════════════════════════════════════════════════════╗
        ║  Log in with your Google account in the Chrome window    ║
        ║  Press ENTER here when you are logged in...              ║
        ╚══════════════════════════════════════════════════════════╝
        """)
        input()

        # Verify login: Scribd redirects unauthenticated visitors back to /login
        self.driver.get("https://www.scribd.com/saved")
        time.sleep(2)
        if "login" in self.driver.current_url.lower():
            print("Login failed - please try again")
            return False
        print("Login successful!")
        return True

    def harvest_saved(self):
        """Fetch saved documents"""
        print("\nHARVESTING SAVED DOCUMENTS")
        print("-" * 40)
        self.driver.get("https://www.scribd.com/saved")
        time.sleep(3)

        # Scroll to load all lazily rendered documents
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        scroll_count = 0
        max_scrolls = 10
        while scroll_count < max_scrolls:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
            scroll_count += 1
            print(f"  Scrolled {scroll_count}x...")

        # Find document links
        doc_elements = self.driver.find_elements(
            By.CSS_SELECTOR,
            'a[href*="/document/"], a[href*="/book/"], a[href*="/audiobook/"]'
        )
        seen_urls = set()
        for elem in doc_elements:
            try:
                url = elem.get_attribute('href')
                if not url or url in seen_urls:
                    continue
                seen_urls.add(url)
                # Extract document type and ID from the URL
                match = re.search(r'/(document|book|audiobook)/(\d+)', url)
                if not match:
                    continue
                doc_type = match.group(1)
                doc_id = match.group(2)

                # Title: visible text, then the title attribute, then a placeholder
                title = elem.text.strip() or elem.get_attribute('title') or f"Document {doc_id}"

                doc = {
                    "id": doc_id,
                    "title": title[:200],
                    "url": url,
                    "doc_type": doc_type,
                    "source": "saved"
                }
                self.documents.append(doc)
                print(f"  {title[:50]}...")
            except Exception:
                continue
| print(f"\n β Found {len(self.documents)} documents") | |
| return self.documents | |

    def harvest_searches(self, queries: list):
        """Search Scribd for documents"""
        print("\nSEARCHING SCRIBD")
        print("-" * 40)
        for query in queries:
            print(f"\n  Searching: {query}")
            search_url = f"https://www.scribd.com/search?query={quote_plus(query)}"
            self.driver.get(search_url)
            time.sleep(3)
            # Find result links
            results = self.driver.find_elements(
                By.CSS_SELECTOR,
                'a[href*="/document/"], a[href*="/book/"]'
            )
            count = 0
            for elem in results[:10]:  # Max 10 per search
                try:
                    url = elem.get_attribute('href')
                    match = re.search(r'/(document|book)/(\d+)', url)
                    if not match:
                        continue

                    # Skip documents we already collected
                    if any(d['url'] == url for d in self.documents):
                        continue
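                    # (The duplicate check above is a linear scan over
                    # self.documents per result; fine at this scale.)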
                    doc = {
                        "id": match.group(2),
                        "title": elem.text.strip()[:200] or f"Search result {match.group(2)}",
                        "url": url,
                        "doc_type": match.group(1),
                        "source": f"search:{query}"
                    }
                    self.documents.append(doc)
                    count += 1
                except Exception:
                    continue
            print(f"  Found {count} new documents")

    def save_to_neo4j(self):
        """Save all documents to Neo4j"""
        print("\nSAVING TO NEO4J")
        print("-" * 40)
        with self.neo4j.session() as session:
            for doc in self.documents:
                content_hash = hashlib.md5(f"{doc['title']}:{doc['url']}".encode()).hexdigest()
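                # MD5 serves only as a stable dedup key here, not as a security measure.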
| session.run(""" | |
| MERGE (d:ScribdDocument {contentHash: $hash}) | |
| ON CREATE SET | |
| d.id = $id, | |
| d.title = $title, | |
| d.url = $url, | |
| d.docType = $doc_type, | |
| d.source = $source, | |
| d.harvestedAt = datetime() | |
| ON MATCH SET | |
| d.lastSeen = datetime() | |
| MERGE (s:DataSource {name: 'Scribd'}) | |
| MERGE (d)-[:HARVESTED_FROM]->(s) | |
| """, | |
| hash=content_hash, | |
| id=doc['id'], | |
| title=doc['title'], | |
| url=doc['url'], | |
| doc_type=doc['doc_type'], | |
| source=doc['source'] | |
| ) | |
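                # MERGE on contentHash keeps repeated harvests idempotent:
                # existing documents only get their lastSeen timestamp refreshed.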
| print(f" β Saved {len(self.documents)} documents to Neo4j") | |

    def save_local(self):
        """Save a local JSON copy"""
        output_file = self.output_dir / "scribd_harvest.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(self.documents, f, indent=2, ensure_ascii=False)
        print(f"  Local JSON: {output_file}")

    def run(self):
        """Run the full harvest"""
        print("=" * 60)
        print("SCRIBD SELENIUM HARVESTER")
        print("=" * 60)
        if not self.login_manual():
            return

        # Harvest saved documents
        self.harvest_saved()

        # Search for relevant topics
        search_queries = [
            "AI ethics business",
            "generative AI strategy",
            "cybersecurity threat intelligence",
            "digital transformation"
        ]
        self.harvest_searches(search_queries)

        # Save results
        self.save_to_neo4j()
        self.save_local()

        # Summary
        print("\n" + "=" * 60)
        print("HARVEST COMPLETE")
        print("=" * 60)
        print(f"  Total documents: {len(self.documents)}")
        print("=" * 60)

        input("\nPress ENTER to close the browser...")
        self.driver.quit()
        self.neo4j.close()


if __name__ == "__main__":
    harvester = ScribdSeleniumHarvester()
    harvester.run()
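
# Usage (assuming the script is saved as scribd_selenium_harvester.py):
#   pip install selenium webdriver-manager neo4j
#   python scribd_selenium_harvester.py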