#!/usr/bin/env python3
"""
Scribd harvester via Selenium - drives a Chrome window in which you log in manually.
"""
import hashlib
import json
import os
import re
import time
from pathlib import Path
from urllib.parse import quote_plus

from neo4j import GraphDatabase
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
class ScribdSeleniumHarvester:
    """Collect Scribd document metadata through a Selenium-driven Chrome window.

    The harvester starts Chrome, waits for the operator to log in by hand,
    collects document links from the "saved" page and from a list of search
    queries, and stores the resulting records in Neo4j and in a local JSON
    file under ``data/scribd_harvest``.

    Neo4j connection settings are read from the ``NEO4J_URI``, ``NEO4J_USER``
    and ``NEO4J_PASSWORD`` environment variables when the class is defined.
    """

    NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
    NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
    # Secrets do not belong in source control: the password comes from the
    # environment only. Any password that was ever committed should be
    # treated as compromised and rotated.
    NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")

    # Queries used by run() when the caller does not supply its own.
    DEFAULT_SEARCH_QUERIES = (
        "AI ethics business",
        "generative AI strategy",
        "cybersecurity threat intelligence",
        "digital transformation",
    )

    # Saved items may be documents, books or audiobooks; search results are
    # restricted to documents and books.
    _SAVED_URL_RE = re.compile(r"/(document|book|audiobook)/(\d+)")
    _SEARCH_URL_RE = re.compile(r"/(document|book)/(\d+)")

    _MERGE_QUERY = """
        MERGE (d:ScribdDocument {contentHash: $hash})
        ON CREATE SET
            d.id = $id,
            d.title = $title,
            d.url = $url,
            d.docType = $doc_type,
            d.source = $source,
            d.harvestedAt = datetime()
        ON MATCH SET
            d.lastSeen = datetime()
        MERGE (s:DataSource {name: 'Scribd'})
        MERGE (d)-[:HARVESTED_FROM]->(s)
    """

    def __init__(self):
        """Create the output directory, the Neo4j driver and the Chrome session.

        Raises:
            RuntimeError: if ``NEO4J_PASSWORD`` is not set in the environment.
        """
        # Fail before a browser window is opened if the run cannot succeed.
        if not self.NEO4J_PASSWORD:
            raise RuntimeError("NEO4J_PASSWORD environment variable is not set")

        self.output_dir = Path("data/scribd_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.documents = []
        self.driver = None

        self.neo4j = GraphDatabase.driver(
            self.NEO4J_URI,
            auth=(self.NEO4J_USER, self.NEO4J_PASSWORD),
        )

        # No profile directory is configured here, so Chrome starts with a
        # fresh profile and the operator has to log in manually.
        chrome_options = Options()
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option("useAutomationExtension", False)

        print("Starter Chrome...")
        try:
            self.driver = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()),
                options=chrome_options,
            )
            self.driver.execute_script(
                "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
            )
        except Exception:
            # Do not leave a half-built harvester holding open connections.
            self.close()
            raise

    def close(self):
        """Quit the browser and close the Neo4j driver; safe to call twice."""
        browser = getattr(self, "driver", None)
        graph = getattr(self, "neo4j", None)
        self.driver = None
        self.neo4j = None
        try:
            if browser is not None:
                browser.quit()
        finally:
            if graph is not None:
                graph.close()

    @staticmethod
    def _boxed(lines, width=64):
        """Return *lines* framed in a box-drawing border, one row per line."""
        inner = max([width] + [len(line) + 2 for line in lines])
        top = "╔" + "═" * inner + "╗"
        bottom = "╚" + "═" * inner + "╝"
        rows = ["║ " + line.ljust(inner - 2) + " ║" for line in lines]
        return "\n".join([top] + rows + [bottom])

    @staticmethod
    def _parse_doc_url(url, pattern):
        """Return ``(doc_type, doc_id)`` parsed from *url*, or ``None``."""
        if not url:
            return None
        match = pattern.search(url)
        if match is None:
            return None
        return match.group(1), match.group(2)

    @staticmethod
    def _search_url(query):
        """Return the Scribd search URL for *query*, properly URL-encoded."""
        return f"https://www.scribd.com/search?query={quote_plus(query)}"

    @staticmethod
    def _content_hash(doc):
        """Return the MD5 hex digest of ``"<title>:<url>"`` used as merge key."""
        return hashlib.md5(f"{doc['title']}:{doc['url']}".encode()).hexdigest()

    def login_manual(self):
        """Open the Scribd login page and wait for the operator to log in.

        Returns:
            ``True`` when the saved-items page loads without redirecting to a
            login URL, ``False`` otherwise.
        """
        print("\n" + "=" * 60)
        print("MANUEL LOGIN PÅKRÆVET")
        print("=" * 60)
        self.driver.get("https://www.scribd.com/login")
        banner = self._boxed([
            "Log ind med din Google konto i Chrome vinduet",
            "Tryk ENTER her når du er logget ind...",
        ])
        print("\n" + banner + "\n")
        input()

        # A logged-out session gets bounced back to a login URL.
        self.driver.get("https://www.scribd.com/saved")
        time.sleep(2)
        if "login" in self.driver.current_url.lower():
            print("❌ Login fejlede - prøv igen")
            return False
        print("✅ Login succesfuldt!")
        return True

    def _scroll_to_bottom(self, max_scrolls=10, pause=2):
        """Scroll down until the page height stops growing or the cap is hit."""
        height_script = "return document.body.scrollHeight"
        last_height = self.driver.execute_script(height_script)
        for scroll_count in range(1, max_scrolls + 1):
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(pause)
            new_height = self.driver.execute_script(height_script)
            if new_height == last_height:
                break
            last_height = new_height
            print(f" Scrolled {scroll_count}x...")

    def harvest_saved(self):
        """Collect the documents listed on the operator's saved-items page.

        Returns:
            ``self.documents``, extended with one dict per newly found link
            (keys ``id``, ``title``, ``url``, ``doc_type``, ``source``).
        """
        print("\nHARVESTING SAVED DOCUMENTS")
        print("-" * 40)
        self.driver.get("https://www.scribd.com/saved")
        time.sleep(3)

        # The page loads more items as it is scrolled.
        self._scroll_to_bottom(max_scrolls=10)

        doc_elements = self.driver.find_elements(
            By.CSS_SELECTOR,
            'a[href*="/document/"], a[href*="/book/"], a[href*="/audiobook/"]',
        )
        seen_urls = {doc["url"] for doc in self.documents}
        for elem in doc_elements:
            try:
                url = elem.get_attribute('href')
                parsed = self._parse_doc_url(url, self._SAVED_URL_RE)
                if parsed is None or url in seen_urls:
                    continue
                doc_type, doc_id = parsed
                title = (
                    elem.text.strip()
                    or elem.get_attribute('title')
                    or f"Document {doc_id}"
                )
            except WebDriverException as exc:
                # Best effort: an element may go stale while the page updates.
                print(f" Skipped element ({type(exc).__name__})")
                continue

            seen_urls.add(url)
            self.documents.append({
                "id": doc_id,
                "title": title[:200],
                "url": url,
                "doc_type": doc_type,
                "source": "saved",
            })
            print(f" {title[:50]}...")

        print(f"\n ✅ Found {len(self.documents)} documents")
        return self.documents

    def harvest_searches(self, queries: list):
        """Run each search query and collect up to ten result links per query.

        Args:
            queries: search strings; each is URL-encoded before use.
        """
        print("\nSEARCHING SCRIBD")
        print("-" * 40)
        known_urls = {doc["url"] for doc in self.documents}
        for query in queries:
            print(f"\n Søger: {query}")
            self.driver.get(self._search_url(query))
            time.sleep(3)

            results = self.driver.find_elements(
                By.CSS_SELECTOR,
                'a[href*="/document/"], a[href*="/book/"]',
            )
            count = 0
            for elem in results[:10]:  # at most ten links inspected per search
                try:
                    url = elem.get_attribute('href')
                    parsed = self._parse_doc_url(url, self._SEARCH_URL_RE)
                    if parsed is None or url in known_urls:
                        continue
                    doc_type, doc_id = parsed
                    title = elem.text.strip()[:200] or f"Search result {doc_id}"
                except WebDriverException:
                    # Best effort: skip links that cannot be read.
                    continue

                known_urls.add(url)
                self.documents.append({
                    "id": doc_id,
                    "title": title,
                    "url": url,
                    "doc_type": doc_type,
                    "source": f"search:{query}",
                })
                count += 1
            print(f" Found {count} new documents")

    def save_to_neo4j(self):
        """Merge every collected document into Neo4j, keyed by content hash."""
        print("\nSAVING TO NEO4J")
        print("-" * 40)
        with self.neo4j.session() as session:
            for doc in self.documents:
                session.run(
                    self._MERGE_QUERY,
                    {
                        "hash": self._content_hash(doc),
                        "id": doc['id'],
                        "title": doc['title'],
                        "url": doc['url'],
                        "doc_type": doc['doc_type'],
                        "source": doc['source'],
                    },
                )
        print(f" ✅ Saved {len(self.documents)} documents to Neo4j")

    def save_local(self):
        """Write the collected documents to ``scribd_harvest.json`` as UTF-8."""
        output_file = self.output_dir / "scribd_harvest.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(self.documents, f, indent=2, ensure_ascii=False)
        print(f" Local JSON: {output_file}")

    def run(self, search_queries=None):
        """Run the full harvest: login, saved items, searches, then persist.

        Args:
            search_queries: optional iterable of search strings; defaults to
                ``DEFAULT_SEARCH_QUERIES``.

        The browser and the Neo4j driver are always released, including when
        login fails or a step raises.
        """
        if search_queries is None:
            search_queries = self.DEFAULT_SEARCH_QUERIES
        try:
            print("=" * 60)
            print("SCRIBD SELENIUM HARVESTER")
            print("=" * 60)
            if not self.login_manual():
                return

            self.harvest_saved()
            self.harvest_searches(list(search_queries))

            self.save_to_neo4j()
            self.save_local()

            print("\n" + "=" * 60)
            print("HARVEST COMPLETE")
            print("=" * 60)
            print(f" Total documents: {len(self.documents)}")
            print("=" * 60)
            input("\nTryk ENTER for at lukke browseren...")
        finally:
            self.close()
if __name__ == "__main__":
    # Script entry point: build the harvester (starts Chrome and creates the
    # Neo4j driver) and run the interactive harvest.
    harvester = ScribdSeleniumHarvester()
    harvester.run()