#!/usr/bin/env python3
"""
Scribd Harvester via Selenium - uses your real Chrome session
"""
import json
import time
import hashlib
import re
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from neo4j import GraphDatabase


class ScribdSeleniumHarvester:
    NEO4J_URI = "neo4j+s://054eff27.databases.neo4j.io"
    NEO4J_USER = "neo4j"
    NEO4J_PASSWORD = "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"

    def __init__(self):
        self.output_dir = Path("data/scribd_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Chrome options (note: without a --user-data-dir argument Chrome starts
        # with a fresh automation profile rather than your existing one)
        chrome_options = Options()
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option("useAutomationExtension", False)

        # Start browser
        print("🌐 Starting Chrome...")
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options
        )
        # Hide the webdriver flag to reduce automation detection
        self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

        # Neo4j
        self.neo4j = GraphDatabase.driver(
            self.NEO4J_URI,
            auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
        )
        self.documents = []

    def login_manual(self):
        """Open Scribd and wait for a manual login"""
        print("\n" + "=" * 60)
        print("🔐 MANUAL LOGIN REQUIRED")
        print("=" * 60)
        self.driver.get("https://www.scribd.com/login")
        print("""
        ╔══════════════════════════════════════════════════════════════════╗
        ║  Log in with your Google account in the Chrome window             ║
        ║  Press ENTER here when you are logged in...                       ║
        ╚══════════════════════════════════════════════════════════════════╝
        """)
        input()

        # Verify the login by visiting a page that requires authentication
        self.driver.get("https://www.scribd.com/saved")
        time.sleep(2)
        if "login" in self.driver.current_url.lower():
            print("❌ Login failed - please try again")
            return False
        print("✅ Login successful!")
        return True

    def harvest_saved(self):
        """Fetch the saved documents"""
        print("\n📚 HARVESTING SAVED DOCUMENTS")
        print("-" * 40)
        self.driver.get("https://www.scribd.com/saved")
        time.sleep(3)

        # Scroll to load all documents (the list is lazy-loaded)
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        scroll_count = 0
        max_scrolls = 10
        while scroll_count < max_scrolls:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
            scroll_count += 1
            print(f" 📜 Scrolled {scroll_count}x...")

        # Find document links
        doc_elements = self.driver.find_elements(
            By.CSS_SELECTOR,
            'a[href*="/document/"], a[href*="/book/"], a[href*="/audiobook/"]'
        )
        seen_urls = set()
        for elem in doc_elements:
            try:
                url = elem.get_attribute('href')
                if not url or url in seen_urls:
                    continue
                seen_urls.add(url)

                # Extract the type and ID from the URL
                match = re.search(r'/(document|book|audiobook)/(\d+)', url)
                if not match:
                    continue
                doc_type = match.group(1)
                doc_id = match.group(2)

                # Get the title, falling back to the title attribute or a placeholder
                title = elem.text.strip() or elem.get_attribute('title') or f"Document {doc_id}"
                doc = {
                    "id": doc_id,
                    "title": title[:200],
                    "url": url,
                    "doc_type": doc_type,
                    "source": "saved"
                }
                self.documents.append(doc)
                print(f" 📄 {title[:50]}...")
            except Exception:
                continue

        print(f"\n ✅ Found {len(self.documents)} documents")
        return self.documents

    def harvest_searches(self, queries: list):
        """Search Scribd for documents"""
        print("\n🔍 SEARCHING SCRIBD")
        print("-" * 40)
        for query in queries:
            print(f"\n Searching: {query}")
            search_url = f"https://www.scribd.com/search?query={query.replace(' ', '+')}"
            self.driver.get(search_url)
            time.sleep(3)

            # Find results
            results = self.driver.find_elements(
                By.CSS_SELECTOR,
                'a[href*="/document/"], a[href*="/book/"]'
            )
            count = 0
            for elem in results[:10]:  # Max 10 per search
                try:
                    url = elem.get_attribute('href')
                    match = re.search(r'/(document|book)/(\d+)', url)
                    if not match:
                        continue

                    # Skip documents we already collected
                    if any(d['url'] == url for d in self.documents):
                        continue

                    doc = {
                        "id": match.group(2),
                        "title": elem.text.strip()[:200] or f"Search result {match.group(2)}",
                        "url": url,
                        "doc_type": match.group(1),
                        "source": f"search:{query}"
                    }
                    self.documents.append(doc)
                    count += 1
                except Exception:
                    continue
            print(f" Found {count} new documents")

    def save_to_neo4j(self):
        """Save all documents to Neo4j"""
        print("\n💾 SAVING TO NEO4J")
        print("-" * 40)
        with self.neo4j.session() as session:
            for doc in self.documents:
                # Deduplicate on a hash of title + URL
                content_hash = hashlib.md5(f"{doc['title']}:{doc['url']}".encode()).hexdigest()
                session.run(
                    """
                    MERGE (d:ScribdDocument {contentHash: $hash})
                    ON CREATE SET
                        d.id = $id,
                        d.title = $title,
                        d.url = $url,
                        d.docType = $doc_type,
                        d.source = $source,
                        d.harvestedAt = datetime()
                    ON MATCH SET
                        d.lastSeen = datetime()
                    MERGE (s:DataSource {name: 'Scribd'})
                    MERGE (d)-[:HARVESTED_FROM]->(s)
                    """,
                    hash=content_hash,
                    id=doc['id'],
                    title=doc['title'],
                    url=doc['url'],
                    doc_type=doc['doc_type'],
                    source=doc['source']
                )
        print(f" ✅ Saved {len(self.documents)} documents to Neo4j")

    def save_local(self):
        """Save a local JSON copy"""
        output_file = self.output_dir / "scribd_harvest.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(self.documents, f, indent=2, ensure_ascii=False)
        print(f" 📁 Local JSON: {output_file}")

    def run(self):
        """Run the full harvest"""
        print("=" * 60)
        print("📚 SCRIBD SELENIUM HARVESTER")
        print("=" * 60)
        if not self.login_manual():
            return

        # Harvest saved documents
        self.harvest_saved()

        # Search for relevant topics
        search_queries = [
            "AI ethics business",
            "generative AI strategy",
            "cybersecurity threat intelligence",
            "digital transformation"
        ]
        self.harvest_searches(search_queries)

        # Save results
        self.save_to_neo4j()
        self.save_local()

        # Summary
        print("\n" + "=" * 60)
        print("📊 HARVEST COMPLETE")
        print("=" * 60)
        print(f" 📄 Total documents: {len(self.documents)}")
        print("=" * 60)

        input("\nPress ENTER to close the browser...")
        self.driver.quit()
        self.neo4j.close()
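

# A minimal read-back sketch (not part of the harvester): after a run, the
# harvested documents can be verified directly in Neo4j. It reuses the
# connection constants above and assumes only the ScribdDocument/DataSource
# graph written by save_to_neo4j(); uncomment to use.
#
#     driver = GraphDatabase.driver(
#         ScribdSeleniumHarvester.NEO4J_URI,
#         auth=(ScribdSeleniumHarvester.NEO4J_USER, ScribdSeleniumHarvester.NEO4J_PASSWORD),
#     )
#     with driver.session() as session:
#         for record in session.run(
#             "MATCH (d:ScribdDocument)-[:HARVESTED_FROM]->(:DataSource {name: 'Scribd'}) "
#             "RETURN d.title AS title, d.url AS url "
#             "ORDER BY d.harvestedAt DESC LIMIT 10"
#         ):
#             print(record["title"], record["url"])
#     driver.close()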


if __name__ == "__main__":
    harvester = ScribdSeleniumHarvester()
    harvester.run()