#!/usr/bin/env python3
"""
πŸ“š ScribdHarvester v2.0 - Valideret Metode
==========================================
Kombinerer:
1. Cookie extraction fra Chrome browser
2. Officiel scribd-downloader bibliotek
3. Web scraping for favorites/library
4. Neo4j cloud storage
KΓΈr: pip install -r scribd_requirements.txt
python scribd_harvester_v2.py
@author WidgeTDC Neural Network
"""
import os
import sys
import json
import hashlib
import requests
import re
import subprocess
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional, Any
from dataclasses import dataclass, asdict
from urllib.parse import urljoin, urlparse
import time
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Neo4j
from neo4j import GraphDatabase
# HTML parsing
from bs4 import BeautifulSoup
# Cookie extractor
from scribd_cookie_extractor import ScribdCookieExtractor
# Image processing
try:
from PIL import Image
import io
HAS_PIL = True
except ImportError:
HAS_PIL = False
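# Pillow is optional: when available, it is used to verify the real pixel
# dimensions of downloaded images before they are kept for presentations.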
@dataclass
class ScribdDocument:
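    """Metadata for a single harvested Scribd document."""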
id: str
title: str
author: str
url: str
doc_type: str
thumbnail: str
description: str
content_hash: str
saved_at: str
local_path: str = ""
@dataclass
class ExtractedImage:
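    """An image extracted from a document, suitable for reuse in presentations."""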
id: str
source_doc_id: str
url: str
caption: str
content_hash: str
local_path: str
width: int
height: int
class ScribdHarvesterV2:
"""
Valideret Scribd harvester med cookie-baseret authentication
"""
# Neo4j AuraDB Cloud
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")
# Scribd endpoints
SCRIBD_BASE = "https://www.scribd.com"
SCRIBD_API = "https://www.scribd.com/api"
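    # Browser-like default headers applied to every request in the session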
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Accept": "application/json, text/html, */*",
"Accept-Language": "en-US,en;q=0.9,da;q=0.8",
}
    def __init__(self, output_dir: Optional[str] = None):
self.output_dir = Path(output_dir or "data/scribd_harvest")
self.image_dir = self.output_dir / "images"
self.docs_dir = self.output_dir / "documents"
for d in [self.output_dir, self.image_dir, self.docs_dir]:
d.mkdir(parents=True, exist_ok=True)
        # HTTP session that will carry the Scribd cookies
self.session = requests.Session()
self.session.headers.update(self.HEADERS)
# Neo4j
self.driver = GraphDatabase.driver(
self.NEO4J_URI,
auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
)
# Stats
self.stats = {
"documents_found": 0,
"documents_downloaded": 0,
"documents_skipped": 0,
"images_extracted": 0
}
print("πŸ“š ScribdHarvester v2.0 - Valideret Metode")
print(f" πŸ“ Output: {self.output_dir.absolute()}")
def authenticate(self) -> bool:
"""Hent og anvend cookies fra browser eller fil"""
print("\nπŸ” AUTHENTICATION")
print("-" * 40)
cookies = None
        # First: check for a manually filled-in cookie file
cookie_file = self.output_dir / "scribd_cookies.json"
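        # Expected layout of scribd_cookies.json (example values, not real tokens):
        # {
        #     "_scribd_session": "<value of the _scribd_session cookie>",
        #     "_scribd_expire": "<value of the _scribd_expire cookie>"
        # }
        # A file that still contains the 'INDSÆT' placeholder is treated as not filled in.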
if cookie_file.exists():
print(f" πŸ“„ Finder cookie fil: {cookie_file}")
try:
with open(cookie_file, 'r') as f:
data = json.load(f)
session_cookie = data.get('_scribd_session', '')
expire_cookie = data.get('_scribd_expire', '')
if session_cookie and 'INDSÆT' not in session_cookie:
cookies = {
'_scribd_session': session_cookie,
'_scribd_expire': expire_cookie
}
print(" βœ… Cookies loaded fra fil!")
else:
print(" ⚠️ Cookie fil ikke udfyldt - prøver automatisk extraction...")
except Exception as e:
print(f" ⚠️ Fejl ved læsning af cookie fil: {e}")
# DEREFTER: PrΓΈv automatisk extraction
if not cookies:
extractor = ScribdCookieExtractor()
cookies = extractor.get_cookies()
if not cookies:
return False
        # Apply the cookies to the session
for name, value in cookies.items():
self.session.cookies.set(name, value, domain=".scribd.com")
        # Verify that the cookies give a logged-in session
return self._verify_session()
def _verify_session(self) -> bool:
"""Verificer at vi er logget ind"""
try:
            # Try to fetch account info (redirects are disabled, so a login redirect shows up as a non-200 status)
response = self.session.get(
f"{self.SCRIBD_BASE}/account",
allow_redirects=False
)
if response.status_code == 200:
if 'login' not in response.url.lower():
print("βœ… Session verificeret - logget ind!")
return True
            # Try an alternative endpoint
response = self.session.get(f"{self.SCRIBD_BASE}/saved")
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'html.parser')
# Check for logged-in indicators
if soup.find('a', href=re.compile(r'/logout')):
print("βœ… Session verificeret via /saved")
return True
print("⚠️ Session ikke verificeret - cookies kan være udløbet")
return False
except Exception as e:
print(f"❌ Verification fejl: {e}")
return False
def fetch_library(self) -> List[Dict]:
"""Hent brugerens bibliotek/gemte dokumenter"""
print("\nπŸ“– FETCHING LIBRARY")
print("-" * 40)
all_items = []
        # Endpoints to try
endpoints = [
"/saved",
"/library",
"/your-library",
"/account/saved",
"/lists"
]
for endpoint in endpoints:
url = f"{self.SCRIBD_BASE}{endpoint}"
print(f" PrΓΈver: {endpoint}")
try:
response = self.session.get(url)
if response.status_code != 200:
continue
soup = BeautifulSoup(response.text, 'html.parser')
                # Find documents using the various selectors
items = self._extract_items_from_html(soup)
for item in items:
if not any(i['url'] == item['url'] for i in all_items):
all_items.append(item)
print(f" πŸ“„ {item['title'][:50]}...")
# PrΓΈv ogsΓ₯ at finde JSON data
json_items = self._extract_items_from_scripts(soup)
for item in json_items:
if not any(i['url'] == item['url'] for i in all_items):
all_items.append(item)
except Exception as e:
print(f" ⚠️ Fejl: {e}")
print(f"\n πŸ“š Fandt {len(all_items)} dokumenter total")
self.stats["documents_found"] = len(all_items)
return all_items
def _extract_items_from_html(self, soup: BeautifulSoup) -> List[Dict]:
"""Ekstraher dokumenter fra HTML"""
items = []
        # Various (CSS selector, document type) link patterns
patterns = [
('a[href*="/document/"]', 'document'),
('a[href*="/book/"]', 'book'),
('a[href*="/read/"]', 'book'),
('a[href*="/audiobook/"]', 'audiobook'),
('.doc-list-item', 'document'),
('[data-doc-id]', 'document'),
]
for selector, doc_type in patterns:
try:
elements = soup.select(selector)
for el in elements:
href = el.get('href', '')
if not href:
                        # Try to find a link among the children
link = el.find('a')
if link:
href = link.get('href', '')
if not href or '/login' in href:
continue
if not href.startswith('http'):
href = urljoin(self.SCRIBD_BASE, href)
                    # Extract the numeric document ID
match = re.search(r'/(document|book|read|audiobook)/(\d+)', href)
doc_id = match.group(2) if match else None
if not doc_id:
continue
                    # Find the title
                    title = el.get_text(strip=True)
                    if not title or len(title) < 3:
                        # find() only matches tag names, so use a CSS selector for class-based titles
                        title_el = el.select_one('h1, h2, h3, h4, .title, [class*="title"]')
if title_el:
title = title_el.get_text(strip=True)
# Find thumbnail
thumbnail = ''
img = el.find('img')
if img:
thumbnail = img.get('src', '') or img.get('data-src', '')
items.append({
'id': doc_id,
'url': href,
'title': title or f"Document {doc_id}",
'type': doc_type,
'thumbnail': thumbnail,
})
            except Exception:
                # A failing selector must not abort the remaining patterns
                pass
return items
def _extract_items_from_scripts(self, soup: BeautifulSoup) -> List[Dict]:
"""Ekstraher dokumenter fra JSON scripts i HTML"""
items = []
scripts = soup.find_all('script')
for script in scripts:
text = script.string or ''
            # Try to find embedded JSON data
patterns = [
r'window\.__INITIAL_STATE__\s*=\s*({.*?});',
r'window\.Scribd\..*?=\s*({.*?});',
r'"documents"\s*:\s*(\[.*?\])',
]
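            # These regexes are best-effort matches for embedded state objects;
            # anything that does not parse as JSON is simply ignored below.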
for pattern in patterns:
try:
match = re.search(pattern, text, re.DOTALL)
if match:
data = json.loads(match.group(1))
extracted = self._traverse_json_for_docs(data)
items.extend(extracted)
                except Exception:
                    # Partial or malformed JSON is ignored
                    pass
return items
def _traverse_json_for_docs(self, obj, depth=0) -> List[Dict]:
"""Traverser JSON for at finde dokumenter"""
items = []
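        # Depth is capped so that very deep state objects do not cause runaway recursion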
if depth > 8:
return items
if isinstance(obj, dict):
            # Check whether this object looks like a document
if 'id' in obj and ('title' in obj or 'name' in obj):
doc_id = str(obj.get('id', ''))
if doc_id.isdigit():
doc_type = obj.get('type', 'document').lower()
if doc_type in ['book', 'audiobook']:
url = f"{self.SCRIBD_BASE}/{doc_type}/{doc_id}"
else:
url = f"{self.SCRIBD_BASE}/document/{doc_id}"
items.append({
'id': doc_id,
'url': url,
'title': obj.get('title') or obj.get('name', f'Document {doc_id}'),
'type': doc_type,
'thumbnail': obj.get('thumbnail_url', obj.get('cover_url', '')),
'author': obj.get('author', {}).get('name', '') if isinstance(obj.get('author'), dict) else obj.get('author', ''),
})
for v in obj.values():
items.extend(self._traverse_json_for_docs(v, depth + 1))
elif isinstance(obj, list):
for item in obj:
items.extend(self._traverse_json_for_docs(item, depth + 1))
return items
def download_document(self, item: Dict) -> Optional[Path]:
"""Download dokument med scribdl eller direkte"""
doc_id = item.get('id', '')
url = item.get('url', '')
title = item.get('title', f'doc_{doc_id}')
# Sanitize filename
safe_title = re.sub(r'[<>:"/\\|?*]', '_', title)[:100]
print(f" πŸ“₯ Downloader: {title[:50]}...")
# Metode 1: Brug scribdl CLI
output_path = self.docs_dir / f"{doc_id}_{safe_title}"
try:
            # Try scribdl first
result = subprocess.run(
['scribdl', '-i', url],
cwd=str(self.docs_dir),
capture_output=True,
text=True,
timeout=120
)
if result.returncode == 0:
# Find downloaded files
for f in self.docs_dir.glob(f"*{doc_id}*"):
print(f" βœ… Downloaded: {f.name}")
return f
        except FileNotFoundError:
            print("      ⚠️ scribdl not installed, using the fallback method")
        except subprocess.TimeoutExpired:
            print("      ⚠️ Download timed out")
        except Exception as e:
            print(f"      ⚠️ scribdl error: {e}")
        # Method 2: download the page content directly
return self._direct_download(item)
def _direct_download(self, item: Dict) -> Optional[Path]:
"""Direkte download af dokument sider"""
doc_id = item['id']
url = item['url']
try:
response = self.session.get(url)
if response.status_code != 200:
return None
soup = BeautifulSoup(response.text, 'html.parser')
            # Find the document reader container
reader = soup.find('div', class_=re.compile(r'reader|document|pages'))
if not reader:
                # Save the raw HTML as a fallback
html_path = self.docs_dir / f"{doc_id}.html"
with open(html_path, 'w', encoding='utf-8') as f:
f.write(response.text)
return html_path
            # Find and download the page images
images = reader.find_all('img', src=True)
if images:
doc_folder = self.docs_dir / doc_id
doc_folder.mkdir(exist_ok=True)
for i, img in enumerate(images):
img_url = img['src']
if not img_url.startswith('http'):
img_url = urljoin(url, img_url)
try:
img_response = self.session.get(img_url, timeout=30)
if img_response.status_code == 200:
ext = 'jpg' if 'jpeg' in img_response.headers.get('content-type', '') else 'png'
img_path = doc_folder / f"page_{i:03d}.{ext}"
with open(img_path, 'wb') as f:
f.write(img_response.content)
                    except Exception:
                        # Skip pages that fail to download
                        pass
return doc_folder
return None
except Exception as e:
print(f" ❌ Download fejl: {e}")
return None
def extract_images_for_presentations(self, item: Dict) -> List[ExtractedImage]:
"""Ekstraher billeder egnet til præsentationer"""
images = []
url = item['url']
doc_id = item['id']
try:
response = self.session.get(url)
if response.status_code != 200:
return images
soup = BeautifulSoup(response.text, 'html.parser')
            # Find all images on the page
for idx, img in enumerate(soup.find_all('img')):
src = img.get('src', '') or img.get('data-src', '')
if not src:
continue
                # Skip icons and other small UI images
skip_patterns = ['avatar', 'icon', 'logo', 'button', 'sprite', '1x1', 'tracking']
if any(p in src.lower() for p in skip_patterns):
continue
                # Check the declared size; non-numeric width/height attributes are treated as unknown
                width_attr = str(img.get('width') or '0')
                height_attr = str(img.get('height') or '0')
                width = int(width_attr) if width_attr.isdigit() else 0
                height = int(height_attr) if height_attr.isdigit() else 0
if (width > 0 and width < 150) or (height > 0 and height < 150):
continue
                # Download the image
if not src.startswith('http'):
src = urljoin(url, src)
try:
img_response = self.session.get(src, timeout=30)
if img_response.status_code != 200:
continue
# Check actual size
if HAS_PIL:
pil_img = Image.open(io.BytesIO(img_response.content))
width, height = pil_img.size
if width < 200 or height < 150:
continue
                    # Save locally
content_hash = hashlib.md5(img_response.content).hexdigest()
ext = 'jpg' if 'jpeg' in img_response.headers.get('content-type', '') else 'png'
local_path = self.image_dir / f"{doc_id}_img_{idx}.{ext}"
with open(local_path, 'wb') as f:
f.write(img_response.content)
# Caption
caption = img.get('alt', '') or img.get('title', '')
figure = img.find_parent('figure')
if figure:
figcaption = figure.find('figcaption')
if figcaption:
caption = figcaption.get_text(strip=True)
images.append(ExtractedImage(
id=f"{doc_id}_img_{idx}",
source_doc_id=doc_id,
url=src,
caption=caption,
content_hash=content_hash,
local_path=str(local_path),
width=width,
height=height
))
                except Exception:
                    # Skip images that fail to download or decode
                    pass
if images:
print(f" πŸ–ΌοΈ {len(images)} billeder ekstraheret")
self.stats["images_extracted"] += len(images)
except Exception as e:
print(f" ⚠️ Image extraction fejl: {e}")
return images
def save_to_neo4j(self, item: Dict, local_path: Optional[Path], images: List[ExtractedImage]):
"""Gem dokument og billeder til Neo4j"""
doc_id = item['id']
content_hash = hashlib.md5(f"{item['title']}-{item['url']}".encode()).hexdigest()
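        # The content hash (title + URL) is used below to detect documents already stored in earlier runs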
with self.driver.session() as session:
# Check for duplicate
result = session.run(
"MATCH (d:ScribdDocument {contentHash: $hash}) RETURN d LIMIT 1",
hash=content_hash
)
if len(list(result)) > 0:
self.stats["documents_skipped"] += 1
return
            # Store the document node and its relationships
session.run("""
MERGE (d:ScribdDocument {id: $id})
SET d.title = $title,
d.author = $author,
d.url = $url,
d.type = $doc_type,
d.thumbnail = $thumbnail,
d.contentHash = $content_hash,
d.localPath = $local_path,
d.savedAt = datetime(),
d.source = 'Scribd'
MERGE (s:DataSource {name: 'Scribd'})
SET s.type = 'DocumentPlatform', s.lastHarvest = datetime()
MERGE (d)-[:HARVESTED_FROM]->(s)
MERGE (cat:Category {name: $doc_type})
MERGE (d)-[:BELONGS_TO]->(cat)
""",
id=doc_id,
title=item.get('title', ''),
author=item.get('author', ''),
url=item.get('url', ''),
doc_type=item.get('type', 'document'),
thumbnail=item.get('thumbnail', ''),
content_hash=content_hash,
local_path=str(local_path) if local_path else ''
)
self.stats["documents_downloaded"] += 1
            # Store the image nodes
for img in images:
session.run("""
MERGE (i:ScribdImage {id: $id})
SET i.url = $url,
i.caption = $caption,
i.contentHash = $content_hash,
i.localPath = $local_path,
i.width = $width,
i.height = $height,
i.usableForPresentations = true,
i.savedAt = datetime()
WITH i
MATCH (d:ScribdDocument {id: $source_doc_id})
MERGE (i)-[:EXTRACTED_FROM]->(d)
MERGE (cat:AssetCategory {name: 'Presentation Images'})
MERGE (i)-[:AVAILABLE_FOR]->(cat)
""",
id=img.id,
url=img.url,
caption=img.caption,
content_hash=img.content_hash,
local_path=img.local_path,
width=img.width,
height=img.height,
source_doc_id=img.source_doc_id
)
def run(self, download_docs: bool = True, extract_images: bool = True):
"""Hovedeksekveringsflow"""
print("")
print("╔══════════════════════════════════════════════════════════════╗")
print("β•‘ πŸ“š SCRIBD HARVESTER v2.0 - VALIDERET METODE β•‘")
print("β•‘ Cookie-based authentication med Neo4j Cloud storage β•‘")
print("β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•")
# Step 1: Authentication
if not self.authenticate():
print("\n❌ Authentication fejlede!")
print(" PrΓΈv at:")
print(" 1. Γ…bn Chrome og log ind pΓ₯ scribd.com")
print(" 2. Luk Chrome helt")
print(" 3. KΓΈr scriptet igen")
return self.stats
# Step 2: Fetch library
items = self.fetch_library()
if not items:
print("\n⚠️ Ingen dokumenter fundet i dit bibliotek")
print(" Check at du har gemte dokumenter pΓ₯ scribd.com/saved")
return self.stats
# Step 3: Process documents
print(f"\nβš™οΈ PROCESSING {len(items)} DOCUMENTS")
print("-" * 40)
for i, item in enumerate(items, 1):
print(f"\n[{i}/{len(items)}] {item.get('title', 'Unknown')[:50]}...")
local_path = None
images = []
# Download
if download_docs:
local_path = self.download_document(item)
# Extract images
if extract_images:
images = self.extract_images_for_presentations(item)
# Save to Neo4j
self.save_to_neo4j(item, local_path, images)
# Rate limiting
time.sleep(2)
# Summary
self._print_summary()
return self.stats
def _print_summary(self):
"""Print summary"""
print("")
print("═" * 60)
print("πŸ“Š HARVEST COMPLETE")
print("═" * 60)
print(f" πŸ“š Documents found: {self.stats['documents_found']}")
print(f" βœ… Documents downloaded: {self.stats['documents_downloaded']}")
print(f" ⏭️ Documents skipped: {self.stats['documents_skipped']}")
print(f" πŸ–ΌοΈ Images extracted: {self.stats['images_extracted']}")
print(f" πŸ“ Output directory: {self.output_dir.absolute()}")
print("═" * 60)
def close(self):
"""Cleanup"""
self.driver.close()
def main():
"""Entry point"""
import argparse
parser = argparse.ArgumentParser(description='Scribd Harvester v2.0')
parser.add_argument('--no-download', action='store_true', help='Skip document download')
parser.add_argument('--no-images', action='store_true', help='Skip image extraction')
parser.add_argument('--output', type=str, help='Output directory')
args = parser.parse_args()
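    # Example invocations (example paths, assuming dependencies from scribd_requirements.txt are installed):
    #   python scribd_harvester_v2.py
    #   python scribd_harvester_v2.py --no-images --output data/my_harvest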
harvester = ScribdHarvesterV2(output_dir=args.output)
try:
harvester.run(
download_docs=not args.no_download,
extract_images=not args.no_images
)
finally:
harvester.close()
if __name__ == "__main__":
main()