#!/usr/bin/env python3
"""
πŸ“š ScribdHarvester v2.0 - Validated Method
==========================================

Combines:
1. Cookie extraction from the Chrome browser
2. The official scribd-downloader library
3. Web scraping of favorites/library
4. Neo4j cloud storage

Run:
    pip install -r scribd_requirements.txt
    python scribd_harvester_v2.py

@author WidgeTDC Neural Network
"""

import os
import sys
import json
import hashlib
import requests
import re
import subprocess
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional, Any
from dataclasses import dataclass, asdict
from urllib.parse import urljoin, urlparse
import time

from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Neo4j
from neo4j import GraphDatabase

# HTML parsing
from bs4 import BeautifulSoup

# Cookie extractor
from scribd_cookie_extractor import ScribdCookieExtractor

# Image processing (optional)
try:
    from PIL import Image
    import io
    HAS_PIL = True
except ImportError:
    HAS_PIL = False


@dataclass
class ScribdDocument:
    id: str
    title: str
    author: str
    url: str
    doc_type: str
    thumbnail: str
    description: str
    content_hash: str
    saved_at: str
    local_path: str = ""


@dataclass
class ExtractedImage:
    id: str
    source_doc_id: str
    url: str
    caption: str
    content_hash: str
    local_path: str
    width: int
    height: int


class ScribdHarvesterV2:
    """
    Validated Scribd harvester with cookie-based authentication
    """

    # Neo4j AuraDB Cloud
    NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
    NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")

    # Scribd endpoints
    SCRIBD_BASE = "https://www.scribd.com"
    SCRIBD_API = "https://www.scribd.com/api"

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "application/json, text/html, */*",
        "Accept-Language": "en-US,en;q=0.9,da;q=0.8",
    }

    def __init__(self, output_dir: str = None):
        self.output_dir = Path(output_dir or "data/scribd_harvest")
        self.image_dir = self.output_dir / "images"
        self.docs_dir = self.output_dir / "documents"
        for d in [self.output_dir, self.image_dir, self.docs_dir]:
            d.mkdir(parents=True, exist_ok=True)

        # Session with cookies
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)

        # Neo4j
        self.driver = GraphDatabase.driver(
            self.NEO4J_URI,
            auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
        )

        # Stats
        self.stats = {
            "documents_found": 0,
            "documents_downloaded": 0,
            "documents_skipped": 0,
            "images_extracted": 0
        }

        print("πŸ“š ScribdHarvester v2.0 - Validated Method")
        print(f" πŸ“ Output: {self.output_dir.absolute()}")

    def authenticate(self) -> bool:
        """Fetch cookies from the browser or a file and apply them to the session"""
        print("\nπŸ” AUTHENTICATION")
        print("-" * 40)

        cookies = None

        # FIRST: check for a manual cookie file
        cookie_file = self.output_dir / "scribd_cookies.json"
        if cookie_file.exists():
            print(f" πŸ“„ Found cookie file: {cookie_file}")
            try:
                with open(cookie_file, 'r') as f:
                    data = json.load(f)

                session_cookie = data.get('_scribd_session', '')
                expire_cookie = data.get('_scribd_expire', '')

                # 'INDSΓ†T' ("INSERT") is the placeholder text left in the template file
                if session_cookie and 'INDSΓ†T' not in session_cookie:
                    cookies = {
                        '_scribd_session': session_cookie,
                        '_scribd_expire': expire_cookie
                    }
                    print(" βœ… Cookies loaded from file!")
                else:
                    print(" ⚠️ Cookie file not filled in - trying automatic extraction...")
            except Exception as e:
                print(f" ⚠️ Error reading cookie file: {e}")

        # THEN: try automatic extraction
        if not cookies:
            extractor = ScribdCookieExtractor()
            cookies = extractor.get_cookies()

        if not cookies:
            return False

        # Apply the cookies to the session
        for name, value in cookies.items():
            self.session.cookies.set(name, value, domain=".scribd.com")

        # Verify
        return self._verify_session()
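    # A minimal sketch of the manual cookie file that authenticate() looks for at
    # <output_dir>/scribd_cookies.json. The values below are placeholders, not real
    # credentials; _scribd_session must be filled in, _scribd_expire may stay empty:
    #
    #   {
    #       "_scribd_session": "<value of the _scribd_session cookie from your browser>",
    #       "_scribd_expire": "<value of the _scribd_expire cookie from your browser>"
    #   }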
    def _verify_session(self) -> bool:
        """Verify that we are logged in"""
        try:
            # Try to fetch account info
            response = self.session.get(
                f"{self.SCRIBD_BASE}/account",
                allow_redirects=False
            )

            if response.status_code == 200:
                if 'login' not in response.url.lower():
                    print("βœ… Session verified - logged in!")
                    return True

            # Try an alternative endpoint
            response = self.session.get(f"{self.SCRIBD_BASE}/saved")
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')

                # Check for logged-in indicators
                if soup.find('a', href=re.compile(r'/logout')):
                    print("βœ… Session verified via /saved")
                    return True

            print("⚠️ Session not verified - cookies may have expired")
            return False

        except Exception as e:
            print(f"❌ Verification error: {e}")
            return False

    def fetch_library(self) -> List[Dict]:
        """Fetch the user's library / saved documents"""
        print("\nπŸ“– FETCHING LIBRARY")
        print("-" * 40)

        all_items = []

        # Endpoints to try
        endpoints = [
            "/saved",
            "/library",
            "/your-library",
            "/account/saved",
            "/lists"
        ]

        for endpoint in endpoints:
            url = f"{self.SCRIBD_BASE}{endpoint}"
            print(f" Trying: {endpoint}")

            try:
                response = self.session.get(url)
                if response.status_code != 200:
                    continue

                soup = BeautifulSoup(response.text, 'html.parser')

                # Find documents using various selectors
                items = self._extract_items_from_html(soup)
                for item in items:
                    if not any(i['url'] == item['url'] for i in all_items):
                        all_items.append(item)
                        print(f" πŸ“„ {item['title'][:50]}...")

                # Also try to find JSON data embedded in the page
                json_items = self._extract_items_from_scripts(soup)
                for item in json_items:
                    if not any(i['url'] == item['url'] for i in all_items):
                        all_items.append(item)

            except Exception as e:
                print(f" ⚠️ Error: {e}")

        print(f"\n πŸ“š Found {len(all_items)} documents in total")
        self.stats["documents_found"] = len(all_items)
        return all_items

    def _extract_items_from_html(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract documents from HTML"""
        items = []

        # Various link patterns
        patterns = [
            ('a[href*="/document/"]', 'document'),
            ('a[href*="/book/"]', 'book'),
            ('a[href*="/read/"]', 'book'),
            ('a[href*="/audiobook/"]', 'audiobook'),
            ('.doc-list-item', 'document'),
            ('[data-doc-id]', 'document'),
        ]

        for selector, doc_type in patterns:
            try:
                elements = soup.select(selector)
                for el in elements:
                    href = el.get('href', '')
                    if not href:
                        # Try to find a link among the children
                        link = el.find('a')
                        if link:
                            href = link.get('href', '')

                    if not href or '/login' in href:
                        continue

                    if not href.startswith('http'):
                        href = urljoin(self.SCRIBD_BASE, href)

                    # Extract the ID
                    match = re.search(r'/(document|book|read|audiobook)/(\d+)', href)
                    doc_id = match.group(2) if match else None
                    if not doc_id:
                        continue

                    # Find the title
                    title = el.get_text(strip=True)
                    if not title or len(title) < 3:
                        # find() only accepts tag names, so use a CSS selector here
                        title_el = el.select_one('h1, h2, h3, h4, .title, [class*="title"]')
                        if title_el:
                            title = title_el.get_text(strip=True)

                    # Find thumbnail
                    thumbnail = ''
                    img = el.find('img')
                    if img:
                        thumbnail = img.get('src', '') or img.get('data-src', '')

                    items.append({
                        'id': doc_id,
                        'url': href,
                        'title': title or f"Document {doc_id}",
                        'type': doc_type,
                        'thumbnail': thumbnail,
                    })
            except Exception:
                continue

        return items
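    # Both the HTML and the embedded-JSON extraction normalise hits into plain dicts;
    # a sketch of the shape produced (values are illustrative, not from a real library;
    # items found via embedded JSON may additionally carry an 'author' key):
    #
    #   {
    #       "id": "123456789",                                    # numeric Scribd id as a string
    #       "url": "https://www.scribd.com/document/123456789",   # absolute URL
    #       "title": "Some saved document",
    #       "type": "document",                                   # document | book | audiobook
    #       "thumbnail": "https://.../cover.jpg",
    #   }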
    def _extract_items_from_scripts(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract documents from JSON embedded in script tags"""
        items = []

        scripts = soup.find_all('script')
        for script in scripts:
            text = script.string or ''

            # Try to find JSON data
            patterns = [
                r'window\.__INITIAL_STATE__\s*=\s*({.*?});',
                r'window\.Scribd\..*?=\s*({.*?});',
                r'"documents"\s*:\s*(\[.*?\])',
            ]

            for pattern in patterns:
                try:
                    match = re.search(pattern, text, re.DOTALL)
                    if match:
                        data = json.loads(match.group(1))
                        extracted = self._traverse_json_for_docs(data)
                        items.extend(extracted)
                except Exception:
                    # Not valid JSON (or not a structure we recognise) - skip
                    continue

        return items

    def _traverse_json_for_docs(self, obj, depth=0) -> List[Dict]:
        """Traverse JSON looking for documents"""
        items = []

        if depth > 8:
            return items

        if isinstance(obj, dict):
            # Check whether this looks like a document
            if 'id' in obj and ('title' in obj or 'name' in obj):
                doc_id = str(obj.get('id', ''))
                if doc_id.isdigit():
                    doc_type = str(obj.get('type') or 'document').lower()
                    if doc_type in ['book', 'audiobook']:
                        url = f"{self.SCRIBD_BASE}/{doc_type}/{doc_id}"
                    else:
                        url = f"{self.SCRIBD_BASE}/document/{doc_id}"

                    items.append({
                        'id': doc_id,
                        'url': url,
                        'title': obj.get('title') or obj.get('name', f'Document {doc_id}'),
                        'type': doc_type,
                        'thumbnail': obj.get('thumbnail_url', obj.get('cover_url', '')),
                        'author': obj.get('author', {}).get('name', '') if isinstance(obj.get('author'), dict) else obj.get('author', ''),
                    })

            for v in obj.values():
                items.extend(self._traverse_json_for_docs(v, depth + 1))

        elif isinstance(obj, list):
            for item in obj:
                items.extend(self._traverse_json_for_docs(item, depth + 1))

        return items

    def download_document(self, item: Dict) -> Optional[Path]:
        """Download a document via scribdl or directly"""
        doc_id = item.get('id', '')
        url = item.get('url', '')
        title = item.get('title', f'doc_{doc_id}')

        # Sanitize filename
        safe_title = re.sub(r'[<>:"/\\|?*]', '_', title)[:100]

        print(f" πŸ“₯ Downloading: {title[:50]}...")

        # Method 1: use the scribdl CLI
        output_path = self.docs_dir / f"{doc_id}_{safe_title}"
        try:
            # Try scribdl first
            result = subprocess.run(
                ['scribdl', '-i', url],
                cwd=str(self.docs_dir),
                capture_output=True,
                text=True,
                timeout=120
            )

            if result.returncode == 0:
                # Find downloaded files
                for f in self.docs_dir.glob(f"*{doc_id}*"):
                    print(f" βœ… Downloaded: {f.name}")
                    return f

        except FileNotFoundError:
            print(" ⚠️ scribdl not installed, using the fallback method")
        except subprocess.TimeoutExpired:
            print(" ⚠️ Download timed out")
        except Exception as e:
            print(f" ⚠️ scribdl error: {e}")

        # Method 2: direct download
        return self._direct_download(item)

    def _direct_download(self, item: Dict) -> Optional[Path]:
        """Direct download of document pages"""
        doc_id = item['id']
        url = item['url']

        try:
            response = self.session.get(url)
            if response.status_code != 200:
                return None

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find the document reader
            reader = soup.find('div', class_=re.compile(r'reader|document|pages'))
            if not reader:
                # Save the HTML as a fallback
                html_path = self.docs_dir / f"{doc_id}.html"
                with open(html_path, 'w', encoding='utf-8') as f:
                    f.write(response.text)
                return html_path

            # Find and download page images
            images = reader.find_all('img', src=True)
            if images:
                doc_folder = self.docs_dir / doc_id
                doc_folder.mkdir(exist_ok=True)

                for i, img in enumerate(images):
                    img_url = img['src']
                    if not img_url.startswith('http'):
                        img_url = urljoin(url, img_url)

                    try:
                        img_response = self.session.get(img_url, timeout=30)
                        if img_response.status_code == 200:
                            ext = 'jpg' if 'jpeg' in img_response.headers.get('content-type', '') else 'png'
                            img_path = doc_folder / f"page_{i:03d}.{ext}"
                            with open(img_path, 'wb') as f:
                                f.write(img_response.content)
                    except Exception:
                        # Skip pages whose image could not be fetched
                        continue

                return doc_folder

            return None

        except Exception as e:
            print(f" ❌ Download error: {e}")
            return None
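    # Roughly what download_document() runs when it shells out to scribd-downloader,
    # shown with an illustrative document URL (the real call uses the item's own URL
    # and runs with cwd set to the documents folder):
    #
    #   cd data/scribd_harvest/documents
    #   scribdl -i https://www.scribd.com/document/123456789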
    def extract_images_for_presentations(self, item: Dict) -> List[ExtractedImage]:
        """Extract images suitable for presentations"""
        images = []
        url = item['url']
        doc_id = item['id']

        try:
            response = self.session.get(url)
            if response.status_code != 200:
                return images

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all images
            for idx, img in enumerate(soup.find_all('img')):
                src = img.get('src', '') or img.get('data-src', '')
                if not src:
                    continue

                # Skip icons and small images
                skip_patterns = ['avatar', 'icon', 'logo', 'button', 'sprite', '1x1', 'tracking']
                if any(p in src.lower() for p in skip_patterns):
                    continue

                # Check the declared size (attributes like width="100%" are not usable)
                try:
                    width = int(img.get('width', 0) or 0)
                    height = int(img.get('height', 0) or 0)
                except (TypeError, ValueError):
                    width = height = 0
                if (width > 0 and width < 150) or (height > 0 and height < 150):
                    continue

                # Download the image
                if not src.startswith('http'):
                    src = urljoin(url, src)

                try:
                    img_response = self.session.get(src, timeout=30)
                    if img_response.status_code != 200:
                        continue

                    # Check actual size
                    if HAS_PIL:
                        pil_img = Image.open(io.BytesIO(img_response.content))
                        width, height = pil_img.size
                        if width < 200 or height < 150:
                            continue

                    # Save locally
                    content_hash = hashlib.md5(img_response.content).hexdigest()
                    ext = 'jpg' if 'jpeg' in img_response.headers.get('content-type', '') else 'png'
                    local_path = self.image_dir / f"{doc_id}_img_{idx}.{ext}"
                    with open(local_path, 'wb') as f:
                        f.write(img_response.content)

                    # Caption
                    caption = img.get('alt', '') or img.get('title', '')
                    figure = img.find_parent('figure')
                    if figure:
                        figcaption = figure.find('figcaption')
                        if figcaption:
                            caption = figcaption.get_text(strip=True)

                    images.append(ExtractedImage(
                        id=f"{doc_id}_img_{idx}",
                        source_doc_id=doc_id,
                        url=src,
                        caption=caption,
                        content_hash=content_hash,
                        local_path=str(local_path),
                        width=width,
                        height=height
                    ))

                except Exception:
                    continue

            if images:
                print(f" πŸ–ΌοΈ {len(images)} images extracted")
                self.stats["images_extracted"] += len(images)

        except Exception as e:
            print(f" ⚠️ Image extraction error: {e}")

        return images

    def save_to_neo4j(self, item: Dict, local_path: Optional[Path], images: List[ExtractedImage]):
        """Save document and images to Neo4j"""
        doc_id = item['id']
        content_hash = hashlib.md5(f"{item['title']}-{item['url']}".encode()).hexdigest()

        with self.driver.session() as session:
            # Check for duplicate
            result = session.run(
                "MATCH (d:ScribdDocument {contentHash: $hash}) RETURN d LIMIT 1",
                hash=content_hash
            )
            if len(list(result)) > 0:
                self.stats["documents_skipped"] += 1
                return

            # Save the document
            session.run("""
                MERGE (d:ScribdDocument {id: $id})
                SET d.title = $title,
                    d.author = $author,
                    d.url = $url,
                    d.type = $doc_type,
                    d.thumbnail = $thumbnail,
                    d.contentHash = $content_hash,
                    d.localPath = $local_path,
                    d.savedAt = datetime(),
                    d.source = 'Scribd'

                MERGE (s:DataSource {name: 'Scribd'})
                SET s.type = 'DocumentPlatform',
                    s.lastHarvest = datetime()

                MERGE (d)-[:HARVESTED_FROM]->(s)

                MERGE (cat:Category {name: $doc_type})
                MERGE (d)-[:BELONGS_TO]->(cat)
            """,
                id=doc_id,
                title=item.get('title', ''),
                author=item.get('author', ''),
                url=item.get('url', ''),
                doc_type=item.get('type', 'document'),
                thumbnail=item.get('thumbnail', ''),
                content_hash=content_hash,
                local_path=str(local_path) if local_path else ''
            )

            self.stats["documents_downloaded"] += 1

            # Save the images
            for img in images:
                session.run("""
                    MERGE (i:ScribdImage {id: $id})
                    SET i.url = $url,
                        i.caption = $caption,
                        i.contentHash = $content_hash,
                        i.localPath = $local_path,
                        i.width = $width,
                        i.height = $height,
                        i.usableForPresentations = true,
                        i.savedAt = datetime()

                    WITH i
                    MATCH (d:ScribdDocument {id: $source_doc_id})
                    MERGE (i)-[:EXTRACTED_FROM]->(d)

                    MERGE (cat:AssetCategory {name: 'Presentation Images'})
                    MERGE (i)-[:AVAILABLE_FOR]->(cat)
                """,
                    id=img.id,
                    url=img.url,
                    caption=img.caption,
                    content_hash=img.content_hash,
                    local_path=img.local_path,
                    width=img.width,
                    height=img.height,
                    source_doc_id=img.source_doc_id
                )
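    # A small sketch of how the stored graph can be queried afterwards, assuming the
    # labels and relationships written by save_to_neo4j() above (run it in the Neo4j
    # Browser or through the same driver):
    #
    #   MATCH (i:ScribdImage {usableForPresentations: true})-[:EXTRACTED_FROM]->(d:ScribdDocument)
    #   RETURN d.title, i.localPath, i.width, i.height
    #   ORDER BY d.title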
    def run(self, download_docs: bool = True, extract_images: bool = True):
        """Main execution flow"""
        print("")
        print("╔══════════════════════════════════════════════════════════════╗")
        print("β•‘       πŸ“š SCRIBD HARVESTER v2.0 - VALIDATED METHOD             β•‘")
        print("β•‘       Cookie-based authentication with Neo4j Cloud storage    β•‘")
        print("β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•")

        # Step 1: Authentication
        if not self.authenticate():
            print("\n❌ Authentication failed!")
            print(" Try the following:")
            print(" 1. Open Chrome and log in to scribd.com")
            print(" 2. Close Chrome completely")
            print(" 3. Run the script again")
            return self.stats

        # Step 2: Fetch library
        items = self.fetch_library()
        if not items:
            print("\n⚠️ No documents found in your library")
            print(" Check that you have saved documents at scribd.com/saved")
            return self.stats

        # Step 3: Process documents
        print(f"\nβš™οΈ PROCESSING {len(items)} DOCUMENTS")
        print("-" * 40)

        for i, item in enumerate(items, 1):
            print(f"\n[{i}/{len(items)}] {item.get('title', 'Unknown')[:50]}...")

            local_path = None
            images = []

            # Download
            if download_docs:
                local_path = self.download_document(item)

            # Extract images
            if extract_images:
                images = self.extract_images_for_presentations(item)

            # Save to Neo4j
            self.save_to_neo4j(item, local_path, images)

            # Rate limiting
            time.sleep(2)

        # Summary
        self._print_summary()
        return self.stats

    def _print_summary(self):
        """Print summary"""
        print("")
        print("═" * 60)
        print("πŸ“Š HARVEST COMPLETE")
        print("═" * 60)
        print(f" πŸ“š Documents found: {self.stats['documents_found']}")
        print(f" βœ… Documents downloaded: {self.stats['documents_downloaded']}")
        print(f" ⏭️ Documents skipped: {self.stats['documents_skipped']}")
        print(f" πŸ–ΌοΈ Images extracted: {self.stats['images_extracted']}")
        print(f" πŸ“ Output directory: {self.output_dir.absolute()}")
        print("═" * 60)

    def close(self):
        """Cleanup"""
        self.driver.close()


def main():
    """Entry point"""
    import argparse

    parser = argparse.ArgumentParser(description='Scribd Harvester v2.0')
    parser.add_argument('--no-download', action='store_true', help='Skip document download')
    parser.add_argument('--no-images', action='store_true', help='Skip image extraction')
    parser.add_argument('--output', type=str, help='Output directory')
    args = parser.parse_args()

    harvester = ScribdHarvesterV2(output_dir=args.output)
    try:
        harvester.run(
            download_docs=not args.no_download,
            extract_images=not args.no_images
        )
    finally:
        harvester.close()


if __name__ == "__main__":
    main()
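# Usage examples for the CLI defined in main() above (paths are illustrative;
# by default everything is written under data/scribd_harvest):
#
#   python scribd_harvester_v2.py
#   python scribd_harvester_v2.py --no-download             # skip downloads, still extract images and store metadata
#   python scribd_harvester_v2.py --no-images --output /tmp/scribd_harvest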