Spaces:
Paused
Paused
| #!/usr/bin/env python3 | |
| """ | |
| π ScribdHarvester v2.0 - Valideret Metode | |
| ========================================== | |
| Kombinerer: | |
| 1. Cookie extraction fra Chrome browser | |
| 2. Officiel scribd-downloader bibliotek | |
| 3. Web scraping for favorites/library | |
| 4. Neo4j cloud storage | |
| Kør: pip install -r scribd_requirements.txt | |
| python scribd_harvester_v2.py | |
| @author WidgeTDC Neural Network | |
| """ | |
| import os | |
| import sys | |
| import json | |
| import hashlib | |
| import requests | |
| import re | |
| import subprocess | |
| from pathlib import Path | |
| from datetime import datetime | |
| from typing import List, Dict, Optional, Any | |
| from dataclasses import dataclass, asdict | |
| from urllib.parse import urljoin, urlparse | |
| import time | |
| from dotenv import load_dotenv | |
| # Load environment variables | |
| load_dotenv() | |
| # Neo4j | |
| from neo4j import GraphDatabase | |
| # HTML parsing | |
| from bs4 import BeautifulSoup | |
| # Cookie extractor | |
| from scribd_cookie_extractor import ScribdCookieExtractor | |
| # Image processing | |
# Optional Pillow support: used later (extract_images_for_presentations)
# to measure downloaded images so tiny icons can be filtered by real
# pixel size. The harvester degrades gracefully when Pillow is absent.
try:
    from PIL import Image
    import io
    HAS_PIL = True
except ImportError:
    HAS_PIL = False  # pixel-size filtering is skipped without Pillow
@dataclass
class ScribdDocument:
    """Metadata record for a single harvested Scribd document.

    BUGFIX: the @dataclass decorator was missing, so these were plain
    class-level annotations and the class had no generated __init__ —
    any ScribdDocument(...) call would have raised TypeError. The file
    already imports dataclass/asdict, and the defaulted last field
    shows dataclass semantics were intended.
    """
    id: str            # Scribd numeric document id (kept as a string)
    title: str
    author: str
    url: str           # canonical scribd.com URL
    doc_type: str      # 'document' | 'book' | 'audiobook'
    thumbnail: str     # thumbnail/cover URL, '' when unknown
    description: str
    content_hash: str  # MD5 used for de-duplication
    saved_at: str      # timestamp of harvest
    local_path: str = ""  # on-disk path once downloaded, '' until then
@dataclass
class ExtractedImage:
    """A presentation-worthy image pulled out of a Scribd document page.

    BUGFIX: the @dataclass decorator was missing; the harvester
    constructs this with keyword arguments (see
    extract_images_for_presentations), which raises TypeError without
    a generated __init__.
    """
    id: str             # "<doc_id>_img_<index>"
    source_doc_id: str  # owning document id
    url: str            # absolute source URL of the image
    caption: str        # alt text / <figcaption> text, may be ''
    content_hash: str   # MD5 of the raw bytes (dedup key)
    local_path: str     # where the bytes were written on disk
    width: int
    height: int
class ScribdHarvesterV2:
    """
    Validated Scribd harvester with cookie-based authentication.

    Pipeline: browser/file cookie auth -> scrape the user's saved
    library -> download documents and extract images -> persist
    everything to Neo4j.
    """
    # Neo4j AuraDB Cloud connection (read from the environment; .env is
    # loaded at module import via load_dotenv()).
    NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
    NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")
    # Scribd endpoints
    SCRIBD_BASE = "https://www.scribd.com"
    SCRIBD_API = "https://www.scribd.com/api"
    # Default headers applied to every request on the shared session.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "application/json, text/html, */*",
        "Accept-Language": "en-US,en;q=0.9,da;q=0.8",
    }
| def __init__(self, output_dir: str = None): | |
| self.output_dir = Path(output_dir or "data/scribd_harvest") | |
| self.image_dir = self.output_dir / "images" | |
| self.docs_dir = self.output_dir / "documents" | |
| for d in [self.output_dir, self.image_dir, self.docs_dir]: | |
| d.mkdir(parents=True, exist_ok=True) | |
| # Session med cookies | |
| self.session = requests.Session() | |
| self.session.headers.update(self.HEADERS) | |
| # Neo4j | |
| self.driver = GraphDatabase.driver( | |
| self.NEO4J_URI, | |
| auth=(self.NEO4J_USER, self.NEO4J_PASSWORD) | |
| ) | |
| # Stats | |
| self.stats = { | |
| "documents_found": 0, | |
| "documents_downloaded": 0, | |
| "documents_skipped": 0, | |
| "images_extracted": 0 | |
| } | |
| print("π ScribdHarvester v2.0 - Valideret Metode") | |
| print(f" π Output: {self.output_dir.absolute()}") | |
| def authenticate(self) -> bool: | |
| """Hent og anvend cookies fra browser eller fil""" | |
| print("\nπ AUTHENTICATION") | |
| print("-" * 40) | |
| cookies = None | |
| # FΓRST: Check for manuel cookie fil | |
| cookie_file = self.output_dir / "scribd_cookies.json" | |
| if cookie_file.exists(): | |
| print(f" π Finder cookie fil: {cookie_file}") | |
| try: | |
| with open(cookie_file, 'r') as f: | |
| data = json.load(f) | |
| session_cookie = data.get('_scribd_session', '') | |
| expire_cookie = data.get('_scribd_expire', '') | |
| if session_cookie and 'INDSΓT' not in session_cookie: | |
| cookies = { | |
| '_scribd_session': session_cookie, | |
| '_scribd_expire': expire_cookie | |
| } | |
| print(" β Cookies loaded fra fil!") | |
| else: | |
| print(" β οΈ Cookie fil ikke udfyldt - prΓΈver automatisk extraction...") | |
| except Exception as e: | |
| print(f" β οΈ Fejl ved lΓ¦sning af cookie fil: {e}") | |
| # DEREFTER: PrΓΈv automatisk extraction | |
| if not cookies: | |
| extractor = ScribdCookieExtractor() | |
| cookies = extractor.get_cookies() | |
| if not cookies: | |
| return False | |
| # Anvend cookies til session | |
| for name, value in cookies.items(): | |
| self.session.cookies.set(name, value, domain=".scribd.com") | |
| # Verificer | |
| return self._verify_session() | |
| def _verify_session(self) -> bool: | |
| """Verificer at vi er logget ind""" | |
| try: | |
| # PrΓΈv at hente bruger info | |
| response = self.session.get( | |
| f"{self.SCRIBD_BASE}/account", | |
| allow_redirects=False | |
| ) | |
| if response.status_code == 200: | |
| if 'login' not in response.url.lower(): | |
| print("β Session verificeret - logget ind!") | |
| return True | |
| # PrΓΈv alternativ endpoint | |
| response = self.session.get(f"{self.SCRIBD_BASE}/saved") | |
| if response.status_code == 200: | |
| soup = BeautifulSoup(response.text, 'html.parser') | |
| # Check for logged-in indicators | |
| if soup.find('a', href=re.compile(r'/logout')): | |
| print("β Session verificeret via /saved") | |
| return True | |
| print("β οΈ Session ikke verificeret - cookies kan vΓ¦re udlΓΈbet") | |
| return False | |
| except Exception as e: | |
| print(f"β Verification fejl: {e}") | |
| return False | |
| def fetch_library(self) -> List[Dict]: | |
| """Hent brugerens bibliotek/gemte dokumenter""" | |
| print("\nπ FETCHING LIBRARY") | |
| print("-" * 40) | |
| all_items = [] | |
| # Endpoints at prΓΈve | |
| endpoints = [ | |
| "/saved", | |
| "/library", | |
| "/your-library", | |
| "/account/saved", | |
| "/lists" | |
| ] | |
| for endpoint in endpoints: | |
| url = f"{self.SCRIBD_BASE}{endpoint}" | |
| print(f" PrΓΈver: {endpoint}") | |
| try: | |
| response = self.session.get(url) | |
| if response.status_code != 200: | |
| continue | |
| soup = BeautifulSoup(response.text, 'html.parser') | |
| # Find dokumenter med forskellige selectors | |
| items = self._extract_items_from_html(soup) | |
| for item in items: | |
| if not any(i['url'] == item['url'] for i in all_items): | |
| all_items.append(item) | |
| print(f" π {item['title'][:50]}...") | |
| # PrΓΈv ogsΓ₯ at finde JSON data | |
| json_items = self._extract_items_from_scripts(soup) | |
| for item in json_items: | |
| if not any(i['url'] == item['url'] for i in all_items): | |
| all_items.append(item) | |
| except Exception as e: | |
| print(f" β οΈ Fejl: {e}") | |
| print(f"\n π Fandt {len(all_items)} dokumenter total") | |
| self.stats["documents_found"] = len(all_items) | |
| return all_items | |
| def _extract_items_from_html(self, soup: BeautifulSoup) -> List[Dict]: | |
| """Ekstraher dokumenter fra HTML""" | |
| items = [] | |
| # Forskellige link patterns | |
| patterns = [ | |
| ('a[href*="/document/"]', 'document'), | |
| ('a[href*="/book/"]', 'book'), | |
| ('a[href*="/read/"]', 'book'), | |
| ('a[href*="/audiobook/"]', 'audiobook'), | |
| ('.doc-list-item', 'document'), | |
| ('[data-doc-id]', 'document'), | |
| ] | |
| for selector, doc_type in patterns: | |
| try: | |
| elements = soup.select(selector) | |
| for el in elements: | |
| href = el.get('href', '') | |
| if not href: | |
| # PrΓΈv at finde link i children | |
| link = el.find('a') | |
| if link: | |
| href = link.get('href', '') | |
| if not href or '/login' in href: | |
| continue | |
| if not href.startswith('http'): | |
| href = urljoin(self.SCRIBD_BASE, href) | |
| # Ekstraher ID | |
| match = re.search(r'/(document|book|read|audiobook)/(\d+)', href) | |
| doc_id = match.group(2) if match else None | |
| if not doc_id: | |
| continue | |
| # Find titel | |
| title = el.get_text(strip=True) | |
| if not title or len(title) < 3: | |
| title_el = el.find(['h1', 'h2', 'h3', 'h4', '.title', '[class*="title"]']) | |
| if title_el: | |
| title = title_el.get_text(strip=True) | |
| # Find thumbnail | |
| thumbnail = '' | |
| img = el.find('img') | |
| if img: | |
| thumbnail = img.get('src', '') or img.get('data-src', '') | |
| items.append({ | |
| 'id': doc_id, | |
| 'url': href, | |
| 'title': title or f"Document {doc_id}", | |
| 'type': doc_type, | |
| 'thumbnail': thumbnail, | |
| }) | |
| except: | |
| pass | |
| return items | |
| def _extract_items_from_scripts(self, soup: BeautifulSoup) -> List[Dict]: | |
| """Ekstraher dokumenter fra JSON scripts i HTML""" | |
| items = [] | |
| scripts = soup.find_all('script') | |
| for script in scripts: | |
| text = script.string or '' | |
| # PrΓΈv at finde JSON data | |
| patterns = [ | |
| r'window\.__INITIAL_STATE__\s*=\s*({.*?});', | |
| r'window\.Scribd\..*?=\s*({.*?});', | |
| r'"documents"\s*:\s*(\[.*?\])', | |
| ] | |
| for pattern in patterns: | |
| try: | |
| match = re.search(pattern, text, re.DOTALL) | |
| if match: | |
| data = json.loads(match.group(1)) | |
| extracted = self._traverse_json_for_docs(data) | |
| items.extend(extracted) | |
| except: | |
| pass | |
| return items | |
| def _traverse_json_for_docs(self, obj, depth=0) -> List[Dict]: | |
| """Traverser JSON for at finde dokumenter""" | |
| items = [] | |
| if depth > 8: | |
| return items | |
| if isinstance(obj, dict): | |
| # Check om dette er et dokument | |
| if 'id' in obj and ('title' in obj or 'name' in obj): | |
| doc_id = str(obj.get('id', '')) | |
| if doc_id.isdigit(): | |
| doc_type = obj.get('type', 'document').lower() | |
| if doc_type in ['book', 'audiobook']: | |
| url = f"{self.SCRIBD_BASE}/{doc_type}/{doc_id}" | |
| else: | |
| url = f"{self.SCRIBD_BASE}/document/{doc_id}" | |
| items.append({ | |
| 'id': doc_id, | |
| 'url': url, | |
| 'title': obj.get('title') or obj.get('name', f'Document {doc_id}'), | |
| 'type': doc_type, | |
| 'thumbnail': obj.get('thumbnail_url', obj.get('cover_url', '')), | |
| 'author': obj.get('author', {}).get('name', '') if isinstance(obj.get('author'), dict) else obj.get('author', ''), | |
| }) | |
| for v in obj.values(): | |
| items.extend(self._traverse_json_for_docs(v, depth + 1)) | |
| elif isinstance(obj, list): | |
| for item in obj: | |
| items.extend(self._traverse_json_for_docs(item, depth + 1)) | |
| return items | |
| def download_document(self, item: Dict) -> Optional[Path]: | |
| """Download dokument med scribdl eller direkte""" | |
| doc_id = item.get('id', '') | |
| url = item.get('url', '') | |
| title = item.get('title', f'doc_{doc_id}') | |
| # Sanitize filename | |
| safe_title = re.sub(r'[<>:"/\\|?*]', '_', title)[:100] | |
| print(f" π₯ Downloader: {title[:50]}...") | |
| # Metode 1: Brug scribdl CLI | |
| output_path = self.docs_dir / f"{doc_id}_{safe_title}" | |
| try: | |
| # PrΓΈv scribdl fΓΈrst | |
| result = subprocess.run( | |
| ['scribdl', '-i', url], | |
| cwd=str(self.docs_dir), | |
| capture_output=True, | |
| text=True, | |
| timeout=120 | |
| ) | |
| if result.returncode == 0: | |
| # Find downloaded files | |
| for f in self.docs_dir.glob(f"*{doc_id}*"): | |
| print(f" β Downloaded: {f.name}") | |
| return f | |
| except FileNotFoundError: | |
| print(" β οΈ scribdl ikke installeret, bruger alternativ metode") | |
| except subprocess.TimeoutExpired: | |
| print(" β οΈ Timeout pΓ₯ download") | |
| except Exception as e: | |
| print(f" β οΈ scribdl fejl: {e}") | |
| # Metode 2: Download direkte | |
| return self._direct_download(item) | |
    def _direct_download(self, item: Dict) -> Optional[Path]:
        """Directly download document pages from the document URL.

        Fallback used when the scribdl CLI is unavailable or fails:
        fetch the page; if no reader container is found, save the raw
        HTML; otherwise download each page image into a per-document
        folder.

        Returns:
            The saved HTML file, the page-image folder, or None when
            nothing could be saved.
        """
        doc_id = item['id']
        url = item['url']
        try:
            response = self.session.get(url)
            if response.status_code != 200:
                return None
            soup = BeautifulSoup(response.text, 'html.parser')
            # Locate the document reader container by class name.
            reader = soup.find('div', class_=re.compile(r'reader|document|pages'))
            if not reader:
                # No reader markup: save the raw HTML as a fallback.
                html_path = self.docs_dir / f"{doc_id}.html"
                with open(html_path, 'w', encoding='utf-8') as f:
                    f.write(response.text)
                return html_path
            # Find and download the individual page images.
            images = reader.find_all('img', src=True)
            if images:
                doc_folder = self.docs_dir / doc_id
                doc_folder.mkdir(exist_ok=True)
                for i, img in enumerate(images):
                    img_url = img['src']
                    if not img_url.startswith('http'):
                        img_url = urljoin(url, img_url)
                    try:
                        img_response = self.session.get(img_url, timeout=30)
                        if img_response.status_code == 200:
                            # Extension inferred from the content type.
                            ext = 'jpg' if 'jpeg' in img_response.headers.get('content-type', '') else 'png'
                            img_path = doc_folder / f"page_{i:03d}.{ext}"
                            with open(img_path, 'wb') as f:
                                f.write(img_response.content)
                    except:
                        # Deliberate best effort: a failed page is skipped.
                        pass
                return doc_folder
            # Reader found but it contained no images.
            return None
        except Exception as e:
            print(f" β Download fejl: {e}")
            return None
| def extract_images_for_presentations(self, item: Dict) -> List[ExtractedImage]: | |
| """Ekstraher billeder egnet til præsentationer""" | |
| images = [] | |
| url = item['url'] | |
| doc_id = item['id'] | |
| try: | |
| response = self.session.get(url) | |
| if response.status_code != 200: | |
| return images | |
| soup = BeautifulSoup(response.text, 'html.parser') | |
| # Find alle billeder | |
| for idx, img in enumerate(soup.find_all('img')): | |
| src = img.get('src', '') or img.get('data-src', '') | |
| if not src: | |
| continue | |
| # Skip ikoner og smΓ₯ billeder | |
| skip_patterns = ['avatar', 'icon', 'logo', 'button', 'sprite', '1x1', 'tracking'] | |
| if any(p in src.lower() for p in skip_patterns): | |
| continue | |
| # Check stΓΈrrelse | |
| width = int(img.get('width', 0) or 0) | |
| height = int(img.get('height', 0) or 0) | |
| if (width > 0 and width < 150) or (height > 0 and height < 150): | |
| continue | |
| # Download billede | |
| if not src.startswith('http'): | |
| src = urljoin(url, src) | |
| try: | |
| img_response = self.session.get(src, timeout=30) | |
| if img_response.status_code != 200: | |
| continue | |
| # Check actual size | |
| if HAS_PIL: | |
| pil_img = Image.open(io.BytesIO(img_response.content)) | |
| width, height = pil_img.size | |
| if width < 200 or height < 150: | |
| continue | |
| # Gem lokalt | |
| content_hash = hashlib.md5(img_response.content).hexdigest() | |
| ext = 'jpg' if 'jpeg' in img_response.headers.get('content-type', '') else 'png' | |
| local_path = self.image_dir / f"{doc_id}_img_{idx}.{ext}" | |
| with open(local_path, 'wb') as f: | |
| f.write(img_response.content) | |
| # Caption | |
| caption = img.get('alt', '') or img.get('title', '') | |
| figure = img.find_parent('figure') | |
| if figure: | |
| figcaption = figure.find('figcaption') | |
| if figcaption: | |
| caption = figcaption.get_text(strip=True) | |
| images.append(ExtractedImage( | |
| id=f"{doc_id}_img_{idx}", | |
| source_doc_id=doc_id, | |
| url=src, | |
| caption=caption, | |
| content_hash=content_hash, | |
| local_path=str(local_path), | |
| width=width, | |
| height=height | |
| )) | |
| except Exception as e: | |
| pass | |
| if images: | |
| print(f" πΌοΈ {len(images)} billeder ekstraheret") | |
| self.stats["images_extracted"] += len(images) | |
| except Exception as e: | |
| print(f" β οΈ Image extraction fejl: {e}") | |
| return images | |
    def save_to_neo4j(self, item: Dict, local_path: Optional[Path], images: List[ExtractedImage]):
        """Persist a document and its extracted images to Neo4j.

        Skips the write (counted in stats["documents_skipped"]) when a
        ScribdDocument with the same content hash already exists.
        Otherwise upserts the document node, its DataSource/Category
        edges, and one ScribdImage node per extracted image.
        """
        doc_id = item['id']
        # Hash title+URL so re-harvesting the same document is detected
        # even across runs.
        content_hash = hashlib.md5(f"{item['title']}-{item['url']}".encode()).hexdigest()
        with self.driver.session() as session:
            # Duplicate check by content hash.
            result = session.run(
                "MATCH (d:ScribdDocument {contentHash: $hash}) RETURN d LIMIT 1",
                hash=content_hash
            )
            if len(list(result)) > 0:
                self.stats["documents_skipped"] += 1
                return
            # Upsert the document node plus its source/category edges.
            session.run("""
                MERGE (d:ScribdDocument {id: $id})
                SET d.title = $title,
                    d.author = $author,
                    d.url = $url,
                    d.type = $doc_type,
                    d.thumbnail = $thumbnail,
                    d.contentHash = $content_hash,
                    d.localPath = $local_path,
                    d.savedAt = datetime(),
                    d.source = 'Scribd'
                MERGE (s:DataSource {name: 'Scribd'})
                SET s.type = 'DocumentPlatform', s.lastHarvest = datetime()
                MERGE (d)-[:HARVESTED_FROM]->(s)
                MERGE (cat:Category {name: $doc_type})
                MERGE (d)-[:BELONGS_TO]->(cat)
                """,
                id=doc_id,
                title=item.get('title', ''),
                author=item.get('author', ''),
                url=item.get('url', ''),
                doc_type=item.get('type', 'document'),
                thumbnail=item.get('thumbnail', ''),
                content_hash=content_hash,
                local_path=str(local_path) if local_path else ''
            )
            self.stats["documents_downloaded"] += 1
            # Persist each extracted image and link it to the document.
            for img in images:
                session.run("""
                    MERGE (i:ScribdImage {id: $id})
                    SET i.url = $url,
                        i.caption = $caption,
                        i.contentHash = $content_hash,
                        i.localPath = $local_path,
                        i.width = $width,
                        i.height = $height,
                        i.usableForPresentations = true,
                        i.savedAt = datetime()
                    WITH i
                    MATCH (d:ScribdDocument {id: $source_doc_id})
                    MERGE (i)-[:EXTRACTED_FROM]->(d)
                    MERGE (cat:AssetCategory {name: 'Presentation Images'})
                    MERGE (i)-[:AVAILABLE_FOR]->(cat)
                    """,
                    id=img.id,
                    url=img.url,
                    caption=img.caption,
                    content_hash=img.content_hash,
                    local_path=img.local_path,
                    width=img.width,
                    height=img.height,
                    source_doc_id=img.source_doc_id
                )
| def run(self, download_docs: bool = True, extract_images: bool = True): | |
| """Hovedeksekveringsflow""" | |
| print("") | |
| print("ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ") | |
| print("β π SCRIBD HARVESTER v2.0 - VALIDERET METODE β") | |
| print("β Cookie-based authentication med Neo4j Cloud storage β") | |
| print("ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ") | |
| # Step 1: Authentication | |
| if not self.authenticate(): | |
| print("\nβ Authentication fejlede!") | |
| print(" PrΓΈv at:") | |
| print(" 1. Γ bn Chrome og log ind pΓ₯ scribd.com") | |
| print(" 2. Luk Chrome helt") | |
| print(" 3. KΓΈr scriptet igen") | |
| return self.stats | |
| # Step 2: Fetch library | |
| items = self.fetch_library() | |
| if not items: | |
| print("\nβ οΈ Ingen dokumenter fundet i dit bibliotek") | |
| print(" Check at du har gemte dokumenter pΓ₯ scribd.com/saved") | |
| return self.stats | |
| # Step 3: Process documents | |
| print(f"\nβοΈ PROCESSING {len(items)} DOCUMENTS") | |
| print("-" * 40) | |
| for i, item in enumerate(items, 1): | |
| print(f"\n[{i}/{len(items)}] {item.get('title', 'Unknown')[:50]}...") | |
| local_path = None | |
| images = [] | |
| # Download | |
| if download_docs: | |
| local_path = self.download_document(item) | |
| # Extract images | |
| if extract_images: | |
| images = self.extract_images_for_presentations(item) | |
| # Save to Neo4j | |
| self.save_to_neo4j(item, local_path, images) | |
| # Rate limiting | |
| time.sleep(2) | |
| # Summary | |
| self._print_summary() | |
| return self.stats | |
| def _print_summary(self): | |
| """Print summary""" | |
| print("") | |
| print("β" * 60) | |
| print("π HARVEST COMPLETE") | |
| print("β" * 60) | |
| print(f" π Documents found: {self.stats['documents_found']}") | |
| print(f" β Documents downloaded: {self.stats['documents_downloaded']}") | |
| print(f" βοΈ Documents skipped: {self.stats['documents_skipped']}") | |
| print(f" πΌοΈ Images extracted: {self.stats['images_extracted']}") | |
| print(f" π Output directory: {self.output_dir.absolute()}") | |
| print("β" * 60) | |
    def close(self):
        """Cleanup: close the Neo4j driver (releases its connection pool)."""
        self.driver.close()
def main():
    """CLI entry point: parse flags and drive one harvest run."""
    import argparse

    cli = argparse.ArgumentParser(description='Scribd Harvester v2.0')
    cli.add_argument('--no-download', action='store_true', help='Skip document download')
    cli.add_argument('--no-images', action='store_true', help='Skip image extraction')
    cli.add_argument('--output', type=str, help='Output directory')
    opts = cli.parse_args()

    harvester = ScribdHarvesterV2(output_dir=opts.output)
    try:
        # Flags are negative switches; invert them for run().
        harvester.run(
            download_docs=not opts.no_download,
            extract_images=not opts.no_images
        )
    finally:
        # Always release the Neo4j driver, even on failure.
        harvester.close()


if __name__ == "__main__":
    main()