#!/usr/bin/env python3
"""
πŸ“š ScribdHarvester v2.0 - Valideret Metode
==========================================
Kombinerer:
1. Cookie extraction fra Chrome browser
2. Officiel scribd-downloader bibliotek
3. Web scraping for favorites/library
4. Neo4j cloud storage
KΓΈr: pip install -r scribd_requirements.txt
python scribd_harvester_v2.py
@author WidgeTDC Neural Network
"""
import os
import sys
import json
import hashlib
import requests
import re
import subprocess
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional, Any
from dataclasses import dataclass, asdict
from urllib.parse import urljoin, urlparse
import time
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Neo4j
from neo4j import GraphDatabase
# HTML parsing
from bs4 import BeautifulSoup
# Cookie extractor
from scribd_cookie_extractor import ScribdCookieExtractor
# Image processing
try:
from PIL import Image
import io
HAS_PIL = True
except ImportError:
HAS_PIL = False
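# Pillow is optional: when available, it is used to verify the real pixel
# dimensions of downloaded images before they are kept for presentations.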
@dataclass
class ScribdDocument:
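    """Metadata for a single harvested Scribd document."""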
id: str
title: str
author: str
url: str
doc_type: str
thumbnail: str
description: str
content_hash: str
saved_at: str
local_path: str = ""
@dataclass
class ExtractedImage:
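    """An image extracted from a document, suitable for reuse in presentations."""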
id: str
source_doc_id: str
url: str
caption: str
content_hash: str
local_path: str
width: int
height: int
class ScribdHarvesterV2:
"""
Valideret Scribd harvester med cookie-baseret authentication
"""
# Neo4j AuraDB Cloud
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")
# Scribd endpoints
SCRIBD_BASE = "https://www.scribd.com"
SCRIBD_API = "https://www.scribd.com/api"
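    # Browser-like default headers applied to every request in the session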
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Accept": "application/json, text/html, */*",
"Accept-Language": "en-US,en;q=0.9,da;q=0.8",
}
    def __init__(self, output_dir: Optional[str] = None):
self.output_dir = Path(output_dir or "data/scribd_harvest")
self.image_dir = self.output_dir / "images"
self.docs_dir = self.output_dir / "documents"
for d in [self.output_dir, self.image_dir, self.docs_dir]:
d.mkdir(parents=True, exist_ok=True)
        # HTTP session that will carry the Scribd cookies
self.session = requests.Session()
self.session.headers.update(self.HEADERS)
# Neo4j
self.driver = GraphDatabase.driver(
self.NEO4J_URI,
auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
)
# Stats
self.stats = {
"documents_found": 0,
"documents_downloaded": 0,
"documents_skipped": 0,
"images_extracted": 0
}
print("πŸ“š ScribdHarvester v2.0 - Valideret Metode")
print(f" πŸ“ Output: {self.output_dir.absolute()}")
def authenticate(self) -> bool:
"""Hent og anvend cookies fra browser eller fil"""
print("\nπŸ” AUTHENTICATION")
print("-" * 40)
cookies = None
        # First: check for a manually filled-in cookie file
cookie_file = self.output_dir / "scribd_cookies.json"
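        # Expected layout of scribd_cookies.json (example values, not real tokens):
        # {
        #     "_scribd_session": "<value of the _scribd_session cookie>",
        #     "_scribd_expire": "<value of the _scribd_expire cookie>"
        # }
        # A file that still contains the 'INDSÆT' placeholder is treated as not filled in.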
if cookie_file.exists():
print(f" πŸ“„ Finder cookie fil: {cookie_file}")
try:
with open(cookie_file, 'r') as f:
data = json.load(f)
session_cookie = data.get('_scribd_session', '')
expire_cookie = data.get('_scribd_expire', '')
if session_cookie and 'INDSÆT' not in session_cookie:
cookies = {
'_scribd_session': session_cookie,
'_scribd_expire': expire_cookie
}
print(" βœ… Cookies loaded fra fil!")
else:
print(" ⚠️ Cookie fil ikke udfyldt - prøver automatisk extraction...")
except Exception as e:
print(f" ⚠️ Fejl ved læsning af cookie fil: {e}")
# DEREFTER: PrΓΈv automatisk extraction
if not cookies:
extractor = ScribdCookieExtractor()
cookies = extractor.get_cookies()
if not cookies:
return False
        # Apply the cookies to the session
for name, value in cookies.items():
self.session.cookies.set(name, value, domain=".scribd.com")
        # Verify that the cookies give a logged-in session
return self._verify_session()
def _verify_session(self) -> bool:
"""Verificer at vi er logget ind"""
try:
            # Try to fetch account info (redirects are disabled, so a login redirect shows up as a non-200 status)
response = self.session.get(
f"{self.SCRIBD_BASE}/account",
allow_redirects=False
)
if response.status_code == 200:
if 'login' not in response.url.lower():
print("βœ… Session verificeret - logget ind!")
return True
            # Try an alternative endpoint
response = self.session.get(f"{self.SCRIBD_BASE}/saved")
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'html.parser')
# Check for logged-in indicators
if soup.find('a', href=re.compile(r'/logout')):
print("βœ… Session verificeret via /saved")
return True
print("⚠️ Session ikke verificeret - cookies kan være udløbet")
return False
except Exception as e:
print(f"❌ Verification fejl: {e}")
return False
def fetch_library(self) -> List[Dict]:
"""Hent brugerens bibliotek/gemte dokumenter"""
print("\nπŸ“– FETCHING LIBRARY")
print("-" * 40)
all_items = []
        # Endpoints to try
endpoints = [
"/saved",
"/library",
"/your-library",
"/account/saved",
"/lists"
]
for endpoint in endpoints:
url = f"{self.SCRIBD_BASE}{endpoint}"
print(f" PrΓΈver: {endpoint}")
try:
response = self.session.get(url)
if response.status_code != 200:
continue
soup = BeautifulSoup(response.text, 'html.parser')
                # Find documents using the various selectors
items = self._extract_items_from_html(soup)
for item in items:
if not any(i['url'] == item['url'] for i in all_items):
all_items.append(item)
print(f" πŸ“„ {item['title'][:50]}...")
# PrΓΈv ogsΓ₯ at finde JSON data
json_items = self._extract_items_from_scripts(soup)
for item in json_items:
if not any(i['url'] == item['url'] for i in all_items):
all_items.append(item)
except Exception as e:
print(f" ⚠️ Fejl: {e}")
print(f"\n πŸ“š Fandt {len(all_items)} dokumenter total")
self.stats["documents_found"] = len(all_items)
return all_items
def _extract_items_from_html(self, soup: BeautifulSoup) -> List[Dict]:
"""Ekstraher dokumenter fra HTML"""
items = []
        # Various (CSS selector, document type) link patterns
patterns = [
('a[href*="/document/"]', 'document'),
('a[href*="/book/"]', 'book'),
('a[href*="/read/"]', 'book'),
('a[href*="/audiobook/"]', 'audiobook'),
('.doc-list-item', 'document'),
('[data-doc-id]', 'document'),
]
for selector, doc_type in patterns:
try:
elements = soup.select(selector)
for el in elements:
href = el.get('href', '')
if not href:
                        # Try to find a link among the children
link = el.find('a')
if link:
href = link.get('href', '')
if not href or '/login' in href:
continue
if not href.startswith('http'):
href = urljoin(self.SCRIBD_BASE, href)
                    # Extract the numeric document ID
match = re.search(r'/(document|book|read|audiobook)/(\d+)', href)
doc_id = match.group(2) if match else None
if not doc_id:
continue
                    # Find the title
                    title = el.get_text(strip=True)
                    if not title or len(title) < 3:
                        # find() only matches tag names, so use a CSS selector for class-based titles
                        title_el = el.select_one('h1, h2, h3, h4, .title, [class*="title"]')
if title_el:
title = title_el.get_text(strip=True)
# Find thumbnail
thumbnail = ''
img = el.find('img')
if img:
thumbnail = img.get('src', '') or img.get('data-src', '')
items.append({
'id': doc_id,
'url': href,
'title': title or f"Document {doc_id}",
'type': doc_type,
'thumbnail': thumbnail,
})
            except Exception:
                # A failing selector must not abort the remaining patterns
                pass
return items
def _extract_items_from_scripts(self, soup: BeautifulSoup) -> List[Dict]:
"""Ekstraher dokumenter fra JSON scripts i HTML"""
items = []
scripts = soup.find_all('script')
for script in scripts:
text = script.string or ''
            # Try to find embedded JSON data
patterns = [
r'window\.__INITIAL_STATE__\s*=\s*({.*?});',
r'window\.Scribd\..*?=\s*({.*?});',
r'"documents"\s*:\s*(\[.*?\])',
]
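            # These regexes are best-effort matches for embedded state objects;
            # anything that does not parse as JSON is simply ignored below.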
for pattern in patterns:
try:
match = re.search(pattern, text, re.DOTALL)
if match:
data = json.loads(match.group(1))
extracted = self._traverse_json_for_docs(data)
items.extend(extracted)
                except Exception:
                    # Partial or malformed JSON is ignored
                    pass
return items
def _traverse_json_for_docs(self, obj, depth=0) -> List[Dict]:
"""Traverser JSON for at finde dokumenter"""
items = []
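        # Depth is capped so that very deep state objects do not cause runaway recursion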
if depth > 8:
return items
if isinstance(obj, dict):
            # Check whether this object looks like a document
if 'id' in obj and ('title' in obj or 'name' in obj):
doc_id = str(obj.get('id', ''))
if doc_id.isdigit():
doc_type = obj.get('type', 'document').lower()
if doc_type in ['book', 'audiobook']:
url = f"{self.SCRIBD_BASE}/{doc_type}/{doc_id}"
else:
url = f"{self.SCRIBD_BASE}/document/{doc_id}"
items.append({
'id': doc_id,
'url': url,
'title': obj.get('title') or obj.get('name', f'Document {doc_id}'),
'type': doc_type,
'thumbnail': obj.get('thumbnail_url', obj.get('cover_url', '')),
'author': obj.get('author', {}).get('name', '') if isinstance(obj.get('author'), dict) else obj.get('author', ''),
})
for v in obj.values():
items.extend(self._traverse_json_for_docs(v, depth + 1))
elif isinstance(obj, list):
for item in obj:
items.extend(self._traverse_json_for_docs(item, depth + 1))
return items
def download_document(self, item: Dict) -> Optional[Path]:
"""Download dokument med scribdl eller direkte"""
doc_id = item.get('id', '')
url = item.get('url', '')
title = item.get('title', f'doc_{doc_id}')
# Sanitize filename
safe_title = re.sub(r'[<>:"/\\|?*]', '_', title)[:100]
print(f" πŸ“₯ Downloader: {title[:50]}...")
# Metode 1: Brug scribdl CLI
output_path = self.docs_dir / f"{doc_id}_{safe_title}"
try:
            # Try scribdl first
result = subprocess.run(
['scribdl', '-i', url],
cwd=str(self.docs_dir),
capture_output=True,
text=True,
timeout=120
)
if result.returncode == 0:
# Find downloaded files
for f in self.docs_dir.glob(f"*{doc_id}*"):
print(f" βœ… Downloaded: {f.name}")
return f
        except FileNotFoundError:
            print("      ⚠️ scribdl not installed, using the fallback method")
        except subprocess.TimeoutExpired:
            print("      ⚠️ Download timed out")
        except Exception as e:
            print(f"      ⚠️ scribdl error: {e}")
        # Method 2: download the page content directly
return self._direct_download(item)
def _direct_download(self, item: Dict) -> Optional[Path]:
"""Direkte download af dokument sider"""
doc_id = item['id']
url = item['url']
try:
response = self.session.get(url)
if response.status_code != 200:
return None
soup = BeautifulSoup(response.text, 'html.parser')
            # Find the document reader container
reader = soup.find('div', class_=re.compile(r'reader|document|pages'))
if not reader:
                # Save the raw HTML as a fallback
html_path = self.docs_dir / f"{doc_id}.html"
with open(html_path, 'w', encoding='utf-8') as f:
f.write(response.text)
return html_path
            # Find and download the page images
images = reader.find_all('img', src=True)
if images:
doc_folder = self.docs_dir / doc_id
doc_folder.mkdir(exist_ok=True)
for i, img in enumerate(images):
img_url = img['src']
if not img_url.startswith('http'):
img_url = urljoin(url, img_url)
try:
img_response = self.session.get(img_url, timeout=30)
if img_response.status_code == 200:
ext = 'jpg' if 'jpeg' in img_response.headers.get('content-type', '') else 'png'
img_path = doc_folder / f"page_{i:03d}.{ext}"
with open(img_path, 'wb') as f:
f.write(img_response.content)
                    except Exception:
                        # Skip pages that fail to download
                        pass
return doc_folder
return None
except Exception as e:
print(f" ❌ Download fejl: {e}")
return None
def extract_images_for_presentations(self, item: Dict) -> List[ExtractedImage]:
"""Ekstraher billeder egnet til præsentationer"""
images = []
url = item['url']
doc_id = item['id']
try:
response = self.session.get(url)
if response.status_code != 200:
return images
soup = BeautifulSoup(response.text, 'html.parser')
            # Find all images on the page
for idx, img in enumerate(soup.find_all('img')):
src = img.get('src', '') or img.get('data-src', '')
if not src:
continue
                # Skip icons and other small UI images
skip_patterns = ['avatar', 'icon', 'logo', 'button', 'sprite', '1x1', 'tracking']
if any(p in src.lower() for p in skip_patterns):
continue
                # Check the declared size; non-numeric width/height attributes are treated as unknown
                width_attr = str(img.get('width') or '0')
                height_attr = str(img.get('height') or '0')
                width = int(width_attr) if width_attr.isdigit() else 0
                height = int(height_attr) if height_attr.isdigit() else 0
if (width > 0 and width < 150) or (height > 0 and height < 150):
continue
                # Download the image
if not src.startswith('http'):
src = urljoin(url, src)
try:
img_response = self.session.get(src, timeout=30)
if img_response.status_code != 200:
continue
# Check actual size
if HAS_PIL:
pil_img = Image.open(io.BytesIO(img_response.content))
width, height = pil_img.size
if width < 200 or height < 150:
continue
                    # Save locally
content_hash = hashlib.md5(img_response.content).hexdigest()
ext = 'jpg' if 'jpeg' in img_response.headers.get('content-type', '') else 'png'
local_path = self.image_dir / f"{doc_id}_img_{idx}.{ext}"
with open(local_path, 'wb') as f:
f.write(img_response.content)
# Caption
caption = img.get('alt', '') or img.get('title', '')
figure = img.find_parent('figure')
if figure:
figcaption = figure.find('figcaption')
if figcaption:
caption = figcaption.get_text(strip=True)
images.append(ExtractedImage(
id=f"{doc_id}_img_{idx}",
source_doc_id=doc_id,
url=src,
caption=caption,
content_hash=content_hash,
local_path=str(local_path),
width=width,
height=height
))
                except Exception:
                    # Skip images that fail to download or decode
                    pass
if images:
print(f" πŸ–ΌοΈ {len(images)} billeder ekstraheret")
self.stats["images_extracted"] += len(images)
except Exception as e:
print(f" ⚠️ Image extraction fejl: {e}")
return images
def save_to_neo4j(self, item: Dict, local_path: Optional[Path], images: List[ExtractedImage]):
"""Gem dokument og billeder til Neo4j"""
doc_id = item['id']
content_hash = hashlib.md5(f"{item['title']}-{item['url']}".encode()).hexdigest()
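        # The content hash (title + URL) is used below to detect documents already stored in earlier runs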
with self.driver.session() as session:
# Check for duplicate
result = session.run(
"MATCH (d:ScribdDocument {contentHash: $hash}) RETURN d LIMIT 1",
hash=content_hash
)
if len(list(result)) > 0:
self.stats["documents_skipped"] += 1
return
            # Store the document node and its relationships
session.run("""
MERGE (d:ScribdDocument {id: $id})
SET d.title = $title,
d.author = $author,
d.url = $url,
d.type = $doc_type,
d.thumbnail = $thumbnail,
d.contentHash = $content_hash,
d.localPath = $local_path,
d.savedAt = datetime(),
d.source = 'Scribd'
MERGE (s:DataSource {name: 'Scribd'})
SET s.type = 'DocumentPlatform', s.lastHarvest = datetime()
MERGE (d)-[:HARVESTED_FROM]->(s)
MERGE (cat:Category {name: $doc_type})
MERGE (d)-[:BELONGS_TO]->(cat)
""",
id=doc_id,
title=item.get('title', ''),
author=item.get('author', ''),
url=item.get('url', ''),
doc_type=item.get('type', 'document'),
thumbnail=item.get('thumbnail', ''),
content_hash=content_hash,
local_path=str(local_path) if local_path else ''
)
self.stats["documents_downloaded"] += 1
            # Store the image nodes
for img in images:
session.run("""
MERGE (i:ScribdImage {id: $id})
SET i.url = $url,
i.caption = $caption,
i.contentHash = $content_hash,
i.localPath = $local_path,
i.width = $width,
i.height = $height,
i.usableForPresentations = true,
i.savedAt = datetime()
WITH i
MATCH (d:ScribdDocument {id: $source_doc_id})
MERGE (i)-[:EXTRACTED_FROM]->(d)
MERGE (cat:AssetCategory {name: 'Presentation Images'})
MERGE (i)-[:AVAILABLE_FOR]->(cat)
""",
id=img.id,
url=img.url,
caption=img.caption,
content_hash=img.content_hash,
local_path=img.local_path,
width=img.width,
height=img.height,
source_doc_id=img.source_doc_id
)
def run(self, download_docs: bool = True, extract_images: bool = True):
"""Hovedeksekveringsflow"""
print("")
print("╔══════════════════════════════════════════════════════════════╗")
print("β•‘ πŸ“š SCRIBD HARVESTER v2.0 - VALIDERET METODE β•‘")
print("β•‘ Cookie-based authentication med Neo4j Cloud storage β•‘")
print("β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•")
# Step 1: Authentication
if not self.authenticate():
print("\n❌ Authentication fejlede!")
print(" PrΓΈv at:")
print(" 1. Γ…bn Chrome og log ind pΓ₯ scribd.com")
print(" 2. Luk Chrome helt")
print(" 3. KΓΈr scriptet igen")
return self.stats
# Step 2: Fetch library
items = self.fetch_library()
if not items:
print("\n⚠️ Ingen dokumenter fundet i dit bibliotek")
print(" Check at du har gemte dokumenter pΓ₯ scribd.com/saved")
return self.stats
# Step 3: Process documents
print(f"\nβš™οΈ PROCESSING {len(items)} DOCUMENTS")
print("-" * 40)
for i, item in enumerate(items, 1):
print(f"\n[{i}/{len(items)}] {item.get('title', 'Unknown')[:50]}...")
local_path = None
images = []
# Download
if download_docs:
local_path = self.download_document(item)
# Extract images
if extract_images:
images = self.extract_images_for_presentations(item)
# Save to Neo4j
self.save_to_neo4j(item, local_path, images)
# Rate limiting
time.sleep(2)
# Summary
self._print_summary()
return self.stats
def _print_summary(self):
"""Print summary"""
print("")
print("═" * 60)
print("πŸ“Š HARVEST COMPLETE")
print("═" * 60)
print(f" πŸ“š Documents found: {self.stats['documents_found']}")
print(f" βœ… Documents downloaded: {self.stats['documents_downloaded']}")
print(f" ⏭️ Documents skipped: {self.stats['documents_skipped']}")
print(f" πŸ–ΌοΈ Images extracted: {self.stats['images_extracted']}")
print(f" πŸ“ Output directory: {self.output_dir.absolute()}")
print("═" * 60)
def close(self):
"""Cleanup"""
self.driver.close()
def main():
"""Entry point"""
import argparse
parser = argparse.ArgumentParser(description='Scribd Harvester v2.0')
parser.add_argument('--no-download', action='store_true', help='Skip document download')
parser.add_argument('--no-images', action='store_true', help='Skip image extraction')
parser.add_argument('--output', type=str, help='Output directory')
args = parser.parse_args()
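    # Example invocations (example paths, assuming dependencies from scribd_requirements.txt are installed):
    #   python scribd_harvester_v2.py
    #   python scribd_harvester_v2.py --no-images --output data/my_harvest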
harvester = ScribdHarvesterV2(output_dir=args.output)
try:
harvester.run(
download_docs=not args.no_download,
extract_images=not args.no_images
)
finally:
harvester.close()
if __name__ == "__main__":
main()