#!/usr/bin/env python3
"""
📚 Scribd Public Harvester - Henter offentligt tilgængelige dokumenter

Searches Scribd's public search pages for a fixed set of topics, scrapes
lightweight metadata (title, author, description, thumbnail) from each
result page, and persists it both to Neo4j and to a local JSON file.
No Scribd account or login is required.
"""

import hashlib
import json
import os
import re
from pathlib import Path
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from neo4j import GraphDatabase


class ScribdPublicHarvester:
    """Henter offentlige Scribd dokumenter uden login.

    (Fetches publicly available Scribd documents without logging in.)
    """

    # SECURITY NOTE(review): these credentials used to be hard-coded in
    # source. They are now read from the environment, with the original
    # values as fallbacks so existing deployments keep working — rotate
    # the password and drop the fallbacks as soon as possible.
    NEO4J_URI = os.environ.get(
        "NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io"
    )
    NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.environ.get(
        "NEO4J_PASSWORD", "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
    )

    # Browser-like headers so Scribd serves the normal HTML pages.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }

    # Søgeord til at finde relevante dokumenter
    # (Search terms used to find relevant documents.)
    SEARCH_TOPICS = [
        "AI ethics",
        "generative AI",
        "machine learning business",
        "digital transformation",
        "cybersecurity threats",
        "OSINT techniques",
        "threat intelligence",
    ]

    # Seconds before an HTTP request is abandoned. The original code sent
    # requests with no timeout, so a single stalled connection could hang
    # the whole harvest forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Set up the HTTP session, output directory, Neo4j driver and stats."""
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)
        self.output_dir = Path("data/scribd_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.driver = GraphDatabase.driver(
            self.NEO4J_URI, auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
        )
        # Running counters reported in the final summary.
        self.stats = {"found": 0, "saved": 0}

    def search_documents(self, query: str, max_results: int = 20) -> list:
        """Search Scribd for *query* and return up to *max_results* hits.

        Each hit is a dict with ``title``, ``url`` and ``query`` keys.
        Returns an empty list on any HTTP or parsing failure (best effort).
        """
        print(f"\n🔍 Søger: {query}")
        # quote_plus() instead of a bare str.replace so queries containing
        # '&', '#', '%' etc. are encoded correctly (spaces still become '+').
        url = f"https://www.scribd.com/search?query={quote_plus(query)}"
        try:
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                print(f" ❌ HTTP {response.status_code}")
                return []
            soup = BeautifulSoup(response.text, 'html.parser')
            documents = []
            # Find document cards
            for card in soup.select('.SearchResults_card, .document_cell, [data-e2e="search-result"]'):
                try:
                    link = card.find('a', href=re.compile(r'/document/\d+'))
                    if not link:
                        link = card.find('a', href=re.compile(r'/doc/\d+'))
                    if not link:
                        continue
                    href = link.get('href', '')
                    if not href.startswith('http'):
                        href = f"https://www.scribd.com{href}"
                    # BUGFIX: the original passed CSS selectors ('.title',
                    # '[class*="title"]') to find(), which treats list items
                    # as literal tag names — those entries could never match.
                    # select_one() interprets them as intended.
                    title_elem = card.select_one('h2, h3, .title, [class*="title"]')
                    title = title_elem.get_text(strip=True) if title_elem else link.get_text(strip=True)
                    if title and href:
                        documents.append({
                            "title": title[:200],
                            "url": href,
                            "query": query
                        })
                except Exception:
                    # Best-effort scraping: skip any malformed card.
                    continue
            # Fallback: Find alle document links
            # (Fallback: scan the whole page for document links.)
            if not documents:
                for link in soup.find_all('a', href=re.compile(r'/(document|doc)/\d+')):
                    href = link.get('href', '')
                    if not href.startswith('http'):
                        href = f"https://www.scribd.com{href}"
                    title = link.get_text(strip=True) or link.get('title', '')
                    if title and len(title) > 5:
                        documents.append({
                            "title": title[:200],
                            "url": href,
                            "query": query
                        })
            # Deduplicate by URL first, THEN cap the count. (The original
            # sliced documents[:max_results] before deduplicating, which
            # could return fewer than max_results unique documents.)
            seen = set()
            unique = []
            for doc in documents:
                if doc['url'] not in seen:
                    seen.add(doc['url'])
                    unique.append(doc)
                if len(unique) >= max_results:
                    break
            print(f" ✅ Fandt {len(unique)} dokumenter")
            return unique
        except Exception as e:
            print(f" ❌ Fejl: {e}")
            return []

    def get_document_details(self, url: str) -> dict:
        """Fetch one document page and scrape its metadata.

        Returns a dict with ``id``, ``title``, ``author``, ``url``,
        ``description``, ``thumbnail`` and ``doc_type`` keys, or an empty
        dict on any failure.
        """
        try:
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                return {}
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract metadata
            title = ""
            title_elem = soup.find('h1') or soup.find('title')
            if title_elem:
                # Strip Scribd's boilerplate suffixes from the page title.
                title = title_elem.get_text(strip=True).replace(' | PDF', '').replace(' | Scribd', '')
            author = ""
            author_elem = soup.find('a', href=re.compile(r'/user/\d+'))
            if author_elem:
                author = author_elem.get_text(strip=True)
            description = ""
            desc_elem = soup.find('meta', {'name': 'description'})
            if desc_elem:
                description = desc_elem.get('content', '')[:500]
            # Document ID from URL; fall back to a short URL hash when the
            # URL doesn't follow the /document/<id> pattern. MD5 is used
            # only as a stable fingerprint here, not for security.
            doc_id_match = re.search(r'/(document|doc)/(\d+)', url)
            doc_id = doc_id_match.group(2) if doc_id_match else hashlib.md5(url.encode()).hexdigest()[:12]
            # Thumbnail
            thumbnail = ""
            og_image = soup.find('meta', {'property': 'og:image'})
            if og_image:
                thumbnail = og_image.get('content', '')
            return {
                "id": doc_id,
                "title": title,
                "author": author,
                "url": url,
                "description": description,
                "thumbnail": thumbnail,
                "doc_type": "document"
            }
        except Exception as e:
            print(f" ⚠️ Metadata fejl: {e}")
            return {}

    def save_to_neo4j(self, doc: dict, query: str):
        """Upsert *doc* into Neo4j, linked to the Scribd DataSource node.

        The MERGE key is an MD5 fingerprint of title+url, so re-running the
        harvest only updates ``lastSeen`` on already-known documents.
        """
        content_hash = hashlib.md5(f"{doc['title']}:{doc['url']}".encode()).hexdigest()
        with self.driver.session() as session:
            session.run("""
                MERGE (d:ScribdDocument {contentHash: $hash})
                ON CREATE SET
                    d.id = $id,
                    d.title = $title,
                    d.author = $author,
                    d.url = $url,
                    d.description = $description,
                    d.thumbnail = $thumbnail,
                    d.docType = $doc_type,
                    d.searchQuery = $query,
                    d.harvestedAt = datetime(),
                    d.source = 'scribd_public_search'
                ON MATCH SET
                    d.lastSeen = datetime()
                MERGE (s:DataSource {name: 'Scribd'})
                ON CREATE SET
                    s.type = 'document_repository',
                    s.url = 'https://scribd.com'
                MERGE (d)-[:HARVESTED_FROM]->(s)
            """,
                hash=content_hash,
                id=doc.get('id', ''),
                title=doc.get('title', ''),
                author=doc.get('author', ''),
                url=doc.get('url', ''),
                description=doc.get('description', ''),
                thumbnail=doc.get('thumbnail', ''),
                doc_type=doc.get('doc_type', 'document'),
                query=query
            )
        self.stats['saved'] += 1

    def run(self):
        """Run the full harvest over every topic in SEARCH_TOPICS.

        Returns the list of detail dicts that were harvested. The Neo4j
        driver is closed even if a search or save raises.
        """
        print("=" * 60)
        print("📚 SCRIBD PUBLIC HARVESTER")
        print("=" * 60)
        all_docs = []
        # try/finally so the driver is closed even on an unexpected error
        # (the original leaked the connection on any exception).
        try:
            for topic in self.SEARCH_TOPICS:
                docs = self.search_documents(topic)
                self.stats['found'] += len(docs)
                for doc in docs:
                    details = self.get_document_details(doc['url'])
                    if details:
                        details['query'] = topic
                        all_docs.append(details)
                        self.save_to_neo4j(details, topic)
                        print(f" 💾 {details['title'][:50]}...")
            # Summary
            print("\n" + "=" * 60)
            print("📊 HARVEST COMPLETE")
            print("=" * 60)
            print(f" 🔍 Topics searched: {len(self.SEARCH_TOPICS)}")
            print(f" 📄 Documents found: {self.stats['found']}")
            print(f" 💾 Saved to Neo4j: {self.stats['saved']}")
            print("=" * 60)
            # Save local JSON
            output_file = self.output_dir / "scribd_public_harvest.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(all_docs, f, indent=2, ensure_ascii=False)
            print(f"\n📁 JSON saved: {output_file}")
        finally:
            self.driver.close()
        return all_docs


if __name__ == "__main__":
    harvester = ScribdPublicHarvester()
    harvester.run()