# NOTE(review): the three lines below were hosting-UI/extraction residue
# ("Spaces: Paused"), not program code — kept here as comments.
# Spaces:
# Paused
# Paused
#!/usr/bin/env python3
"""
📚 Scribd Public Harvester - Henter offentligt tilgængelige dokumenter
"""
import hashlib
import json
import os
import re
from datetime import datetime
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from neo4j import GraphDatabase
class ScribdPublicHarvester:
    """Harvests publicly accessible Scribd documents without logging in.

    For each topic in SEARCH_TOPICS the harvester scrapes Scribd's public
    search page, fetches per-document metadata, and persists the results
    both to Neo4j and to a local JSON file under ``data/scribd_harvest``.
    """

    # SECURITY: credentials were previously hard-coded in source. They can now
    # be supplied via environment variables; the old values remain as defaults
    # for backward compatibility, but the exposed password should be rotated.
    NEO4J_URI = os.getenv("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
    NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.getenv(
        "NEO4J_PASSWORD", "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
    )

    # Timeout (seconds) for every outbound HTTP request. The original code
    # passed no timeout, so a stalled server would hang the harvester forever.
    REQUEST_TIMEOUT = 30

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }

    # Search terms used to find relevant documents.
    SEARCH_TOPICS = [
        "AI ethics",
        "generative AI",
        "machine learning business",
        "digital transformation",
        "cybersecurity threats",
        "OSINT techniques",
        "threat intelligence",
    ]

    def __init__(self):
        """Set up the HTTP session, output directory, and Neo4j driver."""
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)
        self.output_dir = Path("data/scribd_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.driver = GraphDatabase.driver(
            self.NEO4J_URI,
            auth=(self.NEO4J_USER, self.NEO4J_PASSWORD),
        )
        self.stats = {"found": 0, "saved": 0}

    @staticmethod
    def _absolutize(href: str) -> str:
        """Return an absolute scribd.com URL for a possibly-relative href."""
        return href if href.startswith("http") else f"https://www.scribd.com{href}"

    @staticmethod
    def _deduplicate(documents: list, max_results: int) -> list:
        """Return up to *max_results* documents, keeping the first of each URL.

        Deduplicates BEFORE truncating; the original sliced first, so duplicate
        URLs could shrink the result below max_results unnecessarily.
        """
        seen = set()
        unique = []
        for doc in documents:
            if doc["url"] not in seen:
                seen.add(doc["url"])
                unique.append(doc)
            if len(unique) >= max_results:
                break
        return unique

    @staticmethod
    def _doc_id_from_url(url: str) -> str:
        """Extract the numeric Scribd document id from *url*.

        Falls back to a short MD5 digest of the URL when no id is present,
        so every document still gets a stable identifier.
        """
        doc_id_match = re.search(r'/(document|doc)/(\d+)', url)
        if doc_id_match:
            return doc_id_match.group(2)
        return hashlib.md5(url.encode()).hexdigest()[:12]

    def search_documents(self, query: str, max_results: int = 20):
        """Search Scribd for *query*.

        Returns a list of ``{"title", "url", "query"}`` dicts (possibly empty).
        All network/parse failures are reported and swallowed — this is a
        best-effort scraper.
        """
        print(f"\n🔍 Søger: {query}")
        url = f"https://www.scribd.com/search?query={query.replace(' ', '+')}"
        try:
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                print(f" ❌ HTTP {response.status_code}")
                return []
            soup = BeautifulSoup(response.text, 'html.parser')
            documents = []
            # Primary strategy: structured search-result cards.
            for card in soup.select('.SearchResults_card, .document_cell, [data-e2e="search-result"]'):
                try:
                    link = (card.find('a', href=re.compile(r'/document/\d+'))
                            or card.find('a', href=re.compile(r'/doc/\d+')))
                    if not link:
                        continue
                    href = self._absolutize(link.get('href', ''))
                    # BUG FIX: the original passed CSS selectors ('.title',
                    # '[class*="title"]') to find(), which matches tag names
                    # only, so those entries could never match. CSS selectors
                    # belong in select_one().
                    title_elem = (card.find(['h2', 'h3'])
                                  or card.select_one('.title, [class*="title"]'))
                    title = title_elem.get_text(strip=True) if title_elem else link.get_text(strip=True)
                    if title and href:
                        documents.append({
                            "title": title[:200],
                            "url": href,
                            "query": query,
                        })
                except Exception:
                    # Skip malformed cards; the page layout varies.
                    continue
            # Fallback: scan all document links on the page.
            if not documents:
                for link in soup.find_all('a', href=re.compile(r'/(document|doc)/\d+')):
                    href = self._absolutize(link.get('href', ''))
                    title = link.get_text(strip=True) or link.get('title', '')
                    # len > 5 filters out icon/empty anchor texts.
                    if title and len(title) > 5:
                        documents.append({
                            "title": title[:200],
                            "url": href,
                            "query": query,
                        })
            unique = self._deduplicate(documents, max_results)
            print(f" ✅ Fandt {len(unique)} dokumenter")
            return unique
        except Exception as e:
            print(f" ❌ Fejl: {e}")
            return []

    def get_document_details(self, url: str) -> dict:
        """Fetch a document page and scrape its metadata.

        Returns a dict with id/title/author/url/description/thumbnail/doc_type,
        or an empty dict on any failure.
        """
        try:
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                return {}
            soup = BeautifulSoup(response.text, 'html.parser')
            # Title: prefer the page <h1>, fall back to <title> with the
            # Scribd branding suffixes stripped.
            title = ""
            title_elem = soup.find('h1') or soup.find('title')
            if title_elem:
                title = title_elem.get_text(strip=True).replace(' | PDF', '').replace(' | Scribd', '')
            # Author: first link to a /user/<id> profile, if any.
            author = ""
            author_elem = soup.find('a', href=re.compile(r'/user/\d+'))
            if author_elem:
                author = author_elem.get_text(strip=True)
            # Description: meta description, capped at 500 chars.
            description = ""
            desc_elem = soup.find('meta', {'name': 'description'})
            if desc_elem:
                description = desc_elem.get('content', '')[:500]
            doc_id = self._doc_id_from_url(url)
            # Thumbnail: Open Graph preview image, if present.
            thumbnail = ""
            og_image = soup.find('meta', {'property': 'og:image'})
            if og_image:
                thumbnail = og_image.get('content', '')
            return {
                "id": doc_id,
                "title": title,
                "author": author,
                "url": url,
                "description": description,
                "thumbnail": thumbnail,
                "doc_type": "document",
            }
        except Exception as e:
            print(f" ⚠️ Metadata fejl: {e}")
            return {}

    def save_to_neo4j(self, doc: dict, query: str):
        """Upsert *doc* into Neo4j, keyed by a hash of title+URL.

        Creates the ScribdDocument node on first sight, refreshes lastSeen on
        re-harvest, and links it to a shared 'Scribd' DataSource node.
        """
        content_hash = hashlib.md5(f"{doc['title']}:{doc['url']}".encode()).hexdigest()
        with self.driver.session() as session:
            session.run("""
                MERGE (d:ScribdDocument {contentHash: $hash})
                ON CREATE SET
                    d.id = $id,
                    d.title = $title,
                    d.author = $author,
                    d.url = $url,
                    d.description = $description,
                    d.thumbnail = $thumbnail,
                    d.docType = $doc_type,
                    d.searchQuery = $query,
                    d.harvestedAt = datetime(),
                    d.source = 'scribd_public_search'
                ON MATCH SET
                    d.lastSeen = datetime()
                MERGE (s:DataSource {name: 'Scribd'})
                ON CREATE SET s.type = 'document_repository', s.url = 'https://scribd.com'
                MERGE (d)-[:HARVESTED_FROM]->(s)
                """,
                hash=content_hash,
                id=doc.get('id', ''),
                title=doc.get('title', ''),
                author=doc.get('author', ''),
                url=doc.get('url', ''),
                description=doc.get('description', ''),
                thumbnail=doc.get('thumbnail', ''),
                doc_type=doc.get('doc_type', 'document'),
                query=query,
            )
        self.stats['saved'] += 1

    def run(self):
        """Run a full harvest: search every topic, persist every document.

        Returns the list of harvested document-detail dicts (also written to
        data/scribd_harvest/scribd_public_harvest.json).
        """
        print("=" * 60)
        print("📚 SCRIBD PUBLIC HARVESTER")
        print("=" * 60)
        all_docs = []
        for topic in self.SEARCH_TOPICS:
            docs = self.search_documents(topic)
            self.stats['found'] += len(docs)
            for doc in docs:
                details = self.get_document_details(doc['url'])
                if details:
                    details['query'] = topic
                    all_docs.append(details)
                    self.save_to_neo4j(details, topic)
                    print(f" 💾 {details['title'][:50]}...")
        # Summary
        print("\n" + "=" * 60)
        print("📊 HARVEST COMPLETE")
        print("=" * 60)
        print(f" 🔍 Topics searched: {len(self.SEARCH_TOPICS)}")
        print(f" 📄 Documents found: {self.stats['found']}")
        print(f" 💾 Saved to Neo4j: {self.stats['saved']}")
        print("=" * 60)
        # Save a local JSON copy alongside the Neo4j writes.
        output_file = self.output_dir / "scribd_public_harvest.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_docs, f, indent=2, ensure_ascii=False)
        print(f"\n📁 JSON saved: {output_file}")
        self.driver.close()
        return all_docs
if __name__ == "__main__":
    # Script entry point: run a full harvest across all configured topics.
    ScribdPublicHarvester().run()