# Source path: widgettdc-api/apps/backend/python/scribd_public_harvest.py
# Hosting-page residue: "Kraft102's picture"
# Commit message: "Update backend source"
# Commit: 34367da (verified)
#!/usr/bin/env python3
"""
📚 Scribd Public Harvester - Henter offentligt tilgængelige dokumenter
"""
import os
import json
import hashlib
import requests
import re
from pathlib import Path
from datetime import datetime
from bs4 import BeautifulSoup
from neo4j import GraphDatabase
class ScribdPublicHarvester:
    """Harvest publicly available Scribd documents without logging in.

    For every topic in ``SEARCH_TOPICS`` the harvester scrapes the Scribd
    search page, fetches metadata for each hit, merges the document into
    Neo4j and finally dumps everything to a local JSON file.

    Neo4j connection settings are read from the environment variables
    ``NEO4J_URI``, ``NEO4J_USER`` and ``NEO4J_PASSWORD``. Only the URI and
    the user name have defaults; the password must be supplied.
    """

    # Credentials must never live in source control. The password that used
    # to be hard-coded here has to be treated as leaked and rotated.
    NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
    NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")

    BASE_URL = "https://www.scribd.com"

    # Seconds to wait for Scribd before giving up; without a timeout a
    # stalled connection would block the harvest indefinitely.
    REQUEST_TIMEOUT = 30

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }

    # Search terms used to find relevant documents.
    SEARCH_TOPICS = [
        "AI ethics",
        "generative AI",
        "machine learning business",
        "digital transformation",
        "cybersecurity threats",
        "OSINT techniques",
        "threat intelligence"
    ]

    # Compiled once; "/document/<id>" links are preferred over "/doc/<id>".
    _DOCUMENT_HREF = re.compile(r'/document/\d+')
    _DOC_HREF = re.compile(r'/doc/\d+')
    _ANY_DOC_HREF = re.compile(r'/(document|doc)/\d+')

    def __init__(self):
        """Create the HTTP session, the output directory and the Neo4j driver.

        Raises:
            RuntimeError: if no Neo4j password is configured.
        """
        if not self.NEO4J_PASSWORD:
            # Fail before any scraping starts instead of on the first write.
            raise RuntimeError(
                "NEO4J_PASSWORD is not set; export it in the environment "
                "before running the harvester"
            )
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)
        self.output_dir = Path("data/scribd_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.driver = GraphDatabase.driver(
            self.NEO4J_URI,
            auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
        )
        self.stats = {"found": 0, "saved": 0}

    @staticmethod
    def _absolute_url(href):
        """Return *href* unchanged if absolute, else prefixed with the Scribd host."""
        if href.startswith('http'):
            return href
        return f"https://www.scribd.com{href}"

    @staticmethod
    def _dedupe(documents, max_results):
        """Return the first *max_results* entries of *documents* with unique URLs.

        Duplicates are dropped before the limit is applied, so repeated
        links near the top of the page do not shrink the result.
        """
        seen = set()
        unique = []
        for doc in documents:
            if len(unique) >= max_results:
                break
            if doc['url'] in seen:
                continue
            seen.add(doc['url'])
            unique.append(doc)
        return unique

    @staticmethod
    def _extract_doc_id(url):
        """Return the numeric id in *url*, or a 12-char MD5 prefix of the URL."""
        match = re.search(r'/(document|doc)/(\d+)', url)
        if match:
            return match.group(2)
        # MD5 is used as a stable fingerprint only, not for security.
        return hashlib.md5(url.encode()).hexdigest()[:12]

    def _parse_result_cards(self, soup, query):
        """Collect ``{"title", "url", "query"}`` dicts from search-result cards."""
        documents = []
        for card in soup.select('.SearchResults_card, .document_cell, [data-e2e="search-result"]'):
            try:
                link = (card.find('a', href=self._DOCUMENT_HREF)
                        or card.find('a', href=self._DOC_HREF))
                if not link:
                    continue
                href = self._absolute_url(link.get('href', ''))
                # select_one takes CSS selectors; find() would treat
                # '.title' and '[class*="title"]' as literal tag names.
                title_elem = card.select_one('h2, h3, .title, [class*="title"]')
                if title_elem:
                    title = title_elem.get_text(strip=True)
                else:
                    title = link.get_text(strip=True)
                if title and href:
                    documents.append({
                        "title": title[:200],
                        "url": href,
                        "query": query
                    })
            except (AttributeError, TypeError):
                # One malformed card must not discard the remaining results.
                continue
        return documents

    def _parse_fallback_links(self, soup, query):
        """Collect documents from every document link on the page."""
        documents = []
        for link in soup.find_all('a', href=self._ANY_DOC_HREF):
            href = self._absolute_url(link.get('href', ''))
            title = link.get_text(strip=True) or link.get('title', '')
            # Very short link texts are skipped.
            if title and len(title) > 5:
                documents.append({
                    "title": title[:200],
                    "url": href,
                    "query": query
                })
        return documents

    def search_documents(self, query: str, max_results: int = 20):
        """Search Scribd for *query* and return up to *max_results* documents.

        Args:
            query: Free-text search phrase.
            max_results: Maximum number of unique documents to return.

        Returns:
            A list of dicts with the keys ``title``, ``url`` and ``query``.
            An empty list is returned on a non-200 response or on any error.
        """
        print(f"\n🔍 Søger: {query}")
        try:
            # Passing the query via `params` lets requests URL-encode it, so
            # characters such as '&', '#' or '+' no longer corrupt the URL.
            response = self.session.get(
                f"{self.BASE_URL}/search",
                params={"query": query},
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                print(f" ❌ HTTP {response.status_code}")
                return []
            soup = BeautifulSoup(response.text, 'html.parser')
            documents = self._parse_result_cards(soup, query)
            if not documents:
                # No known card markup matched: fall back to raw links.
                documents = self._parse_fallback_links(soup, query)
            unique = self._dedupe(documents, max_results)
            print(f" ✅ Fandt {len(unique)} dokumenter")
            return unique
        except Exception as e:
            # Harvest boundary: a failing topic is reported and skipped so
            # the remaining topics are still processed.
            print(f" ❌ Fejl: {e}")
            return []

    def get_document_details(self, url: str) -> dict:
        """Fetch a document page and extract its metadata.

        Args:
            url: Absolute URL of the Scribd document page.

        Returns:
            A dict with the keys ``id``, ``title``, ``author``, ``url``,
            ``description``, ``thumbnail`` and ``doc_type``. An empty dict
            is returned on a non-200 response or on any error.
        """
        try:
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                return {}
            soup = BeautifulSoup(response.text, 'html.parser')

            title = ""
            title_elem = soup.find('h1') or soup.find('title')
            if title_elem:
                # Strip the site suffixes from the title text.
                title = title_elem.get_text(strip=True).replace(' | PDF', '').replace(' | Scribd', '')

            author = ""
            author_elem = soup.find('a', href=re.compile(r'/user/\d+'))
            if author_elem:
                author = author_elem.get_text(strip=True)

            description = ""
            desc_elem = soup.find('meta', {'name': 'description'})
            if desc_elem:
                description = desc_elem.get('content', '')[:500]

            thumbnail = ""
            og_image = soup.find('meta', {'property': 'og:image'})
            if og_image:
                thumbnail = og_image.get('content', '')

            return {
                "id": self._extract_doc_id(url),
                "title": title,
                "author": author,
                "url": url,
                "description": description,
                "thumbnail": thumbnail,
                "doc_type": "document"
            }
        except Exception as e:
            # Harvest boundary: one bad page must not abort the whole run.
            print(f" ⚠️ Metadata fejl: {e}")
            return {}

    def save_to_neo4j(self, doc: dict, query: str):
        """Merge *doc* into Neo4j and link it to the ``Scribd`` data source.

        The node is keyed on an MD5 hash of ``"<title>:<url>"``. A new node
        gets all properties; an existing one only has ``lastSeen`` updated.

        Args:
            doc: Metadata dict as returned by ``get_document_details``.
            query: Search phrase that led to this document.
        """
        title = doc.get('title', '')
        url = doc.get('url', '')
        # MD5 is used as a stable fingerprint only, not for security.
        content_hash = hashlib.md5(f"{title}:{url}".encode()).hexdigest()
        with self.driver.session() as session:
            session.run(
                """
                MERGE (d:ScribdDocument {contentHash: $hash})
                ON CREATE SET
                d.id = $id,
                d.title = $title,
                d.author = $author,
                d.url = $url,
                d.description = $description,
                d.thumbnail = $thumbnail,
                d.docType = $doc_type,
                d.searchQuery = $query,
                d.harvestedAt = datetime(),
                d.source = 'scribd_public_search'
                ON MATCH SET
                d.lastSeen = datetime()
                MERGE (s:DataSource {name: 'Scribd'})
                ON CREATE SET s.type = 'document_repository', s.url = 'https://scribd.com'
                MERGE (d)-[:HARVESTED_FROM]->(s)
                """,
                hash=content_hash,
                id=doc.get('id', ''),
                title=title,
                author=doc.get('author', ''),
                url=url,
                description=doc.get('description', ''),
                thumbnail=doc.get('thumbnail', ''),
                doc_type=doc.get('doc_type', 'document'),
                query=query
            )
        self.stats['saved'] += 1

    def run(self):
        """Run a full harvest over all topics.

        Returns:
            The list of metadata dicts of every document that was saved.
            The same list is written to ``scribd_public_harvest.json`` in
            the output directory.
        """
        print("=" * 60)
        print("📚 SCRIBD PUBLIC HARVESTER")
        print("=" * 60)
        all_docs = []
        try:
            for topic in self.SEARCH_TOPICS:
                docs = self.search_documents(topic)
                self.stats['found'] += len(docs)
                for doc in docs:
                    details = self.get_document_details(doc['url'])
                    if not details:
                        continue
                    details['query'] = topic
                    all_docs.append(details)
                    self.save_to_neo4j(details, topic)
                    print(f" 💾 {details['title'][:50]}...")

            print("\n" + "=" * 60)
            print("📊 HARVEST COMPLETE")
            print("=" * 60)
            print(f" 🔍 Topics searched: {len(self.SEARCH_TOPICS)}")
            print(f" 📄 Documents found: {self.stats['found']}")
            print(f" 💾 Saved to Neo4j: {self.stats['saved']}")
            print("=" * 60)

            output_file = self.output_dir / "scribd_public_harvest.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(all_docs, f, indent=2, ensure_ascii=False)
            print(f"\n📁 JSON saved: {output_file}")
        finally:
            # Release the driver's connections even when the harvest fails.
            self.driver.close()
        return all_docs
if __name__ == "__main__":
    # Script entry point: build a harvester and perform one full harvest.
    ScribdPublicHarvester().run()