# NOTE(review): the three lines below were hosting-UI/extraction residue
# ("Spaces: Paused"), not program code — kept here as comments.
# Spaces:
# Paused
# Paused
#!/usr/bin/env python3
"""
📚 Scribd Public Harvester - Henter offentligt tilgængelige dokumenter
"""
import hashlib
import json
import os
import re
from datetime import datetime
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from neo4j import GraphDatabase
class ScribdPublicHarvester:
    """Harvests publicly accessible Scribd documents without logging in.

    For each topic in SEARCH_TOPICS the harvester scrapes Scribd's public
    search page, fetches per-document metadata, and persists the results
    both to Neo4j and to a local JSON file under ``data/scribd_harvest``.
    """

    # SECURITY: credentials were previously hard-coded in source. They can now
    # be supplied via environment variables; the old values remain as defaults
    # for backward compatibility, but the exposed password should be rotated.
    NEO4J_URI = os.getenv("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
    NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.getenv(
        "NEO4J_PASSWORD", "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
    )

    # Timeout (seconds) for every outbound HTTP request. The original code
    # passed no timeout, so a stalled server would hang the harvester forever.
    REQUEST_TIMEOUT = 30

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }

    # Search terms used to find relevant documents.
    SEARCH_TOPICS = [
        "AI ethics",
        "generative AI",
        "machine learning business",
        "digital transformation",
        "cybersecurity threats",
        "OSINT techniques",
        "threat intelligence",
    ]

    def __init__(self):
        """Set up the HTTP session, output directory, and Neo4j driver."""
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)
        self.output_dir = Path("data/scribd_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.driver = GraphDatabase.driver(
            self.NEO4J_URI,
            auth=(self.NEO4J_USER, self.NEO4J_PASSWORD),
        )
        self.stats = {"found": 0, "saved": 0}

    @staticmethod
    def _absolutize(href: str) -> str:
        """Return an absolute scribd.com URL for a possibly-relative href."""
        return href if href.startswith("http") else f"https://www.scribd.com{href}"

    @staticmethod
    def _deduplicate(documents: list, max_results: int) -> list:
        """Return up to *max_results* documents, keeping the first of each URL.

        Deduplicates BEFORE truncating; the original sliced first, so duplicate
        URLs could shrink the result below max_results unnecessarily.
        """
        seen = set()
        unique = []
        for doc in documents:
            if doc["url"] not in seen:
                seen.add(doc["url"])
                unique.append(doc)
            if len(unique) >= max_results:
                break
        return unique

    @staticmethod
    def _doc_id_from_url(url: str) -> str:
        """Extract the numeric Scribd document id from *url*.

        Falls back to a short MD5 digest of the URL when no id is present,
        so every document still gets a stable identifier.
        """
        doc_id_match = re.search(r'/(document|doc)/(\d+)', url)
        if doc_id_match:
            return doc_id_match.group(2)
        return hashlib.md5(url.encode()).hexdigest()[:12]

    def search_documents(self, query: str, max_results: int = 20):
        """Search Scribd for *query*.

        Returns a list of ``{"title", "url", "query"}`` dicts (possibly empty).
        All network/parse failures are reported and swallowed — this is a
        best-effort scraper.
        """
        print(f"\n🔍 Søger: {query}")
        url = f"https://www.scribd.com/search?query={query.replace(' ', '+')}"
        try:
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                print(f" ❌ HTTP {response.status_code}")
                return []
            soup = BeautifulSoup(response.text, 'html.parser')
            documents = []
            # Primary strategy: structured search-result cards.
            for card in soup.select('.SearchResults_card, .document_cell, [data-e2e="search-result"]'):
                try:
                    link = (card.find('a', href=re.compile(r'/document/\d+'))
                            or card.find('a', href=re.compile(r'/doc/\d+')))
                    if not link:
                        continue
                    href = self._absolutize(link.get('href', ''))
                    # BUG FIX: the original passed CSS selectors ('.title',
                    # '[class*="title"]') to find(), which matches tag names
                    # only, so those entries could never match. CSS selectors
                    # belong in select_one().
                    title_elem = (card.find(['h2', 'h3'])
                                  or card.select_one('.title, [class*="title"]'))
                    title = title_elem.get_text(strip=True) if title_elem else link.get_text(strip=True)
                    if title and href:
                        documents.append({
                            "title": title[:200],
                            "url": href,
                            "query": query,
                        })
                except Exception:
                    # Skip malformed cards; the page layout varies.
                    continue
            # Fallback: scan all document links on the page.
            if not documents:
                for link in soup.find_all('a', href=re.compile(r'/(document|doc)/\d+')):
                    href = self._absolutize(link.get('href', ''))
                    title = link.get_text(strip=True) or link.get('title', '')
                    # len > 5 filters out icon/empty anchor texts.
                    if title and len(title) > 5:
                        documents.append({
                            "title": title[:200],
                            "url": href,
                            "query": query,
                        })
            unique = self._deduplicate(documents, max_results)
            print(f" ✅ Fandt {len(unique)} dokumenter")
            return unique
        except Exception as e:
            print(f" ❌ Fejl: {e}")
            return []

    def get_document_details(self, url: str) -> dict:
        """Fetch a document page and scrape its metadata.

        Returns a dict with id/title/author/url/description/thumbnail/doc_type,
        or an empty dict on any failure.
        """
        try:
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                return {}
            soup = BeautifulSoup(response.text, 'html.parser')
            # Title: prefer the page <h1>, fall back to <title> with the
            # Scribd branding suffixes stripped.
            title = ""
            title_elem = soup.find('h1') or soup.find('title')
            if title_elem:
                title = title_elem.get_text(strip=True).replace(' | PDF', '').replace(' | Scribd', '')
            # Author: first link to a /user/<id> profile, if any.
            author = ""
            author_elem = soup.find('a', href=re.compile(r'/user/\d+'))
            if author_elem:
                author = author_elem.get_text(strip=True)
            # Description: meta description, capped at 500 chars.
            description = ""
            desc_elem = soup.find('meta', {'name': 'description'})
            if desc_elem:
                description = desc_elem.get('content', '')[:500]
            doc_id = self._doc_id_from_url(url)
            # Thumbnail: Open Graph preview image, if present.
            thumbnail = ""
            og_image = soup.find('meta', {'property': 'og:image'})
            if og_image:
                thumbnail = og_image.get('content', '')
            return {
                "id": doc_id,
                "title": title,
                "author": author,
                "url": url,
                "description": description,
                "thumbnail": thumbnail,
                "doc_type": "document",
            }
        except Exception as e:
            print(f" ⚠️ Metadata fejl: {e}")
            return {}

    def save_to_neo4j(self, doc: dict, query: str):
        """Upsert *doc* into Neo4j, keyed by a hash of title+URL.

        Creates the ScribdDocument node on first sight, refreshes lastSeen on
        re-harvest, and links it to a shared 'Scribd' DataSource node.
        """
        content_hash = hashlib.md5(f"{doc['title']}:{doc['url']}".encode()).hexdigest()
        with self.driver.session() as session:
            session.run("""
                MERGE (d:ScribdDocument {contentHash: $hash})
                ON CREATE SET
                    d.id = $id,
                    d.title = $title,
                    d.author = $author,
                    d.url = $url,
                    d.description = $description,
                    d.thumbnail = $thumbnail,
                    d.docType = $doc_type,
                    d.searchQuery = $query,
                    d.harvestedAt = datetime(),
                    d.source = 'scribd_public_search'
                ON MATCH SET
                    d.lastSeen = datetime()
                MERGE (s:DataSource {name: 'Scribd'})
                ON CREATE SET s.type = 'document_repository', s.url = 'https://scribd.com'
                MERGE (d)-[:HARVESTED_FROM]->(s)
                """,
                hash=content_hash,
                id=doc.get('id', ''),
                title=doc.get('title', ''),
                author=doc.get('author', ''),
                url=doc.get('url', ''),
                description=doc.get('description', ''),
                thumbnail=doc.get('thumbnail', ''),
                doc_type=doc.get('doc_type', 'document'),
                query=query,
            )
        self.stats['saved'] += 1

    def run(self):
        """Run a full harvest: search every topic, persist every document.

        Returns the list of harvested document-detail dicts (also written to
        data/scribd_harvest/scribd_public_harvest.json).
        """
        print("=" * 60)
        print("📚 SCRIBD PUBLIC HARVESTER")
        print("=" * 60)
        all_docs = []
        for topic in self.SEARCH_TOPICS:
            docs = self.search_documents(topic)
            self.stats['found'] += len(docs)
            for doc in docs:
                details = self.get_document_details(doc['url'])
                if details:
                    details['query'] = topic
                    all_docs.append(details)
                    self.save_to_neo4j(details, topic)
                    print(f" 💾 {details['title'][:50]}...")
        # Summary
        print("\n" + "=" * 60)
        print("📊 HARVEST COMPLETE")
        print("=" * 60)
        print(f" 🔍 Topics searched: {len(self.SEARCH_TOPICS)}")
        print(f" 📄 Documents found: {self.stats['found']}")
        print(f" 💾 Saved to Neo4j: {self.stats['saved']}")
        print("=" * 60)
        # Save a local JSON copy alongside the Neo4j writes.
        output_file = self.output_dir / "scribd_public_harvest.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_docs, f, indent=2, ensure_ascii=False)
        print(f"\n📁 JSON saved: {output_file}")
        self.driver.close()
        return all_docs
if __name__ == "__main__":
    # Script entry point: run a full harvest across all configured topics.
    ScribdPublicHarvester().run()