# widgettdc-api/apps/backend/python/slideshare_harvester.py
#!/usr/bin/env python3
"""
📊 SlideShare Harvester - scraper feeding the Neo4j knowledge graph.
Fetches presentations about: Cybersecurity, AI, Cloud, Strategy, NIS2, OSINT
"""
import hashlib
import json
import os
import re
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import quote_plus

from neo4j import GraphDatabase
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
class SlideShareHarvester:
    """SlideShare presentation harvester.

    Searches SlideShare for presentations on WidgeTDC-relevant topics
    (cybersecurity, AI, cloud, strategy, compliance) and stores the
    results in a Neo4j knowledge graph.
    """

    # SECURITY: credentials are read from the environment when available.
    # The literal fallbacks below only preserve backward compatibility —
    # they were committed to source control and should be rotated and
    # removed.
    NEO4J_URI = os.getenv("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
    NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.getenv(
        "NEO4J_PASSWORD", "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
    )

    BASE_URL = "https://www.slideshare.net"

    # Search terms - focused on topics relevant to WidgeTDC
    SEARCH_TERMS = [
        # Cybersecurity
        "cybersecurity strategy",
        "SOC security operations",
        "threat intelligence CTI",
        "incident response",
        "NIS2 directive",
        "OSINT investigation",
        "penetration testing",
        "ransomware defense",
        "zero trust architecture",
        # AI & ML
        "AI cybersecurity",
        "machine learning security",
        "GPT enterprise",
        "generative AI business",
        "AI strategy enterprise",
        "LLM applications",
        # Cloud
        "cloud security architecture",
        "Azure security",
        "AWS security best practices",
        "multi-cloud strategy",
        "cloud migration security",
        # Business Strategy
        "digital transformation strategy",
        "IT strategy roadmap",
        "enterprise architecture",
        "technology roadmap",
        # Compliance & Governance
        "GDPR compliance",
        "ISO 27001",
        "cyber risk management",
        "security governance",
        # Specific Tech
        "SIEM implementation",
        "EDR endpoint detection",
        "MDR managed detection",
        "XDR security",
        "SOAR automation",
    ]
def __init__(self):
self.output_dir = Path("data/slideshare_harvest")
self.output_dir.mkdir(parents=True, exist_ok=True)
# Chrome options
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_argument("--disable-notifications")
print("🌐 Starter Chrome til SlideShare...")
self.driver = webdriver.Chrome(
service=Service(ChromeDriverManager().install()),
options=chrome_options
)
# Neo4j connection
self.neo4j = GraphDatabase.driver(
self.NEO4J_URI,
auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
)
self.presentations = []
self.stats = {
"searches": 0,
"presentations_found": 0,
"saved_to_neo4j": 0,
"errors": 0
}
def search_presentations(self, query: str, max_results: int = 20) -> list:
"""Søg efter præsentationer"""
results = []
try:
# Encode search query
search_url = f"{self.BASE_URL}/search?searchfrom=header&q={query.replace(' ', '+')}"
self.driver.get(search_url)
time.sleep(3)
self.stats["searches"] += 1
# Scroll to load more results
for _ in range(3):
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(1)
# Find presentation cards
selectors = [
'div[data-testid="slideshow-card"]',
'.slideshow-card',
'a[href*="/slideshow/"]',
'.search-result-item',
'article.slideshow'
]
cards = []
for selector in selectors:
cards = self.driver.find_elements(By.CSS_SELECTOR, selector)
if cards:
break
# Also try finding by link pattern
if not cards:
cards = self.driver.find_elements(By.CSS_SELECTOR, 'a[href*="slideshare.net"]')
print(f" Found {len(cards)} potential results")
for card in cards[:max_results]:
try:
# Extract presentation info
presentation = self._extract_presentation_info(card, query)
if presentation and presentation.get('url'):
# Avoid duplicates
if not any(p['url'] == presentation['url'] for p in results):
results.append(presentation)
self.stats["presentations_found"] += 1
except Exception as e:
continue
except Exception as e:
print(f" ⚠️ Search error: {str(e)[:100]}")
self.stats["errors"] += 1
return results
def _extract_presentation_info(self, element, search_query: str) -> dict:
"""Udtræk info fra et præsentationselement"""
try:
# Find link
link = None
url = ""
title = ""
# Try to get href directly or from child
if element.tag_name == 'a':
url = element.get_attribute('href') or ''
title = element.text.strip() or element.get_attribute('title') or ''
else:
try:
link = element.find_element(By.CSS_SELECTOR, 'a[href*="/slideshow/"], a[href*="slideshare"]')
url = link.get_attribute('href') or ''
title = link.text.strip() or link.get_attribute('title') or ''
except:
pass
if not url or '/slideshow/' not in url:
return None
# Clean URL
if '?' in url:
url = url.split('?')[0]
# Try to get description
description = ""
try:
desc_elem = element.find_element(By.CSS_SELECTOR, '.description, .summary, p')
description = desc_elem.text.strip()[:500]
except:
pass
# Try to get author
author = ""
try:
author_elem = element.find_element(By.CSS_SELECTOR, '.author, [data-testid="author"], .username')
author = author_elem.text.strip()
except:
pass
# Try to get views/downloads
views = 0
try:
views_elem = element.find_element(By.CSS_SELECTOR, '.views, .stats')
views_text = views_elem.text
views_match = re.search(r'(\d+[,.\d]*)\s*(?:views|downloads)', views_text, re.I)
if views_match:
views = int(views_match.group(1).replace(',', '').replace('.', ''))
except:
pass
return {
"title": title[:300] if title else f"Presentation from {url.split('/')[-1]}",
"url": url,
"description": description,
"author": author,
"views": views,
"search_query": search_query,
"source": "slideshare",
"harvested_at": datetime.now().isoformat()
}
except Exception as e:
return None
def get_presentation_details(self, url: str) -> dict:
"""Hent detaljer fra en præsentationsside"""
details = {}
try:
self.driver.get(url)
time.sleep(2)
# Title
try:
title_elem = self.driver.find_element(By.CSS_SELECTOR, 'h1, .slideshow-title, [data-testid="title"]')
details['title'] = title_elem.text.strip()
except:
pass
# Description
try:
desc_elem = self.driver.find_element(By.CSS_SELECTOR, '.slideshow-description, .description, [data-testid="description"]')
details['description'] = desc_elem.text.strip()[:1000]
except:
pass
# Author info
try:
author_elem = self.driver.find_element(By.CSS_SELECTOR, '.author-name, [data-testid="author"], .profile-name')
details['author'] = author_elem.text.strip()
except:
pass
# Categories/Tags
try:
tags = self.driver.find_elements(By.CSS_SELECTOR, '.tag, .category, [data-testid="tag"]')
details['tags'] = [t.text.strip() for t in tags[:10] if t.text.strip()]
except:
details['tags'] = []
# Slides count
try:
slides_elem = self.driver.find_element(By.CSS_SELECTOR, '.slides-count, [data-testid="slides-count"]')
slides_match = re.search(r'(\d+)', slides_elem.text)
if slides_match:
details['slide_count'] = int(slides_match.group(1))
except:
pass
# Date
try:
date_elem = self.driver.find_element(By.CSS_SELECTOR, '.date, time, [data-testid="date"]')
details['published_date'] = date_elem.text.strip()
except:
pass
except Exception as e:
print(f" ⚠️ Details error: {str(e)[:50]}")
return details
    def save_to_neo4j(self, presentation: dict):
        """Persist one presentation to Neo4j.

        Upserts a SlideSharePresentation node keyed by a hash of its URL,
        links it to the shared SlideShare DataSource node, and attaches a
        Category node derived from the presentation text.
        """
        # MD5 of the URL serves as a stable dedup key (not security-sensitive).
        content_hash = hashlib.md5(presentation['url'].encode()).hexdigest()
        try:
            with self.neo4j.session() as session:
                # Upsert: full metadata on first sight, refresh lastSeen/views
                # on subsequent harvests.
                session.run("""
MERGE (p:SlideSharePresentation {contentHash: $hash})
ON CREATE SET
p.title = $title,
p.url = $url,
p.description = $description,
p.author = $author,
p.views = $views,
p.searchQuery = $search_query,
p.tags = $tags,
p.slideCount = $slide_count,
p.harvestedAt = datetime()
ON MATCH SET
p.lastSeen = datetime(),
p.views = $views
MERGE (ds:DataSource {name: 'SlideShare'})
ON CREATE SET ds.type = 'presentation_platform', ds.url = 'https://slideshare.net'
MERGE (p)-[:HARVESTED_FROM]->(ds)
""",
                    hash=content_hash,
                    title=presentation.get('title', ''),
                    url=presentation.get('url', ''),
                    description=presentation.get('description', '')[:1000],
                    author=presentation.get('author', ''),
                    views=presentation.get('views', 0),
                    search_query=presentation.get('search_query', ''),
                    tags=presentation.get('tags', []),
                    slide_count=presentation.get('slide_count', 0)
                )
                # Add category relationships based on a keyword heuristic
                # over title/description/search query.
                category = self._categorize_presentation(presentation)
                if category:
                    session.run("""
MATCH (p:SlideSharePresentation {contentHash: $hash})
MERGE (c:Category {name: $category})
MERGE (p)-[:BELONGS_TO]->(c)
""", hash=content_hash, category=category)
            self.stats["saved_to_neo4j"] += 1
        except Exception as e:
            print(f" ❌ Neo4j error: {str(e)[:50]}")
            self.stats["errors"] += 1
def _categorize_presentation(self, presentation: dict) -> str:
"""Kategoriser præsentation baseret på indhold"""
text = f"{presentation.get('title', '')} {presentation.get('description', '')} {presentation.get('search_query', '')}".lower()
if any(kw in text for kw in ['cyber', 'security', 'threat', 'soc', 'incident', 'malware', 'ransomware']):
return 'CYBERSECURITY'
elif any(kw in text for kw in ['ai', 'artificial intelligence', 'machine learning', 'gpt', 'llm']):
return 'ARTIFICIAL_INTELLIGENCE'
elif any(kw in text for kw in ['cloud', 'azure', 'aws', 'migration']):
return 'CLOUD_COMPUTING'
elif any(kw in text for kw in ['strategy', 'roadmap', 'transformation', 'architecture']):
return 'STRATEGY'
elif any(kw in text for kw in ['compliance', 'gdpr', 'nis2', 'iso', 'governance']):
return 'COMPLIANCE'
elif any(kw in text for kw in ['osint', 'intelligence', 'investigation']):
return 'THREAT_INTELLIGENCE'
else:
return 'GENERAL'
def run(self):
"""Kør fuld harvest"""
print("\n" + "=" * 60)
print("📊 SLIDESHARE HARVESTER")
print("=" * 60)
print(f"Søgetermer: {len(self.SEARCH_TERMS)}")
print(f"Output: {self.output_dir}")
print("=" * 60)
# Navigate to SlideShare
self.driver.get(self.BASE_URL)
time.sleep(3)
# Handle any popups/cookies
try:
cookie_btn = self.driver.find_element(By.CSS_SELECTOR, '[data-testid="accept-cookies"], .accept-cookies, #onetrust-accept-btn-handler')
cookie_btn.click()
time.sleep(1)
except:
pass
all_presentations = []
# Search for each term
for i, term in enumerate(self.SEARCH_TERMS, 1):
print(f"\n🔍 [{i}/{len(self.SEARCH_TERMS)}] Søger: {term}")
results = self.search_presentations(term, max_results=15)
print(f" Found {len(results)} presentations")
for pres in results:
# Avoid duplicates
if not any(p['url'] == pres['url'] for p in all_presentations):
all_presentations.append(pres)
self.save_to_neo4j(pres)
print(f" ✓ {pres['title'][:60]}...")
# Small delay between searches
time.sleep(2)
# Save local JSON
output_file = self.output_dir / "slideshare_harvest.json"
with open(output_file, 'w', encoding='utf-8') as f:
json.dump({
"timestamp": datetime.now().isoformat(),
"stats": self.stats,
"presentations": all_presentations
}, f, indent=2, ensure_ascii=False)
# Summary
print("\n" + "=" * 60)
print("📊 HARVEST COMPLETE")
print("=" * 60)
print(f" 🔍 Searches performed: {self.stats['searches']}")
print(f" 📄 Presentations found: {self.stats['presentations_found']}")
print(f" 💾 Saved to Neo4j: {self.stats['saved_to_neo4j']}")
print(f" ⚠️ Errors: {self.stats['errors']}")
print(f" 📁 Output: {output_file}")
print("=" * 60)
self.driver.quit()
self.neo4j.close()
return all_presentations
# Script entry point: constructing the harvester starts Chrome and opens
# the Neo4j connection; run() performs the full harvest and cleans up.
if __name__ == "__main__":
    harvester = SlideShareHarvester()
    harvester.run()