# widgettdc-api/apps/backend/python/slideshare_harvester.py
#!/usr/bin/env python3
"""
📊 SlideShare Harvester - scraper feeding the Neo4j knowledge graph.
Fetches presentations about: Cybersecurity, AI, Cloud, Strategy, NIS2, OSINT
"""
import hashlib
import json
import os
import re
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import quote_plus

from neo4j import GraphDatabase
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
class SlideShareHarvester:
    """SlideShare presentation harvester.

    Searches SlideShare for presentations on WidgeTDC-relevant topics
    (cybersecurity, AI, cloud, strategy, compliance) and stores the
    results in a Neo4j knowledge graph.
    """

    # SECURITY: credentials are read from the environment when available.
    # The literal fallbacks below only preserve backward compatibility —
    # they were committed to source control and should be rotated and
    # removed.
    NEO4J_URI = os.getenv("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
    NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.getenv(
        "NEO4J_PASSWORD", "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
    )

    BASE_URL = "https://www.slideshare.net"

    # Search terms - focused on topics relevant to WidgeTDC
    SEARCH_TERMS = [
        # Cybersecurity
        "cybersecurity strategy",
        "SOC security operations",
        "threat intelligence CTI",
        "incident response",
        "NIS2 directive",
        "OSINT investigation",
        "penetration testing",
        "ransomware defense",
        "zero trust architecture",
        # AI & ML
        "AI cybersecurity",
        "machine learning security",
        "GPT enterprise",
        "generative AI business",
        "AI strategy enterprise",
        "LLM applications",
        # Cloud
        "cloud security architecture",
        "Azure security",
        "AWS security best practices",
        "multi-cloud strategy",
        "cloud migration security",
        # Business Strategy
        "digital transformation strategy",
        "IT strategy roadmap",
        "enterprise architecture",
        "technology roadmap",
        # Compliance & Governance
        "GDPR compliance",
        "ISO 27001",
        "cyber risk management",
        "security governance",
        # Specific Tech
        "SIEM implementation",
        "EDR endpoint detection",
        "MDR managed detection",
        "XDR security",
        "SOAR automation",
    ]
def __init__(self):
self.output_dir = Path("data/slideshare_harvest")
self.output_dir.mkdir(parents=True, exist_ok=True)
# Chrome options
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_argument("--disable-notifications")
print("🌐 Starter Chrome til SlideShare...")
self.driver = webdriver.Chrome(
service=Service(ChromeDriverManager().install()),
options=chrome_options
)
# Neo4j connection
self.neo4j = GraphDatabase.driver(
self.NEO4J_URI,
auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
)
self.presentations = []
self.stats = {
"searches": 0,
"presentations_found": 0,
"saved_to_neo4j": 0,
"errors": 0
}
def search_presentations(self, query: str, max_results: int = 20) -> list:
"""Søg efter præsentationer"""
results = []
try:
# Encode search query
search_url = f"{self.BASE_URL}/search?searchfrom=header&q={query.replace(' ', '+')}"
self.driver.get(search_url)
time.sleep(3)
self.stats["searches"] += 1
# Scroll to load more results
for _ in range(3):
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(1)
# Find presentation cards
selectors = [
'div[data-testid="slideshow-card"]',
'.slideshow-card',
'a[href*="/slideshow/"]',
'.search-result-item',
'article.slideshow'
]
cards = []
for selector in selectors:
cards = self.driver.find_elements(By.CSS_SELECTOR, selector)
if cards:
break
# Also try finding by link pattern
if not cards:
cards = self.driver.find_elements(By.CSS_SELECTOR, 'a[href*="slideshare.net"]')
print(f" Found {len(cards)} potential results")
for card in cards[:max_results]:
try:
# Extract presentation info
presentation = self._extract_presentation_info(card, query)
if presentation and presentation.get('url'):
# Avoid duplicates
if not any(p['url'] == presentation['url'] for p in results):
results.append(presentation)
self.stats["presentations_found"] += 1
except Exception as e:
continue
except Exception as e:
print(f" ⚠️ Search error: {str(e)[:100]}")
self.stats["errors"] += 1
return results
def _extract_presentation_info(self, element, search_query: str) -> dict:
"""Udtræk info fra et præsentationselement"""
try:
# Find link
link = None
url = ""
title = ""
# Try to get href directly or from child
if element.tag_name == 'a':
url = element.get_attribute('href') or ''
title = element.text.strip() or element.get_attribute('title') or ''
else:
try:
link = element.find_element(By.CSS_SELECTOR, 'a[href*="/slideshow/"], a[href*="slideshare"]')
url = link.get_attribute('href') or ''
title = link.text.strip() or link.get_attribute('title') or ''
except:
pass
if not url or '/slideshow/' not in url:
return None
# Clean URL
if '?' in url:
url = url.split('?')[0]
# Try to get description
description = ""
try:
desc_elem = element.find_element(By.CSS_SELECTOR, '.description, .summary, p')
description = desc_elem.text.strip()[:500]
except:
pass
# Try to get author
author = ""
try:
author_elem = element.find_element(By.CSS_SELECTOR, '.author, [data-testid="author"], .username')
author = author_elem.text.strip()
except:
pass
# Try to get views/downloads
views = 0
try:
views_elem = element.find_element(By.CSS_SELECTOR, '.views, .stats')
views_text = views_elem.text
views_match = re.search(r'(\d+[,.\d]*)\s*(?:views|downloads)', views_text, re.I)
if views_match:
views = int(views_match.group(1).replace(',', '').replace('.', ''))
except:
pass
return {
"title": title[:300] if title else f"Presentation from {url.split('/')[-1]}",
"url": url,
"description": description,
"author": author,
"views": views,
"search_query": search_query,
"source": "slideshare",
"harvested_at": datetime.now().isoformat()
}
except Exception as e:
return None
def get_presentation_details(self, url: str) -> dict:
"""Hent detaljer fra en præsentationsside"""
details = {}
try:
self.driver.get(url)
time.sleep(2)
# Title
try:
title_elem = self.driver.find_element(By.CSS_SELECTOR, 'h1, .slideshow-title, [data-testid="title"]')
details['title'] = title_elem.text.strip()
except:
pass
# Description
try:
desc_elem = self.driver.find_element(By.CSS_SELECTOR, '.slideshow-description, .description, [data-testid="description"]')
details['description'] = desc_elem.text.strip()[:1000]
except:
pass
# Author info
try:
author_elem = self.driver.find_element(By.CSS_SELECTOR, '.author-name, [data-testid="author"], .profile-name')
details['author'] = author_elem.text.strip()
except:
pass
# Categories/Tags
try:
tags = self.driver.find_elements(By.CSS_SELECTOR, '.tag, .category, [data-testid="tag"]')
details['tags'] = [t.text.strip() for t in tags[:10] if t.text.strip()]
except:
details['tags'] = []
# Slides count
try:
slides_elem = self.driver.find_element(By.CSS_SELECTOR, '.slides-count, [data-testid="slides-count"]')
slides_match = re.search(r'(\d+)', slides_elem.text)
if slides_match:
details['slide_count'] = int(slides_match.group(1))
except:
pass
# Date
try:
date_elem = self.driver.find_element(By.CSS_SELECTOR, '.date, time, [data-testid="date"]')
details['published_date'] = date_elem.text.strip()
except:
pass
except Exception as e:
print(f" ⚠️ Details error: {str(e)[:50]}")
return details
    def save_to_neo4j(self, presentation: dict):
        """Persist one presentation to Neo4j.

        Upserts a SlideSharePresentation node keyed by a hash of its URL,
        links it to the shared SlideShare DataSource node, and attaches a
        Category node derived from the presentation text.
        """
        # MD5 of the URL serves as a stable dedup key (not security-sensitive).
        content_hash = hashlib.md5(presentation['url'].encode()).hexdigest()
        try:
            with self.neo4j.session() as session:
                # Upsert: full metadata on first sight, refresh lastSeen/views
                # on subsequent harvests.
                session.run("""
MERGE (p:SlideSharePresentation {contentHash: $hash})
ON CREATE SET
p.title = $title,
p.url = $url,
p.description = $description,
p.author = $author,
p.views = $views,
p.searchQuery = $search_query,
p.tags = $tags,
p.slideCount = $slide_count,
p.harvestedAt = datetime()
ON MATCH SET
p.lastSeen = datetime(),
p.views = $views
MERGE (ds:DataSource {name: 'SlideShare'})
ON CREATE SET ds.type = 'presentation_platform', ds.url = 'https://slideshare.net'
MERGE (p)-[:HARVESTED_FROM]->(ds)
""",
                    hash=content_hash,
                    title=presentation.get('title', ''),
                    url=presentation.get('url', ''),
                    description=presentation.get('description', '')[:1000],
                    author=presentation.get('author', ''),
                    views=presentation.get('views', 0),
                    search_query=presentation.get('search_query', ''),
                    tags=presentation.get('tags', []),
                    slide_count=presentation.get('slide_count', 0)
                )
                # Add category relationships based on a keyword heuristic
                # over title/description/search query.
                category = self._categorize_presentation(presentation)
                if category:
                    session.run("""
MATCH (p:SlideSharePresentation {contentHash: $hash})
MERGE (c:Category {name: $category})
MERGE (p)-[:BELONGS_TO]->(c)
""", hash=content_hash, category=category)
            self.stats["saved_to_neo4j"] += 1
        except Exception as e:
            print(f" ❌ Neo4j error: {str(e)[:50]}")
            self.stats["errors"] += 1
def _categorize_presentation(self, presentation: dict) -> str:
"""Kategoriser præsentation baseret på indhold"""
text = f"{presentation.get('title', '')} {presentation.get('description', '')} {presentation.get('search_query', '')}".lower()
if any(kw in text for kw in ['cyber', 'security', 'threat', 'soc', 'incident', 'malware', 'ransomware']):
return 'CYBERSECURITY'
elif any(kw in text for kw in ['ai', 'artificial intelligence', 'machine learning', 'gpt', 'llm']):
return 'ARTIFICIAL_INTELLIGENCE'
elif any(kw in text for kw in ['cloud', 'azure', 'aws', 'migration']):
return 'CLOUD_COMPUTING'
elif any(kw in text for kw in ['strategy', 'roadmap', 'transformation', 'architecture']):
return 'STRATEGY'
elif any(kw in text for kw in ['compliance', 'gdpr', 'nis2', 'iso', 'governance']):
return 'COMPLIANCE'
elif any(kw in text for kw in ['osint', 'intelligence', 'investigation']):
return 'THREAT_INTELLIGENCE'
else:
return 'GENERAL'
def run(self):
"""Kør fuld harvest"""
print("\n" + "=" * 60)
print("📊 SLIDESHARE HARVESTER")
print("=" * 60)
print(f"Søgetermer: {len(self.SEARCH_TERMS)}")
print(f"Output: {self.output_dir}")
print("=" * 60)
# Navigate to SlideShare
self.driver.get(self.BASE_URL)
time.sleep(3)
# Handle any popups/cookies
try:
cookie_btn = self.driver.find_element(By.CSS_SELECTOR, '[data-testid="accept-cookies"], .accept-cookies, #onetrust-accept-btn-handler')
cookie_btn.click()
time.sleep(1)
except:
pass
all_presentations = []
# Search for each term
for i, term in enumerate(self.SEARCH_TERMS, 1):
print(f"\n🔍 [{i}/{len(self.SEARCH_TERMS)}] Søger: {term}")
results = self.search_presentations(term, max_results=15)
print(f" Found {len(results)} presentations")
for pres in results:
# Avoid duplicates
if not any(p['url'] == pres['url'] for p in all_presentations):
all_presentations.append(pres)
self.save_to_neo4j(pres)
print(f" ✓ {pres['title'][:60]}...")
# Small delay between searches
time.sleep(2)
# Save local JSON
output_file = self.output_dir / "slideshare_harvest.json"
with open(output_file, 'w', encoding='utf-8') as f:
json.dump({
"timestamp": datetime.now().isoformat(),
"stats": self.stats,
"presentations": all_presentations
}, f, indent=2, ensure_ascii=False)
# Summary
print("\n" + "=" * 60)
print("📊 HARVEST COMPLETE")
print("=" * 60)
print(f" 🔍 Searches performed: {self.stats['searches']}")
print(f" 📄 Presentations found: {self.stats['presentations_found']}")
print(f" 💾 Saved to Neo4j: {self.stats['saved_to_neo4j']}")
print(f" ⚠️ Errors: {self.stats['errors']}")
print(f" 📁 Output: {output_file}")
print("=" * 60)
self.driver.quit()
self.neo4j.close()
return all_presentations
# Script entry point: constructing the harvester starts Chrome and opens
# the Neo4j connection; run() performs the full harvest and cleans up.
if __name__ == "__main__":
    harvester = SlideShareHarvester()
    harvester.run()