#!/usr/bin/env python3
"""
📊 SlideShare Harvester - scraper feeding a Neo4j knowledge graph.

Fetches presentations about: Cybersecurity, AI, Cloud, Strategy, NIS2, OSINT.
"""

import hashlib
import json
import os
import re
import time
from datetime import datetime
from pathlib import Path

from neo4j import GraphDatabase
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # noqa: F401 -- kept; file may grow to use it
from selenium.webdriver.support import expected_conditions as EC  # noqa: F401
from selenium.webdriver.support.ui import WebDriverWait  # noqa: F401
from webdriver_manager.chrome import ChromeDriverManager


class SlideShareHarvester:
    """Harvest SlideShare presentations via Selenium and persist them to Neo4j.

    Lifecycle: construct (starts Chrome, opens the Neo4j driver), then call
    :meth:`run`, which performs all searches, upserts results into Neo4j,
    writes a local JSON dump, and always shuts both connections down.
    """

    # SECURITY: these credentials used to be hard-coded literals. They are now
    # read from the environment first; the original values remain only as
    # fallbacks so existing deployments keep working. Rotate the password and
    # remove the fallbacks from version control as soon as possible.
    NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
    NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.environ.get(
        "NEO4J_PASSWORD", "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
    )

    BASE_URL = "https://www.slideshare.net"

    # Search terms - focused on WidgeTDC-relevant topics
    SEARCH_TERMS = [
        # Cybersecurity
        "cybersecurity strategy",
        "SOC security operations",
        "threat intelligence CTI",
        "incident response",
        "NIS2 directive",
        "OSINT investigation",
        "penetration testing",
        "ransomware defense",
        "zero trust architecture",
        # AI & ML
        "AI cybersecurity",
        "machine learning security",
        "GPT enterprise",
        "generative AI business",
        "AI strategy enterprise",
        "LLM applications",
        # Cloud
        "cloud security architecture",
        "Azure security",
        "AWS security best practices",
        "multi-cloud strategy",
        "cloud migration security",
        # Business Strategy
        "digital transformation strategy",
        "IT strategy roadmap",
        "enterprise architecture",
        "technology roadmap",
        # Compliance & Governance
        "GDPR compliance",
        "ISO 27001",
        "cyber risk management",
        "security governance",
        # Specific Tech
        "SIEM implementation",
        "EDR endpoint detection",
        "MDR managed detection",
        "XDR security",
        "SOAR automation",
    ]

    def __init__(self):
        """Create the output directory, launch Chrome and open Neo4j."""
        self.output_dir = Path("data/slideshare_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Chrome options: hide automation hints and notifications so the page
        # behaves like an ordinary interactive session.
        chrome_options = Options()
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_argument("--disable-notifications")

        print("🌐 Starter Chrome til SlideShare...")
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options,
        )

        # Neo4j connection -- closed again in run()'s finally block.
        self.neo4j = GraphDatabase.driver(
            self.NEO4J_URI, auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
        )

        self.presentations = []
        self.stats = {
            "searches": 0,
            "presentations_found": 0,
            "saved_to_neo4j": 0,
            "errors": 0,
        }

    def search_presentations(self, query: str, max_results: int = 20) -> list:
        """Run one SlideShare search and return up to *max_results* hits.

        Each hit is the dict built by :meth:`_extract_presentation_info`;
        duplicates (same URL) within this search are skipped. Failures are
        counted in ``self.stats['errors']`` and a partial (possibly empty)
        list is returned instead of raising.
        """
        results = []
        try:
            # Encode search query (space -> '+' is sufficient for these terms)
            search_url = f"{self.BASE_URL}/search?searchfrom=header&q={query.replace(' ', '+')}"
            self.driver.get(search_url)
            time.sleep(3)
            self.stats["searches"] += 1

            # Scroll to trigger lazy loading of more results
            for _ in range(3):
                self.driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);"
                )
                time.sleep(1)

            # SlideShare's markup changes often; try selectors in order until
            # one yields cards.
            selectors = [
                'div[data-testid="slideshow-card"]',
                '.slideshow-card',
                'a[href*="/slideshow/"]',
                '.search-result-item',
                'article.slideshow',
            ]
            cards = []
            for selector in selectors:
                cards = self.driver.find_elements(By.CSS_SELECTOR, selector)
                if cards:
                    break

            # Last resort: any link pointing back at slideshare.net
            if not cards:
                cards = self.driver.find_elements(
                    By.CSS_SELECTOR, 'a[href*="slideshare.net"]'
                )

            print(f" Found {len(cards)} potential results")

            for card in cards[:max_results]:
                try:
                    presentation = self._extract_presentation_info(card, query)
                    if presentation and presentation.get('url'):
                        # Avoid duplicates within this search
                        if not any(p['url'] == presentation['url'] for p in results):
                            results.append(presentation)
                            self.stats["presentations_found"] += 1
                except Exception:
                    # A single malformed card must not abort the whole search.
                    continue
        except Exception as e:
            print(f" ⚠️ Search error: {str(e)[:100]}")
            self.stats["errors"] += 1

        return results

    def _extract_presentation_info(self, element, search_query: str) -> dict:
        """Extract presentation metadata from one search-result element.

        Returns a dict (title, url, description, author, views + harvest
        metadata) or ``None`` when no usable ``/slideshow/`` URL is present.
        """
        try:
            url = ""
            title = ""

            # The element may itself be the anchor, or contain one.
            if element.tag_name == 'a':
                url = element.get_attribute('href') or ''
                title = element.text.strip() or element.get_attribute('title') or ''
            else:
                try:
                    link = element.find_element(
                        By.CSS_SELECTOR,
                        'a[href*="/slideshow/"], a[href*="slideshare"]',
                    )
                    url = link.get_attribute('href') or ''
                    title = link.text.strip() or link.get_attribute('title') or ''
                except NoSuchElementException:
                    pass

            if not url or '/slideshow/' not in url:
                return None

            # Strip tracking query parameters so URLs dedupe cleanly
            if '?' in url:
                url = url.split('?')[0]

            description = ""
            try:
                desc_elem = element.find_element(
                    By.CSS_SELECTOR, '.description, .summary, p'
                )
                description = desc_elem.text.strip()[:500]
            except NoSuchElementException:
                pass

            author = ""
            try:
                author_elem = element.find_element(
                    By.CSS_SELECTOR, '.author, [data-testid="author"], .username'
                )
                author = author_elem.text.strip()
            except NoSuchElementException:
                pass

            views = 0
            try:
                views_elem = element.find_element(By.CSS_SELECTOR, '.views, .stats')
                views_match = re.search(
                    r'(\d+[,.\d]*)\s*(?:views|downloads)', views_elem.text, re.I
                )
                if views_match:
                    # Drop thousands separators ("1,234" / "1.234") before parsing
                    views = int(views_match.group(1).replace(',', '').replace('.', ''))
            except NoSuchElementException:
                pass

            return {
                "title": title[:300] if title else f"Presentation from {url.split('/')[-1]}",
                "url": url,
                "description": description,
                "author": author,
                "views": views,
                "search_query": search_query,
                "source": "slideshare",
                "harvested_at": datetime.now().isoformat(),
            }
        except Exception:
            return None

    def get_presentation_details(self, url: str) -> dict:
        """Fetch extra metadata (tags, slide count, date, ...) from one
        presentation page; fields that cannot be located are omitted.

        NOTE(review): not currently called from :meth:`run` -- kept as a
        public helper for enrichment passes.
        """
        details = {}
        try:
            self.driver.get(url)
            time.sleep(2)

            # Title
            try:
                title_elem = self.driver.find_element(
                    By.CSS_SELECTOR, 'h1, .slideshow-title, [data-testid="title"]'
                )
                details['title'] = title_elem.text.strip()
            except NoSuchElementException:
                pass

            # Description
            try:
                desc_elem = self.driver.find_element(
                    By.CSS_SELECTOR,
                    '.slideshow-description, .description, [data-testid="description"]',
                )
                details['description'] = desc_elem.text.strip()[:1000]
            except NoSuchElementException:
                pass

            # Author info
            try:
                author_elem = self.driver.find_element(
                    By.CSS_SELECTOR,
                    '.author-name, [data-testid="author"], .profile-name',
                )
                details['author'] = author_elem.text.strip()
            except NoSuchElementException:
                pass

            # Categories/Tags (always present as a list, possibly empty)
            try:
                tags = self.driver.find_elements(
                    By.CSS_SELECTOR, '.tag, .category, [data-testid="tag"]'
                )
                details['tags'] = [t.text.strip() for t in tags[:10] if t.text.strip()]
            except Exception:
                details['tags'] = []

            # Slides count
            try:
                slides_elem = self.driver.find_element(
                    By.CSS_SELECTOR, '.slides-count, [data-testid="slides-count"]'
                )
                slides_match = re.search(r'(\d+)', slides_elem.text)
                if slides_match:
                    details['slide_count'] = int(slides_match.group(1))
            except NoSuchElementException:
                pass

            # Publish date
            try:
                date_elem = self.driver.find_element(
                    By.CSS_SELECTOR, '.date, time, [data-testid="date"]'
                )
                details['published_date'] = date_elem.text.strip()
            except NoSuchElementException:
                pass
        except Exception as e:
            print(f" ⚠️ Details error: {str(e)[:50]}")

        return details

    def save_to_neo4j(self, presentation: dict):
        """Upsert one presentation node (keyed by a hash of its URL) and link
        it to its DataSource and Category nodes. Errors are counted, not raised."""
        # md5 is used only as a stable dedup key, not for security.
        content_hash = hashlib.md5(presentation['url'].encode()).hexdigest()

        try:
            with self.neo4j.session() as session:
                session.run(
                    """
                    MERGE (p:SlideSharePresentation {contentHash: $hash})
                    ON CREATE SET
                        p.title = $title,
                        p.url = $url,
                        p.description = $description,
                        p.author = $author,
                        p.views = $views,
                        p.searchQuery = $search_query,
                        p.tags = $tags,
                        p.slideCount = $slide_count,
                        p.harvestedAt = datetime()
                    ON MATCH SET
                        p.lastSeen = datetime(),
                        p.views = $views
                    MERGE (ds:DataSource {name: 'SlideShare'})
                    ON CREATE SET ds.type = 'presentation_platform',
                                  ds.url = 'https://slideshare.net'
                    MERGE (p)-[:HARVESTED_FROM]->(ds)
                    """,
                    hash=content_hash,
                    title=presentation.get('title', ''),
                    url=presentation.get('url', ''),
                    description=presentation.get('description', '')[:1000],
                    author=presentation.get('author', ''),
                    views=presentation.get('views', 0),
                    search_query=presentation.get('search_query', ''),
                    tags=presentation.get('tags', []),
                    slide_count=presentation.get('slide_count', 0),
                )

                # Attach a Category node derived from title/description/query
                category = self._categorize_presentation(presentation)
                if category:
                    session.run(
                        """
                        MATCH (p:SlideSharePresentation {contentHash: $hash})
                        MERGE (c:Category {name: $category})
                        MERGE (p)-[:BELONGS_TO]->(c)
                        """,
                        hash=content_hash,
                        category=category,
                    )

            self.stats["saved_to_neo4j"] += 1
        except Exception as e:
            print(f" ❌ Neo4j error: {str(e)[:50]}")
            self.stats["errors"] += 1

    def _categorize_presentation(self, presentation: dict) -> str:
        """Categorize a presentation from its title, description and query.

        BUGFIX: keywords of <= 4 characters ('ai', 'soc', 'iso', ...) are now
        matched on word boundaries -- a plain substring test wrongly tagged
        e.g. "maintain" as AI or "social" as SOC. Longer keywords keep the
        original substring semantics so 'cyber' still matches "cybersecurity".
        """
        text = f"{presentation.get('title', '')} {presentation.get('description', '')} {presentation.get('search_query', '')}".lower()

        def matches(kw: str) -> bool:
            # Short tokens: whole-word match; longer ones: substring/prefix.
            if len(kw) <= 4:
                return re.search(rf"\b{re.escape(kw)}\b", text) is not None
            return kw in text

        if any(matches(kw) for kw in ['cyber', 'security', 'threat', 'soc', 'incident', 'malware', 'ransomware']):
            return 'CYBERSECURITY'
        if any(matches(kw) for kw in ['ai', 'artificial intelligence', 'machine learning', 'gpt', 'llm']):
            return 'ARTIFICIAL_INTELLIGENCE'
        if any(matches(kw) for kw in ['cloud', 'azure', 'aws', 'migration']):
            return 'CLOUD_COMPUTING'
        if any(matches(kw) for kw in ['strategy', 'roadmap', 'transformation', 'architecture']):
            return 'STRATEGY'
        if any(matches(kw) for kw in ['compliance', 'gdpr', 'nis2', 'iso', 'governance']):
            return 'COMPLIANCE'
        if any(matches(kw) for kw in ['osint', 'intelligence', 'investigation']):
            return 'THREAT_INTELLIGENCE'
        return 'GENERAL'

    def run(self):
        """Run the full harvest and return the deduplicated presentation list.

        Searches every term, upserts each new presentation into Neo4j and
        writes a local JSON dump. BUGFIX: the Chrome driver and Neo4j
        connection previously leaked when any step raised; they are now
        released unconditionally in a ``finally`` block.
        """
        print("\n" + "=" * 60)
        print("📊 SLIDESHARE HARVESTER")
        print("=" * 60)
        print(f"Søgetermer: {len(self.SEARCH_TERMS)}")
        print(f"Output: {self.output_dir}")
        print("=" * 60)

        try:
            # Navigate to SlideShare's landing page first
            self.driver.get(self.BASE_URL)
            time.sleep(3)

            # Best effort: dismiss cookie banners / popups if present
            try:
                cookie_btn = self.driver.find_element(
                    By.CSS_SELECTOR,
                    '[data-testid="accept-cookies"], .accept-cookies, #onetrust-accept-btn-handler',
                )
                cookie_btn.click()
                time.sleep(1)
            except Exception:
                pass

            all_presentations = []

            # Search for each term
            for i, term in enumerate(self.SEARCH_TERMS, 1):
                print(f"\n🔍 [{i}/{len(self.SEARCH_TERMS)}] Søger: {term}")
                results = self.search_presentations(term, max_results=15)
                print(f" Found {len(results)} presentations")

                for pres in results:
                    # Cross-search dedup by URL
                    if not any(p['url'] == pres['url'] for p in all_presentations):
                        all_presentations.append(pres)
                        self.save_to_neo4j(pres)
                        print(f" ✓ {pres['title'][:60]}...")

                # Small delay between searches (be polite to the site)
                time.sleep(2)

            # Persist a local JSON copy alongside the Neo4j writes
            output_file = self.output_dir / "slideshare_harvest.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(
                    {
                        "timestamp": datetime.now().isoformat(),
                        "stats": self.stats,
                        "presentations": all_presentations,
                    },
                    f,
                    indent=2,
                    ensure_ascii=False,
                )

            # Summary
            print("\n" + "=" * 60)
            print("📊 HARVEST COMPLETE")
            print("=" * 60)
            print(f" 🔍 Searches performed: {self.stats['searches']}")
            print(f" 📄 Presentations found: {self.stats['presentations_found']}")
            print(f" 💾 Saved to Neo4j: {self.stats['saved_to_neo4j']}")
            print(f" ⚠️ Errors: {self.stats['errors']}")
            print(f" 📁 Output: {output_file}")
            print("=" * 60)

            return all_presentations
        finally:
            self.driver.quit()
            self.neo4j.close()


if __name__ == "__main__":
    harvester = SlideShareHarvester()
    harvester.run()