#!/usr/bin/env python3
"""
📊 SlideShare Harvester - scraper feeding a Neo4j knowledge graph.

Fetches presentations about: cybersecurity, AI, cloud, strategy, NIS2, OSINT.
"""
import hashlib
import json
import os
import re
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import quote_plus

from neo4j import GraphDatabase
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
class SlideShareHarvester:
    """Harvest SlideShare presentations into a Neo4j knowledge graph.

    Drives a real Chrome browser through a fixed list of WidgeTDC-relevant
    search terms (cybersecurity, AI, cloud, strategy, compliance), extracts
    presentation metadata from the result pages and persists each hit as a
    node in Neo4j plus a local JSON dump.
    """

    # NOTE(security): these credentials were committed to source control.
    # They are now read from the environment first, but the hard-coded
    # fallback values should be rotated and then removed entirely.
    NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
    NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.environ.get(
        "NEO4J_PASSWORD", "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
    )

    BASE_URL = "https://www.slideshare.net"

    # Search terms - focused on WidgeTDC-relevant topics.
    SEARCH_TERMS = [
        # Cybersecurity
        "cybersecurity strategy",
        "SOC security operations",
        "threat intelligence CTI",
        "incident response",
        "NIS2 directive",
        "OSINT investigation",
        "penetration testing",
        "ransomware defense",
        "zero trust architecture",
        # AI & ML
        "AI cybersecurity",
        "machine learning security",
        "GPT enterprise",
        "generative AI business",
        "AI strategy enterprise",
        "LLM applications",
        # Cloud
        "cloud security architecture",
        "Azure security",
        "AWS security best practices",
        "multi-cloud strategy",
        "cloud migration security",
        # Business Strategy
        "digital transformation strategy",
        "IT strategy roadmap",
        "enterprise architecture",
        "technology roadmap",
        # Compliance & Governance
        "GDPR compliance",
        "ISO 27001",
        "cyber risk management",
        "security governance",
        # Specific Tech
        "SIEM implementation",
        "EDR endpoint detection",
        "MDR managed detection",
        "XDR security",
        "SOAR automation",
    ]

    def __init__(self):
        # Local JSON output lives here; created eagerly so run() can't fail late.
        self.output_dir = Path("data/slideshare_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Chrome options: hide the usual automation fingerprints so SlideShare
        # serves the normal (non-bot) result pages.
        chrome_options = Options()
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_argument("--disable-notifications")

        print("🌐 Starter Chrome til SlideShare...")
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options,
        )

        # Neo4j connection (see the credentials NOTE on the class constants).
        self.neo4j = GraphDatabase.driver(
            self.NEO4J_URI,
            auth=(self.NEO4J_USER, self.NEO4J_PASSWORD),
        )

        self.presentations = []  # kept for external callers; run() returns its own list
        self.stats = {
            "searches": 0,
            "presentations_found": 0,
            "saved_to_neo4j": 0,
            "errors": 0,
        }
| def search_presentations(self, query: str, max_results: int = 20) -> list: | |
| """Søg efter præsentationer""" | |
| results = [] | |
| try: | |
| # Encode search query | |
| search_url = f"{self.BASE_URL}/search?searchfrom=header&q={query.replace(' ', '+')}" | |
| self.driver.get(search_url) | |
| time.sleep(3) | |
| self.stats["searches"] += 1 | |
| # Scroll to load more results | |
| for _ in range(3): | |
| self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") | |
| time.sleep(1) | |
| # Find presentation cards | |
| selectors = [ | |
| 'div[data-testid="slideshow-card"]', | |
| '.slideshow-card', | |
| 'a[href*="/slideshow/"]', | |
| '.search-result-item', | |
| 'article.slideshow' | |
| ] | |
| cards = [] | |
| for selector in selectors: | |
| cards = self.driver.find_elements(By.CSS_SELECTOR, selector) | |
| if cards: | |
| break | |
| # Also try finding by link pattern | |
| if not cards: | |
| cards = self.driver.find_elements(By.CSS_SELECTOR, 'a[href*="slideshare.net"]') | |
| print(f" Found {len(cards)} potential results") | |
| for card in cards[:max_results]: | |
| try: | |
| # Extract presentation info | |
| presentation = self._extract_presentation_info(card, query) | |
| if presentation and presentation.get('url'): | |
| # Avoid duplicates | |
| if not any(p['url'] == presentation['url'] for p in results): | |
| results.append(presentation) | |
| self.stats["presentations_found"] += 1 | |
| except Exception as e: | |
| continue | |
| except Exception as e: | |
| print(f" ⚠️ Search error: {str(e)[:100]}") | |
| self.stats["errors"] += 1 | |
| return results | |
| def _extract_presentation_info(self, element, search_query: str) -> dict: | |
| """Udtræk info fra et præsentationselement""" | |
| try: | |
| # Find link | |
| link = None | |
| url = "" | |
| title = "" | |
| # Try to get href directly or from child | |
| if element.tag_name == 'a': | |
| url = element.get_attribute('href') or '' | |
| title = element.text.strip() or element.get_attribute('title') or '' | |
| else: | |
| try: | |
| link = element.find_element(By.CSS_SELECTOR, 'a[href*="/slideshow/"], a[href*="slideshare"]') | |
| url = link.get_attribute('href') or '' | |
| title = link.text.strip() or link.get_attribute('title') or '' | |
| except: | |
| pass | |
| if not url or '/slideshow/' not in url: | |
| return None | |
| # Clean URL | |
| if '?' in url: | |
| url = url.split('?')[0] | |
| # Try to get description | |
| description = "" | |
| try: | |
| desc_elem = element.find_element(By.CSS_SELECTOR, '.description, .summary, p') | |
| description = desc_elem.text.strip()[:500] | |
| except: | |
| pass | |
| # Try to get author | |
| author = "" | |
| try: | |
| author_elem = element.find_element(By.CSS_SELECTOR, '.author, [data-testid="author"], .username') | |
| author = author_elem.text.strip() | |
| except: | |
| pass | |
| # Try to get views/downloads | |
| views = 0 | |
| try: | |
| views_elem = element.find_element(By.CSS_SELECTOR, '.views, .stats') | |
| views_text = views_elem.text | |
| views_match = re.search(r'(\d+[,.\d]*)\s*(?:views|downloads)', views_text, re.I) | |
| if views_match: | |
| views = int(views_match.group(1).replace(',', '').replace('.', '')) | |
| except: | |
| pass | |
| return { | |
| "title": title[:300] if title else f"Presentation from {url.split('/')[-1]}", | |
| "url": url, | |
| "description": description, | |
| "author": author, | |
| "views": views, | |
| "search_query": search_query, | |
| "source": "slideshare", | |
| "harvested_at": datetime.now().isoformat() | |
| } | |
| except Exception as e: | |
| return None | |
| def get_presentation_details(self, url: str) -> dict: | |
| """Hent detaljer fra en præsentationsside""" | |
| details = {} | |
| try: | |
| self.driver.get(url) | |
| time.sleep(2) | |
| # Title | |
| try: | |
| title_elem = self.driver.find_element(By.CSS_SELECTOR, 'h1, .slideshow-title, [data-testid="title"]') | |
| details['title'] = title_elem.text.strip() | |
| except: | |
| pass | |
| # Description | |
| try: | |
| desc_elem = self.driver.find_element(By.CSS_SELECTOR, '.slideshow-description, .description, [data-testid="description"]') | |
| details['description'] = desc_elem.text.strip()[:1000] | |
| except: | |
| pass | |
| # Author info | |
| try: | |
| author_elem = self.driver.find_element(By.CSS_SELECTOR, '.author-name, [data-testid="author"], .profile-name') | |
| details['author'] = author_elem.text.strip() | |
| except: | |
| pass | |
| # Categories/Tags | |
| try: | |
| tags = self.driver.find_elements(By.CSS_SELECTOR, '.tag, .category, [data-testid="tag"]') | |
| details['tags'] = [t.text.strip() for t in tags[:10] if t.text.strip()] | |
| except: | |
| details['tags'] = [] | |
| # Slides count | |
| try: | |
| slides_elem = self.driver.find_element(By.CSS_SELECTOR, '.slides-count, [data-testid="slides-count"]') | |
| slides_match = re.search(r'(\d+)', slides_elem.text) | |
| if slides_match: | |
| details['slide_count'] = int(slides_match.group(1)) | |
| except: | |
| pass | |
| # Date | |
| try: | |
| date_elem = self.driver.find_element(By.CSS_SELECTOR, '.date, time, [data-testid="date"]') | |
| details['published_date'] = date_elem.text.strip() | |
| except: | |
| pass | |
| except Exception as e: | |
| print(f" ⚠️ Details error: {str(e)[:50]}") | |
| return details | |
| def save_to_neo4j(self, presentation: dict): | |
| """Gem præsentation i Neo4j""" | |
| content_hash = hashlib.md5(presentation['url'].encode()).hexdigest() | |
| try: | |
| with self.neo4j.session() as session: | |
| session.run(""" | |
| MERGE (p:SlideSharePresentation {contentHash: $hash}) | |
| ON CREATE SET | |
| p.title = $title, | |
| p.url = $url, | |
| p.description = $description, | |
| p.author = $author, | |
| p.views = $views, | |
| p.searchQuery = $search_query, | |
| p.tags = $tags, | |
| p.slideCount = $slide_count, | |
| p.harvestedAt = datetime() | |
| ON MATCH SET | |
| p.lastSeen = datetime(), | |
| p.views = $views | |
| MERGE (ds:DataSource {name: 'SlideShare'}) | |
| ON CREATE SET ds.type = 'presentation_platform', ds.url = 'https://slideshare.net' | |
| MERGE (p)-[:HARVESTED_FROM]->(ds) | |
| """, | |
| hash=content_hash, | |
| title=presentation.get('title', ''), | |
| url=presentation.get('url', ''), | |
| description=presentation.get('description', '')[:1000], | |
| author=presentation.get('author', ''), | |
| views=presentation.get('views', 0), | |
| search_query=presentation.get('search_query', ''), | |
| tags=presentation.get('tags', []), | |
| slide_count=presentation.get('slide_count', 0) | |
| ) | |
| # Add category relationships based on search query | |
| category = self._categorize_presentation(presentation) | |
| if category: | |
| session.run(""" | |
| MATCH (p:SlideSharePresentation {contentHash: $hash}) | |
| MERGE (c:Category {name: $category}) | |
| MERGE (p)-[:BELONGS_TO]->(c) | |
| """, hash=content_hash, category=category) | |
| self.stats["saved_to_neo4j"] += 1 | |
| except Exception as e: | |
| print(f" ❌ Neo4j error: {str(e)[:50]}") | |
| self.stats["errors"] += 1 | |
| def _categorize_presentation(self, presentation: dict) -> str: | |
| """Kategoriser præsentation baseret på indhold""" | |
| text = f"{presentation.get('title', '')} {presentation.get('description', '')} {presentation.get('search_query', '')}".lower() | |
| if any(kw in text for kw in ['cyber', 'security', 'threat', 'soc', 'incident', 'malware', 'ransomware']): | |
| return 'CYBERSECURITY' | |
| elif any(kw in text for kw in ['ai', 'artificial intelligence', 'machine learning', 'gpt', 'llm']): | |
| return 'ARTIFICIAL_INTELLIGENCE' | |
| elif any(kw in text for kw in ['cloud', 'azure', 'aws', 'migration']): | |
| return 'CLOUD_COMPUTING' | |
| elif any(kw in text for kw in ['strategy', 'roadmap', 'transformation', 'architecture']): | |
| return 'STRATEGY' | |
| elif any(kw in text for kw in ['compliance', 'gdpr', 'nis2', 'iso', 'governance']): | |
| return 'COMPLIANCE' | |
| elif any(kw in text for kw in ['osint', 'intelligence', 'investigation']): | |
| return 'THREAT_INTELLIGENCE' | |
| else: | |
| return 'GENERAL' | |
| def run(self): | |
| """Kør fuld harvest""" | |
| print("\n" + "=" * 60) | |
| print("📊 SLIDESHARE HARVESTER") | |
| print("=" * 60) | |
| print(f"Søgetermer: {len(self.SEARCH_TERMS)}") | |
| print(f"Output: {self.output_dir}") | |
| print("=" * 60) | |
| # Navigate to SlideShare | |
| self.driver.get(self.BASE_URL) | |
| time.sleep(3) | |
| # Handle any popups/cookies | |
| try: | |
| cookie_btn = self.driver.find_element(By.CSS_SELECTOR, '[data-testid="accept-cookies"], .accept-cookies, #onetrust-accept-btn-handler') | |
| cookie_btn.click() | |
| time.sleep(1) | |
| except: | |
| pass | |
| all_presentations = [] | |
| # Search for each term | |
| for i, term in enumerate(self.SEARCH_TERMS, 1): | |
| print(f"\n🔍 [{i}/{len(self.SEARCH_TERMS)}] Søger: {term}") | |
| results = self.search_presentations(term, max_results=15) | |
| print(f" Found {len(results)} presentations") | |
| for pres in results: | |
| # Avoid duplicates | |
| if not any(p['url'] == pres['url'] for p in all_presentations): | |
| all_presentations.append(pres) | |
| self.save_to_neo4j(pres) | |
| print(f" ✓ {pres['title'][:60]}...") | |
| # Small delay between searches | |
| time.sleep(2) | |
| # Save local JSON | |
| output_file = self.output_dir / "slideshare_harvest.json" | |
| with open(output_file, 'w', encoding='utf-8') as f: | |
| json.dump({ | |
| "timestamp": datetime.now().isoformat(), | |
| "stats": self.stats, | |
| "presentations": all_presentations | |
| }, f, indent=2, ensure_ascii=False) | |
| # Summary | |
| print("\n" + "=" * 60) | |
| print("📊 HARVEST COMPLETE") | |
| print("=" * 60) | |
| print(f" 🔍 Searches performed: {self.stats['searches']}") | |
| print(f" 📄 Presentations found: {self.stats['presentations_found']}") | |
| print(f" 💾 Saved to Neo4j: {self.stats['saved_to_neo4j']}") | |
| print(f" ⚠️ Errors: {self.stats['errors']}") | |
| print(f" 📁 Output: {output_file}") | |
| print("=" * 60) | |
| self.driver.quit() | |
| self.neo4j.close() | |
| return all_presentations | |
if __name__ == "__main__":
    # Script entry point: build the harvester and run the full pipeline.
    SlideShareHarvester().run()