#!/usr/bin/env python3
# Origin: Hugging Face Space "Kraft102/widgettdc-api" (status: Paused).
# Revision 34367da, file size 16,353 bytes.
"""
📊 SlideShare Harvester - scraper feeding a Neo4j knowledge graph.
Harvests presentations about: Cybersecurity, AI, Cloud, Strategy, NIS2, OSINT
"""
import hashlib
import json
import os
import re
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import quote_plus

from neo4j import GraphDatabase
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

class SlideShareHarvester:
    """SlideShare presentation harvester.

    Searches SlideShare for a fixed list of terms with a Selenium-driven
    Chrome browser and stores what it finds in Neo4j and in a local JSON file.
    """

    # Connection settings are read from the environment when the class is
    # defined. URI and user fall back to the previous values; the password
    # deliberately has NO in-source default, so NEO4J_PASSWORD must be set
    # before running.
    # SECURITY: a real password used to be committed on this line. It must be
    # treated as compromised and rotated in the Neo4j console.
    NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
    NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")

    BASE_URL = "https://www.slideshare.net"

    # Search terms - focused on topics relevant to WidgeTDC
    SEARCH_TERMS = [
        # Cybersecurity
        "cybersecurity strategy",
        "SOC security operations",
        "threat intelligence CTI",
        "incident response",
        "NIS2 directive",
        "OSINT investigation",
        "penetration testing",
        "ransomware defense",
        "zero trust architecture",

        # AI & ML
        "AI cybersecurity",
        "machine learning security",
        "GPT enterprise",
        "generative AI business",
        "AI strategy enterprise",
        "LLM applications",

        # Cloud
        "cloud security architecture",
        "Azure security",
        "AWS security best practices",
        "multi-cloud strategy",
        "cloud migration security",

        # Business Strategy
        "digital transformation strategy",
        "IT strategy roadmap",
        "enterprise architecture",
        "technology roadmap",

        # Compliance & Governance
        "GDPR compliance",
        "ISO 27001",
        "cyber risk management",
        "security governance",

        # Specific Tech
        "SIEM implementation",
        "EDR endpoint detection",
        "MDR managed detection",
        "XDR security",
        "SOAR automation",
    ]
    
    def __init__(self):
        """Create the output directory, start Chrome and build the Neo4j driver.

        Side effects: creates ``data/slideshare_harvest`` (relative to the
        current working directory) and launches a Chrome browser through
        webdriver-manager.

        Raises:
            Whatever ``webdriver.Chrome`` or ``GraphDatabase.driver`` raise.
            If the Neo4j driver cannot be created, the browser that was
            already started is shut down before the error propagates.
        """
        self.output_dir = Path("data/slideshare_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Chrome options
        # NOTE(review): the AutomationControlled / enable-automation switches
        # presumably make the browser look less like an automated session.
        chrome_options = Options()
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_argument("--disable-notifications")

        print("🌐 Starter Chrome til SlideShare...")
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options
        )

        # Neo4j connection
        try:
            self.neo4j = GraphDatabase.driver(
                self.NEO4J_URI,
                auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
            )
        except Exception:
            # The constructor is about to fail, so nobody will ever call
            # run() (the only place that quits the browser): close it here
            # instead of leaking the Chrome session.
            self.driver.quit()
            raise

        self.presentations = []
        # Counters updated by the search/save methods and reported by run().
        self.stats = {
            "searches": 0,
            "presentations_found": 0,
            "saved_to_neo4j": 0,
            "errors": 0
        }
    
    def search_presentations(self, query: str, max_results: int = 20) -> list:
        """Search SlideShare for *query* and return the unique presentations found.

        Args:
            query: Free-text search term; it is URL-encoded before use.
            max_results: Upper bound on the number of presentations returned.

        Returns:
            A list of presentation dicts (as built by
            ``_extract_presentation_info``), unique by ``url``. The list is
            empty when the search itself fails; in that case the failure is
            printed and counted in ``self.stats["errors"]``.
        """
        results = []
        seen_urls = set()  # O(1) duplicate check instead of rescanning results

        try:
            # quote_plus turns spaces into '+' like before, and additionally
            # escapes characters such as '&' or '#' that would otherwise
            # corrupt the query string.
            search_url = f"{self.BASE_URL}/search?searchfrom=header&q={quote_plus(query)}"
            self.driver.get(search_url)
            time.sleep(3)

            self.stats["searches"] += 1

            # Scroll to the bottom a few times so that more results get loaded
            for _ in range(3):
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(1)

            # Candidate selectors, tried in order; the first one that matches
            # anything wins.
            selectors = [
                'div[data-testid="slideshow-card"]',
                '.slideshow-card',
                'a[href*="/slideshow/"]',
                '.search-result-item',
                'article.slideshow'
            ]

            cards = []
            for selector in selectors:
                cards = self.driver.find_elements(By.CSS_SELECTOR, selector)
                if cards:
                    break

            # Last resort: any link pointing at slideshare.net
            if not cards:
                cards = self.driver.find_elements(By.CSS_SELECTOR, 'a[href*="slideshare.net"]')

            print(f"      Found {len(cards)} potential results")

            for card in cards:
                # The cap applies to accepted results, so rejected or
                # duplicate cards do not eat into max_results.
                if len(results) >= max_results:
                    break
                try:
                    presentation = self._extract_presentation_info(card, query)
                except Exception:
                    # Best effort: one unreadable card must not abort the search.
                    continue
                if not presentation or not presentation.get('url'):
                    continue
                if presentation['url'] in seen_urls:
                    continue
                seen_urls.add(presentation['url'])
                results.append(presentation)
                self.stats["presentations_found"] += 1

        except Exception as e:
            print(f"      ⚠️ Search error: {str(e)[:100]}")
            self.stats["errors"] += 1

        return results
    
    def _extract_presentation_info(self, element, search_query: str) -> dict:
        """Udtræk info fra et præsentationselement"""
        try:
            # Find link
            link = None
            url = ""
            title = ""
            
            # Try to get href directly or from child
            if element.tag_name == 'a':
                url = element.get_attribute('href') or ''
                title = element.text.strip() or element.get_attribute('title') or ''
            else:
                try:
                    link = element.find_element(By.CSS_SELECTOR, 'a[href*="/slideshow/"], a[href*="slideshare"]')
                    url = link.get_attribute('href') or ''
                    title = link.text.strip() or link.get_attribute('title') or ''
                except:
                    pass
            
            if not url or '/slideshow/' not in url:
                return None
            
            # Clean URL
            if '?' in url:
                url = url.split('?')[0]
            
            # Try to get description
            description = ""
            try:
                desc_elem = element.find_element(By.CSS_SELECTOR, '.description, .summary, p')
                description = desc_elem.text.strip()[:500]
            except:
                pass
            
            # Try to get author
            author = ""
            try:
                author_elem = element.find_element(By.CSS_SELECTOR, '.author, [data-testid="author"], .username')
                author = author_elem.text.strip()
            except:
                pass
            
            # Try to get views/downloads
            views = 0
            try:
                views_elem = element.find_element(By.CSS_SELECTOR, '.views, .stats')
                views_text = views_elem.text
                views_match = re.search(r'(\d+[,.\d]*)\s*(?:views|downloads)', views_text, re.I)
                if views_match:
                    views = int(views_match.group(1).replace(',', '').replace('.', ''))
            except:
                pass
            
            return {
                "title": title[:300] if title else f"Presentation from {url.split('/')[-1]}",
                "url": url,
                "description": description,
                "author": author,
                "views": views,
                "search_query": search_query,
                "source": "slideshare",
                "harvested_at": datetime.now().isoformat()
            }
            
        except Exception as e:
            return None
    
    def get_presentation_details(self, url: str) -> dict:
        """Open a presentation page and collect whatever details can be found.

        Args:
            url: Address of the presentation page to load in the browser.

        Returns:
            A dict holding any of ``title``, ``description`` (at most 1000
            characters), ``author``, ``tags`` (at most 10), ``slide_count``
            and ``published_date``. Fields that cannot be located are left
            out; the dict is empty if the page itself fails to load.
        """
        details = {}

        def first_text(selector):
            """Return the stripped text of the first match, or None if unavailable."""
            try:
                return self.driver.find_element(By.CSS_SELECTOR, selector).text.strip()
            except Exception:
                # Optional field: absence is expected. `except Exception`
                # rather than a bare except so Ctrl+C still interrupts.
                return None

        try:
            self.driver.get(url)
            time.sleep(2)

            # Title
            title = first_text('h1, .slideshow-title, [data-testid="title"]')
            if title is not None:
                details['title'] = title

            # Description
            description = first_text('.slideshow-description, .description, [data-testid="description"]')
            if description is not None:
                details['description'] = description[:1000]

            # Author info
            author = first_text('.author-name, [data-testid="author"], .profile-name')
            if author is not None:
                details['author'] = author

            # Categories/Tags (first ten elements, empty texts dropped)
            try:
                tags = self.driver.find_elements(By.CSS_SELECTOR, '.tag, .category, [data-testid="tag"]')
                details['tags'] = [t.text.strip() for t in tags[:10] if t.text.strip()]
            except Exception:
                details['tags'] = []

            # Slides count: first run of digits in the element text
            slides_text = first_text('.slides-count, [data-testid="slides-count"]')
            if slides_text is not None:
                slides_match = re.search(r'(\d+)', slides_text)
                if slides_match:
                    details['slide_count'] = int(slides_match.group(1))

            # Date (kept as the raw page text, not parsed)
            published = first_text('.date, time, [data-testid="date"]')
            if published is not None:
                details['published_date'] = published

        except Exception as e:
            print(f"      ⚠️ Details error: {str(e)[:50]}")

        return details
    
    def save_to_neo4j(self, presentation: dict):
        """Gem præsentation i Neo4j"""
        content_hash = hashlib.md5(presentation['url'].encode()).hexdigest()
        
        try:
            with self.neo4j.session() as session:
                session.run("""
                    MERGE (p:SlideSharePresentation {contentHash: $hash})
                    ON CREATE SET
                        p.title = $title,
                        p.url = $url,
                        p.description = $description,
                        p.author = $author,
                        p.views = $views,
                        p.searchQuery = $search_query,
                        p.tags = $tags,
                        p.slideCount = $slide_count,
                        p.harvestedAt = datetime()
                    ON MATCH SET
                        p.lastSeen = datetime(),
                        p.views = $views
                    
                    MERGE (ds:DataSource {name: 'SlideShare'})
                    ON CREATE SET ds.type = 'presentation_platform', ds.url = 'https://slideshare.net'
                    MERGE (p)-[:HARVESTED_FROM]->(ds)
                """,
                hash=content_hash,
                title=presentation.get('title', ''),
                url=presentation.get('url', ''),
                description=presentation.get('description', '')[:1000],
                author=presentation.get('author', ''),
                views=presentation.get('views', 0),
                search_query=presentation.get('search_query', ''),
                tags=presentation.get('tags', []),
                slide_count=presentation.get('slide_count', 0)
                )
                
                # Add category relationships based on search query
                category = self._categorize_presentation(presentation)
                if category:
                    session.run("""
                        MATCH (p:SlideSharePresentation {contentHash: $hash})
                        MERGE (c:Category {name: $category})
                        MERGE (p)-[:BELONGS_TO]->(c)
                    """, hash=content_hash, category=category)
                
                self.stats["saved_to_neo4j"] += 1
                
        except Exception as e:
            print(f"      ❌ Neo4j error: {str(e)[:50]}")
            self.stats["errors"] += 1
    
    def _categorize_presentation(self, presentation: dict) -> str:
        """Kategoriser præsentation baseret på indhold"""
        text = f"{presentation.get('title', '')} {presentation.get('description', '')} {presentation.get('search_query', '')}".lower()
        
        if any(kw in text for kw in ['cyber', 'security', 'threat', 'soc', 'incident', 'malware', 'ransomware']):
            return 'CYBERSECURITY'
        elif any(kw in text for kw in ['ai', 'artificial intelligence', 'machine learning', 'gpt', 'llm']):
            return 'ARTIFICIAL_INTELLIGENCE'
        elif any(kw in text for kw in ['cloud', 'azure', 'aws', 'migration']):
            return 'CLOUD_COMPUTING'
        elif any(kw in text for kw in ['strategy', 'roadmap', 'transformation', 'architecture']):
            return 'STRATEGY'
        elif any(kw in text for kw in ['compliance', 'gdpr', 'nis2', 'iso', 'governance']):
            return 'COMPLIANCE'
        elif any(kw in text for kw in ['osint', 'intelligence', 'investigation']):
            return 'THREAT_INTELLIGENCE'
        else:
            return 'GENERAL'
    
    def run(self):
        """Run the full harvest.

        Searches every entry of ``SEARCH_TERMS``, stores each new
        presentation in Neo4j, writes all of them together with the run
        statistics to ``slideshare_harvest.json`` in the output directory and
        prints a summary.

        The browser and the Neo4j driver are always closed when this method
        exits - also on an exception or Ctrl+C - so the instance cannot be
        reused afterwards.

        Returns:
            The list of unique presentation dicts collected during the run.
        """
        print("\n" + "=" * 60)
        print("📊 SLIDESHARE HARVESTER")
        print("=" * 60)
        print(f"Søgetermer: {len(self.SEARCH_TERMS)}")
        print(f"Output: {self.output_dir}")
        print("=" * 60)

        all_presentations = []
        seen_urls = set()  # URLs already collected in this run (across searches)

        try:
            # Navigate to SlideShare
            self.driver.get(self.BASE_URL)
            time.sleep(3)

            # Dismiss the cookie banner if one of the known buttons is there
            try:
                cookie_btn = self.driver.find_element(By.CSS_SELECTOR, '[data-testid="accept-cookies"], .accept-cookies, #onetrust-accept-btn-handler')
                cookie_btn.click()
                time.sleep(1)
            except Exception:
                # No banner (or not clickable): nothing to do.
                pass

            # Search for each term
            for i, term in enumerate(self.SEARCH_TERMS, 1):
                print(f"\n🔍 [{i}/{len(self.SEARCH_TERMS)}] Søger: {term}")

                results = self.search_presentations(term, max_results=15)
                print(f"   Found {len(results)} presentations")

                for pres in results:
                    # Skip presentations an earlier search term already produced
                    if pres['url'] in seen_urls:
                        continue
                    seen_urls.add(pres['url'])
                    all_presentations.append(pres)
                    self.save_to_neo4j(pres)
                    print(f"   ✓ {pres['title'][:60]}...")

                # Small delay between searches
                time.sleep(2)

            # Save local JSON
            output_file = self.output_dir / "slideshare_harvest.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump({
                    "timestamp": datetime.now().isoformat(),
                    "stats": self.stats,
                    "presentations": all_presentations
                }, f, indent=2, ensure_ascii=False)

            # Summary
            print("\n" + "=" * 60)
            print("📊 HARVEST COMPLETE")
            print("=" * 60)
            print(f"   🔍 Searches performed:      {self.stats['searches']}")
            print(f"   📄 Presentations found:     {self.stats['presentations_found']}")
            print(f"   💾 Saved to Neo4j:          {self.stats['saved_to_neo4j']}")
            print(f"   ⚠️  Errors:                  {self.stats['errors']}")
            print(f"   📁 Output: {output_file}")
            print("=" * 60)
        finally:
            # Release both external resources no matter how the run ended;
            # the nested finally makes sure a failing quit() cannot prevent
            # the Neo4j driver from being closed.
            try:
                self.driver.quit()
            finally:
                self.neo4j.close()

        return all_presentations


if __name__ == "__main__":
    # Script entry point: build the harvester and perform one full harvest.
    SlideShareHarvester().run()