#!/usr/bin/env python3
"""
📚 Scribd Public Harvester - Henter offentligt tilgængelige dokumenter
"""
import os
import json
import hashlib
import requests
import re
from pathlib import Path
from datetime import datetime
from bs4 import BeautifulSoup
from neo4j import GraphDatabase


class ScribdPublicHarvester:
    """Fetches public Scribd documents without logging in"""

    NEO4J_URI = "neo4j+s://054eff27.databases.neo4j.io"
    NEO4J_USER = "neo4j"
    NEO4J_PASSWORD = "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }

    # Search terms used to find relevant documents
    SEARCH_TOPICS = [
        "AI ethics",
        "generative AI",
        "machine learning business",
        "digital transformation",
        "cybersecurity threats",
        "OSINT techniques",
        "threat intelligence"
    ]

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)
        self.output_dir = Path("data/scribd_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.driver = GraphDatabase.driver(
            self.NEO4J_URI,
            auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
        )
        self.stats = {"found": 0, "saved": 0}
    def search_documents(self, query: str, max_results: int = 20):
        """Search for documents"""
        print(f"\n🔍 Searching: {query}")
        url = f"https://www.scribd.com/search?query={query.replace(' ', '+')}"
        try:
            response = self.session.get(url)
            if response.status_code != 200:
                print(f" ❌ HTTP {response.status_code}")
                return []
            soup = BeautifulSoup(response.text, 'html.parser')
            documents = []

            # Find document cards
            for card in soup.select('.SearchResults_card, .document_cell, [data-e2e="search-result"]'):
                try:
                    link = card.find('a', href=re.compile(r'/document/\d+'))
                    if not link:
                        link = card.find('a', href=re.compile(r'/doc/\d+'))
                    if not link:
                        continue
                    href = link.get('href', '')
                    if not href.startswith('http'):
                        href = f"https://www.scribd.com{href}"
                    # find() only matches tag names, so use a CSS selector to also cover class-based titles
                    title_elem = card.select_one('h2, h3, .title, [class*="title"]')
                    title = title_elem.get_text(strip=True) if title_elem else link.get_text(strip=True)
                    if title and href:
                        documents.append({
                            "title": title[:200],
                            "url": href,
                            "query": query
                        })
                except Exception:
                    continue

            # Fallback: collect every document link on the page
            if not documents:
                for link in soup.find_all('a', href=re.compile(r'/(document|doc)/\d+')):
                    href = link.get('href', '')
                    if not href.startswith('http'):
                        href = f"https://www.scribd.com{href}"
                    title = link.get_text(strip=True) or link.get('title', '')
                    if title and len(title) > 5:
                        documents.append({
                            "title": title[:200],
                            "url": href,
                            "query": query
                        })

            # Deduplicate by URL
            seen = set()
            unique = []
            for doc in documents[:max_results]:
                if doc['url'] not in seen:
                    seen.add(doc['url'])
                    unique.append(doc)

            print(f" ✅ Found {len(unique)} documents")
            return unique
        except Exception as e:
            print(f" ❌ Error: {e}")
            return []
    def get_document_details(self, url: str) -> dict:
        """Fetch metadata for a document"""
        try:
            response = self.session.get(url)
            if response.status_code != 200:
                return {}
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract metadata
            title = ""
            title_elem = soup.find('h1') or soup.find('title')
            if title_elem:
                title = title_elem.get_text(strip=True).replace(' | PDF', '').replace(' | Scribd', '')

            author = ""
            author_elem = soup.find('a', href=re.compile(r'/user/\d+'))
            if author_elem:
                author = author_elem.get_text(strip=True)

            description = ""
            desc_elem = soup.find('meta', {'name': 'description'})
            if desc_elem:
                description = desc_elem.get('content', '')[:500]

            # Document ID from URL
            doc_id_match = re.search(r'/(document|doc)/(\d+)', url)
            doc_id = doc_id_match.group(2) if doc_id_match else hashlib.md5(url.encode()).hexdigest()[:12]

            # Thumbnail
            thumbnail = ""
            og_image = soup.find('meta', {'property': 'og:image'})
            if og_image:
                thumbnail = og_image.get('content', '')

            return {
                "id": doc_id,
                "title": title,
                "author": author,
                "url": url,
                "description": description,
                "thumbnail": thumbnail,
                "doc_type": "document"
            }
        except Exception as e:
            print(f" ⚠️ Metadata error: {e}")
            return {}
    def save_to_neo4j(self, doc: dict, query: str):
        """Store a document in Neo4j"""
        content_hash = hashlib.md5(f"{doc['title']}:{doc['url']}".encode()).hexdigest()
        with self.driver.session() as session:
            # MERGE on the content hash so re-harvested documents are updated, not duplicated
            session.run(
                """
                MERGE (d:ScribdDocument {contentHash: $hash})
                ON CREATE SET
                    d.id = $id,
                    d.title = $title,
                    d.author = $author,
                    d.url = $url,
                    d.description = $description,
                    d.thumbnail = $thumbnail,
                    d.docType = $doc_type,
                    d.searchQuery = $query,
                    d.harvestedAt = datetime(),
                    d.source = 'scribd_public_search'
                ON MATCH SET
                    d.lastSeen = datetime()
                MERGE (s:DataSource {name: 'Scribd'})
                ON CREATE SET s.type = 'document_repository', s.url = 'https://scribd.com'
                MERGE (d)-[:HARVESTED_FROM]->(s)
                """,
                hash=content_hash,
                id=doc.get('id', ''),
                title=doc.get('title', ''),
                author=doc.get('author', ''),
                url=doc.get('url', ''),
                description=doc.get('description', ''),
                thumbnail=doc.get('thumbnail', ''),
                doc_type=doc.get('doc_type', 'document'),
                query=query
            )
        self.stats['saved'] += 1
    def run(self):
        """Run the harvest"""
        print("=" * 60)
        print("📚 SCRIBD PUBLIC HARVESTER")
        print("=" * 60)

        all_docs = []
        for topic in self.SEARCH_TOPICS:
            docs = self.search_documents(topic)
            self.stats['found'] += len(docs)
            for doc in docs:
                details = self.get_document_details(doc['url'])
                if details:
                    details['query'] = topic
                    all_docs.append(details)
                    self.save_to_neo4j(details, topic)
                    print(f" 💾 {details['title'][:50]}...")

        # Summary
        print("\n" + "=" * 60)
        print("📊 HARVEST COMPLETE")
        print("=" * 60)
        print(f" 🔍 Topics searched: {len(self.SEARCH_TOPICS)}")
        print(f" 📄 Documents found: {self.stats['found']}")
        print(f" 💾 Saved to Neo4j: {self.stats['saved']}")
        print("=" * 60)

        # Save local JSON
        output_file = self.output_dir / "scribd_public_harvest.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_docs, f, indent=2, ensure_ascii=False)
        print(f"\n📁 JSON saved: {output_file}")

        self.driver.close()
        return all_docs


if __name__ == "__main__":
    harvester = ScribdPublicHarvester()
    harvester.run()