Spaces:

Kraft102
/

widgettdc-api

Paused

File size: 10,998 Bytes

34367da

#!/usr/bin/env python3
"""
🏢 TDC SharePoint Harvester - MSAL Device Code Flow
Simpel, holdbar løsning der virker med enhver Microsoft konto
"""
import json
import hashlib
import webbrowser
from pathlib import Path
from datetime import datetime
from neo4j import GraphDatabase

try:
    import msal
    import requests
except ImportError:
    print("Installing required packages...")
    import subprocess
    subprocess.run(["pip", "install", "msal", "requests", "--quiet"])
    import msal
    import requests

class TDCSharePointHarvester:
    """Harvest SharePoint/OneDrive metadata via Microsoft Graph into Neo4j.

    Sign-in uses the MSAL device code flow with a token cache stored on disk
    under ``data/sharepoint_harvest``. Search hits for every entry in
    ``SEARCH_TERMS`` are written to Neo4j as ``SharePointDocument`` nodes and
    the full result (sites, documents, OneDrive root listing) is dumped to a
    JSON file in the same directory.

    The Neo4j connection settings default to the class constants below and can
    be overridden with the ``NEO4J_URI``, ``NEO4J_USER`` and ``NEO4J_PASSWORD``
    environment variables.
    """

    # Public client id used for sign-in (no app registration needed).
    # NOTE(review): the original comment labels this GUID "Microsoft Graph
    # Explorer"; it looks like the "Microsoft Graph Command Line Tools" app
    # instead - confirm before relying on the label.
    CLIENT_ID = "14d82eec-204b-4c2f-b7e8-296a70dab67e"  # Microsoft Graph Explorer
    AUTHORITY = "https://login.microsoftonline.com/common"
    SCOPES = ["https://graph.microsoft.com/.default"]
    GRAPH_URL = "https://graph.microsoft.com/v1.0"

    # Delegated permissions requested for both silent and device-code sign-in.
    # Kept in one place so the two token requests cannot drift apart.
    DELEGATED_SCOPES = ["Sites.Read.All", "Files.Read.All", "User.Read"]

    # Seconds to wait for a Graph HTTP call before giving up.
    REQUEST_TIMEOUT = 30

    # Neo4j
    NEO4J_URI = "neo4j+s://054eff27.databases.neo4j.io"
    NEO4J_USER = "neo4j"
    # SECURITY: a credential committed to source control must be considered
    # leaked. It is kept only as a backward-compatible fallback; rotate it and
    # supply the real value through the NEO4J_PASSWORD environment variable.
    NEO4J_PASSWORD = "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"

    # Search terms
    SEARCH_TERMS = [
        "strategi TDC",
        "cybersikkerhed",
        "cloud strategi",
        "AI strategi",
        "Columbus ERP",
        "budget 2025",
        "kundeliste",
        "rammeaftale",
        "produktkatalog",
        "SOC MDR",
        "NIS2",
        "IT arkitektur",
    ]

    def __init__(self):
        """Create the output directory, reset counters and open the Neo4j driver."""
        # Function-scope import so this class does not depend on a change to
        # the file's import block.
        import os

        self.output_dir = Path("data/sharepoint_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.access_token = None
        self.documents = []
        self.stats = {"searches": 0, "documents": 0, "sites": 0}

        # Token cache
        self.token_cache_file = self.output_dir / "token_cache.json"

        # Neo4j: environment variables win over the class-level defaults.
        self.neo4j = GraphDatabase.driver(
            os.environ.get("NEO4J_URI", self.NEO4J_URI),
            auth=(
                os.environ.get("NEO4J_USER", self.NEO4J_USER),
                os.environ.get("NEO4J_PASSWORD", self.NEO4J_PASSWORD),
            ),
        )

        print("🏢 TDC SharePoint Harvester")
        print("=" * 50)

    def _persist_token_cache(self, cache) -> None:
        """Write the MSAL token cache to disk if it changed since it was loaded."""
        if not cache.has_state_changed:
            return
        self.token_cache_file.write_text(cache.serialize(), encoding="utf-8")
        try:
            # The cache holds tokens, so restrict it to the owner.
            self.token_cache_file.chmod(0o600)
        except OSError as exc:
            print(f"   ⚠️ Could not restrict token cache permissions: {exc}")

    def authenticate(self):
        """Acquire a Graph access token, silently if possible, else by device code.

        Returns:
            True when ``self.access_token`` was set, False when the device
            flow could not be started or the sign-in failed.
        """
        print("\n🔐 AUTHENTICATION")
        print("-" * 40)

        # Load cached token if exists
        cache = msal.SerializableTokenCache()
        if self.token_cache_file.exists():
            try:
                cache.deserialize(self.token_cache_file.read_text(encoding="utf-8"))
            except ValueError:
                # A corrupt cache must not block sign-in; fall back to the
                # interactive device code flow below.
                print("   ⚠️ Ignoring unreadable token cache")

        app = msal.PublicClientApplication(
            self.CLIENT_ID,
            authority=self.AUTHORITY,
            token_cache=cache,
        )
        scopes = list(self.DELEGATED_SCOPES)

        # Try silent auth first
        accounts = app.get_accounts()
        if accounts:
            print(f"   Found cached account: {accounts[0].get('username', 'Unknown')}")
            result = app.acquire_token_silent(scopes, account=accounts[0])
            if result and "access_token" in result:
                self.access_token = result["access_token"]
                # A silent call may have refreshed tokens; keep the file in sync.
                self._persist_token_cache(cache)
                print("   ✅ Using cached token")
                return True

        # Device code flow
        print("\n   📱 Device Code Authentication:")
        flow = app.initiate_device_flow(scopes=scopes)

        if "user_code" not in flow:
            print(f"   ❌ Error: {flow.get('error_description', 'Unknown')}")
            return False

        print(f"\n   🔗 Gå til: {flow['verification_uri']}")
        print(f"   📝 Indtast kode: {flow['user_code']}")
        print("\n   Venter på login...")

        # Open browser
        webbrowser.open(flow['verification_uri'])

        # Blocks until the user completes sign-in or the flow ends.
        result = app.acquire_token_by_device_flow(flow)

        if "access_token" not in result:
            print(f"   ❌ Auth failed: {result.get('error_description', 'Unknown')}")
            return False

        self.access_token = result["access_token"]
        self._persist_token_cache(cache)

        # Show who signed in; a failed lookup only degrades the message.
        user = self.api_get("/me")
        print(f"\n   ✅ Logget ind som: {user.get('displayName') or 'Unknown'}")
        # 'mail' can be present but null, so fall through on any falsy value.
        email = user.get('mail') or user.get('userPrincipalName') or 'Unknown'
        print(f"      Email: {email}")

        return True

    def _request(self, method: str, endpoint: str, json_body=None) -> dict:
        """Send an authenticated request to Graph and return the decoded JSON.

        Args:
            method: HTTP verb, e.g. ``"GET"`` or ``"POST"``.
            endpoint: Path appended to ``GRAPH_URL`` (starts with ``/``).
            json_body: Optional payload sent as the JSON request body.

        Returns:
            The parsed response for HTTP 200, otherwise an empty dict. Network
            errors, non-200 statuses and undecodable bodies are reported on
            stdout instead of being swallowed silently.
        """
        headers = {"Authorization": f"Bearer {self.access_token}"}
        if json_body is not None:
            headers["Content-Type"] = "application/json"

        try:
            response = requests.request(
                method,
                f"{self.GRAPH_URL}{endpoint}",
                headers=headers,
                json=json_body,
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            print(f"\n   ⚠️ {method} {endpoint} failed: {exc}")
            return {}

        if response.status_code != 200:
            print(f"\n   ⚠️ {method} {endpoint} returned HTTP {response.status_code}")
            return {}

        try:
            return response.json()
        except ValueError:
            print(f"\n   ⚠️ {method} {endpoint} returned a non-JSON body")
            return {}

    def api_get(self, endpoint: str) -> dict:
        """Make an authenticated GET call; returns ``{}`` on any failure."""
        return self._request("GET", endpoint)

    def api_post(self, endpoint: str, data: dict) -> dict:
        """Make an authenticated POST call with a JSON body; ``{}`` on failure."""
        return self._request("POST", endpoint, json_body=data)

    def search(self, query: str, size: int = 25) -> list:
        """Search SharePoint via the Graph ``/search/query`` endpoint.

        Args:
            query: Query string sent to Graph.
            size: Number of hits requested (first page only).

        Returns:
            A list of dicts with keys ``id``, ``title``, ``url``, ``summary``
            (at most 500 characters), ``type``, ``modified`` and ``query``.
            Hits without both a title and a URL are dropped.
        """
        search_body = {
            "requests": [{
                "entityTypes": ["driveItem", "listItem", "site"],
                "query": {"queryString": query},
                "from": 0,
                "size": size,
            }]
        }

        data = self.api_post("/search/query", search_body)

        results = []
        for result_set in data.get("value") or []:
            for container in result_set.get("hitsContainers") or []:
                for hit in container.get("hits") or []:
                    resource = hit.get("resource") or {}

                    # `or ""` guards against keys that are present but null,
                    # which would otherwise break the slicing/splitting below.
                    doc = {
                        "id": resource.get("id") or "",
                        "title": resource.get("name") or resource.get("displayName") or "",
                        "url": resource.get("webUrl") or "",
                        "summary": (hit.get("summary") or "")[:500],
                        "type": (resource.get("@odata.type") or "").split(".")[-1],
                        "modified": resource.get("lastModifiedDateTime") or "",
                        "query": query,
                    }

                    if doc["title"] and doc["url"]:
                        results.append(doc)

        return results

    def get_sites(self) -> list:
        """Return the sites listed by ``/sites?search=*`` and count them in stats."""
        data = self.api_get("/sites?search=*")

        sites = [
            {
                "id": site.get("id"),
                "name": site.get("displayName"),
                "url": site.get("webUrl"),
                "description": site.get("description", ""),
            }
            for site in data.get("value") or []
        ]
        self.stats["sites"] += len(sites)

        return sites

    def get_my_drive(self) -> list:
        """Return the items directly under the signed-in user's OneDrive root."""
        data = self.api_get("/me/drive/root/children")

        return [
            {
                "name": item.get("name"),
                "url": item.get("webUrl"),
                "type": "folder" if item.get("folder") else "file",
                "size": item.get("size", 0),
                "modified": item.get("lastModifiedDateTime", ""),
            }
            for item in data.get("value") or []
        ]

    def save_to_neo4j(self, doc: dict):
        """Upsert one document node in Neo4j and link it to the data source.

        The node is keyed on an MD5 digest of ``"<title>:<url>"`` (an
        identifier, not a security measure), so re-harvesting the same
        document updates the existing node.

        Args:
            doc: Document dict as produced by ``search``.
        """
        title = doc.get('title') or ''
        url = doc.get('url') or ''
        content_hash = hashlib.md5(f"{title}:{url}".encode()).hexdigest()

        # Parameters go in a dict rather than keyword arguments: one of them is
        # named "query", which collides with the first positional parameter of
        # Session.run in Neo4j driver 4.x/5.x and raises TypeError.
        params = {
            "hash": content_hash,
            "title": title[:200],
            "url": url,
            "summary": (doc.get('summary') or '')[:1000],
            "type": doc.get('type', ''),
            "query": doc.get('query', ''),
            "modified": doc.get('modified', ''),
        }

        with self.neo4j.session() as session:
            session.run("""
                MERGE (d:SharePointDocument {contentHash: $hash})
                SET d.title = $title,
                    d.url = $url,
                    d.summary = $summary,
                    d.docType = $type,
                    d.searchQuery = $query,
                    d.modified = $modified,
                    d.harvestedAt = datetime()
                
                MERGE (ds:DataSource {name: 'TDC_SharePoint'})
                MERGE (d)-[:HARVESTED_FROM]->(ds)
            """, params)

        self.stats["documents"] += 1

    def run(self):
        """Run the full harvest: sign in, list sites, search, list OneDrive, save.

        The Neo4j driver is closed when this method returns, including when
        authentication fails or an exception propagates.
        """
        try:
            if not self.authenticate():
                return

            # 1. Get sites
            print("\n📍 SHAREPOINT SITES")
            print("-" * 40)
            sites = self.get_sites()
            print(f"   Found {len(sites)} sites")
            for site in sites[:10]:
                print(f"   • {site['name']}: {site['url']}")

            # 2. Search documents
            print("\n🔍 SEARCHING DOCUMENTS")
            print("-" * 40)

            all_docs = []
            seen_urls = set()  # de-duplicates hits across search terms

            for query in self.SEARCH_TERMS:
                print(f"   Søger: {query}", end="")
                results = self.search(query)
                self.stats["searches"] += 1

                new_count = 0
                for doc in results:
                    if doc["url"] in seen_urls:
                        continue
                    seen_urls.add(doc["url"])
                    all_docs.append(doc)
                    self.save_to_neo4j(doc)
                    new_count += 1

                print(f" → {len(results)} results ({new_count} new)")

            # 3. OneDrive
            print("\n📁 ONEDRIVE FILES")
            print("-" * 40)
            my_files = self.get_my_drive()
            print(f"   Found {len(my_files)} items")

            # 4. Summary
            print("\n" + "=" * 50)
            print("📊 HARVEST COMPLETE")
            print("=" * 50)
            print(f"   🔍 Searches: {self.stats['searches']}")
            print(f"   📍 Sites: {self.stats['sites']}")
            print(f"   📄 Documents: {self.stats['documents']}")

            # Save JSON
            output = {
                "timestamp": datetime.now().isoformat(),
                "stats": self.stats,
                "sites": sites,
                "documents": all_docs,
                "onedrive": my_files,
            }

            output_file = self.output_dir / "sharepoint_harvest.json"
            # ensure_ascii=False emits raw non-ASCII text, so the encoding must
            # be explicit or the write fails on non-UTF-8 default locales.
            output_file.write_text(
                json.dumps(output, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )
            print(f"\n📁 Saved: {output_file}")
        finally:
            self.neo4j.close()


if __name__ == "__main__":
    # Script entry point: build the harvester and perform one full harvest.
    TDCSharePointHarvester().run()