#!/usr/bin/env python3
"""TDC SharePoint Harvester - MSAL device code flow.

Simple, durable solution that works with any Microsoft account: signs in
through the MSAL device code flow, queries Microsoft Graph for SharePoint
sites, search hits and OneDrive root items, stores the search hits in Neo4j
and writes a JSON snapshot under ``data/sharepoint_harvest``.
"""

import hashlib
import json
import os
import subprocess
import sys
import webbrowser
from datetime import datetime
from pathlib import Path

try:
    import msal
    import requests
    from neo4j import GraphDatabase
except ImportError:
    print("Installing required packages...")
    # Install into the interpreter that is running this script; a bare "pip"
    # on PATH may belong to a different Python installation.
    subprocess.run(
        [sys.executable, "-m", "pip", "install",
         "msal", "requests", "neo4j", "--quiet"],
        check=True,
    )
    import msal
    import requests
    from neo4j import GraphDatabase


class TDCSharePointHarvester:
    """SharePoint harvester using MSAL device code authentication.

    Typical use is ``TDCSharePointHarvester().run()``. The constructor
    creates the output directory and opens a Neo4j driver; ``run`` closes
    the driver again when it finishes.
    """

    # Public client ID used for the device code flow (no own app registration).
    # NOTE(review): the original comment called this "Microsoft Graph Explorer";
    # the ID looks like the "Microsoft Graph Command Line Tools" app - confirm.
    CLIENT_ID = "14d82eec-204b-4c2f-b7e8-296a70dab67e"
    AUTHORITY = "https://login.microsoftonline.com/common"
    # Kept for backward compatibility; the sign-in below uses DELEGATED_SCOPES.
    SCOPES = ["https://graph.microsoft.com/.default"]
    # Delegated permissions requested for both silent and device code sign-in.
    DELEGATED_SCOPES = ["Sites.Read.All", "Files.Read.All", "User.Read"]
    GRAPH_URL = "https://graph.microsoft.com/v1.0"
    # Seconds before an HTTP call to Graph is abandoned instead of hanging.
    REQUEST_TIMEOUT = 30

    # Neo4j connection settings. Environment variables take precedence.
    # SECURITY: the fallback values are credentials committed to source; they
    # are kept only so existing setups keep working. Rotate the password and
    # supply it through NEO4J_PASSWORD instead.
    NEO4J_URI = os.environ.get(
        "NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io"
    )
    NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.environ.get(
        "NEO4J_PASSWORD", "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
    )

    # Search terms
    SEARCH_TERMS = [
        "strategi TDC",
        "cybersikkerhed",
        "cloud strategi",
        "AI strategi",
        "Columbus ERP",
        "budget 2025",
        "kundeliste",
        "rammeaftale",
        "produktkatalog",
        "SOC MDR",
        "NIS2",
        "IT arkitektur",
    ]

    def __init__(self):
        """Create the output directory, counters and the Neo4j driver."""
        self.output_dir = Path("data/sharepoint_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.access_token = None
        self.documents = []
        self.stats = {"searches": 0, "documents": 0, "sites": 0}

        # Token cache
        self.token_cache_file = self.output_dir / "token_cache.json"

        # Neo4j
        self.neo4j = GraphDatabase.driver(
            self.NEO4J_URI,
            auth=(self.NEO4J_USER, self.NEO4J_PASSWORD),
        )

        print("🏢 TDC SharePoint Harvester")
        print("=" * 50)

    def _persist_token_cache(self, cache):
        """Write the MSAL token cache to disk if it changed."""
        if not cache.has_state_changed:
            return
        self.token_cache_file.write_text(cache.serialize(), encoding="utf-8")
        try:
            # The cache holds refresh tokens; keep it private where supported.
            self.token_cache_file.chmod(0o600)
        except OSError:
            # Best effort only: some platforms/filesystems cannot apply this.
            pass

    def authenticate(self):
        """Authenticate via a cached token or the device code flow.

        Returns:
            True when ``self.access_token`` was set, False otherwise.
        """
        print("\n🔐 AUTHENTICATION")
        print("-" * 40)

        # Load cached token if it exists; an unreadable cache is ignored so
        # the user simply signs in again.
        cache = msal.SerializableTokenCache()
        if self.token_cache_file.exists():
            try:
                cache.deserialize(
                    self.token_cache_file.read_text(encoding="utf-8")
                )
            except (OSError, ValueError) as exc:
                print(f" Warning: ignoring unreadable token cache: {exc}",
                      file=sys.stderr)

        app = msal.PublicClientApplication(
            self.CLIENT_ID,
            authority=self.AUTHORITY,
            token_cache=cache,
        )

        # Try silent auth first
        accounts = app.get_accounts()
        if accounts:
            print(f" Found cached account: {accounts[0].get('username', 'Unknown')}")
            result = app.acquire_token_silent(
                self.DELEGATED_SCOPES, account=accounts[0]
            )
            if result and "access_token" in result:
                self.access_token = result["access_token"]
                # A silent refresh may have rotated tokens; keep them on disk.
                self._persist_token_cache(cache)
                print(" ✅ Using cached token")
                return True

        # Device code flow
        print("\n 📱 Device Code Authentication:")
        flow = app.initiate_device_flow(scopes=self.DELEGATED_SCOPES)
        if "user_code" not in flow:
            print(f" ❌ Error: {flow.get('error_description', 'Unknown')}")
            return False

        print(f"\n 🔗 Gå til: {flow['verification_uri']}")
        print(f" 📝 Indtast kode: {flow['user_code']}")
        print("\n Venter på login...")

        # Open browser
        webbrowser.open(flow["verification_uri"])

        # Wait for auth (blocks until the user completes or the flow ends)
        result = app.acquire_token_by_device_flow(flow)
        if "access_token" not in result:
            print(f" ❌ Auth failed: {result.get('error_description', 'Unknown')}")
            return False

        self.access_token = result["access_token"]
        self._persist_token_cache(cache)

        # Get user info; api_get returns {} on failure, so this cannot crash.
        user = self.api_get("/me")
        email = user.get("mail") or user.get("userPrincipalName") or "Unknown"
        print(f"\n ✅ Logget ind som: {user.get('displayName', 'Unknown')}")
        print(f" Email: {email}")
        return True

    def _request(self, method, endpoint, extra_headers=None, **kwargs):
        """Send one authenticated Graph request and return its JSON body.

        Any failure (network error, non-200 status, non-JSON body) is
        reported on stderr and yields an empty dict.
        """
        headers = {"Authorization": f"Bearer {self.access_token}"}
        if extra_headers:
            headers.update(extra_headers)
        try:
            response = requests.request(
                method,
                f"{self.GRAPH_URL}{endpoint}",
                headers=headers,
                timeout=self.REQUEST_TIMEOUT,
                **kwargs,
            )
        except requests.RequestException as exc:
            print(f" Warning: {method} {endpoint} failed: {exc}",
                  file=sys.stderr)
            return {}

        if response.status_code != 200:
            print(f" Warning: {method} {endpoint} returned HTTP "
                  f"{response.status_code}", file=sys.stderr)
            return {}
        try:
            return response.json()
        except ValueError:
            print(f" Warning: {method} {endpoint} returned a non-JSON body",
                  file=sys.stderr)
            return {}

    def api_get(self, endpoint: str) -> dict:
        """Make an authenticated GET call; return {} on any failure."""
        return self._request("GET", endpoint)

    def api_post(self, endpoint: str, data: dict) -> dict:
        """Make an authenticated JSON POST call; return {} on any failure."""
        return self._request(
            "POST",
            endpoint,
            extra_headers={"Content-Type": "application/json"},
            json=data,
        )

    def search(self, query: str) -> list:
        """Search SharePoint via the Graph search API.

        Args:
            query: Query string sent as ``queryString``.

        Returns:
            A list of dicts with keys ``id``, ``title``, ``url``, ``summary``,
            ``type``, ``modified`` and ``query``. Hits lacking a title or a
            URL are dropped.
        """
        search_body = {
            "requests": [{
                "entityTypes": ["driveItem", "listItem", "site"],
                "query": {"queryString": query},
                "from": 0,
                "size": 25,
            }]
        }
        data = self.api_post("/search/query", search_body)

        results = []
        # ``or`` guards: a key present with a JSON null must not break
        # iteration, slicing or splitting.
        for result_set in data.get("value") or []:
            for container in result_set.get("hitsContainers") or []:
                for hit in container.get("hits") or []:
                    resource = hit.get("resource") or {}
                    doc = {
                        "id": resource.get("id", ""),
                        "title": (resource.get("name", "")
                                  or resource.get("displayName", "")),
                        "url": resource.get("webUrl", ""),
                        "summary": (hit.get("summary") or "")[:500],
                        "type": (resource.get("@odata.type") or "").split(".")[-1],
                        "modified": resource.get("lastModifiedDateTime", ""),
                        "query": query,
                    }
                    if doc["title"] and doc["url"]:
                        results.append(doc)
        return results

    def get_sites(self) -> list:
        """Get accessible SharePoint sites and count them in ``stats``."""
        data = self.api_get("/sites?search=*")
        sites = []
        for site in data.get("value") or []:
            sites.append({
                "id": site.get("id"),
                "name": site.get("displayName"),
                "url": site.get("webUrl"),
                "description": site.get("description", ""),
            })
            self.stats["sites"] += 1
        return sites

    def get_my_drive(self) -> list:
        """Get the items in the root of the signed-in user's OneDrive."""
        data = self.api_get("/me/drive/root/children")
        return [
            {
                "name": item.get("name"),
                "url": item.get("webUrl"),
                "type": "folder" if item.get("folder") else "file",
                "size": item.get("size", 0),
                "modified": item.get("lastModifiedDateTime", ""),
            }
            for item in data.get("value") or []
        ]

    def save_to_neo4j(self, doc: dict):
        """Upsert one document node in Neo4j and link it to its data source.

        The node is keyed by an MD5 of ``"<title>:<url>"``; the digest is an
        identifier only, not a security measure.
        """
        content_hash = hashlib.md5(
            f"{doc.get('title','')}:{doc.get('url','')}".encode()
        ).hexdigest()

        # Parameters go in a dict rather than as keyword arguments: one of
        # them is named "query", which would clash with Session.run's own
        # first parameter and raise TypeError.
        params = {
            "hash": content_hash,
            "title": (doc.get("title") or "")[:200],
            "url": doc.get("url", ""),
            "summary": (doc.get("summary") or "")[:1000],
            "type": doc.get("type", ""),
            "query": doc.get("query", ""),
            "modified": doc.get("modified", ""),
        }
        with self.neo4j.session() as session:
            session.run(
                """
                MERGE (d:SharePointDocument {contentHash: $hash})
                SET d.title = $title,
                    d.url = $url,
                    d.summary = $summary,
                    d.docType = $type,
                    d.searchQuery = $query,
                    d.modified = $modified,
                    d.harvestedAt = datetime()
                MERGE (ds:DataSource {name: 'TDC_SharePoint'})
                MERGE (d)-[:HARVESTED_FROM]->(ds)
                """,
                params,
            )
        self.stats["documents"] += 1

    def run(self):
        """Run the full harvest: sign in, list sites, search, dump JSON.

        The Neo4j driver is closed on every exit path, including a failed
        sign-in or an exception part-way through.
        """
        try:
            if not self.authenticate():
                return

            # 1. Get sites
            print("\n📁 SHAREPOINT SITES")
            print("-" * 40)
            sites = self.get_sites()
            print(f" Found {len(sites)} sites")
            for site in sites[:10]:
                print(f" • {site['name']}: {site['url']}")

            # 2. Search documents (de-duplicated by URL across all queries)
            print("\n🔍 SEARCHING DOCUMENTS")
            print("-" * 40)
            all_docs = []
            seen_urls = set()
            for query in self.SEARCH_TERMS:
                print(f" Søger: {query}", end="")
                results = self.search(query)
                self.stats["searches"] += 1

                new_count = 0
                for doc in results:
                    if doc["url"] in seen_urls:
                        continue
                    seen_urls.add(doc["url"])
                    all_docs.append(doc)
                    self.save_to_neo4j(doc)
                    new_count += 1
                print(f" → {len(results)} results ({new_count} new)")

            # 3. OneDrive
            print("\n📁 ONEDRIVE FILES")
            print("-" * 40)
            my_files = self.get_my_drive()
            print(f" Found {len(my_files)} items")

            # 4. Summary
            print("\n" + "=" * 50)
            print("📊 HARVEST COMPLETE")
            print("=" * 50)
            print(f" 🔍 Searches: {self.stats['searches']}")
            print(f" 📁 Sites: {self.stats['sites']}")
            print(f" 📄 Documents: {self.stats['documents']}")

            # Save JSON. Explicit UTF-8: ensure_ascii=False emits raw
            # non-ASCII text, which the platform default encoding may reject.
            output = {
                "timestamp": datetime.now().isoformat(),
                "stats": self.stats,
                "sites": sites,
                "documents": all_docs,
                "onedrive": my_files,
            }
            output_file = self.output_dir / "sharepoint_harvest.json"
            output_file.write_text(
                json.dumps(output, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )
            print(f"\n📁 Saved: {output_file}")
        finally:
            self.neo4j.close()


if __name__ == "__main__":
    harvester = TDCSharePointHarvester()
    harvester.run()