#!/usr/bin/env python3
"""
🏢 TDC Microsoft 365 Harvester via Graph API

Harvests SharePoint, OneDrive, Teams and Outlook data through the
Microsoft Graph API and persists document metadata into Neo4j plus a
local JSON snapshot.
"""
import os
import json
import hashlib
import webbrowser  # NOTE(review): unused in this file — kept in case a caller relies on the import side effect; TODO confirm and remove
from pathlib import Path
from datetime import datetime

from neo4j import GraphDatabase
from dotenv import load_dotenv

# Load environment variables (NEO4J_*, MS_GRAPH_ACCESS_TOKEN) from .env
load_dotenv()

# Microsoft Graph
import msal  # NOTE(review): unused — interactive/device-code auth was presumably planned; TODO confirm and remove
import requests

# Timeout (seconds) applied to every Graph API call so a stalled
# connection cannot hang the harvest indefinitely.
REQUEST_TIMEOUT = 30


class TDCMicrosoft365Harvester:
    """Microsoft 365 data harvester via Graph API.

    Typical usage::

        harvester = TDCMicrosoft365Harvester()
        harvester.run()

    Requires a delegated Graph access token, supplied either via the
    MS_GRAPH_ACCESS_TOKEN environment variable, pasted interactively,
    or read from a cached token file under ``data/m365_harvest``.
    """

    # Neo4j connection settings (overridable via environment).
    NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
    NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")

    # Microsoft Graph endpoints
    GRAPH_BASE = "https://graph.microsoft.com/v1.0"

    # Search terms for TDC-internal data (Danish business vocabulary —
    # these are query payloads, do not translate).
    SEARCH_QUERIES = [
        # Strategy
        "strategi TDC Erhverv", "roadmap 2025", "forretningsplan",
        # Cyber
        "cybersikkerhed", "SOC", "NIS2", "incident response",
        # Cloud
        "cloud strategi", "Azure migration",
        # AI
        "AI strategi", "Copilot", "GPT",
        # IT systems
        "IT arkitektur", "Columbus ERP", "systemlandskab",
        # Finance
        "budget 2025", "finanstal", "forecast",
        # Customers/contracts
        "kundeliste", "rammeaftale", "SKI aftale",
        # Products
        "produktkatalog", "CloudKey", "prisliste"
    ]

    def __init__(self):
        # Local output directory for the token cache and JSON snapshot.
        self.output_dir = Path("data/m365_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.access_token = None          # bearer token for Graph calls
        self.documents = []               # reserved; not populated by run()
        self.stats = {"searches": 0, "documents": 0, "sites": 0}

        # Neo4j driver — closed in run()'s finally block.
        self.neo4j = GraphDatabase.driver(
            self.NEO4J_URI,
            auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
        )

        print("🏢 TDC Microsoft 365 Harvester")
        print("=" * 50)

    def authenticate_interactive(self):
        """Authenticate via environment token, pasted token, or cached file.

        Resolution order:
        1. ``MS_GRAPH_ACCESS_TOKEN`` environment variable (verified live).
        2. Token pasted by the user (persisted to the cache file).
        3. ``'skip'`` → previously cached token file.

        Returns:
            bool: True if a token was obtained AND verified against /me.
        """
        print("\n🔐 MICROSOFT 365 AUTHENTICATION")
        print("-" * 40)

        # 1. Environment token takes precedence.
        env_token = os.getenv("MS_GRAPH_ACCESS_TOKEN")
        if env_token:
            print("   ✅ Fandt MS_GRAPH_ACCESS_TOKEN i miljøet")
            self.access_token = env_token
            if self._verify_token():
                return True
            print("   ⚠️ Environment token er ugyldigt/udløbet. Falder tilbage til interaktiv.")

        # 2. Interactive: instructions for obtaining a token manually.
        print("""
   For at tilgå TDC SharePoint, skal du:
   1. Gå til: https://developer.microsoft.com/en-us/graph/graph-explorer
   2. Log ind med din TDC konto (clauskraf@tdc.dk)
   3. Klik på "Access token" tab
   4. Kopier hele access token

   Alternativt brug Azure AD app registration.
        """)

        token = input("\nPaste access token her (eller 'skip' for at bruge cached): ").strip()

        token_file = self.output_dir / "access_token.txt"
        if token.lower() == 'skip':
            # 3. Fall back to the cached token from a previous session.
            if token_file.exists():
                token = token_file.read_text().strip()
                print("   ✅ Bruger cached token")
            else:
                print("   ❌ Ingen cached token fundet")
                return False
        else:
            # Cache for later 'skip' runs.
            # SECURITY: the bearer token is stored in PLAINTEXT on disk;
            # acceptable only because Graph tokens are short-lived.
            token_file.write_text(token)

        self.access_token = token
        return self._verify_token()

    def _verify_token(self):
        """Verify the current token by calling Graph ``/me``.

        Returns:
            bool: True if Graph answered 200 and the token is usable.
        """
        try:
            headers = {"Authorization": f"Bearer {self.access_token}"}
            response = requests.get(
                f"{self.GRAPH_BASE}/me",
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
            if response.status_code == 200:
                user = response.json()
                print(f"   ✅ Logget ind som: {user.get('displayName', 'Unknown')}")
                print(f"      Email: {user.get('mail', 'Unknown')}")
                return True
            print(f"   ❌ Token fejl: {response.status_code}")
            # print(f"      {response.text[:200]}")  # kept quiet on purpose
            return False
        except Exception as e:
            # Network-level failure (DNS, TLS, timeout) — treat as not verified.
            print(f"   ❌ Connection error under verify: {e}")
            return False

    def search_sharepoint(self, query: str) -> list:
        """Search SharePoint/OneDrive via the Graph Search API.

        Args:
            query: Free-text search string.

        Returns:
            list[dict]: Normalized document dicts (id, title, url, summary,
            doc_type, created, modified, search_query, source). Items
            without both a title and a URL are dropped.
        """
        results = []
        headers = {
            "Authorization": f"Bearer {self.access_token}",
            "Content-Type": "application/json"
        }

        # Graph Search API request body; only the first 25 hits per
        # query are fetched (no pagination — TODO confirm that suffices).
        search_body = {
            "requests": [{
                "entityTypes": ["driveItem", "listItem", "site"],
                "query": {
                    "queryString": query
                },
                "from": 0,
                "size": 25
            }]
        }

        try:
            response = requests.post(
                f"{self.GRAPH_BASE}/search/query",
                headers=headers,
                json=search_body,
                timeout=REQUEST_TIMEOUT,
            )

            if response.status_code == 200:
                data = response.json()
                for result_set in data.get("value", []):
                    for hit in result_set.get("hitsContainers", []):
                        for item in hit.get("hits", []):
                            resource = item.get("resource", {})
                            doc = {
                                "id": resource.get("id", ""),
                                "title": resource.get("name", "") or resource.get("displayName", ""),
                                "url": resource.get("webUrl", ""),
                                # Graph can return an explicit null summary;
                                # "or ''" guards the slice against TypeError.
                                "summary": (item.get("summary") or "")[:500],
                                "doc_type": resource.get("@odata.type", "").replace("#microsoft.graph.", ""),
                                "created": resource.get("createdDateTime", ""),
                                "modified": resource.get("lastModifiedDateTime", ""),
                                "search_query": query,
                                "source": "graph_search"
                            }
                            # Skip hits with no usable identity.
                            if doc["title"] and doc["url"]:
                                results.append(doc)
            else:
                print(f"   ⚠️ Search fejl: {response.status_code}")
        except Exception as e:
            # Best-effort: one failed query must not abort the harvest.
            print(f"   ❌ Error: {e}")

        return results

    def get_sharepoint_sites(self) -> list:
        """List all SharePoint sites visible to the signed-in user.

        Returns:
            list[dict]: Site dicts with id, name, url, description.
            Also increments ``stats['sites']`` per site found.
        """
        sites = []
        headers = {"Authorization": f"Bearer {self.access_token}"}

        try:
            # 'search=*' returns every site the token can see.
            response = requests.get(
                f"{self.GRAPH_BASE}/sites?search=*",
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
            if response.status_code == 200:
                for site in response.json().get("value", []):
                    sites.append({
                        "id": site.get("id", ""),
                        "name": site.get("displayName", ""),
                        "url": site.get("webUrl", ""),
                        "description": site.get("description", "")
                    })
                    self.stats["sites"] += 1
        except Exception as e:
            print(f"   ❌ Sites fejl: {e}")

        return sites

    def get_site_documents(self, site_id: str) -> list:
        """Fetch top-level files from every drive of one SharePoint site.

        Note: only the drive root's children are listed — subfolders are
        not traversed.

        Args:
            site_id: Graph site identifier.

        Returns:
            list[dict]: File metadata dicts (folders are skipped).
        """
        documents = []
        headers = {"Authorization": f"Bearer {self.access_token}"}

        try:
            # Enumerate the site's document libraries (drives).
            response = requests.get(
                f"{self.GRAPH_BASE}/sites/{site_id}/drives",
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
            if response.status_code == 200:
                for drive in response.json().get("value", []):
                    drive_id = drive.get("id")
                    # List root-level items of this drive.
                    files_response = requests.get(
                        f"{self.GRAPH_BASE}/drives/{drive_id}/root/children",
                        headers=headers,
                        timeout=REQUEST_TIMEOUT,
                    )
                    if files_response.status_code == 200:
                        for item in files_response.json().get("value", []):
                            if item.get("file"):  # presence of 'file' facet ⇒ it is a file
                                documents.append({
                                    "id": item.get("id", ""),
                                    "title": item.get("name", ""),
                                    "url": item.get("webUrl", ""),
                                    "size": item.get("size", 0),
                                    "doc_type": item.get("file", {}).get("mimeType", ""),
                                    "created": item.get("createdDateTime", ""),
                                    "modified": item.get("lastModifiedDateTime", ""),
                                    "source": f"site:{site_id}"
                                })
        except Exception as e:
            print(f"   ⚠️ Docs fejl: {e}")

        return documents

    def get_my_files(self) -> list:
        """List the signed-in user's OneDrive root items (files and folders).

        Returns:
            list[dict]: Item dicts with id, title, url, type, size, source.
        """
        files = []
        headers = {"Authorization": f"Bearer {self.access_token}"}

        try:
            response = requests.get(
                f"{self.GRAPH_BASE}/me/drive/root/children",
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
            if response.status_code == 200:
                for item in response.json().get("value", []):
                    files.append({
                        "id": item.get("id", ""),
                        "title": item.get("name", ""),
                        "url": item.get("webUrl", ""),
                        # 'folder' facet present ⇒ folder, else file.
                        "type": "folder" if item.get("folder") else "file",
                        "size": item.get("size", 0),
                        "source": "onedrive"
                    })
        except Exception as e:
            print(f"   ❌ OneDrive fejl: {e}")

        return files

    def save_to_neo4j(self, doc: dict):
        """Upsert one document node in Neo4j and link it to the data source.

        Deduplicates on an MD5 of ``title:url`` (non-cryptographic use —
        MD5 is fine as a dedup key here). Increments ``stats['documents']``.

        Args:
            doc: Normalized document dict as produced by search_sharepoint().
        """
        content_hash = hashlib.md5(
            f"{doc.get('title','')}:{doc.get('url','')}".encode()
        ).hexdigest()

        with self.neo4j.session() as session:
            session.run("""
                MERGE (d:M365Document {contentHash: $hash})
                ON CREATE SET
                    d.id = $id,
                    d.title = $title,
                    d.url = $url,
                    d.summary = $summary,
                    d.docType = $doc_type,
                    d.searchQuery = $search_query,
                    d.source = $source,
                    d.harvestedAt = datetime()
                ON MATCH SET
                    d.lastSeen = datetime()

                MERGE (ds:DataSource {name: 'TDC_Microsoft365'})
                ON CREATE SET ds.type = 'enterprise_cloud'
                MERGE (d)-[:HARVESTED_FROM]->(ds)
            """,
                hash=content_hash,
                id=doc.get('id', ''),
                title=doc.get('title', ''),
                url=doc.get('url', ''),
                summary=doc.get('summary', '')[:1000],
                doc_type=doc.get('doc_type', ''),
                search_query=doc.get('search_query', ''),
                source=doc.get('source', '')
            )
        self.stats["documents"] += 1

    def run(self):
        """Run the full harvest: authenticate, enumerate sites, search,
        fetch OneDrive listing, persist to Neo4j and a local JSON file.

        The Neo4j driver is always closed on exit — including the early
        return on authentication failure (previously it leaked).
        """
        print("\n" + "=" * 60)
        print("🏢 TDC MICROSOFT 365 HARVESTER")
        print("=" * 60)

        try:
            # Authenticate
            if not self.authenticate_interactive():
                print("\n❌ Authentication fejlede - afslutter")
                return

            # 1. SharePoint sites
            print("\n📍 SHAREPOINT SITES")
            print("-" * 40)
            sites = self.get_sharepoint_sites()
            print(f"   Found {len(sites)} sites")
            for site in sites[:10]:
                print(f"   • {site['name']}")

            # 2. Search for relevant documents
            print("\n🔍 SEARCHING DOCUMENTS")
            print("-" * 40)
            all_results = []
            seen_urls = set()  # O(1) dedup instead of scanning all_results per hit
            for query in self.SEARCH_QUERIES:
                print(f"   Søger: {query}")
                results = self.search_sharepoint(query)
                self.stats["searches"] += 1
                for doc in results:
                    if doc['url'] not in seen_urls:
                        seen_urls.add(doc['url'])
                        all_results.append(doc)
                        self.save_to_neo4j(doc)
                print(f"      Found {len(results)} results")

            # 3. OneDrive files
            print("\n📁 ONEDRIVE FILES")
            print("-" * 40)
            my_files = self.get_my_files()
            print(f"   Found {len(my_files)} items")

            # 4. Summary
            print("\n" + "=" * 60)
            print("📊 HARVEST COMPLETE")
            print("=" * 60)
            print(f"   🔍 Searches performed: {self.stats['searches']}")
            print(f"   📍 Sites found: {self.stats['sites']}")
            print(f"   📄 Documents imported: {self.stats['documents']}")
            print("=" * 60)

            # Local JSON snapshot (capped at 100 documents).
            output_file = self.output_dir / "m365_harvest.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump({
                    "timestamp": datetime.now().isoformat(),
                    "stats": self.stats,
                    "sites": sites,
                    "documents": all_results[:100],
                    "onedrive": my_files
                }, f, indent=2, ensure_ascii=False)

            print(f"\n📁 Results saved: {output_file}")
        finally:
            # Always release the driver, even on auth failure or an
            # unexpected exception mid-harvest.
            self.neo4j.close()


if __name__ == "__main__":
    harvester = TDCMicrosoft365Harvester()
    harvester.run()