| #!/usr/bin/env python3 | |
| """ | |
| 🏢 TDC Microsoft 365 Harvester via Graph API | |
| Henter: SharePoint, OneDrive, Teams, Outlook data | |
| """ | |
| import os | |
| import json | |
| import hashlib | |
| import webbrowser | |
| from pathlib import Path | |
| from datetime import datetime | |
| from neo4j import GraphDatabase | |
| from dotenv import load_dotenv | |
| # Load environment variables | |
| load_dotenv() | |
| # Microsoft Graph | |
| import msal | |
| import requests | |
class TDCMicrosoft365Harvester:
    """Microsoft 365 data harvester via the Graph API.

    Searches SharePoint, enumerates sites, lists the signed-in user's
    OneDrive files, and persists harvested document metadata to Neo4j
    as well as a local JSON snapshot under ``data/m365_harvest``.
    """

    # Neo4j connection settings (overridable via environment variables).
    NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
    NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")

    # Microsoft Graph endpoints
    GRAPH_BASE = "https://graph.microsoft.com/v1.0"

    # Timeout (seconds) applied to every Graph HTTP call so a hung
    # connection cannot stall the harvest indefinitely.
    REQUEST_TIMEOUT = 30

    # Search terms for TDC internal data
    SEARCH_QUERIES = [
        # Strategy
        "strategi TDC Erhverv",
        "roadmap 2025",
        "forretningsplan",
        # Cyber
        "cybersikkerhed",
        "SOC",
        "NIS2",
        "incident response",
        # Cloud
        "cloud strategi",
        "Azure migration",
        # AI
        "AI strategi",
        "Copilot",
        "GPT",
        # IT systems
        "IT arkitektur",
        "Columbus ERP",
        "systemlandskab",
        # Finance
        "budget 2025",
        "finanstal",
        "forecast",
        # Customers/Contracts
        "kundeliste",
        "rammeaftale",
        "SKI aftale",
        # Products
        "produktkatalog",
        "CloudKey",
        "prisliste"
    ]

    def __init__(self):
        self.output_dir = Path("data/m365_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.access_token = None
        self.documents = []
        # Counters reported in the final summary.
        self.stats = {"searches": 0, "documents": 0, "sites": 0}
        # Neo4j driver — closed in run()'s finally block.
        self.neo4j = GraphDatabase.driver(
            self.NEO4J_URI,
            auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
        )
        print("🏢 TDC Microsoft 365 Harvester")
        print("=" * 50)

    def authenticate_interactive(self):
        """Authenticate via environment token or interactive paste.

        Order: try MS_GRAPH_ACCESS_TOKEN from the environment first; if it
        is missing or fails verification, prompt the user to paste a token
        (or 'skip' to reuse a cached one). Returns True when a token was
        obtained and verified against Graph /me, False otherwise.
        """
        print("\n🔐 MICROSOFT 365 AUTHENTICATION")
        print("-" * 40)
        # Check for environment token first
        env_token = os.getenv("MS_GRAPH_ACCESS_TOKEN")
        if env_token:
            print(" ✅ Fandt MS_GRAPH_ACCESS_TOKEN i miljøet")
            self.access_token = env_token
            # Verify the token before trusting it
            if self._verify_token():
                return True
            else:
                print(" ⚠️ Environment token er ugyldigt/udløbet. Falder tilbage til interaktiv.")
        print("""
For at tilgå TDC SharePoint, skal du:
1. Gå til: https://developer.microsoft.com/en-us/graph/graph-explorer
2. Log ind med din TDC konto (clauskraf@tdc.dk)
3. Klik på "Access token" tab
4. Kopier hele access token
Alternativt brug Azure AD app registration.
""")
        token = input("\nPaste access token her (eller 'skip' for at bruge cached): ").strip()
        if token.lower() == 'skip':
            # Fall back to a previously cached token, if any
            token_file = self.output_dir / "access_token.txt"
            if token_file.exists():
                token = token_file.read_text().strip()
                print(" ✅ Bruger cached token")
            else:
                print(" ❌ Ingen cached token fundet")
                return False
        else:
            # Cache the freshly pasted token so 'skip' works next run
            token_file = self.output_dir / "access_token.txt"
            token_file.write_text(token)
        self.access_token = token
        return self._verify_token()

    def _verify_token(self):
        """Verify the current access token by calling Graph ``/me``."""
        try:
            headers = {"Authorization": f"Bearer {self.access_token}"}
            # BUGFIX: original call had no timeout and could hang forever.
            response = requests.get(
                f"{self.GRAPH_BASE}/me",
                headers=headers,
                timeout=self.REQUEST_TIMEOUT
            )
            if response.status_code == 200:
                user = response.json()
                print(f" ✅ Logget ind som: {user.get('displayName', 'Unknown')}")
                print(f" Email: {user.get('mail', 'Unknown')}")
                return True
            else:
                print(f" ❌ Token fejl: {response.status_code}")
                return False
        except Exception as e:
            print(f" ❌ Connection error under verify: {e}")
            return False

    def search_sharepoint(self, query: str) -> list:
        """Search SharePoint content for *query* via the Graph Search API.

        Returns a list of document dicts (id/title/url/summary/...);
        hits missing either a title or a URL are discarded.
        """
        results = []
        headers = {
            "Authorization": f"Bearer {self.access_token}",
            "Content-Type": "application/json"
        }
        # Graph Search API body: one request over files, list items and
        # sites, first page of up to 25 hits.
        search_body = {
            "requests": [{
                "entityTypes": ["driveItem", "listItem", "site"],
                "query": {
                    "queryString": query
                },
                "from": 0,
                "size": 25
            }]
        }
        try:
            response = requests.post(
                f"{self.GRAPH_BASE}/search/query",
                headers=headers,
                json=search_body,
                timeout=self.REQUEST_TIMEOUT
            )
            if response.status_code == 200:
                data = response.json()
                for result_set in data.get("value", []):
                    for hit in result_set.get("hitsContainers", []):
                        for item in hit.get("hits", []):
                            resource = item.get("resource", {})
                            doc = {
                                "id": resource.get("id", ""),
                                "title": resource.get("name", "") or resource.get("displayName", ""),
                                "url": resource.get("webUrl", ""),
                                # BUGFIX: an explicit null summary would make
                                # the original `[:500]` raise TypeError.
                                "summary": (item.get("summary") or "")[:500],
                                "doc_type": resource.get("@odata.type", "").replace("#microsoft.graph.", ""),
                                "created": resource.get("createdDateTime", ""),
                                "modified": resource.get("lastModifiedDateTime", ""),
                                "search_query": query,
                                "source": "graph_search"
                            }
                            if doc["title"] and doc["url"]:
                                results.append(doc)
            else:
                print(f" ⚠️ Search fejl: {response.status_code}")
        except Exception as e:
            print(f" ❌ Error: {e}")
        return results

    def get_sharepoint_sites(self) -> list:
        """Return all SharePoint sites visible to the signed-in user."""
        sites = []
        headers = {"Authorization": f"Bearer {self.access_token}"}
        try:
            # '*' wildcard search returns every site the token can see
            response = requests.get(
                f"{self.GRAPH_BASE}/sites?search=*",
                headers=headers,
                timeout=self.REQUEST_TIMEOUT
            )
            if response.status_code == 200:
                for site in response.json().get("value", []):
                    sites.append({
                        "id": site.get("id", ""),
                        "name": site.get("displayName", ""),
                        "url": site.get("webUrl", ""),
                        "description": site.get("description", "")
                    })
                # Count every discovered site so the summary's
                # "Sites found" figure matches the list length.
                self.stats["sites"] += len(sites)
        except Exception as e:
            print(f" ❌ Sites fejl: {e}")
        return sites

    def get_site_documents(self, site_id: str) -> list:
        """Fetch file metadata from every drive (document library) of a site."""
        documents = []
        headers = {"Authorization": f"Bearer {self.access_token}"}
        try:
            # List the site's document libraries
            response = requests.get(
                f"{self.GRAPH_BASE}/sites/{site_id}/drives",
                headers=headers,
                timeout=self.REQUEST_TIMEOUT
            )
            if response.status_code == 200:
                for drive in response.json().get("value", []):
                    drive_id = drive.get("id")
                    # List the root-level items of each drive
                    files_response = requests.get(
                        f"{self.GRAPH_BASE}/drives/{drive_id}/root/children",
                        headers=headers,
                        timeout=self.REQUEST_TIMEOUT
                    )
                    if files_response.status_code == 200:
                        for item in files_response.json().get("value", []):
                            if item.get("file"):  # it is a file, not a folder
                                documents.append({
                                    "id": item.get("id", ""),
                                    "title": item.get("name", ""),
                                    "url": item.get("webUrl", ""),
                                    "size": item.get("size", 0),
                                    "doc_type": item.get("file", {}).get("mimeType", ""),
                                    "created": item.get("createdDateTime", ""),
                                    "modified": item.get("lastModifiedDateTime", ""),
                                    "source": f"site:{site_id}"
                                })
        except Exception as e:
            print(f" ⚠️ Docs fejl: {e}")
        return documents

    def get_my_files(self) -> list:
        """Return the signed-in user's root-level OneDrive items."""
        files = []
        headers = {"Authorization": f"Bearer {self.access_token}"}
        try:
            response = requests.get(
                f"{self.GRAPH_BASE}/me/drive/root/children",
                headers=headers,
                timeout=self.REQUEST_TIMEOUT
            )
            if response.status_code == 200:
                for item in response.json().get("value", []):
                    files.append({
                        "id": item.get("id", ""),
                        "title": item.get("name", ""),
                        "url": item.get("webUrl", ""),
                        "type": "folder" if item.get("folder") else "file",
                        "size": item.get("size", 0),
                        "source": "onedrive"
                    })
        except Exception as e:
            print(f" ❌ OneDrive fejl: {e}")
        return files

    def save_to_neo4j(self, doc: dict):
        """MERGE a document node into Neo4j, linked to the M365 data source.

        Dedup key is an MD5 of ``title:url`` (non-cryptographic use —
        only a stable content hash for MERGE).
        """
        content_hash = hashlib.md5(f"{doc.get('title','')}:{doc.get('url','')}".encode()).hexdigest()
        with self.neo4j.session() as session:
            session.run("""
                MERGE (d:M365Document {contentHash: $hash})
                ON CREATE SET
                    d.id = $id,
                    d.title = $title,
                    d.url = $url,
                    d.summary = $summary,
                    d.docType = $doc_type,
                    d.searchQuery = $search_query,
                    d.source = $source,
                    d.harvestedAt = datetime()
                ON MATCH SET
                    d.lastSeen = datetime()
                MERGE (ds:DataSource {name: 'TDC_Microsoft365'})
                ON CREATE SET ds.type = 'enterprise_cloud'
                MERGE (d)-[:HARVESTED_FROM]->(ds)
                """,
                hash=content_hash,
                id=doc.get('id', ''),
                title=doc.get('title', ''),
                url=doc.get('url', ''),
                summary=doc.get('summary', '')[:1000],
                doc_type=doc.get('doc_type', ''),
                search_query=doc.get('search_query', ''),
                source=doc.get('source', '')
            )
        self.stats["documents"] += 1

    def run(self):
        """Run the full harvest: auth, sites, document search, OneDrive, summary."""
        print("\n" + "=" * 60)
        print("🏢 TDC MICROSOFT 365 HARVESTER")
        print("=" * 60)
        try:
            # Authenticate
            if not self.authenticate_interactive():
                print("\n❌ Authentication fejlede - afslutter")
                return
            # 1. Fetch SharePoint sites
            print("\n📍 SHAREPOINT SITES")
            print("-" * 40)
            sites = self.get_sharepoint_sites()
            print(f" Found {len(sites)} sites")
            for site in sites[:10]:
                print(f" • {site['name']}")
            # 2. Search for relevant documents
            print("\n🔍 SEARCHING DOCUMENTS")
            print("-" * 40)
            all_results = []
            seen_urls = set()  # O(1) dedup (was an O(n^2) linear scan)
            for query in self.SEARCH_QUERIES:
                print(f" Søger: {query}")
                results = self.search_sharepoint(query)
                self.stats["searches"] += 1
                for doc in results:
                    # Skip documents already seen under an earlier query
                    if doc['url'] not in seen_urls:
                        seen_urls.add(doc['url'])
                        all_results.append(doc)
                        self.save_to_neo4j(doc)
                print(f" Found {len(results)} results")
            # 3. Fetch OneDrive files
            print("\n📁 ONEDRIVE FILES")
            print("-" * 40)
            my_files = self.get_my_files()
            print(f" Found {len(my_files)} items")
            # 4. Summary
            print("\n" + "=" * 60)
            print("📊 HARVEST COMPLETE")
            print("=" * 60)
            print(f" 🔍 Searches performed: {self.stats['searches']}")
            print(f" 📍 Sites found: {self.stats['sites']}")
            print(f" 📄 Documents imported: {self.stats['documents']}")
            print("=" * 60)
            # Save local JSON snapshot (first 100 search hits)
            output_file = self.output_dir / "m365_harvest.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump({
                    "timestamp": datetime.now().isoformat(),
                    "stats": self.stats,
                    "sites": sites,
                    "documents": all_results[:100],
                    "onedrive": my_files
                }, f, indent=2, ensure_ascii=False)
            print(f"\n📁 Results saved: {output_file}")
        finally:
            # BUGFIX: the driver previously leaked when authentication
            # failed and the method returned early before close().
            self.neo4j.close()
if __name__ == "__main__":
    # Script entry point: build the harvester and run the full pipeline.
    TDCMicrosoft365Harvester().run()