#!/usr/bin/env python3
"""
🏢 TDC Microsoft 365 Quick Harvester

Usage: python m365_quick_harvest.py [access_token]

Fetches emails and SharePoint documents via the Microsoft Graph API and
stores them in a Neo4j knowledge graph.
"""

import hashlib
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from urllib.parse import quote, urlencode

import requests
from neo4j import GraphDatabase

# SECURITY(review): the Neo4j connection details below were committed in
# plain text. They can now be overridden through environment variables; the
# literals are kept only as fallbacks so existing invocations keep working.
# The password should be rotated and the fallback removed.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get(
    "NEO4J_PASSWORD", "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
)

GRAPH_BASE = "https://graph.microsoft.com/v1.0"

# Seconds to wait for any single Graph HTTP call before giving up.
REQUEST_TIMEOUT = 30
# Summaries are truncated to this many characters before being stored.
MAX_SUMMARY_CHARS = 500
# The local JSON backup only keeps the first N collected results.
MAX_BACKUP_RESULTS = 200

# TDC-relevant search terms
SEARCH_TERMS = [
    "strategi TDC Erhverv",
    "cybersikkerhed",
    "NIS2",
    "cloud Azure",
    "AI Copilot",
    "Columbus ERP",
    "budget forecast",
    "kundeliste",
    "rammeaftale SKI",
    "produktkatalog",
    "SOC MDR",
    "IT arkitektur",
]


class M365QuickHarvester:
    """Search Microsoft 365 through Graph and persist the hits in Neo4j.

    Attributes:
        token: The bearer token passed to the constructor.
        headers: HTTP headers sent with every Graph request.
        neo4j: Neo4j driver created from the module-level connection settings.
        results: Item dicts collected so far (emails and documents).
        stats: Counters with the keys ``emails``, ``docs`` and ``errors``.
    """

    def __init__(self, access_token: str):
        """Store the token, build the Graph headers and open the Neo4j driver."""
        self.token = access_token
        self.headers = {
            "Authorization": f"Bearer {access_token}",
            "Content-Type": "application/json",
            "ConsistencyLevel": "eventual",
        }
        self.neo4j = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
        self.results = []
        self.stats = {"emails": 0, "docs": 0, "errors": 0}

    def verify_token(self) -> bool:
        """Check the token against ``/me`` and print the signed-in user.

        Returns:
            True when Graph answers with a success status, False on a
            non-success status, a transport error or an unparsable body.
        """
        try:
            resp = requests.get(
                f"{GRAPH_BASE}/me", headers=self.headers, timeout=REQUEST_TIMEOUT
            )
            if not resp.ok:
                print(f"❌ Token fejl: {resp.status_code}")
                return False
            user = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            print(f"❌ Fejl: {e}")
            return False

        print(f"✅ Logget ind som: {user.get('displayName', 'Unknown')}")
        print(f" Email: {user.get('mail', user.get('userPrincipalName', 'Unknown'))}")
        return True

    def search_emails(self, query: str, limit: int = 20) -> list:
        """Search the signed-in user's mailbox.

        Args:
            query: Search phrase; sent to Graph wrapped in double quotes.
            limit: Value of the ``$top`` request parameter.

        Returns:
            A list of item dicts (``type`` is ``"email"``). Empty when the
            request fails; failures are counted in ``stats["errors"]``.
        """
        # Percent-encode the values so characters such as '&', '#' or '+' in
        # the query cannot corrupt the URL. '$' and ',' stay literal to keep
        # the request in the same shape as the hand-built URL used before.
        query_string = urlencode(
            {
                "$search": f'"{query}"',
                "$top": limit,
                "$select": "id,subject,from,receivedDateTime,bodyPreview,webLink",
            },
            quote_via=quote,
            safe="$,",
        )
        try:
            resp = requests.get(
                f"{GRAPH_BASE}/me/messages",
                headers=self.headers,
                params=query_string,
                timeout=REQUEST_TIMEOUT,
            )
            if not resp.ok:
                print(f" ⚠️ Email søgefejl: HTTP {resp.status_code}")
                self.stats["errors"] += 1
                return []
            messages = resp.json().get("value") or []
        except (requests.RequestException, ValueError) as e:
            print(f" ⚠️ Email søgefejl: {e}")
            self.stats["errors"] += 1
            return []

        return [self._email_to_item(msg, query) for msg in messages]

    @staticmethod
    def _email_to_item(msg: dict, query: str) -> dict:
        """Convert one Graph message resource into the internal item dict."""
        # Fields may be present but null (e.g. a message without a sender),
        # so fall back with `or` instead of relying on .get() defaults.
        sender = (msg.get("from") or {}).get("emailAddress") or {}
        return {
            "id": msg.get("id"),
            "title": msg.get("subject", "(No subject)"),
            "url": msg.get("webLink") or "",
            "summary": (msg.get("bodyPreview") or "")[:MAX_SUMMARY_CHARS],
            "type": "email",
            "from": sender.get("address") or "",
            "date": msg.get("receivedDateTime"),
            "query": query,
        }

    def search_sharepoint(self, query: str, limit: int = 20) -> list:
        """Search SharePoint/OneDrive through the Graph search endpoint.

        Args:
            query: Query string placed in the search request.
            limit: Page size (``size``) of the search request.

        Returns:
            A list of item dicts (``type`` is ``"document"``). Empty when the
            request fails; failures are counted in ``stats["errors"]``.
        """
        body = {
            "requests": [{
                "entityTypes": ["driveItem", "listItem", "site"],
                "query": {"queryString": query},
                "from": 0,
                "size": limit,
            }]
        }
        try:
            resp = requests.post(
                f"{GRAPH_BASE}/search/query",
                headers=self.headers,
                json=body,
                timeout=REQUEST_TIMEOUT,
            )
            if not resp.ok:
                print(f" ⚠️ SharePoint søgefejl: HTTP {resp.status_code}")
                self.stats["errors"] += 1
                return []
            result_sets = resp.json().get("value") or []
        except (requests.RequestException, ValueError) as e:
            print(f" ⚠️ SharePoint søgefejl: {e}")
            self.stats["errors"] += 1
            return []

        # Response nesting: value[] -> hitsContainers[] -> hits[].
        return [
            self._hit_to_item(hit, query)
            for result_set in result_sets
            for container in (result_set.get("hitsContainers") or [])
            for hit in (container.get("hits") or [])
        ]

    @staticmethod
    def _hit_to_item(hit: dict, query: str) -> dict:
        """Convert one search hit into the internal item dict."""
        resource = hit.get("resource") or {}
        return {
            "id": resource.get("id", hit.get("hitId")),
            # Fall back to the search term when the resource has no name.
            "title": resource.get("name") or resource.get("displayName") or query,
            "url": resource.get("webUrl") or "",
            "summary": (hit.get("summary") or "")[:MAX_SUMMARY_CHARS],
            "type": "document",
            "date": resource.get("lastModifiedDateTime"),
            "query": query,
        }

    def save_to_neo4j(self, item: dict):
        """Upsert one item as an ``M365Document`` node linked to its data source.

        The node is merged on an MD5 of ``title:url``; the digest is only a
        deduplication key, not a security measure.

        Args:
            item: Item dict; must contain ``title`` and ``url``.
        """
        content_hash = hashlib.md5(f"{item['title']}:{item['url']}".encode()).hexdigest()
        # The values are passed as a parameters dict on purpose: one of them
        # is named "query", which as a keyword argument collides with the
        # first parameter of Session.run() in neo4j driver 4.x/5.x.
        parameters = {
            "hash": content_hash,
            "id": item.get("id", ""),
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "summary": item.get("summary", ""),
            "type": item.get("type", ""),
            "query": item.get("query", ""),
            "date": item.get("date", ""),
        }
        with self.neo4j.session() as session:
            session.run(
                """
                MERGE (d:M365Document {contentHash: $hash})
                ON CREATE SET
                    d.id = $id,
                    d.title = $title,
                    d.url = $url,
                    d.summary = $summary,
                    d.docType = $type,
                    d.searchQuery = $query,
                    d.date = $date,
                    d.harvestedAt = datetime()
                ON MATCH SET
                    d.lastSeen = datetime()

                MERGE (ds:DataSource {name: 'TDC_M365'})
                ON CREATE SET ds.type = 'enterprise_m365'

                MERGE (d)-[:HARVESTED_FROM]->(ds)
                """,
                parameters,
            )

    @staticmethod
    def _dedupe_key(item: dict):
        """Return the key used to recognise an already collected item."""
        # Prefer the URL; fall back to the id so that items without a URL
        # are not all treated as duplicates of each other.
        return item.get("url") or item.get("id")

    def _record_new(self, items: list, stat_key: str, seen: set):
        """Collect and persist the items that have not been seen before.

        A failed Neo4j write is reported and counted in ``stats["errors"]``;
        the item is still kept in ``self.results`` for the local backup.
        """
        for item in items:
            key = self._dedupe_key(item)
            if key in seen:
                continue
            seen.add(key)
            self.results.append(item)
            self.stats[stat_key] += 1
            try:
                self.save_to_neo4j(item)
            except Exception as e:  # harvest boundary: report, count, continue
                print(f" ⚠️ Neo4j fejl: {e}")
                self.stats["errors"] += 1

    def _write_backup(self) -> Path:
        """Write the collected results to the local JSON backup file."""
        output_file = Path("data/m365_harvest/m365_results.json")
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                "timestamp": datetime.now().isoformat(),
                "stats": self.stats,
                "results": self.results[:MAX_BACKUP_RESULTS],
            }, f, indent=2, ensure_ascii=False)
        return output_file

    def run(self):
        """Run the harvest: verify the token, search every term, save, report.

        The Neo4j driver is closed when this method returns or raises, so the
        instance is not meant to be reused afterwards.
        """
        print("\n" + "=" * 60)
        print("🏢 TDC MICROSOFT 365 QUICK HARVESTER")
        print("=" * 60)

        try:
            if not self.verify_token():
                print("\n❌ Ugyldig token - hent ny fra Graph Explorer")
                return

            print(f"\n🔍 Søger med {len(SEARCH_TERMS)} termer...")

            # One set shared by emails and documents gives O(1) duplicate checks.
            seen = {self._dedupe_key(r) for r in self.results}
            for query in SEARCH_TERMS:
                print(f"\n 📧 {query}")

                # Emails
                emails = self.search_emails(query, 10)
                self._record_new(emails, "emails", seen)

                # SharePoint
                docs = self.search_sharepoint(query, 10)
                self._record_new(docs, "docs", seen)

                print(f" {len(emails)} emails, {len(docs)} docs")

            # Save local backup
            output_file = self._write_backup()

            print("\n" + "=" * 60)
            print("📊 HARVEST COMPLETE")
            print("=" * 60)
            print(f" 📧 Emails: {self.stats['emails']}")
            print(f" 📄 Documents: {self.stats['docs']}")
            print(f" ⚠️ Errors: {self.stats['errors']}")
            print(f" 📁 Saved: {output_file}")
            print("=" * 60)
        finally:
            # Always release the driver, including on the invalid-token path.
            self.neo4j.close()


def main():
    """Read the access token from argv or an interactive prompt and harvest."""
    if len(sys.argv) < 2:
        print("""
🏢 TDC M365 Quick Harvester
═══════════════════════════

Brug: python m365_quick_harvest.py [ACCESS_TOKEN]

Sådan får du token:
1. Gå til: https://developer.microsoft.com/en-us/graph/graph-explorer
2. Log ind med TDC konto (clauskraf@tdc.dk)
3. Klik "Access token" tab
4. Kopier hele token

Eller paste token nu:
""")
        token = input("Access token: ").strip()
        if not token:
            sys.exit(1)
    else:
        token = sys.argv[1]

    harvester = M365QuickHarvester(token)
    harvester.run()


if __name__ == "__main__":
    main()