# apps/backend/python/m365_harvester.py
#!/usr/bin/env python3
"""
🏢 TDC Microsoft 365 Harvester via Graph API
Henter: SharePoint, OneDrive, Teams, Outlook data
"""
import os
import json
import hashlib
import webbrowser
from pathlib import Path
from datetime import datetime
from neo4j import GraphDatabase
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Microsoft Graph
import msal
import requests
class TDCMicrosoft365Harvester:
    """Microsoft 365 data harvester via Graph API.

    Runs Graph Search queries over SharePoint content, enumerates
    SharePoint sites and the user's OneDrive root, stores document
    metadata in Neo4j and dumps a local JSON summary.
    """
    # Neo4j connection settings — overridable via environment, with
    # local-development defaults.
    NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
    NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")
    # Microsoft Graph endpoints
    GRAPH_BASE = "https://graph.microsoft.com/v1.0"
    # Search terms for TDC internal data, grouped by topic.
    SEARCH_QUERIES = [
        # Strategy
        "strategi TDC Erhverv",
        "roadmap 2025",
        "forretningsplan",
        # Cyber security
        "cybersikkerhed",
        "SOC",
        "NIS2",
        "incident response",
        # Cloud
        "cloud strategi",
        "Azure migration",
        # AI
        "AI strategi",
        "Copilot",
        "GPT",
        # IT systems
        "IT arkitektur",
        "Columbus ERP",
        "systemlandskab",
        # Finance
        "budget 2025",
        "finanstal",
        "forecast",
        # Customers / contracts
        "kundeliste",
        "rammeaftale",
        "SKI aftale",
        # Products
        "produktkatalog",
        "CloudKey",
        "prisliste"
    ]
def __init__(self):
self.output_dir = Path("data/m365_harvest")
self.output_dir.mkdir(parents=True, exist_ok=True)
self.access_token = None
self.documents = []
self.stats = {"searches": 0, "documents": 0, "sites": 0}
# Neo4j
self.neo4j = GraphDatabase.driver(
self.NEO4J_URI,
auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
)
print("🏢 TDC Microsoft 365 Harvester")
print("=" * 50)
    def authenticate_interactive(self):
        """Acquire a Graph access token.

        Order of preference: MS_GRAPH_ACCESS_TOKEN environment variable,
        a token pasted interactively, or ('skip') a previously cached
        token file under the output directory.

        Returns:
            True once a token passes verification against Graph /me,
            False otherwise.
        """
        print("\n🔐 MICROSOFT 365 AUTHENTICATION")
        print("-" * 40)
        # Prefer a token supplied via the environment.
        env_token = os.getenv("MS_GRAPH_ACCESS_TOKEN")
        if env_token:
            print(" ✅ Fandt MS_GRAPH_ACCESS_TOKEN i miljøet")
            self.access_token = env_token
            # Verify before trusting it; fall back to interactive on failure.
            if self._verify_token():
                return True
            else:
                print(" ⚠️ Environment token er ugyldigt/udløbet. Falder tilbage til interaktiv.")
        print("""
For at tilgå TDC SharePoint, skal du:
1. Gå til: https://developer.microsoft.com/en-us/graph/graph-explorer
2. Log ind med din TDC konto (clauskraf@tdc.dk)
3. Klik på "Access token" tab
4. Kopier hele access token
Alternativt brug Azure AD app registration.
""")
        token = input("\nPaste access token her (eller 'skip' for at bruge cached): ").strip()
        if token.lower() == 'skip':
            # Try to reuse a previously cached token.
            token_file = self.output_dir / "access_token.txt"
            if token_file.exists():
                token = token_file.read_text().strip()
                print(" ✅ Bruger cached token")
            else:
                print(" ❌ Ingen cached token fundet")
                return False
        else:
            # Cache the pasted token so future runs can use 'skip'.
            # NOTE(review): token is stored in plaintext and before
            # verification — consider verifying first and restricting
            # file permissions.
            token_file = self.output_dir / "access_token.txt"
            token_file.write_text(token)
        self.access_token = token
        return self._verify_token()
def _verify_token(self):
"""Helper to verify current token"""
try:
headers = {"Authorization": f"Bearer {self.access_token}"}
response = requests.get(f"{self.GRAPH_BASE}/me", headers=headers)
if response.status_code == 200:
user = response.json()
print(f" ✅ Logget ind som: {user.get('displayName', 'Unknown')}")
print(f" Email: {user.get('mail', 'Unknown')}")
return True
else:
print(f" ❌ Token fejl: {response.status_code}")
# print(f" {response.text[:200]}") # Støjsvagt
return False
except Exception as e:
print(f" ❌ Connection error under verify: {e}")
return False
def search_sharepoint(self, query: str) -> list:
"""Søg i SharePoint via Graph API"""
results = []
headers = {
"Authorization": f"Bearer {self.access_token}",
"Content-Type": "application/json"
}
# Graph Search API
search_body = {
"requests": [{
"entityTypes": ["driveItem", "listItem", "site"],
"query": {
"queryString": query
},
"from": 0,
"size": 25
}]
}
try:
response = requests.post(
f"{self.GRAPH_BASE}/search/query",
headers=headers,
json=search_body
)
if response.status_code == 200:
data = response.json()
for result_set in data.get("value", []):
for hit in result_set.get("hitsContainers", []):
for item in hit.get("hits", []):
resource = item.get("resource", {})
doc = {
"id": resource.get("id", ""),
"title": resource.get("name", "") or resource.get("displayName", ""),
"url": resource.get("webUrl", ""),
"summary": item.get("summary", "")[:500],
"doc_type": resource.get("@odata.type", "").replace("#microsoft.graph.", ""),
"created": resource.get("createdDateTime", ""),
"modified": resource.get("lastModifiedDateTime", ""),
"search_query": query,
"source": "graph_search"
}
if doc["title"] and doc["url"]:
results.append(doc)
else:
print(f" ⚠️ Search fejl: {response.status_code}")
except Exception as e:
print(f" ❌ Error: {e}")
return results
def get_sharepoint_sites(self) -> list:
"""Hent alle tilgængelige SharePoint sites"""
sites = []
headers = {"Authorization": f"Bearer {self.access_token}"}
try:
# Søg efter sites
response = requests.get(
f"{self.GRAPH_BASE}/sites?search=*",
headers=headers
)
if response.status_code == 200:
for site in response.json().get("value", []):
sites.append({
"id": site.get("id", ""),
"name": site.get("displayName", ""),
"url": site.get("webUrl", ""),
"description": site.get("description", "")
})
self.stats["sites"] += 1
except Exception as e:
print(f" ❌ Sites fejl: {e}")
return sites
def get_site_documents(self, site_id: str) -> list:
"""Hent dokumenter fra en specifik site"""
documents = []
headers = {"Authorization": f"Bearer {self.access_token}"}
try:
# Hent document libraries
response = requests.get(
f"{self.GRAPH_BASE}/sites/{site_id}/drives",
headers=headers
)
if response.status_code == 200:
for drive in response.json().get("value", []):
drive_id = drive.get("id")
# Hent filer fra drive
files_response = requests.get(
f"{self.GRAPH_BASE}/drives/{drive_id}/root/children",
headers=headers
)
if files_response.status_code == 200:
for item in files_response.json().get("value", []):
if item.get("file"): # Det er en fil
documents.append({
"id": item.get("id", ""),
"title": item.get("name", ""),
"url": item.get("webUrl", ""),
"size": item.get("size", 0),
"doc_type": item.get("file", {}).get("mimeType", ""),
"created": item.get("createdDateTime", ""),
"modified": item.get("lastModifiedDateTime", ""),
"source": f"site:{site_id}"
})
except Exception as e:
print(f" ⚠️ Docs fejl: {e}")
return documents
def get_my_files(self) -> list:
"""Hent brugerens OneDrive filer"""
files = []
headers = {"Authorization": f"Bearer {self.access_token}"}
try:
response = requests.get(
f"{self.GRAPH_BASE}/me/drive/root/children",
headers=headers
)
if response.status_code == 200:
for item in response.json().get("value", []):
files.append({
"id": item.get("id", ""),
"title": item.get("name", ""),
"url": item.get("webUrl", ""),
"type": "folder" if item.get("folder") else "file",
"size": item.get("size", 0),
"source": "onedrive"
})
except Exception as e:
print(f" ❌ OneDrive fejl: {e}")
return files
def save_to_neo4j(self, doc: dict):
"""Gem dokument i Neo4j"""
content_hash = hashlib.md5(f"{doc.get('title','')}:{doc.get('url','')}".encode()).hexdigest()
with self.neo4j.session() as session:
session.run("""
MERGE (d:M365Document {contentHash: $hash})
ON CREATE SET
d.id = $id,
d.title = $title,
d.url = $url,
d.summary = $summary,
d.docType = $doc_type,
d.searchQuery = $search_query,
d.source = $source,
d.harvestedAt = datetime()
ON MATCH SET
d.lastSeen = datetime()
MERGE (ds:DataSource {name: 'TDC_Microsoft365'})
ON CREATE SET ds.type = 'enterprise_cloud'
MERGE (d)-[:HARVESTED_FROM]->(ds)
""",
hash=content_hash,
id=doc.get('id', ''),
title=doc.get('title', ''),
url=doc.get('url', ''),
summary=doc.get('summary', '')[:1000],
doc_type=doc.get('doc_type', ''),
search_query=doc.get('search_query', ''),
source=doc.get('source', '')
)
self.stats["documents"] += 1
    def run(self):
        """Run the full harvest: authenticate, enumerate sites, search
        documents, list OneDrive, then write a summary JSON and close
        the Neo4j driver."""
        print("\n" + "=" * 60)
        print("🏢 TDC MICROSOFT 365 HARVESTER")
        print("=" * 60)
        # Authenticate — abort the whole run on failure.
        if not self.authenticate_interactive():
            print("\n❌ Authentication fejlede - afslutter")
            return
        # 1. Fetch SharePoint sites
        print("\n📍 SHAREPOINT SITES")
        print("-" * 40)
        sites = self.get_sharepoint_sites()
        print(f" Found {len(sites)} sites")
        for site in sites[:10]:
            print(f" • {site['name']}")
        # 2. Search for relevant documents
        print("\n🔍 SEARCHING DOCUMENTS")
        print("-" * 40)
        all_results = []
        for query in self.SEARCH_QUERIES:
            print(f" Søger: {query}")
            results = self.search_sharepoint(query)
            self.stats["searches"] += 1
            for doc in results:
                # Skip duplicates, keyed on URL.
                if not any(d['url'] == doc['url'] for d in all_results):
                    all_results.append(doc)
                    self.save_to_neo4j(doc)
            print(f" Found {len(results)} results")
        # 3. Fetch OneDrive files
        print("\n📁 ONEDRIVE FILES")
        print("-" * 40)
        my_files = self.get_my_files()
        print(f" Found {len(my_files)} items")
        # 4. Summary
        print("\n" + "=" * 60)
        print("📊 HARVEST COMPLETE")
        print("=" * 60)
        print(f" 🔍 Searches performed: {self.stats['searches']}")
        print(f" 📍 Sites found: {self.stats['sites']}")
        print(f" 📄 Documents imported: {self.stats['documents']}")
        print("=" * 60)
        # Save local JSON (first 100 documents only).
        output_file = self.output_dir / "m365_harvest.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                "timestamp": datetime.now().isoformat(),
                "stats": self.stats,
                "sites": sites,
                "documents": all_results[:100],
                "onedrive": my_files
            }, f, indent=2, ensure_ascii=False)
        print(f"\n📁 Results saved: {output_file}")
        self.neo4j.close()
if __name__ == "__main__":
    # Script entry point: build the harvester and run the full pipeline.
    TDCMicrosoft365Harvester().run()