# widgettdc-api/apps/backend/python/m365_quick_harvest.py
# Kraft102's picture
# Update backend source
# 34367da verified
#!/usr/bin/env python3
"""
🏢 TDC Microsoft 365 Quick Harvester
Brug: python m365_quick_harvest.py [access_token]
Henter emails og SharePoint docs via Microsoft Graph API
og gemmer dem i Neo4j knowledge graph.
"""
import hashlib
import json
import os
import sys
from datetime import datetime
from pathlib import Path

import requests
from neo4j import GraphDatabase
# Neo4j connection settings. Each value can be overridden through an
# environment variable of the same name.
#
# SECURITY NOTE(review): the fallback password below is committed to source
# control and must be considered leaked. Rotate it and supply the new value
# via the NEO4J_PASSWORD environment variable; the fallback is kept only so
# that existing invocations keep working until then.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get(
    "NEO4J_PASSWORD", "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
)

# Base URL of the Microsoft Graph v1.0 REST API.
GRAPH_BASE = "https://graph.microsoft.com/v1.0"

# Search terms relevant to TDC; each one is run against both the mailbox
# search and the SharePoint/OneDrive search.
SEARCH_TERMS = [
    "strategi TDC Erhverv",
    "cybersikkerhed",
    "NIS2",
    "cloud Azure",
    "AI Copilot",
    "Columbus ERP",
    "budget forecast",
    "kundeliste",
    "rammeaftale SKI",
    "produktkatalog",
    "SOC MDR",
    "IT arkitektur",
]
class M365QuickHarvester:
    """Harvest e-mails and SharePoint/OneDrive hits via Microsoft Graph into Neo4j.

    For every entry in ``SEARCH_TERMS`` the harvester searches the signed-in
    user's mailbox and the Graph search index, stores each new hit as an
    ``M365Document`` node, and finally writes a local JSON backup.

    Attributes:
        token: The raw Graph access token.
        headers: HTTP headers sent with every Graph request.
        neo4j: Neo4j driver built from the module-level connection settings.
        results: Every unique item collected so far (list of dicts).
        stats: Counters for stored e-mails, stored documents and errors.
    """

    # Seconds to wait for a Graph response; without a timeout ``requests``
    # can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, access_token: str):
        """Store the token, prepare request headers and open the Neo4j driver."""
        self.token = access_token
        self.headers = {
            "Authorization": f"Bearer {access_token}",
            "Content-Type": "application/json",
            "ConsistencyLevel": "eventual",
        }
        self.neo4j = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
        self.results = []
        self.stats = {"emails": 0, "docs": 0, "errors": 0}

    def verify_token(self) -> bool:
        """Check the token against ``/me`` and print the signed-in user.

        Returns:
            True when Graph accepts the token, False on an HTTP error or any
            exception raised while making or decoding the request.
        """
        try:
            resp = requests.get(
                f"{GRAPH_BASE}/me",
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            if not resp.ok:
                print(f"❌ Token fejl: {resp.status_code}")
                return False
            user = resp.json()
            # ``mail`` may be present but null; fall through to the UPN then.
            address = user.get("mail") or user.get("userPrincipalName") or "Unknown"
            print(f"✅ Logget ind som: {user.get('displayName') or 'Unknown'}")
            print(f" Email: {address}")
            return True
        except Exception as e:
            print(f"❌ Fejl: {e}")
            return False

    @staticmethod
    def _email_from_message(msg: dict, query: str) -> dict:
        """Convert one Graph message object into the harvester's item dict."""
        # Any of these fields may be JSON null, so ``or`` is used instead of
        # ``dict.get`` defaults (a default is not applied to an explicit null).
        sender = (msg.get("from") or {}).get("emailAddress") or {}
        return {
            "id": msg.get("id"),
            "title": msg.get("subject") or "(No subject)",
            "url": msg.get("webLink") or "",
            "summary": (msg.get("bodyPreview") or "")[:500],
            "type": "email",
            "from": sender.get("address") or "",
            "date": msg.get("receivedDateTime"),
            "query": query,
        }

    @staticmethod
    def _doc_from_hit(hit: dict, query: str) -> dict:
        """Convert one Graph search hit into the harvester's item dict."""
        resource = hit.get("resource") or {}
        return {
            "id": resource.get("id") or hit.get("hitId"),
            # Fall back to the search term when the hit has no name at all.
            "title": resource.get("name") or resource.get("displayName") or query,
            "url": resource.get("webUrl") or "",
            "summary": (hit.get("summary") or "")[:500],
            "type": "document",
            "date": resource.get("lastModifiedDateTime"),
            "query": query,
        }

    @staticmethod
    def _dedup_key(item: dict) -> str:
        """Return the key used to detect duplicates: the URL, else the id."""
        # Falling back to the id keeps distinct items that lack a URL from
        # being collapsed into a single entry.
        return item.get("url") or item.get("id") or ""

    def search_emails(self, query: str, limit: int = 20) -> list:
        """Search the signed-in user's mailbox for *query*.

        Args:
            query: Search phrase; sent to Graph as a quoted ``$search`` value.
            limit: Maximum number of messages to request (``$top``).

        Returns:
            A list of item dicts; empty on an HTTP error or exception, in
            which case ``stats["errors"]`` is incremented.
        """
        params = {
            # Escape embedded double quotes so they cannot end the quoted
            # phrase early; ``requests`` handles the URL encoding.
            "$search": '"{}"'.format(query.replace('"', '\\"')),
            "$top": limit,
            "$select": "id,subject,from,receivedDateTime,bodyPreview,webLink",
        }
        try:
            resp = requests.get(
                f"{GRAPH_BASE}/me/messages",
                headers=self.headers,
                params=params,
                timeout=self.REQUEST_TIMEOUT,
            )
            if not resp.ok:
                self.stats["errors"] += 1
                return []
            messages = resp.json().get("value") or []
            return [self._email_from_message(msg, query) for msg in messages]
        except Exception as e:
            # Best-effort boundary: one failed search must not stop the run.
            self.stats["errors"] += 1
            print(f" ⚠️ Email søgefejl: {e}")
            return []

    def search_sharepoint(self, query: str, limit: int = 20) -> list:
        """Search drive items, list items and sites for *query*.

        Args:
            query: Query string passed to the Graph ``/search/query`` endpoint.
            limit: Page size requested from the search API.

        Returns:
            A list of item dicts; empty on an HTTP error or exception, in
            which case ``stats["errors"]`` is incremented.
        """
        body = {
            "requests": [{
                "entityTypes": ["driveItem", "listItem", "site"],
                "query": {"queryString": query},
                "from": 0,
                "size": limit,
            }]
        }
        try:
            resp = requests.post(
                f"{GRAPH_BASE}/search/query",
                headers=self.headers,
                json=body,
                timeout=self.REQUEST_TIMEOUT,
            )
            if not resp.ok:
                self.stats["errors"] += 1
                return []
            docs = []
            for result_set in resp.json().get("value") or []:
                for container in result_set.get("hitsContainers") or []:
                    for hit in container.get("hits") or []:
                        docs.append(self._doc_from_hit(hit, query))
            return docs
        except Exception as e:
            # Best-effort boundary: one failed search must not stop the run.
            self.stats["errors"] += 1
            print(f" ⚠️ SharePoint søgefejl: {e}")
            return []

    def save_to_neo4j(self, item: dict):
        """Upsert *item* as an ``M365Document`` node linked to the TDC_M365 source.

        The node is keyed on an MD5 of ``title:url`` (an identifier, not a
        security measure). Properties are written only when the node is
        created; a later match just refreshes ``lastSeen``.

        Args:
            item: Item dict as produced by the search methods.
        """
        key_source = f"{item.get('title', '')}:{item.get('url', '')}"
        content_hash = hashlib.md5(key_source.encode()).hexdigest()
        # Parameters go in a dict rather than keyword arguments: the driver's
        # ``Session.run`` names its first positional parameter ``query``, so
        # passing ``query=...`` as a keyword raises TypeError.
        parameters = {
            "hash": content_hash,
            "id": item.get("id", ""),
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "summary": item.get("summary", ""),
            "type": item.get("type", ""),
            "query": item.get("query", ""),
            "date": item.get("date", ""),
        }
        with self.neo4j.session() as session:
            session.run(
                """
                MERGE (d:M365Document {contentHash: $hash})
                ON CREATE SET
                d.id = $id,
                d.title = $title,
                d.url = $url,
                d.summary = $summary,
                d.docType = $type,
                d.searchQuery = $query,
                d.date = $date,
                d.harvestedAt = datetime()
                ON MATCH SET
                d.lastSeen = datetime()
                MERGE (ds:DataSource {name: 'TDC_M365'})
                ON CREATE SET ds.type = 'enterprise_m365'
                MERGE (d)-[:HARVESTED_FROM]->(ds)
                """,
                parameters,
            )

    def _store_new(self, items: list, stat_key: str, seen: set) -> None:
        """Record and persist every item in *items* not already in *seen*."""
        for item in items:
            key = self._dedup_key(item)
            if key in seen:
                continue
            seen.add(key)
            self.results.append(item)
            try:
                self.save_to_neo4j(item)
            except Exception as e:
                # Keep harvesting so the local backup is still written even
                # when the database is unavailable.
                self.stats["errors"] += 1
                print(f" ⚠️ Neo4j fejl: {e}")
                continue
            self.stats[stat_key] += 1

    def _write_backup(self) -> Path:
        """Write stats and the first 200 results to the local JSON backup file."""
        output_file = Path("data/m365_harvest/m365_results.json")
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                "timestamp": datetime.now().isoformat(),
                "stats": self.stats,
                "results": self.results[:200],
            }, f, indent=2, ensure_ascii=False)
        return output_file

    def run(self):
        """Run the full harvest: verify token, search, store, back up, report.

        The Neo4j driver is always closed before returning, including when
        the token is rejected or an unexpected error occurs.
        """
        print("\n" + "=" * 60)
        print("🏢 TDC MICROSOFT 365 QUICK HARVESTER")
        print("=" * 60)
        try:
            if not self.verify_token():
                print("\n❌ Ugyldig token - hent ny fra Graph Explorer")
                return
            print(f"\n🔍 Søger med {len(SEARCH_TERMS)} termer...")
            # A set gives O(1) duplicate checks; seed it with prior results.
            seen = {self._dedup_key(r) for r in self.results}
            for query in SEARCH_TERMS:
                print(f"\n 📧 {query}")
                emails = self.search_emails(query, 10)
                self._store_new(emails, "emails", seen)
                docs = self.search_sharepoint(query, 10)
                self._store_new(docs, "docs", seen)
                print(f" {len(emails)} emails, {len(docs)} docs")
            output_file = self._write_backup()
            print("\n" + "=" * 60)
            print("📊 HARVEST COMPLETE")
            print("=" * 60)
            print(f" 📧 Emails: {self.stats['emails']}")
            print(f" 📄 Documents: {self.stats['docs']}")
            print(f" 📁 Saved: {output_file}")
            print("=" * 60)
        finally:
            self.neo4j.close()
# Script entry point: take the Graph access token from the first command-line
# argument, or show instructions and prompt for it when none was given.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("""
🏢 TDC M365 Quick Harvester
═══════════════════════════
Brug: python m365_quick_harvest.py [ACCESS_TOKEN]
Sådan får du token:
1. Gå til: https://developer.microsoft.com/en-us/graph/graph-explorer
2. Log ind med TDC konto (clauskraf@tdc.dk)
3. Klik "Access token" tab
4. Kopier hele token
Eller paste token nu:
""")
        try:
            token = input("Access token: ").strip()
        except (EOFError, KeyboardInterrupt):
            # No interactive stdin, or the user aborted: treat as "no token"
            # instead of ending with a traceback.
            token = ""
    else:
        # Strip like the interactive path so stray whitespace from shell
        # quoting does not end up in the Authorization header.
        token = sys.argv[1].strip()
    # An empty token can never authenticate, whichever way it was supplied.
    if not token:
        sys.exit(1)
    harvester = M365QuickHarvester(token)
    harvester.run()