Spaces:
Paused
Paused
| #!/usr/bin/env python3 | |
| """ | |
| 🏢 TDC Microsoft 365 Quick Harvester | |
| Brug: python m365_quick_harvest.py [access_token] | |
| Henter emails og SharePoint docs via Microsoft Graph API | |
| og gemmer dem i Neo4j knowledge graph. | |
| """ | |
import hashlib
import json
import os
import sys
from datetime import datetime
from pathlib import Path

import requests
from neo4j import GraphDatabase
# Neo4j connection settings.
#
# Each value can be overridden through an environment variable of the same
# name, so credentials no longer have to live in the source file. The literals
# below are kept only as backward-compatible defaults.
#
# SECURITY: the default password has been committed in plain text and must be
# treated as compromised -- rotate it and supply the new value through the
# NEO4J_PASSWORD environment variable, then remove the default.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get(
    "NEO4J_PASSWORD", "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
)

# Base URL of the Microsoft Graph v1.0 REST endpoint.
GRAPH_BASE = "https://graph.microsoft.com/v1.0"

# TDC-relevant search terms; each one is run against both mail and
# SharePoint/OneDrive search.
SEARCH_TERMS = [
    "strategi TDC Erhverv",
    "cybersikkerhed",
    "NIS2",
    "cloud Azure",
    "AI Copilot",
    "Columbus ERP",
    "budget forecast",
    "kundeliste",
    "rammeaftale SKI",
    "produktkatalog",
    "SOC MDR",
    "IT arkitektur",
]
class M365QuickHarvester:
    """Harvest e-mails and SharePoint/OneDrive hits via Microsoft Graph into Neo4j.

    For every term in ``SEARCH_TERMS`` the harvester searches the signed-in
    user's messages and the Graph search endpoint, stores each new item as an
    ``M365Document`` node in Neo4j and finally writes a local JSON backup.

    Attributes:
        token: The bearer token passed to the constructor.
        headers: HTTP headers sent with every Graph request.
        neo4j: The Neo4j driver; closed at the end of ``run``.
        results: All unique items collected so far (dicts).
        stats: Counters for stored e-mails, stored documents and failed
            search requests.
    """

    # Seconds to wait for Microsoft Graph before giving up on a request.
    # Without a timeout ``requests`` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, access_token: str):
        """Store the token, build request headers and open the Neo4j driver.

        Args:
            access_token: Microsoft Graph OAuth bearer token.
        """
        self.token = access_token
        self.headers = {
            "Authorization": f"Bearer {access_token}",
            "Content-Type": "application/json",
            "ConsistencyLevel": "eventual",
        }
        self.neo4j = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
        self.results = []
        self.stats = {"emails": 0, "docs": 0, "errors": 0}

    def verify_token(self) -> bool:
        """Check the token against ``/me`` and print the signed-in user.

        Returns:
            True when Graph answers with a success status, False on an HTTP
            error status, a transport error or an undecodable response body.
        """
        try:
            resp = requests.get(
                f"{GRAPH_BASE}/me",
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            if not resp.ok:
                print(f"❌ Token fejl: {resp.status_code}")
                return False
            user = resp.json()
        except (requests.RequestException, ValueError) as e:
            print(f"❌ Fejl: {e}")
            return False

        # ``mail`` may be present but null, so fall through with ``or``
        # instead of relying on dict.get defaults.
        address = user.get("mail") or user.get("userPrincipalName") or "Unknown"
        print(f"✅ Logget ind som: {user.get('displayName') or 'Unknown'}")
        print(f" Email: {address}")
        return True

    def search_emails(self, query: str, limit: int = 20) -> list:
        """Search the signed-in user's messages for ``query``.

        Args:
            query: Search phrase; sent as a quoted ``$search`` value.
            limit: Maximum number of messages to request (``$top``).

        Returns:
            A list of item dicts (see ``_parse_messages``); empty when the
            request fails. Failures increment ``stats["errors"]``.
        """
        # Passing the query through ``params`` lets requests URL-encode it;
        # interpolating it into the URL breaks on '&', '#', non-ASCII, etc.
        params = {
            "$search": f'"{query}"',
            "$top": limit,
            "$select": "id,subject,from,receivedDateTime,bodyPreview,webLink",
        }
        try:
            resp = requests.get(
                f"{GRAPH_BASE}/me/messages",
                headers=self.headers,
                params=params,
                timeout=self.REQUEST_TIMEOUT,
            )
            if not resp.ok:
                self.stats["errors"] += 1
                print(f" ⚠️ Email søgefejl: HTTP {resp.status_code}")
                return []
            return self._parse_messages(resp.json(), query)
        except (requests.RequestException, ValueError) as e:
            self.stats["errors"] += 1
            print(f" ⚠️ Email søgefejl: {e}")
            return []

    def search_sharepoint(self, query: str, limit: int = 20) -> list:
        """Search drive items, list items and sites for ``query``.

        Args:
            query: Query string sent to the Graph ``/search/query`` endpoint.
            limit: Page size requested from the search endpoint.

        Returns:
            A list of item dicts (see ``_parse_hits``); empty when the request
            fails. Failures increment ``stats["errors"]``.
        """
        body = {
            "requests": [{
                "entityTypes": ["driveItem", "listItem", "site"],
                "query": {"queryString": query},
                "from": 0,
                "size": limit,
            }]
        }
        try:
            resp = requests.post(
                f"{GRAPH_BASE}/search/query",
                headers=self.headers,
                json=body,
                timeout=self.REQUEST_TIMEOUT,
            )
            if not resp.ok:
                self.stats["errors"] += 1
                print(f" ⚠️ SharePoint søgefejl: HTTP {resp.status_code}")
                return []
            return self._parse_hits(resp.json(), query)
        except (requests.RequestException, ValueError) as e:
            self.stats["errors"] += 1
            print(f" ⚠️ SharePoint søgefejl: {e}")
            return []

    @staticmethod
    def _parse_messages(payload: dict, query: str) -> list:
        """Convert a ``/me/messages`` response body into item dicts.

        Fields that are missing *or* null in the response are replaced by
        their defaults, so one odd message cannot discard the whole page.
        """
        emails = []
        for msg in payload.get("value") or []:
            sender = (msg.get("from") or {}).get("emailAddress") or {}
            emails.append({
                "id": msg.get("id"),
                "title": msg.get("subject") or "(No subject)",
                "url": msg.get("webLink") or "",
                "summary": (msg.get("bodyPreview") or "")[:500],
                "type": "email",
                "from": sender.get("address") or "",
                "date": msg.get("receivedDateTime"),
                "query": query,
            })
        return emails

    @staticmethod
    def _parse_hits(payload: dict, query: str) -> list:
        """Convert a ``/search/query`` response body into item dicts.

        Walks ``value -> hitsContainers -> hits``; any level that is missing
        or null is treated as empty. The title falls back to the query when
        the resource has neither ``name`` nor ``displayName``.
        """
        docs = []
        for result_set in payload.get("value") or []:
            for container in result_set.get("hitsContainers") or []:
                for hit in container.get("hits") or []:
                    resource = hit.get("resource") or {}
                    docs.append({
                        "id": resource.get("id") or hit.get("hitId"),
                        "title": resource.get("name") or resource.get("displayName") or query,
                        "url": resource.get("webUrl") or "",
                        "summary": (hit.get("summary") or "")[:500],
                        "type": "document",
                        "date": resource.get("lastModifiedDateTime"),
                        "query": query,
                    })
        return docs

    @staticmethod
    def _dedupe_key(item: dict) -> str:
        """Return the key used to detect duplicates within one run.

        The URL is used when present; items without a URL fall back to
        ``type:id`` so they are not all treated as the same item.
        """
        return item.get("url") or f"{item.get('type', '')}:{item.get('id', '')}"

    def save_to_neo4j(self, item: dict):
        """Upsert one harvested item as an ``M365Document`` node.

        The node is keyed on an MD5 of ``title:url``. Properties are written
        only when the node is created; on a match only ``lastSeen`` is
        refreshed. The node is linked to the ``TDC_M365`` ``DataSource`` node.

        Args:
            item: Item dict as produced by the search methods; ``title`` and
                ``url`` are required.
        """
        # MD5 is used purely as a stable identity key, not for security.
        # Changing the algorithm would orphan nodes stored by earlier runs.
        content_hash = hashlib.md5(f"{item['title']}:{item['url']}".encode()).hexdigest()
        with self.neo4j.session() as session:
            session.run(
                """
                MERGE (d:M365Document {contentHash: $hash})
                ON CREATE SET
                    d.id = $id,
                    d.title = $title,
                    d.url = $url,
                    d.summary = $summary,
                    d.docType = $type,
                    d.searchQuery = $query,
                    d.date = $date,
                    d.harvestedAt = datetime()
                ON MATCH SET
                    d.lastSeen = datetime()
                MERGE (ds:DataSource {name: 'TDC_M365'})
                ON CREATE SET ds.type = 'enterprise_m365'
                MERGE (d)-[:HARVESTED_FROM]->(ds)
                """,
                hash=content_hash,
                id=item.get("id", ""),
                title=item.get("title", ""),
                url=item.get("url", ""),
                summary=item.get("summary", ""),
                type=item.get("type", ""),
                query=item.get("query", ""),
                date=item.get("date", ""),
            )

    def _collect(self, items: list, stat_key: str, seen: set) -> None:
        """Store every not-yet-seen item and bump ``stats[stat_key]`` for each."""
        for item in items:
            key = self._dedupe_key(item)
            if key in seen:
                continue
            seen.add(key)
            self.results.append(item)
            self.save_to_neo4j(item)
            self.stats[stat_key] += 1

    def _write_backup(self) -> Path:
        """Write stats and the first 200 results to the local JSON backup file."""
        output_file = Path("data/m365_harvest/m365_results.json")
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                "timestamp": datetime.now().isoformat(),
                "stats": self.stats,
                "results": self.results[:200],
            }, f, indent=2, ensure_ascii=False)
        return output_file

    def run(self):
        """Run the full harvest: verify token, search, store, back up, report.

        The Neo4j driver is closed when this method returns or raises, so the
        instance is single-use.
        """
        try:
            print("\n" + "=" * 60)
            print("🏢 TDC MICROSOFT 365 QUICK HARVESTER")
            print("=" * 60)
            if not self.verify_token():
                print("\n❌ Ugyldig token - hent ny fra Graph Explorer")
                return

            print(f"\n🔍 Søger med {len(SEARCH_TERMS)} termer...")
            # Set lookup keeps de-duplication O(1) per item instead of
            # rescanning the whole result list for every hit.
            seen = {self._dedupe_key(r) for r in self.results}
            for query in SEARCH_TERMS:
                print(f"\n 📧 {query}")
                emails = self.search_emails(query, 10)
                self._collect(emails, "emails", seen)
                docs = self.search_sharepoint(query, 10)
                self._collect(docs, "docs", seen)
                print(f" {len(emails)} emails, {len(docs)} docs")

            # Save a local backup
            output_file = self._write_backup()

            print("\n" + "=" * 60)
            print("📊 HARVEST COMPLETE")
            print("=" * 60)
            print(f" 📧 Emails: {self.stats['emails']}")
            print(f" 📄 Documents: {self.stats['docs']}")
            print(f" 📁 Saved: {output_file}")
            print("=" * 60)
        finally:
            # Always release the driver, including on an invalid token or an
            # exception part-way through the harvest.
            self.neo4j.close()
if __name__ == "__main__":
    # Entry point: take the access token from the first CLI argument, or show
    # the usage text and prompt for it when no argument was given.
    cli_args = sys.argv[1:]
    if cli_args:
        token = cli_args[0]
    else:
        print("""
🏢 TDC M365 Quick Harvester
═══════════════════════════
Brug: python m365_quick_harvest.py [ACCESS_TOKEN]
Sådan får du token:
1. Gå til: https://developer.microsoft.com/en-us/graph/graph-explorer
2. Log ind med TDC konto (clauskraf@tdc.dk)
3. Klik "Access token" tab
4. Kopier hele token
Eller paste token nu:
""")
        token = input("Access token: ").strip()
        # An empty answer means there is nothing to authenticate with.
        if not token:
            sys.exit(1)

    M365QuickHarvester(token).run()