Spaces:
Paused
Paused
| #!/usr/bin/env python3 | |
| """ | |
| 🚀 EWS AUTODISCOVER HARVESTER - Step 2 | |
| Automatisk endpoint discovery + OAuth2 Modern Auth | |
| Features: | |
| - Autodiscover Exchange endpoint fra email domain | |
| - OAuth2 med MSAL (Modern Authentication) | |
| - Full mailbox access: emails, calendar, contacts | |
| - Batch operations for effektiv harvest | |
| - Neo4j Knowledge Graph integration | |
| """ | |
import asyncio
import hashlib
import json
import os
import re
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Optional, Dict, List, Any
from dataclasses import dataclass, asdict
import xml.etree.ElementTree as ET
from xml.sax.saxutils import escape, quoteattr

import httpx
from neo4j import GraphDatabase
| # ============================================================ | |
| # CONFIGURATION | |
| # ============================================================ | |
# Neo4j connection settings. They are read from the environment so that
# credentials do not live in source control; export NEO4J_PASSWORD (and
# optionally NEO4J_URI / NEO4J_USER) before running the harvester.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
# SECURITY: a real password used to be hard-coded on this line. Treat it as
# compromised and rotate it; there is deliberately no default value here.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")

# Azure AD app registration (created in the Azure Portal).
# For TDC: reuse an existing app or create a new one.
AZURE_CLIENT_ID = os.environ.get("AZURE_CLIENT_ID", "")  # Azure App (client) ID
AZURE_TENANT_ID = os.environ.get("AZURE_TENANT_ID", "")  # TDC tenant ID
AZURE_CLIENT_SECRET = os.environ.get("AZURE_CLIENT_SECRET", "")  # Optional: app-only auth

# Known EWS endpoints; autodiscover normally finds the right one and the first
# entry is used as the fallback.
EWS_ENDPOINTS = [
    "https://outlook.office365.com/EWS/Exchange.asmx",
    "https://outlook.office.com/EWS/Exchange.asmx",
]

# Search terms matched (case-insensitively) against e-mail subjects.
SEARCH_KEYWORDS = [
    "strategi", "cyber", "NIS2", "SOC", "MDR", "cloud", "Azure", "AI",
    "Copilot", "Columbus", "ERP", "budget", "forecast", "kunde", "kontrakt",
    "rammeaftale", "SKI", "produkt", "CloudKey", "arkitektur", "roadmap",
]
| # ============================================================ | |
| # DATA CLASSES | |
| # ============================================================ | |
@dataclass
class EmailItem:
    """A single e-mail message harvested from Exchange.

    Declared as a dataclass so it can be constructed with keyword arguments
    and serialised with ``dataclasses.asdict``.
    """

    id: str  # value of the EWS ItemId "Id" attribute
    subject: str
    sender: str  # sender e-mail address
    sender_name: str  # sender display name
    received: str  # DateTimeReceived text as returned by EWS
    body_preview: str  # tag-stripped body text; empty until fetched separately
    folder: str  # folder the message was found in
    keywords: List[str]  # search keywords that matched the subject
    has_attachments: bool
    attachments: List[Dict]
    importance: str
    categories: str  # comma-joined category names
    conversation_id: str = ""
    internet_message_id: str = ""
@dataclass
class CalendarItem:
    """A calendar event harvested from Exchange.

    Declared as a dataclass so it can be constructed with keyword arguments
    and serialised with ``dataclasses.asdict``.
    """

    id: str  # value of the EWS ItemId "Id" attribute
    subject: str
    organizer: str  # organizer e-mail address
    start: str  # start timestamp text as returned by EWS
    end: str  # end timestamp text as returned by EWS
    location: str
    attendees: List[str]
    body_preview: str
    is_recurring: bool
@dataclass
class ContactItem:
    """A contact record.

    Declared as a dataclass for consistency with ``EmailItem`` and
    ``CalendarItem`` (keyword construction, ``dataclasses.asdict``).
    """

    id: str
    display_name: str
    email: str
    company: str
    job_title: str
    department: str
    phone: str
| # ============================================================ | |
| # AUTODISCOVER | |
| # ============================================================ | |
class EWSAutodiscover:
    """Locate the Exchange Web Services (EWS) endpoint for a mailbox.

    A list of autodiscover URLs derived from the e-mail domain is probed in
    order; if none of them yields a URL, the first entry of the module-level
    ``EWS_ENDPOINTS`` list is used as a fallback.
    """

    def __init__(self, email: str):
        """Store the mailbox address and derive its domain.

        Args:
            email: mailbox address of the form ``local@domain``.

        Raises:
            ValueError: if ``email`` has no non-empty domain part after ``@``.
        """
        # rpartition keeps everything after the LAST "@" as the domain.
        _, separator, domain = email.rpartition("@")
        if not separator or not domain:
            raise ValueError(f"Invalid email address: {email!r}")
        self.email = email
        self.domain = domain
        self.ews_url: Optional[str] = None

    async def discover(self) -> Optional[str]:
        """Probe the autodiscover URLs and return the EWS endpoint.

        Returns:
            The discovered EWS URL, or ``EWS_ENDPOINTS[0]`` when every probe
            fails. The result is also stored in ``self.ews_url``.
        """
        print(f"🔍 Autodiscover for {self.domain}...")

        # Candidate autodiscover URLs, tried in order.
        autodiscover_urls = [
            f"https://autodiscover.{self.domain}/autodiscover/autodiscover.xml",
            f"https://{self.domain}/autodiscover/autodiscover.xml",
            f"https://autodiscover.{self.domain}/Autodiscover/Autodiscover.xml",
            "https://autodiscover-s.outlook.com/autodiscover/autodiscover.xml",
        ]

        # The address is XML-escaped so characters such as "&" or "<" cannot
        # break (or inject into) the request document.
        autodiscover_payload = (
            '<?xml version="1.0" encoding="utf-8"?>\n'
            '<Autodiscover xmlns="http://schemas.microsoft.com/exchange/autodiscover/outlook/requestschema/2006">\n'
            "  <Request>\n"
            f"    <EMailAddress>{escape(self.email)}</EMailAddress>\n"
            "    <AcceptableResponseSchema>http://schemas.microsoft.com/exchange/autodiscover/outlook/responseschema/2006a</AcceptableResponseSchema>\n"
            "  </Request>\n"
            "</Autodiscover>"
        )

        async with httpx.AsyncClient(timeout=30) as client:
            for url in autodiscover_urls:
                try:
                    print(f" Prøver: {url}")
                    response = await client.post(
                        url,
                        content=autodiscover_payload,
                        headers={"Content-Type": "text/xml"},
                    )
                    if response.status_code == 200:
                        ews_url = self._parse_autodiscover_response(response.text)
                        if ews_url:
                            print(f" ✅ Fundet: {ews_url}")
                            self.ews_url = ews_url
                            return ews_url
                except Exception as exc:
                    # Probing is best-effort: report why this candidate
                    # failed and move on to the next one.
                    print(f" ⚠️ {type(exc).__name__}: {exc}")
                    continue

        # Fall back to the default Office 365 endpoint.
        print(" ⚠️ Autodiscover failed, bruger Office 365 default")
        self.ews_url = EWS_ENDPOINTS[0]
        return self.ews_url

    def _parse_autodiscover_response(self, xml_text: str) -> Optional[str]:
        """Extract the EWS URL from an autodiscover XML response.

        Returns:
            The stripped text of the first ``EwsUrl``/``ASUrl`` element that
            has non-empty text, or ``None`` if the document cannot be parsed
            or contains no such element.
        """
        try:
            root = ET.fromstring(xml_text)
        except ET.ParseError:
            return None

        for elem in root.iter():
            if "EwsUrl" in elem.tag or "ASUrl" in elem.tag:
                url = (elem.text or "").strip()
                if url:  # skip empty matches instead of returning None early
                    return url
        return None
| # ============================================================ | |
| # OAUTH2 AUTHENTICATION | |
| # ============================================================ | |
class OAuth2Auth:
    """Acquire OAuth2 access tokens for EWS through MSAL.

    ``msal`` is imported lazily so the module can be loaded without it; a
    hint is printed and ``ImportError`` re-raised when it is missing.
    """

    # OAuth2 scope requested for EWS access.
    SCOPES = ("https://outlook.office365.com/EWS.AccessAsUser.All",)

    def __init__(self, client_id: str, tenant_id: str, client_secret: str = ""):
        """Store the app registration details; no network access happens here."""
        self.client_id = client_id
        self.tenant_id = tenant_id
        self.client_secret = client_secret
        self.access_token: Optional[str] = None
        self.token_expires: Optional[datetime] = None

    def _build_app(self):
        """Create the MSAL public client application for this tenant.

        Raises:
            ImportError: if the ``msal`` package is not installed.
        """
        try:
            import msal
        except ImportError:
            print("⚠️ MSAL ikke installeret. Kør: pip install msal")
            raise
        return msal.PublicClientApplication(
            self.client_id,
            authority=f"https://login.microsoftonline.com/{self.tenant_id}",
        )

    def _store_token(self, result: Dict) -> str:
        """Validate an MSAL result, remember the token and return it.

        Raises:
            Exception: if ``result`` carries no ``access_token``.
        """
        if "access_token" not in result:
            raise Exception(f"Token error: {result.get('error_description', 'Unknown')}")
        self.access_token = result["access_token"]
        expires_in = result.get("expires_in")
        if expires_in is not None:
            # Record the expiry so callers can tell when a refresh is due.
            self.token_expires = datetime.now() + timedelta(seconds=int(expires_in))
        return self.access_token

    async def get_token_interactive(self) -> str:
        """Get a token via the token cache or an interactive browser login.

        NOTE: the MSAL calls are synchronous and run inline in this coroutine.

        Raises:
            ImportError: if ``msal`` is not installed.
            Exception: if the login does not yield an access token.
        """
        app = self._build_app()
        scopes = list(self.SCOPES)

        # Try a silent (cached) token first.
        accounts = app.get_accounts()
        if accounts:
            result = app.acquire_token_silent(scopes, account=accounts[0])
            if result and "access_token" in result:
                return self._store_token(result)

        # Interactive login.
        print("🔐 Åbner browser for login...")
        result = app.acquire_token_interactive(scopes=scopes)
        token = self._store_token(result)
        print(" ✅ Token hentet!")
        return token

    async def get_token_device_flow(self) -> str:
        """Get a token via the device code flow (works headless).

        NOTE: the MSAL calls are synchronous and run inline in this coroutine.

        Raises:
            ImportError: if ``msal`` is not installed.
            Exception: if the flow cannot be started or yields no token.
        """
        app = self._build_app()
        scopes = list(self.SCOPES)

        # Start the device flow.
        flow = app.initiate_device_flow(scopes=scopes)
        if "user_code" not in flow:
            raise Exception("Device flow failed")

        print("\n" + "=" * 60)
        print("🔐 DEVICE CODE LOGIN")
        print("=" * 60)
        print(f" Gå til: {flow['verification_uri']}")
        print(f" Indtast kode: {flow['user_code']}")
        print("=" * 60 + "\n")

        result = app.acquire_token_by_device_flow(flow)
        token = self._store_token(result)
        print(" ✅ Token hentet!")
        return token
| # ============================================================ | |
| # EWS CLIENT | |
| # ============================================================ | |
class EWSClient:
    """Minimal asynchronous Exchange Web Services (EWS) SOAP client.

    Requests are hand-built SOAP envelopes posted with ``httpx`` using a
    bearer token; responses are parsed with ``xml.etree.ElementTree``.
    Call ``close()`` when done to release the HTTP client.
    """

    SOAP_NS = {
        "soap": "http://schemas.xmlsoap.org/soap/envelope/",
        "t": "http://schemas.microsoft.com/exchange/services/2006/types",
        "m": "http://schemas.microsoft.com/exchange/services/2006/messages",
    }

    def __init__(self, ews_url: str, access_token: str):
        """Remember endpoint and token and open an HTTP client (60 s timeout)."""
        self.ews_url = ews_url
        self.access_token = access_token
        self.client = httpx.AsyncClient(timeout=60)

    @staticmethod
    def _local_name(tag: str) -> str:
        """Return ``tag`` without the ``{namespace}`` prefix ElementTree adds."""
        return tag.rsplit("}", 1)[-1]

    @classmethod
    def _types_tag(cls, name: str) -> str:
        """Return the fully qualified ElementTree tag for a ``t:`` element."""
        return "{%s}%s" % (cls.SOAP_NS["t"], name)

    def _headers(self) -> Dict:
        """HTTP headers sent with every SOAP request."""
        return {
            "Authorization": f"Bearer {self.access_token}",
            "Content-Type": "text/xml; charset=utf-8",
        }

    async def _soap_request(self, body: str) -> str:
        """Wrap ``body`` in a SOAP envelope, post it and return the response text.

        Raises:
            Exception: if the HTTP status is not 200 (the message carries the
                status and the first 500 characters of the response).
        """
        envelope = f"""<?xml version="1.0" encoding="utf-8"?>
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"
               xmlns:t="http://schemas.microsoft.com/exchange/services/2006/types"
               xmlns:m="http://schemas.microsoft.com/exchange/services/2006/messages">
  <soap:Header>
    <t:RequestServerVersion Version="Exchange2016"/>
  </soap:Header>
  <soap:Body>
    {body}
  </soap:Body>
</soap:Envelope>"""
        response = await self.client.post(
            self.ews_url,
            content=envelope,
            headers=self._headers(),
        )
        if response.status_code != 200:
            raise Exception(f"EWS Error {response.status_code}: {response.text[:500]}")
        return response.text

    def _parse_folder_id(self, xml_text: str) -> Optional[str]:
        """Return the ``Id`` of the first ``t:FolderId`` element, or ``None``."""
        try:
            root = ET.fromstring(xml_text)
        except ET.ParseError:
            return None
        for elem in root.iter(self._types_tag("FolderId")):
            return elem.get("Id")
        return None

    async def get_folder_id(self, folder_name: str = "inbox") -> Optional[str]:
        """Fetch the folder ID of a well-known folder.

        Args:
            folder_name: friendly name (``inbox``, ``sent``, ``drafts``,
                ``deleted``, ``calendar``, ``contacts``); any other value is
                passed through unchanged as the distinguished folder id.

        Returns:
            The folder ``Id`` or ``None`` if the response has none.
        """
        folder_map = {
            "inbox": "inbox",
            "sent": "sentitems",
            "drafts": "drafts",
            "deleted": "deleteditems",
            "calendar": "calendar",
            "contacts": "contacts",
        }
        distinguished_folder = folder_map.get(folder_name.lower(), folder_name)
        body = f"""<m:GetFolder>
      <m:FolderShape>
        <t:BaseShape>IdOnly</t:BaseShape>
      </m:FolderShape>
      <m:FolderIds>
        <t:DistinguishedFolderId Id={quoteattr(distinguished_folder)}/>
      </m:FolderIds>
    </m:GetFolder>"""
        response = await self._soap_request(body)
        return self._parse_folder_id(response)

    async def find_items(
        self,
        folder: str = "inbox",
        max_items: int = 100,
        days_back: int = 365,
        keywords: Optional[List[str]] = None,
    ) -> "List[EmailItem]":
        """Find e-mails in a folder received within the last ``days_back`` days.

        Args:
            folder: distinguished folder id (e.g. ``inbox``, ``sentitems``).
            max_items: maximum number of entries requested from the server.
            days_back: how far back (in days) the received-date filter reaches.
            keywords: subject keywords to keep; ``None`` or an empty list
                falls back to the module-level ``SEARCH_KEYWORDS``.

        Returns:
            The messages whose subject contains at least one keyword.
        """
        # The timestamp carries a "Z" (UTC) suffix, so compute it in UTC.
        since_date = (
            datetime.now(timezone.utc) - timedelta(days=days_back)
        ).strftime("%Y-%m-%dT00:00:00Z")

        # Child order follows the FindItem schema sequence: ItemShape, paging
        # view, Restriction, ParentFolderIds.
        body = f"""<m:FindItem Traversal="Shallow">
      <m:ItemShape>
        <t:BaseShape>Default</t:BaseShape>
        <t:AdditionalProperties>
          <t:FieldURI FieldURI="item:Subject"/>
          <t:FieldURI FieldURI="item:DateTimeReceived"/>
          <t:FieldURI FieldURI="message:From"/>
          <t:FieldURI FieldURI="item:HasAttachments"/>
          <t:FieldURI FieldURI="item:Importance"/>
          <t:FieldURI FieldURI="item:Categories"/>
          <t:FieldURI FieldURI="item:ConversationId"/>
          <t:FieldURI FieldURI="message:InternetMessageId"/>
        </t:AdditionalProperties>
      </m:ItemShape>
      <m:IndexedPageItemView MaxEntriesReturned="{int(max_items)}" Offset="0" BasePoint="Beginning"/>
      <m:Restriction>
        <t:IsGreaterThanOrEqualTo>
          <t:FieldURI FieldURI="item:DateTimeReceived"/>
          <t:FieldURIOrConstant>
            <t:Constant Value="{since_date}"/>
          </t:FieldURIOrConstant>
        </t:IsGreaterThanOrEqualTo>
      </m:Restriction>
      <m:ParentFolderIds>
        <t:DistinguishedFolderId Id={quoteattr(folder)}/>
      </m:ParentFolderIds>
    </m:FindItem>"""
        response = await self._soap_request(body)
        return self._parse_email_items(response, folder, keywords or SEARCH_KEYWORDS)

    def _parse_email_items(
        self, xml_text: str, folder: str, keywords: List[str]
    ) -> "List[EmailItem]":
        """Parse message elements out of a FindItem response.

        A message is kept when its subject contains any of ``keywords``
        (case-insensitive), or unconditionally when ``keywords`` is empty.
        On an XML parse error a warning is printed and ``[]`` is returned.
        """
        items = []
        try:
            root = ET.fromstring(xml_text)
        except ET.ParseError as e:
            print(f" ⚠️ Parse error: {e}")
            return items

        types_prefix = "{%s}" % self.SOAP_NS["t"]
        for message in root.iter():
            # Only item elements from the types namespace count; this keeps
            # m:FindItemResponseMessage from being mistaken for a message.
            if not (message.tag.startswith(types_prefix) and message.tag.endswith("Message")):
                continue

            subject = ""
            sender = ""
            sender_name = ""
            received = ""
            has_attachments = False
            importance = "Normal"
            categories = ""
            item_id = ""
            conversation_id = ""
            internet_message_id = ""

            for child in message:
                tag = self._local_name(child.tag)
                if tag == "ItemId":
                    item_id = child.get("Id", "")
                elif tag == "Subject":
                    subject = child.text or ""
                elif tag == "DateTimeReceived":
                    received = child.text or ""
                elif tag == "From":
                    for mailbox in child.iter():
                        mtag = self._local_name(mailbox.tag)
                        if mtag == "EmailAddress":
                            sender = mailbox.text or ""
                        elif mtag == "Name":
                            sender_name = mailbox.text or ""
                elif tag == "HasAttachments":
                    has_attachments = child.text == "true"
                elif tag == "Importance":
                    importance = child.text or "Normal"
                elif tag == "Categories":
                    categories = ",".join(c.text for c in child if c.text)
                elif tag == "ConversationId":
                    conversation_id = child.get("Id", "")
                elif tag == "InternetMessageId":
                    internet_message_id = child.text or ""

            # Keyword match on the subject only.
            subject_lower = subject.lower()
            matched_keywords = [kw for kw in keywords if kw.lower() in subject_lower]
            if matched_keywords or not keywords:
                items.append(EmailItem(
                    id=item_id,
                    subject=subject,
                    sender=sender,
                    sender_name=sender_name,
                    received=received,
                    body_preview="",  # fetched separately when needed
                    folder=folder,
                    keywords=matched_keywords,
                    has_attachments=has_attachments,
                    attachments=[],
                    importance=importance,
                    categories=categories,
                    conversation_id=conversation_id,
                    internet_message_id=internet_message_id,
                ))
        return items

    def _parse_item_body(self, xml_text: str) -> str:
        """Return the tag-stripped item body (max 2000 chars) from a GetItem response.

        Only the ``t:Body`` element is considered; the SOAP envelope's own
        ``soap:Body`` element shares the local name but holds no message text.
        Returns ``""`` when the XML is invalid or has no ``t:Body``.
        """
        try:
            root = ET.fromstring(xml_text)
        except ET.ParseError:
            return ""
        for elem in root.iter(self._types_tag("Body")):
            text = elem.text or ""
            text = re.sub(r'<[^>]+>', '', text)  # strip HTML tags
            return text[:2000]
        return ""

    async def get_item_body(self, item_id: str) -> str:
        """Fetch one item and return its body as tag-stripped text (max 2000 chars)."""
        body = f"""<m:GetItem>
      <m:ItemShape>
        <t:BaseShape>Default</t:BaseShape>
        <t:AdditionalProperties>
          <t:FieldURI FieldURI="item:Body"/>
        </t:AdditionalProperties>
      </m:ItemShape>
      <m:ItemIds>
        <t:ItemId Id={quoteattr(item_id)}/>
      </m:ItemIds>
    </m:GetItem>"""
        response = await self._soap_request(body)
        return self._parse_item_body(response)

    async def get_calendar_items(
        self, days_forward: int = 30, days_back: int = 7
    ) -> "List[CalendarItem]":
        """Fetch calendar events from ``days_back`` days ago to ``days_forward`` days ahead.

        At most 100 entries are requested from the ``calendar`` folder.
        """
        # Timestamps carry a "Z" (UTC) suffix, so compute them in UTC.
        now = datetime.now(timezone.utc)
        start = (now - timedelta(days=days_back)).strftime("%Y-%m-%dT00:00:00Z")
        end = (now + timedelta(days=days_forward)).strftime("%Y-%m-%dT23:59:59Z")
        body = f"""<m:FindItem Traversal="Shallow">
      <m:ItemShape>
        <t:BaseShape>Default</t:BaseShape>
      </m:ItemShape>
      <m:CalendarView MaxEntriesReturned="100" StartDate="{start}" EndDate="{end}"/>
      <m:ParentFolderIds>
        <t:DistinguishedFolderId Id="calendar"/>
      </m:ParentFolderIds>
    </m:FindItem>"""
        response = await self._soap_request(body)
        return self._parse_calendar_items(response)

    def _parse_calendar_items(self, xml_text: str) -> "List[CalendarItem]":
        """Parse ``t:CalendarItem`` elements out of a FindItem response.

        On an XML parse error a warning is printed and ``[]`` is returned.
        """
        items = []
        try:
            root = ET.fromstring(xml_text)
        except ET.ParseError as e:
            print(f" ⚠️ Calendar parse error: {e}")
            return items

        for cal_item in root.iter(self._types_tag("CalendarItem")):
            item_id = ""
            subject = ""
            organizer = ""
            start = ""
            end = ""
            location = ""
            is_recurring = False

            for child in cal_item:
                tag = self._local_name(child.tag)
                if tag == "ItemId":
                    item_id = child.get("Id", "")
                elif tag == "Subject":
                    subject = child.text or ""
                elif tag == "Start":
                    start = child.text or ""
                elif tag == "End":
                    end = child.text or ""
                elif tag == "Location":
                    location = child.text or ""
                elif tag == "IsRecurring":
                    is_recurring = child.text == "true"
                elif tag == "Organizer":
                    for mailbox in child.iter():
                        if mailbox.tag.endswith("EmailAddress"):
                            organizer = mailbox.text or ""

            items.append(CalendarItem(
                id=item_id,
                subject=subject,
                organizer=organizer,
                start=start,
                end=end,
                location=location,
                attendees=[],  # attendees are not extracted from this response
                body_preview="",
                is_recurring=is_recurring,
            ))
        return items

    async def close(self):
        """Close the underlying HTTP client."""
        await self.client.aclose()
| # ============================================================ | |
| # MAIN HARVESTER | |
| # ============================================================ | |
class EWSAutodiscoverHarvester:
    """Main harvester: autodiscover, OAuth2 login, EWS harvest, Neo4j + JSON output."""

    def __init__(self, email: str, client_id: str, tenant_id: str):
        """Wire up the helpers, open the Neo4j driver and create the output directory."""
        self.email = email
        self.client_id = client_id
        self.tenant_id = tenant_id
        self.autodiscover = EWSAutodiscover(email)
        self.auth = OAuth2Auth(client_id, tenant_id)
        self.ews_client: Optional[EWSClient] = None
        self.neo4j = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
        self.stats = {
            "emails_found": 0,
            "emails_matched": 0,
            "calendar_items": 0,
            "contacts": 0,
            "saved_to_neo4j": 0,
        }
        self.output_dir = Path("data/ews_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

    async def connect(self) -> bool:
        """Discover the endpoint, obtain a token and create the EWS client.

        Returns:
            ``True`` when connected, ``False`` if no endpoint was found or
            both authentication flows failed.
        """
        print("\n" + "=" * 60)
        print("🚀 EWS AUTODISCOVER HARVESTER")
        print("=" * 60)

        # Step 1: autodiscover
        ews_url = await self.autodiscover.discover()
        if not ews_url:
            print("❌ Kunne ikke finde EWS endpoint")
            return False

        # Step 2: OAuth2 token
        print("\n🔐 Henter OAuth2 token...")
        try:
            # Device flow first (works headless).
            token = await self.auth.get_token_device_flow()
        except Exception as e:
            print(f" Device flow failed: {e}")
            try:
                # Fall back to interactive login.
                token = await self.auth.get_token_interactive()
            except Exception as e2:
                print(f"❌ Authentication failed: {e2}")
                return False

        # Step 3: create the EWS client
        self.ews_client = EWSClient(ews_url, token)
        print("✅ Forbundet til Exchange!")
        return True

    def save_email_to_neo4j(self, email: EmailItem):
        """Upsert one e-mail node, its data source and its keyword relations."""
        # MD5 is used only as a de-duplication key here, not for security.
        content_hash = hashlib.md5(f"{email.subject}:{email.id}".encode()).hexdigest()
        with self.neo4j.session() as session:
            session.run("""
                MERGE (e:ExchangeEmail {contentHash: $hash})
                ON CREATE SET
                    e.itemId = $id,
                    e.subject = $subject,
                    e.sender = $sender,
                    e.senderName = $senderName,
                    e.received = $received,
                    e.bodyPreview = $body,
                    e.folder = $folder,
                    e.keywords = $keywords,
                    e.hasAttachments = $hasAtt,
                    e.importance = $importance,
                    e.conversationId = $convId,
                    e.harvestedAt = datetime(),
                    e.harvestMethod = 'EWS_Autodiscover'
                ON MATCH SET
                    e.lastSeen = datetime()
                MERGE (ds:DataSource {name: 'TDC_Exchange_EWS'})
                ON CREATE SET ds.type = 'exchange_ews'
                MERGE (e)-[:HARVESTED_FROM]->(ds)
                """,
                hash=content_hash,
                id=email.id,
                subject=email.subject[:500],
                sender=email.sender,
                senderName=email.sender_name,
                received=email.received,
                body=email.body_preview,
                folder=email.folder,
                keywords=email.keywords,
                hasAtt=email.has_attachments,
                importance=email.importance,
                convId=email.conversation_id,
            )
            # Keyword relationships
            for kw in email.keywords:
                session.run("""
                    MERGE (k:SearchKeyword {name: $kw})
                    WITH k
                    MATCH (e:ExchangeEmail {contentHash: $hash})
                    MERGE (e)-[:MATCHES_KEYWORD]->(k)
                    """, kw=kw, hash=content_hash)
        self.stats["saved_to_neo4j"] += 1

    def save_calendar_to_neo4j(self, item: CalendarItem):
        """Upsert one calendar event node and link it to its data source."""
        # MD5 is used only as a de-duplication key here, not for security.
        content_hash = hashlib.md5(f"{item.subject}:{item.start}".encode()).hexdigest()
        with self.neo4j.session() as session:
            session.run("""
                MERGE (c:CalendarEvent {contentHash: $hash})
                ON CREATE SET
                    c.itemId = $id,
                    c.subject = $subject,
                    c.organizer = $organizer,
                    c.start = $start,
                    c.end = $end,
                    c.location = $location,
                    c.isRecurring = $recurring,
                    c.harvestedAt = datetime()
                MERGE (ds:DataSource {name: 'TDC_Exchange_Calendar'})
                MERGE (c)-[:HARVESTED_FROM]->(ds)
                """,
                hash=content_hash,
                id=item.id,
                subject=item.subject[:500],
                organizer=item.organizer,
                start=item.start,
                end=item.end,
                location=item.location,
                recurring=item.is_recurring,
            )

    async def harvest_emails(
        self,
        folders: Optional[List[str]] = None,
        days_back: int = 365,
        max_items: int = 500,
        max_bodies: int = 50,
    ):
        """Harvest keyword-matching e-mails from the given folders.

        Args:
            folders: distinguished folder ids; defaults to inbox + sentitems.
            days_back: how far back (in days) to search.
            max_items: maximum entries requested per folder.
            max_bodies: per folder, how many matched messages get their body
                fetched and are saved; the rest are only counted.

        Returns:
            The messages that were fetched and saved. A failure inside one
            folder is printed and the next folder is processed.
        """
        if not folders:
            folders = ["inbox", "sentitems"]

        all_emails = []
        for folder in folders:
            print(f"\n📂 Scanner {folder}...")
            try:
                items = await self.ews_client.find_items(
                    folder=folder,
                    max_items=max_items,
                    days_back=days_back,
                    keywords=SEARCH_KEYWORDS,
                )
                self.stats["emails_found"] += len(items)

                # NOTE: find_items() already drops messages without a keyword
                # hit, so `items` and `matched` hold the same messages.
                matched = [item for item in items if item.keywords]
                self.stats["emails_matched"] += len(matched)
                print(f" Fundet: {len(items)}, Matched: {len(matched)}")

                # Fetch bodies for a limited number of matched items.
                selected = matched[:max_bodies]
                for i, item in enumerate(selected):
                    if i % 10 == 0:
                        print(f" Henter body {i+1}/{len(selected)}...", end="\r")
                    item.body_preview = await self.ews_client.get_item_body(item.id)
                    self.save_email_to_neo4j(item)
                    all_emails.append(item)
            except Exception as e:
                print(f" ⚠️ Error: {e}")
        return all_emails

    async def harvest_calendar(self, days_forward: int = 30, days_back: int = 7):
        """Harvest calendar events and store them in Neo4j.

        Returns:
            The harvested events, or ``[]`` if anything failed (the error is
            printed).
        """
        print(f"\n📅 Scanner kalender ({days_back}d tilbage, {days_forward}d frem)...")
        try:
            items = await self.ews_client.get_calendar_items(days_forward, days_back)
            self.stats["calendar_items"] = len(items)
            print(f" Fundet: {len(items)} events")
            for item in items:
                self.save_calendar_to_neo4j(item)
            return items
        except Exception as e:
            print(f" ⚠️ Error: {e}")
            return []

    def _write_json(self, emails, calendar) -> Path:
        """Dump the harvest result to a timestamped JSON file and return its path."""
        output_file = self.output_dir / f"ews_harvest_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                "timestamp": datetime.now().isoformat(),
                "email": self.email,
                "stats": self.stats,
                "emails": [asdict(e) for e in emails],
                "calendar": [asdict(c) for c in calendar],
            }, f, indent=2, ensure_ascii=False)
        return output_file

    def _print_summary(self, output_file: Path) -> None:
        """Print the final statistics block."""
        print("\n" + "=" * 60)
        print("📊 HARVEST COMPLETE")
        print("=" * 60)
        print(f" 📧 Emails fundet: {self.stats['emails_found']}")
        print(f" ✅ Emails matched: {self.stats['emails_matched']}")
        print(f" 📅 Calendar events: {self.stats['calendar_items']}")
        print(f" 💾 Saved to Neo4j: {self.stats['saved_to_neo4j']}")
        print(f"\n 📁 Output: {output_file}")
        print("=" * 60)

    async def run(self, days_back: int = 365):
        """Run a full harvest: connect, harvest mail + calendar, write JSON.

        The EWS client and the Neo4j driver are closed in every case,
        including a failed connect or an exception during the harvest.
        """
        try:
            if not await self.connect():
                return

            emails = await self.harvest_emails(
                folders=["inbox", "sentitems"],
                days_back=days_back,
            )
            calendar = await self.harvest_calendar()

            output_file = self._write_json(emails, calendar)
            self._print_summary(output_file)
        finally:
            # Cleanup must not be skipped on early return or error.
            if self.ews_client:
                await self.ews_client.close()
            self.neo4j.close()
| # ============================================================ | |
| # MAIN | |
| # ============================================================ | |
async def main():
    """Read the command-line options and run one full harvest."""
    from argparse import ArgumentParser

    cli = ArgumentParser(description="EWS Autodiscover Harvester")
    # (flag, keyword options) pairs, registered in the order shown in --help.
    option_specs = (
        ("--email", {"required": True, "help": "Din TDC email adresse"}),
        ("--client-id", {"required": True, "help": "Azure AD App Client ID"}),
        ("--tenant-id", {"required": True, "help": "Azure AD Tenant ID"}),
        ("--days", {"type": int, "default": 365, "help": "Dage tilbage (default: 365)"}),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    opts = cli.parse_args()

    job = EWSAutodiscoverHarvester(
        email=opts.email,
        client_id=opts.client_id,
        tenant_id=opts.tenant_id,
    )
    await job.run(opts.days)


# Script entry point: drive the async main() to completion.
if __name__ == "__main__":
    asyncio.run(main())