#!/usr/bin/env python3 """ 📧 TDC Exchange/Outlook Harvester Søger i emails efter SharePoint links, vedhæftninger og intern data """ import win32com.client import pythoncom import hashlib import json import re from pathlib import Path from datetime import datetime, timedelta from neo4j import GraphDatabase class TDCOutlookHarvester: """Harvester for TDC Outlook/Exchange data""" NEO4J_URI = "neo4j+s://054eff27.databases.neo4j.io" NEO4J_USER = "neo4j" NEO4J_PASSWORD = "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8" # Søgetermer for relevante emails SEARCH_TERMS = [ "strategi", "roadmap", "cybersikkerhed", "cyber", "SOC", "NIS2", "cloud", "Azure", "AI", "kunstig intelligens", "GPT", "Copilot", "Columbus", "ERP", "budget", "forecast", "finanstal", "kunde", "kontrakt", "rammeaftale", "SKI", "produkt", "CloudKey", "prisliste" ] def __init__(self): self.output_dir = Path("data/outlook_harvest") self.output_dir.mkdir(parents=True, exist_ok=True) # Initialize COM for Outlook pythoncom.CoInitialize() print("📧 Connecting to Outlook...") self.outlook = win32com.client.Dispatch("Outlook.Application") self.namespace = self.outlook.GetNamespace("MAPI") # Neo4j self.neo4j = GraphDatabase.driver( self.NEO4J_URI, auth=(self.NEO4J_USER, self.NEO4J_PASSWORD) ) self.emails = [] self.sharepoint_links = [] self.attachments = [] self.stats = { "emails_scanned": 0, "relevant_emails": 0, "sharepoint_links": 0, "attachments": 0 } def get_folders(self): """List alle Outlook folders""" folders = [] for account in self.namespace.Folders: # Skip offentlige mapper if "offentlig" in account.Name.lower() or "public" in account.Name.lower(): print(f"\n📁 Account: {account.Name} (skipped)") continue print(f"\n📁 Account: {account.Name}") try: for folder in account.Folders: try: folders.append({ "account": account.Name, "folder": folder.Name, "count": folder.Items.Count if hasattr(folder.Items, 'Count') else 0 }) print(f" └─ {folder.Name}: {folder.Items.Count if hasattr(folder.Items, 'Count') else '?'} items") except: continue except Exception as e: print(f" ⚠️ Could not access folders: {e}") return folders def search_folder(self, folder, search_term: str, max_items: int = 100) -> list: """Søg i en specifik folder""" results = [] try: # Outlook filter filter_str = f"@SQL=\"urn:schemas:httpmail:subject\" LIKE '%{search_term}%' OR \"urn:schemas:httpmail:textdescription\" LIKE '%{search_term}%'" items = folder.Items items.Sort("[ReceivedTime]", True) # Nyeste først count = 0 for item in items: if count >= max_items: break try: subject = getattr(item, 'Subject', '') or '' body = getattr(item, 'Body', '') or '' # Check if search term matches if search_term.lower() in subject.lower() or search_term.lower() in body.lower(): # Extract SharePoint links sp_links = re.findall(r'https://[a-zA-Z0-9.-]*sharepoint\.com[^\s<>"]*', body) email_data = { "subject": subject[:200], "sender": str(getattr(item, 'SenderEmailAddress', '')), "sender_name": str(getattr(item, 'SenderName', '')), "received": str(getattr(item, 'ReceivedTime', '')), "search_term": search_term, "has_attachments": getattr(item, 'Attachments', None) and item.Attachments.Count > 0, "attachment_count": item.Attachments.Count if hasattr(item, 'Attachments') else 0, "sharepoint_links": sp_links[:10], "body_preview": body[:500].replace('\r\n', ' ').replace('\n', ' ') } # Get attachment names if email_data["has_attachments"]: email_data["attachment_names"] = [ att.FileName for att in item.Attachments ][:10] results.append(email_data) self.sharepoint_links.extend(sp_links) count += 1 except Exception as e: continue except Exception as e: print(f" ⚠️ Search error: {e}") return results def harvest_inbox(self, days_back: int = 90, max_per_term: int = 50): """Harvest emails from inbox""" print(f"\n📥 HARVESTING INBOX (last {days_back} days)") print("-" * 50) # Find TDC inbox inbox = None for account in self.namespace.Folders: if "tdc" in account.Name.lower(): try: inbox = account.Folders["Inbox"] print(f" Found: {account.Name}/Inbox") break except: # Try Indbakke (Danish) try: inbox = account.Folders["Indbakke"] print(f" Found: {account.Name}/Indbakke") break except: continue if not inbox: # Fallback to default inbox inbox = self.namespace.GetDefaultFolder(6) # 6 = Inbox print(f" Using default inbox") print(f" Items in inbox: {inbox.Items.Count}") # Search for each term all_results = [] for term in self.SEARCH_TERMS: print(f"\n 🔍 Searching: '{term}'") results = self.search_folder(inbox, term, max_per_term) for email in results: # Avoid duplicates if not any(e['subject'] == email['subject'] and e['received'] == email['received'] for e in all_results): all_results.append(email) self.save_to_neo4j(email) self.stats["emails_scanned"] += max_per_term self.stats["relevant_emails"] += len(results) print(f" Found: {len(results)} relevant emails") self.emails = all_results return all_results def harvest_sent_items(self, max_per_term: int = 30): """Harvest sent emails""" print(f"\n📤 HARVESTING SENT ITEMS") print("-" * 50) sent = None for account in self.namespace.Folders: if "tdc" in account.Name.lower(): try: sent = account.Folders["Sent Items"] break except: try: sent = account.Folders["Sendt post"] break except: continue if not sent: sent = self.namespace.GetDefaultFolder(5) # 5 = Sent results = [] for term in self.SEARCH_TERMS[:10]: # Færre terms for sent found = self.search_folder(sent, term, max_per_term) results.extend(found) print(f" Found: {len(results)} relevant sent emails") return results def extract_sharepoint_links(self): """Udtræk alle unikke SharePoint links""" unique_links = list(set(self.sharepoint_links)) self.stats["sharepoint_links"] = len(unique_links) print(f"\n🔗 SHAREPOINT LINKS FOUND: {len(unique_links)}") print("-" * 50) for link in unique_links[:20]: print(f" {link[:80]}...") # Save to Neo4j self.save_sharepoint_link(link) return unique_links def save_to_neo4j(self, email: dict): """Gem email i Neo4j""" content_hash = hashlib.md5( f"{email['subject']}:{email['received']}".encode() ).hexdigest() with self.neo4j.session() as session: session.run(""" MERGE (e:TDCEmail {contentHash: $hash}) ON CREATE SET e.subject = $subject, e.sender = $sender, e.senderName = $sender_name, e.received = $received, e.searchTerm = $search_term, e.hasAttachments = $has_attachments, e.attachmentCount = $attachment_count, e.bodyPreview = $body_preview, e.harvestedAt = datetime() MERGE (ds:DataSource {name: 'TDC_Exchange'}) ON CREATE SET ds.type = 'email' MERGE (e)-[:HARVESTED_FROM]->(ds) """, hash=content_hash, subject=email.get('subject', ''), sender=email.get('sender', ''), sender_name=email.get('sender_name', ''), received=email.get('received', ''), search_term=email.get('search_term', ''), has_attachments=email.get('has_attachments', False), attachment_count=email.get('attachment_count', 0), body_preview=email.get('body_preview', '')[:1000] ) # Link SharePoint URLs for sp_link in email.get('sharepoint_links', []): link_hash = hashlib.md5(sp_link.encode()).hexdigest() session.run(""" MERGE (sp:SharePointLink {contentHash: $hash}) ON CREATE SET sp.url = $url, sp.discoveredAt = datetime() WITH sp MATCH (e:TDCEmail {contentHash: $email_hash}) MERGE (e)-[:CONTAINS_LINK]->(sp) """, hash=link_hash, url=sp_link, email_hash=content_hash ) def save_sharepoint_link(self, url: str): """Gem SharePoint link separat""" link_hash = hashlib.md5(url.encode()).hexdigest() with self.neo4j.session() as session: session.run(""" MERGE (sp:SharePointLink {contentHash: $hash}) ON CREATE SET sp.url = $url, sp.discoveredAt = datetime(), sp.source = 'email_extraction' """, hash=link_hash, url=url ) def run(self): """Kør fuld harvest""" print("\n" + "=" * 60) print("📧 TDC OUTLOOK/EXCHANGE HARVESTER") print("=" * 60) # 1. List folders print("\n📁 AVAILABLE FOLDERS") folders = self.get_folders() # 2. Harvest inbox inbox_results = self.harvest_inbox(days_back=180, max_per_term=50) # 3. Harvest sent items sent_results = self.harvest_sent_items(max_per_term=30) # 4. Extract SharePoint links sp_links = self.extract_sharepoint_links() # 5. Summary print("\n" + "=" * 60) print("📊 HARVEST COMPLETE") print("=" * 60) print(f" 📧 Emails scanned: ~{self.stats['emails_scanned']}") print(f" ✅ Relevant emails: {self.stats['relevant_emails']}") print(f" 🔗 SharePoint links: {self.stats['sharepoint_links']}") print(f" 📎 With attachments: {sum(1 for e in self.emails if e.get('has_attachments'))}") print("=" * 60) # Save local JSON output_file = self.output_dir / "outlook_harvest.json" with open(output_file, 'w', encoding='utf-8') as f: json.dump({ "timestamp": datetime.now().isoformat(), "stats": self.stats, "emails": self.emails[:200], "sharepoint_links": list(set(self.sharepoint_links))[:100] }, f, indent=2, ensure_ascii=False, default=str) print(f"\n📁 Results saved: {output_file}") # Cleanup pythoncom.CoUninitialize() self.neo4j.close() return self.emails, sp_links if __name__ == "__main__": harvester = TDCOutlookHarvester() harvester.run()