Spaces:
Paused
Paused
| #!/usr/bin/env python3 | |
| """ | |
| 📧 TDC Exchange/Outlook Harvester | |
| Søger i emails efter SharePoint links, vedhæftninger og intern data | |
| """ | |
| import win32com.client | |
| import pythoncom | |
| import hashlib | |
| import json | |
| import re | |
| from pathlib import Path | |
| from datetime import datetime, timedelta | |
| from neo4j import GraphDatabase | |
class TDCOutlookHarvester:
    """Harvest TDC-related e-mail metadata from the local Outlook profile.

    The harvester talks to Outlook through COM, scans the inbox and the
    sent-items folder for messages whose subject or body contains one of
    ``SEARCH_TERMS``, extracts SharePoint URLs from the matching bodies and
    writes the result to Neo4j and to a local JSON file.
    """

    # SECURITY: these connection settings (including the password) are
    # committed in source control. Rotate the credential and provide the real
    # values through the NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD environment
    # variables, which take precedence over the defaults below.
    NEO4J_URI = "neo4j+s://054eff27.databases.neo4j.io"
    NEO4J_USER = "neo4j"
    NEO4J_PASSWORD = "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"

    # Search terms that mark an e-mail as relevant
    SEARCH_TERMS = [
        "strategi",
        "roadmap",
        "cybersikkerhed",
        "cyber",
        "SOC",
        "NIS2",
        "cloud",
        "Azure",
        "AI",
        "kunstig intelligens",
        "GPT",
        "Copilot",
        "Columbus",
        "ERP",
        "budget",
        "forecast",
        "finanstal",
        "kunde",
        "kontrakt",
        "rammeaftale",
        "SKI",
        "produkt",
        "CloudKey",
        "prisliste",
    ]

    # Compiled once: matches SharePoint URLs inside an e-mail body.
    _SHAREPOINT_RE = re.compile(r'https://[a-zA-Z0-9.-]*sharepoint\.com[^\s<>"]*')

    # Default-folder ids passed to ``GetDefaultFolder``.
    _OL_FOLDER_SENT = 5
    _OL_FOLDER_INBOX = 6

    def __init__(self):
        """Create the output directory and connect to Outlook and Neo4j.

        Raises:
            Exception: whatever the COM dispatch or the Neo4j driver raises;
                already-acquired resources are released before re-raising.
        """
        import os  # local import: only needed for the credential overrides

        self.output_dir = Path("data/outlook_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.emails = []
        self.sharepoint_links = []
        self.attachments = []
        self.stats = {
            "emails_scanned": 0,
            "relevant_emails": 0,
            "sharepoint_links": 0,
            "attachments": 0,
        }

        self.outlook = None
        self.namespace = None
        self.neo4j = None
        self._com_initialized = False

        # Initialize COM for Outlook
        pythoncom.CoInitialize()
        self._com_initialized = True
        try:
            print("📧 Connecting to Outlook...")
            self.outlook = win32com.client.Dispatch("Outlook.Application")
            self.namespace = self.outlook.GetNamespace("MAPI")

            # Neo4j (environment variables override the class defaults)
            self.neo4j = GraphDatabase.driver(
                os.environ.get("NEO4J_URI", self.NEO4J_URI),
                auth=(
                    os.environ.get("NEO4J_USER", self.NEO4J_USER),
                    os.environ.get("NEO4J_PASSWORD", self.NEO4J_PASSWORD),
                ),
            )
        except Exception:
            # Do not leave COM initialized or a driver open on a failed setup.
            self.close()
            raise

    def close(self):
        """Release the Neo4j driver and COM; safe to call more than once."""
        driver = getattr(self, "neo4j", None)
        self.neo4j = None
        try:
            if driver is not None:
                driver.close()
        finally:
            # Drop the COM object references before uninitializing COM.
            self.namespace = None
            self.outlook = None
            if getattr(self, "_com_initialized", False):
                self._com_initialized = False
                pythoncom.CoUninitialize()

    def _find_tdc_folder(self, folder_names, announce=False):
        """Return the first folder in *folder_names* of a "tdc" account.

        Accounts are matched by a case-insensitive "tdc" substring in their
        name. Returns ``None`` when no such account exposes any of the names.
        """
        for account in self.namespace.Folders:
            if "tdc" not in account.Name.lower():
                continue
            for name in folder_names:
                try:
                    found = account.Folders[name]
                except Exception:
                    # Folder name not present in this account; try the next.
                    continue
                if announce:
                    print(f" Found: {account.Name}/{name}")
                return found
        return None

    def get_folders(self):
        """List the direct sub-folders of every non-public Outlook account.

        Returns:
            A list of ``{"account", "folder", "count"}`` dicts. Folders that
            cannot be read are skipped.
        """
        folders = []
        for account in self.namespace.Folders:
            account_name = account.Name
            lowered = account_name.lower()
            # Skip public folders
            if "offentlig" in lowered or "public" in lowered:
                print(f"\n📁 Account: {account_name} (skipped)")
                continue
            print(f"\n📁 Account: {account_name}")
            try:
                for folder in account.Folders:
                    try:
                        folder_name = folder.Name
                        items = folder.Items
                        count = items.Count if hasattr(items, 'Count') else None
                    except Exception:
                        continue
                    folders.append({
                        "account": account_name,
                        "folder": folder_name,
                        "count": 0 if count is None else count,
                    })
                    shown = '?' if count is None else count
                    print(f" └─ {folder_name}: {shown} items")
            except Exception as e:
                print(f" ⚠️ Could not access folders: {e}")
        return folders

    def search_folder(self, folder, search_term: str, max_items: int = 100,
                      since=None) -> list:
        """Collect e-mails in *folder* whose subject or body contains a term.

        Matching is a case-insensitive substring test done in Python. Every
        SharePoint URL found in a matching body is also appended to
        ``self.sharepoint_links``.

        Args:
            folder: Outlook folder object exposing ``Items``.
            search_term: Text to look for.
            max_items: Maximum number of *matching* e-mails to return.
            since: Optional ``datetime`` cutoff; scanning stops at the first
                item received before it (items are sorted newest first).
                ``None`` disables the date window.

        Returns:
            A list of dicts describing the matching e-mails. Errors are
            reported on stdout and yield the results gathered so far.
        """
        results = []
        needle = search_term.lower()
        if since is not None:
            since = since.replace(tzinfo=None)
        try:
            items = folder.Items
            items.Sort("[ReceivedTime]", True)  # Newest first
            for item in items:
                if len(results) >= max_items:
                    break
                try:
                    if since is not None:
                        received_at = getattr(item, 'ReceivedTime', None)
                        # COM timestamps may carry tzinfo; drop it so they
                        # compare against the naive cutoff.
                        if (isinstance(received_at, datetime)
                                and received_at.replace(tzinfo=None) < since):
                            break

                    subject = getattr(item, 'Subject', '') or ''
                    body = getattr(item, 'Body', '') or ''
                    if needle not in subject.lower() and needle not in body.lower():
                        continue

                    sp_links = self._SHAREPOINT_RE.findall(body)
                    attachments = getattr(item, 'Attachments', None)
                    attachment_count = attachments.Count if attachments is not None else 0
                    email_data = {
                        "subject": subject[:200],
                        "sender": str(getattr(item, 'SenderEmailAddress', '')),
                        "sender_name": str(getattr(item, 'SenderName', '')),
                        "received": str(getattr(item, 'ReceivedTime', '')),
                        "search_term": search_term,
                        # Always a real bool so it serializes consistently.
                        "has_attachments": attachment_count > 0,
                        "attachment_count": attachment_count,
                        "sharepoint_links": sp_links[:10],
                        "body_preview": body[:500].replace('\r\n', ' ').replace('\n', ' '),
                    }
                    if email_data["has_attachments"]:
                        email_data["attachment_names"] = [
                            att.FileName for att in attachments
                        ][:10]
                    results.append(email_data)
                    self.sharepoint_links.extend(sp_links)
                except Exception:
                    # Best effort: skip items whose properties cannot be read.
                    continue
        except Exception as e:
            print(f" ⚠️ Search error: {e}")
        return results

    def harvest_inbox(self, days_back: int = 90, max_per_term: int = 50):
        """Search the inbox for every term and store the unique matches.

        Args:
            days_back: Only e-mails received within this many days are kept.
            max_per_term: Maximum matches collected per search term.

        Returns:
            The de-duplicated list of matching e-mails (also kept in
            ``self.emails``); each new e-mail is written to Neo4j.
        """
        print(f"\n📥 HARVESTING INBOX (last {days_back} days)")
        print("-" * 50)

        # Find TDC inbox ("Indbakke" is the Danish folder name)
        inbox = self._find_tdc_folder(("Inbox", "Indbakke"), announce=True)
        if inbox is None:
            # Fallback to default inbox
            inbox = self.namespace.GetDefaultFolder(self._OL_FOLDER_INBOX)
            print(" Using default inbox")
        print(f" Items in inbox: {inbox.Items.Count}")

        cutoff = datetime.now() - timedelta(days=days_back)
        all_results = []
        seen = set()  # (subject, received) keys of e-mails already kept
        for term in self.SEARCH_TERMS:
            print(f"\n 🔍 Searching: '{term}'")
            results = self.search_folder(inbox, term, max_per_term, since=cutoff)
            new_count = 0
            for email in results:
                key = (email['subject'], email['received'])
                if key in seen:
                    continue
                seen.add(key)
                all_results.append(email)
                self.save_to_neo4j(email)
                new_count += 1
            # Rough estimate only: search_folder does not report how many
            # items it actually examined.
            self.stats["emails_scanned"] += max_per_term
            # Count each e-mail once even when several terms match it.
            self.stats["relevant_emails"] += new_count
            print(f" Found: {len(results)} relevant emails")

        self.emails = all_results
        return all_results

    def harvest_sent_items(self, max_per_term: int = 30):
        """Search the sent-items folder for the first ten search terms.

        Returns:
            The list of matching sent e-mails (not de-duplicated and not
            written to Neo4j).
        """
        print("\n📤 HARVESTING SENT ITEMS")
        print("-" * 50)

        # "Sendt post" is the Danish folder name
        sent = self._find_tdc_folder(("Sent Items", "Sendt post"))
        if sent is None:
            sent = self.namespace.GetDefaultFolder(self._OL_FOLDER_SENT)

        results = []
        for term in self.SEARCH_TERMS[:10]:  # Fewer terms for sent items
            results.extend(self.search_folder(sent, term, max_per_term))
        print(f" Found: {len(results)} relevant sent emails")
        return results

    def extract_sharepoint_links(self):
        """Store every unique SharePoint link in Neo4j and return them sorted.

        Only the first 20 links are printed; all of them are saved.
        """
        unique_links = sorted(set(self.sharepoint_links))
        self.stats["sharepoint_links"] = len(unique_links)
        print(f"\n🔗 SHAREPOINT LINKS FOUND: {len(unique_links)}")
        print("-" * 50)
        for link in unique_links[:20]:
            print(f" {link[:80]}...")
        # Save to Neo4j
        for link in unique_links:
            self.save_sharepoint_link(link)
        return unique_links

    def save_to_neo4j(self, email: dict):
        """Merge one e-mail, its data source and its SharePoint links into Neo4j.

        The node key is an MD5 digest of ``"<subject>:<received>"``; it is an
        identifier only, not a security measure.
        """
        content_hash = hashlib.md5(
            f"{email['subject']}:{email['received']}".encode()
        ).hexdigest()
        with self.neo4j.session() as session:
            session.run(
                """
                MERGE (e:TDCEmail {contentHash: $hash})
                ON CREATE SET
                e.subject = $subject,
                e.sender = $sender,
                e.senderName = $sender_name,
                e.received = $received,
                e.searchTerm = $search_term,
                e.hasAttachments = $has_attachments,
                e.attachmentCount = $attachment_count,
                e.bodyPreview = $body_preview,
                e.harvestedAt = datetime()
                MERGE (ds:DataSource {name: 'TDC_Exchange'})
                ON CREATE SET ds.type = 'email'
                MERGE (e)-[:HARVESTED_FROM]->(ds)
                """,
                hash=content_hash,
                subject=email.get('subject', ''),
                sender=email.get('sender', ''),
                sender_name=email.get('sender_name', ''),
                received=email.get('received', ''),
                search_term=email.get('search_term', ''),
                has_attachments=bool(email.get('has_attachments', False)),
                attachment_count=email.get('attachment_count', 0),
                body_preview=email.get('body_preview', '')[:1000],
            )
            # Link SharePoint URLs
            for sp_link in email.get('sharepoint_links', []):
                link_hash = hashlib.md5(sp_link.encode()).hexdigest()
                session.run(
                    """
                    MERGE (sp:SharePointLink {contentHash: $hash})
                    ON CREATE SET sp.url = $url, sp.discoveredAt = datetime()
                    WITH sp
                    MATCH (e:TDCEmail {contentHash: $email_hash})
                    MERGE (e)-[:CONTAINS_LINK]->(sp)
                    """,
                    hash=link_hash,
                    url=sp_link,
                    email_hash=content_hash,
                )

    def save_sharepoint_link(self, url: str):
        """Merge a stand-alone SharePoint link node keyed by the URL's MD5."""
        link_hash = hashlib.md5(url.encode()).hexdigest()
        with self.neo4j.session() as session:
            session.run(
                """
                MERGE (sp:SharePointLink {contentHash: $hash})
                ON CREATE SET
                sp.url = $url,
                sp.discoveredAt = datetime(),
                sp.source = 'email_extraction'
                """,
                hash=link_hash,
                url=url,
            )

    def run(self):
        """Run the full harvest and write ``outlook_harvest.json``.

        Returns:
            ``(emails, sharepoint_links)``. The Neo4j driver and COM are
            released even when a step fails.
        """
        print("\n" + "=" * 60)
        print("📧 TDC OUTLOOK/EXCHANGE HARVESTER")
        print("=" * 60)
        try:
            # 1. List folders
            print("\n📁 AVAILABLE FOLDERS")
            self.get_folders()
            # 2. Harvest inbox
            self.harvest_inbox(days_back=180, max_per_term=50)
            # 3. Harvest sent items
            self.harvest_sent_items(max_per_term=30)
            # 4. Extract SharePoint links
            sp_links = self.extract_sharepoint_links()
            self.stats["attachments"] = sum(
                1 for e in self.emails if e.get('has_attachments')
            )

            # 5. Summary
            print("\n" + "=" * 60)
            print("📊 HARVEST COMPLETE")
            print("=" * 60)
            print(f" 📧 Emails scanned: ~{self.stats['emails_scanned']}")
            print(f" ✅ Relevant emails: {self.stats['relevant_emails']}")
            print(f" 🔗 SharePoint links: {self.stats['sharepoint_links']}")
            print(f" 📎 With attachments: {self.stats['attachments']}")
            print("=" * 60)

            # Save local JSON
            output_file = self.output_dir / "outlook_harvest.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump({
                    "timestamp": datetime.now().isoformat(),
                    "stats": self.stats,
                    "emails": self.emails[:200],
                    "sharepoint_links": sp_links[:100],
                }, f, indent=2, ensure_ascii=False, default=str)
            print(f"\n📁 Results saved: {output_file}")
        finally:
            # Cleanup
            self.close()
        return self.emails, sp_links
if __name__ == "__main__":
    # Script entry point: build the harvester and run one full harvest.
    TDCOutlookHarvester().run()