#!/usr/bin/env python3
"""
🌐 EWS AUTODISCOVER HARVESTER

Connects to an Exchange mailbox through Exchange Web Services (EWS) using
autodiscover, searches the mail folders for a fixed list of keywords, writes
the matches to a local JSON file and, when configured, mirrors them to Neo4j.

Requirements: your TDC email address + password (or an app password).

Neo4j configuration is read from the environment:
    NEO4J_URI       (optional, defaults to the project's Aura instance)
    NEO4J_USER      (optional, defaults to "neo4j")
    NEO4J_PASSWORD  (required for Neo4j sync; local-only when unset)
"""

import argparse
import hashlib
import json
import os
import sys
from collections import Counter
from datetime import datetime, timedelta
from getpass import getpass
from pathlib import Path

# EWS library. A missing dependency is recorded instead of terminating the
# interpreter at import time, so merely importing this module never kills the
# importing process; connect() and main() report the problem when it matters.
try:
    from exchangelib import (
        Credentials, Account, Configuration, DELEGATE,
        EWSDateTime, EWSTimeZone, Q
    )
    _EXCHANGELIB_AVAILABLE = True
except ImportError:
    Credentials = Account = Configuration = DELEGATE = None
    EWSDateTime = EWSTimeZone = Q = None
    _EXCHANGELIB_AVAILABLE = False

# NOTE(review): this name is not referenced anywhere in this file, and it may
# not be exported by every exchangelib release - confirm. It is imported in
# its own guard so a failure here cannot disable the whole tool.
try:
    from exchangelib.autodiscover import Autodiscover
except ImportError:
    Autodiscover = None

try:
    from neo4j import GraphDatabase
except ImportError:
    print("⚠️ neo4j ikke installeret - gemmer kun lokalt")
    GraphDatabase = None

# Connection settings come from the environment. The password must never be
# stored in source control; the value that used to be hard-coded here should
# be considered leaked and rotated.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")

# Search terms for TDC internal data
SEARCH_KEYWORDS = [
    "strategi", "cyber", "NIS2", "SOC", "MDR", "cloud", "Azure", "AI",
    "Copilot", "Columbus", "ERP", "budget", "forecast", "kunde", "kontrakt",
    "rammeaftale", "SKI", "produkt", "CloudKey", "arkitektur", "roadmap",
    "security", "incident", "TDC NET", "Nuuday", "partner", "projekt"
]


def _report_missing_exchangelib() -> None:
    """Print the installation hint shown when exchangelib cannot be imported."""
    print("❌ exchangelib ikke installeret!")
    print(" Kør: pip install exchangelib")


class EWSHarvester:
    """Harvester that uses Exchange Web Services with autodiscover."""

    def __init__(self, email: str, password: str):
        """Store the credentials, open the optional Neo4j driver and create
        the local output directory.

        Args:
            email: Primary SMTP address of the mailbox (also the login name).
            password: Account password or app password.
        """
        self.email = email
        self.password = password
        self.account = None
        self.emails = []
        self.stats = {
            "folders_scanned": 0,
            "emails_found": 0,      # items returned by the server-side query
            "emails_matched": 0,    # items that also passed the local check
            "attachments": 0
        }

        # Neo4j is optional: it needs both the driver package and a password
        # supplied through the environment.
        self.neo4j = None
        if GraphDatabase and NEO4J_PASSWORD:
            try:
                self.neo4j = GraphDatabase.driver(
                    NEO4J_URI,
                    auth=(NEO4J_USER, NEO4J_PASSWORD)
                )
            except Exception as e:
                print(f"⚠️ Neo4j connection failed: {e}")
        elif GraphDatabase:
            print("⚠️ NEO4J_PASSWORD ikke sat - gemmer kun lokalt")

        # Output
        self.output_dir = Path("data/outlook_ews_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _match_keywords(subject: str, body: str, keywords: list) -> list:
        """Return the keywords occurring (case-insensitively, as substrings)
        in ``subject`` or ``body``, in the order given by ``keywords``."""
        subject = subject.lower()
        body = body.lower()
        return [
            kw for kw in keywords
            if kw.lower() in subject or kw.lower() in body
        ]

    def _close_neo4j(self) -> None:
        """Close the Neo4j driver, if any, and forget it so it is not reused."""
        if self.neo4j:
            try:
                self.neo4j.close()
            finally:
                self.neo4j = None

    def connect(self) -> bool:
        """Connect to the mailbox via EWS autodiscover.

        Returns:
            True when an account object was created, False on any failure
            (including a missing exchangelib installation). Failures are
            reported on stdout, never raised.
        """
        if not _EXCHANGELIB_AVAILABLE:
            _report_missing_exchangelib()
            return False

        print("\n🔌 Forbinder via EWS Autodiscover...")
        print(f" Email: {self.email}")

        try:
            credentials = Credentials(
                username=self.email,
                password=self.password
            )

            # Autodiscover locates the Exchange server from the address.
            print(" 🔍 Kører autodiscover...")
            self.account = Account(
                primary_smtp_address=self.email,
                credentials=credentials,
                autodiscover=True,
                access_type=DELEGATE
            )

            print("✅ Forbundet til Exchange!")
            print(f" Server: {self.account.protocol.server}")
            print(f" EWS URL: {self.account.protocol.service_endpoint}")
            return True

        except Exception as e:
            print(f"❌ Forbindelse fejlede: {e}")
            print("\n💡 Tips:")
            print(" • Tjek at email/password er korrekt")
            print(" • Prøv med app password hvis MFA er aktivt")
            print(" • TDC kan kræve OAuth2 - se outlook_graph_harvest.py")
            return False

    def get_folders(self):
        """Collect the mailbox folders, walking down from the account root.

        Returns:
            A list of dicts with the keys ``name``, ``path``, ``folder`` (the
            folder object itself), ``level`` (depth below the root) and
            ``total_count``. Folders that cannot be read are skipped with a
            warning; the list may therefore be partial or empty.
        """
        folders = []

        def scan_folder(folder, level=0):
            try:
                folders.append({
                    "name": folder.name,
                    "path": folder.absolute or folder.name,
                    "folder": folder,
                    "level": level,
                    "total_count": folder.total_count or 0
                })

                # Children are only expanded for levels 0-2, so the deepest
                # folders recorded are at level 3.
                if level < 3:
                    for child in folder.children:
                        scan_folder(child, level + 1)
            except Exception as e:
                # Best effort: one unreadable folder must not abort the scan.
                # Deliberately not a bare except, so Ctrl-C still works.
                print(f"⚠️ Springer mappe over: {getattr(folder, 'name', '?')}: {e}")

        try:
            # Start from the root
            scan_folder(self.account.root)
        except Exception as e:
            print(f"⚠️ Fejl ved folder scan: {e}")

        return folders

    def search_folder(self, folder_info: dict, keywords: list, days_back: int = 365):
        """Search one folder for items mentioning any of ``keywords``.

        The server-side query selects items received within ``days_back``
        days whose subject or body contains a keyword (at most the 500 most
        recent). Each returned item is then re-checked locally against its
        subject and the first 2000 characters of its text body, which also
        determines the ``keywords`` stored on the record.

        Args:
            folder_info: A dict produced by ``get_folders``.
            keywords: Search terms; an empty list yields no results.
            days_back: Size of the look-back window in days.

        Returns:
            A list of plain-dict email records. Errors are reported on stdout
            and result in a partial (possibly empty) list, never an exception.
        """
        results = []
        if not keywords:
            # Without keywords there is no filter to build.
            return results

        folder = folder_info["folder"]
        fetched = 0
        skipped = 0

        try:
            tz = EWSTimeZone.localzone()
            cutoff = EWSDateTime.from_datetime(
                datetime.now() - timedelta(days=days_back)
            ).astimezone(tz)

            # OR together "subject contains kw" / "body contains kw" clauses.
            keyword_filter = None
            for kw in keywords:
                clause = Q(subject__icontains=kw) | Q(body__icontains=kw)
                if keyword_filter is None:
                    keyword_filter = clause
                else:
                    keyword_filter = keyword_filter | clause

            # Combine with the date filter
            full_filter = Q(datetime_received__gte=cutoff) & keyword_filter

            items = folder.filter(full_filter).order_by('-datetime_received')[:500]

            for item in items:
                fetched += 1
                try:
                    body = str(item.text_body or "")[:2000].lower()
                    matched_keywords = self._match_keywords(
                        str(item.subject or ""), body, keywords
                    )
                    if not matched_keywords:
                        continue

                    # Attachment metadata only; content is never downloaded here.
                    attachments = []
                    try:
                        for att in item.attachments:
                            attachments.append({
                                "name": att.name,
                                "size": att.size or 0,
                                "content_type": getattr(att, 'content_type', 'unknown')
                            })
                            self.stats["attachments"] += 1
                    except Exception:
                        # Keep whatever attachment metadata was readable.
                        pass

                    sender_email = ""
                    sender_name = ""
                    if item.sender:
                        sender_email = item.sender.email_address or ""
                        sender_name = item.sender.name or ""

                    results.append({
                        "id": item.id,
                        "subject": item.subject,
                        "sender": sender_email,
                        "sender_name": sender_name,
                        "received": item.datetime_received.strftime("%Y-%m-%d %H:%M") if item.datetime_received else "",
                        "body_preview": body[:500],
                        "folder": folder_info["path"],
                        "keywords": matched_keywords,
                        "has_attachments": len(attachments) > 0,
                        "attachments": attachments[:5],
                        "importance": str(item.importance) if item.importance else "normal",
                        "categories": list(item.categories) if item.categories else []
                    })

                    self.stats["emails_matched"] += 1

                    if len(results) % 50 == 0:
                        print(f" Processeret {len(results)} matches...", end="\r")

                except Exception:
                    # A single unreadable item must not abort the folder.
                    skipped += 1
                    continue

        except Exception as e:
            # Some folders do not support searching; report it rather than
            # letting the failure look like "no matches".
            print(f" ⚠️ Søgning fejlede i {folder_info.get('path', '?')}: {e}")

        if skipped:
            print(f" ⚠️ {skipped} items sprunget over pga. fejl")

        # Counted outside the try so a mid-iteration failure still records
        # the items that were fetched before it.
        self.stats["emails_found"] += fetched

        return results

    def save_to_neo4j(self, email: dict):
        """Upsert one email record into Neo4j and link it to its data source
        and matched keywords.

        Does nothing when no driver is configured. Database errors are
        reported on stdout and not raised.

        Args:
            email: A record as produced by ``search_folder``.
        """
        if not self.neo4j:
            return

        # MD5 serves only as a stable merge key here, not as a security measure.
        content_hash = hashlib.md5(
            f"{email['subject']}:{email['id']}".encode()
        ).hexdigest()

        try:
            with self.neo4j.session() as session:
                session.run(
                    """
                    MERGE (e:OutlookEmail {contentHash: $hash})
                    ON CREATE SET
                        e.entryId = $id,
                        e.subject = $subject,
                        e.sender = $sender,
                        e.senderName = $senderName,
                        e.received = $received,
                        e.bodyPreview = $body,
                        e.folder = $folder,
                        e.keywords = $keywords,
                        e.hasAttachments = $hasAtt,
                        e.importance = $importance,
                        e.source = 'EWS',
                        e.harvestedAt = datetime()
                    ON MATCH SET
                        e.lastSeen = datetime()

                    MERGE (ds:DataSource {name: 'TDC_Outlook_EWS'})
                    ON CREATE SET ds.type = 'exchange_ews'

                    MERGE (e)-[:HARVESTED_FROM]->(ds)
                    """,
                    hash=content_hash,
                    id=str(email["id"])[:100],
                    subject=email["subject"][:500] if email["subject"] else "",
                    sender=email["sender"],
                    senderName=email["sender_name"],
                    received=email["received"],
                    body=email["body_preview"],
                    folder=email["folder"],
                    keywords=email["keywords"],
                    hasAtt=email["has_attachments"],
                    importance=email["importance"]
                )

                # Keyword relationships
                for kw in email["keywords"]:
                    session.run(
                        """
                        MERGE (k:SearchKeyword {name: $kw})
                        WITH k
                        MATCH (e:OutlookEmail {contentHash: $hash})
                        MERGE (e)-[:MATCHES_KEYWORD]->(k)
                        """,
                        kw=kw,
                        hash=content_hash
                    )

        except Exception as e:
            print(f"⚠️ Neo4j save error: {e}")

    def run(self, days_back: int = 365):
        """Run a full harvest: connect, scan folders, search, persist, report.

        Args:
            days_back: Size of the look-back window in days.

        Returns:
            The list of matched email records, or None when the connection
            could not be established. The Neo4j driver is closed on every
            exit path, including errors.
        """
        try:
            return self._harvest(days_back)
        finally:
            self._close_neo4j()

    def _harvest(self, days_back: int):
        """Do the work of ``run``; driver cleanup is handled by the caller."""
        print("\n" + "=" * 60)
        print("🌐 EWS AUTODISCOVER HARVESTER")
        print(" Exchange Web Services med automatisk server detection")
        print("=" * 60)

        if not self.connect():
            return None

        print("\n📁 Scanner mapper...")
        folders = self.get_folders()
        print(f" Fundet {len(folders)} mapper")

        # Keep only non-empty folders whose name contains none of these
        # fragments (English and Danish names of junk/system folders).
        skip_names = ["junk", "spam", "deleted", "drafts", "outbox",
                      "slettet", "kladder", "udbakke", "sync issues",
                      "conflicts", "local failures", "server failures"]

        relevant_folders = [
            f for f in folders
            if not any(skip.lower() in f["name"].lower() for skip in skip_names)
            and f["total_count"] > 0
        ]

        print(f" Relevante mapper med emails: {len(relevant_folders)}")

        print(f"\n🔍 Søger efter {len(SEARCH_KEYWORDS)} keywords...")
        print(f" Periode: Sidste {days_back} dage")

        all_results = []

        for folder_info in relevant_folders:
            indent = " " * folder_info["level"]
            folder_name = folder_info["name"]
            total = folder_info["total_count"]

            print(f"\n{indent}📂 {folder_name} ({total} items)")

            results = self.search_folder(folder_info, SEARCH_KEYWORDS, days_back)
            self.stats["folders_scanned"] += 1

            if results:
                print(f"{indent} ✅ {len(results)} matches!")

                for email in results:
                    self.save_to_neo4j(email)
                    all_results.append(email)

        # Save local JSON
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_file = self.output_dir / f"ews_harvest_{timestamp}.json"

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                "timestamp": datetime.now().isoformat(),
                "email": self.email,
                "stats": self.stats,
                "keywords": SEARCH_KEYWORDS,
                "emails": all_results
            }, f, indent=2, ensure_ascii=False)

        # Summary
        print("\n" + "=" * 60)
        print("📊 HARVEST COMPLETE")
        print("=" * 60)
        print(f" 📁 Mapper scannet: {self.stats['folders_scanned']}")
        print(f" 📧 Emails matched: {self.stats['emails_matched']}")
        print(f" 📎 Attachments: {self.stats['attachments']}")
        print(f"\n 💾 Saved: {output_file}")
        if self.neo4j:
            print(" 🗄️ Neo4j: Data synced")
        print("=" * 60)

        # Top keywords
        if all_results:
            print("\n🏷️ TOP KEYWORDS:")
            keyword_counts = Counter(
                kw for email in all_results for kw in email["keywords"]
            )
            for kw, count in keyword_counts.most_common(10):
                print(f" • {kw}: {count} emails")

        return all_results


def main():
    """Command-line entry point: gather credentials and run one harvest.

    Exits with status 1 when exchangelib is not installed. The email address
    and password are prompted for when not given as arguments; prompting is
    the safer choice for the password, since command-line arguments can be
    visible to other users of the machine.
    """
    if not _EXCHANGELIB_AVAILABLE:
        _report_missing_exchangelib()
        sys.exit(1)

    parser = argparse.ArgumentParser(description="EWS Outlook Harvester med Autodiscover")
    parser.add_argument("--email", "-e", help="Din TDC email adresse")
    parser.add_argument("--password", "-p", help="Password (eller app password)")
    parser.add_argument("--days", "-d", type=int, default=365, help="Dage tilbage (default: 365)")
    args = parser.parse_args()

    print("\n" + "=" * 60)
    print("🌐 EWS AUTODISCOVER HARVESTER")
    print("=" * 60)

    # Get credentials
    email = args.email
    if not email:
        email = input("\n📧 Din TDC email: ")

    password = args.password
    if not password:
        password = getpass("🔑 Password (eller app password): ")

    if not email or not password:
        print("❌ Email og password kræves!")
        return

    # Run the harvest
    harvester = EWSHarvester(email, password)
    harvester.run(args.days)


if __name__ == "__main__":
    main()