Spaces:
Paused
Paused
| #!/usr/bin/env python3 | |
| """ | |
| 🌐 EWS AUTODISCOVER HARVESTER | |
| Forbinder til Exchange via EWS med autodiscover | |
| Krav: Din TDC email + password (evt. app password) | |
| """ | |
import hashlib
import json
import os
from datetime import datetime, timedelta
from getpass import getpass
from pathlib import Path
| # EWS bibliotek | |
| try: | |
| from exchangelib import ( | |
| Credentials, Account, Configuration, DELEGATE, | |
| EWSDateTime, EWSTimeZone, Q | |
| ) | |
| from exchangelib.autodiscover import Autodiscover | |
| except ImportError: | |
| print("❌ exchangelib ikke installeret!") | |
| print(" Kør: pip install exchangelib") | |
| exit(1) | |
| try: | |
| from neo4j import GraphDatabase | |
| except ImportError: | |
| print("⚠️ neo4j ikke installeret - gemmer kun lokalt") | |
| GraphDatabase = None | |
# Neo4j connection settings.
#
# The values are read from the environment so that no secret lives in source
# control. The URI and user keep their previous values as defaults; the
# password has no default on purpose. The password that used to be committed
# on this line must be treated as leaked and rotated on the database side.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")
# Search terms for TDC-internal data.
SEARCH_KEYWORDS = [
    "strategi",
    "cyber",
    "NIS2",
    "SOC",
    "MDR",
    "cloud",
    "Azure",
    "AI",
    "Copilot",
    "Columbus",
    "ERP",
    "budget",
    "forecast",
    "kunde",
    "kontrakt",
    "rammeaftale",
    "SKI",
    "produkt",
    "CloudKey",
    "arkitektur",
    "roadmap",
    "security",
    "incident",
    "TDC NET",
    "Nuuday",
    "partner",
    "projekt",
]
class EWSHarvester:
    """Harvest keyword-matching e-mails from a mailbox via Exchange Web Services.

    The harvester connects with autodiscover, walks the folder tree, runs a
    server-side keyword/date filter per folder, writes the matches to a local
    JSON file and, when a Neo4j driver is available, mirrors them into Neo4j.

    Attributes:
        email: Primary SMTP address used for autodiscover and login.
        password: Password (or app password) for that address.
        account: The connected exchangelib account, ``None`` until
            :meth:`connect` succeeds.
        stats: Running counters (``folders_scanned``, ``emails_found``,
            ``emails_matched``, ``attachments``).
        neo4j: Neo4j driver, or ``None`` when Neo4j is unavailable.
        output_dir: Directory the JSON result file is written to.
    """

    def __init__(self, email: str, password: str):
        """Store the credentials, open the optional Neo4j driver and create the output dir."""
        self.email = email
        self.password = password
        self.account = None
        self.emails = []
        self.stats = {
            "folders_scanned": 0,
            "emails_found": 0,
            "emails_matched": 0,
            "attachments": 0,
        }

        # Neo4j is optional: GraphDatabase is None when the package is missing.
        self.neo4j = None
        if GraphDatabase:
            try:
                self.neo4j = GraphDatabase.driver(
                    NEO4J_URI,
                    auth=(NEO4J_USER, NEO4J_PASSWORD),
                )
            except Exception as e:
                print(f"⚠️ Neo4j connection failed: {e}")

        # Output
        self.output_dir = Path("data/outlook_ews_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def connect(self) -> bool:
        """Connect through EWS autodiscover.

        Returns:
            ``True`` when the account was opened, ``False`` on any failure
            (the error and troubleshooting tips are printed).
        """
        print("\n🔌 Forbinder via EWS Autodiscover...")
        print(f" Email: {self.email}")
        try:
            credentials = Credentials(
                username=self.email,
                password=self.password,
            )
            # Autodiscover locates the Exchange server from the address.
            print(" 🔍 Kører autodiscover...")
            self.account = Account(
                primary_smtp_address=self.email,
                credentials=credentials,
                autodiscover=True,
                access_type=DELEGATE,
            )
            print("✅ Forbundet til Exchange!")
            print(f" Server: {self.account.protocol.server}")
            print(f" EWS URL: {self.account.protocol.service_endpoint}")
            return True
        except Exception as e:
            print(f"❌ Forbindelse fejlede: {e}")
            print("\n💡 Tips:")
            print(" • Tjek at email/password er korrekt")
            print(" • Prøv med app password hvis MFA er aktivt")
            print(" • TDC kan kræve OAuth2 - se outlook_graph_harvest.py")
            return False

    def get_folders(self, max_depth: int = 3):
        """Collect the mail folders reachable from the account root.

        Args:
            max_depth: Deepest level (root is level 0) that is still listed.
                Folders at this level are recorded but not descended into.

        Returns:
            A list of dicts with ``name``, ``path``, ``folder`` (the folder
            object itself), ``level`` and ``total_count``, in depth-first order.
        """
        folders = []

        def scan_folder(folder, level=0):
            # Each folder is handled best-effort so one broken folder does not
            # abort the scan of its siblings.
            try:
                folders.append({
                    "name": folder.name,
                    "path": folder.absolute or folder.name,
                    "folder": folder,
                    "level": level,
                    "total_count": folder.total_count or 0,
                })
                if level < max_depth:
                    for child in folder.children:
                        scan_folder(child, level + 1)
            except Exception as e:
                print(f"⚠️ Fejl ved folder scan: {e}")

        try:
            # Start from the root
            scan_folder(self.account.root)
        except Exception as e:
            print(f"⚠️ Fejl ved folder scan: {e}")
        return folders

    def search_folder(self, folder_info: dict, keywords: list, days_back: int = 365):
        """Search one folder for recent items mentioning any keyword.

        Args:
            folder_info: A folder dict as produced by :meth:`get_folders`
                (``folder`` and ``path`` are used).
            keywords: Terms matched case-insensitively against subject and body.
            days_back: Only items received within this many days are considered.

        Returns:
            A list of record dicts (at most 500 items are fetched per folder).
            Errors are reported and yield a partial or empty list; nothing is
            raised.
        """
        results = []
        if not keywords:
            # Without keywords there is no filter to build.
            return results

        folder = folder_info["folder"]
        try:
            tz = EWSTimeZone.localzone()
            # NOTE(review): datetime.now() is naive; astimezone(tz) presumably
            # treats it as local time - confirm against exchangelib.
            cutoff = EWSDateTime.from_datetime(
                datetime.now() - timedelta(days=days_back)
            ).astimezone(tz)

            # OR together "subject or body contains kw" for every keyword.
            keyword_filter = None
            for kw in keywords:
                q = Q(subject__icontains=kw) | Q(body__icontains=kw)
                keyword_filter = q if keyword_filter is None else keyword_filter | q

            full_filter = Q(datetime_received__gte=cutoff) & keyword_filter
            items = folder.filter(full_filter).order_by('-datetime_received')[:500]

            # Lowercase each keyword once instead of once per item.
            lowered = [(kw, kw.lower()) for kw in keywords]
            count = 0
            skipped = 0
            for item in items:
                self.stats["emails_found"] += 1
                try:
                    record = self._build_record(item, folder_info["path"], lowered)
                except Exception:
                    # Items lacking the expected fields are skipped, not fatal.
                    skipped += 1
                    continue
                if record is None:
                    continue
                results.append(record)
                self.stats["emails_matched"] += 1
                count += 1
                if count % 50 == 0:
                    print(f" Processeret {count} matches...", end="\r")
            if skipped:
                print(f" ⚠️ {skipped} emails sprunget over pga. fejl")
        except Exception as e:
            # Can be e.g. "folder does not support searching"; keep going with
            # whatever was collected so far.
            print(f" ⚠️ Fejl ved søgning i {folder_info['path']}: {e}")
        return results

    def _build_record(self, item, folder_path, lowered_keywords):
        """Turn one fetched item into a record dict, or ``None`` if no keyword matches locally.

        ``lowered_keywords`` is a list of ``(keyword, keyword.lower())`` pairs.
        Only the first 2000 characters of the text body are checked.
        """
        subject = str(item.subject or "").lower()
        body = str(item.text_body or "")[:2000].lower()
        matched_keywords = [
            kw for kw, low in lowered_keywords if low in subject or low in body
        ]
        if not matched_keywords:
            return None

        # Attachment metadata is optional; a failure here must not drop the mail.
        attachments = []
        try:
            for att in item.attachments or []:
                attachments.append({
                    "name": att.name,
                    "size": att.size or 0,
                    "content_type": getattr(att, 'content_type', 'unknown'),
                })
                self.stats["attachments"] += 1
        except Exception:
            pass

        sender_email = ""
        sender_name = ""
        if item.sender:
            sender_email = item.sender.email_address or ""
            sender_name = item.sender.name or ""

        received = ""
        if item.datetime_received:
            received = item.datetime_received.strftime("%Y-%m-%d %H:%M")

        return {
            "id": item.id,
            "subject": item.subject,
            "sender": sender_email,
            "sender_name": sender_name,
            "received": received,
            "body_preview": body[:500],
            "folder": folder_path,
            "keywords": matched_keywords,
            "has_attachments": bool(attachments),
            "attachments": attachments[:5],
            "importance": str(item.importance) if item.importance else "normal",
            "categories": list(item.categories) if item.categories else [],
        }

    def save_to_neo4j(self, email: dict):
        """Upsert one e-mail record and its keyword links into Neo4j.

        Does nothing when no driver is configured. The node is keyed on an
        MD5 of ``"<subject>:<id>"`` (used for de-duplication only). Errors are
        printed, never raised.
        """
        if not self.neo4j:
            return

        content_hash = hashlib.md5(
            f"{email['subject']}:{email['id']}".encode()
        ).hexdigest()

        try:
            with self.neo4j.session() as session:
                session.run(
                    """
                    MERGE (e:OutlookEmail {contentHash: $hash})
                    ON CREATE SET
                        e.entryId = $id,
                        e.subject = $subject,
                        e.sender = $sender,
                        e.senderName = $senderName,
                        e.received = $received,
                        e.bodyPreview = $body,
                        e.folder = $folder,
                        e.keywords = $keywords,
                        e.hasAttachments = $hasAtt,
                        e.importance = $importance,
                        e.source = 'EWS',
                        e.harvestedAt = datetime()
                    ON MATCH SET
                        e.lastSeen = datetime()
                    MERGE (ds:DataSource {name: 'TDC_Outlook_EWS'})
                    ON CREATE SET ds.type = 'exchange_ews'
                    MERGE (e)-[:HARVESTED_FROM]->(ds)
                    """,
                    hash=content_hash,
                    id=str(email["id"])[:100],
                    subject=email["subject"][:500] if email["subject"] else "",
                    sender=email["sender"],
                    senderName=email["sender_name"],
                    received=email["received"],
                    body=email["body_preview"],
                    folder=email["folder"],
                    keywords=email["keywords"],
                    hasAtt=email["has_attachments"],
                    importance=email["importance"],
                )
                # Link all keywords in one round trip instead of one query each.
                session.run(
                    """
                    MATCH (e:OutlookEmail {contentHash: $hash})
                    UNWIND $keywords AS kw
                    MERGE (k:SearchKeyword {name: kw})
                    MERGE (e)-[:MATCHES_KEYWORD]->(k)
                    """,
                    hash=content_hash,
                    keywords=email["keywords"],
                )
        except Exception as e:
            print(f"⚠️ Neo4j save error: {e}")

    def run(self, days_back: int = 365):
        """Run a full harvest.

        Args:
            days_back: How many days back to search.

        Returns:
            The list of matched e-mail records, or ``None`` when the
            connection could not be established. The Neo4j driver is closed
            on every exit path, including failures.
        """
        print("\n" + "=" * 60)
        print("🌐 EWS AUTODISCOVER HARVESTER")
        print(" Exchange Web Services med automatisk server detection")
        print("=" * 60)
        try:
            if not self.connect():
                return None
            return self._harvest(days_back)
        finally:
            # Cleanup
            if self.neo4j:
                self.neo4j.close()

    def _harvest(self, days_back):
        """Scan folders, search them, persist the matches and print a summary."""
        print("\n📁 Scanner mapper...")
        folders = self.get_folders()
        print(f" Fundet {len(folders)} mapper")

        # Skip folders that hold no mail or whose name marks them as noise.
        skip_names = ["junk", "spam", "deleted", "drafts", "outbox",
                      "slettet", "kladder", "udbakke", "sync issues",
                      "conflicts", "local failures", "server failures"]
        relevant_folders = []
        for f in folders:
            name = (f["name"] or "").lower()
            if f["total_count"] > 0 and not any(skip in name for skip in skip_names):
                relevant_folders.append(f)
        print(f" Relevante mapper med emails: {len(relevant_folders)}")

        print(f"\n🔍 Søger efter {len(SEARCH_KEYWORDS)} keywords...")
        print(f" Periode: Sidste {days_back} dage")
        all_results = []
        for folder_info in relevant_folders:
            indent = " " * folder_info["level"]
            folder_name = folder_info["name"]
            total = folder_info["total_count"]
            print(f"\n{indent}📂 {folder_name} ({total} items)")
            results = self.search_folder(folder_info, SEARCH_KEYWORDS, days_back)
            self.stats["folders_scanned"] += 1
            if results:
                print(f"{indent} ✅ {len(results)} matches!")
                for email in results:
                    self.save_to_neo4j(email)
                    all_results.append(email)

        # Save local JSON
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_file = self.output_dir / f"ews_harvest_{timestamp}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                "timestamp": datetime.now().isoformat(),
                "email": self.email,
                "stats": self.stats,
                "keywords": SEARCH_KEYWORDS,
                "emails": all_results,
            }, f, indent=2, ensure_ascii=False)

        # Summary
        print("\n" + "=" * 60)
        print("📊 HARVEST COMPLETE")
        print("=" * 60)
        print(f" 📁 Mapper scannet: {self.stats['folders_scanned']}")
        print(f" 📧 Emails matched: {self.stats['emails_matched']}")
        print(f" 📎 Attachments: {self.stats['attachments']}")
        print(f"\n 💾 Saved: {output_file}")
        if self.neo4j:
            print(" 🗄️ Neo4j: Data synced")
        print("=" * 60)

        # Top keywords
        if all_results:
            print("\n🏷️ TOP KEYWORDS:")
            keyword_counts = {}
            for email in all_results:
                for kw in email["keywords"]:
                    keyword_counts[kw] = keyword_counts.get(kw, 0) + 1
            for kw, count in sorted(keyword_counts.items(), key=lambda x: -x[1])[:10]:
                print(f" • {kw}: {count} emails")

        return all_results
def main():
    """Command-line entry point.

    Reads ``--email``, ``--password`` and ``--days`` from the command line,
    prompts interactively for a missing e-mail or password, and starts a
    harvest. Returns without harvesting when either credential is empty.
    """
    import argparse

    cli = argparse.ArgumentParser(description="EWS Outlook Harvester med Autodiscover")
    cli.add_argument("--email", "-e", help="Din TDC email adresse")
    cli.add_argument("--password", "-p", help="Password (eller app password)")
    cli.add_argument("--days", "-d", type=int, default=365, help="Dage tilbage (default: 365)")
    options = cli.parse_args()

    banner = "=" * 60
    print("\n" + banner)
    print("🌐 EWS AUTODISCOVER HARVESTER")
    print(banner)

    # Fall back to interactive prompts for whatever was not given as a flag.
    email = options.email or input("\n📧 Din TDC email: ")
    password = options.password or getpass("🔑 Password (eller app password): ")
    if not (email and password):
        print("❌ Email og password kræves!")
        return

    # Run the harvest
    EWSHarvester(email, password).run(options.days)


if __name__ == "__main__":
    main()