Spaces:
Paused
Paused
| #!/usr/bin/env python3 | |
| """ | |
| 🔓 LOCAL OUTLOOK HARVESTER - Ingen admin nødvendig! | |
| Læser direkte fra din lokale Outlook installation via COM | |
| Krav: Outlook installeret og logget ind med din TDC konto | |
| """ | |
import hashlib
import json
import os
from datetime import datetime, timedelta
from pathlib import Path

import pythoncom
import win32com.client
from neo4j import GraphDatabase
# --- Neo4j connection settings ------------------------------------------------
# Each value is read from an environment variable of the same name so that
# credentials can be supplied without editing this file.
#
# SECURITY NOTE(review): the fallback values below were committed in plain
# text. The password must be treated as compromised: rotate it on the Neo4j
# instance, supply the new one through NEO4J_PASSWORD, and then delete the
# literal fallback. It is kept here only so existing invocations keep working.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get(
    "NEO4J_PASSWORD", "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
)

# Search terms matched (case-insensitively, as substrings) against the subject
# and the beginning of the body of every scanned mail.
SEARCH_KEYWORDS = [
    "strategi",
    "cyber",
    "NIS2",
    "SOC",
    "MDR",
    "cloud",
    "Azure",
    "AI",
    "Copilot",
    "Columbus",
    "ERP",
    "budget",
    "forecast",
    "kunde",
    "kontrakt",
    "rammeaftale",
    "SKI",
    "produkt",
    "CloudKey",
    "arkitektur",
    "roadmap",
]
class LocalOutlookHarvester:
    """Harvest keyword-matching mails from the local Outlook profile via COM.

    The harvester walks the folder tree of every Outlook store, scans recent
    mail items for the configured keywords, stores each match in Neo4j and
    finally writes all matches plus run statistics to a local JSON file.

    An instance is single-use: ``run()`` closes the Neo4j driver and
    uninitialises COM when it finishes, whether it succeeded or not.
    """

    # Value of ``item.Class`` that identifies a mail item (43 = MailItem).
    _MAIL_ITEM_CLASS = 43
    # Maximum number of in-range mails examined per folder.
    _MAX_ITEMS_PER_FOLDER = 1000
    # Sub-folders are expanded only while the current level is below this.
    _MAX_FOLDER_DEPTH = 3
    # Only this many leading characters of the body are searched.
    _BODY_SEARCH_CHARS = 2000
    # Folders whose name contains any of these fragments are not searched.
    _SKIP_FOLDER_NAMES = (
        "Junk", "Spam", "Deleted", "Drafts", "Outbox",
        "Slettet", "Kladder", "Udbakke",
    )

    def __init__(self):
        """Initialise COM, open the Neo4j driver and create the output folder."""
        pythoncom.CoInitialize()
        self.outlook = None
        self.namespace = None
        self.emails = []
        self.stats = {
            "folders_scanned": 0,
            "emails_found": 0,
            "emails_matched": 0,
            "attachments": 0,
        }
        # Neo4j driver; closed again at the end of run().
        self.neo4j = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
        # Directory receiving the JSON dump of each run.
        self.output_dir = Path("data/outlook_local_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def connect(self) -> bool:
        """Attach to the local Outlook application and list its accounts.

        Returns:
            True when the MAPI namespace was obtained and the accounts could
            be listed, False if anything failed (the error is printed).
        """
        try:
            print("🔌 Forbinder til Outlook...")
            self.outlook = win32com.client.Dispatch("Outlook.Application")
            self.namespace = self.outlook.GetNamespace("MAPI")
            # Show the configured accounts so the user can verify the profile.
            accounts = self.namespace.Accounts
            print("✅ Outlook forbundet!")
            print(f" Konti fundet: {accounts.Count}")
            # The Accounts collection is indexed from 1.
            for i in range(1, accounts.Count + 1):
                acc = accounts.Item(i)
                print(f" • {acc.DisplayName} ({acc.SmtpAddress})")
            return True
        except Exception as e:
            print(f"❌ Kunne ikke forbinde til Outlook: {e}")
            print(" Sørg for at Outlook er installeret og kører")
            return False

    def get_folders(self, parent=None, level=0):
        """Recursively collect folder descriptors.

        Args:
            parent: Folder whose children should be listed. ``None`` starts
                from the root folder of every store in the namespace.
            level: Depth assigned to the children of ``parent``.

        Returns:
            A list of dicts with the keys ``name``, ``path``, ``folder`` (the
            COM folder object) and ``level``. Stores or folders that cannot be
            read are left out; this method never raises for COM errors.
        """
        folders = []
        try:
            if parent is None:
                # Top-level call: one entry per store, then its sub-tree.
                for store in self.namespace.Stores:
                    try:
                        root = store.GetRootFolder()
                        folders.append({
                            "name": store.DisplayName,
                            "path": store.DisplayName,
                            "folder": root,
                            "level": 0,
                        })
                        folders.extend(self.get_folders(root, 1))
                    except Exception:
                        # Best effort: an unreadable store is skipped.
                        continue
            else:
                for folder in parent.Folders:
                    try:
                        path = f"{parent.FolderPath}\\{folder.Name}"
                        folders.append({
                            "name": folder.Name,
                            "path": path,
                            "folder": folder,
                            "level": level,
                        })
                        # Limit how deep the tree is expanded.
                        if level < self._MAX_FOLDER_DEPTH:
                            folders.extend(self.get_folders(folder, level + 1))
                    except Exception:
                        # Best effort: an unreadable folder is skipped.
                        continue
        except Exception:
            # Enumerating the collection itself failed; return what we have.
            pass
        return folders

    @staticmethod
    def _is_before_cutoff(received, cutoff) -> bool:
        """Return True when ``received`` lies on a calendar day before ``cutoff``.

        Only year/month/day of ``received`` are read, so the value may be any
        date-like object exposing those attributes. Values without a ``year``
        attribute are never considered too old. Comparing whole days keeps
        every mail from the cutoff day itself inside the search window.
        """
        if not hasattr(received, "year"):
            return False
        received_day = datetime(received.year, received.month, received.day).date()
        return received_day < cutoff.date()

    @staticmethod
    def _match_keywords(subject: str, body: str, keywords: list) -> list:
        """Return the keywords that occur as substrings of subject or body.

        ``subject`` and ``body`` must already be lower-cased; each keyword is
        lower-cased for the comparison but returned in its original spelling.
        """
        matched = []
        for kw in keywords:
            needle = kw.lower()
            if needle in subject or needle in body:
                matched.append(kw)
        return matched

    def _collect_attachments(self, item) -> list:
        """Return name/size dicts for the attachments of ``item``.

        Every attachment read increments ``stats["attachments"]``. If reading
        fails part-way, the attachments gathered so far are returned.
        """
        attachments = []
        try:
            for att in item.Attachments:
                attachments.append({
                    "name": att.FileName,
                    "size": att.Size if hasattr(att, "Size") else 0,
                })
                self.stats["attachments"] += 1
        except Exception:
            # Attachment metadata is optional; keep what was collected.
            pass
        return attachments

    def search_folder(self, folder_info: dict, keywords: list, days_back: int = 365) -> list:
        """Scan one folder for mails matching any of ``keywords``.

        Args:
            folder_info: Descriptor as produced by ``get_folders``; the keys
                ``folder`` and ``path`` are used.
            keywords: Terms searched case-insensitively in the subject and in
                the first part of the body.
            days_back: Mails received on a day earlier than this many days
                ago are ignored.

        Returns:
            One dict per matching mail. Items that raise while being read are
            skipped; a folder-level failure is reported on stdout and the
            matches found up to that point are returned.
        """
        results = []
        folder = folder_info["folder"]
        cutoff = datetime.now() - timedelta(days=days_back)
        try:
            items = folder.Items
            items.Sort("[ReceivedTime]", True)  # newest first
            count = 0
            for item in items:
                try:
                    # Only mail items are of interest.
                    if item.Class != self._MAIL_ITEM_CLASS:
                        continue
                    received = item.ReceivedTime
                    if self._is_before_cutoff(received, cutoff):
                        continue
                    # Normalise once so a missing subject never leaks out as None.
                    subject_text = str(item.Subject or "")
                    subject = subject_text.lower()
                    body = str(item.Body or "")[:self._BODY_SEARCH_CHARS].lower()
                    sender = str(item.SenderEmailAddress or "")
                    matched_keywords = self._match_keywords(subject, body, keywords)
                    if matched_keywords:
                        attachments = self._collect_attachments(item)
                        if hasattr(received, "strftime"):
                            received_text = received.strftime("%Y-%m-%d %H:%M")
                        else:
                            received_text = str(received)
                        results.append({
                            "id": item.EntryID,
                            "subject": subject_text,
                            "sender": sender,
                            "sender_name": str(item.SenderName or ""),
                            "received": received_text,
                            "body_preview": body[:500],
                            "folder": folder_info["path"],
                            "keywords": matched_keywords,
                            "has_attachments": len(attachments) > 0,
                            "attachments": attachments[:5],  # at most 5 are kept
                            "importance": item.Importance,
                            "categories": str(item.Categories or ""),
                        })
                        self.stats["emails_matched"] += 1
                    count += 1
                    self.stats["emails_found"] += 1
                    # Progress indicator, overwritten in place.
                    if count % 100 == 0:
                        print(f" Scannet {count} emails...", end="\r")
                    # Per-folder limit.
                    if count >= self._MAX_ITEMS_PER_FOLDER:
                        break
                except Exception:
                    # A single unreadable item must not abort the folder.
                    continue
        except Exception as exc:
            # Folder could not be opened, sorted or iterated; report it instead
            # of hiding the failure, and keep whatever was matched so far.
            print(f" ⚠️ Fejl under scanning af mappen: {exc}")
        return results

    def save_to_neo4j(self, email: dict):
        """Upsert one harvested mail and its keyword links into Neo4j.

        Args:
            email: Mail dict as produced by ``search_folder``. A ``subject``
                of ``None`` is stored as an empty string.
        """
        # The hash input is left exactly as before so previously stored nodes
        # keep matching on contentHash.
        content_hash = hashlib.md5(f"{email['subject']}:{email['id']}".encode()).hexdigest()
        subject = email["subject"] or ""
        with self.neo4j.session() as session:
            # Create (or touch) the mail node and link it to its data source.
            session.run(
                """
                MERGE (e:OutlookEmail {contentHash: $hash})
                ON CREATE SET
                    e.entryId = $id,
                    e.subject = $subject,
                    e.sender = $sender,
                    e.senderName = $senderName,
                    e.received = $received,
                    e.bodyPreview = $body,
                    e.folder = $folder,
                    e.keywords = $keywords,
                    e.hasAttachments = $hasAtt,
                    e.importance = $importance,
                    e.harvestedAt = datetime()
                ON MATCH SET
                    e.lastSeen = datetime()
                MERGE (ds:DataSource {name: 'TDC_Outlook_Local'})
                ON CREATE SET ds.type = 'local_exchange'
                MERGE (e)-[:HARVESTED_FROM]->(ds)
                """,
                hash=content_hash,
                id=email["id"],
                subject=subject[:500],
                sender=email["sender"],
                senderName=email["sender_name"],
                received=email["received"],
                body=email["body_preview"],
                folder=email["folder"],
                keywords=email["keywords"],
                hasAtt=email["has_attachments"],
                importance=email["importance"],
            )
            # One relationship per matched keyword.
            for kw in email["keywords"]:
                session.run(
                    """
                    MERGE (k:SearchKeyword {name: $kw})
                    WITH k
                    MATCH (e:OutlookEmail {contentHash: $hash})
                    MERGE (e)-[:MATCHES_KEYWORD]->(k)
                    """,
                    kw=kw,
                    hash=content_hash,
                )

    def _relevant_folders(self, folders: list) -> list:
        """Return the folders whose name contains none of the skip fragments."""
        return [
            f for f in folders
            if not any(skip in f["name"] for skip in self._SKIP_FOLDER_NAMES)
        ]

    def _write_json(self, all_results: list) -> Path:
        """Dump statistics, keywords and matches to a timestamped JSON file."""
        output_file = self.output_dir / f"outlook_harvest_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                "timestamp": datetime.now().isoformat(),
                "stats": self.stats,
                "keywords": SEARCH_KEYWORDS,
                "emails": all_results,
            }, f, indent=2, ensure_ascii=False)
        return output_file

    def _print_summary(self, output_file, all_results: list):
        """Print run statistics and the ten most frequently matched keywords."""
        print("\n" + "=" * 60)
        print("📊 HARVEST COMPLETE")
        print("=" * 60)
        print(f" 📁 Mapper scannet: {self.stats['folders_scanned']}")
        print(f" 📧 Emails gennemgået: {self.stats['emails_found']}")
        print(f" ✅ Emails matched: {self.stats['emails_matched']}")
        print(f" 📎 Attachments: {self.stats['attachments']}")
        print(f"\n 💾 Saved: {output_file}")
        print("=" * 60)
        if all_results:
            print("\n🏷️ TOP KEYWORDS:")
            keyword_counts = {}
            for email in all_results:
                for kw in email["keywords"]:
                    keyword_counts[kw] = keyword_counts.get(kw, 0) + 1
            for kw, count in sorted(keyword_counts.items(), key=lambda x: -x[1])[:10]:
                print(f" • {kw}: {count} emails")

    def run(self, days_back: int = 365):
        """Run a full harvest: connect, scan folders, store and report.

        Args:
            days_back: Size of the search window in days.

        The Neo4j driver is closed and COM is uninitialised on every exit
        path, including a failed connection or an exception part-way through.
        """
        print("\n" + "=" * 60)
        print("🔓 LOCAL OUTLOOK HARVESTER")
        print(" Ingen admin nødvendig!")
        print("=" * 60)
        try:
            if not self.connect():
                return
            # Collect all folders.
            print("\n📁 Scanner mapper...")
            folders = self.get_folders()
            print(f" Fundet {len(folders)} mapper")
            # Drop junk/drafts/outbox style folders.
            relevant_folders = self._relevant_folders(folders)
            print(f" Relevante mapper: {len(relevant_folders)}")
            # Search each remaining folder.
            print(f"\n🔍 Søger efter {len(SEARCH_KEYWORDS)} keywords...")
            print(f" Periode: Sidste {days_back} dage")
            all_results = []
            for folder_info in relevant_folders:
                indent = " " * folder_info["level"]
                print(f"\n{indent}📂 {folder_info['name']}")
                results = self.search_folder(folder_info, SEARCH_KEYWORDS, days_back)
                self.stats["folders_scanned"] += 1
                if results:
                    print(f"{indent} ✅ {len(results)} matches!")
                    for email in results:
                        self.save_to_neo4j(email)
                        all_results.append(email)
            # Local JSON copy of the run.
            output_file = self._write_json(all_results)
            self._print_summary(output_file, all_results)
        finally:
            # Always release the driver and the COM apartment.
            self.neo4j.close()
            pythoncom.CoUninitialize()
if __name__ == "__main__":
    # Script entry point: read the look-back window from the command line
    # and perform a single harvest with it.
    import argparse

    cli = argparse.ArgumentParser(description="Local Outlook Harvester")
    cli.add_argument(
        "--days",
        type=int,
        default=365,
        help="Dage tilbage at søge (default: 365)",
    )
    options = cli.parse_args()

    harvester = LocalOutlookHarvester()
    harvester.run(options.days)