File size: 12,727 Bytes
34367da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
#!/usr/bin/env python3
"""
🔓 LOCAL OUTLOOK HARVESTER - Ingen admin nødvendig!
Læser direkte fra din lokale Outlook installation via COM

Krav: Outlook installeret og logget ind med din TDC konto
"""
import hashlib
import json
import os
from datetime import datetime, timedelta
from pathlib import Path

import pythoncom
import win32com.client
from neo4j import GraphDatabase

# Neo4j connection settings. Each value is read from the environment variable
# of the same name when it is set; the literals are only fallbacks that keep
# the script working exactly as before when no environment is configured.
#
# SECURITY: the fallback password is a credential hard-coded in source. It
# should be rotated and the fallback removed once NEO4J_PASSWORD is supplied
# through the environment.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get(
    "NEO4J_PASSWORD", "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
)

# Search terms; an email is kept when any of them occurs (case-insensitively,
# as a plain substring) in its subject or body.
SEARCH_KEYWORDS = [
    "strategi",
    "cyber",
    "NIS2",
    "SOC",
    "MDR",
    "cloud",
    "Azure",
    "AI",
    "Copilot",
    "Columbus",
    "ERP",
    "budget",
    "forecast",
    "kunde",
    "kontrakt",
    "rammeaftale",
    "SKI",
    "produkt",
    "CloudKey",
    "arkitektur",
    "roadmap",
]

class LocalOutlookHarvester:
    """Harvest keyword-matching emails from the local Outlook profile via COM.

    The harvester walks the folder tree of every Outlook store, keeps the mail
    items whose subject or body contains one of the search keywords, writes
    each match to Neo4j and dumps all matches to a local JSON file.
    """

    # Folder levels below each store root that get_folders() records.
    MAX_FOLDER_DEPTH = 3
    # Upper bound on in-range mail items examined per folder.
    MAX_EMAILS_PER_FOLDER = 1000
    # Number of leading body characters searched for keywords.
    BODY_SCAN_CHARS = 2000
    # Number of leading body characters stored as the preview.
    BODY_PREVIEW_CHARS = 500
    # Number of attachment entries stored per email.
    MAX_ATTACHMENTS_STORED = 5
    # Value of item.Class that identifies a MailItem.
    MAIL_ITEM_CLASS = 43

    def __init__(self):
        """Initialise COM, the Neo4j driver, the counters and the output dir."""
        pythoncom.CoInitialize()
        self.outlook = None
        self.namespace = None
        self.emails = []
        self.stats = {
            "folders_scanned": 0,
            "emails_found": 0,
            "emails_matched": 0,
            "attachments": 0
        }

        # Neo4j
        self.neo4j = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

        # Output
        self.output_dir = Path("data/outlook_local_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def connect(self) -> bool:
        """Connect to the local Outlook instance and list its accounts.

        Returns:
            True when the MAPI namespace was obtained and the accounts could
            be listed, False when any step raised.
        """
        try:
            print("🔌 Forbinder til Outlook...")
            self.outlook = win32com.client.Dispatch("Outlook.Application")
            self.namespace = self.outlook.GetNamespace("MAPI")

            # Show the configured accounts (the collection is 1-indexed).
            accounts = self.namespace.Accounts
            print("✅ Outlook forbundet!")
            print(f"   Konti fundet: {accounts.Count}")
            for i in range(1, accounts.Count + 1):
                acc = accounts.Item(i)
                print(f"   • {acc.DisplayName} ({acc.SmtpAddress})")

            return True
        except Exception as e:
            print(f"❌ Kunne ikke forbinde til Outlook: {e}")
            print("   Sørg for at Outlook er installeret og kører")
            return False

    def get_folders(self, parent=None, level=0):
        """Recursively collect folders as a flat, depth-first list.

        Args:
            parent: Folder whose sub-folders are listed. ``None`` starts at
                the root folder of every store in the namespace.
            level: Depth recorded for the sub-folders of ``parent``.

        Returns:
            A list of dicts with the keys ``name``, ``path``, ``folder`` (the
            folder object) and ``level``. Store roots get level 0; folders
            deeper than ``MAX_FOLDER_DEPTH`` are not visited. Stores or
            folders that raise while being read are skipped.
        """
        folders = []

        # ``except Exception`` (not a bare ``except``) keeps the best-effort
        # behaviour without swallowing KeyboardInterrupt / SystemExit.
        try:
            if parent is None:
                # Start from the root folder of each store.
                for store in self.namespace.Stores:
                    try:
                        root = store.GetRootFolder()
                        folders.append({
                            "name": store.DisplayName,
                            "path": store.DisplayName,
                            "folder": root,
                            "level": 0
                        })
                        folders.extend(self.get_folders(root, 1))
                    except Exception:
                        continue
            else:
                for folder in parent.Folders:
                    try:
                        path = f"{parent.FolderPath}\\{folder.Name}"
                        folders.append({
                            "name": folder.Name,
                            "path": path,
                            "folder": folder,
                            "level": level
                        })
                        if level < self.MAX_FOLDER_DEPTH:
                            folders.extend(self.get_folders(folder, level + 1))
                    except Exception:
                        continue
        except Exception:
            # Enumeration itself failed; return what was collected so far.
            pass

        return folders

    def search_folder(self, folder_info: dict, keywords: list, days_back: int = 365) -> list:
        """Search one folder for mail items matching any keyword.

        Args:
            folder_info: Entry produced by ``get_folders`` (uses the keys
                ``folder`` and ``path``).
            keywords: Terms matched case-insensitively as plain substrings of
                the subject and of the first ``BODY_SCAN_CHARS`` body
                characters. NOTE(review): substring matching lets short terms
                such as "AI" or "ERP" match inside longer words.
            days_back: Items received on an earlier calendar day than
                ``now - days_back`` are ignored.

        Returns:
            One dict per matching email. Also updates ``self.stats``
            (``emails_found``, ``emails_matched``, ``attachments``).
        """
        results = []
        folder = folder_info["folder"]

        # Lower-case every keyword once instead of once per item.
        lowered_keywords = [(kw, kw.lower()) for kw in keywords]

        # Received times are reduced to their calendar day below, so the
        # cutoff is reduced to midnight as well; otherwise mail from the
        # boundary day would be dropped depending on the current time of day.
        cutoff = (datetime.now() - timedelta(days=days_back)).replace(
            hour=0, minute=0, second=0, microsecond=0
        )

        try:
            items = folder.Items
            items.Sort("[ReceivedTime]", True)  # newest first

            count = 0
            for item in items:
                try:
                    # Only mail items are of interest.
                    if item.Class != self.MAIL_ITEM_CLASS:
                        continue

                    # Date filter; a naive datetime is rebuilt from the parts
                    # so the comparison works whatever tzinfo the value has.
                    received = item.ReceivedTime
                    if hasattr(received, 'year') and datetime(received.year, received.month, received.day) < cutoff:
                        continue

                    # Keep the original casing for storage and lower-case
                    # copies for matching only.
                    subject = str(item.Subject or "")
                    body = str(item.Body or "")[:self.BODY_SCAN_CHARS]
                    sender = str(item.SenderEmailAddress or "")
                    subject_lc = subject.lower()
                    body_lc = body.lower()

                    matched_keywords = [
                        kw for kw, kw_lc in lowered_keywords
                        if kw_lc in subject_lc or kw_lc in body_lc
                    ]

                    if matched_keywords:
                        # Attachment metadata is optional: on failure keep
                        # whatever was read so far.
                        attachments = []
                        try:
                            for att in item.Attachments:
                                attachments.append({
                                    "name": att.FileName,
                                    "size": att.Size if hasattr(att, 'Size') else 0
                                })
                                self.stats["attachments"] += 1
                        except Exception:
                            pass

                        results.append({
                            "id": item.EntryID,
                            "subject": subject,
                            "sender": sender,
                            "sender_name": str(item.SenderName or ""),
                            "received": received.strftime("%Y-%m-%d %H:%M") if hasattr(received, 'strftime') else str(received),
                            "body_preview": body[:self.BODY_PREVIEW_CHARS],
                            "folder": folder_info["path"],
                            "keywords": matched_keywords,
                            "has_attachments": len(attachments) > 0,
                            "attachments": attachments[:self.MAX_ATTACHMENTS_STORED],
                            "importance": item.Importance,
                            "categories": str(item.Categories or "")
                        })
                        self.stats["emails_matched"] += 1

                    count += 1
                    self.stats["emails_found"] += 1

                    # Progress
                    if count % 100 == 0:
                        print(f"      Scannet {count} emails...", end="\r")

                    # Per-folder limit
                    if count >= self.MAX_EMAILS_PER_FOLDER:
                        break

                except Exception:
                    # A single unreadable item must not abort the folder.
                    continue

        except Exception as e:
            # The folder could not be read or sorted; report it instead of
            # hiding the failure, and return whatever was collected.
            print(f"      ⚠️ Kunne ikke læse mappen {folder_info.get('path', '')}: {e}")

        return results

    def save_to_neo4j(self, email: dict):
        """Store one harvested email and its keyword links in Neo4j.

        The node is keyed by an MD5 digest of ``"<subject>:<entry id>"``, so
        saving the same email again only updates ``lastSeen``.

        Args:
            email: Dict as produced by ``search_folder``. A ``None`` subject
                is treated as an empty string.
        """
        # Normalise first: slicing a None subject used to raise TypeError.
        subject = email["subject"] or ""
        content_hash = hashlib.md5(f"{subject}:{email['id']}".encode()).hexdigest()

        with self.neo4j.session() as session:
            # Create the email node and link it to its data source.
            session.run("""
                MERGE (e:OutlookEmail {contentHash: $hash})
                ON CREATE SET
                    e.entryId = $id,
                    e.subject = $subject,
                    e.sender = $sender,
                    e.senderName = $senderName,
                    e.received = $received,
                    e.bodyPreview = $body,
                    e.folder = $folder,
                    e.keywords = $keywords,
                    e.hasAttachments = $hasAtt,
                    e.importance = $importance,
                    e.harvestedAt = datetime()
                ON MATCH SET
                    e.lastSeen = datetime()
                
                MERGE (ds:DataSource {name: 'TDC_Outlook_Local'})
                ON CREATE SET ds.type = 'local_exchange'
                MERGE (e)-[:HARVESTED_FROM]->(ds)
            """,
                hash=content_hash,
                id=email["id"],
                subject=subject[:500],
                sender=email["sender"],
                senderName=email["sender_name"],
                received=email["received"],
                body=email["body_preview"],
                folder=email["folder"],
                keywords=email["keywords"],
                hasAtt=email["has_attachments"],
                importance=email["importance"]
            )

            # Create the keyword relationships.
            for kw in email["keywords"]:
                session.run("""
                    MERGE (k:SearchKeyword {name: $kw})
                    WITH k
                    MATCH (e:OutlookEmail {contentHash: $hash})
                    MERGE (e)-[:MATCHES_KEYWORD]->(k)
                """, kw=kw, hash=content_hash)

    def run(self, days_back: int = 365):
        """Run a full harvest: connect, scan, store, write JSON, summarise.

        The Neo4j driver is closed and COM is uninitialised on every exit
        path, including a failed connection or an unexpected exception.

        Args:
            days_back: How many days back to search.
        """
        print("\n" + "=" * 60)
        print("🔓 LOCAL OUTLOOK HARVESTER")
        print("   Ingen admin nødvendig!")
        print("=" * 60)

        try:
            if not self.connect():
                return

            all_results = self._harvest(days_back)
            output_file = self._write_json(all_results)
            self._print_summary(output_file, all_results)
        finally:
            try:
                self.neo4j.close()
            finally:
                pythoncom.CoUninitialize()

    def _harvest(self, days_back: int) -> list:
        """Scan all relevant folders and return every matching email."""
        # Collect all folders.
        print("\n📁 Scanner mapper...")
        folders = self.get_folders()
        print(f"   Fundet {len(folders)} mapper")

        # Drop folders whose name contains one of these fragments.
        skip_names = ["Junk", "Spam", "Deleted", "Drafts", "Outbox", "Slettet", "Kladder", "Udbakke"]
        relevant_folders = [
            f for f in folders
            if not any(skip in f["name"] for skip in skip_names)
        ]
        print(f"   Relevante mapper: {len(relevant_folders)}")

        # Search each folder.
        print(f"\n🔍 Søger efter {len(SEARCH_KEYWORDS)} keywords...")
        print(f"   Periode: Sidste {days_back} dage")

        all_results = []
        # Cleared after the first failed write so an unreachable database
        # neither aborts the harvest nor delays every remaining email.
        neo4j_available = True

        for folder_info in relevant_folders:
            indent = "  " * folder_info["level"]
            print(f"\n{indent}📂 {folder_info['name']}")

            results = self.search_folder(folder_info, SEARCH_KEYWORDS, days_back)
            self.stats["folders_scanned"] += 1

            if not results:
                continue

            print(f"{indent}{len(results)} matches!")
            for email in results:
                if neo4j_available:
                    try:
                        self.save_to_neo4j(email)
                    except Exception as e:
                        neo4j_available = False
                        print(f"{indent}⚠️ Kunne ikke gemme email i Neo4j: {e}")
                        print(f"{indent}   Fortsætter uden Neo4j - resultater gemmes stadig i JSON")
                all_results.append(email)

        return all_results

    def _write_json(self, all_results: list) -> Path:
        """Write stats, keywords and matches to a timestamped JSON file."""
        output_file = self.output_dir / f"outlook_harvest_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                "timestamp": datetime.now().isoformat(),
                "stats": self.stats,
                "keywords": SEARCH_KEYWORDS,
                "emails": all_results
            }, f, indent=2, ensure_ascii=False)
        return output_file

    def _print_summary(self, output_file, all_results: list):
        """Print the run counters and the ten most frequent keywords."""
        print("\n" + "=" * 60)
        print("📊 HARVEST COMPLETE")
        print("=" * 60)
        print(f"   📁 Mapper scannet:    {self.stats['folders_scanned']}")
        print(f"   📧 Emails gennemgået: {self.stats['emails_found']}")
        print(f"   ✅ Emails matched:    {self.stats['emails_matched']}")
        print(f"   📎 Attachments:       {self.stats['attachments']}")
        print(f"\n   💾 Saved: {output_file}")
        print("=" * 60)

        # Top keywords
        if all_results:
            print("\n🏷️ TOP KEYWORDS:")
            keyword_counts = {}
            for email in all_results:
                for kw in email["keywords"]:
                    keyword_counts[kw] = keyword_counts.get(kw, 0) + 1

            for kw, count in sorted(keyword_counts.items(), key=lambda x: -x[1])[:10]:
                print(f"   • {kw}: {count} emails")


if __name__ == "__main__":
    # Script entry point: read the look-back window from the command line
    # and run one harvest with it.
    import argparse

    cli = argparse.ArgumentParser(description="Local Outlook Harvester")
    cli.add_argument(
        "--days",
        type=int,
        default=365,
        help="Dage tilbage at søge (default: 365)",
    )
    options = cli.parse_args()

    LocalOutlookHarvester().run(options.days)