Spaces:
Paused
Paused
#!/usr/bin/env python3
"""
🚀 M365 COMPLETE LOCAL HARVESTER
================================
Harvester for ALL Microsoft 365 services via local methods:
- Outlook (COM)
- Teams (local cache + LevelDB)
- SharePoint (OneDrive sync)
- OneDrive (local sync folder)
- Loop (local cache)
No Azure AD app registration required — uses local files and COM!
"""
| import os | |
| import sys | |
| import json | |
| import sqlite3 | |
| import hashlib | |
| import shutil | |
| import struct | |
| from pathlib import Path | |
| from datetime import datetime, timedelta | |
| from dataclasses import dataclass, asdict | |
| from typing import List, Dict, Optional, Any | |
| from concurrent.futures import ThreadPoolExecutor | |
| import re | |
| # Neo4j | |
| from neo4j import GraphDatabase | |
# ============================================================
# CONFIGURATION
# ============================================================
# SECURITY NOTE(review): database credentials are hardcoded below. Move them
# to environment variables or a secrets store before sharing/deploying.
NEO4J_URI = "neo4j+s://054eff27.databases.neo4j.io"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
# Local Windows profile paths (fall back to ~ when the env vars are absent)
USER_HOME = Path(os.environ.get("USERPROFILE", os.path.expanduser("~")))
APPDATA_LOCAL = Path(os.environ.get("LOCALAPPDATA", USER_HOME / "AppData" / "Local"))
APPDATA_ROAMING = Path(os.environ.get("APPDATA", USER_HOME / "AppData" / "Roaming"))
# Well-known local cache / sync-folder locations for each M365 service
PATHS = {
    "teams_cache": APPDATA_ROAMING / "Microsoft" / "Teams",  # classic Teams client
    "teams_new": APPDATA_LOCAL / "Packages" / "MSTeams_8wekyb3d8bbwe" / "LocalCache" / "Microsoft" / "MSTeams",  # new (Store) Teams
    "onedrive": USER_HOME / "OneDrive - TDC",
    "onedrive_business": USER_HOME / "OneDrive - TDC Holding A_S",
    "sharepoint_sync": USER_HOME / "TDC Holding A_S",
    "loop_cache": APPDATA_LOCAL / "Microsoft" / "Loop",
    "outlook_cache": APPDATA_LOCAL / "Microsoft" / "Outlook",
    "edge_profile": APPDATA_LOCAL / "Microsoft" / "Edge" / "User Data" / "Default",
}
# Search terms: an item is harvested only if its text contains at least one
# of these keywords (case-insensitive substring match, see match_keywords).
SEARCH_KEYWORDS = [
    "strategi", "cyber", "NIS2", "SOC", "MDR", "cloud", "Azure", "AI",
    "Copilot", "Columbus", "ERP", "budget", "forecast", "kunde", "kontrakt",
    "rammeaftale", "SKI", "produkt", "CloudKey", "arkitektur", "roadmap",
    "projekt", "meeting", "beslutning", "action", "deadline"
]
| # ============================================================ | |
| # DATA CLASSES | |
| # ============================================================ | |
@dataclass
class HarvestedItem:
    """A single harvested item, normalized across all M365 sources.

    BUGFIX: the ``@dataclass`` decorator was missing. The rest of this file
    constructs instances with keyword arguments and serializes them with
    ``dataclasses.asdict`` — both of which fail on a plain class with only
    annotations, so the decorator is required for the script to run at all.
    """
    id: str                   # stable identifier (EntryID, content hash, ...)
    source: str               # outlook | teams | sharepoint | onedrive | loop
    item_type: str            # email | message | file | document | note
    title: str
    content_preview: str      # first ~500 chars of the body/content
    author: str
    timestamp: str            # best-effort "YYYY-MM-DD HH:MM" or raw string
    path: str                 # folder / file path the item came from
    keywords: List[str]       # SEARCH_KEYWORDS entries that matched
    metadata: Dict[str, Any]  # source-specific extras
| # ============================================================ | |
| # BASE HARVESTER | |
| # ============================================================ | |
class BaseHarvester:
    """Common scaffolding shared by every source-specific harvester."""

    def __init__(self, neo4j_driver):
        # One shared driver; a fresh session is opened per save.
        self.neo4j = neo4j_driver
        self.items: List[HarvestedItem] = []
        self.stats = {"found": 0, "matched": 0, "errors": 0}

    def match_keywords(self, text: str) -> List[str]:
        """Return the SEARCH_KEYWORDS entries occurring in *text*.

        Matching is a case-insensitive substring test; falsy input
        yields an empty list.
        """
        if not text:
            return []
        haystack = text.lower()
        return [term for term in SEARCH_KEYWORDS if term.lower() in haystack]

    def save_to_neo4j(self, item: HarvestedItem):
        """Upsert *item* as an :M365Item node.

        Dedup key is md5("<source>:<id>"). The item is linked to its
        DataSource node and to one :SearchKeyword node per matched keyword.
        """
        content_hash = hashlib.md5(f"{item.source}:{item.id}".encode()).hexdigest()
        params = {
            "hash": content_hash,
            "id": item.id,
            "source": item.source,
            "itemType": item.item_type,
            "title": item.title[:500] if item.title else "",
            "preview": item.content_preview[:2000] if item.content_preview else "",
            "author": item.author,
            "timestamp": item.timestamp,
            "path": item.path,
            "keywords": item.keywords,
            "dsName": f"TDC_M365_{item.source.capitalize()}",
        }
        with self.neo4j.session() as session:
            session.run("""
            MERGE (i:M365Item {contentHash: $hash})
            ON CREATE SET
                i.itemId = $id,
                i.source = $source,
                i.itemType = $itemType,
                i.title = $title,
                i.contentPreview = $preview,
                i.author = $author,
                i.timestamp = $timestamp,
                i.path = $path,
                i.keywords = $keywords,
                i.harvestedAt = datetime()
            ON MATCH SET
                i.lastSeen = datetime()
            MERGE (ds:DataSource {name: $dsName})
            ON CREATE SET ds.type = 'local_m365'
            MERGE (i)-[:HARVESTED_FROM]->(ds)
            """, **params)
            # One MATCHES_KEYWORD relationship per matched keyword.
            for kw in item.keywords:
                session.run("""
                MERGE (k:SearchKeyword {name: $kw})
                WITH k
                MATCH (i:M365Item {contentHash: $hash})
                MERGE (i)-[:MATCHES_KEYWORD]->(k)
                """, kw=kw, hash=content_hash)
| # ============================================================ | |
| # OUTLOOK HARVESTER (COM) | |
| # ============================================================ | |
class OutlookHarvester(BaseHarvester):
    """Harvest Outlook mail through the local COM automation interface."""

    # Outlook OlDefaultFolders / OlObjectClass constants
    OL_FOLDER_INBOX = 6
    OL_FOLDER_SENT = 5
    OL_CLASS_MAIL = 43  # olMail

    def __init__(self, neo4j_driver):
        super().__init__(neo4j_driver)
        self.outlook = None
        self.namespace = None

    def connect(self) -> bool:
        """Attach to a running (or newly started) Outlook via COM.

        Returns True on success; False with a printed reason otherwise
        (e.g. pywin32 missing or Outlook not installed).
        """
        try:
            import win32com.client
            import pythoncom
            pythoncom.CoInitialize()
            print(" 🔌 Forbinder til Outlook...")
            self.outlook = win32com.client.Dispatch("Outlook.Application")
            self.namespace = self.outlook.GetNamespace("MAPI")
            accounts = self.namespace.Accounts
            print(f" ✅ Outlook forbundet! ({accounts.Count} konti)")
            return True
        except Exception as e:
            print(f" ❌ Outlook fejl: {e}")
            return False

    def harvest(self, days_back: int = 90) -> List[HarvestedItem]:
        """Scan Inbox and Sent Items for keyword-matching mail.

        BUGFIX: the previous folder list ("Inbox", "Sent Items", "Sendt post")
        mapped both Sent entries to GetDefaultFolder(5), so the Sent folder
        was scanned twice and every match duplicated. Each default folder is
        now scanned exactly once. "found" now counts every mail examined
        (not only matches) so the matched/found ratio is meaningful.
        """
        if not self.connect():
            return []
        print(" 📧 Scanner emails...")
        cutoff = datetime.now() - timedelta(days=days_back)
        folders_to_scan = [
            ("Inbox", self.OL_FOLDER_INBOX),
            ("Sent Items", self.OL_FOLDER_SENT),
        ]
        for folder_name, folder_id in folders_to_scan:
            try:
                folder = self.namespace.GetDefaultFolder(folder_id)
                items = folder.Items
                items.Sort("[ReceivedTime]", True)  # newest first -> early break
                count = 0
                for item in items:
                    try:
                        if item.Class != self.OL_CLASS_MAIL:
                            continue
                        received = item.ReceivedTime
                        if hasattr(received, 'year'):
                            item_date = datetime(received.year, received.month, received.day)
                            if item_date < cutoff:
                                break  # sorted descending: the rest is older
                        subject = str(item.Subject or "")
                        body = str(item.Body or "")[:2000]
                        sender = str(item.SenderEmailAddress or "")
                        keywords = self.match_keywords(f"{subject} {body}")
                        if keywords:
                            harvested = HarvestedItem(
                                id=item.EntryID,
                                source="outlook",
                                item_type="email",
                                title=subject,
                                content_preview=body[:500],
                                author=sender,
                                timestamp=received.strftime("%Y-%m-%d %H:%M") if hasattr(received, 'strftime') else str(received),
                                path=folder_name,
                                keywords=keywords,
                                metadata={"has_attachments": item.Attachments.Count > 0}
                            )
                            self.items.append(harvested)
                            self.save_to_neo4j(harvested)
                            self.stats["matched"] += 1
                        self.stats["found"] += 1
                        count += 1
                        if count >= 500:  # safety cap per folder
                            break
                    except Exception:
                        self.stats["errors"] += 1
                        continue
            except Exception as e:
                print(f" ⚠️ Folder fejl: {e}")
        print(f" 📧 Outlook: {self.stats['matched']}/{self.stats['found']} matched")
        return self.items
| # ============================================================ | |
| # TEAMS HARVESTER (Local Cache) | |
| # ============================================================ | |
class TeamsHarvester(BaseHarvester):
    """Harvest Teams messages out of the locally cached client data."""

    def __init__(self, neo4j_driver):
        super().__init__(neo4j_driver)
        self.cache_path = None

    def find_cache(self) -> Optional[Path]:
        """Locate the Teams cache, preferring the new (Store) client."""
        for key in ("teams_new", "teams_cache"):
            candidate = PATHS[key]
            if candidate.exists():
                self.cache_path = candidate
                return candidate
        return None

    def harvest(self, days_back: int = 90) -> List[HarvestedItem]:
        """Scan LevelDB segments and JSON caches for keyword-matching messages."""
        print(" 💬 Scanner Teams cache...")
        if not self.find_cache():
            print(" ⚠️ Teams cache ikke fundet")
            return []
        print(f" 📁 Cache: {self.cache_path}")
        # LevelDB segments first (capped to keep the scan cheap)
        leveldb_files = list(self.cache_path.rglob("*.ldb")) + list(self.cache_path.rglob("*.log"))
        for db_file in leveldb_files[:20]:
            try:
                self._parse_leveldb_file(db_file)
            except Exception:
                self.stats["errors"] += 1
        # Then plain JSON cache files
        for json_file in list(self.cache_path.rglob("*.json"))[:50]:
            try:
                self._parse_json_cache(json_file)
            except Exception:
                self.stats["errors"] += 1
        print(f" 💬 Teams: {self.stats['matched']}/{self.stats['found']} matched")
        return self.items

    def _parse_leveldb_file(self, filepath: Path):
        """Scrape JSON-shaped fragments out of a raw LevelDB segment file."""
        try:
            raw = filepath.read_bytes()
            # Braced spans of plausible size are candidate JSON objects.
            for fragment in re.findall(rb'\{[^{}]{50,5000}\}', raw):
                try:
                    data = json.loads(fragment.decode('utf-8', errors='ignore'))
                    if not any(k in data for k in ('content', 'message', 'body', 'text')):
                        continue
                    body = data.get('content') or data.get('message') or data.get('body') or data.get('text', '')
                    if not (isinstance(body, str) and len(body) > 20):
                        continue
                    hits = self.match_keywords(body)
                    if not hits:
                        continue
                    author = (data.get('from', {}).get('user', {}).get('displayName', 'Unknown')
                              if isinstance(data.get('from'), dict)
                              else str(data.get('from', 'Unknown')))
                    record = HarvestedItem(
                        id=hashlib.md5(body[:100].encode()).hexdigest(),
                        source="teams",
                        item_type="message",
                        title=body[:100],
                        content_preview=body[:500],
                        author=author,
                        timestamp=data.get('createdDateTime', data.get('timestamp', '')),
                        path=str(filepath),
                        keywords=hits,
                        metadata={"channel": data.get('channelId', '')}
                    )
                    self.items.append(record)
                    self.save_to_neo4j(record)
                    self.stats["matched"] += 1
                    self.stats["found"] += 1
                except (json.JSONDecodeError, UnicodeDecodeError):
                    continue
        except Exception:
            pass  # best effort: unreadable segment is skipped silently

    def _parse_json_cache(self, filepath: Path):
        """Load a JSON cache file and mine it recursively for messages."""
        try:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as fh:
                payload = json.load(fh)
            self._extract_messages(payload, str(filepath))
        except Exception:
            pass  # best effort: malformed cache files are skipped

    def _extract_messages(self, data: Any, filepath: str, depth: int = 0):
        """Depth-limited recursive walk collecting message-like dicts."""
        if depth > 5:
            return
        if isinstance(data, list):
            for element in data[:100]:  # cap very long arrays
                self._extract_messages(element, filepath, depth + 1)
            return
        if not isinstance(data, dict):
            return
        text = data.get('content') or data.get('body') or data.get('message')
        if isinstance(text, str) and len(text) > 30:
            hits = self.match_keywords(text)
            if hits:
                record = HarvestedItem(
                    id=hashlib.md5(text[:100].encode()).hexdigest(),
                    source="teams",
                    item_type="message",
                    title=text[:100],
                    content_preview=text[:500],
                    author=str(data.get('from', 'Unknown')),
                    timestamp=str(data.get('createdDateTime', '')),
                    path=filepath,
                    keywords=hits,
                    metadata={}
                )
                self.items.append(record)
                self.save_to_neo4j(record)
                self.stats["matched"] += 1
                self.stats["found"] += 1
        for value in data.values():
            self._extract_messages(value, filepath, depth + 1)
| # ============================================================ | |
| # ONEDRIVE / SHAREPOINT HARVESTER (Local Sync) | |
| # ============================================================ | |
class OneDriveSharePointHarvester(BaseHarvester):
    """Harvest documents from locally synced OneDrive / SharePoint folders."""

    # File types we attempt to preview
    EXTENSIONS = {'.docx', '.xlsx', '.pptx', '.pdf', '.txt', '.md', '.doc', '.xls', '.ppt'}

    def __init__(self, neo4j_driver):
        super().__init__(neo4j_driver)
        self.sync_paths: List[Path] = []

    def find_sync_folders(self) -> List[Path]:
        """Collect every OneDrive/SharePoint sync folder that exists locally."""
        self.sync_paths = []
        # Known, preconfigured locations first
        for key in ["onedrive", "onedrive_business", "sharepoint_sync"]:
            path = PATHS.get(key)
            if path and path.exists():
                self.sync_paths.append(path)
                print(f" 📁 Fundet: {path}")
        # Then any other "OneDrive*" folder directly under the user profile
        for entry in USER_HOME.iterdir():
            if entry.is_dir() and "OneDrive" in entry.name and entry not in self.sync_paths:
                self.sync_paths.append(entry)
                print(f" 📁 Fundet: {entry}")
        return self.sync_paths

    def harvest(self, days_back: int = 90) -> List[HarvestedItem]:
        """Scan sync folders for recently modified, keyword-matching documents.

        "found" counts every candidate file examined (right extension, recent
        enough); "matched" counts those with at least one keyword hit.
        """
        print(" 📂 Scanner OneDrive/SharePoint...")
        self.find_sync_folders()
        if not self.sync_paths:
            print(" ⚠️ Ingen sync folders fundet")
            return []
        cutoff = datetime.now() - timedelta(days=days_back)
        for sync_path in self.sync_paths:
            # Folders synced from SharePoint are named after the tenant.
            source = "sharepoint" if "SharePoint" in str(sync_path) or "TDC Holding" in str(sync_path) else "onedrive"
            for filepath in sync_path.rglob("*"):
                try:
                    if not filepath.is_file():
                        continue
                    if filepath.suffix.lower() not in self.EXTENSIONS:
                        continue
                    stat = filepath.stat()  # hoisted: used for mtime and size
                    mtime = datetime.fromtimestamp(stat.st_mtime)
                    if mtime < cutoff:
                        continue
                    content_preview = self._extract_content(filepath)
                    keywords = self.match_keywords(f"{filepath.name} {content_preview}")
                    if keywords:
                        item = HarvestedItem(
                            id=hashlib.md5(str(filepath).encode()).hexdigest(),
                            source=source,
                            item_type="document",
                            title=filepath.name,
                            content_preview=content_preview[:500],
                            author="",  # could be pulled from document metadata
                            timestamp=mtime.strftime("%Y-%m-%d %H:%M"),
                            path=str(filepath.relative_to(sync_path)),
                            keywords=keywords,
                            metadata={
                                "size": stat.st_size,
                                "extension": filepath.suffix,
                                "full_path": str(filepath)
                            }
                        )
                        self.items.append(item)
                        self.save_to_neo4j(item)
                        self.stats["matched"] += 1
                    self.stats["found"] += 1
                except Exception:
                    self.stats["errors"] += 1
        print(f" 📂 OneDrive/SharePoint: {self.stats['matched']}/{self.stats['found']} matched")
        return self.items

    @staticmethod
    def _strip_xml(xml_text: str) -> str:
        """Drop XML tags and collapse whitespace runs to single spaces."""
        return re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', ' ', xml_text))

    def _extract_content(self, filepath: Path) -> str:
        """Best-effort plain-text preview of *filepath* (max ~2000 chars)."""
        try:
            suffix = filepath.suffix.lower()
            if suffix in ('.txt', '.md'):
                with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                    return f.read()[:2000]
            if suffix == '.docx':
                return self._extract_docx(filepath)
            if suffix == '.xlsx':
                return self._extract_xlsx(filepath)
            if suffix == '.pptx':
                return self._extract_pptx(filepath)
            if suffix == '.pdf':
                return self._extract_pdf(filepath)
        except Exception:
            pass  # best effort: unreadable files simply yield no preview
        return ""

    def _extract_docx(self, filepath: Path) -> str:
        """Extract text from a DOCX (the main document XML part)."""
        import zipfile
        try:
            with zipfile.ZipFile(filepath, 'r') as z:
                xml_content = z.read('word/document.xml').decode('utf-8', errors='ignore')
            return self._strip_xml(xml_content)[:2000]
        except Exception:  # was a bare except: — narrowed
            return ""

    def _extract_xlsx(self, filepath: Path) -> str:
        """Extract text from an XLSX via its shared-strings table."""
        import zipfile
        try:
            with zipfile.ZipFile(filepath, 'r') as z:
                if 'xl/sharedStrings.xml' in z.namelist():
                    xml_content = z.read('xl/sharedStrings.xml').decode('utf-8', errors='ignore')
                    return self._strip_xml(xml_content)[:2000]
        except Exception:  # was a bare except: — narrowed
            pass
        return ""

    def _extract_pptx(self, filepath: Path) -> str:
        """Extract text from every slide XML inside a PPTX."""
        import zipfile
        try:
            texts = []
            with zipfile.ZipFile(filepath, 'r') as z:
                for name in z.namelist():
                    if name.startswith('ppt/slides/slide') and name.endswith('.xml'):
                        xml_content = z.read(name).decode('utf-8', errors='ignore')
                        # Whitespace is now collapsed like the other formats.
                        texts.append(self._strip_xml(xml_content))
            return ' '.join(texts)[:2000]
        except Exception:  # was a bare except: — narrowed
            return ""

    def _extract_pdf(self, filepath: Path) -> str:
        """Very naive PDF text grab: parenthesised string literals only."""
        try:
            content = filepath.read_bytes()
            literals = re.findall(rb'\(([^)]+)\)', content)
            return ' '.join(m.decode('utf-8', errors='ignore') for m in literals[:100])[:2000]
        except Exception:  # was a bare except: — narrowed
            return ""
| # ============================================================ | |
| # LOOP HARVESTER (Local Cache) | |
| # ============================================================ | |
class LoopHarvester(BaseHarvester):
    """Harvest Microsoft Loop notes from the local cache."""

    def __init__(self, neo4j_driver):
        super().__init__(neo4j_driver)

    def harvest(self, days_back: int = 90) -> List[HarvestedItem]:
        """Scan the Loop cache (JSON + SQLite files) for keyword matches.

        ``days_back`` is accepted for interface parity with the other
        harvesters but is not currently used for filtering.
        """
        print(" 🔄 Scanner Loop cache...")
        loop_path = PATHS.get("loop_cache")
        if not loop_path or not loop_path.exists():
            # Fall back to other known install locations
            alt_paths = [
                APPDATA_LOCAL / "Microsoft" / "Loop",
                APPDATA_LOCAL / "Packages" / "Microsoft.MicrosoftLoop_8wekyb3d8bbwe",
                USER_HOME / ".loop",
            ]
            for alt in alt_paths:
                if alt.exists():
                    loop_path = alt
                    break
        if not loop_path or not loop_path.exists():
            print(" ⚠️ Loop cache ikke fundet")
            return []
        print(f" 📁 Loop path: {loop_path}")
        for filepath in loop_path.rglob("*"):
            try:
                if not filepath.is_file():
                    continue
                # Loop persists its data as JSON or SQLite
                if filepath.suffix in ['.json', '.db', '.sqlite']:
                    self._parse_loop_file(filepath)
            except Exception:
                self.stats["errors"] += 1
        print(f" 🔄 Loop: {self.stats['matched']}/{self.stats['found']} matched")
        return self.items

    def _parse_loop_file(self, filepath: Path):
        """Dispatch one cache file to the JSON or SQLite parser."""
        try:
            if filepath.suffix == '.json':
                with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                    data = json.load(f)
                self._extract_loop_content(data, str(filepath))
            elif filepath.suffix in ['.db', '.sqlite']:
                self._parse_sqlite(filepath)
        except Exception:
            pass  # best effort: skip unreadable files

    def _extract_loop_content(self, data: Any, filepath: str, depth: int = 0):
        """Depth-limited recursive walk collecting Loop component content."""
        if depth > 5:
            return
        if isinstance(data, dict):
            content = data.get('content') or data.get('text') or data.get('title')
            if content and isinstance(content, str) and len(content) > 20:
                keywords = self.match_keywords(content)
                if keywords:
                    item = HarvestedItem(
                        id=hashlib.md5(content[:100].encode()).hexdigest(),
                        source="loop",
                        item_type="note",
                        title=content[:100],
                        content_preview=content[:500],
                        author=str(data.get('author', 'Unknown')),
                        timestamp=str(data.get('modifiedTime', '')),
                        path=filepath,
                        keywords=keywords,
                        metadata={"type": data.get('type', 'unknown')}
                    )
                    self.items.append(item)
                    self.save_to_neo4j(item)
                    self.stats["matched"] += 1
                    self.stats["found"] += 1
            for value in data.values():
                self._extract_loop_content(value, filepath, depth + 1)
        elif isinstance(data, list):
            for item in data[:100]:  # cap very long arrays
                self._extract_loop_content(item, filepath, depth + 1)

    def _parse_sqlite(self, filepath: Path):
        """Scan every table of a SQLite database for keyword-matching rows.

        FIXES: the temp copy and the connection are now released in
        ``finally`` (previously both leaked whenever an error occurred
        mid-scan), and table names are double-quoted in the SELECT so
        unusual identifiers from an untrusted local file cannot break
        (or inject into) the statement.
        """
        temp_path = filepath.parent / f"{filepath.name}.tmp"
        conn = None
        try:
            # Copy to a temp file so we don't fight the app's own lock
            shutil.copy2(filepath, temp_path)
            conn = sqlite3.connect(temp_path)
            cursor = conn.cursor()
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
            for (table_name,) in cursor.fetchall():
                try:
                    safe_name = table_name.replace('"', '""')
                    cursor.execute(f'SELECT * FROM "{safe_name}" LIMIT 100')
                    for row in cursor.fetchall():
                        row_text = ' '.join(str(cell) for cell in row if cell)
                        keywords = self.match_keywords(row_text)
                        if keywords and len(row_text) > 50:
                            item = HarvestedItem(
                                id=hashlib.md5(row_text[:100].encode()).hexdigest(),
                                source="loop",
                                item_type="data",
                                title=row_text[:100],
                                content_preview=row_text[:500],
                                author="",
                                timestamp="",
                                path=f"{filepath}:{table_name}",
                                keywords=keywords,
                                metadata={"table": table_name}
                            )
                            self.items.append(item)
                            self.save_to_neo4j(item)
                            self.stats["matched"] += 1
                            self.stats["found"] += 1
                except Exception:
                    continue  # skip tables we cannot read
        except Exception:
            pass  # unreadable/locked database: skip silently (best effort)
        finally:
            if conn is not None:
                conn.close()
            temp_path.unlink(missing_ok=True)
| # ============================================================ | |
| # MAIN M365 HARVESTER | |
| # ============================================================ | |
class M365CompleteHarvester:
    """Orchestrates all source-specific harvesters and aggregates results."""

    def __init__(self):
        self.neo4j = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
        self.harvesters = {}
        self.total_stats = {
            "outlook": {"found": 0, "matched": 0},
            "teams": {"found": 0, "matched": 0},
            "onedrive": {"found": 0, "matched": 0},
            "sharepoint": {"found": 0, "matched": 0},
            "loop": {"found": 0, "matched": 0},
        }
        self.all_items: List[HarvestedItem] = []
        # JSON output directory (created eagerly)
        self.output_dir = Path("data/m365_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def run(self, days_back: int = 90, sources: Optional[List[str]] = None):
        """Run every requested harvester, persist results, print a summary.

        Args:
            days_back: how far back (in days) each harvester should look.
            sources: subset of {"outlook", "teams", "onedrive", "sharepoint",
                "loop"}; defaults to all local sources.
        """
        if sources is None:
            sources = ["outlook", "teams", "onedrive", "loop"]
        print("\n" + "=" * 60)
        print("🚀 M365 COMPLETE LOCAL HARVESTER")
        print("=" * 60)
        print(f" 📅 Periode: Sidste {days_back} dage")
        print(f" 🔍 Keywords: {len(SEARCH_KEYWORDS)}")
        print(f" 📦 Sources: {', '.join(sources)}")
        print("=" * 60)
        if "outlook" in sources:
            print("\n📧 OUTLOOK")
            harvester = OutlookHarvester(self.neo4j)
            self.all_items.extend(harvester.harvest(days_back))
            self.total_stats["outlook"] = harvester.stats
        if "teams" in sources:
            print("\n💬 TEAMS")
            harvester = TeamsHarvester(self.neo4j)
            self.all_items.extend(harvester.harvest(days_back))
            self.total_stats["teams"] = harvester.stats
        if "onedrive" in sources or "sharepoint" in sources:
            print("\n📂 ONEDRIVE / SHAREPOINT")
            harvester = OneDriveSharePointHarvester(self.neo4j)
            items = harvester.harvest(days_back)
            self.all_items.extend(items)
            # Split the combined scan per item source.
            # BUGFIX: "found" was never incremented here, so the summary
            # printed "N / 0" for onedrive and sharepoint. Only matched items
            # are attributable per source, so found == matched in these rows.
            for item in items:
                self.total_stats[item.source]["matched"] += 1
                self.total_stats[item.source]["found"] += 1
        if "loop" in sources:
            print("\n🔄 LOOP")
            harvester = LoopHarvester(self.neo4j)
            self.all_items.extend(harvester.harvest(days_back))
            self.total_stats["loop"] = harvester.stats
        self._save_results()
        self._print_summary()
        self.neo4j.close()

    def _save_results(self):
        """Dump all harvested items and stats to a timestamped JSON file."""
        output_file = self.output_dir / f"m365_harvest_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                "timestamp": datetime.now().isoformat(),
                "stats": self.total_stats,
                "keywords": SEARCH_KEYWORDS,
                "items": [asdict(item) for item in self.all_items]
            }, f, indent=2, ensure_ascii=False)
        print(f"\n💾 Saved: {output_file}")

    def _print_summary(self):
        """Print per-source matched/found counts and the top 10 keywords."""
        print("\n" + "=" * 60)
        print("📊 HARVEST SUMMARY")
        print("=" * 60)
        total_matched = 0
        total_found = 0
        for source, stats in self.total_stats.items():
            matched = stats.get("matched", 0)
            found = stats.get("found", 0)
            total_matched += matched
            total_found += found
            icon = {"outlook": "📧", "teams": "💬", "onedrive": "☁️", "sharepoint": "📁", "loop": "🔄"}.get(source, "📦")
            print(f" {icon} {source.capitalize():12} {matched:5} / {found:5} matched")
        print(" " + "-" * 40)
        print(f" {'TOTAL':15} {total_matched:5} / {total_found:5} matched")
        print("=" * 60)
        if self.all_items:
            print("\n🏷️ TOP KEYWORDS:")
            keyword_counts = {}
            for item in self.all_items:
                for kw in item.keywords:
                    keyword_counts[kw] = keyword_counts.get(kw, 0) + 1
            for kw, count in sorted(keyword_counts.items(), key=lambda x: -x[1])[:10]:
                print(f" • {kw}: {count}")
            print("=" * 60)
| # ============================================================ | |
| # CLI | |
| # ============================================================ | |
def main():
    """CLI entry point: parse arguments and launch the full harvester."""
    import argparse

    parser = argparse.ArgumentParser(description="M365 Complete Local Harvester")
    parser.add_argument("--days", type=int, default=90, help="Dage tilbage (default: 90)")
    parser.add_argument("--sources", nargs="+", default=["outlook", "teams", "onedrive", "loop"],
                        choices=["outlook", "teams", "onedrive", "sharepoint", "loop"],
                        help="Sources at harveste")
    cli_args = parser.parse_args()

    M365CompleteHarvester().run(days_back=cli_args.days, sources=cli_args.sources)


if __name__ == "__main__":
    main()