widgettdc-api / apps /backend /python /outlook_ews_harvest.py
Kraft102's picture
Update backend source
34367da verified
#!/usr/bin/env python3
"""
🌐 EWS AUTODISCOVER HARVESTER
Forbinder til Exchange via EWS med autodiscover
Krav: Din TDC email + password (evt. app password)
"""
import hashlib
import json
import os
from datetime import datetime, timedelta
from getpass import getpass
from pathlib import Path
# EWS library (required): nothing can be harvested without exchangelib, so
# print installation instructions and stop.
try:
    from exchangelib import (
        Credentials, Account, Configuration, DELEGATE,
        EWSDateTime, EWSTimeZone, Q
    )
    from exchangelib.autodiscover import Autodiscover
except ImportError:
    print("❌ exchangelib ikke installeret!")
    print(" Kør: pip install exchangelib")
    # ``exit`` is a helper injected by the ``site`` module and is missing
    # under ``python -S`` or in frozen builds; SystemExit always works.
    raise SystemExit(1)

# Neo4j driver (optional): when it is missing, GraphDatabase is set to None
# and results are only stored locally.
try:
    from neo4j import GraphDatabase
except ImportError:
    print("⚠️ neo4j ikke installeret - gemmer kun lokalt")
    GraphDatabase = None
# Neo4j connection settings. Each value can be overridden with an environment
# variable of the same name; the literals remain only as fallbacks so that
# existing deployments keep working unchanged.
# SECURITY: the fallback password is a live credential committed to source
# control. Rotate it and remove the fallback once every deployment provides
# NEO4J_PASSWORD through the environment.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get(
    "NEO4J_PASSWORD", "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
)
# Search terms for TDC-internal data. Order is significant: matched keywords
# are reported in the order they appear here.
SEARCH_KEYWORDS = [
    "strategi",
    "cyber",
    "NIS2",
    "SOC",
    "MDR",
    "cloud",
    "Azure",
    "AI",
    "Copilot",
    "Columbus",
    "ERP",
    "budget",
    "forecast",
    "kunde",
    "kontrakt",
    "rammeaftale",
    "SKI",
    "produkt",
    "CloudKey",
    "arkitektur",
    "roadmap",
    "security",
    "incident",
    "TDC NET",
    "Nuuday",
    "partner",
    "projekt",
]
class EWSHarvester:
    """Harvest keyword-matching emails from an Exchange mailbox over EWS.

    The mailbox is located with autodiscover, its folders are searched for
    the given keywords, and every match is written to a local JSON file and,
    when a Neo4j driver is available, merged into the graph database.
    """

    # Folders are listed down to this level (root is level 0); children of
    # folders at this level are not visited.
    _MAX_FOLDER_DEPTH = 3
    # Upper bound on the number of items fetched from a single folder.
    _MAX_ITEMS_PER_FOLDER = 500
    # Only this many leading body characters are checked for keywords locally.
    _BODY_SCAN_CHARS = 2000
    # Length of the body excerpt stored with each match.
    _BODY_PREVIEW_CHARS = 500
    # At most this many attachment descriptions are stored per match.
    _MAX_ATTACHMENTS_STORED = 5

    def __init__(self, email: str, password: str):
        """Store the credentials, open the Neo4j driver and create the output dir.

        Args:
            email: Primary SMTP address of the mailbox; also used as username.
            password: Account password (or app password).
        """
        self.email = email
        self.password = password
        self.account = None  # populated by connect()
        self.emails = []
        self.stats = {
            "folders_scanned": 0,
            "emails_found": 0,
            "emails_matched": 0,
            "attachments": 0
        }

        # Neo4j is optional: GraphDatabase is falsy when the driver package
        # could not be imported, and a failed driver setup is only reported.
        self.neo4j = None
        if GraphDatabase:
            try:
                self.neo4j = GraphDatabase.driver(
                    NEO4J_URI,
                    auth=(NEO4J_USER, NEO4J_PASSWORD)
                )
            except Exception as e:
                print(f"⚠️ Neo4j connection failed: {e}")

        # Local output directory for the JSON dumps
        self.output_dir = Path("data/outlook_ews_harvest")
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def connect(self) -> bool:
        """Connect to Exchange through EWS autodiscover.

        Returns:
            True when the account was opened (``self.account`` is then set),
            False when anything went wrong; the error and a few hints are
            printed in that case.
        """
        print("\n🔌 Forbinder via EWS Autodiscover...")
        print(f" Email: {self.email}")
        try:
            credentials = Credentials(
                username=self.email,
                password=self.password
            )
            # Autodiscover locates the Exchange server for the address
            print(" 🔍 Kører autodiscover...")
            self.account = Account(
                primary_smtp_address=self.email,
                credentials=credentials,
                autodiscover=True,
                access_type=DELEGATE
            )
            print("✅ Forbundet til Exchange!")
            print(f" Server: {self.account.protocol.server}")
            print(f" EWS URL: {self.account.protocol.service_endpoint}")
            return True
        except Exception as e:
            print(f"❌ Forbindelse fejlede: {e}")
            print("\n💡 Tips:")
            print(" • Tjek at email/password er korrekt")
            print(" • Prøv med app password hvis MFA er aktivt")
            print(" • TDC kan kræve OAuth2 - se outlook_graph_harvest.py")
            return False

    def get_folders(self):
        """Return the mailbox folder tree as a flat, depth-first list.

        The walk starts at ``self.account.root``. Each entry is a dict with
        the keys ``name``, ``path``, ``folder`` (the folder object itself),
        ``level`` (root is 0) and ``total_count``. A folder that raises while
        being read is reported and skipped; the walk continues with its
        siblings.
        """
        folders = []

        def scan_folder(folder, level=0):
            try:
                folders.append({
                    "name": folder.name,
                    "path": folder.absolute or folder.name,
                    "folder": folder,
                    "level": level,
                    "total_count": folder.total_count or 0
                })
                if level < self._MAX_FOLDER_DEPTH:
                    for child in folder.children:
                        scan_folder(child, level + 1)
            except Exception as e:
                # Was a bare ``except: pass``, which also swallowed
                # KeyboardInterrupt and hid every failure.
                print(f"⚠️ Kunne ikke læse mappe: {e}")

        try:
            scan_folder(self.account.root)
        except Exception as e:
            print(f"⚠️ Fejl ved folder scan: {e}")
        return folders

    def _item_to_record(self, item, folder_path, lowered_keywords):
        """Turn one fetched item into a result dict.

        Args:
            item: Item yielded by the folder query.
            folder_path: Value stored under the ``folder`` key.
            lowered_keywords: ``(keyword, keyword.lower())`` pairs.

        Returns:
            The result dict, or None when no keyword occurs in the subject or
            in the inspected part of the body. Any exception raised while
            reading the item propagates to the caller.
        """
        body = str(item.text_body or "")[:self._BODY_SCAN_CHARS]
        subject_lc = str(item.subject or "").lower()
        body_lc = body.lower()
        matched_keywords = [
            kw for kw, kw_lc in lowered_keywords
            if kw_lc in subject_lc or kw_lc in body_lc
        ]
        if not matched_keywords:
            return None

        attachments = []
        try:
            for att in item.attachments:
                attachments.append({
                    "name": att.name,
                    "size": att.size or 0,
                    "content_type": getattr(att, 'content_type', 'unknown')
                })
        except Exception:
            # Best effort: keep the attachment metadata read so far.
            pass

        sender_email = ""
        sender_name = ""
        if item.sender:
            sender_email = item.sender.email_address or ""
            sender_name = item.sender.name or ""

        received = ""
        if item.datetime_received:
            received = item.datetime_received.strftime("%Y-%m-%d %H:%M")

        record = {
            "id": item.id,
            "subject": item.subject,
            "sender": sender_email,
            "sender_name": sender_name,
            "received": received,
            # Preview keeps the original casing; only the matching copy is
            # lower-cased.
            "body_preview": body[:self._BODY_PREVIEW_CHARS],
            "folder": folder_path,
            "keywords": matched_keywords,
            "has_attachments": bool(attachments),
            "attachments": attachments[:self._MAX_ATTACHMENTS_STORED],
            "importance": str(item.importance) if item.importance else "normal",
            "categories": list(item.categories) if item.categories else []
        }
        # Counted last so attachments of items that fail above are not counted.
        self.stats["attachments"] += len(attachments)
        return record

    def search_folder(self, folder_info: dict, keywords: list, days_back: int = 365):
        """Search one folder for items that mention any of the keywords.

        Args:
            folder_info: Entry produced by :meth:`get_folders`.
            keywords: Terms matched case-insensitively against subject and body.
            days_back: Only items received within this many days are considered.

        Returns:
            A list of result dicts, newest first. Failures of the query
            itself are reported and yield whatever was collected so far.
        """
        results = []
        if not keywords:
            # Nothing to match; also avoids combining the date filter with None.
            return results

        folder = folder_info["folder"]
        folder_path = folder_info.get("path")
        lowered_keywords = [(kw, kw.lower()) for kw in keywords]
        found = 0
        matched = 0
        skipped = 0
        try:
            tz = EWSTimeZone.localzone()
            cutoff = EWSDateTime.from_datetime(
                datetime.now() - timedelta(days=days_back)
            ).astimezone(tz)

            # Server-side filter: received after the cutoff AND any keyword
            # present in subject or body.
            keyword_filter = None
            for kw in keywords:
                q = Q(subject__icontains=kw) | Q(body__icontains=kw)
                keyword_filter = q if keyword_filter is None else keyword_filter | q
            full_filter = Q(datetime_received__gte=cutoff) & keyword_filter

            items = folder.filter(full_filter).order_by(
                '-datetime_received'
            )[:self._MAX_ITEMS_PER_FOLDER]

            for item in items:
                found += 1
                try:
                    record = self._item_to_record(item, folder_path, lowered_keywords)
                except Exception:
                    # Items that cannot be read are skipped.
                    skipped += 1
                    continue
                if record is None:
                    continue
                results.append(record)
                self.stats["emails_matched"] += 1
                matched += 1
                if matched % 50 == 0:
                    print(f" Processeret {matched} matches...", end="\r")
        except Exception as e:
            # May simply be a folder that does not support searching.
            print(f"⚠️ Søgning fejlede i {folder_path}: {e}")

        if skipped:
            print(f"⚠️ {skipped} emails sprunget over pga. fejl")
        # Items returned by the server, whether or not they matched locally.
        self.stats["emails_found"] += found
        return results

    def save_to_neo4j(self, email: dict):
        """Merge one result dict into Neo4j; a no-op without a driver.

        The node is keyed on an MD5 hash of ``subject:id``. Errors are
        printed and not raised.
        """
        if not self.neo4j:
            return
        content_hash = hashlib.md5(
            f"{email['subject']}:{email['id']}".encode()
        ).hexdigest()
        try:
            with self.neo4j.session() as session:
                session.run("""
                    MERGE (e:OutlookEmail {contentHash: $hash})
                    ON CREATE SET
                    e.entryId = $id,
                    e.subject = $subject,
                    e.sender = $sender,
                    e.senderName = $senderName,
                    e.received = $received,
                    e.bodyPreview = $body,
                    e.folder = $folder,
                    e.keywords = $keywords,
                    e.hasAttachments = $hasAtt,
                    e.importance = $importance,
                    e.source = 'EWS',
                    e.harvestedAt = datetime()
                    ON MATCH SET
                    e.lastSeen = datetime()
                    MERGE (ds:DataSource {name: 'TDC_Outlook_EWS'})
                    ON CREATE SET ds.type = 'exchange_ews'
                    MERGE (e)-[:HARVESTED_FROM]->(ds)
                    """,
                    hash=content_hash,
                    id=str(email["id"])[:100],
                    subject=email["subject"][:500] if email["subject"] else "",
                    sender=email["sender"],
                    senderName=email["sender_name"],
                    received=email["received"],
                    body=email["body_preview"],
                    folder=email["folder"],
                    keywords=email["keywords"],
                    hasAtt=email["has_attachments"],
                    importance=email["importance"]
                )
                # One relationship per matched keyword
                for kw in email["keywords"]:
                    session.run("""
                        MERGE (k:SearchKeyword {name: $kw})
                        WITH k
                        MATCH (e:OutlookEmail {contentHash: $hash})
                        MERGE (e)-[:MATCHES_KEYWORD]->(k)
                        """, kw=kw, hash=content_hash)
        except Exception as e:
            print(f"⚠️ Neo4j save error: {e}")

    def run(self, days_back: int = 365):
        """Run a full harvest and return the list of matched emails.

        Returns None when the connection could not be established. The Neo4j
        driver is closed on every exit path, including a failed connection
        and unexpected errors.
        """
        try:
            return self._run_harvest(days_back)
        finally:
            if self.neo4j:
                self.neo4j.close()

    def _run_harvest(self, days_back: int):
        """Connect, search all relevant folders, store and summarise results."""
        print("\n" + "=" * 60)
        print("🌐 EWS AUTODISCOVER HARVESTER")
        print(" Exchange Web Services med automatisk server detection")
        print("=" * 60)
        if not self.connect():
            return None

        print("\n📁 Scanner mapper...")
        folders = self.get_folders()
        print(f" Fundet {len(folders)} mapper")

        # Drop system folders and empty folders
        skip_names = ["junk", "spam", "deleted", "drafts", "outbox",
                      "slettet", "kladder", "udbakke", "sync issues",
                      "conflicts", "local failures", "server failures"]
        relevant_folders = [
            f for f in folders
            if not any(skip.lower() in (f["name"] or "").lower() for skip in skip_names)
            and f["total_count"] > 0
        ]
        print(f" Relevante mapper med emails: {len(relevant_folders)}")

        print(f"\n🔍 Søger efter {len(SEARCH_KEYWORDS)} keywords...")
        print(f" Periode: Sidste {days_back} dage")
        all_results = []
        for folder_info in relevant_folders:
            indent = " " * folder_info["level"]
            folder_name = folder_info["name"]
            total = folder_info["total_count"]
            print(f"\n{indent}📂 {folder_name} ({total} items)")
            results = self.search_folder(folder_info, SEARCH_KEYWORDS, days_back)
            self.stats["folders_scanned"] += 1
            if results:
                print(f"{indent}{len(results)} matches!")
                for email in results:
                    self.save_to_neo4j(email)
                    all_results.append(email)

        # Local JSON dump
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_file = self.output_dir / f"ews_harvest_{timestamp}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                "timestamp": datetime.now().isoformat(),
                "email": self.email,
                "stats": self.stats,
                "keywords": SEARCH_KEYWORDS,
                "emails": all_results
            }, f, indent=2, ensure_ascii=False)

        # Summary
        print("\n" + "=" * 60)
        print("📊 HARVEST COMPLETE")
        print("=" * 60)
        print(f" 📁 Mapper scannet: {self.stats['folders_scanned']}")
        print(f" 📧 Emails matched: {self.stats['emails_matched']}")
        print(f" 📎 Attachments: {self.stats['attachments']}")
        print(f"\n 💾 Saved: {output_file}")
        if self.neo4j:
            print(" 🗄️ Neo4j: Data synced")
        print("=" * 60)

        # Ten most frequent keywords
        if all_results:
            print("\n🏷️ TOP KEYWORDS:")
            keyword_counts = {}
            for email in all_results:
                for kw in email["keywords"]:
                    keyword_counts[kw] = keyword_counts.get(kw, 0) + 1
            for kw, count in sorted(keyword_counts.items(), key=lambda x: -x[1])[:10]:
                print(f" • {kw}: {count} emails")

        return all_results
def main():
    """Read CLI options, prompt for missing credentials and start a harvest."""
    import argparse

    cli = argparse.ArgumentParser(description="EWS Outlook Harvester med Autodiscover")
    cli.add_argument("--email", "-e", help="Din TDC email adresse")
    cli.add_argument("--password", "-p", help="Password (eller app password)")
    cli.add_argument("--days", "-d", type=int, default=365, help="Dage tilbage (default: 365)")
    options = cli.parse_args()

    banner = "=" * 60
    print("\n" + banner)
    print("🌐 EWS AUTODISCOVER HARVESTER")
    print(banner)

    # Anything not supplied on the command line is asked for interactively;
    # the password prompt does not echo.
    email = options.email or input("\n📧 Din TDC email: ")
    password = options.password or getpass("🔑 Password (eller app password): ")

    if not (email and password):
        print("❌ Email og password kræves!")
        return

    EWSHarvester(email, password).run(options.days)


if __name__ == "__main__":
    main()