# Source file: tdc-api/apps/backend/python/sharepoint_harvester.py
# (repo-viewer header preserved as comments: uploaded by Kraft102,
#  "Update backend source", commit 34367da, verified)
#!/usr/bin/env python3
"""
🏢 TDC SharePoint Harvester - MSAL Device Code Flow
Simpel, holdbar løsning der virker med enhver Microsoft konto
"""
import json
import hashlib
import webbrowser
from pathlib import Path
from datetime import datetime
from neo4j import GraphDatabase
try:
import msal
import requests
except ImportError:
print("Installing required packages...")
import subprocess
subprocess.run(["pip", "install", "msal", "requests", "--quiet"])
import msal
import requests
class TDCSharePointHarvester:
"""SharePoint harvester med MSAL device code authentication"""
# Microsoft Graph public client (no app registration needed)
CLIENT_ID = "14d82eec-204b-4c2f-b7e8-296a70dab67e" # Microsoft Graph Explorer
AUTHORITY = "https://login.microsoftonline.com/common"
SCOPES = ["https://graph.microsoft.com/.default"]
GRAPH_URL = "https://graph.microsoft.com/v1.0"
# Neo4j
NEO4J_URI = "neo4j+s://054eff27.databases.neo4j.io"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
# Søgetermer
SEARCH_TERMS = [
"strategi TDC",
"cybersikkerhed",
"cloud strategi",
"AI strategi",
"Columbus ERP",
"budget 2025",
"kundeliste",
"rammeaftale",
"produktkatalog",
"SOC MDR",
"NIS2",
"IT arkitektur"
]
def __init__(self):
self.output_dir = Path("data/sharepoint_harvest")
self.output_dir.mkdir(parents=True, exist_ok=True)
self.access_token = None
self.documents = []
self.stats = {"searches": 0, "documents": 0, "sites": 0}
# Token cache
self.token_cache_file = self.output_dir / "token_cache.json"
# Neo4j
self.neo4j = GraphDatabase.driver(
self.NEO4J_URI, auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
)
print("🏢 TDC SharePoint Harvester")
print("=" * 50)
def authenticate(self):
"""Authenticate via device code flow"""
print("\n🔐 AUTHENTICATION")
print("-" * 40)
# Load cached token if exists
cache = msal.SerializableTokenCache()
if self.token_cache_file.exists():
cache.deserialize(self.token_cache_file.read_text())
app = msal.PublicClientApplication(
self.CLIENT_ID,
authority=self.AUTHORITY,
token_cache=cache
)
# Try silent auth first
accounts = app.get_accounts()
if accounts:
print(f" Found cached account: {accounts[0].get('username', 'Unknown')}")
result = app.acquire_token_silent(
["Sites.Read.All", "Files.Read.All", "User.Read"],
account=accounts[0]
)
if result and "access_token" in result:
self.access_token = result["access_token"]
print(" ✅ Using cached token")
return True
# Device code flow
print("\n 📱 Device Code Authentication:")
flow = app.initiate_device_flow(
scopes=["Sites.Read.All", "Files.Read.All", "User.Read"]
)
if "user_code" not in flow:
print(f" ❌ Error: {flow.get('error_description', 'Unknown')}")
return False
print(f"\n 🔗 Gå til: {flow['verification_uri']}")
print(f" 📝 Indtast kode: {flow['user_code']}")
print("\n Venter på login...")
# Open browser
webbrowser.open(flow['verification_uri'])
# Wait for auth
result = app.acquire_token_by_device_flow(flow)
if "access_token" in result:
self.access_token = result["access_token"]
# Save cache
if cache.has_state_changed:
self.token_cache_file.write_text(cache.serialize())
# Get user info
headers = {"Authorization": f"Bearer {self.access_token}"}
user = requests.get(f"{self.GRAPH_URL}/me", headers=headers).json()
print(f"\n ✅ Logget ind som: {user.get('displayName', 'Unknown')}")
print(f" Email: {user.get('mail', user.get('userPrincipalName', 'Unknown'))}")
return True
else:
print(f" ❌ Auth failed: {result.get('error_description', 'Unknown')}")
return False
def api_get(self, endpoint: str) -> dict:
"""Make authenticated API call"""
headers = {"Authorization": f"Bearer {self.access_token}"}
response = requests.get(f"{self.GRAPH_URL}{endpoint}", headers=headers)
if response.status_code == 200:
return response.json()
return {}
def api_post(self, endpoint: str, data: dict) -> dict:
"""Make authenticated POST call"""
headers = {
"Authorization": f"Bearer {self.access_token}",
"Content-Type": "application/json"
}
response = requests.post(f"{self.GRAPH_URL}{endpoint}", headers=headers, json=data)
if response.status_code == 200:
return response.json()
return {}
def search(self, query: str) -> list:
"""Search SharePoint via Graph API"""
results = []
search_body = {
"requests": [{
"entityTypes": ["driveItem", "listItem", "site"],
"query": {"queryString": query},
"from": 0,
"size": 25
}]
}
data = self.api_post("/search/query", search_body)
for result_set in data.get("value", []):
for container in result_set.get("hitsContainers", []):
for hit in container.get("hits", []):
resource = hit.get("resource", {})
doc = {
"id": resource.get("id", ""),
"title": resource.get("name", "") or resource.get("displayName", ""),
"url": resource.get("webUrl", ""),
"summary": hit.get("summary", "")[:500],
"type": resource.get("@odata.type", "").split(".")[-1],
"modified": resource.get("lastModifiedDateTime", ""),
"query": query
}
if doc["title"] and doc["url"]:
results.append(doc)
return results
def get_sites(self) -> list:
"""Get accessible SharePoint sites"""
sites = []
data = self.api_get("/sites?search=*")
for site in data.get("value", []):
sites.append({
"id": site.get("id"),
"name": site.get("displayName"),
"url": site.get("webUrl"),
"description": site.get("description", "")
})
self.stats["sites"] += 1
return sites
def get_my_drive(self) -> list:
"""Get OneDrive files"""
files = []
data = self.api_get("/me/drive/root/children")
for item in data.get("value", []):
files.append({
"name": item.get("name"),
"url": item.get("webUrl"),
"type": "folder" if item.get("folder") else "file",
"size": item.get("size", 0),
"modified": item.get("lastModifiedDateTime", "")
})
return files
def save_to_neo4j(self, doc: dict):
"""Save document to Neo4j"""
content_hash = hashlib.md5(
f"{doc.get('title','')}:{doc.get('url','')}".encode()
).hexdigest()
with self.neo4j.session() as session:
session.run("""
MERGE (d:SharePointDocument {contentHash: $hash})
SET d.title = $title,
d.url = $url,
d.summary = $summary,
d.docType = $type,
d.searchQuery = $query,
d.modified = $modified,
d.harvestedAt = datetime()
MERGE (ds:DataSource {name: 'TDC_SharePoint'})
MERGE (d)-[:HARVESTED_FROM]->(ds)
""",
hash=content_hash,
title=doc.get('title', '')[:200],
url=doc.get('url', ''),
summary=doc.get('summary', '')[:1000],
type=doc.get('type', ''),
query=doc.get('query', ''),
modified=doc.get('modified', '')
)
self.stats["documents"] += 1
def run(self):
"""Run full harvest"""
if not self.authenticate():
return
# 1. Get sites
print("\n📍 SHAREPOINT SITES")
print("-" * 40)
sites = self.get_sites()
print(f" Found {len(sites)} sites")
for site in sites[:10]:
print(f" • {site['name']}: {site['url']}")
# 2. Search documents
print("\n🔍 SEARCHING DOCUMENTS")
print("-" * 40)
all_docs = []
seen_urls = set()
for query in self.SEARCH_TERMS:
print(f" Søger: {query}", end="")
results = self.search(query)
self.stats["searches"] += 1
new_count = 0
for doc in results:
if doc["url"] not in seen_urls:
seen_urls.add(doc["url"])
all_docs.append(doc)
self.save_to_neo4j(doc)
new_count += 1
print(f" → {len(results)} results ({new_count} new)")
# 3. OneDrive
print("\n📁 ONEDRIVE FILES")
print("-" * 40)
my_files = self.get_my_drive()
print(f" Found {len(my_files)} items")
# 4. Summary
print("\n" + "=" * 50)
print("📊 HARVEST COMPLETE")
print("=" * 50)
print(f" 🔍 Searches: {self.stats['searches']}")
print(f" 📍 Sites: {self.stats['sites']}")
print(f" 📄 Documents: {self.stats['documents']}")
# Save JSON
output = {
"timestamp": datetime.now().isoformat(),
"stats": self.stats,
"sites": sites,
"documents": all_docs,
"onedrive": my_files
}
output_file = self.output_dir / "sharepoint_harvest.json"
output_file.write_text(json.dumps(output, indent=2, ensure_ascii=False))
print(f"\n📁 Saved: {output_file}")
self.neo4j.close()
if __name__ == "__main__":
harvester = TDCSharePointHarvester()
harvester.run()