# Source file: tdc-api/apps/backend/python/sharepoint_harvester.py
# (repo-viewer header preserved as comments: uploaded by Kraft102,
#  "Update backend source", commit 34367da, verified)
#!/usr/bin/env python3
"""
🏢 TDC SharePoint Harvester - MSAL Device Code Flow
Simpel, holdbar løsning der virker med enhver Microsoft konto
"""
import json
import hashlib
import webbrowser
from pathlib import Path
from datetime import datetime
from neo4j import GraphDatabase
try:
import msal
import requests
except ImportError:
print("Installing required packages...")
import subprocess
subprocess.run(["pip", "install", "msal", "requests", "--quiet"])
import msal
import requests
class TDCSharePointHarvester:
"""SharePoint harvester med MSAL device code authentication"""
# Microsoft Graph public client (no app registration needed)
CLIENT_ID = "14d82eec-204b-4c2f-b7e8-296a70dab67e" # Microsoft Graph Explorer
AUTHORITY = "https://login.microsoftonline.com/common"
SCOPES = ["https://graph.microsoft.com/.default"]
GRAPH_URL = "https://graph.microsoft.com/v1.0"
# Neo4j
NEO4J_URI = "neo4j+s://054eff27.databases.neo4j.io"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
# Søgetermer
SEARCH_TERMS = [
"strategi TDC",
"cybersikkerhed",
"cloud strategi",
"AI strategi",
"Columbus ERP",
"budget 2025",
"kundeliste",
"rammeaftale",
"produktkatalog",
"SOC MDR",
"NIS2",
"IT arkitektur"
]
def __init__(self):
self.output_dir = Path("data/sharepoint_harvest")
self.output_dir.mkdir(parents=True, exist_ok=True)
self.access_token = None
self.documents = []
self.stats = {"searches": 0, "documents": 0, "sites": 0}
# Token cache
self.token_cache_file = self.output_dir / "token_cache.json"
# Neo4j
self.neo4j = GraphDatabase.driver(
self.NEO4J_URI, auth=(self.NEO4J_USER, self.NEO4J_PASSWORD)
)
print("🏢 TDC SharePoint Harvester")
print("=" * 50)
def authenticate(self):
"""Authenticate via device code flow"""
print("\n🔐 AUTHENTICATION")
print("-" * 40)
# Load cached token if exists
cache = msal.SerializableTokenCache()
if self.token_cache_file.exists():
cache.deserialize(self.token_cache_file.read_text())
app = msal.PublicClientApplication(
self.CLIENT_ID,
authority=self.AUTHORITY,
token_cache=cache
)
# Try silent auth first
accounts = app.get_accounts()
if accounts:
print(f" Found cached account: {accounts[0].get('username', 'Unknown')}")
result = app.acquire_token_silent(
["Sites.Read.All", "Files.Read.All", "User.Read"],
account=accounts[0]
)
if result and "access_token" in result:
self.access_token = result["access_token"]
print(" ✅ Using cached token")
return True
# Device code flow
print("\n 📱 Device Code Authentication:")
flow = app.initiate_device_flow(
scopes=["Sites.Read.All", "Files.Read.All", "User.Read"]
)
if "user_code" not in flow:
print(f" ❌ Error: {flow.get('error_description', 'Unknown')}")
return False
print(f"\n 🔗 Gå til: {flow['verification_uri']}")
print(f" 📝 Indtast kode: {flow['user_code']}")
print("\n Venter på login...")
# Open browser
webbrowser.open(flow['verification_uri'])
# Wait for auth
result = app.acquire_token_by_device_flow(flow)
if "access_token" in result:
self.access_token = result["access_token"]
# Save cache
if cache.has_state_changed:
self.token_cache_file.write_text(cache.serialize())
# Get user info
headers = {"Authorization": f"Bearer {self.access_token}"}
user = requests.get(f"{self.GRAPH_URL}/me", headers=headers).json()
print(f"\n ✅ Logget ind som: {user.get('displayName', 'Unknown')}")
print(f" Email: {user.get('mail', user.get('userPrincipalName', 'Unknown'))}")
return True
else:
print(f" ❌ Auth failed: {result.get('error_description', 'Unknown')}")
return False
def api_get(self, endpoint: str) -> dict:
"""Make authenticated API call"""
headers = {"Authorization": f"Bearer {self.access_token}"}
response = requests.get(f"{self.GRAPH_URL}{endpoint}", headers=headers)
if response.status_code == 200:
return response.json()
return {}
def api_post(self, endpoint: str, data: dict) -> dict:
"""Make authenticated POST call"""
headers = {
"Authorization": f"Bearer {self.access_token}",
"Content-Type": "application/json"
}
response = requests.post(f"{self.GRAPH_URL}{endpoint}", headers=headers, json=data)
if response.status_code == 200:
return response.json()
return {}
def search(self, query: str) -> list:
"""Search SharePoint via Graph API"""
results = []
search_body = {
"requests": [{
"entityTypes": ["driveItem", "listItem", "site"],
"query": {"queryString": query},
"from": 0,
"size": 25
}]
}
data = self.api_post("/search/query", search_body)
for result_set in data.get("value", []):
for container in result_set.get("hitsContainers", []):
for hit in container.get("hits", []):
resource = hit.get("resource", {})
doc = {
"id": resource.get("id", ""),
"title": resource.get("name", "") or resource.get("displayName", ""),
"url": resource.get("webUrl", ""),
"summary": hit.get("summary", "")[:500],
"type": resource.get("@odata.type", "").split(".")[-1],
"modified": resource.get("lastModifiedDateTime", ""),
"query": query
}
if doc["title"] and doc["url"]:
results.append(doc)
return results
def get_sites(self) -> list:
"""Get accessible SharePoint sites"""
sites = []
data = self.api_get("/sites?search=*")
for site in data.get("value", []):
sites.append({
"id": site.get("id"),
"name": site.get("displayName"),
"url": site.get("webUrl"),
"description": site.get("description", "")
})
self.stats["sites"] += 1
return sites
def get_my_drive(self) -> list:
"""Get OneDrive files"""
files = []
data = self.api_get("/me/drive/root/children")
for item in data.get("value", []):
files.append({
"name": item.get("name"),
"url": item.get("webUrl"),
"type": "folder" if item.get("folder") else "file",
"size": item.get("size", 0),
"modified": item.get("lastModifiedDateTime", "")
})
return files
def save_to_neo4j(self, doc: dict):
"""Save document to Neo4j"""
content_hash = hashlib.md5(
f"{doc.get('title','')}:{doc.get('url','')}".encode()
).hexdigest()
with self.neo4j.session() as session:
session.run("""
MERGE (d:SharePointDocument {contentHash: $hash})
SET d.title = $title,
d.url = $url,
d.summary = $summary,
d.docType = $type,
d.searchQuery = $query,
d.modified = $modified,
d.harvestedAt = datetime()
MERGE (ds:DataSource {name: 'TDC_SharePoint'})
MERGE (d)-[:HARVESTED_FROM]->(ds)
""",
hash=content_hash,
title=doc.get('title', '')[:200],
url=doc.get('url', ''),
summary=doc.get('summary', '')[:1000],
type=doc.get('type', ''),
query=doc.get('query', ''),
modified=doc.get('modified', '')
)
self.stats["documents"] += 1
def run(self):
"""Run full harvest"""
if not self.authenticate():
return
# 1. Get sites
print("\n📍 SHAREPOINT SITES")
print("-" * 40)
sites = self.get_sites()
print(f" Found {len(sites)} sites")
for site in sites[:10]:
print(f" • {site['name']}: {site['url']}")
# 2. Search documents
print("\n🔍 SEARCHING DOCUMENTS")
print("-" * 40)
all_docs = []
seen_urls = set()
for query in self.SEARCH_TERMS:
print(f" Søger: {query}", end="")
results = self.search(query)
self.stats["searches"] += 1
new_count = 0
for doc in results:
if doc["url"] not in seen_urls:
seen_urls.add(doc["url"])
all_docs.append(doc)
self.save_to_neo4j(doc)
new_count += 1
print(f" → {len(results)} results ({new_count} new)")
# 3. OneDrive
print("\n📁 ONEDRIVE FILES")
print("-" * 40)
my_files = self.get_my_drive()
print(f" Found {len(my_files)} items")
# 4. Summary
print("\n" + "=" * 50)
print("📊 HARVEST COMPLETE")
print("=" * 50)
print(f" 🔍 Searches: {self.stats['searches']}")
print(f" 📍 Sites: {self.stats['sites']}")
print(f" 📄 Documents: {self.stats['documents']}")
# Save JSON
output = {
"timestamp": datetime.now().isoformat(),
"stats": self.stats,
"sites": sites,
"documents": all_docs,
"onedrive": my_files
}
output_file = self.output_dir / "sharepoint_harvest.json"
output_file.write_text(json.dumps(output, indent=2, ensure_ascii=False))
print(f"\n📁 Saved: {output_file}")
self.neo4j.close()
if __name__ == "__main__":
harvester = TDCSharePointHarvester()
harvester.run()