Spaces:
Running
Running
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| import tempfile | |
| from urllib.parse import urlparse | |
| from dotenv import load_dotenv | |
| import base64 | |
| import json | |
| import os | |
| import re | |
| import shutil | |
| import subprocess | |
| import requests | |
| from requests.auth import HTTPBasicAuth | |
| load_dotenv() | |
| # examples: https://scilifelab.atlassian.net/wiki/spaces/demo/pages/20381827/Template+pages | |
| # and for gitlab: https://gitlab.com/gitlab-examples/cpp-example | |
@dataclass
class DownloadResult:
    """Outcome of downloading a source to the local filesystem.

    The downloaders in this module build instances with keyword arguments,
    which requires the ``@dataclass``-generated ``__init__``.

    Attributes:
        local_path: Directory that holds the downloaded content.
        repository_name: Name of the downloaded source; the downloaders use it
            as the final component of ``local_path``.
        base_url: Web URL associated with the downloaded content (a branch
            blob URL for git hosts, the root page URL for Confluence).
    """

    local_path: Path
    repository_name: str
    base_url: str
| _CONFLUENCE_URL_MAP_FILENAME = ".confluence_url_map.json" | |
| def _sanitize_filename(value: str) -> str: | |
| cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", value).strip("._") | |
| return cleaned or "page" | |
| def _confluence_headers() -> dict[str, str]: | |
| headers = {"Accept": "application/json"} | |
| return headers | |
def _confluence_request(url: str, params: dict | None = None) -> dict:
    """Send an authenticated GET request to Confluence and decode the JSON reply.

    Credentials come from the environment and are sent as HTTP basic auth:
    ``CONFLUENCE_USER_EMAIL`` is the user name and ``ATLASSIAN_API_KEY`` the
    password/token. Missing variables are sent as empty strings.

    Args:
        url: Full URL of the REST endpoint.
        params: Optional query parameters.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: On any non-200 status. 401/403 responses get a message
            that names the environment variables this function actually reads.
    """
    user_email = os.getenv("CONFLUENCE_USER_EMAIL", "").strip()
    api_token = os.getenv("ATLASSIAN_API_KEY", "").strip()
    response = requests.get(
        url,
        params=params,
        headers=_confluence_headers(),
        auth=HTTPBasicAuth(user_email, api_token),
        timeout=30,
    )
    status = response.status_code
    if status in {401, 403}:
        # The hint must point at the variables read above; the response body
        # is included in the error instead of being printed to stdout.
        raise RuntimeError(
            f"Confluence auth failed ({status}). If using an Atlassian API token, "
            "set CONFLUENCE_USER_EMAIL and ATLASSIAN_API_KEY. "
            f"Response: {response.text}"
        )
    if status != 200:
        raise RuntimeError(
            f"Confluence request failed ({status}): {response.text}"
        )
    return response.json()
| def _resolve_confluence_page_url(base_url: str, page: dict) -> str: | |
| links = page.get("_links", {}) if isinstance(page, dict) else {} | |
| webui = str(links.get("webui") or "").strip() | |
| base = str(links.get("base") or base_url).strip() | |
| if webui: | |
| if webui.startswith("http://") or webui.startswith("https://"): | |
| return webui | |
| return f"{base.rstrip('/')}/{webui.lstrip('/')}" | |
| page_id = str(page.get("id") or "").strip() | |
| if page_id: | |
| return f"{base_url.rstrip('/')}/pages/{page_id}" | |
| return base_url | |
def _fetch_confluence_homepage_id(base_url: str, space_key: str) -> str:
    """Look up the id of a space's homepage.

    Queries ``<base_url>/rest/api/space/<space_key>`` with the ``homepage``
    expansion and returns the homepage id as a stripped string, or ``""``
    when the payload carries no id.
    """
    endpoint = f"{base_url}/rest/api/space/{space_key}"
    space = _confluence_request(endpoint, params={"expand": "homepage"})
    raw_id = space.get("homepage", {}).get("id")
    return str(raw_id or "").strip()
def _write_confluence_url_map(dest_path: Path, url_map: dict[str, str]) -> None:
    """Persist the file-name -> page-URL mapping next to the downloaded pages.

    The mapping is written as indented, ASCII-only JSON to the file named by
    ``_CONFLUENCE_URL_MAP_FILENAME`` inside ``dest_path``. An empty mapping
    writes nothing.
    """
    if url_map:
        serialized = json.dumps(url_map, ensure_ascii=True, indent=2)
        target = dest_path / _CONFLUENCE_URL_MAP_FILENAME
        target.write_text(serialized, encoding="utf-8")
| def _build_base_url(url: str, branch: str = "") -> str: | |
| if "gitlab.com" in url: | |
| return f"{url}/-/blob/{branch}" if branch else url | |
| elif "github.com" in url: | |
| return f"{url}/blob/{branch}" if branch else url | |
| else: | |
| return url # Fallback to original URL if we don't recognize the host | |
def _get_default_branch(url: str) -> str:
    """Ask the remote which branch its ``HEAD`` points to.

    Runs ``git ls-remote --symref`` against ``url`` and parses the symbolic
    ref line, which looks like ``ref: refs/heads/main<TAB>HEAD``.

    Args:
        url: Repository URL understood by git.

    Returns:
        The branch name without the ``refs/heads/`` prefix.

    Raises:
        RuntimeError: If git exits non-zero (the message is git's stderr) or
            no symbolic ref under ``refs/heads/`` is reported.
    """
    completed = subprocess.run(
        # "--" ends option parsing so a URL beginning with "-" cannot be
        # interpreted as a git option.
        ["git", "ls-remote", "--symref", "--", url, "HEAD"],
        capture_output=True,
        text=True,
    )
    if completed.returncode != 0:
        raise RuntimeError(completed.stderr.strip())

    heads_prefix = "refs/heads/"
    for line in completed.stdout.splitlines():
        if not line.startswith("ref:"):
            continue
        fields = line.split()
        # fields: ["ref:", "<target ref>", "HEAD"]; skip targets that are not
        # branches instead of failing with an IndexError.
        if len(fields) >= 2 and fields[1].startswith(heads_prefix):
            return fields[1][len(heads_prefix):]
    raise RuntimeError("Could not determine default branch.")
| def _infer_git_repo_name(url: str) -> str: | |
| if "://" in url: | |
| path = urlparse(url).path | |
| else: | |
| # Support scp-style URLs: git@host:org/repo.git | |
| if ":" in url: | |
| path = url.split(":", 1)[1] | |
| else: | |
| path = url | |
| path = path.rstrip("/") | |
| name = Path(path).name | |
| if name.endswith(".git"): | |
| name = name[:-4] | |
| if not name: | |
| raise ValueError("Could not determine repository name from URL.") | |
| return name | |
def download_git_repo(url: str, dest_root: Path) -> DownloadResult:
    """Shallow-clone a git repository into ``dest_root``.

    The clone lands in ``dest_root / <repository name>``, where the name is
    inferred from ``url``. After cloning, the remote's default branch is
    looked up and used to build the base URL for browsing files.

    Args:
        url: Repository URL understood by git.
        dest_root: Directory under which the clone directory is created.

    Returns:
        A ``DownloadResult`` with the clone path, repository name and base URL.

    Raises:
        RuntimeError: If git is not installed or ``git clone`` fails; also
            propagates errors from the default-branch lookup.
        ValueError: If no repository name can be inferred from ``url``.
    """
    if shutil.which("git") is None:
        raise RuntimeError("git is not available on this system.")

    repo_name = _infer_git_repo_name(url)
    dest_path = dest_root / repo_name
    clone = subprocess.run(
        # "--" ends option parsing so a URL beginning with "-" cannot be
        # interpreted as a git option.
        ["git", "clone", "--depth", "1", "--", url, str(dest_path)],
        capture_output=True,
        text=True,
    )
    if clone.returncode != 0:
        stderr = (clone.stderr or clone.stdout or "").strip()
        raise RuntimeError(f"git clone failed: {stderr}")

    # Compute once and reuse for both the log line and the result.
    base_url = _build_base_url(url, _get_default_branch(url))
    print("Baseurl: " + base_url)
    return DownloadResult(
        local_path=dest_path,
        repository_name=repo_name,
        base_url=base_url,
    )
| def _parse_confluence_base_and_page(url: str) -> tuple[str, str]: | |
| parsed = urlparse(url) | |
| if not parsed.scheme or not parsed.netloc: | |
| raise ValueError("Invalid Confluence URL.") | |
| page_id = "" | |
| query_params = parsed.query.split("&") if parsed.query else [] | |
| for entry in query_params: | |
| if entry.startswith("pageId="): | |
| page_id = entry.split("=", 1)[1] | |
| break | |
| path = parsed.path or "" | |
| if not page_id: | |
| match = re.search(r"/pages/(\d+)", path) | |
| if match: | |
| page_id = match.group(1) | |
| base_path = "/wiki" if "/wiki/" in path or path.startswith("/wiki") else "" | |
| base_url = f"{parsed.scheme}://{parsed.netloc}{base_path}" | |
| if page_id: | |
| return base_url, page_id | |
| space_key = "" | |
| match = re.search(r"/spaces/([^/]+)", path) | |
| if not match: | |
| match = re.search(r"/display/([^/]+)", path) | |
| if match: | |
| space_key = match.group(1) | |
| if space_key: | |
| homepage_id = _fetch_confluence_homepage_id(base_url, space_key) | |
| if homepage_id: | |
| return base_url, homepage_id | |
| raise ValueError( | |
| "Could not find a Confluence page ID in the URL. " | |
| "Use a link with '/pages/<PAGEID>' or '?pageId=<PAGEID>', " | |
| "or a space URL like '/spaces/<SPACEKEY>' or '/display/<SPACEKEY>'." | |
| ) | |
def _fetch_confluence_page(
    base_url: str,
    page_id: str,
) -> dict:
    """Fetch one page, requesting its title and storage-format body.

    Calls ``<base_url>/rest/api/content/<page_id>`` with the
    ``body.storage,title`` expansion and returns the decoded payload.
    """
    endpoint = f"{base_url}/rest/api/content/{page_id}"
    expansions = {"expand": "body.storage,title"}
    return _confluence_request(endpoint, params=expansions)
def _fetch_confluence_child_pages(
    base_url: str,
    page_id: str,
    limit: int,
) -> list[dict]:
    """Collect all direct child pages of a page, following pagination.

    Requests ``<base_url>/rest/api/content/<page_id>/child/page`` in batches
    of ``limit``. Paging stops on an empty batch, or when the reply has no
    ``next`` link and the batch is shorter than ``limit``.
    """
    endpoint = f"{base_url}/rest/api/content/{page_id}/child/page"
    collected: list[dict] = []
    offset = 0
    while True:
        payload = _confluence_request(
            endpoint,
            params={"limit": limit, "start": offset, "expand": "body.storage,title"},
        )
        batch = payload.get("results", [])
        if not batch:
            break
        collected.extend(batch)
        has_next = bool(payload.get("_links", {}).get("next"))
        if not has_next and len(batch) < limit:
            # Short final batch without a continuation link: nothing more to read.
            break
        offset += len(batch)
    return collected
def download_confluence_space(
    url: str,
    dest_root: Path,
    limit: int = 50,
) -> DownloadResult:
    """Download a Confluence page and all of its descendants as HTML files.

    Starting from the page identified by ``url``, the page tree is walked
    breadth-first. Each page is written to
    ``dest_root / confluence_<root id> / <sanitized title>_<page id>.html``
    as an ``<h1>`` title followed by the page's storage-format body. A JSON
    map from file name to page URL is written alongside the pages.

    Args:
        url: Link to a Confluence page or space.
        dest_root: Directory under which the download folder is created.
        limit: Page size used when listing child pages.

    Returns:
        A ``DownloadResult`` whose ``base_url`` is the root page's URL.

    Raises:
        ValueError: If no page id can be derived from ``url``.
        RuntimeError: If a Confluence request fails.
    """
    # Function-scope imports keep this block self-contained.
    from collections import deque
    from html import escape

    base_url, root_page_id = _parse_confluence_base_and_page(url)
    folder_name = _sanitize_filename(f"confluence_{root_page_id}")
    dest_path = dest_root / folder_name
    dest_path.mkdir(parents=True, exist_ok=True)

    # Keep "<title>_<id>.html" well below the common 255-byte file-name limit.
    max_title_chars = 150

    url_map: dict[str, str] = {}
    root_page_url = ""
    pending = deque([root_page_id])
    seen: set[str] = set()
    while pending:
        page_id = pending.popleft()
        if page_id in seen:
            continue
        seen.add(page_id)

        page = _fetch_confluence_page(base_url, page_id)
        title = str(page.get("title") or "untitled")
        body = page.get("body", {}).get("storage", {}).get("value", "")
        # The title is plain text and must be escaped; the body is already markup.
        document = f"<h1>{escape(title, quote=False)}</h1>\n{body}"
        filename = f"{_sanitize_filename(title)[:max_title_chars]}_{page_id}.html"
        (dest_path / filename).write_text(document, encoding="utf-8")

        page_url = _resolve_confluence_page_url(base_url, page)
        url_map[Path(filename).as_posix()] = page_url
        if page_id == root_page_id:
            root_page_url = page_url

        for child in _fetch_confluence_child_pages(base_url, page_id, limit):
            child_id = str(child.get("id") or "")
            if child_id:
                pending.append(child_id)

    # The root page is always written (or an exception has propagated), so no
    # "nothing downloaded" check is needed here.
    _write_confluence_url_map(dest_path, url_map)
    return DownloadResult(
        local_path=dest_path,
        repository_name=folder_name,
        base_url=root_page_url or f"{base_url.rstrip('/')}/pages/{root_page_id}",
    )
def download_source(url: str, source_type: str, dest_root: Path) -> DownloadResult:
    """Download ``url`` with the downloader matching ``source_type``.

    ``source_type`` is compared case-insensitively, ignoring surrounding
    whitespace. Accepted values are ``git`` / ``git repository`` /
    ``git repo`` and ``confluence`` / ``confluence space``.

    Raises:
        ValueError: If ``source_type`` is not one of the accepted values.
    """
    git_aliases = frozenset({"git", "git repository", "git repo"})
    confluence_aliases = frozenset({"confluence", "confluence space"})

    kind = source_type.strip().lower()
    if kind not in git_aliases | confluence_aliases:
        raise ValueError(f"Unsupported source type: {source_type}")
    if kind in git_aliases:
        return download_git_repo(url, dest_root)
    return download_confluence_space(url, dest_root)
if __name__ == "__main__":
    # Manual smoke test: download one Confluence space into a throw-away
    # directory, then print the result and the resulting file tree.
    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch_root = Path(scratch_dir)
        outcome = download_source(
            "https://fortissgmbh.atlassian.net/wiki/spaces/MA/overview?homepageId=486113542",
            "confluence",
            scratch_root,
        )
        # for debugging print filetree
        print(outcome)
        for entry in Path(scratch_dir).rglob("*"):
            print(entry)
| # url = "https://fortissgmbh.atlassian.net/wiki/api/v2/pages/620199938" | |
| # auth = HTTPBasicAuth("amougou@fortiss.org", os.getenv("CONFLUENCE_READ_2", "").strip()) | |
| # headers = { | |
| # "Accept": "application/json" | |
| # } | |
| # response = requests.request( | |
| # "GET", | |
| # url, | |
| # headers=headers, | |
| # auth=auth | |
| # ) | |
| # print(response.text) | |