Spaces:
Sleeping
Sleeping
File size: 4,770 Bytes
9d8a0cf | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | """
Confluence agent runthrough.
With credentials (CONFLUENCE_BASE_URL + CONFLUENCE_TOKEN + CONFLUENCE_EMAIL set):
Fetches the first page from CONFLUENCE_SPACES, runs the full pipeline,
and verifies the vectors in Qdrant.
Without credentials (mock mode):
Builds a synthetic RawDocument with realistic Confluence Storage HTML,
runs chunker → PII mask → embed → Qdrant, and verifies via Qdrant scroll.
Run:
python -m src.confluence_agent.test_run
"""
from __future__ import annotations
import asyncio
import hashlib
import logging
import sys
# Configure root logging at import time: INFO level, written to stdout
# instead of the default stderr stream.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
# Module-wide logger; uses a fixed name rather than __name__.
logger = logging.getLogger("confluence_test_run")
# Synthetic HTML page body used as the mock document's content. It has one
# <h1>, two <h2> sections and a table, and embeds an email address, a person
# name and a phone number -- presumably so the PII-masking step has something
# to act on (confirm against the masker's rules).
_SAMPLE_HTML = """
<h1>Authentication Overview</h1>
<p>Our SSO system uses SAML 2.0. Contact the security team at security@example.com for access.</p>
<h2>Setup Instructions</h2>
<p>
1. Generate a keypair using <code>openssl</code>.<br/>
2. Submit the public certificate to the IdP admin (Alice Smith, phone: 555-987-6543).<br/>
3. Configure the SP metadata URL.
</p>
<h2>Troubleshooting</h2>
<p>If login fails, check the SAML assertion expiry. Default TTL is 5 minutes.</p>
<table>
<tr><th>Error Code</th><th>Cause</th><th>Fix</th></tr>
<tr><td>401</td><td>Expired assertion</td><td>Sync clocks with NTP</td></tr>
<tr><td>403</td><td>Missing attribute</td><td>Add email to IdP release policy</td></tr>
</table>
"""
def _mock_raw_document():
    """Build the synthetic ``RawDocument`` used when running in mock mode.

    The document wraps ``_SAMPLE_HTML`` and carries fixed Confluence-style
    metadata. Its ``doc_id`` is the SHA-256 hex digest of the literal
    ``b"confluence:123456"``, so it is identical on every run.

    Returns:
        A ``RawDocument`` instance describing the fake page.
    """
    # Imported lazily so importing this module does not pull in the
    # ingestion package.
    from ingestion.models import RawDocument

    stable_id = hashlib.sha256(b"confluence:123456").hexdigest()
    page_metadata = {
        "page_id": "123456",
        "space_key": "ENG",
        "ancestors": ["Engineering", "Security"],
        "version": 3,
    }
    fields = dict(
        doc_id=stable_id,
        title="Authentication Overview",
        content=_SAMPLE_HTML,
        source_url="http://mock-confluence/wiki/spaces/ENG/pages/123456",
        source_type="confluence",
        team_id="test_team",
        metadata=page_metadata,
    )
    return RawDocument(**fields)
async def _run_mock() -> None:
    """Run the pipeline on a synthetic page and verify the result in Qdrant.

    Steps, in order: ensure the collection exists, chunk the mock document,
    mask the chunk texts, embed the chunks, delete any points previously
    stored for the document, upsert the new ones, then scroll Qdrant for
    points whose ``doc_id`` payload matches the mock document.

    Raises:
        RuntimeError: if the masker returns a different number of texts than
            it was given, or if the verification scroll finds no points or
            fewer points than were upserted (capped at the scroll page size).
    """
    from ingestion.pipeline.embedder import embed_chunks
    from ingestion.pipeline.pii_masker import mask_chunks
    from ingestion.storage.qdrant_store import (
        delete_chunks_for_doc,
        ensure_collection_exists,
        upsert_chunks,
    )
    from qdrant_client import QdrantClient
    from qdrant_client.http import models as qmodels
    from src.confluence_agent.chunker import chunk_confluence_page
    from ingestion.config import settings

    logger.info("=== Confluence agent test β MOCK MODE ===")
    ensure_collection_exists()

    raw_doc = _mock_raw_document()
    chunks = chunk_confluence_page(raw_doc)
    logger.info("Produced %d chunks", len(chunks))
    for c in chunks:
        heading = c.metadata.get("section_heading", "")
        logger.info(" chunk[%d] heading=%r len=%d", c.chunk_index, heading, len(c.text))

    masked = mask_chunks([c.text for c in chunks])
    # zip() would silently stop at the shorter input, leaving the remaining
    # chunks with their original (unmasked) text; refuse to continue instead.
    if len(masked) != len(chunks):
        raise RuntimeError(
            f"PII masker returned {len(masked)} texts for {len(chunks)} chunks"
        )
    for chunk, masked_text in zip(chunks, masked):
        chunk.text = masked_text

    embedded = embed_chunks(chunks)
    # Remove earlier points for this doc_id before writing the new ones.
    delete_chunks_for_doc(raw_doc.doc_id)
    upsert_chunks(embedded)
    logger.info("Upserted %d embedded chunks", len(embedded))

    scroll_limit = 20
    client = QdrantClient(host=settings.qdrant_host, port=settings.qdrant_port)
    results, _ = client.scroll(
        collection_name=settings.qdrant_collection,
        scroll_filter=qmodels.Filter(
            must=[
                qmodels.FieldCondition(
                    key="doc_id",
                    match=qmodels.MatchValue(value=raw_doc.doc_id),
                )
            ]
        ),
        limit=scroll_limit,
        with_payload=True,
        with_vectors=False,
    )
    logger.info(
        "Verification: found %d points in Qdrant for doc_id=%s",
        len(results),
        raw_doc.doc_id[:12],
    )
    for pt in results:
        logger.info(" point %s | heading=%s", str(pt.id)[:8], pt.payload.get("section_heading", ""))

    # Only one scroll page is fetched, so at most scroll_limit points can be
    # seen; compare against that cap rather than the raw upsert count.
    # NOTE(review): assumes upsert_chunks waits for the write to be applied
    # before returning -- confirm against qdrant_store.
    expected = min(len(embedded), scroll_limit)
    if not results or len(results) < expected:
        raise RuntimeError(
            f"Verification failed: found {len(results)} points in Qdrant, "
            f"expected at least {max(expected, 1)}"
        )
    logger.info("=== Confluence mock test PASSED ===")
async def _run_real() -> None:
    """Ingest the first configured Confluence space via the real pipeline.

    Reads ``confluence_config.space_list``; when it is empty a warning is
    logged and nothing is ingested. Otherwise the first entry is passed to
    ``ingest_space`` and the value it returns is logged as the stored chunk
    count.
    """
    from src.confluence_agent.config import confluence_config
    from src.confluence_agent.pipeline import ingest_space

    configured_spaces = confluence_config.space_list
    if configured_spaces:
        target_space = configured_spaces[0]
        logger.info("=== Confluence agent test β REAL MODE (space: %s) ===", target_space)
        stored = await ingest_space(target_space)
        logger.info("Stored %d chunks for space %s", stored, target_space)
        logger.info("=== Confluence real test DONE ===")
    else:
        logger.warning("No CONFLUENCE_SPACES configured β cannot run real test")
async def main() -> None:
    """Dispatch to the real or the mock runthrough based on configuration.

    Real mode is chosen when both ``confluence_base_url`` and
    ``confluence_token`` on ``confluence_config`` are truthy; otherwise the
    mock runthrough is used.
    """
    from src.confluence_agent.config import confluence_config

    # NOTE(review): the module docstring also lists CONFLUENCE_EMAIL as a
    # required credential, but it is not checked here -- confirm intent.
    credentials_present = (
        confluence_config.confluence_base_url and confluence_config.confluence_token
    )
    runner = _run_real if credentials_present else _run_mock
    await runner()
# Script entry point: run the async main() to completion when this module is
# executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(main())
|