File size: 4,770 Bytes
9d8a0cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""
Confluence agent runthrough.

With credentials (CONFLUENCE_BASE_URL + CONFLUENCE_TOKEN + CONFLUENCE_EMAIL set):
  Fetches the first page from CONFLUENCE_SPACES, runs the full pipeline,
  and verifies the vectors in Qdrant.

Without credentials (mock mode):
  Builds a synthetic RawDocument with realistic Confluence Storage HTML,
  runs chunker → PII mask → embed → Qdrant, and verifies via Qdrant scroll.

Run:
  python -m src.confluence_agent.test_run
"""
from __future__ import annotations

import asyncio
import hashlib
import logging
import sys

# Configure the root logger at import time so INFO-and-above records from this
# script are written to stdout.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logger = logging.getLogger("confluence_test_run")

# Page body used for the synthetic document in mock mode. It contains headings,
# a paragraph with <br/> line breaks and inline <code>, and a table. It also
# embeds an email address, a person's name and a phone number -- presumably so
# the PII-masking step has something to mask (confirm against the masker).
_SAMPLE_HTML = """
<h1>Authentication Overview</h1>
<p>Our SSO system uses SAML 2.0. Contact the security team at security@example.com for access.</p>
<h2>Setup Instructions</h2>
<p>
  1. Generate a keypair using <code>openssl</code>.<br/>
  2. Submit the public certificate to the IdP admin (Alice Smith, phone: 555-987-6543).<br/>
  3. Configure the SP metadata URL.
</p>
<h2>Troubleshooting</h2>
<p>If login fails, check the SAML assertion expiry. Default TTL is 5 minutes.</p>
<table>
  <tr><th>Error Code</th><th>Cause</th><th>Fix</th></tr>
  <tr><td>401</td><td>Expired assertion</td><td>Sync clocks with NTP</td></tr>
  <tr><td>403</td><td>Missing attribute</td><td>Add email to IdP release policy</td></tr>
</table>
"""


def _mock_raw_document():
    """Build the synthetic ``RawDocument`` used when running in mock mode.

    The document wraps ``_SAMPLE_HTML`` and carries fixed Confluence-style
    metadata (page id, space key, ancestors, version). Its ``doc_id`` is the
    SHA-256 hex digest of a fixed byte string, so repeated runs always
    produce the same id.
    """
    # Imported lazily so importing this module does not require the
    # ingestion package to be importable.
    from ingestion.models import RawDocument

    stable_id = hashlib.sha256(b"confluence:123456").hexdigest()
    page_metadata = {
        "page_id": "123456",
        "space_key": "ENG",
        "ancestors": ["Engineering", "Security"],
        "version": 3,
    }
    document_fields = dict(
        doc_id=stable_id,
        title="Authentication Overview",
        content=_SAMPLE_HTML,
        source_url="http://mock-confluence/wiki/spaces/ENG/pages/123456",
        source_type="confluence",
        team_id="test_team",
        metadata=page_metadata,
    )
    return RawDocument(**document_fields)


async def _run_mock() -> None:
    """Run the pipeline on a synthetic page and verify the result in Qdrant.

    Steps, in order: ensure the collection exists, chunk the mock document,
    mask the chunk texts, embed the chunks, delete any previously stored
    chunks for the same ``doc_id``, upsert the new ones, then scroll Qdrant
    filtered on ``doc_id`` and log what was found.

    Raises:
        RuntimeError: If the masker returns a different number of texts than
            there are chunks (pairing them up would otherwise silently leave
            trailing chunks unmasked), or if the verification scroll finds
            no points for the document.
    """
    from ingestion.pipeline.embedder import embed_chunks
    from ingestion.pipeline.pii_masker import mask_chunks
    from ingestion.storage.qdrant_store import (
        delete_chunks_for_doc,
        ensure_collection_exists,
        upsert_chunks,
    )
    from qdrant_client import QdrantClient
    from qdrant_client.http import models as qmodels
    from src.confluence_agent.chunker import chunk_confluence_page
    from ingestion.config import settings

    logger.info("=== Confluence agent test — MOCK MODE ===")
    ensure_collection_exists()

    raw_doc = _mock_raw_document()
    chunks = chunk_confluence_page(raw_doc)
    logger.info("Produced %d chunks", len(chunks))
    for c in chunks:
        heading = c.metadata.get("section_heading", "")
        logger.info("  chunk[%d] heading=%r len=%d", c.chunk_index, heading, len(c.text))

    masked = mask_chunks([c.text for c in chunks])
    # zip() stops at the shorter input, so a short result from the masker
    # would leave the remaining chunks with their original, unmasked text.
    if len(masked) != len(chunks):
        raise RuntimeError(
            f"PII masker returned {len(masked)} texts for {len(chunks)} chunks"
        )
    for chunk, masked_text in zip(chunks, masked):
        chunk.text = masked_text

    embedded = embed_chunks(chunks)
    # Remove points from earlier runs first so the verification below only
    # sees what this run stored.
    delete_chunks_for_doc(raw_doc.doc_id)
    upsert_chunks(embedded)
    logger.info("Upserted %d embedded chunks", len(embedded))

    client = QdrantClient(host=settings.qdrant_host, port=settings.qdrant_port)
    # Ask for at least as many points as were upserted so the count check is
    # not capped by a fixed page size.
    scroll_limit = max(len(embedded), 20)
    results, _ = client.scroll(
        collection_name=settings.qdrant_collection,
        scroll_filter=qmodels.Filter(
            must=[qmodels.FieldCondition(key="doc_id", match=qmodels.MatchValue(value=raw_doc.doc_id))]
        ),
        limit=scroll_limit,
        with_payload=True,
        with_vectors=False,
    )
    logger.info(
        "Verification: found %d points in Qdrant for doc_id=%s",
        len(results),
        raw_doc.doc_id[:12],
    )
    for pt in results:
        payload = pt.payload or {}
        logger.info("  point %s | heading=%s", str(pt.id)[:8], payload.get("section_heading", ""))

    # Previously "PASSED" was logged unconditionally; an empty scroll means
    # nothing was actually verified.
    if not results:
        raise RuntimeError(
            f"Verification failed: no points found in Qdrant for doc_id={raw_doc.doc_id}"
        )
    if len(results) != len(embedded):
        logger.warning(
            "Verification: expected %d points but found %d",
            len(embedded),
            len(results),
        )
    logger.info("=== Confluence mock test PASSED ===")


async def _run_real() -> None:
    """Ingest the first configured Confluence space via the real pipeline.

    Reads ``confluence_config.space_list``. When it is empty, logs a warning
    and returns without ingesting anything. Otherwise awaits ``ingest_space``
    for the first space key and logs the count it returns.
    """
    from src.confluence_agent.config import confluence_config
    from src.confluence_agent.pipeline import ingest_space

    spaces = confluence_config.space_list
    if not spaces:
        logger.warning("No CONFLUENCE_SPACES configured — cannot run real test")
        return

    # Only the first configured space is exercised to keep the run short.
    space_key = spaces[0]
    logger.info("=== Confluence agent test — REAL MODE (space: %s) ===", space_key)
    count = await ingest_space(space_key)
    logger.info("Stored %d chunks for space %s", count, space_key)
    logger.info("=== Confluence real test DONE ===")


async def main() -> None:
    """Pick real or mock mode from the Confluence config and run it.

    Real mode is chosen when both the base URL and the token on
    ``confluence_config`` are truthy; otherwise the mock run is used.
    """
    from src.confluence_agent.config import confluence_config

    # NOTE(review): the module docstring also lists CONFLUENCE_EMAIL as
    # required for real mode, but only base URL and token are checked here --
    # confirm which is intended.
    if confluence_config.confluence_base_url and confluence_config.confluence_token:
        selected_run = _run_real
    else:
        selected_run = _run_mock
    await selected_run()


# Script entry point: start an event loop and run main() to completion when
# the module is executed directly (e.g. via ``python -m``).
if __name__ == "__main__":
    asyncio.run(main())