"""Wiki compiler — uses Claude to integrate raw sources into structured wiki articles."""

import json
import datetime
import anthropic

COMPILE_SYSTEM_PROMPT = """You are a clinical knowledge wiki curator for the Nursing Citizen Development Organisation.
Your job is to integrate new source material into an existing nursing knowledge base (wiki).

The wiki is a collection of markdown articles organised by category. Each article has:
- A title, category, tags, and backlinks to other articles
- Substantive clinical content aligned with NMC Standards of Proficiency (2018), UK law, and NHS frameworks

When given new source material, you must:
1. Identify key nursing concepts, frameworks, guidelines, or clinical information in the source
2. Decide which existing articles should be UPDATED with new information
3. Identify any new articles that should be CREATED for concepts not yet covered
4. Integrate the information accurately and clinically appropriately
5. Add/update backlinks between related articles
6. Always cite the source in any updated/created articles

Return a JSON object with this structure:
{
  "summary": "Brief summary of what was integrated and why",
  "articles_updated": [
    {
      "slug": "article_slug",
      "title": "Article Title",
      "category": "category_name",
      "tags": ["tag1", "tag2"],
      "content": "Full markdown content of the updated article"
    }
  ],
  "articles_created": [
    {
      "slug": "new_slug",
      "title": "New Article Title",
      "category": "category_name",
      "tags": ["tag1", "tag2"],
      "content": "Full markdown content of the new article"
    }
  ],
  "index_updates": "Updated one-line entries for the index (markdown format)",
  "log_entry": "Log entry text for this compilation"
}

Categories to use: standards, clinical, pharmacology, evidence, frameworks, safety, law, mental_health, research, ethics

Clinical content must:
- Be accurate and evidence-based
- Include NMC proficiency mappings where relevant
- Include UK-specific references (NICE, NMC, NHS, BNF)
- Include the disclaimer: "This tool supports but does not replace clinical judgment."
- Use UK spellings (organisation, anaesthesia, etc.)
"""


CHUNK_SIZE = 7000  # chars per chunk for large documents


def _chunk_text(text: str, chunk_size: int = CHUNK_SIZE) -> list[str]:
    """Split text into chunks at paragraph boundaries.

    A single paragraph longer than chunk_size is never split mid-paragraph,
    so an individual chunk can exceed the target size in that case.
    """
    if len(text) <= chunk_size:
        return [text]
    chunks: list[str] = []
    paragraphs = text.split("\n\n")
    current: list[str] = []
    current_len = 0
    for para in paragraphs:
        # Include the two-character "\n\n" joiner that rejoining will add
        # before this paragraph (no joiner before the first one in a chunk).
        para_len = len(para) + (2 if current else 0)
        if current_len + para_len > chunk_size and current:
            chunks.append("\n\n".join(current))
            current = [para]
            current_len = len(para)
        else:
            current.append(para)
            current_len += para_len
    if current:
        chunks.append("\n\n".join(current))
    return chunks
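
# Quick sanity check of the chunker (illustrative values only, not part of
# the pipeline): three 4-character paragraphs with chunk_size=10 split at a
# paragraph boundary into two chunks:
#
#   >>> _chunk_text("aaaa\n\nbbbb\n\ncccc", chunk_size=10)
#   ['aaaa\n\nbbbb', 'cccc']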


def compile_source(client: anthropic.Anthropic, source_title: str, source_content: str,
                   existing_index: str, existing_articles: dict, model: str = "claude-sonnet-4-6") -> dict:
    """
    Integrate a new source into the wiki.

    Large documents are automatically split into chunks and compiled sequentially,
    with the wiki state updated between chunks so each pass builds on the last.

    Returns a merged dict with all updated/created articles and metadata.
    """
    chunks = _chunk_text(source_content)
    total_chunks = len(chunks)

    # Results accumulated across all chunks; deduplicated by slug at the end.
    merged: dict = {
        "articles_updated": [],
        "articles_created": [],
        "summary": "",
        "index_updates": "",
        "log_entry": "",
    }

    for chunk_num, chunk in enumerate(chunks, 1):
        chunk_label = f"{source_title} (part {chunk_num}/{total_chunks})" if total_chunks > 1 else source_title

        # Build context from current article state (updates between chunks)
        articles_context = ""
        if existing_articles:
            for slug, art in list(existing_articles.items())[:8]:
                preview = art["content"][:400].replace("\n", " ")
                articles_context += f"\n- **{art['title']}** ({art['category']}): {preview}...\n"

        user_prompt = f"""## Existing Wiki Index
{existing_index}

## Sample of Existing Articles (previews)
{articles_context}

## New Source to Integrate
**Title**: {chunk_label}
{"**(Large document — this is chunk " + str(chunk_num) + " of " + str(total_chunks) + ")**" if total_chunks > 1 else ""}

**Content**:
{chunk}

Please integrate this source into the wiki. Return valid JSON only, no markdown code fences."""

        response = client.messages.create(
            model=model,
            max_tokens=4096,
            system=COMPILE_SYSTEM_PROMPT,
            messages=[{"role": "user", "content": user_prompt}],
        )

        raw = response.content[0].text.strip()
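        # Defensive: the prompt asks for bare JSON, but strip markdown code
        # fences in case the model wraps its output anyway.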
        if raw.startswith("```"):
            raw = raw.split("\n", 1)[1]
            if raw.endswith("```"):
                raw = raw.rsplit("```", 1)[0]

        result = json.loads(raw)

        # Merge chunk results
        today = datetime.date.today().isoformat()
        for art in result.get("articles_updated", []) + result.get("articles_created", []):
            art["last_updated"] = today
            # Carry forward source history from any previous version of the
            # article, without duplicating this title across chunks.
            prior_sources = existing_articles.get(art["slug"], {}).get("sources", [])
            art["sources"] = prior_sources if source_title in prior_sources else prior_sources + [source_title]
            # Apply to existing_articles so the next chunk sees current state
            existing_articles[art["slug"]] = art

        merged["articles_updated"].extend(result.get("articles_updated", []))
        merged["articles_created"].extend(result.get("articles_created", []))
        if result.get("summary"):
            merged["summary"] += f"[Part {chunk_num}] {result['summary']} "
        if result.get("log_entry"):
            merged["log_entry"] = result["log_entry"]

    # Deduplicate by slug (keep last version). existing_articles holds the
    # latest version of every touched article, since it is refreshed after
    # each chunk. A slug created in any chunk is reported as created.
    created_slugs = {art["slug"] for art in merged["articles_created"]}
    updated_slugs = {art["slug"] for art in merged["articles_updated"]} - created_slugs
    merged["articles_created"] = [existing_articles[s] for s in created_slugs]
    merged["articles_updated"] = [existing_articles[s] for s in updated_slugs]

    return merged


def rebuild_index(client: anthropic.Anthropic, articles: dict, model: str = "claude-sonnet-4-6") -> str:
    """Regenerate the wiki index from all articles."""
    article_list = []
    for art in articles.values():
        article_list.append(f"- **{art['title']}** ({art['category']}): {', '.join(art.get('tags', []))}")

    prompt = f"""Regenerate a well-organised wiki index for these nursing knowledge articles.
Group them by category. Each entry should be a one-line summary.
Format as markdown with category headers (##).

Articles:
{chr(10).join(article_list)}

Return only the markdown index content."""

    response = client.messages.create(
        model=model,
        max_tokens=2048,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text.strip()
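

if __name__ == "__main__":
    # Minimal usage sketch, not part of the library: integrate one source
    # file into an in-memory wiki, then rebuild the index. The file path and
    # the starting wiki state below are hypothetical; the client reads
    # ANTHROPIC_API_KEY from the environment.
    client = anthropic.Anthropic()

    articles: dict = {}      # slug -> article dict (e.g. loaded from disk)
    index = "(empty index)"  # current markdown index, if any

    with open("sources/example_source.txt") as f:  # hypothetical path
        source_text = f.read()

    result = compile_source(client, "Example source", source_text, index, articles)
    print(result["summary"])

    # articles was updated in place by compile_source, so the rebuilt index
    # reflects everything that was just integrated.
    print(rebuild_index(client, articles))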