Spaces:

NurseCitizenDeveloper
/

nursing-knowledge-base

Sleeping

App Files Files Community

NurseCitizenDeveloper committed on Apr 4

Commit

5e159ec

verified ·

1 Parent(s): cadfe79

Upload core/compiler.py with huggingface_hub

Browse files

Files changed (1) hide show

core/compiler.py +85 -35

core/compiler.py CHANGED Viewed

@@ -55,58 +55,108 @@ Clinical content must:
 """
 def compile_source(client: anthropic.Anthropic, source_title: str, source_content: str,
                    existing_index: str, existing_articles: dict, model: str = "claude-sonnet-4-6") -> dict:
     """
     Integrate a new source into the wiki.
-    Returns a dict with updated/created articles and metadata.
     """
-    # Build context: index + up to 5 most relevant article summaries
-    articles_context = ""
-    if existing_articles:
-        # Include first 500 chars of each article as context
-        for slug, art in list(existing_articles.items())[:8]:
-            preview = art["content"][:400].replace("\n", " ")
-            articles_context += f"\n- **{art['title']}** ({art['category']}): {preview}...\n"
-    user_prompt = f"""## Existing Wiki Index
 {existing_index}
 ## Sample of Existing Articles (previews)
 {articles_context}
 ## New Source to Integrate
-**Title**: {source_title}
 **Content**:
-{source_content[:8000]}
 Please integrate this source into the wiki. Return valid JSON only, no markdown code fences."""
-    response = client.messages.create(
-        model=model,
-        max_tokens=4096,
-        system=COMPILE_SYSTEM_PROMPT,
-        messages=[{"role": "user", "content": user_prompt}],
-    )
-    raw = response.content[0].text.strip()
-    # Strip markdown fences if present
-    if raw.startswith("```"):
-        raw = raw.split("\n", 1)[1]
-        if raw.endswith("```"):
-            raw = raw.rsplit("```", 1)[0]
-    result = json.loads(raw)
-    # Add metadata
-    today = datetime.date.today().isoformat()
-    for art in result.get("articles_updated", []) + result.get("articles_created", []):
-        art["last_updated"] = today
-        art["sources"] = art.get("sources", []) + [source_title]
-    return result
 def rebuild_index(client: anthropic.Anthropic, articles: dict, model: str = "claude-sonnet-4-6") -> str:

 """
+CHUNK_SIZE = 7000  # chars per chunk for large documents
+def _chunk_text(text: str, chunk_size: int = CHUNK_SIZE) -> list[str]:
+    """Split text into chunks at paragraph boundaries."""
+    if len(text) <= chunk_size:
+        return [text]
+    chunks = []
+    paragraphs = text.split("\n\n")
+    current = []
+    current_len = 0
+    for para in paragraphs:
+        if current_len + len(para) > chunk_size and current:
+            chunks.append("\n\n".join(current))
+            current = [para]
+            current_len = len(para)
+        else:
+            current.append(para)
+            current_len += len(para)
+    if current:
+        chunks.append("\n\n".join(current))
+    return chunks
 def compile_source(client: anthropic.Anthropic, source_title: str, source_content: str,
                    existing_index: str, existing_articles: dict, model: str = "claude-sonnet-4-6") -> dict:
     """
     Integrate a new source into the wiki.
+    Large documents are automatically split into chunks and compiled sequentially,
+    with the wiki state updated between chunks so each pass builds on the last.
+    Returns a merged dict with all updated/created articles and metadata.
     """
+    chunks = _chunk_text(source_content)
+    total_chunks = len(chunks)
+    merged: dict = {"articles_updated": [], "articles_created": [], "summary": "", "index_updates": "", "log_entry": ""}
+    for chunk_num, chunk in enumerate(chunks, 1):
+        chunk_label = f"{source_title} (part {chunk_num}/{total_chunks})" if total_chunks > 1 else source_title
+        # Build context from current article state (updates between chunks)
+        articles_context = ""
+        if existing_articles:
+            for slug, art in list(existing_articles.items())[:8]:
+                preview = art["content"][:400].replace("\n", " ")
+                articles_context += f"\n- **{art['title']}** ({art['category']}): {preview}...\n"
+        user_prompt = f"""## Existing Wiki Index
 {existing_index}
 ## Sample of Existing Articles (previews)
 {articles_context}
 ## New Source to Integrate
+**Title**: {chunk_label}
+{"**(Large document — this is chunk " + str(chunk_num) + " of " + str(total_chunks) + ")**" if total_chunks > 1 else ""}
 **Content**:
+{chunk}
 Please integrate this source into the wiki. Return valid JSON only, no markdown code fences."""
+        response = client.messages.create(
+            model=model,
+            max_tokens=4096,
+            system=COMPILE_SYSTEM_PROMPT,
+            messages=[{"role": "user", "content": user_prompt}],
+        )
+        raw = response.content[0].text.strip()
+        if raw.startswith("```"):
+            raw = raw.split("\n", 1)[1]
+            if raw.endswith("```"):
+                raw = raw.rsplit("```", 1)[0]
+        result = json.loads(raw)
+        # Merge chunk results
+        today = datetime.date.today().isoformat()
+        for art in result.get("articles_updated", []) + result.get("articles_created", []):
+            art["last_updated"] = today
+            art["sources"] = art.get("sources", []) + [source_title]
+            # Apply to existing_articles so next chunk sees current state
+            existing_articles[art["slug"]] = art
+        merged["articles_updated"].extend(result.get("articles_updated", []))
+        merged["articles_created"].extend(result.get("articles_created", []))
+        if result.get("summary"):
+            merged["summary"] += f"[Part {chunk_num}] {result['summary']} "
+        if result.get("log_entry"):
+            merged["log_entry"] = result["log_entry"]
+    # Deduplicate by slug (keep last version)
+    seen: dict = {}
+    for art in merged["articles_updated"] + merged["articles_created"]:
+        seen[art["slug"]] = art
+    merged["articles_updated"] = list(seen.values())
+    merged["articles_created"] = []
+    return merged
 def rebuild_index(client: anthropic.Anthropic, articles: dict, model: str = "claude-sonnet-4-6") -> str: