NurseCitizenDeveloper committed on
Commit
5e159ec
·
verified ·
1 Parent(s): cadfe79

Upload core/compiler.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. core/compiler.py +85 -35
core/compiler.py CHANGED
@@ -55,58 +55,108 @@ Clinical content must:
55
  """
56
 
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
def compile_source(client: anthropic.Anthropic, source_title: str, source_content: str,
                   existing_index: str, existing_articles: dict, model: str = "claude-sonnet-4-6") -> dict:
    """
    Integrate a new source into the wiki.

    Sends one request containing the current index, short previews of up to
    eight existing articles, and the first 8000 characters of the source.
    Anything in ``source_content`` beyond 8000 characters is NOT sent.

    Returns the dict parsed from the model's JSON reply. Every article listed
    under ``articles_updated`` / ``articles_created`` is stamped with today's
    date in ``last_updated`` and has ``source_title`` recorded in ``sources``.

    Raises:
        json.JSONDecodeError: the reply is not valid JSON once any surrounding
            Markdown code fence has been removed.
    """
    # Build context: a 400-character, single-line preview of the first 8 articles.
    previews = []
    if existing_articles:
        for art in list(existing_articles.values())[:8]:
            preview = art["content"][:400].replace("\n", " ")
            previews.append(f"\n- **{art['title']}** ({art['category']}): {preview}...\n")
    articles_context = "".join(previews)

    user_prompt = f"""## Existing Wiki Index
{existing_index}

## Sample of Existing Articles (previews)
{articles_context}

## New Source to Integrate
**Title**: {source_title}

**Content**:
{source_content[:8000]}

Please integrate this source into the wiki. Return valid JSON only, no markdown code fences."""

    response = client.messages.create(
        model=model,
        max_tokens=4096,
        system=COMPILE_SYSTEM_PROMPT,
        messages=[{"role": "user", "content": user_prompt}],
    )

    raw = response.content[0].text.strip()
    # Strip markdown fences if present. partition() never raises, so a reply
    # that is only a fence line ends up as "" and fails in json.loads with a
    # JSONDecodeError instead of an unrelated IndexError.
    if raw.startswith("```"):
        _, _, raw = raw.partition("\n")
    if raw.endswith("```"):
        raw = raw[:-3]

    result = json.loads(raw)

    # Add metadata
    today = datetime.date.today().isoformat()
    for art in result.get("articles_updated", []) + result.get("articles_created", []):
        art["last_updated"] = today
        sources = list(art.get("sources") or [])
        # Avoid listing the same source twice when the model already echoed it.
        if source_title not in sources:
            sources.append(source_title)
        art["sources"] = sources

    return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
 
112
  def rebuild_index(client: anthropic.Anthropic, articles: dict, model: str = "claude-sonnet-4-6") -> str:
 
55
  """
56
 
57
 
58
+ CHUNK_SIZE = 7000 # chars per chunk for large documents
59
+
60
+
61
+ def _chunk_text(text: str, chunk_size: int = CHUNK_SIZE) -> list[str]:
62
+ """Split text into chunks at paragraph boundaries."""
63
+ if len(text) <= chunk_size:
64
+ return [text]
65
+ chunks = []
66
+ paragraphs = text.split("\n\n")
67
+ current = []
68
+ current_len = 0
69
+ for para in paragraphs:
70
+ if current_len + len(para) > chunk_size and current:
71
+ chunks.append("\n\n".join(current))
72
+ current = [para]
73
+ current_len = len(para)
74
+ else:
75
+ current.append(para)
76
+ current_len += len(para)
77
+ if current:
78
+ chunks.append("\n\n".join(current))
79
+ return chunks
80
+
81
+
82
def _strip_code_fences(raw: str) -> str:
    """Remove a surrounding Markdown code fence (``` or ```json) from a model reply."""
    text = raw.strip()
    if text.startswith("```"):
        # Drop the whole opening fence line, which may carry a language tag.
        # partition() never raises, unlike split(...)[1] on a one-line reply.
        _, _, text = text.partition("\n")
    if text.endswith("```"):
        text = text[:-3]
    return text


def compile_source(client: anthropic.Anthropic, source_title: str, source_content: str,
                   existing_index: str, existing_articles: dict, model: str = "claude-sonnet-4-6") -> dict:
    """
    Integrate a new source into the wiki.

    Large documents are automatically split into chunks and compiled sequentially,
    with the wiki state updated between chunks so each pass builds on the last.

    Side effect: ``existing_articles`` is mutated in place. Every article the
    model returns is stored under its slug, so later chunks (and the caller)
    see the updated state. Only the previews of the first eight articles are
    sent with each request, and ``existing_index`` is sent unchanged.

    Returns a merged dict with these keys:
        articles_created: latest version of each returned article whose slug
            was NOT in ``existing_articles`` when this call started.
        articles_updated: latest version of each returned article whose slug
            already existed when this call started.
        summary: the per-chunk summaries joined by a space; each is prefixed
            with ``[Part N]`` only when the document needed several chunks.
        index_updates: per-chunk ``index_updates`` strings joined by newlines.
        log_entry: the last non-empty ``log_entry`` returned by any chunk.

    Raises:
        json.JSONDecodeError: a reply is not valid JSON after fence stripping.
        KeyError: a returned article has no ``slug``.
    """
    chunks = _chunk_text(source_content)
    total_chunks = len(chunks)
    today = datetime.date.today().isoformat()

    wiki_state = existing_articles if existing_articles is not None else {}
    # Snapshot taken before any chunk runs: it decides created vs. updated.
    known_slugs = set(wiki_state)

    latest: dict = {}  # slug -> most recent version, in chunk order
    summaries: list = []
    index_updates: list = []
    log_entry = ""

    for chunk_num, chunk in enumerate(chunks, 1):
        if total_chunks > 1:
            chunk_label = f"{source_title} (part {chunk_num}/{total_chunks})"
            chunk_note = f"**(Large document — this is chunk {chunk_num} of {total_chunks})**"
        else:
            chunk_label = source_title
            chunk_note = ""

        # Build context from current article state (updates between chunks)
        previews = []
        for art in list(wiki_state.values())[:8]:
            preview = art["content"][:400].replace("\n", " ")
            previews.append(f"\n- **{art['title']}** ({art['category']}): {preview}...\n")
        articles_context = "".join(previews)

        user_prompt = f"""## Existing Wiki Index
{existing_index}

## Sample of Existing Articles (previews)
{articles_context}

## New Source to Integrate
**Title**: {chunk_label}
{chunk_note}

**Content**:
{chunk}

Please integrate this source into the wiki. Return valid JSON only, no markdown code fences."""

        response = client.messages.create(
            model=model,
            max_tokens=4096,
            system=COMPILE_SYSTEM_PROMPT,
            messages=[{"role": "user", "content": user_prompt}],
        )

        result = json.loads(_strip_code_fences(response.content[0].text))

        for art in result.get("articles_updated", []) + result.get("articles_created", []):
            art["last_updated"] = today
            sources = list(art.get("sources") or [])
            # The model may echo sources from an earlier chunk; record once.
            if source_title not in sources:
                sources.append(source_title)
            art["sources"] = sources
            # Apply to the wiki state so the next chunk sees current content.
            wiki_state[art["slug"]] = art
            # Later chunks overwrite earlier ones, so the newest version wins
            # regardless of whether the model called it created or updated.
            latest[art["slug"]] = art

        if result.get("summary"):
            summary = str(result["summary"])
            summaries.append(f"[Part {chunk_num}] {summary}" if total_chunks > 1 else summary)
        # Presumably a string, matching the empty-string default of the merged
        # result; other shapes are ignored rather than guessed at.
        chunk_index_updates = result.get("index_updates")
        if isinstance(chunk_index_updates, str) and chunk_index_updates:
            index_updates.append(chunk_index_updates)
        if result.get("log_entry"):
            log_entry = result["log_entry"]

    return {
        "articles_updated": [art for slug, art in latest.items() if slug in known_slugs],
        "articles_created": [art for slug, art in latest.items() if slug not in known_slugs],
        "summary": " ".join(summaries),
        "index_updates": "\n".join(index_updates),
        "log_entry": log_entry,
    }
160
 
161
 
162
  def rebuild_index(client: anthropic.Anthropic, articles: dict, model: str = "claude-sonnet-4-6") -> str: