Update backend/prompt_builder.py

#6
Files changed (1)
  1. backend/prompt_builder.py +18 -9
backend/prompt_builder.py CHANGED
@@ -3,6 +3,10 @@
3
  Supports up to 2 multimodal attachments (images or PDFs). For PDFs we extract
4
  text inline (since the vision projector handles images only). If vision is
5
  unavailable we degrade gracefully to text-only.
 
 
 
 
6
  """
7
  import base64
8
  import io
@@ -14,8 +18,11 @@ from PIL import Image
14
  from .model_loader import MMPROJ_PATH
15
 
16
  SYSTEM_PROMPT = """You are Elysium — a persistent agentic civilization.
17
- You ALWAYS respond with a single valid JSON object exactly matching the
18
- ElysiumResponse schema v1.0.0. No preamble. No markdown fences. JSON only.
 
 
 
19
 
20
  Decide complexity dynamically:
21
  - SIMPLE_REPLY: trivial Q — no agents (council_deliberation.agent_outputs = [])
@@ -26,11 +33,14 @@ Decide complexity dynamically:
26
  - SPECIATION_EVENT: only on unresolved cross-domain tension
27
  - Always populate ui_directives (camera_focus_node_id, pulses, threads)
28
  - All node_id and edge_id values must be unique strings
29
- - Always include 'direct_answer' — a short human-readable answer to surface in toasts.
 
 
30
 
31
  When the user attaches images or PDFs, analyze them, populate
32
  multimodal_perception fields (ocr_extracted_text, image_scene_description,
33
- document_type, visual_entities_detected), and reference them in your reasoning.
 
34
  """
35
 
36
 
@@ -65,11 +75,9 @@ def build_messages(user_text: str,
65
  ctx = f"\n\n[Hypergraph context]\n{hg_context}" if hg_context else ""
66
  attachments = attachments or []
67
 
68
- # Gather image and pdf attachments separately
69
  image_atts = [a for a in attachments if a["kind"] == "image" and a.get("image") is not None]
70
  pdf_atts = [a for a in attachments if a["kind"] == "pdf" and a.get("bytes")]
71
 
72
- # Build inline PDF text block
73
  pdf_block = ""
74
  for i, p in enumerate(pdf_atts):
75
  pdf_block += f"\n\n[Attached PDF #{i+1}: {p.get('name','document.pdf')}]\n"
@@ -85,14 +93,14 @@ def build_messages(user_text: str,
85
  })
86
  user_content.append({
87
  "type": "text",
88
- "text": (user_text or "(no text)") + pdf_block + ctx,
 
89
  })
90
  return [
91
  {"role": "system", "content": SYSTEM_PROMPT},
92
  {"role": "user", "content": user_content},
93
  ]
94
 
95
- # No vision: include note if user attached images but vision is off
96
  note = ""
97
  if image_atts and not MMPROJ_PATH:
98
  note = (f"\n\n[Note: user attached {len(image_atts)} image(s) but vision "
@@ -103,7 +111,8 @@ def build_messages(user_text: str,
103
  return [
104
  {"role": "system", "content": SYSTEM_PROMPT},
105
  {"role": "user",
106
- "content": (user_text or "(no text)") + pdf_block + note + ctx},
 
107
  ]
108
 
109
 
 
3
  Supports up to 2 multimodal attachments (images or PDFs). For PDFs we extract
4
  text inline (since the vision projector handles images only). If vision is
5
  unavailable we degrade gracefully to text-only.
6
+
7
+ FIX: SYSTEM_PROMPT now explicitly forbids <think> blocks, markdown fences,
8
+ preamble, and any non-JSON token. Even so, server.py still strips <think>
9
+ defensively because the fine-tuned weights emit it occasionally.
10
  """
11
  import base64
12
  import io
 
18
  from .model_loader import MMPROJ_PATH
19
 
20
  SYSTEM_PROMPT = """You are Elysium — a persistent agentic civilization.
21
+ OUTPUT CONTRACT (strict):
22
+ Respond with ONE valid JSON object matching the ElysiumResponse schema v1.0.0.
23
+ • Output JSON ONLY. No preamble, no postscript, no markdown, no code fences.
24
+ • Do NOT emit <think>, <reasoning>, or any XML-style tags.
25
+ • The first character of your output MUST be `{` and the last must be `}`.
26
 
27
  Decide complexity dynamically:
28
  - SIMPLE_REPLY: trivial Q — no agents (council_deliberation.agent_outputs = [])
 
33
  - SPECIATION_EVENT: only on unresolved cross-domain tension
34
  - Always populate ui_directives (camera_focus_node_id, pulses, threads)
35
  - All node_id and edge_id values must be unique strings
36
+ - Always include 'direct_answer' — a short human-readable answer (one or two
37
+ sentences) suitable for surfacing in a toast. NEVER place JSON, raw schema
38
+ text, or system tags inside direct_answer.
39
 
40
  When the user attaches images or PDFs, analyze them, populate
41
  multimodal_perception fields (ocr_extracted_text, image_scene_description,
42
+ document_type, visual_entities_detected), and reference them in your reasoning
43
+ through the hypergraph_delta and agent thinking — NOT in direct_answer.
44
  """
45
 
46
 
 
75
  ctx = f"\n\n[Hypergraph context]\n{hg_context}" if hg_context else ""
76
  attachments = attachments or []
77
 
 
78
  image_atts = [a for a in attachments if a["kind"] == "image" and a.get("image") is not None]
79
  pdf_atts = [a for a in attachments if a["kind"] == "pdf" and a.get("bytes")]
80
 
 
81
  pdf_block = ""
82
  for i, p in enumerate(pdf_atts):
83
  pdf_block += f"\n\n[Attached PDF #{i+1}: {p.get('name','document.pdf')}]\n"
 
93
  })
94
  user_content.append({
95
  "type": "text",
96
+ "text": (user_text or "(no text)") + pdf_block + ctx +
97
+ "\n\nReturn ONLY the JSON object. No <think> tags. No prose.",
98
  })
99
  return [
100
  {"role": "system", "content": SYSTEM_PROMPT},
101
  {"role": "user", "content": user_content},
102
  ]
103
 
 
104
  note = ""
105
  if image_atts and not MMPROJ_PATH:
106
  note = (f"\n\n[Note: user attached {len(image_atts)} image(s) but vision "
 
111
  return [
112
  {"role": "system", "content": SYSTEM_PROMPT},
113
  {"role": "user",
114
+ "content": (user_text or "(no text)") + pdf_block + note + ctx +
115
+ "\n\nReturn ONLY the JSON object. No <think> tags. No prose."},
116
  ]
117
 
118