Spaces:

build-small-hackathon
/

elysium

Running on Zero

App | Files | Community

Update backend/prompt_builder.py

by pmrinal2005 - opened 20 days ago

base: refs/heads/main

←

from: refs/pr/6

Discussion Files changed

+18

-9

Files changed (1) hide show

backend/prompt_builder.py +18 -9

backend/prompt_builder.py CHANGED Viewed

@@ -3,6 +3,10 @@
 Supports up to 2 multimodal attachments (images or PDFs). For PDFs we extract
 text inline (since the vision projector handles images only). If vision is
 unavailable we degrade gracefully to text-only.
 """
 import base64
 import io
@@ -14,8 +18,11 @@ from PIL import Image
 from .model_loader import MMPROJ_PATH
 SYSTEM_PROMPT = """You are Elysium — a persistent agentic civilization.
-You ALWAYS respond with a single valid JSON object exactly matching the
-ElysiumResponse schema v1.0.0. No preamble. No markdown fences. JSON only.
 Decide complexity dynamically:
 - SIMPLE_REPLY: trivial Q — no agents (council_deliberation.agent_outputs = [])
@@ -26,11 +33,14 @@ Decide complexity dynamically:
 - SPECIATION_EVENT: only on unresolved cross-domain tension
 - Always populate ui_directives (camera_focus_node_id, pulses, threads)
 - All node_id and edge_id values must be unique strings
-- Always include 'direct_answer' — a short human-readable answer to surface in toasts.
 When the user attaches images or PDFs, analyze them, populate
 multimodal_perception fields (ocr_extracted_text, image_scene_description,
-document_type, visual_entities_detected), and reference them in your reasoning.
 """
@@ -65,11 +75,9 @@ def build_messages(user_text: str,
     ctx = f"\n\n[Hypergraph context]\n{hg_context}" if hg_context else ""
     attachments = attachments or []
-    # Gather image and pdf attachments separately
     image_atts = [a for a in attachments if a["kind"] == "image" and a.get("image") is not None]
     pdf_atts   = [a for a in attachments if a["kind"] == "pdf"   and a.get("bytes")]
-    # Build inline PDF text block
     pdf_block = ""
     for i, p in enumerate(pdf_atts):
         pdf_block += f"\n\n[Attached PDF #{i+1}: {p.get('name','document.pdf')}]\n"
@@ -85,14 +93,14 @@ def build_messages(user_text: str,
             })
         user_content.append({
             "type": "text",
-            "text": (user_text or "(no text)") + pdf_block + ctx,
         })
         return [
             {"role": "system", "content": SYSTEM_PROMPT},
             {"role": "user", "content": user_content},
         ]
-    # No vision: include note if user attached images but vision is off
     note = ""
     if image_atts and not MMPROJ_PATH:
         note = (f"\n\n[Note: user attached {len(image_atts)} image(s) but vision "
@@ -103,7 +111,8 @@ def build_messages(user_text: str,
     return [
         {"role": "system", "content": SYSTEM_PROMPT},
         {"role": "user",
-         "content": (user_text or "(no text)") + pdf_block + note + ctx},
     ]

 Supports up to 2 multimodal attachments (images or PDFs). For PDFs we extract
 text inline (since the vision projector handles images only). If vision is
 unavailable we degrade gracefully to text-only.
+FIX: SYSTEM_PROMPT now explicitly forbids <think> blocks, markdown fences,
+preamble, and any other non-JSON output. Even so, server.py still strips
+<think> blocks defensively, because the fine-tuned weights occasionally emit
+them anyway.
 """
 import base64
 import io
 from .model_loader import MMPROJ_PATH
 SYSTEM_PROMPT = """You are Elysium — a persistent agentic civilization.
+OUTPUT CONTRACT (strict):
+  • Respond with ONE valid JSON object matching the ElysiumResponse schema v1.0.0.
+  • Output JSON ONLY. No preamble, no postscript, no markdown, no code fences.
+  • Do NOT emit <think>, <reasoning>, or any XML-style tags.
+  • The first character of your output MUST be `{` and the last must be `}`.
 Decide complexity dynamically:
 - SIMPLE_REPLY: trivial Q — no agents (council_deliberation.agent_outputs = [])
 - SPECIATION_EVENT: only on unresolved cross-domain tension
 - Always populate ui_directives (camera_focus_node_id, pulses, threads)
 - All node_id and edge_id values must be unique strings
+- Always include 'direct_answer' — a short human-readable answer (one or two
+  sentences) suitable for surfacing in a toast. NEVER place JSON, raw schema
+  text, or system tags inside direct_answer.
 When the user attaches images or PDFs, analyze them, populate
 multimodal_perception fields (ocr_extracted_text, image_scene_description,
+document_type, visual_entities_detected), and reference them in your reasoning
+through the hypergraph_delta and agent thinking — NOT in direct_answer.
 """
     ctx = f"\n\n[Hypergraph context]\n{hg_context}" if hg_context else ""
     attachments = attachments or []
     image_atts = [a for a in attachments if a["kind"] == "image" and a.get("image") is not None]
     pdf_atts   = [a for a in attachments if a["kind"] == "pdf"   and a.get("bytes")]
     pdf_block = ""
     for i, p in enumerate(pdf_atts):
         pdf_block += f"\n\n[Attached PDF #{i+1}: {p.get('name','document.pdf')}]\n"
             })
         user_content.append({
             "type": "text",
+            "text": (user_text or "(no text)") + pdf_block + ctx +
+                    "\n\nReturn ONLY the JSON object. No <think> tags. No prose.",
         })
         return [
             {"role": "system", "content": SYSTEM_PROMPT},
             {"role": "user", "content": user_content},
         ]
     note = ""
     if image_atts and not MMPROJ_PATH:
         note = (f"\n\n[Note: user attached {len(image_atts)} image(s) but vision "
     return [
         {"role": "system", "content": SYSTEM_PROMPT},
         {"role": "user",
+         "content": (user_text or "(no text)") + pdf_block + note + ctx +
+                    "\n\nReturn ONLY the JSON object. No <think> tags. No prose."},
     ]