Spaces:
Sleeping
Sleeping
Update app/backend/messages_templates.py
Browse files- app/backend/messages_templates.py +41 -109
app/backend/messages_templates.py
CHANGED
|
@@ -1,61 +1,3 @@
|
|
| 1 |
-
# def get_toc_extraction_messages(toc_text: str):
|
| 2 |
-
# return [
|
| 3 |
-
# {
|
| 4 |
-
# "role": "system",
|
| 5 |
-
# "content": "You are a precise document parser that extracts structured information from a table of contents. You NEVER hallucinate, invent, or make up information. You ONLY extract what is explicitly present in the provided text. If you cannot find clear chapter information, you return an empty array. You do not guess chapter titles or page numbers."
|
| 6 |
-
# },
|
| 7 |
-
# {
|
| 8 |
-
# "role": "user",
|
| 9 |
-
# "content": "I need to extract main chapter information from this table of contents. Only extract numbered chapters, ignore subsections. Do not make up any information."
|
| 10 |
-
# },
|
| 11 |
-
# {
|
| 12 |
-
# "role": "assistant",
|
| 13 |
-
# "content": "I understand. I will extract ONLY the main chapters that are explicitly shown in your table of contents. I will not invent, guess, or hallucinate any chapter titles or page numbers. I will only use the exact information present in the document."
|
| 14 |
-
# },
|
| 15 |
-
# {
|
| 16 |
-
# "role": "user",
|
| 17 |
-
# "content": f"""Here is the table of contents:
|
| 18 |
-
|
| 19 |
-
# {toc_text}
|
| 20 |
-
|
| 21 |
-
# WARNING: DO NOT HALLUCINATE OR INVENT INFORMATION
|
| 22 |
-
# - Do NOT make up chapter titles like "Probability", "Statistical Inference", "Linear Regression"
|
| 23 |
-
# - Do NOT guess page numbers
|
| 24 |
-
# - Do NOT create generic textbook chapters
|
| 25 |
-
# - ONLY extract what you can clearly see in the provided text
|
| 26 |
-
|
| 27 |
-
# CRITICAL RULES:
|
| 28 |
-
# 1. Extract ONLY main chapters that start with a number (1, 2, 3, etc.)
|
| 29 |
-
# 2. Do NOT extract subsections (like 1.1, 1.2, 2.1, etc.)
|
| 30 |
-
# 3. Use the EXACT chapter titles shown in the document
|
| 31 |
-
# 4. Use the EXACT page numbers shown in the document
|
| 32 |
-
# 5. Handle both roman numerals (i, ii, iii, v, x) and arabic numerals (1, 25, 100)
|
| 33 |
-
# 6. Calculate end pages as: next chapter's start page minus 1
|
| 34 |
-
# 7. Return ONLY valid JSON - no explanations, no markdown formatting
|
| 35 |
-
# 8. If you cannot clearly identify chapters, return empty array []
|
| 36 |
-
|
| 37 |
-
# Look for patterns like:
|
| 38 |
-
# - "1 Probability Theory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1"
|
| 39 |
-
# - "2 Distribution Theory and Statistical Models . . . . . . . . . . . . . . . . 155"
|
| 40 |
-
# - "3 Basic Statistical Theory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 205"
|
| 41 |
-
|
| 42 |
-
# DO NOT extract lines like:
|
| 43 |
-
# - "1.1 Some Important Music Concepts . . . . . . . . . . . 3"
|
| 44 |
-
# - "Preface . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v"
|
| 45 |
-
|
| 46 |
-
# Use ONLY the exact titles from the document. Do not shorten or modify them.
|
| 47 |
-
|
| 48 |
-
# Return JSON array: [{{"chapter_number": "X", "chapter_title": "...", "start_page": X, "end_page": X}}]
|
| 49 |
-
|
| 50 |
-
# REMEMBER: Extract only what is explicitly visible in the text. Do not hallucinate. Be complete and extract all chapters that are clearly numbered. If you cannot clearly identify any chapters, return an empty array []."""
|
| 51 |
-
# },
|
| 52 |
-
# {
|
| 53 |
-
# "role": "assistant",
|
| 54 |
-
# "content": "I will carefully examine the table of contents and extract only the main chapters that are explicitly shown, using their exact titles and page numbers. I will not invent or hallucinate any information."
|
| 55 |
-
# }
|
| 56 |
-
# ]
|
| 57 |
-
|
| 58 |
-
|
| 59 |
def toc_prompt(toc_text: str):
|
| 60 |
# Convert to Gemma 3 format - single string with proper turn markers
|
| 61 |
prompt = f"""<start_of_turn>user
|
|
@@ -125,24 +67,22 @@ def chapter_prompt(contexts, num_questions, max_questions=5):
|
|
| 125 |
prompt = """<start_of_turn>user
|
| 126 |
You are a question generation expert. Generate exactly {num_questions} diverse questions based on the provided text contexts.
|
| 127 |
|
| 128 |
-
|
| 129 |
-
1.
|
| 130 |
-
2.
|
| 131 |
-
3.
|
| 132 |
-
4.
|
| 133 |
-
5. Do
|
| 134 |
-
6.
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
CONTEXTS:
|
| 137 |
{contexts}
|
| 138 |
|
| 139 |
-
|
| 140 |
-
[
|
| 141 |
-
{{"question": "Your question here?", "answer": "Complete answer from the context"}},
|
| 142 |
-
{{"question": "Another question?", "answer": "Another answer"}}
|
| 143 |
-
]
|
| 144 |
-
|
| 145 |
-
Generate the questions now:<end_of_turn>
|
| 146 |
<start_of_turn>model
|
| 147 |
""".format(
|
| 148 |
num_questions=min(num_questions, max_questions),
|
|
@@ -151,6 +91,7 @@ Generate the questions now:<end_of_turn>
|
|
| 151 |
|
| 152 |
return prompt
|
| 153 |
|
|
|
|
| 154 |
def chapter_prompt_edgecase(grouped_chunks, num_questions, max_questions=5):
|
| 155 |
"""
|
| 156 |
Create a prompt formatted for Gemma 3 12B-IT model.
|
|
@@ -167,23 +108,22 @@ Generate {num_questions} questions from the following contexts. You may:
|
|
| 167 |
- Use multiple contexts for a single question
|
| 168 |
- Skip contexts if they don't contain meaningful information
|
| 169 |
|
| 170 |
-
|
| 171 |
-
1.
|
| 172 |
-
2.
|
| 173 |
-
3.
|
| 174 |
-
4.
|
| 175 |
-
5.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
CONTEXT GROUPS:
|
| 178 |
{context_groups}
|
| 179 |
|
| 180 |
-
|
| 181 |
-
[
|
| 182 |
-
{{"question": "Question text?", "answer": "Answer text", "context_used": [1, 2]}},
|
| 183 |
-
{{"question": "Question text?", "answer": "Answer text", "context_used": [1]}}
|
| 184 |
-
]
|
| 185 |
-
|
| 186 |
-
Generate the questions:<end_of_turn>
|
| 187 |
<start_of_turn>model
|
| 188 |
""".format(
|
| 189 |
num_questions=min(num_questions, max_questions),
|
|
@@ -214,40 +154,32 @@ def book_prompt(contexts, num_questions, user_query=None, max_questions=5):
|
|
| 214 |
if user_query:
|
| 215 |
topic_context = f"""
|
| 216 |
TOPIC FOCUS: {user_query}
|
| 217 |
-
|
| 218 |
-
- Relate to the main topic: "{user_query}"
|
| 219 |
-
- Explore different aspects of this topic found in the contexts
|
| 220 |
-
- Connect the topic to broader concepts when relevant
|
| 221 |
-
|
| 222 |
"""
|
| 223 |
|
| 224 |
prompt = """<start_of_turn>user
|
| 225 |
-
|
| 226 |
{topic_context}
|
| 227 |
-
|
| 228 |
-
1.
|
| 229 |
-
2.
|
| 230 |
-
3.
|
| 231 |
-
4.
|
| 232 |
-
5. Do
|
| 233 |
-
6.
|
| 234 |
-
7.
|
|
|
|
|
|
|
| 235 |
|
| 236 |
-
CONTEXTS
|
| 237 |
{contexts}
|
| 238 |
|
| 239 |
-
|
| 240 |
-
[
|
| 241 |
-
{{"question": "Your question here?", "answer": "Complete answer from the context"}},
|
| 242 |
-
{{"question": "Another question?", "answer": "Another answer"}}
|
| 243 |
-
]
|
| 244 |
-
|
| 245 |
-
Generate the questions now:<end_of_turn>
|
| 246 |
<start_of_turn>model
|
| 247 |
""".format(
|
| 248 |
num_questions=num_questions,
|
| 249 |
-
topic_context=topic_context,
|
| 250 |
-
query=user_query if user_query else "the provided content",
|
| 251 |
contexts=format_contexts(contexts)
|
| 252 |
)
|
| 253 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
def toc_prompt(toc_text: str):
|
| 2 |
# Convert to Gemma 3 format - single string with proper turn markers
|
| 3 |
prompt = f"""<start_of_turn>user
|
|
|
|
| 67 |
prompt = """<start_of_turn>user
|
| 68 |
You are a question generation expert. Generate exactly {num_questions} diverse questions based on the provided text contexts.
|
| 69 |
|
| 70 |
+
RULES:
|
| 71 |
+
1. No questions about images/figures/diagrams
|
| 72 |
+
2. Include actual formulas in questions if referencing them
|
| 73 |
+
3. Vary question types (what, why, how, explain, compare)
|
| 74 |
+
4. Questions should explore different aspects of the topic
|
| 75 |
+
5. Do NOT reference "Context X" or "according to the text" in questions
|
| 76 |
+
6. Write questions as if asking about the topic directly
|
| 77 |
+
7. No yes/no questions
|
| 78 |
+
8. Answers must be 1-3 sentences from contexts
|
| 79 |
+
9. Generate EXACTLY {num_questions} questions
|
| 80 |
|
| 81 |
CONTEXTS:
|
| 82 |
{contexts}
|
| 83 |
|
| 84 |
+
Return ONLY valid JSON:
|
| 85 |
+
[{{"question": "Question?", "answer": "Answer from context"}}]<end_of_turn>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
<start_of_turn>model
|
| 87 |
""".format(
|
| 88 |
num_questions=min(num_questions, max_questions),
|
|
|
|
| 91 |
|
| 92 |
return prompt
|
| 93 |
|
| 94 |
+
|
| 95 |
def chapter_prompt_edgecase(grouped_chunks, num_questions, max_questions=5):
|
| 96 |
"""
|
| 97 |
Create a prompt formatted for Gemma 3 12B-IT model.
|
|
|
|
| 108 |
- Use multiple contexts for a single question
|
| 109 |
- Skip contexts if they don't contain meaningful information
|
| 110 |
|
| 111 |
+
RULES:
|
| 112 |
+
1. No questions about images/figures/diagrams
|
| 113 |
+
2. Include actual formulas in questions if referencing them
|
| 114 |
+
3. Vary question types (what, why, how, explain, compare)
|
| 115 |
+
4. Questions should explore different aspects of the topic
|
| 116 |
+
5. Do NOT reference "Context X" or "according to the text" in questions
|
| 117 |
+
6. Write questions as if asking about the topic directly
|
| 118 |
+
7. No yes/no questions
|
| 119 |
+
8. Answers must be 1-3 sentences from contexts
|
| 120 |
+
9. Generate EXACTLY {num_questions} questions
|
| 121 |
|
| 122 |
CONTEXT GROUPS:
|
| 123 |
{context_groups}
|
| 124 |
|
| 125 |
+
Return ONLY valid JSON:
|
| 126 |
+
[{{"question": "Question?", "answer": "Answer from context"}}]<end_of_turn>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
<start_of_turn>model
|
| 128 |
""".format(
|
| 129 |
num_questions=min(num_questions, max_questions),
|
|
|
|
| 154 |
if user_query:
|
| 155 |
topic_context = f"""
|
| 156 |
TOPIC FOCUS: {user_query}
|
| 157 |
+
Generate questions that relate to this topic and explore different aspects found in the contexts.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
"""
|
| 159 |
|
| 160 |
prompt = """<start_of_turn>user
|
| 161 |
+
Generate exactly {num_questions} questions from the contexts below as valid JSON.
|
| 162 |
{topic_context}
|
| 163 |
+
RULES:
|
| 164 |
+
1. No questions about images/figures/diagrams
|
| 165 |
+
2. Include actual formulas in questions if referencing them
|
| 166 |
+
3. Vary question types (what, why, how, explain, compare)
|
| 167 |
+
4. Questions should explore different aspects of the topic
|
| 168 |
+
5. Do NOT reference "Context X" or "according to the text" in questions
|
| 169 |
+
6. Write questions as if asking about the topic directly
|
| 170 |
+
7. No yes/no questions
|
| 171 |
+
8. Answers must be 1-3 sentences from contexts
|
| 172 |
+
9. Generate EXACTLY {num_questions} questions
|
| 173 |
|
| 174 |
+
CONTEXTS:
|
| 175 |
{contexts}
|
| 176 |
|
| 177 |
+
Return ONLY valid JSON:
|
| 178 |
+
[{{"question": "Question?", "answer": "Answer from context"}}]<end_of_turn>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
<start_of_turn>model
|
| 180 |
""".format(
|
| 181 |
num_questions=num_questions,
|
| 182 |
+
topic_context=topic_context.strip(),
|
|
|
|
| 183 |
contexts=format_contexts(contexts)
|
| 184 |
)
|
| 185 |
|