davidepanza committed on
Commit
468a9aa
·
verified ·
1 Parent(s): 82faabd

Update app/backend/messages_templates.py

Browse files
Files changed (1) hide show
  1. app/backend/messages_templates.py +41 -109
app/backend/messages_templates.py CHANGED
@@ -1,61 +1,3 @@
1
- # def get_toc_extraction_messages(toc_text: str):
2
- # return [
3
- # {
4
- # "role": "system",
5
- # "content": "You are a precise document parser that extracts structured information from table of contents. You NEVER hallucinate You NEVER hallucinate, invent, or make up information. You ONLY extract what is explicitly present in the provided text. If you cannot find clear chapter information, you return an empty array. You do not guess chapter titles or page numbers."
6
- # },
7
- # {
8
- # "role": "user",
9
- # "content": "I need to extract main chapter information from this table of contents. Only extract numbered chapters, ignore subsections. Do not make up any information."
10
- # },
11
- # {
12
- # "role": "assistant",
13
- # "content": "I understand. I will extract ONLY the main chapters that are explicitly shown in your table of contents. I will not invent, guess, or hallucinate any chapter titles or page numbers. I will only use the exact information present in the document."
14
- # },
15
- # {
16
- # "role": "user",
17
- # "content": f"""Here is the table of contents:
18
-
19
- # {toc_text}
20
-
21
- # WARNING: DO NOT HALLUCINATE OR INVENT INFORMATION
22
- # - Do NOT make up chapter titles like "Probability", "Statistical Inference", "Linear Regression"
23
- # - Do NOT guess page numbers
24
- # - Do NOT create generic textbook chapters
25
- # - ONLY extract what you can clearly see in the provided text
26
-
27
- # CRITICAL RULES:
28
- # 1. Extract ONLY main chapters that start with a number (1, 2, 3, etc.)
29
- # 2. Do NOT extract subsections (like 1.1, 1.2, 2.1, etc.)
30
- # 3. Use the EXACT chapter titles shown in the document
31
- # 4. Use the EXACT page numbers shown in the document
32
- # 5. Handle both roman numerals (i, ii, iii, v, x) and arabic numerals (1, 25, 100)
33
- # 6. Calculate end pages as: next chapter's start page minus 1
34
- # 7. Return ONLY valid JSON - no explanations, no markdown formatting
35
- # 8. If you cannot clearly identify chapters, return empty array []
36
-
37
- # Look for patterns like:
38
- # - "1 Probability Theory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1"
39
- # - "2 Distribution Theory and Statistical Models . . . . . . . . . . . . . . . . 155"
40
- # - "3 Basic Statistical Theory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 205"
41
-
42
- # DO NOT extract lines like:
43
- # - "1.1 Some Important Music Concepts . . . . . . . . . . . 3"
44
- # - "Preface . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v"
45
-
46
- # Use ONLY the exact titles from the document. Do not shorten or modify them.
47
-
48
- # Return JSON array: [{{"chapter_number": "X", "chapter_title": "...", "start_page": X, "end_page": X}}]
49
-
50
- # REMEMBER: Extract only what is explicitly visible in the text. Do not hallucinate. Be complete and extract all chapters that are clearly numbered. y chapters, return an empty array []."""
51
- # },
52
- # {
53
- # "role": "assistant",
54
- # "content": "I will carefully examine the table of contents and extract only the main chapters that are explicitly shown, using their exact titles and page numbers. I will not invent or hallucinate any information."
55
- # }
56
- # ]
57
-
58
-
59
  def toc_prompt(toc_text: str):
60
  # Convert to Gemma 3 format - single string with proper turn markers
61
  prompt = f"""<start_of_turn>user
@@ -125,24 +67,22 @@ def chapter_prompt(contexts, num_questions, max_questions=5):
125
  prompt = """<start_of_turn>user
126
  You are a question generation expert. Generate exactly {num_questions} diverse questions based on the provided text contexts.
127
 
128
- IMPORTANT REQUIREMENTS:
129
- 1. Output MUST be valid JSON format
130
- 2. Generate EXACTLY {num_questions} questions
131
- 3. Each question must have a complete answer from the contexts
132
- 4. Vary question types (what, why, how, when, explain, compare)
133
- 5. Do not generate yes/no questions
134
- 6. Answers should be 1-3 sentences long
 
 
 
135
 
136
  CONTEXTS:
137
  {contexts}
138
 
139
- OUTPUT FORMAT - Return ONLY valid JSON array:
140
- [
141
- {{"question": "Your question here?", "answer": "Complete answer from the context"}},
142
- {{"question": "Another question?", "answer": "Another answer"}}
143
- ]
144
-
145
- Generate the questions now:<end_of_turn>
146
  <start_of_turn>model
147
  """.format(
148
  num_questions=min(num_questions, max_questions),
@@ -151,6 +91,7 @@ Generate the questions now:<end_of_turn>
151
 
152
  return prompt
153
 
 
154
  def chapter_prompt_edgecase(grouped_chunks, num_questions, max_questions=5):
155
  """
156
  Create a prompt formatted for Gemma 3 12B-IT model.
@@ -167,23 +108,22 @@ Generate {num_questions} questions from the following contexts. You may:
167
  - Use multiple contexts for a single question
168
  - Skip contexts if they don't contain meaningful information
169
 
170
- REQUIREMENTS:
171
- 1. Output valid JSON array format
172
- 2. Generate EXACTLY {num_questions} questions
173
- 3. Each answer must be found in the provided contexts
174
- 4. Create diverse question types
175
- 5. Reference which context group(s) you used
 
 
 
 
176
 
177
  CONTEXT GROUPS:
178
  {context_groups}
179
 
180
- OUTPUT FORMAT - Return ONLY this JSON structure:
181
- [
182
- {{"question": "Question text?", "answer": "Answer text", "context_used": [1, 2]}},
183
- {{"question": "Question text?", "answer": "Answer text", "context_used": [1]}}
184
- ]
185
-
186
- Generate the questions:<end_of_turn>
187
  <start_of_turn>model
188
  """.format(
189
  num_questions=min(num_questions, max_questions),
@@ -214,40 +154,32 @@ def book_prompt(contexts, num_questions, user_query=None, max_questions=5):
214
  if user_query:
215
  topic_context = f"""
216
  TOPIC FOCUS: {user_query}
217
- The following contexts were retrieved based on this topic. Generate questions that:
218
- - Relate to the main topic: "{user_query}"
219
- - Explore different aspects of this topic found in the contexts
220
- - Connect the topic to broader concepts when relevant
221
-
222
  """
223
 
224
  prompt = """<start_of_turn>user
225
- You are a question generation expert. Generate exactly {num_questions} diverse questions based on the provided text contexts.
226
  {topic_context}
227
- IMPORTANT REQUIREMENTS:
228
- 1. Output MUST be valid JSON format
229
- 2. Generate EXACTLY {num_questions} questions
230
- 3. Each question must have a complete answer from the contexts
231
- 4. Vary question types (what, why, how, when, explain, compare)
232
- 5. Do not generate yes/no questions
233
- 6. Answers should be 1-3 sentences long
234
- 7. Questions should explore different aspects of the topic
 
 
235
 
236
- CONTEXTS (Retrieved based on topic: "{query}"):
237
  {contexts}
238
 
239
- OUTPUT FORMAT - Return ONLY valid JSON array:
240
- [
241
- {{"question": "Your question here?", "answer": "Complete answer from the context"}},
242
- {{"question": "Another question?", "answer": "Another answer"}}
243
- ]
244
-
245
- Generate the questions now:<end_of_turn>
246
  <start_of_turn>model
247
  """.format(
248
  num_questions=num_questions,
249
- topic_context=topic_context,
250
- query=user_query if user_query else "the provided content",
251
  contexts=format_contexts(contexts)
252
  )
253
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  def toc_prompt(toc_text: str):
2
  # Convert to Gemma 3 format - single string with proper turn markers
3
  prompt = f"""<start_of_turn>user
 
67
  prompt = """<start_of_turn>user
68
  You are a question generation expert. Generate exactly {num_questions} diverse questions based on the provided text contexts.
69
 
70
+ RULES:
71
+ 1. No questions about images/figures/diagrams
72
+ 2. Include actual formulas in questions if referencing them
73
+ 3. Vary question types (what, why, how, explain, compare)
74
+ 4. Questions should explore different aspects of the topic
75
+ 5. Do NOT reference "Context X" or "according to the text" in questions
76
+ 6. Write questions as if asking about the topic directly
77
+ 7. No yes/no questions
78
+ 8. Answers must be 1-3 sentences from contexts
79
+ 9. Generate EXACTLY {num_questions} questions
80
 
81
  CONTEXTS:
82
  {contexts}
83
 
84
+ Return ONLY valid JSON:
85
+ [{{"question": "Question?", "answer": "Answer from context"}}]<end_of_turn>
 
 
 
 
 
86
  <start_of_turn>model
87
  """.format(
88
  num_questions=min(num_questions, max_questions),
 
91
 
92
  return prompt
93
 
94
+
95
  def chapter_prompt_edgecase(grouped_chunks, num_questions, max_questions=5):
96
  """
97
  Create a prompt formatted for Gemma 3 12B-IT model.
 
108
  - Use multiple contexts for a single question
109
  - Skip contexts if they don't contain meaningful information
110
 
111
+ RULES:
112
+ 1. No questions about images/figures/diagrams
113
+ 2. Include actual formulas in questions if referencing them
114
+ 3. Vary question types (what, why, how, explain, compare)
115
+ 4. Questions should explore different aspects of the topic
116
+ 5. Do NOT reference "Context X" or "according to the text" in questions
117
+ 6. Write questions as if asking about the topic directly
118
+ 7. No yes/no questions
119
+ 8. Answers must be 1-3 sentences from contexts
120
+ 9. Generate EXACTLY {num_questions} questions
121
 
122
  CONTEXT GROUPS:
123
  {context_groups}
124
 
125
+ Return ONLY valid JSON:
126
+ [{{"question": "Question?", "answer": "Answer from context"}}]<end_of_turn>
 
 
 
 
 
127
  <start_of_turn>model
128
  """.format(
129
  num_questions=min(num_questions, max_questions),
 
154
  if user_query:
155
  topic_context = f"""
156
  TOPIC FOCUS: {user_query}
157
+ Generate questions that relate to this topic and explore different aspects found in the contexts.
 
 
 
 
158
  """
159
 
160
  prompt = """<start_of_turn>user
161
+ Generate exactly {num_questions} questions from the contexts below as valid JSON.
162
  {topic_context}
163
+ RULES:
164
+ 1. No questions about images/figures/diagrams
165
+ 2. Include actual formulas in questions if referencing them
166
+ 3. Vary question types (what, why, how, explain, compare)
167
+ 4. Questions should explore different aspects of the topic
168
+ 5. Do NOT reference "Context X" or "according to the text" in questions
169
+ 6. Write questions as if asking about the topic directly
170
+ 7. No yes/no questions
171
+ 8. Answers must be 1-3 sentences from contexts
172
+ 9. Generate EXACTLY {num_questions} questions
173
 
174
+ CONTEXTS:
175
  {contexts}
176
 
177
+ Return ONLY valid JSON:
178
+ [{{"question": "Question?", "answer": "Answer from context"}}]<end_of_turn>
 
 
 
 
 
179
  <start_of_turn>model
180
  """.format(
181
  num_questions=num_questions,
182
+ topic_context=topic_context.strip(),
 
183
  contexts=format_contexts(contexts)
184
  )
185