Spaces:
Sleeping
Sleeping
Update app/backend/messages_templates.py
Browse files- app/backend/messages_templates.py +41 -109
app/backend/messages_templates.py
CHANGED
|
@@ -1,61 +1,3 @@
|
|
| 1 |
-
# def get_toc_extraction_messages(toc_text: str):
|
| 2 |
-
# return [
|
| 3 |
-
# {
|
| 4 |
-
# "role": "system",
|
| 5 |
-
# "content": "You are a precise document parser that extracts structured information from a table of contents. You NEVER hallucinate, invent, or make up information. You ONLY extract what is explicitly present in the provided text. If you cannot find clear chapter information, you return an empty array. You do not guess chapter titles or page numbers."
|
| 6 |
-
# },
|
| 7 |
-
# {
|
| 8 |
-
# "role": "user",
|
| 9 |
-
# "content": "I need to extract main chapter information from this table of contents. Only extract numbered chapters, ignore subsections. Do not make up any information."
|
| 10 |
-
# },
|
| 11 |
-
# {
|
| 12 |
-
# "role": "assistant",
|
| 13 |
-
# "content": "I understand. I will extract ONLY the main chapters that are explicitly shown in your table of contents. I will not invent, guess, or hallucinate any chapter titles or page numbers. I will only use the exact information present in the document."
|
| 14 |
-
# },
|
| 15 |
-
# {
|
| 16 |
-
# "role": "user",
|
| 17 |
-
# "content": f"""Here is the table of contents:
|
| 18 |
-
|
| 19 |
-
# {toc_text}
|
| 20 |
-
|
| 21 |
-
# WARNING: DO NOT HALLUCINATE OR INVENT INFORMATION
|
| 22 |
-
# - Do NOT make up chapter titles like "Probability", "Statistical Inference", "Linear Regression"
|
| 23 |
-
# - Do NOT guess page numbers
|
| 24 |
-
# - Do NOT create generic textbook chapters
|
| 25 |
-
# - ONLY extract what you can clearly see in the provided text
|
| 26 |
-
|
| 27 |
-
# CRITICAL RULES:
|
| 28 |
-
# 1. Extract ONLY main chapters that start with a number (1, 2, 3, etc.)
|
| 29 |
-
# 2. Do NOT extract subsections (like 1.1, 1.2, 2.1, etc.)
|
| 30 |
-
# 3. Use the EXACT chapter titles shown in the document
|
| 31 |
-
# 4. Use the EXACT page numbers shown in the document
|
| 32 |
-
# 5. Handle both roman numerals (i, ii, iii, v, x) and arabic numerals (1, 25, 100)
|
| 33 |
-
# 6. Calculate end pages as: next chapter's start page minus 1
|
| 34 |
-
# 7. Return ONLY valid JSON - no explanations, no markdown formatting
|
| 35 |
-
# 8. If you cannot clearly identify chapters, return empty array []
|
| 36 |
-
|
| 37 |
-
# Look for patterns like:
|
| 38 |
-
# - "1 Probability Theory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1"
|
| 39 |
-
# - "2 Distribution Theory and Statistical Models . . . . . . . . . . . . . . . . 155"
|
| 40 |
-
# - "3 Basic Statistical Theory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 205"
|
| 41 |
-
|
| 42 |
-
# DO NOT extract lines like:
|
| 43 |
-
# - "1.1 Some Important Music Concepts . . . . . . . . . . . 3"
|
| 44 |
-
# - "Preface . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v"
|
| 45 |
-
|
| 46 |
-
# Use ONLY the exact titles from the document. Do not shorten or modify them.
|
| 47 |
-
|
| 48 |
-
# Return JSON array: [{{"chapter_number": "X", "chapter_title": "...", "start_page": X, "end_page": X}}]
|
| 49 |
-
|
| 50 |
-
# REMEMBER: Extract only what is explicitly visible in the text. Do not hallucinate. Be complete and extract all chapters that are clearly numbered. If you cannot clearly identify any chapters, return an empty array []."""
|
| 51 |
-
# },
|
| 52 |
-
# {
|
| 53 |
-
# "role": "assistant",
|
| 54 |
-
# "content": "I will carefully examine the table of contents and extract only the main chapters that are explicitly shown, using their exact titles and page numbers. I will not invent or hallucinate any information."
|
| 55 |
-
# }
|
| 56 |
-
# ]
|
| 57 |
-
|
| 58 |
-
|
| 59 |
def toc_prompt(toc_text: str):
|
| 60 |
# Convert to Gemma 3 format - single string with proper turn markers
|
| 61 |
prompt = f"""<start_of_turn>user
|
|
@@ -125,24 +67,22 @@ def chapter_prompt(contexts, num_questions, max_questions=5):
|
|
| 125 |
prompt = """<start_of_turn>user
|
| 126 |
You are a question generation expert. Generate exactly {num_questions} diverse questions based on the provided text contexts.
|
| 127 |
|
| 128 |
-
|
| 129 |
-
1.
|
| 130 |
-
2.
|
| 131 |
-
3.
|
| 132 |
-
4.
|
| 133 |
-
5. Do
|
| 134 |
-
6.
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
CONTEXTS:
|
| 137 |
{contexts}
|
| 138 |
|
| 139 |
-
|
| 140 |
-
[
|
| 141 |
-
{{"question": "Your question here?", "answer": "Complete answer from the context"}},
|
| 142 |
-
{{"question": "Another question?", "answer": "Another answer"}}
|
| 143 |
-
]
|
| 144 |
-
|
| 145 |
-
Generate the questions now:<end_of_turn>
|
| 146 |
<start_of_turn>model
|
| 147 |
""".format(
|
| 148 |
num_questions=min(num_questions, max_questions),
|
|
@@ -151,6 +91,7 @@ Generate the questions now:<end_of_turn>
|
|
| 151 |
|
| 152 |
return prompt
|
| 153 |
|
|
|
|
| 154 |
def chapter_prompt_edgecase(grouped_chunks, num_questions, max_questions=5):
|
| 155 |
"""
|
| 156 |
Create a prompt formatted for Gemma 3 12B-IT model.
|
|
@@ -167,23 +108,22 @@ Generate {num_questions} questions from the following contexts. You may:
|
|
| 167 |
- Use multiple contexts for a single question
|
| 168 |
- Skip contexts if they don't contain meaningful information
|
| 169 |
|
| 170 |
-
|
| 171 |
-
1.
|
| 172 |
-
2.
|
| 173 |
-
3.
|
| 174 |
-
4.
|
| 175 |
-
5.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
CONTEXT GROUPS:
|
| 178 |
{context_groups}
|
| 179 |
|
| 180 |
-
|
| 181 |
-
[
|
| 182 |
-
{{"question": "Question text?", "answer": "Answer text", "context_used": [1, 2]}},
|
| 183 |
-
{{"question": "Question text?", "answer": "Answer text", "context_used": [1]}}
|
| 184 |
-
]
|
| 185 |
-
|
| 186 |
-
Generate the questions:<end_of_turn>
|
| 187 |
<start_of_turn>model
|
| 188 |
""".format(
|
| 189 |
num_questions=min(num_questions, max_questions),
|
|
@@ -214,40 +154,32 @@ def book_prompt(contexts, num_questions, user_query=None, max_questions=5):
|
|
| 214 |
if user_query:
|
| 215 |
topic_context = f"""
|
| 216 |
TOPIC FOCUS: {user_query}
|
| 217 |
-
|
| 218 |
-
- Relate to the main topic: "{user_query}"
|
| 219 |
-
- Explore different aspects of this topic found in the contexts
|
| 220 |
-
- Connect the topic to broader concepts when relevant
|
| 221 |
-
|
| 222 |
"""
|
| 223 |
|
| 224 |
prompt = """<start_of_turn>user
|
| 225 |
-
|
| 226 |
{topic_context}
|
| 227 |
-
|
| 228 |
-
1.
|
| 229 |
-
2.
|
| 230 |
-
3.
|
| 231 |
-
4.
|
| 232 |
-
5. Do
|
| 233 |
-
6.
|
| 234 |
-
7.
|
|
|
|
|
|
|
| 235 |
|
| 236 |
-
CONTEXTS
|
| 237 |
{contexts}
|
| 238 |
|
| 239 |
-
|
| 240 |
-
[
|
| 241 |
-
{{"question": "Your question here?", "answer": "Complete answer from the context"}},
|
| 242 |
-
{{"question": "Another question?", "answer": "Another answer"}}
|
| 243 |
-
]
|
| 244 |
-
|
| 245 |
-
Generate the questions now:<end_of_turn>
|
| 246 |
<start_of_turn>model
|
| 247 |
""".format(
|
| 248 |
num_questions=num_questions,
|
| 249 |
-
topic_context=topic_context,
|
| 250 |
-
query=user_query if user_query else "the provided content",
|
| 251 |
contexts=format_contexts(contexts)
|
| 252 |
)
|
| 253 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
def toc_prompt(toc_text: str):
|
| 2 |
# Convert to Gemma 3 format - single string with proper turn markers
|
| 3 |
prompt = f"""<start_of_turn>user
|
|
|
|
| 67 |
prompt = """<start_of_turn>user
|
| 68 |
You are a question generation expert. Generate exactly {num_questions} diverse questions based on the provided text contexts.
|
| 69 |
|
| 70 |
+
RULES:
|
| 71 |
+
1. No questions about images/figures/diagrams
|
| 72 |
+
2. Include actual formulas in questions if referencing them
|
| 73 |
+
3. Vary question types (what, why, how, explain, compare)
|
| 74 |
+
4. Questions should explore different aspects of the topic
|
| 75 |
+
5. Do NOT reference "Context X" or "according to the text" in questions
|
| 76 |
+
6. Write questions as if asking about the topic directly
|
| 77 |
+
7. No yes/no questions
|
| 78 |
+
8. Answers must be 1-3 sentences from contexts
|
| 79 |
+
9. Generate EXACTLY {num_questions} questions
|
| 80 |
|
| 81 |
CONTEXTS:
|
| 82 |
{contexts}
|
| 83 |
|
| 84 |
+
Return ONLY valid JSON:
|
| 85 |
+
[{{"question": "Question?", "answer": "Answer from context"}}]<end_of_turn>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
<start_of_turn>model
|
| 87 |
""".format(
|
| 88 |
num_questions=min(num_questions, max_questions),
|
|
|
|
| 91 |
|
| 92 |
return prompt
|
| 93 |
|
| 94 |
+
|
| 95 |
def chapter_prompt_edgecase(grouped_chunks, num_questions, max_questions=5):
|
| 96 |
"""
|
| 97 |
Create a prompt formatted for Gemma 3 12B-IT model.
|
|
|
|
| 108 |
- Use multiple contexts for a single question
|
| 109 |
- Skip contexts if they don't contain meaningful information
|
| 110 |
|
| 111 |
+
RULES:
|
| 112 |
+
1. No questions about images/figures/diagrams
|
| 113 |
+
2. Include actual formulas in questions if referencing them
|
| 114 |
+
3. Vary question types (what, why, how, explain, compare)
|
| 115 |
+
4. Questions should explore different aspects of the topic
|
| 116 |
+
5. Do NOT reference "Context X" or "according to the text" in questions
|
| 117 |
+
6. Write questions as if asking about the topic directly
|
| 118 |
+
7. No yes/no questions
|
| 119 |
+
8. Answers must be 1-3 sentences from contexts
|
| 120 |
+
9. Generate EXACTLY {num_questions} questions
|
| 121 |
|
| 122 |
CONTEXT GROUPS:
|
| 123 |
{context_groups}
|
| 124 |
|
| 125 |
+
Return ONLY valid JSON:
|
| 126 |
+
[{{"question": "Question?", "answer": "Answer from context"}}]<end_of_turn>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
<start_of_turn>model
|
| 128 |
""".format(
|
| 129 |
num_questions=min(num_questions, max_questions),
|
|
|
|
| 154 |
if user_query:
|
| 155 |
topic_context = f"""
|
| 156 |
TOPIC FOCUS: {user_query}
|
| 157 |
+
Generate questions that relate to this topic and explore different aspects found in the contexts.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
"""
|
| 159 |
|
| 160 |
prompt = """<start_of_turn>user
|
| 161 |
+
Generate exactly {num_questions} questions from the contexts below as valid JSON.
|
| 162 |
{topic_context}
|
| 163 |
+
RULES:
|
| 164 |
+
1. No questions about images/figures/diagrams
|
| 165 |
+
2. Include actual formulas in questions if referencing them
|
| 166 |
+
3. Vary question types (what, why, how, explain, compare)
|
| 167 |
+
4. Questions should explore different aspects of the topic
|
| 168 |
+
5. Do NOT reference "Context X" or "according to the text" in questions
|
| 169 |
+
6. Write questions as if asking about the topic directly
|
| 170 |
+
7. No yes/no questions
|
| 171 |
+
8. Answers must be 1-3 sentences from contexts
|
| 172 |
+
9. Generate EXACTLY {num_questions} questions
|
| 173 |
|
| 174 |
+
CONTEXTS:
|
| 175 |
{contexts}
|
| 176 |
|
| 177 |
+
Return ONLY valid JSON:
|
| 178 |
+
[{{"question": "Question?", "answer": "Answer from context"}}]<end_of_turn>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
<start_of_turn>model
|
| 180 |
""".format(
|
| 181 |
num_questions=num_questions,
|
| 182 |
+
topic_context=topic_context.strip(),
|
|
|
|
| 183 |
contexts=format_contexts(contexts)
|
| 184 |
)
|
| 185 |
|