Spaces:
Sleeping
Sleeping
Zeggai Abdellah
committed on
Commit
·
d8d8050
1
Parent(s):
fdc8d14
update the retriever and the system prompt
Browse files
- prepare_env.py +25 -2
- rag_pipeline.py +0 -46
prepare_env.py
CHANGED
|
@@ -17,7 +17,11 @@ from langchain.retrievers.multi_query import MultiQueryRetriever
|
|
| 17 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 18 |
from llama_index.core.tools import FunctionTool
|
| 19 |
from llama_index.core.schema import TextNode
|
|
|
|
|
|
|
| 20 |
|
|
|
|
|
|
|
| 21 |
|
| 22 |
def extract_source_ids(response_text):
|
| 23 |
"""
|
|
@@ -127,7 +131,25 @@ def create_vectorstore_from_json(json_path: str, collection_name: str, embedding
|
|
| 127 |
def create_retriever(vectorstore, docs, llm):
|
| 128 |
"""Create ensemble retriever with vector and BM25 search"""
|
| 129 |
print("🔍 Creating ensemble retriever...")
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
# Vector retriever
|
| 132 |
vector_retriever = vectorstore.as_retriever(
|
| 133 |
search_type="similarity",
|
|
@@ -150,7 +172,8 @@ def create_retriever(vectorstore, docs, llm):
|
|
| 150 |
# Multi-query expanding retriever
|
| 151 |
expanding_retriever = MultiQueryRetriever.from_llm(
|
| 152 |
retriever=ensemble_retriever,
|
| 153 |
-
llm=llm
|
|
|
|
| 154 |
)
|
| 155 |
print("✅ Multi-query expanding retriever created")
|
| 156 |
|
|
|
|
| 17 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 18 |
from llama_index.core.tools import FunctionTool
|
| 19 |
from llama_index.core.schema import TextNode
|
| 20 |
+
from langchain.prompts import PromptTemplate
|
| 21 |
+
import logging
|
| 22 |
|
| 23 |
+
logging.basicConfig()
|
| 24 |
+
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)
|
| 25 |
|
| 26 |
def extract_source_ids(response_text):
|
| 27 |
"""
|
|
|
|
| 131 |
def create_retriever(vectorstore, docs, llm):
|
| 132 |
"""Create ensemble retriever with vector and BM25 search"""
|
| 133 |
print("🔍 Creating ensemble retriever...")
|
| 134 |
+
# PromptTemplate for Vaccine Assistant MultiQuery Retriever
|
| 135 |
+
VACCINE_MULTIQUERY_PROMPT = PromptTemplate(
|
| 136 |
+
input_variables=["question"],
|
| 137 |
+
template="""You are an AI assistant specialized in vaccine-related medical information retrieval.
|
| 138 |
+
Your task is to generate multiple search queries based on the original question to find relevant information from official vaccine medical documents.
|
| 139 |
+
|
| 140 |
+
IMPORTANT GUIDELINES:
|
| 141 |
+
- Keep all vaccine-specific terminology and medical terms intact
|
| 142 |
+
- Maintain the clinical and medical context
|
| 143 |
+
- Focus on evidence-based vaccine information
|
| 144 |
+
- Preserve any specific vaccine names, diseases, or medical conditions mentioned
|
| 145 |
+
- Generate queries that would help retrieve information about vaccine schedules, dosing, contraindications, adverse events, and disease prevention
|
| 146 |
+
|
| 147 |
+
Original question: {question}
|
| 148 |
+
|
| 149 |
+
Generate 4 different search queries that rephrase the original question while maintaining vaccine terminology and medical accuracy. Each query should approach the topic from a slightly different angle to maximize retrieval from vaccine medical documents.
|
| 150 |
+
|
| 151 |
+
Provide only the alternative questions, one per line."""
|
| 152 |
+
)
|
| 153 |
# Vector retriever
|
| 154 |
vector_retriever = vectorstore.as_retriever(
|
| 155 |
search_type="similarity",
|
|
|
|
| 172 |
# Multi-query expanding retriever
|
| 173 |
expanding_retriever = MultiQueryRetriever.from_llm(
|
| 174 |
retriever=ensemble_retriever,
|
| 175 |
+
llm=llm,
|
| 176 |
+
prompt=VACCINE_MULTIQUERY_PROMPT,
|
| 177 |
)
|
| 178 |
print("✅ Multi-query expanding retriever created")
|
| 179 |
|
rag_pipeline.py
CHANGED
|
@@ -143,26 +143,6 @@ Answer the doctor's question accurately and concisely using only the provided in
|
|
| 143 |
- Include the citation in the table caption, e.g., 'Table: Vaccination Schedule [Source ID]'.
|
| 144 |
2. For lists, maintain the original bullet points/numbering and include citations.
|
| 145 |
3. Present information concisely but ensure clinical accuracy is never compromised.
|
| 146 |
-
|
| 147 |
-
### CRITICAL: Efficient Fallback Strategy
|
| 148 |
-
1. **MANDATORY SEARCH**: Use each relevant tool at least once to search for information, even if you suspect the information might not be available.
|
| 149 |
-
2. **BREAK DOWN COMPLEX QUERIES**: For comparative or multi-part questions (e.g., comparing Algerian and WHO guidelines), break the query into sub-queries and use the appropriate tool for each part:
|
| 150 |
-
- Use Guide_vector_tool for Algerian-specific information (e.g., national schedules, coverage targets).
|
| 151 |
-
- Use Immunization_in_Practice_tool for WHO-specific information (e.g., global recommendations, coverage targets).
|
| 152 |
-
3. **DO NOT STOP PREMATURELY**: Do not conclude "no information is available" without using the relevant tool(s) to search for the answer.
|
| 153 |
-
4. **BE DECISIVE**: Once you find relevant information for each sub-query, formulate your response immediately.
|
| 154 |
-
5. **ANSWER FULLY**: Address all parts of the question, using multiple tools if required by the query.
|
| 155 |
-
|
| 156 |
-
### Response Guidelines
|
| 157 |
-
- **MANDATORY TOOL SELECTION**:
|
| 158 |
-
- For queries mentioning "WHO," "World Health Organization," "international," "global guidance," or WHO documents (e.g., page numbers), use Immunization_in_Practice_tool first.
|
| 159 |
-
- For queries mentioning "Algerian," "national guide," or Algerian-specific terms (e.g., page numbers), use Guide_vector_tool first.
|
| 160 |
-
- For comparative queries (e.g., Algerian vs. WHO), use both Guide_vector_tool and Immunization_in_Practice_tool, addressing each part systematically.
|
| 161 |
-
- **EXPLICIT REASONING**: Before answering, log your reasoning steps, including which tools you will use and why, based on the query’s content.
|
| 162 |
-
- **Query Decomposition**: Break comparative or multi-part queries into sub-queries (e.g., one for Algerian information, one for WHO information) and use the appropriate tool for each.
|
| 163 |
-
- Provide all found information with proper citations using Source IDs only.
|
| 164 |
-
- If information is limited, clearly state: "Based on the available documents, I can provide the following information..." and indicate what is not available.
|
| 165 |
-
|
| 166 |
---
|
| 167 |
"""
|
| 168 |
else:
|
|
@@ -190,32 +170,6 @@ Answer the doctor's question accurately and concisely using only the provided in
|
|
| 190 |
2. For lists, maintain the original bullet points/numbering and include citations.
|
| 191 |
3. Present information concisely but ensure clinical accuracy is never compromised.
|
| 192 |
|
| 193 |
-
### CRITICAL: Efficient Response Strategy
|
| 194 |
-
1. **MANDATORY SEARCH**: Always use the relevant tool(s) to search for information before answering, even if you initially think no information is available.
|
| 195 |
-
2. **MANDATORY TOOL SELECTION**:
|
| 196 |
-
- For queries mentioning "WHO," "World Health Organization," "international," "global guidance," or WHO documents (e.g., page numbers), use Immunization_in_Practice_tool first.
|
| 197 |
-
- For queries mentioning "Algerian," "national guide," or Algerian-specific terms (e.g., page numbers), use Guide_vector_tool first.
|
| 198 |
-
- For comparative queries (e.g., Algerian vs. WHO), use both Guide_vector_tool and Immunization_in_Practice_tool, addressing each part systematically.
|
| 199 |
-
3. **Query Decomposition**: Break comparative or multi-part queries into sub-queries (e.g., one for Algerian information, one for WHO information) and use the appropriate tool for each.
|
| 200 |
-
4. **DO NOT STOP PREMATURELY**: Do not conclude "no information is available" without using the relevant tool(s) to search for the answer.
|
| 201 |
-
5. **EXPLICIT REASONING**: Before answering, log your reasoning steps, including which tools you will use and why, based on the query’s content.
|
| 202 |
-
6. **BE DECISIVE**: Once you find relevant information for each sub-query, formulate your response immediately.
|
| 203 |
-
7. **ANSWER FULLY**: Address all parts of the question, using multiple tools if required by the query.
|
| 204 |
-
8. **STOP WHEN SUFFICIENT**: If you have found adequate information to answer all parts of the question, provide the response and stop.
|
| 205 |
-
|
| 206 |
-
### Response Guidelines for Complex Questions
|
| 207 |
-
- For comparative questions: Break the query into sub-queries (e.g., Algerian vs. WHO), use Guide_vector_tool for Algerian specifics and Immunization_in_Practice_tool for WHO specifics, then provide the comparison.
|
| 208 |
-
- For multi-part questions: Address each part systematically, using the appropriate tool for each sub-query.
|
| 209 |
-
- If information is not found after using the relevant tool(s): State clearly: "Based on the available documents, I can provide the following information..." and specify what is not available.
|
| 210 |
-
- Do not repeatedly search for the same terms or rephrase searches excessively.
|
| 211 |
-
|
| 212 |
-
### When Information is Limited
|
| 213 |
-
If you cannot find complete information to fully answer a question:
|
| 214 |
-
1. Provide whatever relevant information you did find with proper citations using Source IDs only.
|
| 215 |
-
2. Clearly state: "Based on the available documents, I can provide the following information..."
|
| 216 |
-
3. Indicate what specific information is not available: "However, information about [specific topic] was not found in the provided documents after searching with the relevant tool(s)."
|
| 217 |
-
4. Do not conclude "no information is available" without attempting a search with the appropriate tool(s).
|
| 218 |
-
|
| 219 |
---
|
| 220 |
"""
|
| 221 |
|
|
|
|
| 143 |
- Include the citation in the table caption, e.g., 'Table: Vaccination Schedule [Source ID]'.
|
| 144 |
2. For lists, maintain the original bullet points/numbering and include citations.
|
| 145 |
3. Present information concisely but ensure clinical accuracy is never compromised.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
---
|
| 147 |
"""
|
| 148 |
else:
|
|
|
|
| 170 |
2. For lists, maintain the original bullet points/numbering and include citations.
|
| 171 |
3. Present information concisely but ensure clinical accuracy is never compromised.
|
| 172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
---
|
| 174 |
"""
|
| 175 |
|