Zeggai Abdellah committed on
Commit
7f51074
·
1 Parent(s): fb64dbc

first commit

Browse files
Files changed (7) hide show
  1. .gitignore +1 -0
  2. Dockerfile +28 -0
  3. app.py +46 -0
  4. chunks.json +0 -0
  5. prepare_env.py +89 -0
  6. rag_pipeline.py +289 -0
  7. requirements.txt +0 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Use a Python 3.9 base image
FROM python:3.9-slim

# Set working directory
WORKDIR /code

# Copy requirements file
COPY ./requirements.txt /code/requirements.txt

# Install dependencies
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Create a non-root user for security
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user PATH=/home/user/.local/bin:$PATH

# Set app directory
WORKDIR $HOME/app

# Copy all project files
COPY --chown=user . $HOME/app

# Expose port 7860 (Hugging Face default)
EXPOSE 7860

# Run the FastAPI app with uvicorn ("app:app" = the FastAPI instance named
# `app` inside app.py)
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from fastapi import FastAPI, Query
from prepare_env import prepare_environment_and_retriever
from rag_pipeline import full_rag_pipeline
from langchain_google_genai import GoogleGenerativeAI
import os
from dotenv import load_dotenv

# Load environment variables from .env file (notably GOOGLE_API_KEY,
# read later via os.getenv)
load_dotenv()


app = FastAPI()

# Prepare the environment and load the vector store once at import time.
# NOTE(review): this embeds every chunk in chunks.json, so app startup can
# be slow and requires the chunks file to be present.
expanding_retriever = prepare_environment_and_retriever()
16
+
17
+
18
@app.get("/ask")
def ask_question(question: str, with_citations: bool = Query(False, description="Include citations in the response")):
    """Answer a question through the full RAG pipeline.

    Args:
        question: The user's question.
        with_citations: When True, keep the numbered inline citations in the
            answer; when False (default), strip them out.

    Returns:
        dict: {"question": ..., "answer": <full_rag_pipeline result dict>}.
    """
    # BUG FIX: the flag used to be forwarded directly as clean_all_citations,
    # which inverted its documented meaning (with_citations=True *removed*
    # the citations). Negate it so the query parameter matches its description.
    response = full_rag_pipeline(question, expanding_retriever, clean_all_citations=not with_citations)
    return {"question": question, "answer": response}
22
@app.get("/generate_title")
def generate_title(first_question: str = Query(..., description="The first question to generate a title from")):
    """Derive a very short (3-5 word) conversation title from the first question."""
    prompt = f"""Analyze this question and generate a very short title (3-5 words max):
    1. If it's medical/vaccine-related: Create a professional clinical title
    2. If non-medical: Create a general topic title
    3. If unclear or greeting: Use "General Inquiry"

    Always return just the title text, nothing else.

    Question: {first_question}

    Title:"""

    # Same Gemini model as the rest of the app (see prepare_env.py).
    title_llm = GoogleGenerativeAI(
        model="gemini-2.0-flash",
        google_api_key=os.getenv("GOOGLE_API_KEY"),
    )

    return {"title": title_llm.invoke(prompt).strip()}
43
+
44
if __name__ == "__main__":
    # Local development entry point; the Docker image starts uvicorn directly
    # (see the Dockerfile CMD) on the same host/port.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
chunks.json ADDED
The diff for this file is too large to render. See raw diff
 
prepare_env.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
import os
from dotenv import load_dotenv

# Load environment variables from .env file — done before the langchain
# imports below, presumably so import-time configuration can see them.
load_dotenv()
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_google_genai import GoogleGenerativeAI
15
def prepare_environment_and_retriever(
    chunks_path="./chunks.json",
    model_name="intfloat/multilingual-e5-base",
    collection_name="Guide_2023_e5_multilingual",
    persist_directory="chroma_db_multilingual",
    k_vector=6,
    k_sparse=2,
    weights=None,
    llm_model_name="gemini-2.0-flash"
):
    """Build the full retrieval stack used by the app.

    Loads pre-chunked documents from ``chunks_path``, embeds them into a
    persisted Chroma vector store, and combines dense (vector) and sparse
    (BM25) retrieval in an ensemble wrapped by a Gemini-driven
    MultiQueryRetriever for query expansion.

    Args:
        chunks_path (str): Path to the chunks.json file.
        model_name (str): HuggingFace embedding model name.
        collection_name (str): Chroma collection name.
        persist_directory (str): Directory for the persisted Chroma DB.
        k_vector (int): Number of documents the vector retriever returns.
        k_sparse (int): Number of documents the BM25 retriever returns.
        weights (list[float] | None): Ensemble weights [vector, sparse];
            defaults to [0.5, 0.5].
        llm_model_name (str): Gemini model used for multi-query expansion.

    Returns:
        MultiQueryRetriever: The fully assembled retriever.
    """
    # Avoid the mutable-default-argument pitfall (a shared list default).
    if weights is None:
        weights = [0.5, 0.5]

    # Load the chunks.json
    with open(chunks_path, "r", encoding="utf-8") as f:
        chunks_data = json.load(f)

    documents = []

    for element in chunks_data:
        metadata = {
            "source": element["filename"],
            "filetype": element["filetype"],
            "element_id": element["element_id"],
        }

        # Tables carry their HTML rendering so the LLM can rebuild them
        # as Markdown later.
        if element.get("type") == "TableElement":
            metadata["table_text_as_html"] = element["table_text_as_html"]

        documents.append(Document(page_content=element["text"], metadata=metadata))

    # Create the embedding function
    embedding_function = HuggingFaceEmbeddings(
        model_name=model_name
    )

    # Create and persist the vector store
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=embedding_function,
        collection_name=collection_name,
        persist_directory=persist_directory
    )
    # vectorstore.persist()
    print("✅ Stored with multilingual embeddings.")

    # Dense retriever over the embeddings
    retriever_multilingual = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k_vector}
    )

    # Sparse (lexical) retriever
    bm25_retriever = BM25Retriever.from_documents(documents)
    bm25_retriever.k = k_sparse

    # Ensemble retriever (combining vector + sparse search)
    ensemble_retriever = EnsembleRetriever(
        retrievers=[retriever_multilingual, bm25_retriever],
        weights=weights
    )

    # Language model for multi-query expansion
    # (GoogleGenerativeAI instead of ChatGoogleGenerativeAI)
    llm = GoogleGenerativeAI(
        model=llm_model_name,
        google_api_key=os.getenv("GOOGLE_API_KEY")
    )

    expanding_retriever = MultiQueryRetriever.from_llm(
        retriever=ensemble_retriever,
        llm=llm
    )

    print("✅ Retrieval system ready (vector + sparse + ensemble + multi-query).")

    return expanding_retriever  # Return the final retriever
rag_pipeline.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
import re
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.documents import Document
from langdetect import detect
import os
from dotenv import load_dotenv

# Load environment variables from .env file (GOOGLE_API_KEY is read by
# generate_rag_response at call time)
load_dotenv()
12
def generate_rag_response(query, retrieved_documents, model="gemini-2.0-flash"):
    """
    Perform Retrieval-Augmented Generation (RAG) using Google's Gemini.
    Args:
        query (str): The user's query.
        retrieved_documents (list of str): The documents retrieved from the retriever.
        model (str): The Gemini model to use.
    Returns:
        str: The generated response text.
    """
    # Each retrieved snippet already begins with its "[Source ID: ...]" tag
    # (built in retrieve_documents_and_prepare_inputs); join them into one
    # context block.
    information = "\n\n".join(retrieved_documents)

    # Instruction prompt: pins the answer language (en/ar/fr), the inline
    # [source-id] citation format, Markdown table/list rendering, a clinical
    # tone, and canned fallback answers for unanswerable/off-topic questions.
    prompt = f"""You are a helpful and knowledgeable AI-powered vaccine assistant designed to support doctors in clinical decision-making.
    You provide evidence-based guidance using only information from official vaccine medical documents.
    Answer the doctor's question accurately and concisely using only the provided information.

    IMPORTANT REQUIREMENTS:

    ### Language Settings
    1. DETECT THE LANGUAGE OF THE DOCTOR'S QUERY.
    2. YOU MUST RESPOND ONLY IN ONE OF THESE THREE LANGUAGES:
    - English (en): If the doctor's query is in English OR in any language not listed below
    - Arabic (ar): ONLY if the doctor's query is in Arabic
    - French (fr): ONLY if the doctor's query is in French
    3. DO NOT switch languages mid-response. Use ONLY ONE language throughout your entire answer.

    ### Citation and Sourcing
    1. For each fact in your response, include an inline citation in the format [Source ID] immediately following the information, e.g., [e795ebd28318886c0b1a5395ac30ad90].
    2. Do NOT use 'Source ID:' in the citation format; use only the source ID in square brackets.
    3. If a fact is supported by multiple sources, use the following format:
    - Use adjacent citations: [e795ebd28318886c0b1a5395ac30ad90][21a932b2340bb16707763f57f0ad2]
    4. Use ONLY the provided information and never include facts from your general knowledge.

    ### Content Formatting
    1. When rendering tables:
    - Convert HTML tables into clean Markdown format
    - Preserve all original headers and data rows exactly
    - Include the citation in the table caption, e.g., "Table: Vaccination Schedule [Source ID]"
    2. For lists, maintain the original bullet points/numbering and include citations.
    3. Present information concisely but ensure clinical accuracy is never compromised.

    ### Professional Tone
    1. Maintain a professional, clinical tone appropriate for physician communication.
    2. Prioritize clarity and precision in medical terminology.

    ### Response Handling
    1. If the question cannot be answered with the provided documents:
    - English: "I don't have sufficient information in the provided documents to answer this question completely. Please consult additional official vaccine resources or a specialist for guidance on this topic."
    - Arabic: "ليس لدي معلومات كافية في الوثائق المقدمة للإجابة على هذا السؤال بشكل كامل. يرجى استشارة مصادر لقاح رسمية إضافية أو متخصص للحصول على إرشادات حول هذا الموضوع."
    - French: "Je n'ai pas suffisamment d'informations dans les documents fournis pour répondre complètement à cette question. Veuillez consulter des ressources officielles sur les vaccins ou un spécialiste pour obtenir des conseils sur ce sujet."
    2. If the question is clearly unrelated to vaccines or medicine:
    - English: "I'm specialized in providing vaccine information for healthcare professionals. Could you please ask a question related to vaccines or immunization? I'd be happy to help with that."
    - Arabic: "أنا متخصص في تقديم معلومات اللقاحات للمهنيين الصحيين. هل يمكنك طرح سؤال يتعلق باللقاحات أو التطعيم؟ سأكون سعيدًا بمساعدتك في ذلك."
    - French: "Je suis spécialisé dans la fourniture d'informations sur les vaccins pour les professionnels de la santé. Pourriez-vous poser une question liée aux vaccins ou à l'immunisation ? Je serais heureux de vous aider avec ça."
    3. For simple greetings:
    - Respond with a simple formal greeting in the same language as the query.

    Question: {query}
    Information: {information}
    """

    # Initialize the LLM - using GoogleGenerativeAI instead of ChatGoogleGenerativeAI
    # NOTE(review): requires GOOGLE_API_KEY in the environment (loaded from
    # .env at module import); a missing key only fails here, at request time.
    llm = GoogleGenerativeAI(
        model=model,
        google_api_key=os.getenv("GOOGLE_API_KEY")
    )

    # Generate response using langchain
    response = llm.invoke(prompt)
    return response
82
+
83
+
84
def extract_source_ids(response_text):
    """
    Extract the unique source IDs cited inline in a generated response.

    Handles the citation formats the prompt allows:
    - Standard format: [Source ID]
    - Multiple sources in adjacent brackets: [Source ID1][Source ID2]
    - Multiple sources in one bracket: [Source ID1, Source ID2]

    Args:
        response_text (str): The generated response text with inline citations.

    Returns:
        list of str: Unique source IDs in order of first appearance.
    """
    # Normalize adjacent citations "[A] [B]" -> "[A][B]" -> "[A, B]" so a
    # single bracket pattern captures every group below.
    consolidated_text = re.sub(r'\]\s*\[', '][', response_text)
    consolidated_text = consolidated_text.replace('][', ', ')

    # Every bracketed group (single ID or comma-separated IDs).
    inline_citations = re.findall(r'\[([^\[\]]+)\]', consolidated_text)

    if not inline_citations:
        print("Warning: No source IDs found in the response text.")
        return []

    # Flatten comma-separated groups into individual IDs.
    all_ids = []
    for citation in inline_citations:
        all_ids.extend(part.strip() for part in citation.split(','))

    # Deduplicate while PRESERVING first-occurrence order: the previous
    # list(set(...)) made the ordering nondeterministic, which in turn made
    # the sequential citation numbers assigned downstream (in
    # format_response_with_sequential_citations) vary between runs.
    # Empty fragments (e.g. from "[a, ]") are dropped.
    source_ids = [sid for sid in dict.fromkeys(all_ids) if sid]

    if not source_ids:
        print("Warning: No valid source IDs found after filtering.")
        return []

    return source_ids
131
+
132
+
133
def format_response_with_sequential_citations(response_text, unique_ids, clean_all_citations=False):
    """
    Format the response text by either:
    - Replacing source IDs with sequential numbers (default)
    - Completely removing all citations (if clean_all_citations=True)

    Handles multiple citation formats:
    - Standard format: [Source ID]
    - Multiple sources in adjacent brackets: [Source ID1][Source ID2]
    - Multiple sources in one bracket: [Source ID1, Source ID2]

    Args:
        response_text (str): The generated response text with inline citations.
        unique_ids (list): Unique source IDs found in the response; their
            order defines the sequential numbering (first ID -> [1], ...).
        clean_all_citations (bool): If True, removes all citations completely.
            If False, formats them as numbers.

    Returns:
        str: The formatted response text.
    """
    # Nothing was cited: return the text untouched.
    if not unique_ids:
        return response_text

    if clean_all_citations:
        # Strip every bracketed citation entirely.
        cleaned = re.sub(r'\[[^\[\]]+?\]', '', response_text)
        # Collapse the doubled spaces left behind — but only within lines.
        # BUG FIX: the previous r'\s+' -> ' ' collapse also swallowed
        # newlines, flattening the Markdown tables and lists the prompt
        # explicitly asks the model to produce.
        cleaned = re.sub(r'[ \t]{2,}', ' ', cleaned)
        cleaned = re.sub(r'[ \t]+\n', '\n', cleaned)
        return cleaned.strip()

    # Map each source ID to its 1-based sequential number.
    id_to_number = {source_id: str(i + 1) for i, source_id in enumerate(unique_ids)}

    # Standardize adjacent citations "[A] [B]" -> "[A][B]" -> "[A, B]".
    formatted_response = re.sub(r'\]\s*\[', '][', response_text)
    formatted_response = formatted_response.replace('][', ', ')

    def replace_citation(match):
        """Rewrite one bracketed group as its sequential number(s)."""
        content = match.group(1)
        if ',' in content:
            numbers = [
                id_to_number[part.strip()]
                for part in content.split(',')
                if part.strip() in id_to_number
            ]
            if numbers:
                return f"[{', '.join(numbers)}]"
        elif content in id_to_number:
            return f"[{id_to_number[content]}]"
        # Unknown content (not a tracked ID): leave the bracket as-is.
        return match.group(0)

    formatted_response = re.sub(r'\[([^\[\]]+)\]', replace_citation, formatted_response)

    return formatted_response.strip()
194
+
195
def retrieve_documents_and_prepare_inputs(query, expanding_retriever, chunks_path="./chunks.json"):
    """
    Retrieve relevant documents and prepare them for the RAG generation.

    Args:
        query (str): The user's query.
        expanding_retriever: The retriever object (e.g., returned by prepare_environment_and_retriever).
        chunks_path (str): Path to the chunks.json file.

    Returns:
        tuple: (source_texts_for_rag, retrieved_elements_full)
    """
    # Get documents - query expansion happens automatically.
    # NOTE(review): get_relevant_documents() is deprecated in newer langchain
    # releases in favor of invoke() — confirm against the pinned version.
    retrieved_docs = expanding_retriever.get_relevant_documents(query)

    retrieved_chunk_ids = [doc.metadata["element_id"] for doc in retrieved_docs]

    # Re-load the full chunks file to recover the complete element payloads
    # (the retriever only returns text + metadata).
    with open(chunks_path, "r", encoding="utf-8") as f:
        chunks_data = json.load(f)

    source_retrieved_texts = []
    retrieved_elements_full = []

    for chu in chunks_data:
        if chu["element_id"] in retrieved_chunk_ids:
            if chu.get("type") == "TableElement":
                # NOTE(review): chu['elements']['element_id'] indexes
                # 'elements' with a string key, while the else-branch iterates
                # 'elements' as a list — the two branches assume different
                # chunks.json schemas. Verify against the actual file
                # (this may need to be chu['element_id']).
                text = (
                    f"[Source ID: {chu['elements']['element_id']}]\n"
                    f"CONTENT:\n{chu['text']}\n"
                    f"HTML:\n{chu['table_text_as_html']}\n\n"
                )
                source_retrieved_texts.append(text)
                # NOTE(review): table chunks are never appended to
                # retrieved_elements_full, so a cited table can never appear
                # in full_rag_pipeline's cited_elements output — confirm
                # whether this is intentional.
            else:
                for element in chu.get("elements", []):
                    text = (
                        f"[Source ID: {element['element_id']}]\n"
                        f"CONTENT:\n{element['text']}\n\n"
                    )
                    source_retrieved_texts.append(text)
                    retrieved_elements_full.append(element)

    return source_retrieved_texts, retrieved_elements_full
238
+
239
def full_rag_pipeline(query, expanding_retriever, chunks_path="./chunks.json", model="gemini-2.0-flash", clean_all_citations=False):
    """
    Full RAG pipeline from query to RAG response + extracted sources.

    Args:
        query (str): The user's query.
        expanding_retriever: The retriever object.
        chunks_path (str): Path to the chunks.json.
        model (str): Gemini model.
        clean_all_citations (bool): If True, strip the inline citations from
            the answer instead of renumbering them sequentially.

    Returns:
        dict: {
            "response": str,
            "cited_elements_json": str,
            "answer_language": str
        }
    """
    source_texts, retrieved_elements = retrieve_documents_and_prepare_inputs(query, expanding_retriever, chunks_path)

    # Step 1: generate the grounded answer.
    response_text = generate_rag_response(query, source_texts, model=model)

    # Step 2: extract cited source IDs BEFORE the citations are rewritten.
    unique_ids = extract_source_ids(response_text)

    # Step 2.1: renumber (or remove) the inline citations.
    response_text = format_response_with_sequential_citations(response_text, unique_ids, clean_all_citations=clean_all_citations)

    # Step 3: keep only the retrieved elements that were actually cited.
    cited_elements = [element for element in retrieved_elements if element["element_id"] in unique_ids]

    cited_elements_json = json.dumps(cited_elements, ensure_ascii=False, indent=2)

    # Language detection, constrained to the prompt's contract (en/ar/fr).
    # FIX: the old version used a bare `except:` (which hides real errors),
    # called detect(query) twice in its fallback (which could itself raise
    # and crash the endpoint), and could leak unsupported language codes
    # from the query-based fallback.
    answer_language = _detect_supported_language(response_text, query)

    return {
        "response": response_text,
        "cited_elements_json": cited_elements_json,
        "answer_language": answer_language
    }


def _detect_supported_language(response_text, query, supported=('en', 'ar', 'fr')):
    """Best-effort language detection limited to the supported set.

    Tries the answer first (its first few words, citations removed), then the
    query; returns 'en' when neither yields a supported language code or
    detection fails entirely.
    """
    for candidate in (response_text, query):
        try:
            snippet = " ".join(candidate.split()[:5])
            snippet = re.sub(r'\[.*?\]', '', snippet)  # drop citation brackets
            lang = detect(snippet)
        except Exception:
            continue  # langdetect raises on empty/non-linguistic input
        if lang in supported:
            return lang
    return 'en'
requirements.txt ADDED
Binary file (574 Bytes). View file