Zeggai Abdellah committed on
Commit
ffaeec5
·
1 Parent(s): 30bafd5

first commit

Browse files
Files changed (5) hide show
  1. .gitignore +1 -0
  2. Data/Processed_Data/chunks.json +0 -0
  3. Dockerfile +34 -0
  4. app.py +350 -0
  5. requirements.txt +0 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
Data/Processed_Data/chunks.json ADDED
The diff for this file is too large to render. See raw diff
 
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use a Python 3.9 base image
FROM python:3.9-slim

# Set working directory
WORKDIR /code

# Copy requirements first so the dependency layer is cached across rebuilds
COPY ./requirements.txt /code/requirements.txt

# Install dependencies
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Create a non-root user for security
RUN useradd -m -u 1000 user

# Set up directories and permissions (app writes the dataset under /code)
RUN mkdir -p /code/Data/Processed_Data && chown -R user:user /code

USER user

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Copy all project files with correct ownership
# (redundant second "WORKDIR /code" removed — it is already in effect)
COPY --chown=user . /code

# Expose port 7860 (Hugging Face default)
EXPOSE 7860

# Run the FastAPI app with uvicorn
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException, BackgroundTasks, Query
2
+ from fastapi.responses import JSONResponse
3
+ from typing import List, Dict, Optional
4
+ import json
5
+ import time
6
+ import uuid
7
+ from datetime import datetime
8
+ import os
9
+ from pydantic import BaseModel
10
+ import google.generativeai as genai
11
+ from enum import Enum
12
+ import asyncio
13
+ from fastapi.middleware.cors import CORSMiddleware
14
+
15
# FastAPI application instance; served by uvicorn on port 7860 (see Dockerfile CMD).
app = FastAPI(title="Vaccine Question Generator API")

# Add CORS middleware so browser-based clients on any origin can call the API.
# NOTE(review): per the CORS spec, browsers reject allow_origins=["*"] when
# allow_credentials=True and credentials are actually sent — confirm whether
# credentialed requests are needed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)
25
+
26
# Global, in-process state for the current generation run.
# Mutated by the background task and read by the /status endpoint; a single
# run at a time is enforced by /generate checking "is_running".
generation_status = {
    "is_running": False,       # True while a generation run is in progress
    "total_chunks": 0,         # number of chunks loaded for the current run
    "processed_chunks": 0,     # chunks finished so far
    "current_chunk_id": None,  # index of the chunk currently being processed
    "start_time": None,        # ISO-8601 UTC timestamp when the run started
    "end_time": None,          # ISO-8601 UTC timestamp when the run finished
    "errors": [],              # accumulated human-readable error messages
    "result_file": None        # path of the written dataset once complete
}

# Default chunks file path; a request may override it via chunks_path.
CHUNKS_PATH = "Data/Processed_Data/chunks.json"

# Gemini API key from the environment; a request may override it via api_key.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
43
+
44
# Model type options
class ModelType(str, Enum):
    """Gemini model identifiers accepted by the /generate endpoint."""
    GEMINI_FLASH = "gemini-2.0-flash"
    GEMINI_PRO = "gemini-1.5-pro"
48
+
49
# Request schema for starting generation
class GenerationRequest(BaseModel):
    """Payload for POST /generate."""
    chunks_path: Optional[str] = None  # overrides CHUNKS_PATH when provided
    api_key: Optional[str] = None      # overrides the GOOGLE_API_KEY env value
    model: ModelType = ModelType.GEMINI_FLASH  # Gemini model to query
    output_file: str = "vaccine_questions_dataset.json"  # dataset destination
55
+
56
# Response schema for status updates
class GenerationStatus(BaseModel):
    """Snapshot of the generation run, returned by /generate and /status."""
    is_running: bool                 # whether a run is currently active
    total_chunks: int                # chunks in the loaded dataset
    processed_chunks: int            # chunks completed so far
    current_chunk_id: Optional[int]  # chunk currently in flight, if any
    progress_percentage: float       # processed/total as a percentage
    start_time: Optional[str]        # ISO-8601 UTC start timestamp
    end_time: Optional[str]          # ISO-8601 UTC end timestamp
    estimated_time_remaining: Optional[str]  # e.g. "3m 12s"; None when unknown
    errors: List[str]                # accumulated error messages
    result_file: Optional[str]       # output path once the run has finished
68
+
69
def estimate_difficulty(question: str, q_type: str) -> str:
    """
    Map a question type to a difficulty level.

    Args:
        question (str): The question text (not used by the current heuristic).
        q_type (str): Question type (factual, conceptual, applied).

    Returns:
        str: Difficulty level (easy, medium, hard).
    """
    # "applied" — and any unrecognized type — falls through to "hard".
    return {"factual": "easy", "conceptual": "medium"}.get(q_type, "hard")
85
+
86
async def generate_questions_for_chunk(chunk: str, chunk_id: int, client, model: str) -> List[Dict]:
    """
    Generate French questions for a given document chunk using the Gemini API.

    Args:
        chunk (str): A chunk of text from the vaccine guide (in French).
        chunk_id (int): Chunk identifier.
        client: The ``google.generativeai`` module (already configured with an API key).
        model (str): Model name for the Gemini API.

    Returns:
        List[Dict]: List of questions with metadata. On any failure the error
        is appended to ``generation_status["errors"]`` and an empty list is
        returned, so one bad chunk does not abort the whole run.
    """
    prompt = f"""
À partir du texte suivant d'un guide sur les vaccins en français, générez 3 questions variées (factual, conceptual, applied) qui couvrent le contenu de manière exhaustive.
Fournissez uniquement les questions, sans réponses, en français. Retournez le résultat au format JSON, entouré de ```json\n...\n```.

Texte : {chunk}

Exemple de sortie :
```json
[
{{
"question": "Combien de structures sanitaires de proximité sont impliquées dans le suivi de la vaccination ?",
"type": "factual"
}},
{{
"question": "Quel est l'impact de la réglementation de la vaccination sur la couverture vaccinale ?",
"type": "conceptual"
}},
{{
"question": "Quelles seraient les conséquences si les établissements privés ne suivaient plus la réglementation vaccinale ?",
"type": "applied"
}}
]
```
"""

    try:
        # Expose which chunk is in flight so /status can report it.
        generation_status["current_chunk_id"] = chunk_id

        # BUG FIX: the google.generativeai module has no top-level
        # generate_content() function; a GenerativeModel must be instantiated
        # for the requested model name and queried with the prompt. The old
        # call raised AttributeError on every chunk, which the broad except
        # below silently recorded, yielding an empty dataset.
        gemini_model = client.GenerativeModel(model)
        response = gemini_model.generate_content(prompt)

        # Parse the response
        questions_text = response.text if hasattr(response, 'text') else ""

        # Strip the Markdown code fences the prompt asks for.
        if questions_text.startswith("```json\n") and questions_text.endswith("\n```"):
            questions_text = questions_text[7:-4].strip()
        elif questions_text.startswith("```") and questions_text.endswith("```"):
            questions_text = questions_text[3:-3].strip()

        # An empty reply cannot be parsed as JSON; record and skip this chunk.
        if not questions_text:
            error_msg = f"Erreur: Réponse vide pour le chunk {chunk_id}"
            generation_status["errors"].append(error_msg)
            return []

        questions = json.loads(questions_text)

        formatted_questions = []
        for q in questions:
            question_id = str(uuid.uuid4())
            difficulty = estimate_difficulty(q["question"], q["type"])
            formatted_questions.append({
                "question_id": question_id,
                "chunk_id": chunk_id,
                "chunk_text": chunk,
                "question": q["question"],
                "type": q["type"],
                "difficulty": difficulty,
                "training_purpose": "Knowledge Recall" if q["type"] == "factual" else "Reasoning",
                "validated": False  # Flag for expert review
            })

        # Update count of processed chunks
        generation_status["processed_chunks"] += 1

        return formatted_questions

    except Exception as e:
        error_msg = f"Error generating questions for chunk {chunk_id}: {str(e)}"
        generation_status["errors"].append(error_msg)
        return []
175
+
176
async def generate_questions_for_document(chunks: List[str], model: str, output_file: str, client) -> Dict:
    """
    Generate questions for all document chunks and structure them as a dataset.

    Args:
        chunks (List[str]): List of document chunks.
        model (str): Model name for the Gemini API.
        output_file (str): File to save the results.
        client: The ``google.generativeai`` module (configured beforehand).

    Returns:
        Dict: Dataset with a ``dataset_info`` header and a ``questions`` list.

    Raises:
        Exception: Re-raised after being recorded in ``generation_status["errors"]``.
    """
    all_questions = []

    # Reset the shared status so /status reflects only this run.
    generation_status["is_running"] = True
    generation_status["total_chunks"] = len(chunks)
    generation_status["processed_chunks"] = 0
    generation_status["start_time"] = datetime.utcnow().isoformat()
    generation_status["errors"] = []
    generation_status["current_chunk_id"] = None
    generation_status["end_time"] = None
    generation_status["result_file"] = None

    try:
        last_index = len(chunks) - 1
        for i, chunk in enumerate(chunks):
            # Process each chunk sequentially (per-chunk errors yield []).
            questions = await generate_questions_for_chunk(chunk, i, client, model)
            all_questions.extend(questions)

            # Rate limiting between API calls; no need to wait after the
            # final chunk (the old code always slept, adding a pointless 9 s).
            if i < last_index:
                await asyncio.sleep(9)

        # Create dataset with scientific structure
        dataset = {
            "dataset_info": {
                "title": "Vaccine Guide Question-Answer Dataset",
                "description": "A dataset of question-answer pairs generated from a vaccine guide for AI language model training.",
                "version": "1.1.0",
                "created_date": datetime.utcnow().isoformat(),
                "source": "Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.pdf",
                "generated_by": f"Gemini API ({model})",
                "total_questions": len(all_questions),
                "intended_use": "Fine-tuning medical language models for knowledge recall and reasoning"
            },
            "questions": all_questions
        }

        # Save the dataset
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(dataset, f, indent=4, ensure_ascii=False)

        # Update final state
        generation_status["end_time"] = datetime.utcnow().isoformat()
        generation_status["result_file"] = output_file

        return dataset
    except Exception as e:
        generation_status["errors"].append(f"Error in document generation: {str(e)}")
        raise  # bare raise preserves the original traceback ("raise e" did not)
    finally:
        generation_status["is_running"] = False
239
+
240
async def background_generation_task(chunks_path: str, model: str, output_file: str, api_key: str = None):
    """
    Background task: configure Gemini, load the chunks file, and run the
    full generation pipeline.

    Args:
        chunks_path (str): Path to the JSON file containing chunk objects.
        model (str): Gemini model name.
        output_file (str): Destination file for the generated dataset.
        api_key (str, optional): Per-request API key; falls back to the
            module-level GOOGLE_API_KEY environment value.

    Errors are recorded in ``generation_status["errors"]`` instead of being
    raised, since no caller awaits this task.
    """
    try:
        # Configure the client (the request-supplied key takes precedence).
        if api_key:
            genai.configure(api_key=api_key)
        elif GOOGLE_API_KEY:
            genai.configure(api_key=GOOGLE_API_KEY)
        else:
            raise ValueError("No API key provided for Gemini")

        # Load chunks
        with open(chunks_path, "r", encoding="utf-8") as f:
            chunks_data = json.load(f)

        # Extract texts from chunks
        # NOTE(review): assumes every chunk object has a "text" key — confirm
        # against the structure of Data/Processed_Data/chunks.json.
        chunks = [chunk["text"] for chunk in chunks_data]

        # Start generation process (passes the genai module as the client).
        await generate_questions_for_document(chunks, model, output_file, genai)
    except Exception as e:
        generation_status["errors"].append(f"Background task error: {str(e)}")
        generation_status["is_running"] = False
263
+
264
@app.post("/generate", response_model=GenerationStatus)
async def start_generation(request: GenerationRequest, background_tasks: BackgroundTasks):
    """
    Start the question generation process in a background task.

    Raises:
        HTTPException: 400 if a run is already active or no API key is
            available; 404 if the chunks file does not exist.

    Returns:
        GenerationStatus: Status snapshot taken right after scheduling.
        The background task may not have started yet, so is_running can
        still be False in this response.
    """
    # Only one run at a time: all progress lives in one global status dict.
    if generation_status["is_running"]:
        raise HTTPException(status_code=400, detail="Generation process is already running")

    # Resolve configuration, preferring request values over module defaults.
    chunks_path = request.chunks_path or CHUNKS_PATH
    api_key = request.api_key or GOOGLE_API_KEY
    model = request.model
    output_file = request.output_file

    # Validate that chunks file exists
    if not os.path.exists(chunks_path):
        raise HTTPException(status_code=404, detail=f"Chunks file not found at {chunks_path}")

    # Validate API key is available
    if not api_key:
        raise HTTPException(status_code=400, detail="No API key provided")

    # Schedule the generation to run after this response is sent.
    background_tasks.add_task(
        background_generation_task,
        chunks_path,
        model,
        output_file,
        api_key
    )

    # BUG FIX: get_generation_status is a coroutine function; without await
    # this returned a coroutine object instead of a GenerationStatus, which
    # fails response_model serialization.
    return await get_generation_status()
296
+
297
@app.get("/status", response_model=GenerationStatus)
async def get_generation_status():
    """Return a snapshot of the question-generation progress."""
    total = generation_status["total_chunks"]
    processed = generation_status["processed_chunks"]

    # Fraction of chunks completed, as a percentage (0 before any run).
    pct = 0 if total <= 0 else processed / total * 100

    # Estimate remaining time from the average time spent per chunk so far;
    # only meaningful while a run is active and at least one chunk is done.
    etr = None
    started = generation_status["start_time"]
    if generation_status["is_running"] and started and processed > 0:
        elapsed = (datetime.utcnow() - datetime.fromisoformat(started)).total_seconds()
        seconds_left = elapsed / processed * (total - processed)
        minutes, seconds = divmod(int(seconds_left), 60)
        etr = f"{minutes}m {seconds}s"

    # Assemble the typed response from the shared status dict.
    return GenerationStatus(
        is_running=generation_status["is_running"],
        total_chunks=total,
        processed_chunks=processed,
        current_chunk_id=generation_status["current_chunk_id"],
        progress_percentage=round(pct, 2),
        start_time=started,
        end_time=generation_status["end_time"],
        estimated_time_remaining=etr,
        errors=generation_status["errors"],
        result_file=generation_status["result_file"]
    )
333
+
334
@app.get("/")
async def root():
    """Root endpoint with API information"""
    # Catalogue of the available routes, surfaced for discoverability.
    endpoints = [
        {"path": "/", "method": "GET", "description": "This information page"},
        {"path": "/generate", "method": "POST", "description": "Start question generation process"},
        {"path": "/status", "method": "GET", "description": "Get current generation status"},
    ]
    return {
        "name": "Vaccine Question Generator API",
        "description": "API for generating question-answer pairs from vaccine guide chunks",
        "endpoints": endpoints,
    }
346
+
347
+
348
if __name__ == "__main__":
    # Local development entry point; in the Docker image, uvicorn is started
    # by the CMD instruction instead, on the same host/port.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
Binary file (208 Bytes). View file