Zeggai Abdellah commited on
Commit
91dbc3c
·
1 Parent(s): ffaeec5

test simple version

Browse files
Files changed (1) hide show
  1. app.py +90 -246
app.py CHANGED
@@ -1,79 +1,26 @@
1
- from fastapi import FastAPI, HTTPException, BackgroundTasks, Query
2
- from fastapi.responses import JSONResponse
3
- from typing import List, Dict, Optional
4
  import json
 
 
5
  import time
6
  import uuid
 
7
  from datetime import datetime
 
 
 
8
  import os
9
- from pydantic import BaseModel
10
- import google.generativeai as genai
11
- from enum import Enum
12
- import asyncio
13
- from fastapi.middleware.cors import CORSMiddleware
14
 
15
- app = FastAPI(title="Vaccine Question Generator API")
16
-
17
- # Add CORS middleware
18
- app.add_middleware(
19
- CORSMiddleware,
20
- allow_origins=["*"], # Allow all origins
21
- allow_credentials=True,
22
- allow_methods=["*"], # Allow all methods
23
- allow_headers=["*"], # Allow all headers
24
- )
25
-
26
- # Global variables to track generation state
27
- generation_status = {
28
- "is_running": False,
29
- "total_chunks": 0,
30
- "processed_chunks": 0,
31
- "current_chunk_id": None,
32
- "start_time": None,
33
- "end_time": None,
34
- "errors": [],
35
- "result_file": None
36
- }
37
-
38
- # Chunks file path (will be configurable via API)
39
- CHUNKS_PATH = "Data/Processed_Data/chunks.json"
40
-
41
- # API Key (will be set via environment variable or API)
42
- GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
43
-
44
- # Model type options
45
- class ModelType(str, Enum):
46
- GEMINI_FLASH = "gemini-2.0-flash"
47
- GEMINI_PRO = "gemini-1.5-pro"
48
-
49
- # Request schema for starting generation
50
- class GenerationRequest(BaseModel):
51
- chunks_path: Optional[str] = None
52
- api_key: Optional[str] = None
53
- model: ModelType = ModelType.GEMINI_FLASH
54
- output_file: str = "vaccine_questions_dataset.json"
55
-
56
- # Response schema for status updates
57
- class GenerationStatus(BaseModel):
58
- is_running: bool
59
- total_chunks: int
60
- processed_chunks: int
61
- current_chunk_id: Optional[int]
62
- progress_percentage: float
63
- start_time: Optional[str]
64
- end_time: Optional[str]
65
- estimated_time_remaining: Optional[str]
66
- errors: List[str]
67
- result_file: Optional[str]
68
 
69
  def estimate_difficulty(question: str, q_type: str) -> str:
70
  """
71
  Estimate question difficulty based on type and content.
72
-
73
  Args:
74
  question (str): The question text.
75
  q_type (str): Question type (factual, conceptual, applied).
76
-
77
  Returns:
78
  str: Difficulty level (easy, medium, hard).
79
  """
@@ -83,25 +30,27 @@ def estimate_difficulty(question: str, q_type: str) -> str:
83
  return "medium"
84
  return "hard" # applied
85
 
86
- async def generate_questions_for_chunk(chunk: str, chunk_id: int, client, model: str) -> List[Dict]:
 
87
  """
88
  Generate French questions for a given document chunk using the Gemini API.
89
-
90
  Args:
91
  chunk (str): A chunk of text from the vaccine guide (in French).
92
  chunk_id (int): Chunk identifier.
 
93
  client: Gemini API client instance.
94
  model (str): Model name for Gemini API.
95
-
96
  Returns:
97
  List[Dict]: List of questions with metadata.
98
  """
99
  prompt = f"""
100
- À partir du texte suivant d'un guide sur les vaccins en français, générez 3 questions variées (factual, conceptual, applied) qui couvrent le contenu de manière exhaustive.
101
  Fournissez uniquement les questions, sans réponses, en français. Retournez le résultat au format JSON, entouré de ```json\n...\n```.
102
-
103
  Texte : {chunk}
104
-
105
  Exemple de sortie :
106
  ```json
107
  [
@@ -120,32 +69,43 @@ async def generate_questions_for_chunk(chunk: str, chunk_id: int, client, model:
120
  ]
121
  ```
122
  """
123
-
124
  try:
125
- # Update global state
126
- generation_status["current_chunk_id"] = chunk_id
127
-
128
- # Generate response using Gemini
129
- response = client.generate_content(
130
  model=model,
131
- contents=prompt,
132
  )
133
 
134
- # Parse the response
135
- questions_text = response.text if hasattr(response, 'text') else ""
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
  # Strip Markdown code fences
138
  if questions_text.startswith("```json\n") and questions_text.endswith("\n```"):
139
  questions_text = questions_text[7:-4].strip()
140
  elif questions_text.startswith("```") and questions_text.endswith("```"):
141
  questions_text = questions_text[3:-3].strip()
142
-
 
 
 
143
  # Parse JSON
144
  if not questions_text:
145
- error_msg = f"Erreur: Réponse vide pour le chunk {chunk_id}"
146
- generation_status["errors"].append(error_msg)
147
  return []
148
-
149
  questions = json.loads(questions_text)
150
 
151
  formatted_questions = []
@@ -162,189 +122,73 @@ async def generate_questions_for_chunk(chunk: str, chunk_id: int, client, model:
162
  "training_purpose": "Knowledge Recall" if q["type"] == "factual" else "Reasoning",
163
  "validated": False # Flag for expert review
164
  })
165
-
166
- # Update count of processed chunks
167
- generation_status["processed_chunks"] += 1
168
 
169
  return formatted_questions
170
-
171
  except Exception as e:
172
- error_msg = f"Error generating questions for chunk {chunk_id}: {str(e)}"
173
- generation_status["errors"].append(error_msg)
 
 
174
  return []
175
 
176
- async def generate_questions_for_document(chunks: List[str], model: str, output_file: str, client) -> Dict:
177
  """
178
  Generate questions for all document chunks and structure as a scientific dataset.
179
-
180
  Args:
181
  chunks (List[str]): List of document chunks.
182
- model (str): Model name for Gemini API.
183
- output_file (str): File to save the results.
184
- client: Gemini API client.
185
-
186
  Returns:
187
  Dict: Dataset with header and questions.
188
  """
189
  all_questions = []
190
-
191
- # Reset/initialize the global state
192
- generation_status["is_running"] = True
193
- generation_status["total_chunks"] = len(chunks)
194
- generation_status["processed_chunks"] = 0
195
- generation_status["start_time"] = datetime.utcnow().isoformat()
196
- generation_status["errors"] = []
197
- generation_status["current_chunk_id"] = None
198
- generation_status["end_time"] = None
199
- generation_status["result_file"] = None
200
 
201
- try:
202
- for i, chunk in enumerate(chunks):
203
- # Process each chunk
204
- questions = await generate_questions_for_chunk(chunk, i, client, model)
205
- all_questions.extend(questions)
206
-
207
- # Rate limiting
208
- await asyncio.sleep(9)
209
-
210
- # Create dataset with scientific structure
211
- dataset = {
212
- "dataset_info": {
213
- "title": "Vaccine Guide Question-Answer Dataset",
214
- "description": "A dataset of question-answer pairs generated from a vaccine guide for AI language model training.",
215
- "version": "1.1.0",
216
- "created_date": datetime.utcnow().isoformat(),
217
- "source": "Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.pdf",
218
- "generated_by": f"Gemini API ({model})",
219
- "total_questions": len(all_questions),
220
- "intended_use": "Fine-tuning medical language models for knowledge recall and reasoning"
221
- },
222
- "questions": all_questions
223
- }
224
-
225
- # Save the dataset
226
- with open(output_file, 'w', encoding='utf-8') as f:
227
- json.dump(dataset, f, indent=4, ensure_ascii=False)
228
-
229
- # Update final state
230
- generation_status["end_time"] = datetime.utcnow().isoformat()
231
- generation_status["result_file"] = output_file
232
-
233
- return dataset
234
- except Exception as e:
235
- generation_status["errors"].append(f"Error in document generation: {str(e)}")
236
- raise e
237
- finally:
238
- generation_status["is_running"] = False
239
-
240
- async def background_generation_task(chunks_path: str, model: str, output_file: str, api_key: str = None):
241
- """Background task for generating questions"""
242
- try:
243
- # Configure the client
244
- if api_key:
245
- genai.configure(api_key=api_key)
246
- elif GOOGLE_API_KEY:
247
- genai.configure(api_key=GOOGLE_API_KEY)
248
- else:
249
- raise ValueError("No API key provided for Gemini")
250
-
251
- # Load chunks
252
- with open(chunks_path, "r", encoding="utf-8") as f:
253
- chunks_data = json.load(f)
254
-
255
- # Extract texts from chunks
256
- chunks = [chunk["text"] for chunk in chunks_data]
257
-
258
- # Start generation process
259
- await generate_questions_for_document(chunks, model, output_file, genai)
260
- except Exception as e:
261
- generation_status["errors"].append(f"Background task error: {str(e)}")
262
- generation_status["is_running"] = False
263
-
264
- @app.post("/generate", response_model=GenerationStatus)
265
- async def start_generation(request: GenerationRequest, background_tasks: BackgroundTasks):
266
- """Start the question generation process"""
267
- # Check if generation is already running
268
- if generation_status["is_running"]:
269
- raise HTTPException(status_code=400, detail="Generation process is already running")
270
-
271
- # Set up paths and configurations
272
- chunks_path = request.chunks_path or CHUNKS_PATH
273
- api_key = request.api_key or GOOGLE_API_KEY
274
- model = request.model
275
- output_file = request.output_file
276
-
277
- # Validate that chunks file exists
278
- if not os.path.exists(chunks_path):
279
- raise HTTPException(status_code=404, detail=f"Chunks file not found at {chunks_path}")
280
 
281
- # Validate API key is available
282
- if not api_key:
283
- raise HTTPException(status_code=400, detail="No API key provided")
284
-
285
- # Start background generation task
286
- background_tasks.add_task(
287
- background_generation_task,
288
- chunks_path,
289
- model,
290
- output_file,
291
- api_key
292
- )
 
 
293
 
294
- # Return initial status
295
- return get_generation_status()
296
 
297
- @app.get("/status", response_model=GenerationStatus)
298
- async def get_generation_status():
299
- """Get the current status of the question generation process"""
300
- # Calculate progress percentage
301
- total = generation_status["total_chunks"]
302
- processed = generation_status["processed_chunks"]
303
-
304
- progress_percentage = (processed / total * 100) if total > 0 else 0
305
-
306
- # Calculate estimated time remaining
307
- etr = None
308
- if (generation_status["is_running"] and
309
- generation_status["start_time"] and
310
- processed > 0):
311
-
312
- start_time = datetime.fromisoformat(generation_status["start_time"])
313
- time_elapsed = (datetime.utcnow() - start_time).total_seconds()
314
- time_per_chunk = time_elapsed / processed
315
- remaining_chunks = total - processed
316
-
317
- etr_seconds = time_per_chunk * remaining_chunks
318
- etr = f"{int(etr_seconds // 60)}m {int(etr_seconds % 60)}s"
319
 
320
- # Return formatted status
321
- return GenerationStatus(
322
- is_running=generation_status["is_running"],
323
- total_chunks=total,
324
- processed_chunks=processed,
325
- current_chunk_id=generation_status["current_chunk_id"],
326
- progress_percentage=round(progress_percentage, 2),
327
- start_time=generation_status["start_time"],
328
- end_time=generation_status["end_time"],
329
- estimated_time_remaining=etr,
330
- errors=generation_status["errors"],
331
- result_file=generation_status["result_file"]
332
- )
333
-
334
- @app.get("/")
335
- async def root():
336
- """Root endpoint with API information"""
337
- return {
338
- "name": "Vaccine Question Generator API",
339
- "description": "API for generating question-answer pairs from vaccine guide chunks",
340
- "endpoints": [
341
- {"path": "/", "method": "GET", "description": "This information page"},
342
- {"path": "/generate", "method": "POST", "description": "Start question generation process"},
343
- {"path": "/status", "method": "GET", "description": "Get current generation status"}
344
- ]
345
- }
346
-
347
 
348
  if __name__ == "__main__":
349
  import uvicorn
 
 
 
 
 
 
 
 
 
350
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ from fastapi import FastAPI
 
 
2
  import json
3
+ from dotenv import load_dotenv
4
+ import requests
5
  import time
6
  import uuid
7
+ from typing import List, Dict
8
  from datetime import datetime
9
+ # Load environment variables from .env file
10
+ load_dotenv()
11
+ from langchain_google_genai import GoogleGenerativeAI
12
  import os
 
 
 
 
 
13
 
14
+ app = FastAPI()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  def estimate_difficulty(question: str, q_type: str) -> str:
17
  """
18
  Estimate question difficulty based on type and content.
19
+
20
  Args:
21
  question (str): The question text.
22
  q_type (str): Question type (factual, conceptual, applied).
23
+
24
  Returns:
25
  str: Difficulty level (easy, medium, hard).
26
  """
 
30
  return "medium"
31
  return "hard" # applied
32
 
33
+
34
+ def generate_questions_for_chunk(chunk: str, chunk_id: int, model="gemini-2.0-flash") -> List[Dict]:
35
  """
36
  Generate French questions for a given document chunk using the Gemini API.
37
+
38
  Args:
39
  chunk (str): A chunk of text from the vaccine guide (in French).
40
  chunk_id (int): Chunk identifier.
41
+ api_key (str): Gemini API key.
42
  client: Gemini API client instance.
43
  model (str): Model name for Gemini API.
44
+
45
  Returns:
46
  List[Dict]: List of questions with metadata.
47
  """
48
  prompt = f"""
49
+ À partir du texte suivant d'un guide sur les vaccins en français, générez 3 questions variées (factual, conceptual, applied) qui couvrent le contenu de manière exhaustive.
50
  Fournissez uniquement les questions, sans réponses, en français. Retournez le résultat au format JSON, entouré de ```json\n...\n```.
51
+
52
  Texte : {chunk}
53
+
54
  Exemple de sortie :
55
  ```json
56
  [
 
69
  ]
70
  ```
71
  """
72
+
73
  try:
74
+ # Initialize the LLM - using GoogleGenerativeAI instead of ChatGoogleGenerativeAI
75
+ llm = GoogleGenerativeAI(
 
 
 
76
  model=model,
77
+ google_api_key=os.getenv("GOOGLE_API_KEY")
78
  )
79
 
80
+ # Generate response using langchain
81
+ response = llm.invoke(prompt)
82
+
83
+
84
+ # Debug: Print raw response to inspect structure
85
+ print(f"Raw response for chunk {chunk_id}: {response}")
86
+
87
+ # Parse the response (adjust based on actual Gemini API response structure)
88
+ questions_text = ""
89
+ if hasattr(response, 'candidates') and response.candidates:
90
+ questions_text = response.candidates[0].content.parts[0].text if response.candidates[0].content.parts else ""
91
+
92
+ # Debug: Print extracted text
93
+ print(f"Extracted questions_text for chunk {chunk_id}: {questions_text}")
94
 
95
  # Strip Markdown code fences
96
  if questions_text.startswith("```json\n") and questions_text.endswith("\n```"):
97
  questions_text = questions_text[7:-4].strip()
98
  elif questions_text.startswith("```") and questions_text.endswith("```"):
99
  questions_text = questions_text[3:-3].strip()
100
+
101
+ # Debug: Print cleaned text
102
+ print(f"Cleaned questions_text for chunk {chunk_id}: {questions_text}")
103
+
104
  # Parse JSON
105
  if not questions_text:
106
+ print(f"Erreur: Réponse vide pour le chunk {chunk_id}")
 
107
  return []
108
+
109
  questions = json.loads(questions_text)
110
 
111
  formatted_questions = []
 
122
  "training_purpose": "Knowledge Recall" if q["type"] == "factual" else "Reasoning",
123
  "validated": False # Flag for expert review
124
  })
 
 
 
125
 
126
  return formatted_questions
127
+
128
  except Exception as e:
129
+ print(f"Erreur lors de la génération des questions pour le chunk {chunk_id}: {e}")
130
+ return []
131
+ except json.JSONDecodeError as e:
132
+ print(f"Erreur de parsing de la réponse API pour le chunk {chunk_id}: {e}")
133
  return []
134
 
135
def generate_questions_for_document(chunks: List[str]) -> Dict:
    """
    Generate questions for all document chunks and structure them as a dataset.

    Iterates over the chunks sequentially, delegating per-chunk question
    generation to ``generate_questions_for_chunk``, and wraps the results in a
    dataset header describing provenance and intended use.

    Args:
        chunks (List[str]): List of document chunk texts (French vaccine guide).

    Returns:
        Dict: Dataset with a "dataset_info" header and a "questions" list.
    """
    all_questions = []

    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}...")
        questions = generate_questions_for_chunk(chunk, i)
        all_questions.extend(questions)
        # Rate limiting between consecutive API calls; there is no need to
        # wait after the final chunk has been processed.
        if i < len(chunks) - 1:
            time.sleep(9)

    # Create dataset with scientific structure
    dataset = {
        "dataset_info": {
            "title": "Vaccine Guide Question-Answer Dataset",
            "description": "A dataset of question-answer pairs generated from a vaccine guide for AI language model training.",
            "version": "1.1.0",
            "created_date": datetime.utcnow().isoformat(),
            "source": "Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.pdf",
            "generated_by": "Gemini API",
            "total_questions": len(all_questions),
            "intended_use": "Fine-tuning medical language models for knowledge recall and reasoning"
        },
        "questions": all_questions
    }

    return dataset
 
170
 
171
def save_dataset(dataset: Dict, output_file: str):
    """
    Persist a dataset to disk as pretty-printed JSON.

    Args:
        dataset (Dict): The dataset to write out.
        output_file (str): Destination path for the JSON file.
    """
    # Serialize first, then write in one shot; ensure_ascii=False keeps the
    # French accented characters readable in the output file.
    serialized = json.dumps(dataset, indent=4, ensure_ascii=False)
    with open(output_file, 'w', encoding='utf-8') as handle:
        handle.write(serialized)
    print(f"Dataset saved to {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
if __name__ == "__main__":
    import uvicorn

    # Load the pre-processed document chunks from the JSON file.
    with open("Data/Processed_Data/chunks.json", "r", encoding="utf-8") as f:
        chunks_data = json.load(f)

    # Only the FIRST chunk is used here — presumably a simplified smoke-test
    # run of the pipeline; extend this list to process the whole document.
    # TODO confirm this is intentional before relying on the output dataset.
    VACCINE_CHUNKS=[chunks_data[0]["text"]]
    # Generation runs synchronously BEFORE the server starts: the HTTP port
    # below only opens after the dataset has been fully written to disk.
    dataset = generate_questions_for_document(VACCINE_CHUNKS)
    save_dataset(dataset, "vaccine_questions.json")
    # Run the FastAPI app
    uvicorn.run(app, host="0.0.0.0", port=7860)