File size: 8,569 Bytes
db65992
 
 
1189628
 
db65992
1189628
db65992
1189628
db65992
 
1189628
db65992
 
1189628
 
db65992
 
1189628
 
db65992
 
1189628
 
 
db65992
 
 
 
 
1189628
db65992
1189628
db65992
1189628
 
 
 
 
 
 
db65992
 
1189628
 
 
 
 
 
 
 
 
db65992
1189628
db65992
 
 
 
 
 
 
1189628
 
 
 
 
 
 
 
 
 
 
 
db65992
1189628
 
 
db65992
1189628
db65992
1189628
db65992
1189628
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db65992
1189628
db65992
1189628
 
 
 
 
db65992
 
 
1189628
db65992
1189628
db65992
1189628
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db65992
1189628
 
 
db65992
 
1189628
 
 
db65992
 
1189628
 
db65992
1189628
db65992
1189628
 
 
db65992
 
 
 
 
1189628
db65992
 
 
 
 
 
 
 
 
1189628
db65992
 
 
1189628
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db65992
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse
import json
from dotenv import load_dotenv
import time
import uuid
from typing import List, Dict, Optional
from datetime import datetime
import os
import threading

# Load environment variables from .env file
load_dotenv()

from langchain_google_genai import GoogleGenerativeAI

# FastAPI application instance; the route decorators below register on it.
app = FastAPI()

# Shared progress record for the answer-generation job. It is read and
# updated by the request handlers and by the background worker thread.
answering_status = dict(
    is_running=False,
    start_time=None,
    processed_questions=0,
    total_questions=0,
    answers_generated=0,
    completed=False,
    result_file=None,
    error=None,
)

# Lock taken around reads and writes of answering_status.
answering_lock = threading.Lock()

def generate_answer_for_question(question_data: Dict, model: str = "gemini-2.0-flash") -> Dict:
    """
    Generate an answer for a given question using the Gemini API.

    The answer is stored in place on ``question_data`` under the ``answer``
    key and the same dict object is returned. On success the global
    ``answers_generated`` counter is incremented. If the API call fails, the
    error is printed and a fixed error message is stored as the answer
    instead of raising.

    Args:
        question_data (Dict): The question record. Must contain ``question``,
            ``chunk_text`` and ``type``. May contain ``with_citations``
            (truthy to ask for citations; treated as False when absent) and
            ``question_id`` (only used in the error log line).
        model (str): Name of the Gemini model passed to GoogleGenerativeAI.

    Returns:
        Dict: The same ``question_data`` dict with an added ``answer`` field.

    Raises:
        KeyError: If ``question``, ``chunk_text`` or ``type`` is missing.
    """
    # The prompt instructs the model to react to "with_citations=true", so the
    # actual value of the flag has to be part of the prompt text.
    citations_flag = "true" if question_data.get("with_citations", False) else "false"

    prompt = f"""
    À partir du texte suivant et de la question donnée, fournissez une réponse précise et concise en français. 
    Si with_citations=true, incluez des références spécifiques au texte dans la réponse.
    Retournez uniquement la réponse au format texte.

    with_citations={citations_flag}

    Texte : {question_data["chunk_text"]}
    
    Question : {question_data["question"]}
    
    Type : {question_data["type"]}
    """

    try:
        llm = GoogleGenerativeAI(
            model=model,
            google_api_key=os.getenv("GOOGLE_API_KEY")
        )

        response = llm.invoke(prompt)

        answer_text = str(response).strip()

        # Update the question data with the answer
        question_data["answer"] = answer_text

        # Update the global status
        with answering_lock:
            answering_status["answers_generated"] += 1

        return question_data

    except Exception as e:
        # Use .get() here: a record without an id must not raise a KeyError
        # from inside the handler and hide the original failure.
        question_id = question_data.get("question_id", "<unknown>")
        print(f"Erreur lors de la génération de la réponse pour la question {question_id}: {e}")
        question_data["answer"] = "Erreur lors de la génération de la réponse."
        return question_data

def answer_questions_in_background(questions: List[Dict], delay_seconds: float = 9):
    """
    Generate answers for every question, save them to a JSON file and keep
    the shared status record up to date.

    Intended to run in a worker thread. Questions are answered one at a time
    with a pause between consecutive calls. When all questions are done the
    dataset is written to ``vaccine_answers_<unix time>.json`` in the working
    directory and the status is marked completed. Any exception is printed,
    stored in the status ``error`` field and ends the run; it is not
    re-raised.

    Args:
        questions (List[Dict]): Question records passed one by one to
            ``generate_answer_for_question``.
        delay_seconds (float): Pause between two consecutive questions, used
            for rate limiting. Defaults to 9 seconds. No pause is made after
            the last question.
    """
    try:
        all_answered_questions = []
        total = len(questions)

        with answering_lock:
            answering_status["total_questions"] = total
            answering_status["processed_questions"] = 0
            answering_status["answers_generated"] = 0

        for position, question in enumerate(questions, start=1):
            print(f"Processing question {position}/{total}...")
            answered_question = generate_answer_for_question(question)
            all_answered_questions.append(answered_question)

            with answering_lock:
                answering_status["processed_questions"] = position

            # Rate limiting: only needed between two API calls, so skip the
            # pause once the last question has been answered.
            if position < total and delay_seconds > 0:
                time.sleep(delay_seconds)

        dataset = {
            "dataset_info": {
                "title": "Vaccine Guide Question-Answer Dataset with Answers",
                "description": "A dataset of question-answer pairs with answers generated from a vaccine guide for AI language model training.",
                "version": "1.0.0",
                "created_date": datetime.utcnow().isoformat(),
                "source": "Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.pdf",
                "generated_by": "Gemini API",
                "total_questions": len(all_answered_questions),
                "intended_use": "Fine-tuning medical language models for knowledge recall and reasoning"
            },
            "questions": all_answered_questions
        }

        # Save the dataset
        filename = f"vaccine_answers_{int(time.time())}.json"
        with open(f"./{filename}", 'w', encoding='utf-8') as f:
            json.dump(dataset, f, indent=4, ensure_ascii=False)

        # Update status to completed
        with answering_lock:
            answering_status["completed"] = True
            answering_status["is_running"] = False
            answering_status["result_file"] = filename

    except Exception as e:
        print(f"Error in background answering: {e}")
        with answering_lock:
            answering_status["error"] = str(e)
            answering_status["is_running"] = False

@app.get("/generate-answers")
async def generate_answers():
    """
    Endpoint to generate answers for questions from the dataset.

    Loads ``./vaccine_questions.json`` and starts a daemon thread that
    answers every question in it. If a run is already in progress, nothing
    is started and the current status is returned instead.

    Returns:
        dict: ``status`` ("started" or "running"), a ``message`` and a
        snapshot of the status record under ``current_status``.

    Raises:
        HTTPException: 404 when the questions file is missing, is not valid
            JSON or has no ``questions`` key; 500 for any other failure.
    """
    # Check and claim the "running" flag in a single critical section so two
    # requests cannot both see is_running == False and start two workers.
    with answering_lock:
        if answering_status["is_running"]:
            return {
                "status": "running",
                "message": "Answer generation already in progress",
                "current_status": answering_status.copy()
            }

        # Reset status
        answering_status["is_running"] = True
        answering_status["start_time"] = datetime.utcnow().isoformat()
        answering_status["processed_questions"] = 0
        answering_status["answers_generated"] = 0
        answering_status["completed"] = False
        answering_status["result_file"] = None
        answering_status["error"] = None

    try:
        # Load questions from the dataset
        try:
            with open("./vaccine_questions.json", "r", encoding="utf-8") as f:
                dataset = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError) as exc:
            raise HTTPException(
                status_code=404,
                detail="Questions file not found or invalid format"
            ) from exc

        if not isinstance(dataset, dict) or "questions" not in dataset:
            raise HTTPException(status_code=404, detail="Questions file not found or invalid format")

        questions = dataset["questions"]

        # Start answering in background thread
        thread = threading.Thread(target=answer_questions_in_background, args=(questions,))
        thread.daemon = True
        thread.start()

        # The worker is already mutating the status, so hand out a copy.
        with answering_lock:
            status_snapshot = answering_status.copy()

        return {
            "status": "started",
            "message": "Answer generation started in background",
            "current_status": status_snapshot
        }
    except HTTPException as http_error:
        # Keep the intended status code (e.g. 404) instead of letting the
        # generic handler below turn it into a 500.
        with answering_lock:
            answering_status["is_running"] = False
            answering_status["error"] = str(http_error.detail)
        raise
    except Exception as e:
        with answering_lock:
            answering_status["is_running"] = False
            answering_status["error"] = str(e)
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/answering-status")
async def get_answering_status():
    """
    Report the current progress of the answer-generation job.

    Returns a shallow copy of the shared status record, taken while the
    lock is held.
    """
    with answering_lock:
        return dict(answering_status)

@app.get("/download/{filename}")
async def download_file(filename: str):
    """
    Endpoint to download generated JSON files from the working directory.

    Only a bare ``*.json`` file name that refers to an existing regular file
    is served. Everything else is answered with 404.

    Args:
        filename (str): Name of the file to download, without any directory
            part.

    Returns:
        FileResponse: The file, sent as ``application/json``.

    Raises:
        HTTPException: 404 when the name is not allowed or the file does
            not exist.
    """
    # The name comes straight from the URL. Refuse directory components
    # (including Windows-style backslashes), hidden files and anything that
    # is not JSON, so the endpoint cannot hand out other files that happen to
    # sit next to the results (e.g. a .env file, if one is kept there).
    has_directory_part = os.path.basename(filename) != filename or "\\" in filename
    is_json_name = filename.lower().endswith(".json") and not filename.startswith(".")
    if has_directory_part or not is_json_name:
        raise HTTPException(status_code=404, detail="File not found")

    file_path = f"./{filename}"
    # isfile() rather than exists(): a directory is not downloadable.
    if os.path.isfile(file_path):
        return FileResponse(file_path, media_type="application/json", filename=filename)
    raise HTTPException(status_code=404, detail="File not found")

@app.get("/")
async def root():
    """
    Serve the HTML user interface.

    Responds with the contents of ``./index.html`` as ``text/html``.
    """
    index_page = "./index.html"
    return FileResponse(index_page, media_type="text/html")

@app.get("/ask")
async def ask_question(question: str, with_citations: bool = False):
    """
    Endpoint to answer a specific question using the Gemini API.

    The context given to the model is the ``chunk_text`` of the first entry
    in ``./vaccine_questions.json`` (an empty string when the file lists no
    questions).

    Args:
        question (str): The question to answer.
        with_citations (bool): Whether to include citations in the response.
            The flag is stored on the question record under
            ``with_citations`` so it travels with the question and is echoed
            back in the response.

    Returns:
        Dict: The question record with the generated answer.

    Raises:
        HTTPException: 500 when loading the context or generating the answer
            fails.
    """
    import asyncio

    try:
        # For simplicity, use a default chunk for context (in practice, match to relevant chunk)
        with open("./vaccine_questions.json", "r", encoding="utf-8") as f:
            dataset = json.load(f)
        default_chunk = dataset["questions"][0]["chunk_text"] if dataset["questions"] else ""

        question_data = {
            "question_id": str(uuid.uuid4()),
            "chunk_id": 0,
            "chunk_text": default_chunk,
            "question": question,
            "type": "applied",
            "difficulty": "hard",
            "training_purpose": "Reasoning",
            "validated": False,
            "with_citations": with_citations
        }

        # The answer generator is a synchronous function; run it in the
        # default thread pool so this coroutine does not block the event
        # loop (and every other request) while waiting for it.
        loop = asyncio.get_running_loop()
        answered_question = await loop.run_in_executor(
            None, generate_answer_for_question, question_data
        )

        return answered_question

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur lors de la génération de la réponse: {str(e)}")

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)