from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse
import json
from dotenv import load_dotenv
import time
import uuid
from typing import List, Dict, Optional
from datetime import datetime, timezone
from huggingface_hub import HfApi  # For file persistence in Spaces (imported but not yet used)
import os
import threading
import glob
from langchain_google_genai import GoogleGenerativeAI

# Load environment variables (e.g. GOOGLE_API_KEY) from a .env file
load_dotenv()
app = FastAPI()
# Global variables to track generation status
generation_status = {
"is_running": False,
"start_time": None,
"processed_chunks": 0,
"total_chunks": 0,
"questions_generated": 0,
"completed": False,
"result_file": None,
"error": None
}
generation_lock = threading.Lock()
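# All reads and writes to generation_status should hold generation_lock,
# since the FastAPI request handlers and the background worker thread
# access the dict concurrently.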
def estimate_difficulty(question: str, q_type: str) -> str:
"""
Estimate question difficulty based on type and content.
Args:
question (str): The question text.
q_type (str): Question type (factual, conceptual, applied).
Returns:
str: Difficulty level (easy, medium, hard).
"""
if q_type == "factual":
return "easy"
elif q_type == "conceptual":
return "medium"
return "hard" # applied
def generate_questions_for_chunk(chunk: str, chunk_id: int, model: str = "gemini-2.0-flash") -> List[Dict]:
    """
    Generate three varied French questions (factual, conceptual, applied)
    for a document chunk using the Gemini API, and return them as a list
    of question records for the dataset.
    """
prompt = f"""
À partir du texte suivant d'un guide sur les vaccins en français, générez 3 questions variées (factual, conceptual, applied) qui couvrent le contenu de manière exhaustive.
Fournissez uniquement les questions, sans réponses, en français. Retournez le résultat au format JSON, entouré de ```json\n...\n```.
Texte : {chunk}
Exemple de sortie :
```json
[
{{
"question": "Combien de structures sanitaires de proximité sont impliquées dans le suivi de la vaccination ?",
"type": "factual"
}},
{{
"question": "Quel est l'impact de la réglementation de la vaccination sur la couverture vaccinale ?",
"type": "conceptual"
}},
{{
"question": "Quelles seraient les conséquences si les établissements privés ne suivaient plus la réglementation vaccinale ?",
"type": "applied"
}}
]
```
"""
try:
llm = GoogleGenerativeAI(
model=model,
google_api_key=os.getenv("GOOGLE_API_KEY")
)
        response = llm.invoke(prompt)
        questions_text = str(response).strip()  # Normalize to a plain string

        # Strip Markdown code fences around the JSON payload
        if questions_text.startswith("```json") and questions_text.endswith("```"):
            questions_text = questions_text[len("```json"):-3].strip()
        elif questions_text.startswith("```") and questions_text.endswith("```"):
            questions_text = questions_text[3:-3].strip()

        if not questions_text:
            print(f"Error: empty response for chunk {chunk_id}")
            return []

        questions = json.loads(questions_text)
formatted_questions = []
for q in questions:
question_id = str(uuid.uuid4())
difficulty = estimate_difficulty(q["question"], q["type"])
formatted_questions.append({
"question_id": question_id,
"chunk_id": chunk_id,
"chunk_text": chunk,
"question": q["question"],
"type": q["type"],
"difficulty": difficulty,
"training_purpose": "Knowledge Recall" if q["type"] == "factual" else "Reasoning",
"validated": False
})
# Update the global status
with generation_lock:
generation_status["questions_generated"] += len(formatted_questions)
return formatted_questions
    except json.JSONDecodeError as e:
        # JSONDecodeError must be caught before the generic Exception
        # handler, since it is an Exception subclass.
        print(f"Failed to parse the API response for chunk {chunk_id}: {e}")
        return []
    except Exception as e:
        print(f"Error while generating questions for chunk {chunk_id}: {e}")
        return []
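# Illustrative shape of one record returned by generate_questions_for_chunk
# (values are examples only):
#   {
#       "question_id": "a1b2c3d4-...",   # uuid4 string
#       "chunk_id": 0,
#       "chunk_text": "...",             # source chunk kept for traceability
#       "question": "Combien de structures sanitaires ... ?",
#       "type": "factual",
#       "difficulty": "easy",
#       "training_purpose": "Knowledge Recall",
#       "validated": False
#   }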
def generate_questions_in_background(chunks: List[str]):
"""
Generate questions in a background thread and update status.
"""
global generation_status
try:
all_questions = []
with generation_lock:
generation_status["total_chunks"] = len(chunks)
generation_status["processed_chunks"] = 0
generation_status["questions_generated"] = 0
for i, chunk in enumerate(chunks):
print(f"Processing chunk {i+1}/{len(chunks)}...")
questions = generate_questions_for_chunk(chunk, i)
all_questions.extend(questions)
with generation_lock:
generation_status["processed_chunks"] = i + 1
            time.sleep(9)  # Rate limiting: pace calls to roughly 6-7 requests/minute
dataset = {
"dataset_info": {
"title": "Vaccine Guide Question-Answer Dataset",
"description": "A dataset of question-answer pairs generated from a vaccine guide for AI language model training.",
"version": "1.1.0",
"created_date": datetime.utcnow().isoformat(),
"source": "Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.pdf",
"generated_by": "Gemini API",
"total_questions": len(all_questions),
"intended_use": "Fine-tuning medical language models for knowledge recall and reasoning"
},
"questions": all_questions
}
# Save the dataset
filename = f"vaccine_questions_{int(time.time())}.json"
with open(f"./{filename}", 'w', encoding='utf-8') as f:
json.dump(dataset, f, indent=4, ensure_ascii=False)
# Update status to completed
with generation_lock:
generation_status["completed"] = True
generation_status["is_running"] = False
generation_status["result_file"] = filename
except Exception as e:
print(f"Error in background generation: {e}")
with generation_lock:
generation_status["error"] = str(e)
generation_status["is_running"] = False
def save_dataset_to_space(dataset: Dict, filename: str):
    """
    Save a dataset to a JSON file in the Space's working directory.
    """
persistent_path = f"./{filename}"
with open(persistent_path, 'w', encoding='utf-8') as f:
json.dump(dataset, f, indent=4, ensure_ascii=False)
print(f"Dataset saved to {persistent_path}")
@app.get("/generate-questions")
async def generate_questions():
"""
Endpoint to generate questions from all JSON files in the data folder
"""
global generation_status
# Check if generation is already running
with generation_lock:
if generation_status["is_running"]:
return {
"status": "running",
"message": "Generation already in progress",
"current_status": generation_status
}
try:
# Reset status
with generation_lock:
generation_status["is_running"] = True
generation_status["start_time"] = datetime.utcnow().isoformat()
generation_status["processed_chunks"] = 0
generation_status["questions_generated"] = 0
generation_status["completed"] = False
generation_status["result_file"] = None
generation_status["error"] = None
        # Load all JSON files from the ./chunk folder
        json_files = glob.glob("./chunk/*.json")
        if not json_files:
            raise HTTPException(status_code=404, detail="No JSON files found in chunk folder")
all_chunks = []
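        # Accepted chunk-file shapes (illustrative):
        #   ["chunk text", ...]                     -> list of strings
        #   [{"text": "chunk text", ...}, ...]      -> list of dicts with a "text" key
        #   {"text": "..."} / {"content": "..."}    -> single dict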
for json_file in json_files:
with open(json_file, "r", encoding="utf-8") as f:
chunks_data = json.load(f)
if isinstance(chunks_data, list):
# If it's a list of chunks
for chunk in chunks_data:
if isinstance(chunk, dict) and "text" in chunk:
all_chunks.append(chunk["text"])
elif isinstance(chunk, str):
all_chunks.append(chunk)
elif isinstance(chunks_data, dict):
# If it's a dict, try to extract text content
if "text" in chunks_data:
all_chunks.append(chunks_data["text"])
elif "content" in chunks_data:
all_chunks.append(chunks_data["content"])
if not all_chunks:
raise HTTPException(status_code=404, detail="No text content found in JSON files")
        # Run generation in a background thread so the request returns immediately;
        # daemon=True lets the server process exit without waiting for the worker.
        thread = threading.Thread(target=generate_questions_in_background, args=(all_chunks,))
        thread.daemon = True
        thread.start()
return {
"status": "started",
"message": f"Question generation started for {len(json_files)} JSON files with {len(all_chunks)} chunks",
"current_status": generation_status
}
    except HTTPException:
        # Let the 404s raised above propagate with their original status codes
        with generation_lock:
            generation_status["is_running"] = False
        raise
    except Exception as e:
        with generation_lock:
            generation_status["is_running"] = False
            generation_status["error"] = str(e)
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/generation-status")
async def get_generation_status():
"""
Endpoint to check the current status of generation
"""
with generation_lock:
status_copy = generation_status.copy()
return status_copy
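# Illustrative /generation-status response mid-run (values are examples):
#   {"is_running": true, "start_time": "2024-01-01T00:00:00+00:00",
#    "processed_chunks": 3, "total_chunks": 40, "questions_generated": 9,
#    "completed": false, "result_file": null, "error": null}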
@app.get("/download/{filename}")
async def download_file(filename: str):
"""
Endpoint to download generated files
"""
file_path = f"./{filename}"
if os.path.exists(file_path):
return FileResponse(file_path, media_type="application/json", filename=filename)
raise HTTPException(status_code=404, detail="File not found")
@app.get("/")
async def root():
"""
Root endpoint that serves the HTML UI from the index.html file.
"""
return FileResponse("./index.html", media_type="text/html")
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=7860)
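
# Example usage once the server is running (assuming the host/port configured
# above):
#   curl http://localhost:7860/generate-questions    # start a background run
#   curl http://localhost:7860/generation-status     # poll progress
#   curl -O http://localhost:7860/download/vaccine_questions_<timestamp>.json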