Zeggai Abdellah
fix the download path
b8f2d15
raw
history blame
7.46 kB
from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse
import json
from dotenv import load_dotenv
import time
import uuid
from typing import List, Dict
from datetime import datetime
from huggingface_hub import HfApi  # For file persistence in Spaces (see commented upload code below)
import os

# Load environment variables (GOOGLE_API_KEY; optionally HF_TOKEN / SPACE_ID
# for the Space upload path) from a local .env file before any function reads
# them via os.getenv().
load_dotenv()

from langchain_google_genai import GoogleGenerativeAI

app = FastAPI()
def estimate_difficulty(question: str, q_type: str) -> str:
    """
    Map a question type to a difficulty label.

    Args:
        question (str): The question text (not used by the current heuristic,
            kept for future content-based scoring).
        q_type (str): Question type ("factual", "conceptual", or "applied").

    Returns:
        str: "easy" for factual, "medium" for conceptual, "hard" otherwise.
    """
    difficulty_by_type = {
        "factual": "easy",
        "conceptual": "medium",
    }
    # "applied" (and any unrecognized type) falls through to "hard".
    return difficulty_by_type.get(q_type, "hard")
def generate_questions_for_chunk(chunk: str, chunk_id: int, model="gemini-2.0-flash") -> List[Dict]:
    """
    Generate French questions for a given document chunk using the Gemini API.

    Args:
        chunk (str): Chunk text to generate questions from.
        chunk_id (int): Index of the chunk within the source document.
        model (str): Gemini model identifier passed to GoogleGenerativeAI.

    Returns:
        List[Dict]: One dict per generated question (question_id, chunk_id,
        chunk_text, question, type, difficulty, training_purpose, validated).
        Returns an empty list on any API or JSON-parsing error.
    """
    prompt = f"""
À partir du texte suivant d'un guide sur les vaccins en français, générez 3 questions variées (factual, conceptual, applied) qui couvrent le contenu de manière exhaustive.
Fournissez uniquement les questions, sans réponses, en français. Retournez le résultat au format JSON, entouré de ```json\n...\n```.
Texte : {chunk}
Exemple de sortie :
```json
[
{{
"question": "Combien de structures sanitaires de proximité sont impliquées dans le suivi de la vaccination ?",
"type": "factual"
}},
{{
"question": "Quel est l'impact de la réglementation de la vaccination sur la couverture vaccinale ?",
"type": "conceptual"
}},
{{
"question": "Quelles seraient les conséquences si les établissements privés ne suivaient plus la réglementation vaccinale ?",
"type": "applied"
}}
]
```
"""
    try:
        llm = GoogleGenerativeAI(
            model=model,
            google_api_key=os.getenv("GOOGLE_API_KEY")
        )
        response = llm.invoke(prompt)
        questions_text = str(response)  # invoke() may return a message object; normalize to str
        # Strip the Markdown code fences the model wraps around the JSON payload.
        if questions_text.startswith("```json\n") and questions_text.endswith("\n```"):
            questions_text = questions_text[len("```json\n"):-len("\n```")].strip()
        elif questions_text.startswith("```") and questions_text.endswith("```"):
            questions_text = questions_text[3:-3].strip()
        if not questions_text:
            print(f"Erreur: Réponse vide pour le chunk {chunk_id}")
            return []
        questions = json.loads(questions_text)
        formatted_questions = []
        for q in questions:
            question_id = str(uuid.uuid4())
            difficulty = estimate_difficulty(q["question"], q["type"])
            formatted_questions.append({
                "question_id": question_id,
                "chunk_id": chunk_id,
                "chunk_text": chunk,
                "question": q["question"],
                "type": q["type"],
                "difficulty": difficulty,
                # Factual questions train recall; conceptual/applied train reasoning.
                "training_purpose": "Knowledge Recall" if q["type"] == "factual" else "Reasoning",
                "validated": False
            })
        return formatted_questions
    # BUG FIX: JSONDecodeError must be caught BEFORE the generic Exception
    # handler — in the original order the specific clause was unreachable
    # because JSONDecodeError is a subclass of Exception.
    except json.JSONDecodeError as e:
        print(f"Erreur de parsing de la réponse API pour le chunk {chunk_id}: {e}")
        return []
    except Exception as e:
        print(f"Erreur lors de la génération des questions pour le chunk {chunk_id}: {e}")
        return []
def generate_questions_for_document(chunks: List[str]) -> Dict:
    """
    Generate questions for all document chunks and structure them as a dataset.

    Args:
        chunks (List[str]): Ordered chunk texts from the source document.

    Returns:
        Dict: {"dataset_info": <metadata>, "questions": <list of question dicts>}.
    """
    # Local import: the module header only imports the `datetime` class.
    from datetime import timezone

    all_questions = []
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}...")
        all_questions.extend(generate_questions_for_chunk(chunk, i))
        # Rate limiting between Gemini calls; no need to wait after the last chunk.
        if i < len(chunks) - 1:
            time.sleep(9)
    dataset = {
        "dataset_info": {
            "title": "Vaccine Guide Question-Answer Dataset",
            "description": "A dataset of question-answer pairs generated from a vaccine guide for AI language model training.",
            "version": "1.1.0",
            # Timezone-aware timestamp; datetime.utcnow() is deprecated and naive.
            "created_date": datetime.now(timezone.utc).isoformat(),
            "source": "Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.pdf",
            "generated_by": "Gemini API",
            "total_questions": len(all_questions),
            "intended_use": "Fine-tuning medical language models for knowledge recall and reasoning"
        },
        "questions": all_questions
    }
    return dataset
def save_dataset_to_space(dataset: Dict, filename: str):
    """
    Save the dataset as UTF-8 JSON in the Space's working directory.

    Args:
        dataset (Dict): Dataset structure to serialize.
        filename (str): Target file name; the file is written to ./<filename>.
    """
    # BUG FIX: build the path from the requested filename (the original wrote
    # to a hard-coded path, so downloads could never find the file).
    persistent_path = f"./{filename}"
    with open(persistent_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps the French text readable in the JSON file.
        json.dump(dataset, f, indent=4, ensure_ascii=False)
    print(f"Dataset saved to {persistent_path}")
    # Optionally mirror the file into the Space repo so it survives restarts.
    # try:
    #     api = HfApi(token=os.getenv("HF_TOKEN"))
    #     api.upload_file(
    #         path_or_fileobj=persistent_path,
    #         path_in_repo=filename,
    #         repo_id=os.getenv("SPACE_ID"),
    #         repo_type="space"
    #     )
    #     print(f"File {filename} uploaded to Space")
    # except Exception as e:
    #     print(f"Could not upload to Space: {e}")
@app.get("/generate-questions")
async def generate_questions():
    """
    Generate questions from the vaccine guide chunks.

    Reads ./chunks.json, generates questions, saves the dataset to disk and
    returns its metadata together with a download URL.

    Raises:
        HTTPException: 404 if the chunks file yields no data, 500 on any
        other failure.
    """
    try:
        chunks_data = None
        with open("./chunks.json", "r", encoding="utf-8") as f:
            chunks_data = json.load(f)
        if chunks_data is None:
            raise HTTPException(status_code=404, detail="Chunks file not found in any known location")
        # NOTE(review): only the first chunk is processed — presumably to limit
        # API quota during testing; extend to all chunks when ready.
        VACCINE_CHUNKS = [chunks_data[0]["text"]]
        dataset = generate_questions_for_document(VACCINE_CHUNKS)
        # Save to persistent storage
        filename = "vaccine_questions.json"
        save_dataset_to_space(dataset, filename)
        return {
            "status": "success",
            "message": "Questions generated successfully",
            "dataset_info": dataset["dataset_info"],
            # BUG FIX: interpolate the real filename into the download URL.
            "download_url": f"/download/{filename}"
        }
    # BUG FIX: re-raise deliberate HTTP errors (e.g. the 404 above) instead of
    # letting the generic handler rewrap them as 500s.
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/download/{filename}")
async def download_file(filename: str):
    """
    Download a previously generated file from the working directory.

    Args:
        filename (str): Name of the file under ./ to return, taken from the
            URL path.

    Raises:
        HTTPException: 404 if the file does not exist.
    """
    # BUG FIX: the route template and the file path must both use the
    # `filename` path parameter.
    # NOTE(review): consider rejecting path separators in `filename` to block
    # path traversal (e.g. ../secret) — TODO confirm the threat model.
    file_path = f"./{filename}"
    if os.path.exists(file_path):
        return FileResponse(file_path, media_type="application/json", filename=filename)
    raise HTTPException(status_code=404, detail="File not found")
@app.get("/")
async def root():
    """Describe the API and list its available endpoints."""
    return {
        "message": "Vaccine Question Generator API",
        "endpoints": {
            "GET /generate-questions": "Generate questions from vaccine guide",
            # BUG FIX: advertise the real route template for downloads.
            "GET /download/{filename}": "Download generated files"
        }
    }
if __name__ == "__main__":
    # Run the ASGI app directly with uvicorn; port 7860 is the port exposed
    # by Hugging Face Spaces.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)