from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse
import json
from dotenv import load_dotenv
import time
import uuid
from typing import List, Dict
from datetime import datetime, timezone
from huggingface_hub import HfApi  # For file persistence in Spaces (see commented-out upload below)
import os

# Load environment variables from .env file
load_dotenv()

from langchain_google_genai import GoogleGenerativeAI

app = FastAPI()


def estimate_difficulty(question: str, q_type: str) -> str:
    """
    Estimate question difficulty based on type and content.

    Args:
        question (str): The question text.
        q_type (str): Question type (factual, conceptual, applied).

    Returns:
        str: Difficulty level (easy, medium, hard).
    """
    if q_type == "factual":
        return "easy"
    elif q_type == "conceptual":
        return "medium"
    return "hard"  # applied


def generate_questions_for_chunk(chunk: str, chunk_id: int, model: str = "gemini-2.0-flash") -> List[Dict]:
    """
    Generate French questions for a given document chunk using the Gemini API.
    """
    # The prompt is kept in French on purpose: it instructs Gemini to generate
    # French questions from a French-language vaccine guide.
    prompt = f"""
À partir du texte suivant d'un guide sur les vaccins en français, générez 3 questions variées (factual, conceptual, applied) qui couvrent le contenu de manière exhaustive. Fournissez uniquement les questions, sans réponses, en français. Retournez le résultat au format JSON, entouré de ```json\n...\n```.

Texte : {chunk}

Exemple de sortie :
```json
[
  {{
    "question": "Combien de structures sanitaires de proximité sont impliquées dans le suivi de la vaccination ?",
    "type": "factual"
  }},
  {{
    "question": "Quel est l'impact de la réglementation de la vaccination sur la couverture vaccinale ?",
    "type": "conceptual"
  }},
  {{
    "question": "Quelles seraient les conséquences si les établissements privés ne suivaient plus la réglementation vaccinale ?",
    "type": "applied"
  }}
]
```
"""
    try:
        llm = GoogleGenerativeAI(
            model=model,
            google_api_key=os.getenv("GOOGLE_API_KEY")
        )
        response = llm.invoke(prompt)
        questions_text = str(response)  # The LLM wrapper returns a plain string

        # Strip Markdown code fences if the model wrapped its JSON output
        if questions_text.startswith("```json"):
            questions_text = questions_text.removeprefix("```json").removesuffix("```").strip()
        elif questions_text.startswith("```"):
            questions_text = questions_text.removeprefix("```").removesuffix("```").strip()

        if not questions_text:
            print(f"Error: empty response for chunk {chunk_id}")
            return []

        questions = json.loads(questions_text)
        formatted_questions = []
        for q in questions:
            question_id = str(uuid.uuid4())
            difficulty = estimate_difficulty(q["question"], q["type"])
            formatted_questions.append({
                "question_id": question_id,
                "chunk_id": chunk_id,
                "chunk_text": chunk,
                "question": q["question"],
                "type": q["type"],
                "difficulty": difficulty,
                "training_purpose": "Knowledge Recall" if q["type"] == "factual" else "Reasoning",
                "validated": False
            })
        return formatted_questions
    # JSONDecodeError must be caught before the generic Exception handler;
    # the original order left this branch unreachable.
    except json.JSONDecodeError as e:
        print(f"Error parsing the API response for chunk {chunk_id}: {e}")
        return []
    except Exception as e:
        print(f"Error generating questions for chunk {chunk_id}: {e}")
        return []


def generate_questions_for_document(chunks: List[str]) -> Dict:
    """
    Generate questions for all document chunks and assemble them into a dataset with metadata.
""" all_questions = [] for i, chunk in enumerate(chunks): print(f"Processing chunk {i+1}/{len(chunks)}...") questions = generate_questions_for_chunk(chunk, i) all_questions.extend(questions) time.sleep(9) # Rate limiting dataset = { "dataset_info": { "title": "Vaccine Guide Question-Answer Dataset", "description": "A dataset of question-answer pairs generated from a vaccine guide for AI language model training.", "version": "1.1.0", "created_date": datetime.utcnow().isoformat(), "source": "Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.pdf", "generated_by": "Gemini API", "total_questions": len(all_questions), "intended_use": "Fine-tuning medical language models for knowledge recall and reasoning" }, "questions": all_questions } return dataset def save_dataset_to_space(dataset: Dict, filename: str): """ Save dataset to a file in the Space's persistent storage """ persistent_path = f"./{filename}" with open(persistent_path, 'w', encoding='utf-8') as f: json.dump(dataset, f, indent=4, ensure_ascii=False) print(f"Dataset saved to {persistent_path}") # # Optionally upload to Space files # try: # api = HfApi(token=os.getenv("HF_TOKEN")) # api.upload_file( # path_or_fileobj=persistent_path, # path_in_repo=filename, # repo_id=os.getenv("SPACE_ID"), # repo_type="space" # ) # print(f"File {filename} uploaded to Space") # except Exception as e: # print(f"Could not upload to Space: {e}") @app.get("/generate-questions") async def generate_questions(): """ Endpoint to generate questions from the vaccine guide chunks """ try: chunks_data = None with open("./chunks.json", "r", encoding="utf-8") as f: chunks_data = json.load(f) if chunks_data is None: raise HTTPException(status_code=404, detail="Chunks file not found in any known location") VACCINE_CHUNKS = [chunks_data[0]["text"]] dataset = generate_questions_for_document(VACCINE_CHUNKS) # Save to persistent storage filename = "vaccine_questions.json" save_dataset_to_space(dataset, filename) return { "status": "success", "message": "Questions generated successfully", "dataset_info": dataset["dataset_info"], "download_url": f"/download/{filename}" } except Exception as e: raise HTTPException(status_code=500, detail=str(e)) @app.get("/download/{filename}") async def download_file(filename: str): """ Endpoint to download generated files """ file_path = f"./{filename}" if os.path.exists(file_path): return FileResponse(file_path, media_type="application/json", filename=filename) raise HTTPException(status_code=404, detail="File not found") @app.get("/") async def root(): return { "message": "Vaccine Question Generator API", "endpoints": { "GET /generate-questions": "Generate questions from vaccine guide", "GET /download/{filename}": "Download generated files" } } if __name__ == "__main__": import uvicorn uvicorn.run(app, host="0.0.0.0", port=7860)