# Hugging Face Space app: FastAPI service that generates French training
# questions from a vaccine-guide document using the Gemini API.
import json
import os
import time
import uuid
from datetime import datetime, timezone
from typing import Dict, List

from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse
from huggingface_hub import HfApi  # For file persistence in Spaces

# Load environment variables from .env file before anything reads them
load_dotenv()

from langchain_google_genai import GoogleGenerativeAI
| app = FastAPI() | |
def estimate_difficulty(question: str, q_type: str) -> str:
    """
    Estimate question difficulty based on type and content.

    Args:
        question (str): The question text (not used by the current heuristic).
        q_type (str): Question type (factual, conceptual, applied).

    Returns:
        str: Difficulty level (easy, medium, hard).
    """
    difficulty_by_type = {
        "factual": "easy",
        "conceptual": "medium",
    }
    # Anything else (i.e. "applied") is treated as hard.
    return difficulty_by_type.get(q_type, "hard")
def generate_questions_for_chunk(chunk: str, chunk_id: int, model="gemini-2.0-flash") -> List[Dict]:
    """
    Generate French questions for a given document chunk using the Gemini API.

    Args:
        chunk (str): Text of the document chunk to question.
        chunk_id (int): Index of the chunk within the source document.
        model (str): Gemini model name passed to the LangChain wrapper.

    Returns:
        List[Dict]: Formatted question records (question_id, chunk_id,
        chunk_text, question, type, difficulty, training_purpose, validated).
        Empty list if the API call or JSON parsing fails.
    """
    prompt = f"""
À partir du texte suivant d'un guide sur les vaccins en français, générez 3 questions variées (factual, conceptual, applied) qui couvrent le contenu de manière exhaustive.
Fournissez uniquement les questions, sans réponses, en français. Retournez le résultat au format JSON, entouré de ```json\n...\n```.
Texte : {chunk}
Exemple de sortie :
```json
[
{{
"question": "Combien de structures sanitaires de proximité sont impliquées dans le suivi de la vaccination ?",
"type": "factual"
}},
{{
"question": "Quel est l'impact de la réglementation de la vaccination sur la couverture vaccinale ?",
"type": "conceptual"
}},
{{
"question": "Quelles seraient les conséquences si les établissements privés ne suivaient plus la réglementation vaccinale ?",
"type": "applied"
}}
]
```
"""
    try:
        llm = GoogleGenerativeAI(
            model=model,
            google_api_key=os.getenv("GOOGLE_API_KEY")
        )
        response = llm.invoke(prompt)
        questions_text = str(response)  # Convert response to string
        # Strip Markdown code fences around the JSON payload.
        # BUG FIX: the opening fence "```json\n" is 8 characters, not 7;
        # the old [7:-4] slice only worked because of the trailing .strip().
        if questions_text.startswith("```json\n") and questions_text.endswith("\n```"):
            questions_text = questions_text[8:-4].strip()
        elif questions_text.startswith("```") and questions_text.endswith("```"):
            questions_text = questions_text[3:-3].strip()
        if not questions_text:
            print(f"Erreur: Réponse vide pour le chunk {chunk_id}")
            return []
        questions = json.loads(questions_text)
        formatted_questions = []
        for q in questions:
            question_id = str(uuid.uuid4())
            difficulty = estimate_difficulty(q["question"], q["type"])
            formatted_questions.append({
                "question_id": question_id,
                "chunk_id": chunk_id,
                "chunk_text": chunk,
                "question": q["question"],
                "type": q["type"],
                "difficulty": difficulty,
                "training_purpose": "Knowledge Recall" if q["type"] == "factual" else "Reasoning",
                "validated": False
            })
        return formatted_questions
    # BUG FIX: JSONDecodeError subclasses Exception, so this clause must come
    # first — in the original order it was unreachable dead code.
    except json.JSONDecodeError as e:
        print(f"Erreur de parsing de la réponse API pour le chunk {chunk_id}: {e}")
        return []
    except Exception as e:
        print(f"Erreur lors de la génération des questions pour le chunk {chunk_id}: {e}")
        return []
def generate_questions_for_document(chunks: List[str]) -> Dict:
    """
    Generate questions for all document chunks and structure as a scientific dataset.

    Args:
        chunks (List[str]): Ordered chunk texts of the source document.

    Returns:
        Dict: {"dataset_info": metadata, "questions": flat list of records}.
    """
    all_questions = []
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}...")
        questions = generate_questions_for_chunk(chunk, i)
        all_questions.extend(questions)
        time.sleep(9)  # Rate limiting between Gemini API calls
    dataset = {
        "dataset_info": {
            "title": "Vaccine Guide Question-Answer Dataset",
            "description": "A dataset of question-answer pairs generated from a vaccine guide for AI language model training.",
            "version": "1.1.0",
            # BUG FIX: datetime.utcnow() is deprecated and returns a naive
            # datetime; emit a timezone-aware UTC timestamp instead.
            "created_date": datetime.now(timezone.utc).isoformat(),
            "source": "Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.pdf",
            "generated_by": "Gemini API",
            "total_questions": len(all_questions),
            "intended_use": "Fine-tuning medical language models for knowledge recall and reasoning"
        },
        "questions": all_questions
    }
    return dataset
def save_dataset_to_space(dataset: Dict, filename: str):
    """
    Save dataset to a file in the Space's persistent storage.

    Args:
        dataset (Dict): Dataset to serialize as pretty-printed UTF-8 JSON.
        filename (str): Output file name, created in the working directory.
    """
    # BUG FIX: the path must interpolate the filename argument; the original
    # wrote to a literal placeholder path and ignored `filename` entirely.
    persistent_path = f"./{filename}"
    with open(persistent_path, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, indent=4, ensure_ascii=False)
    print(f"Dataset saved to {persistent_path}")
    # # Optionally upload to Space files
    # try:
    #     api = HfApi(token=os.getenv("HF_TOKEN"))
    #     api.upload_file(
    #         path_or_fileobj=persistent_path,
    #         path_in_repo=filename,
    #         repo_id=os.getenv("SPACE_ID"),
    #         repo_type="space"
    #     )
    #     print(f"File {filename} uploaded to Space")
    # except Exception as e:
    #     print(f"Could not upload to Space: {e}")
async def generate_questions():
    """
    Endpoint to generate questions from the vaccine guide chunks.

    Returns:
        dict: status, message, dataset metadata and a download URL.

    Raises:
        HTTPException: 404 if the chunks file is missing; 500 on other errors.
    """
    # NOTE(review): no @app.get route decorator is visible on this handler —
    # confirm the route registration was not lost from the original file.
    try:
        with open("./chunks.json", "r", encoding="utf-8") as f:
            chunks_data = json.load(f)
        if chunks_data is None:
            raise HTTPException(status_code=404, detail="Chunks file not found in any known location")
        # Only the first chunk is processed here (presumably a smoke-test
        # limitation — TODO confirm whether all chunks should be used).
        VACCINE_CHUNKS = [chunks_data[0]["text"]]
        dataset = generate_questions_for_document(VACCINE_CHUNKS)
        # Save to persistent storage
        filename = "vaccine_questions.json"
        save_dataset_to_space(dataset, filename)
        return {
            "status": "success",
            "message": "Questions generated successfully",
            # BUG FIX: interpolate the actual filename into the URL; the
            # original emitted a literal placeholder.
            "download_url": f"/download/{filename}",
            "dataset_info": dataset["dataset_info"]
        }
    # BUG FIX: deliberate HTTP errors (the 404 above) were previously caught
    # by the generic handler and re-raised as 500 — pass them through instead.
    except HTTPException:
        raise
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="Chunks file not found in any known location")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
async def download_file(filename: str):
    """
    Endpoint to download generated files.

    Args:
        filename (str): Name of the file in the working directory.

    Returns:
        FileResponse: The requested file served as JSON.

    Raises:
        HTTPException: 404 if the file does not exist.
    """
    # SECURITY: `filename` comes from the request path — keep only the base
    # name so "../" sequences cannot escape the working directory.
    safe_name = os.path.basename(filename)
    # BUG FIX: interpolate the filename; the original used a literal
    # placeholder path and could never find the requested file.
    file_path = f"./{safe_name}"
    if os.path.exists(file_path):
        return FileResponse(file_path, media_type="application/json", filename=safe_name)
    raise HTTPException(status_code=404, detail="File not found")
async def root():
    """
    Root endpoint: describe the API and list its available routes.
    """
    return {
        "message": "Vaccine Question Generator API",
        "endpoints": {
            "GET /generate-questions": "Generate questions from vaccine guide",
            # BUG FIX: restore the route pattern lost to a placeholder
            # substitution in the original text.
            "GET /download/{filename}": "Download generated files"
        }
    }
# Run the API with uvicorn when executed directly.
# Port 7860 is the standard port exposed by Hugging Face Spaces.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)