# NOTE(review): the three lines below ("Spaces:" / "Sleeping" / "Sleeping") were a
# HuggingFace Spaces status banner captured by the file export, not program code;
# kept here as a comment so the module remains parseable.
| from fastapi import FastAPI, HTTPException | |
| from fastapi.responses import FileResponse | |
| import json | |
| from dotenv import load_dotenv | |
| import time | |
| import uuid | |
| from typing import List, Dict, Optional | |
| from datetime import datetime | |
| from huggingface_hub import HfApi # For file persistence in Spaces | |
| import os | |
| import threading | |
| import glob | |
# Load environment variables (e.g. GOOGLE_API_KEY) from a local .env file
load_dotenv()
# Imported after load_dotenv() so credentials are in the environment first
from langchain_google_genai import GoogleGenerativeAI
app = FastAPI()
# Global, mutable progress snapshot for the background generation run.
# Every read/write of this dict must hold `generation_lock`.
generation_status = {
    "is_running": False,        # True while a background generation thread is active
    "start_time": None,         # ISO-8601 timestamp set when a run starts
    "processed_chunks": 0,      # chunks handled so far in the current run
    "total_chunks": 0,          # total chunks queued for the current run
    "questions_generated": 0,   # questions accumulated so far
    "completed": False,         # True once a run finished successfully
    "result_file": None,        # filename of the saved dataset, once completed
    "error": None               # error message if the run failed
}
generation_lock = threading.Lock()
def estimate_difficulty(question: str, q_type: str) -> str:
    """
    Estimate question difficulty based on type and content.

    Args:
        question (str): The question text (currently unused; kept for API stability).
        q_type (str): Question type (factual, conceptual, applied).

    Returns:
        str: Difficulty level (easy, medium, hard).
    """
    # Any type other than factual/conceptual (i.e. "applied") maps to "hard",
    # mirroring the original fall-through branch.
    difficulty_by_type = {"factual": "easy", "conceptual": "medium"}
    return difficulty_by_type.get(q_type, "hard")
def generate_questions_for_chunk(chunk: str, chunk_id: int, model="gemini-2.0-flash") -> List[Dict]:
    """
    Generate French questions for a given document chunk using the Gemini API.

    Args:
        chunk (str): Chunk text the questions must cover.
        chunk_id (int): Index of the chunk; stored on every generated record.
        model (str): Gemini model name passed to GoogleGenerativeAI.

    Returns:
        List[Dict]: One record per question (id, chunk info, type, difficulty,
        training purpose, validated flag); empty list on any failure.
    """
    prompt = f"""
À partir du texte suivant d'un guide sur les vaccins en français, générez 3 questions variées (factual, conceptual, applied) qui couvrent le contenu de manière exhaustive.
Fournissez uniquement les questions, sans réponses, en français. Retournez le résultat au format JSON, entouré de ```json\n...\n```.
Texte : {chunk}
Exemple de sortie :
```json
[
  {{
    "question": "Combien de structures sanitaires de proximité sont impliquées dans le suivi de la vaccination ?",
    "type": "factual"
  }},
  {{
    "question": "Quel est l'impact de la réglementation de la vaccination sur la couverture vaccinale ?",
    "type": "conceptual"
  }},
  {{
    "question": "Quelles seraient les conséquences si les établissements privés ne suivaient plus la réglementation vaccinale ?",
    "type": "applied"
  }}
]
```
"""
    try:
        llm = GoogleGenerativeAI(
            model=model,
            google_api_key=os.getenv("GOOGLE_API_KEY")
        )
        response = llm.invoke(prompt)
        # Robustness fix: strip surrounding whitespace first so a trailing
        # newline after the closing fence does not defeat fence detection below.
        questions_text = str(response).strip()
        # Strip Markdown code fences ("```json\n...\n```" or bare "```...```")
        if questions_text.startswith("```json\n") and questions_text.endswith("\n```"):
            questions_text = questions_text[7:-4].strip()
        elif questions_text.startswith("```") and questions_text.endswith("```"):
            questions_text = questions_text[3:-3].strip()
        if not questions_text:
            print(f"Erreur: Réponse vide pour le chunk {chunk_id}")
            return []
        questions = json.loads(questions_text)
        formatted_questions = []
        for q in questions:
            question_id = str(uuid.uuid4())
            difficulty = estimate_difficulty(q["question"], q["type"])
            formatted_questions.append({
                "question_id": question_id,
                "chunk_id": chunk_id,
                "chunk_text": chunk,
                "question": q["question"],
                "type": q["type"],
                "difficulty": difficulty,
                "training_purpose": "Knowledge Recall" if q["type"] == "factual" else "Reasoning",
                "validated": False
            })
        # Update the global status under the lock (called from a worker thread)
        with generation_lock:
            generation_status["questions_generated"] += len(formatted_questions)
        return formatted_questions
    except json.JSONDecodeError as e:
        # Bug fix: this handler must come BEFORE the generic Exception handler.
        # JSONDecodeError subclasses ValueError/Exception, so with the original
        # ordering this branch was unreachable and parse errors were reported
        # with the generic message.
        print(f"Erreur de parsing de la réponse API pour le chunk {chunk_id}: {e}")
        return []
    except Exception as e:
        print(f"Erreur lors de la génération des questions pour le chunk {chunk_id}: {e}")
        return []
def generate_questions_in_background(chunks: List[str]):
    """
    Generate questions for every chunk in a background thread, save the
    resulting dataset to disk, and keep `generation_status` updated.

    Args:
        chunks (List[str]): Plain-text chunks to generate questions from.

    Side effects:
        Writes "./vaccine_questions_<timestamp>.json" and mutates the
        module-level `generation_status` dict (always under `generation_lock`).
    """
    global generation_status
    try:
        all_questions = []
        with generation_lock:
            generation_status["total_chunks"] = len(chunks)
            generation_status["processed_chunks"] = 0
            generation_status["questions_generated"] = 0
        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i+1}/{len(chunks)}...")
            questions = generate_questions_for_chunk(chunk, i)
            all_questions.extend(questions)
            with generation_lock:
                generation_status["processed_chunks"] = i + 1
            time.sleep(9)  # Rate limiting between Gemini API calls
        dataset = {
            "dataset_info": {
                "title": "Vaccine Guide Question-Answer Dataset",
                "description": "A dataset of question-answer pairs generated from a vaccine guide for AI language model training.",
                "version": "1.1.0",
                "created_date": datetime.utcnow().isoformat(),
                "source": "Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.pdf",
                "generated_by": "Gemini API",
                "total_questions": len(all_questions),
                "intended_use": "Fine-tuning medical language models for knowledge recall and reasoning"
            },
            "questions": all_questions
        }
        # Save the dataset. Bug fix: the output path must interpolate `filename`
        # so that the file written matches the `result_file` reported below.
        filename = f"vaccine_questions_{int(time.time())}.json"
        with open(f"./{filename}", 'w', encoding='utf-8') as f:
            json.dump(dataset, f, indent=4, ensure_ascii=False)
        # Mark the run completed and expose the result filename for download
        with generation_lock:
            generation_status["completed"] = True
            generation_status["is_running"] = False
            generation_status["result_file"] = filename
    except Exception as e:
        print(f"Error in background generation: {e}")
        with generation_lock:
            generation_status["error"] = str(e)
            generation_status["is_running"] = False
def save_dataset_to_space(dataset: Dict, filename: str):
    """
    Save a dataset to a file in the Space's persistent storage.

    Args:
        dataset (Dict): JSON-serializable dataset to write.
        filename (str): Target file name, written under the app directory.

    Side effects:
        Writes `./<filename>` as UTF-8 JSON (non-ASCII kept literal).
    """
    # Bug fix: the path must interpolate `filename`; previously a literal
    # placeholder string was used, so every call clobbered the same file.
    persistent_path = f"./{filename}"
    with open(persistent_path, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, indent=4, ensure_ascii=False)
    print(f"Dataset saved to {persistent_path}")
async def generate_questions():
    """
    Endpoint to generate questions from all JSON files in the data folder.

    Kicks off generation in a daemon thread and returns immediately; progress
    is polled via get_generation_status().

    Returns:
        dict: "running" if a run is already active, otherwise "started" plus
        the current `generation_status`.

    Raises:
        HTTPException: 404 when no JSON files or no text content is found,
        500 on unexpected failures.
    """
    # NOTE(review): no @app.<method>("/...") decorator is visible on this
    # function — confirm the route registration was not lost in export.
    global generation_status
    # Check if generation is already running
    with generation_lock:
        if generation_status["is_running"]:
            return {
                "status": "running",
                "message": "Generation already in progress",
                "current_status": generation_status
            }
    try:
        # Reset status for a fresh run
        with generation_lock:
            generation_status["is_running"] = True
            generation_status["start_time"] = datetime.utcnow().isoformat()
            generation_status["processed_chunks"] = 0
            generation_status["questions_generated"] = 0
            generation_status["completed"] = False
            generation_status["result_file"] = None
            generation_status["error"] = None
        # Load all JSON files from the chunk folder
        json_files = glob.glob("./chunk/*.json")
        if not json_files:
            raise HTTPException(status_code=404, detail="No JSON files found in data folder")
        all_chunks = []
        for json_file in json_files:
            with open(json_file, "r", encoding="utf-8") as f:
                chunks_data = json.load(f)
            if isinstance(chunks_data, list):
                # List of chunks: accept {"text": ...} dicts or bare strings
                for chunk in chunks_data:
                    if isinstance(chunk, dict) and "text" in chunk:
                        all_chunks.append(chunk["text"])
                    elif isinstance(chunk, str):
                        all_chunks.append(chunk)
            elif isinstance(chunks_data, dict):
                # Single dict: try the common text-bearing keys
                if "text" in chunks_data:
                    all_chunks.append(chunks_data["text"])
                elif "content" in chunks_data:
                    all_chunks.append(chunks_data["content"])
        if not all_chunks:
            raise HTTPException(status_code=404, detail="No text content found in JSON files")
        # Start generation in a background daemon thread
        thread = threading.Thread(target=generate_questions_in_background, args=(all_chunks,))
        thread.daemon = True
        thread.start()
        return {
            "status": "started",
            "message": f"Question generation started for {len(json_files)} JSON files with {len(all_chunks)} chunks",
            "current_status": generation_status
        }
    except HTTPException:
        # Bug fix: previously the generic handler below caught the 404s raised
        # above and re-raised them as 500. Reset the flag and propagate the
        # original status code unchanged.
        with generation_lock:
            generation_status["is_running"] = False
        raise
    except Exception as e:
        with generation_lock:
            generation_status["is_running"] = False
            generation_status["error"] = str(e)
        raise HTTPException(status_code=500, detail=str(e))
async def get_generation_status():
    """
    Endpoint to check the current status of generation.

    Returns:
        dict: A shallow snapshot of `generation_status`, taken under the lock
        so concurrent background updates cannot tear the read.
    """
    with generation_lock:
        return dict(generation_status)
async def download_file(filename: str):
    """
    Endpoint to download generated files.

    Args:
        filename (str): Name of a file in the app's working directory.

    Returns:
        FileResponse: The requested file served as application/json.

    Raises:
        HTTPException: 404 if the file does not exist.
    """
    # NOTE(review): `filename` is interpolated into a filesystem path without
    # sanitization — a caller could pass "../" segments. Consider
    # os.path.basename(filename) before joining.
    # Bug fix: the path must interpolate `filename`; previously a literal
    # placeholder string was used, so no generated file could ever be found.
    file_path = f"./{filename}"
    if os.path.exists(file_path):
        return FileResponse(file_path, media_type="application/json", filename=filename)
    raise HTTPException(status_code=404, detail="File not found")
async def root():
    """
    Root endpoint that serves the HTML UI from the index.html file.

    Returns:
        FileResponse: ./index.html served as text/html.
    """
    index_path = "./index.html"
    return FileResponse(index_path, media_type="text/html")
if __name__ == "__main__":
    # Imported lazily so the module can be served by an external ASGI runner
    # without requiring uvicorn at import time.
    import uvicorn
    # Port 7860 is the port conventionally used by HuggingFace Spaces apps
    # (this file targets Spaces per the HfApi import comment above).
    uvicorn.run(app, host="0.0.0.0", port=7860)