from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse
import json
from dotenv import load_dotenv
import time
import uuid
from typing import List, Dict, Optional
from datetime import datetime, timezone
from huggingface_hub import HfApi  # For file persistence in Spaces (imported but not yet used)
import os
import threading
import glob
from langchain_google_genai import GoogleGenerativeAI

# Load environment variables (e.g. GOOGLE_API_KEY) from a .env file
load_dotenv()
app = FastAPI()
# Global variables to track generation status
generation_status = {
"is_running": False,
"start_time": None,
"processed_chunks": 0,
"total_chunks": 0,
"questions_generated": 0,
"completed": False,
"result_file": None,
"error": None
}
generation_lock = threading.Lock()
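# All reads and writes to generation_status should hold generation_lock,
# since the FastAPI request handlers and the background worker thread
# access the dict concurrently.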
def estimate_difficulty(question: str, q_type: str) -> str:
"""
Estimate question difficulty based on type and content.
Args:
question (str): The question text.
q_type (str): Question type (factual, conceptual, applied).
Returns:
str: Difficulty level (easy, medium, hard).
"""
if q_type == "factual":
return "easy"
elif q_type == "conceptual":
return "medium"
return "hard" # applied
def generate_questions_for_chunk(chunk: str, chunk_id: int, model: str = "gemini-2.0-flash") -> List[Dict]:
    """
    Generate three varied French questions (factual, conceptual, applied)
    for a document chunk using the Gemini API, and return them as a list
    of question records for the dataset.
    """
prompt = f"""
À partir du texte suivant d'un guide sur les vaccins en français, générez 3 questions variées (factual, conceptual, applied) qui couvrent le contenu de manière exhaustive.
Fournissez uniquement les questions, sans réponses, en français. Retournez le résultat au format JSON, entouré de ```json\n...\n```.
Texte : {chunk}
Exemple de sortie :
```json
[
{{
"question": "Combien de structures sanitaires de proximité sont impliquées dans le suivi de la vaccination ?",
"type": "factual"
}},
{{
"question": "Quel est l'impact de la réglementation de la vaccination sur la couverture vaccinale ?",
"type": "conceptual"
}},
{{
"question": "Quelles seraient les conséquences si les établissements privés ne suivaient plus la réglementation vaccinale ?",
"type": "applied"
}}
]
```
"""
try:
llm = GoogleGenerativeAI(
model=model,
google_api_key=os.getenv("GOOGLE_API_KEY")
)
        response = llm.invoke(prompt)
        questions_text = str(response).strip()  # Normalize to a plain string

        # Strip Markdown code fences around the JSON payload
        if questions_text.startswith("```json") and questions_text.endswith("```"):
            questions_text = questions_text[len("```json"):-3].strip()
        elif questions_text.startswith("```") and questions_text.endswith("```"):
            questions_text = questions_text[3:-3].strip()

        if not questions_text:
            print(f"Error: empty response for chunk {chunk_id}")
            return []

        questions = json.loads(questions_text)
formatted_questions = []
for q in questions:
question_id = str(uuid.uuid4())
difficulty = estimate_difficulty(q["question"], q["type"])
formatted_questions.append({
"question_id": question_id,
"chunk_id": chunk_id,
"chunk_text": chunk,
"question": q["question"],
"type": q["type"],
"difficulty": difficulty,
"training_purpose": "Knowledge Recall" if q["type"] == "factual" else "Reasoning",
"validated": False
})
# Update the global status
with generation_lock:
generation_status["questions_generated"] += len(formatted_questions)
return formatted_questions
    except json.JSONDecodeError as e:
        # JSONDecodeError must be caught before the generic Exception
        # handler, since it is an Exception subclass.
        print(f"Failed to parse the API response for chunk {chunk_id}: {e}")
        return []
    except Exception as e:
        print(f"Error while generating questions for chunk {chunk_id}: {e}")
        return []
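# Illustrative shape of one record returned by generate_questions_for_chunk
# (values are examples only):
#   {
#       "question_id": "a1b2c3d4-...",   # uuid4 string
#       "chunk_id": 0,
#       "chunk_text": "...",             # source chunk kept for traceability
#       "question": "Combien de structures sanitaires ... ?",
#       "type": "factual",
#       "difficulty": "easy",
#       "training_purpose": "Knowledge Recall",
#       "validated": False
#   }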
def generate_questions_in_background(chunks: List[str]):
"""
Generate questions in a background thread and update status.
"""
global generation_status
try:
all_questions = []
with generation_lock:
generation_status["total_chunks"] = len(chunks)
generation_status["processed_chunks"] = 0
generation_status["questions_generated"] = 0
for i, chunk in enumerate(chunks):
print(f"Processing chunk {i+1}/{len(chunks)}...")
questions = generate_questions_for_chunk(chunk, i)
all_questions.extend(questions)
with generation_lock:
generation_status["processed_chunks"] = i + 1
            time.sleep(9)  # Rate limiting: pace calls to roughly 6-7 requests/minute
dataset = {
"dataset_info": {
"title": "Vaccine Guide Question-Answer Dataset",
"description": "A dataset of question-answer pairs generated from a vaccine guide for AI language model training.",
"version": "1.1.0",
"created_date": datetime.utcnow().isoformat(),
"source": "Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.pdf",
"generated_by": "Gemini API",
"total_questions": len(all_questions),
"intended_use": "Fine-tuning medical language models for knowledge recall and reasoning"
},
"questions": all_questions
}
# Save the dataset
filename = f"vaccine_questions_{int(time.time())}.json"
with open(f"./{filename}", 'w', encoding='utf-8') as f:
json.dump(dataset, f, indent=4, ensure_ascii=False)
# Update status to completed
with generation_lock:
generation_status["completed"] = True
generation_status["is_running"] = False
generation_status["result_file"] = filename
except Exception as e:
print(f"Error in background generation: {e}")
with generation_lock:
generation_status["error"] = str(e)
generation_status["is_running"] = False
def save_dataset_to_space(dataset: Dict, filename: str):
    """
    Save a dataset to a JSON file in the Space's working directory.
    """
persistent_path = f"./{filename}"
with open(persistent_path, 'w', encoding='utf-8') as f:
json.dump(dataset, f, indent=4, ensure_ascii=False)
print(f"Dataset saved to {persistent_path}")
@app.get("/generate-questions")
async def generate_questions():
"""
Endpoint to generate questions from all JSON files in the data folder
"""
global generation_status
# Check if generation is already running
with generation_lock:
if generation_status["is_running"]:
return {
"status": "running",
"message": "Generation already in progress",
"current_status": generation_status
}
try:
# Reset status
with generation_lock:
generation_status["is_running"] = True
generation_status["start_time"] = datetime.utcnow().isoformat()
generation_status["processed_chunks"] = 0
generation_status["questions_generated"] = 0
generation_status["completed"] = False
generation_status["result_file"] = None
generation_status["error"] = None
        # Load all JSON files from the ./chunk folder
        json_files = glob.glob("./chunk/*.json")
        if not json_files:
            raise HTTPException(status_code=404, detail="No JSON files found in chunk folder")
all_chunks = []
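        # Accepted chunk-file shapes (illustrative):
        #   ["chunk text", ...]                     -> list of strings
        #   [{"text": "chunk text", ...}, ...]      -> list of dicts with a "text" key
        #   {"text": "..."} / {"content": "..."}    -> single dict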
for json_file in json_files:
with open(json_file, "r", encoding="utf-8") as f:
chunks_data = json.load(f)
if isinstance(chunks_data, list):
# If it's a list of chunks
for chunk in chunks_data:
if isinstance(chunk, dict) and "text" in chunk:
all_chunks.append(chunk["text"])
elif isinstance(chunk, str):
all_chunks.append(chunk)
elif isinstance(chunks_data, dict):
# If it's a dict, try to extract text content
if "text" in chunks_data:
all_chunks.append(chunks_data["text"])
elif "content" in chunks_data:
all_chunks.append(chunks_data["content"])
if not all_chunks:
raise HTTPException(status_code=404, detail="No text content found in JSON files")
        # Run generation in a background thread so the request returns immediately;
        # daemon=True lets the server process exit without waiting for the worker.
        thread = threading.Thread(target=generate_questions_in_background, args=(all_chunks,))
        thread.daemon = True
        thread.start()
return {
"status": "started",
"message": f"Question generation started for {len(json_files)} JSON files with {len(all_chunks)} chunks",
"current_status": generation_status
}
    except HTTPException:
        # Let the 404s raised above propagate with their original status codes
        with generation_lock:
            generation_status["is_running"] = False
        raise
    except Exception as e:
        with generation_lock:
            generation_status["is_running"] = False
            generation_status["error"] = str(e)
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/generation-status")
async def get_generation_status():
"""
Endpoint to check the current status of generation
"""
with generation_lock:
status_copy = generation_status.copy()
return status_copy
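# Illustrative /generation-status response mid-run (values are examples):
#   {"is_running": true, "start_time": "2024-01-01T00:00:00+00:00",
#    "processed_chunks": 3, "total_chunks": 40, "questions_generated": 9,
#    "completed": false, "result_file": null, "error": null}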
@app.get("/download/{filename}")
async def download_file(filename: str):
"""
Endpoint to download generated files
"""
file_path = f"./{filename}"
if os.path.exists(file_path):
return FileResponse(file_path, media_type="application/json", filename=filename)
raise HTTPException(status_code=404, detail="File not found")
@app.get("/")
async def root():
"""
Root endpoint that serves the HTML UI from the index.html file.
"""
return FileResponse("./index.html", media_type="text/html")
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=7860)
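
# Example usage once the server is running (assuming the host/port configured
# above):
#   curl http://localhost:7860/generate-questions    # start a background run
#   curl http://localhost:7860/generation-status     # poll progress
#   curl -O http://localhost:7860/download/vaccine_questions_<timestamp>.json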