Zeggai Abdellah committed on
Commit
ffaeec5
·
1 Parent(s): 30bafd5

first commit

Browse files
Files changed (5) hide show
  1. .gitignore +1 -0
  2. Data/Processed_Data/chunks.json +0 -0
  3. Dockerfile +34 -0
  4. app.py +350 -0
  5. requirements.txt +0 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
Data/Processed_Data/chunks.json ADDED
The diff for this file is too large to render. See raw diff
 
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use a Python 3.9 base image
FROM python:3.9-slim

# Set working directory
WORKDIR /code

# Copy requirements first so the dependency layer is cached across rebuilds
COPY ./requirements.txt /code/requirements.txt

# Install dependencies
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Create a non-root user for security
RUN useradd -m -u 1000 user

# Set up directories and permissions (app writes the dataset under /code)
RUN mkdir -p /code/Data/Processed_Data && chown -R user:user /code

USER user

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Copy all project files with correct ownership
# (redundant second "WORKDIR /code" removed — it is already in effect)
COPY --chown=user . /code

# Expose port 7860 (Hugging Face default)
EXPOSE 7860

# Run the FastAPI app with uvicorn
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException, BackgroundTasks, Query
2
+ from fastapi.responses import JSONResponse
3
+ from typing import List, Dict, Optional
4
+ import json
5
+ import time
6
+ import uuid
7
+ from datetime import datetime
8
+ import os
9
+ from pydantic import BaseModel
10
+ import google.generativeai as genai
11
+ from enum import Enum
12
+ import asyncio
13
+ from fastapi.middleware.cors import CORSMiddleware
14
+
15
# FastAPI application instance; served by uvicorn on port 7860 (see Dockerfile CMD).
app = FastAPI(title="Vaccine Question Generator API")

# Add CORS middleware so browser-based clients on any origin can call the API.
# NOTE(review): per the CORS spec, browsers reject allow_origins=["*"] when
# allow_credentials=True and credentials are actually sent — confirm whether
# credentialed requests are needed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)
25
+
26
# Global, in-process state for the current generation run.
# Mutated by the background task and read by the /status endpoint; a single
# run at a time is enforced by /generate checking "is_running".
generation_status = {
    "is_running": False,       # True while a generation run is in progress
    "total_chunks": 0,         # number of chunks loaded for the current run
    "processed_chunks": 0,     # chunks finished so far
    "current_chunk_id": None,  # index of the chunk currently being processed
    "start_time": None,        # ISO-8601 UTC timestamp when the run started
    "end_time": None,          # ISO-8601 UTC timestamp when the run finished
    "errors": [],              # accumulated human-readable error messages
    "result_file": None        # path of the written dataset once complete
}

# Default chunks file path; a request may override it via chunks_path.
CHUNKS_PATH = "Data/Processed_Data/chunks.json"

# Gemini API key from the environment; a request may override it via api_key.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
43
+
44
# Model type options
class ModelType(str, Enum):
    """Gemini model identifiers accepted by the /generate endpoint."""
    GEMINI_FLASH = "gemini-2.0-flash"
    GEMINI_PRO = "gemini-1.5-pro"
48
+
49
# Request schema for starting generation
class GenerationRequest(BaseModel):
    """Payload for POST /generate."""
    chunks_path: Optional[str] = None  # overrides CHUNKS_PATH when provided
    api_key: Optional[str] = None      # overrides the GOOGLE_API_KEY env value
    model: ModelType = ModelType.GEMINI_FLASH  # Gemini model to query
    output_file: str = "vaccine_questions_dataset.json"  # dataset destination
55
+
56
# Response schema for status updates
class GenerationStatus(BaseModel):
    """Snapshot of the generation run, returned by /generate and /status."""
    is_running: bool                 # whether a run is currently active
    total_chunks: int                # chunks in the loaded dataset
    processed_chunks: int            # chunks completed so far
    current_chunk_id: Optional[int]  # chunk currently in flight, if any
    progress_percentage: float       # processed/total as a percentage
    start_time: Optional[str]        # ISO-8601 UTC start timestamp
    end_time: Optional[str]          # ISO-8601 UTC end timestamp
    estimated_time_remaining: Optional[str]  # e.g. "3m 12s"; None when unknown
    errors: List[str]                # accumulated error messages
    result_file: Optional[str]       # output path once the run has finished
68
+
69
def estimate_difficulty(question: str, q_type: str) -> str:
    """
    Map a question type to a difficulty level.

    Args:
        question (str): The question text (not used by the current heuristic).
        q_type (str): Question type (factual, conceptual, applied).

    Returns:
        str: Difficulty level (easy, medium, hard).
    """
    # "applied" — and any unrecognized type — falls through to "hard".
    return {"factual": "easy", "conceptual": "medium"}.get(q_type, "hard")
85
+
86
async def generate_questions_for_chunk(chunk: str, chunk_id: int, client, model: str) -> List[Dict]:
    """
    Generate French questions for a given document chunk using the Gemini API.

    Args:
        chunk (str): A chunk of text from the vaccine guide (in French).
        chunk_id (int): Chunk identifier.
        client: The ``google.generativeai`` module (already configured with an API key).
        model (str): Model name for the Gemini API.

    Returns:
        List[Dict]: List of questions with metadata. On any failure the error
        is appended to ``generation_status["errors"]`` and an empty list is
        returned, so one bad chunk does not abort the whole run.
    """
    prompt = f"""
À partir du texte suivant d'un guide sur les vaccins en français, générez 3 questions variées (factual, conceptual, applied) qui couvrent le contenu de manière exhaustive.
Fournissez uniquement les questions, sans réponses, en français. Retournez le résultat au format JSON, entouré de ```json\n...\n```.

Texte : {chunk}

Exemple de sortie :
```json
[
{{
"question": "Combien de structures sanitaires de proximité sont impliquées dans le suivi de la vaccination ?",
"type": "factual"
}},
{{
"question": "Quel est l'impact de la réglementation de la vaccination sur la couverture vaccinale ?",
"type": "conceptual"
}},
{{
"question": "Quelles seraient les conséquences si les établissements privés ne suivaient plus la réglementation vaccinale ?",
"type": "applied"
}}
]
```
"""

    try:
        # Expose which chunk is in flight so /status can report it.
        generation_status["current_chunk_id"] = chunk_id

        # BUG FIX: the google.generativeai module has no top-level
        # generate_content() function; a GenerativeModel must be instantiated
        # for the requested model name and queried with the prompt. The old
        # call raised AttributeError on every chunk, which the broad except
        # below silently recorded, yielding an empty dataset.
        gemini_model = client.GenerativeModel(model)
        response = gemini_model.generate_content(prompt)

        # Parse the response
        questions_text = response.text if hasattr(response, 'text') else ""

        # Strip the Markdown code fences the prompt asks for.
        if questions_text.startswith("```json\n") and questions_text.endswith("\n```"):
            questions_text = questions_text[7:-4].strip()
        elif questions_text.startswith("```") and questions_text.endswith("```"):
            questions_text = questions_text[3:-3].strip()

        # An empty reply cannot be parsed as JSON; record and skip this chunk.
        if not questions_text:
            error_msg = f"Erreur: Réponse vide pour le chunk {chunk_id}"
            generation_status["errors"].append(error_msg)
            return []

        questions = json.loads(questions_text)

        formatted_questions = []
        for q in questions:
            question_id = str(uuid.uuid4())
            difficulty = estimate_difficulty(q["question"], q["type"])
            formatted_questions.append({
                "question_id": question_id,
                "chunk_id": chunk_id,
                "chunk_text": chunk,
                "question": q["question"],
                "type": q["type"],
                "difficulty": difficulty,
                "training_purpose": "Knowledge Recall" if q["type"] == "factual" else "Reasoning",
                "validated": False  # Flag for expert review
            })

        # Update count of processed chunks
        generation_status["processed_chunks"] += 1

        return formatted_questions

    except Exception as e:
        error_msg = f"Error generating questions for chunk {chunk_id}: {str(e)}"
        generation_status["errors"].append(error_msg)
        return []
175
+
176
async def generate_questions_for_document(chunks: List[str], model: str, output_file: str, client) -> Dict:
    """
    Generate questions for all document chunks and structure them as a dataset.

    Args:
        chunks (List[str]): List of document chunks.
        model (str): Model name for the Gemini API.
        output_file (str): File to save the results.
        client: The ``google.generativeai`` module (configured beforehand).

    Returns:
        Dict: Dataset with a ``dataset_info`` header and a ``questions`` list.

    Raises:
        Exception: Re-raised after being recorded in ``generation_status["errors"]``.
    """
    all_questions = []

    # Reset the shared status so /status reflects only this run.
    generation_status["is_running"] = True
    generation_status["total_chunks"] = len(chunks)
    generation_status["processed_chunks"] = 0
    generation_status["start_time"] = datetime.utcnow().isoformat()
    generation_status["errors"] = []
    generation_status["current_chunk_id"] = None
    generation_status["end_time"] = None
    generation_status["result_file"] = None

    try:
        last_index = len(chunks) - 1
        for i, chunk in enumerate(chunks):
            # Process each chunk sequentially (per-chunk errors yield []).
            questions = await generate_questions_for_chunk(chunk, i, client, model)
            all_questions.extend(questions)

            # Rate limiting between API calls; no need to wait after the
            # final chunk (the old code always slept, adding a pointless 9 s).
            if i < last_index:
                await asyncio.sleep(9)

        # Create dataset with scientific structure
        dataset = {
            "dataset_info": {
                "title": "Vaccine Guide Question-Answer Dataset",
                "description": "A dataset of question-answer pairs generated from a vaccine guide for AI language model training.",
                "version": "1.1.0",
                "created_date": datetime.utcnow().isoformat(),
                "source": "Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.pdf",
                "generated_by": f"Gemini API ({model})",
                "total_questions": len(all_questions),
                "intended_use": "Fine-tuning medical language models for knowledge recall and reasoning"
            },
            "questions": all_questions
        }

        # Save the dataset
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(dataset, f, indent=4, ensure_ascii=False)

        # Update final state
        generation_status["end_time"] = datetime.utcnow().isoformat()
        generation_status["result_file"] = output_file

        return dataset
    except Exception as e:
        generation_status["errors"].append(f"Error in document generation: {str(e)}")
        raise  # bare raise preserves the original traceback ("raise e" did not)
    finally:
        generation_status["is_running"] = False
239
+
240
async def background_generation_task(chunks_path: str, model: str, output_file: str, api_key: str = None):
    """
    Background task: configure Gemini, load the chunks file, and run the
    full generation pipeline.

    Args:
        chunks_path (str): Path to the JSON file containing chunk objects.
        model (str): Gemini model name.
        output_file (str): Destination file for the generated dataset.
        api_key (str, optional): Per-request API key; falls back to the
            module-level GOOGLE_API_KEY environment value.

    Errors are recorded in ``generation_status["errors"]`` instead of being
    raised, since no caller awaits this task.
    """
    try:
        # Configure the client (the request-supplied key takes precedence).
        if api_key:
            genai.configure(api_key=api_key)
        elif GOOGLE_API_KEY:
            genai.configure(api_key=GOOGLE_API_KEY)
        else:
            raise ValueError("No API key provided for Gemini")

        # Load chunks
        with open(chunks_path, "r", encoding="utf-8") as f:
            chunks_data = json.load(f)

        # Extract texts from chunks
        # NOTE(review): assumes every chunk object has a "text" key — confirm
        # against the structure of Data/Processed_Data/chunks.json.
        chunks = [chunk["text"] for chunk in chunks_data]

        # Start generation process (passes the genai module as the client).
        await generate_questions_for_document(chunks, model, output_file, genai)
    except Exception as e:
        generation_status["errors"].append(f"Background task error: {str(e)}")
        generation_status["is_running"] = False
263
+
264
@app.post("/generate", response_model=GenerationStatus)
async def start_generation(request: GenerationRequest, background_tasks: BackgroundTasks):
    """
    Start the question generation process in a background task.

    Raises:
        HTTPException: 400 if a run is already active or no API key is
            available; 404 if the chunks file does not exist.

    Returns:
        GenerationStatus: Status snapshot taken right after scheduling.
        The background task may not have started yet, so is_running can
        still be False in this response.
    """
    # Only one run at a time: all progress lives in one global status dict.
    if generation_status["is_running"]:
        raise HTTPException(status_code=400, detail="Generation process is already running")

    # Resolve configuration, preferring request values over module defaults.
    chunks_path = request.chunks_path or CHUNKS_PATH
    api_key = request.api_key or GOOGLE_API_KEY
    model = request.model
    output_file = request.output_file

    # Validate that chunks file exists
    if not os.path.exists(chunks_path):
        raise HTTPException(status_code=404, detail=f"Chunks file not found at {chunks_path}")

    # Validate API key is available
    if not api_key:
        raise HTTPException(status_code=400, detail="No API key provided")

    # Schedule the generation to run after this response is sent.
    background_tasks.add_task(
        background_generation_task,
        chunks_path,
        model,
        output_file,
        api_key
    )

    # BUG FIX: get_generation_status is a coroutine function; without await
    # this returned a coroutine object instead of a GenerationStatus, which
    # fails response_model serialization.
    return await get_generation_status()
296
+
297
@app.get("/status", response_model=GenerationStatus)
async def get_generation_status():
    """Return a snapshot of the question-generation progress."""
    total = generation_status["total_chunks"]
    processed = generation_status["processed_chunks"]

    # Fraction of chunks completed, as a percentage (0 before any run).
    pct = 0 if total <= 0 else processed / total * 100

    # Estimate remaining time from the average time spent per chunk so far;
    # only meaningful while a run is active and at least one chunk is done.
    etr = None
    started = generation_status["start_time"]
    if generation_status["is_running"] and started and processed > 0:
        elapsed = (datetime.utcnow() - datetime.fromisoformat(started)).total_seconds()
        seconds_left = elapsed / processed * (total - processed)
        minutes, seconds = divmod(int(seconds_left), 60)
        etr = f"{minutes}m {seconds}s"

    # Assemble the typed response from the shared status dict.
    return GenerationStatus(
        is_running=generation_status["is_running"],
        total_chunks=total,
        processed_chunks=processed,
        current_chunk_id=generation_status["current_chunk_id"],
        progress_percentage=round(pct, 2),
        start_time=started,
        end_time=generation_status["end_time"],
        estimated_time_remaining=etr,
        errors=generation_status["errors"],
        result_file=generation_status["result_file"]
    )
333
+
334
@app.get("/")
async def root():
    """Root endpoint with API information"""
    # Catalogue of the available routes, surfaced for discoverability.
    endpoints = [
        {"path": "/", "method": "GET", "description": "This information page"},
        {"path": "/generate", "method": "POST", "description": "Start question generation process"},
        {"path": "/status", "method": "GET", "description": "Get current generation status"},
    ]
    return {
        "name": "Vaccine Question Generator API",
        "description": "API for generating question-answer pairs from vaccine guide chunks",
        "endpoints": endpoints,
    }
346
+
347
+
348
if __name__ == "__main__":
    # Local development entry point; in the Docker image, uvicorn is started
    # by the CMD instruction instead, on the same host/port.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
Binary file (208 Bytes). View file