Spaces:

Zeggai
/

Question_Answer_Dataset

Sleeping

App Files Files Community

Zeggai Abdellah committed on May 4, 2025

Commit

91dbc3c

1 Parent(s): ffaeec5

test simple version

Browse files

Files changed (1) hide show

app.py +90 -246

app.py CHANGED Viewed

@@ -1,79 +1,26 @@
-from fastapi import FastAPI, HTTPException, BackgroundTasks, Query
-from fastapi.responses import JSONResponse
-from typing import List, Dict, Optional
 import json
 import time
 import uuid
 from datetime import datetime
 import os
-from pydantic import BaseModel
-import google.generativeai as genai
-from enum import Enum
-import asyncio
-from fastapi.middleware.cors import CORSMiddleware
-app = FastAPI(title="Vaccine Question Generator API")
-# Add CORS middleware
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],  # Allow all origins
-    allow_credentials=True,
-    allow_methods=["*"],  # Allow all methods
-    allow_headers=["*"],  # Allow all headers
-)
-# Global variables to track generation state
-generation_status = {
-    "is_running": False,
-    "total_chunks": 0,
-    "processed_chunks": 0,
-    "current_chunk_id": None,
-    "start_time": None,
-    "end_time": None,
-    "errors": [],
-    "result_file": None
-}
-# Chunks file path (will be configurable via API)
-CHUNKS_PATH = "Data/Processed_Data/chunks.json"
-# API Key (will be set via environment variable or API)
-GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
-# Model type options
-class ModelType(str, Enum):
-    GEMINI_FLASH = "gemini-2.0-flash"
-    GEMINI_PRO = "gemini-1.5-pro"
-# Request schema for starting generation
-class GenerationRequest(BaseModel):
-    chunks_path: Optional[str] = None
-    api_key: Optional[str] = None
-    model: ModelType = ModelType.GEMINI_FLASH
-    output_file: str = "vaccine_questions_dataset.json"
-# Response schema for status updates
-class GenerationStatus(BaseModel):
-    is_running: bool
-    total_chunks: int
-    processed_chunks: int
-    current_chunk_id: Optional[int]
-    progress_percentage: float
-    start_time: Optional[str]
-    end_time: Optional[str]
-    estimated_time_remaining: Optional[str]
-    errors: List[str]
-    result_file: Optional[str]
 def estimate_difficulty(question: str, q_type: str) -> str:
     """
     Estimate question difficulty based on type and content.
     Args:
         question (str): The question text.
         q_type (str): Question type (factual, conceptual, applied).
     Returns:
         str: Difficulty level (easy, medium, hard).
     """
@@ -83,25 +30,27 @@ def estimate_difficulty(question: str, q_type: str) -> str:
         return "medium"
     return "hard"  # applied
-async def generate_questions_for_chunk(chunk: str, chunk_id: int, client, model: str) -> List[Dict]:
     """
     Generate French questions for a given document chunk using the Gemini API.
     Args:
         chunk (str): A chunk of text from the vaccine guide (in French).
         chunk_id (int): Chunk identifier.
         client: Gemini API client instance.
         model (str): Model name for Gemini API.
     Returns:
         List[Dict]: List of questions with metadata.
     """
     prompt = f"""
-    À partir du texte suivant d'un guide sur les vaccins en français, générez 3 questions variées (factual, conceptual, applied) qui couvrent le contenu de manière exhaustive.
     Fournissez uniquement les questions, sans réponses, en français. Retournez le résultat au format JSON, entouré de ```json\n...\n```.
     Texte : {chunk}
     Exemple de sortie :
     ```json
     [
@@ -120,32 +69,43 @@ async def generate_questions_for_chunk(chunk: str, chunk_id: int, client, model:
     ]
     ```
     """
     try:
-        # Update global state
-        generation_status["current_chunk_id"] = chunk_id
-        # Generate response using Gemini
-        response = client.generate_content(
             model=model,
-            contents=prompt,
         )
-        # Parse the response
-        questions_text = response.text if hasattr(response, 'text') else ""
         # Strip Markdown code fences
         if questions_text.startswith("```json\n") and questions_text.endswith("\n```"):
             questions_text = questions_text[7:-4].strip()
         elif questions_text.startswith("```") and questions_text.endswith("```"):
             questions_text = questions_text[3:-3].strip()
         # Parse JSON
         if not questions_text:
-            error_msg = f"Erreur: Réponse vide pour le chunk {chunk_id}"
-            generation_status["errors"].append(error_msg)
             return []
         questions = json.loads(questions_text)
         formatted_questions = []
@@ -162,189 +122,73 @@ async def generate_questions_for_chunk(chunk: str, chunk_id: int, client, model:
                 "training_purpose": "Knowledge Recall" if q["type"] == "factual" else "Reasoning",
                 "validated": False  # Flag for expert review
             })
-        # Update count of processed chunks
-        generation_status["processed_chunks"] += 1
         return formatted_questions
     except Exception as e:
-        error_msg = f"Error generating questions for chunk {chunk_id}: {str(e)}"
-        generation_status["errors"].append(error_msg)
         return []
-async def generate_questions_for_document(chunks: List[str], model: str, output_file: str, client) -> Dict:
     """
     Generate questions for all document chunks and structure as a scientific dataset.
     Args:
         chunks (List[str]): List of document chunks.
-        model (str): Model name for Gemini API.
-        output_file (str): File to save the results.
-        client: Gemini API client.
     Returns:
         Dict: Dataset with header and questions.
     """
     all_questions = []
-    # Reset/initialize the global state
-    generation_status["is_running"] = True
-    generation_status["total_chunks"] = len(chunks)
-    generation_status["processed_chunks"] = 0
-    generation_status["start_time"] = datetime.utcnow().isoformat()
-    generation_status["errors"] = []
-    generation_status["current_chunk_id"] = None
-    generation_status["end_time"] = None
-    generation_status["result_file"] = None
-    try:
-        for i, chunk in enumerate(chunks):
-            # Process each chunk
-            questions = await generate_questions_for_chunk(chunk, i, client, model)
-            all_questions.extend(questions)
-            # Rate limiting
-            await asyncio.sleep(9)
-        # Create dataset with scientific structure
-        dataset = {
-            "dataset_info": {
-                "title": "Vaccine Guide Question-Answer Dataset",
-                "description": "A dataset of question-answer pairs generated from a vaccine guide for AI language model training.",
-                "version": "1.1.0",
-                "created_date": datetime.utcnow().isoformat(),
-                "source": "Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.pdf",
-                "generated_by": f"Gemini API ({model})",
-                "total_questions": len(all_questions),
-                "intended_use": "Fine-tuning medical language models for knowledge recall and reasoning"
-            },
-            "questions": all_questions
-        }
-        # Save the dataset
-        with open(output_file, 'w', encoding='utf-8') as f:
-            json.dump(dataset, f, indent=4, ensure_ascii=False)
-        # Update final state
-        generation_status["end_time"] = datetime.utcnow().isoformat()
-        generation_status["result_file"] = output_file
-        return dataset
-    except Exception as e:
-        generation_status["errors"].append(f"Error in document generation: {str(e)}")
-        raise e
-    finally:
-        generation_status["is_running"] = False
-async def background_generation_task(chunks_path: str, model: str, output_file: str, api_key: str = None):
-    """Background task for generating questions"""
-    try:
-        # Configure the client
-        if api_key:
-            genai.configure(api_key=api_key)
-        elif GOOGLE_API_KEY:
-            genai.configure(api_key=GOOGLE_API_KEY)
-        else:
-            raise ValueError("No API key provided for Gemini")
-        # Load chunks
-        with open(chunks_path, "r", encoding="utf-8") as f:
-            chunks_data = json.load(f)
-        # Extract texts from chunks
-        chunks = [chunk["text"] for chunk in chunks_data]
-        # Start generation process
-        await generate_questions_for_document(chunks, model, output_file, genai)
-    except Exception as e:
-        generation_status["errors"].append(f"Background task error: {str(e)}")
-        generation_status["is_running"] = False
-@app.post("/generate", response_model=GenerationStatus)
-async def start_generation(request: GenerationRequest, background_tasks: BackgroundTasks):
-    """Start the question generation process"""
-    # Check if generation is already running
-    if generation_status["is_running"]:
-        raise HTTPException(status_code=400, detail="Generation process is already running")
-    # Set up paths and configurations
-    chunks_path = request.chunks_path or CHUNKS_PATH
-    api_key = request.api_key or GOOGLE_API_KEY
-    model = request.model
-    output_file = request.output_file
-    # Validate that chunks file exists
-    if not os.path.exists(chunks_path):
-        raise HTTPException(status_code=404, detail=f"Chunks file not found at {chunks_path}")
-    # Validate API key is available
-    if not api_key:
-        raise HTTPException(status_code=400, detail="No API key provided")
-    # Start background generation task
-    background_tasks.add_task(
-        background_generation_task,
-        chunks_path,
-        model,
-        output_file,
-        api_key
-    )
-    # Return initial status
-    return get_generation_status()
-@app.get("/status", response_model=GenerationStatus)
-async def get_generation_status():
-    """Get the current status of the question generation process"""
-    # Calculate progress percentage
-    total = generation_status["total_chunks"]
-    processed = generation_status["processed_chunks"]
-    progress_percentage = (processed / total * 100) if total > 0 else 0
-    # Calculate estimated time remaining
-    etr = None
-    if (generation_status["is_running"] and
-        generation_status["start_time"] and
-        processed > 0):
-        start_time = datetime.fromisoformat(generation_status["start_time"])
-        time_elapsed = (datetime.utcnow() - start_time).total_seconds()
-        time_per_chunk = time_elapsed / processed
-        remaining_chunks = total - processed
-        etr_seconds = time_per_chunk * remaining_chunks
-        etr = f"{int(etr_seconds // 60)}m {int(etr_seconds % 60)}s"
-    # Return formatted status
-    return GenerationStatus(
-        is_running=generation_status["is_running"],
-        total_chunks=total,
-        processed_chunks=processed,
-        current_chunk_id=generation_status["current_chunk_id"],
-        progress_percentage=round(progress_percentage, 2),
-        start_time=generation_status["start_time"],
-        end_time=generation_status["end_time"],
-        estimated_time_remaining=etr,
-        errors=generation_status["errors"],
-        result_file=generation_status["result_file"]
-    )
-@app.get("/")
-async def root():
-    """Root endpoint with API information"""
-    return {
-        "name": "Vaccine Question Generator API",
-        "description": "API for generating question-answer pairs from vaccine guide chunks",
-        "endpoints": [
-            {"path": "/", "method": "GET", "description": "This information page"},
-            {"path": "/generate", "method": "POST", "description": "Start question generation process"},
-            {"path": "/status", "method": "GET", "description": "Get current generation status"}
-        ]
-    }
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)

+from fastapi import FastAPI
 import json
+from dotenv import load_dotenv
+import requests
 import time
 import uuid
+from typing import List, Dict
 from datetime import datetime
+# Load environment variables from .env file
+load_dotenv()
+from langchain_google_genai import GoogleGenerativeAI
 import os
+app = FastAPI()
 def estimate_difficulty(question: str, q_type: str) -> str:
     """
     Estimate question difficulty based on type and content.
     Args:
         question (str): The question text.
         q_type (str): Question type (factual, conceptual, applied).
     Returns:
         str: Difficulty level (easy, medium, hard).
     """
         return "medium"
     return "hard"  # applied
+def generate_questions_for_chunk(chunk: str, chunk_id: int,  model="gemini-2.0-flash") -> List[Dict]:
     """
     Generate French questions for a given document chunk using the Gemini API.
     Args:
         chunk (str): A chunk of text from the vaccine guide (in French).
         chunk_id (int): Chunk identifier.
+        api_key (str): Gemini API key.
         client: Gemini API client instance.
         model (str): Model name for Gemini API.
     Returns:
         List[Dict]: List of questions with metadata.
     """
     prompt = f"""
+    À partir du texte suivant d'un guide sur les vaccins en français, générez 3 questions variées (factual, conceptual, applied) qui couvrent le contenu de manière exhaustive.
     Fournissez uniquement les questions, sans réponses, en français. Retournez le résultat au format JSON, entouré de ```json\n...\n```.
     Texte : {chunk}
     Exemple de sortie :
     ```json
     [
     ]
     ```
     """
     try:
+        # Initialize the LLM - using GoogleGenerativeAI instead of ChatGoogleGenerativeAI
+        llm = GoogleGenerativeAI(
             model=model,
+            google_api_key=os.getenv("GOOGLE_API_KEY")
         )
+        # Generate response using langchain
+        response = llm.invoke(prompt)
+        # Debug: Print raw response to inspect structure
+        print(f"Raw response for chunk {chunk_id}: {response}")
+        # Parse the response (adjust based on actual Gemini API response structure)
+        questions_text = ""
+        if hasattr(response, 'candidates') and response.candidates:
+            questions_text = response.candidates[0].content.parts[0].text if response.candidates[0].content.parts else ""
+        # Debug: Print extracted text
+        print(f"Extracted questions_text for chunk {chunk_id}: {questions_text}")
         # Strip Markdown code fences
         if questions_text.startswith("```json\n") and questions_text.endswith("\n```"):
             questions_text = questions_text[7:-4].strip()
         elif questions_text.startswith("```") and questions_text.endswith("```"):
             questions_text = questions_text[3:-3].strip()
+        # Debug: Print cleaned text
+        print(f"Cleaned questions_text for chunk {chunk_id}: {questions_text}")
         # Parse JSON
         if not questions_text:
+            print(f"Erreur: Réponse vide pour le chunk {chunk_id}")
             return []
         questions = json.loads(questions_text)
         formatted_questions = []
                 "training_purpose": "Knowledge Recall" if q["type"] == "factual" else "Reasoning",
                 "validated": False  # Flag for expert review
             })
         return formatted_questions
     except Exception as e:
+        print(f"Erreur lors de la génération des questions pour le chunk {chunk_id}: {e}")
+        return []
+    except json.JSONDecodeError as e:
+        print(f"Erreur de parsing de la réponse API pour le chunk {chunk_id}: {e}")
         return []
def generate_questions_for_document(chunks: List[str], delay_seconds: float = 9) -> Dict:
    """
    Generate questions for all document chunks and structure as a scientific dataset.

    Args:
        chunks (List[str]): List of document chunks.
        delay_seconds (float): Pause between two consecutive chunk requests
            (rate limiting). Defaults to 9, the previously hard-coded value.

    Returns:
        Dict: Dataset with a "dataset_info" header and a "questions" list.
    """
    # Local import: the file only imports `datetime` from the datetime module.
    from datetime import timezone

    all_questions = []
    total_chunks = len(chunks)
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{total_chunks}...")
        all_questions.extend(generate_questions_for_chunk(chunk, i))
        # Rate limiting: only wait when another request will actually follow,
        # so the run does not end with a pointless pause after the last chunk.
        if i < total_chunks - 1 and delay_seconds > 0:
            time.sleep(delay_seconds)

    # Same naive-UTC ISO string as datetime.utcnow().isoformat(), without
    # calling the deprecated utcnow().
    created_date = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    # Create dataset with scientific structure
    dataset = {
        "dataset_info": {
            "title": "Vaccine Guide Question-Answer Dataset",
            "description": "A dataset of question-answer pairs generated from a vaccine guide for AI language model training.",
            "version": "1.1.0",
            "created_date": created_date,
            "source": "Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.pdf",
            "generated_by": "Gemini API",
            "total_questions": len(all_questions),
            "intended_use": "Fine-tuning medical language models for knowledge recall and reasoning"
        },
        "questions": all_questions
    }
    return dataset
def save_dataset(dataset: Dict, output_file: str):
    """
    Write the dataset to disk as indented, UTF-8 encoded JSON.

    Args:
        dataset (Dict): The dataset to serialize.
        output_file (str): Destination path of the JSON file.
    """
    with open(output_file, mode='w', encoding='utf-8') as handle:
        # ensure_ascii=False keeps accented characters readable in the file
        json.dump(dataset, handle, ensure_ascii=False, indent=4)
    print(f"Dataset saved to {output_file}")
 if __name__ == "__main__":
     import uvicorn
+    # Load the chunks from the JSON file
+    with open("Data/Processed_Data/chunks.json", "r", encoding="utf-8") as f:
+        chunks_data = json.load(f)
+    VACCINE_CHUNKS=[chunks_data[0]["text"]]
+    dataset = generate_questions_for_document(VACCINE_CHUNKS)
+    save_dataset(dataset, "vaccine_questions.json")
+    # Run the FastAPI app
     uvicorn.run(app, host="0.0.0.0", port=7860)