Spaces:

Zeggai
/

Question_Answer_Dataset

Sleeping

App Files Community

Zeggai Abdellah committed on May 4, 2025

Commit

bc69312

1 Parent(s): 9ec487c

Try to fix the trigger for the generation

Browse files

Files changed (2) hide show

app.py +88 -59
requirements.txt +0 -0

app.py CHANGED Viewed

@@ -1,15 +1,17 @@
-from fastapi import FastAPI
 import json
 from dotenv import load_dotenv
-import requests
 import time
 import uuid
 from typing import List, Dict
 from datetime import datetime
 # Load environment variables from .env file
 load_dotenv()
 from langchain_google_genai import GoogleGenerativeAI
-import os
 app = FastAPI()
@@ -30,20 +32,9 @@ def estimate_difficulty(question: str, q_type: str) -> str:
         return "medium"
     return "hard"  # applied
-def generate_questions_for_chunk(chunk: str, chunk_id: int,  model="gemini-2.0-flash") -> List[Dict]:
     """
     Generate French questions for a given document chunk using the Gemini API.
-    Args:
-        chunk (str): A chunk of text from the vaccine guide (in French).
-        chunk_id (int): Chunk identifier.
-        api_key (str): Gemini API key.
-        client: Gemini API client instance.
-        model (str): Model name for Gemini API.
-    Returns:
-        List[Dict]: List of questions with metadata.
     """
     prompt = f"""
     À partir du texte suivant d'un guide sur les vaccins en français, générez 3 questions variées (factual, conceptual, applied) qui couvrent le contenu de manière exhaustive.
@@ -71,26 +62,14 @@ def generate_questions_for_chunk(chunk: str, chunk_id: int,  model="gemini-2.0-f
     """
     try:
-        # Initialize the LLM - using GoogleGenerativeAI instead of ChatGoogleGenerativeAI
         llm = GoogleGenerativeAI(
             model=model,
             google_api_key=os.getenv("GOOGLE_API_KEY")
         )
-        # Generate response using langchain
         response = llm.invoke(prompt)
-        # Debug: Print raw response to inspect structure
-        print(f"Raw response for chunk {chunk_id}: {response}")
-        # Parse the response (adjust based on actual Gemini API response structure)
-        questions_text = ""
-        if hasattr(response, 'candidates') and response.candidates:
-            questions_text = response.candidates[0].content.parts[0].text if response.candidates[0].content.parts else ""
-        # Debug: Print extracted text
-        print(f"Extracted questions_text for chunk {chunk_id}: {questions_text}")
         # Strip Markdown code fences
         if questions_text.startswith("```json\n") and questions_text.endswith("\n```"):
@@ -98,10 +77,6 @@ def generate_questions_for_chunk(chunk: str, chunk_id: int,  model="gemini-2.0-f
         elif questions_text.startswith("```") and questions_text.endswith("```"):
             questions_text = questions_text[3:-3].strip()
-        # Debug: Print cleaned text
-        print(f"Cleaned questions_text for chunk {chunk_id}: {questions_text}")
-        # Parse JSON
         if not questions_text:
             print(f"Erreur: Réponse vide pour le chunk {chunk_id}")
             return []
@@ -120,7 +95,7 @@ def generate_questions_for_chunk(chunk: str, chunk_id: int,  model="gemini-2.0-f
                 "type": q["type"],
                 "difficulty": difficulty,
                 "training_purpose": "Knowledge Recall" if q["type"] == "factual" else "Reasoning",
-                "validated": False  # Flag for expert review
             })
         return formatted_questions
@@ -132,16 +107,9 @@ def generate_questions_for_chunk(chunk: str, chunk_id: int,  model="gemini-2.0-f
         print(f"Erreur de parsing de la réponse API pour le chunk {chunk_id}: {e}")
         return []
-def generate_questions_for_document(chunks: List[str],) -> Dict:
     """
     Generate questions for all document chunks and structure as a scientific dataset.
-    Args:
-        chunks (List[str]): List of document chunks.
-        api_key (str): Gemini API key.
-    Returns:
-        Dict: Dataset with header and questions.
     """
     all_questions = []
@@ -151,7 +119,6 @@ def generate_questions_for_document(chunks: List[str],) -> Dict:
         all_questions.extend(questions)
         time.sleep(9)  # Rate limiting
-    # Create dataset with scientific structure
     dataset = {
         "dataset_info": {
             "title": "Vaccine Guide Question-Answer Dataset",
@@ -168,27 +135,89 @@ def generate_questions_for_document(chunks: List[str],) -> Dict:
     return dataset
-def save_dataset(dataset: Dict, output_file: str):
     """
-    Save dataset to a JSON file.
-    Args:
-        dataset (Dict): The dataset to save.
-        output_file (str): Path to output JSON file.
     """
-    with open(output_file, 'w', encoding='utf-8') as f:
         json.dump(dataset, f, indent=4, ensure_ascii=False)
-    print(f"Dataset saved to {output_file}")
-if __name__ == "__main__":
-    import uvicorn
-    # Load the chunks from the JSON file
-    with open("Data/Processed_Data/chunks.json", "r", encoding="utf-8") as f:
-        chunks_data = json.load(f)
-    VACCINE_CHUNKS=[chunks_data[0]["text"]]
-    dataset = generate_questions_for_document(VACCINE_CHUNKS)
-    save_dataset(dataset, "vaccine_questions.json")
-    # Run the FastAPI app
     uvicorn.run(app, host="0.0.0.0", port=7860)

+from fastapi import FastAPI, HTTPException, FileResponse
 import json
 from dotenv import load_dotenv
 import time
 import uuid
 from typing import List, Dict
 from datetime import datetime
+from huggingface_hub import HfApi  # For file persistence in Spaces
+import os
 # Load environment variables from .env file
 load_dotenv()
 from langchain_google_genai import GoogleGenerativeAI
 app = FastAPI()
         return "medium"
     return "hard"  # applied
+def generate_questions_for_chunk(chunk: str, chunk_id: int, model="gemini-2.0-flash") -> List[Dict]:
     """
     Generate French questions for a given document chunk using the Gemini API.
     """
     prompt = f"""
     À partir du texte suivant d'un guide sur les vaccins en français, générez 3 questions variées (factual, conceptual, applied) qui couvrent le contenu de manière exhaustive.
     """
     try:
         llm = GoogleGenerativeAI(
             model=model,
             google_api_key=os.getenv("GOOGLE_API_KEY")
         )
         response = llm.invoke(prompt)
+        questions_text = str(response)  # Convert response to string
         # Strip Markdown code fences
         if questions_text.startswith("```json\n") and questions_text.endswith("\n```"):
         elif questions_text.startswith("```") and questions_text.endswith("```"):
             questions_text = questions_text[3:-3].strip()
         if not questions_text:
             print(f"Erreur: Réponse vide pour le chunk {chunk_id}")
             return []
                 "type": q["type"],
                 "difficulty": difficulty,
                 "training_purpose": "Knowledge Recall" if q["type"] == "factual" else "Reasoning",
+                "validated": False
             })
         return formatted_questions
         print(f"Erreur de parsing de la réponse API pour le chunk {chunk_id}: {e}")
         return []
+def generate_questions_for_document(chunks: List[str]) -> Dict:
     """
     Generate questions for all document chunks and structure as a scientific dataset.
     """
     all_questions = []
         all_questions.extend(questions)
         time.sleep(9)  # Rate limiting
     dataset = {
         "dataset_info": {
             "title": "Vaccine Guide Question-Answer Dataset",
     return dataset
+def save_dataset_to_space(dataset: Dict, filename: str):
     """
+    Save dataset to a file in the Space's persistent storage
     """
+    persistent_path = f"/home/user/{filename}"
+    with open(persistent_path, 'w', encoding='utf-8') as f:
         json.dump(dataset, f, indent=4, ensure_ascii=False)
+    print(f"Dataset saved to {persistent_path}")
+    # Optionally upload to Space files
+    try:
+        api = HfApi(token=os.getenv("HF_TOKEN"))
+        api.upload_file(
+            path_or_fileobj=persistent_path,
+            path_in_repo=filename,
+            repo_id=os.getenv("SPACE_ID"),
+            repo_type="space"
+        )
+        print(f"File {filename} uploaded to Space")
+    except Exception as e:
+        print(f"Could not upload to Space: {e}")
+@app.get("/generate-questions")
+async def generate_questions():
+    """
+    Endpoint to generate questions from the vaccine guide chunks
+    """
+    try:
+        # Try to load chunks from different possible locations
+        chunks_paths = [
+            "Data/Processed_Data/chunks.json",
+            "chunks.json",
+            "/home/user/chunks.json"
+        ]
+        chunks_data = None
+        for path in chunks_paths:
+            try:
+                with open(path, "r", encoding="utf-8") as f:
+                    chunks_data = json.load(f)
+                break
+            except FileNotFoundError:
+                continue
+        if chunks_data is None:
+            raise HTTPException(status_code=404, detail="Chunks file not found in any known location")
+        VACCINE_CHUNKS = [chunks_data[0]["text"]]
+        dataset = generate_questions_for_document(VACCINE_CHUNKS)
+        # Save to persistent storage
+        filename = "vaccine_questions.json"
+        save_dataset_to_space(dataset, filename)
+        return {
+            "status": "success",
+            "message": "Questions generated successfully",
+            "dataset_info": dataset["dataset_info"],
+            "download_url": f"/download/{filename}"
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/download/{filename}")
+async def download_file(filename: str):
+    """
+    Endpoint to download generated files
+    """
+    file_path = f"/home/user/{filename}"
+    if os.path.exists(file_path):
+        return FileResponse(file_path, media_type="application/json", filename=filename)
+    raise HTTPException(status_code=404, detail="File not found")
+@app.get("/")
+async def root():
+    return {
+        "message": "Vaccine Question Generator API",
+        "endpoints": {
+            "POST /generate-questions": "Generate questions from vaccine guide",
+            "GET /download/{filename}": "Download generated files"
+        }
+    }
+if __name__ == "__main__":
+    import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)

requirements.txt CHANGED Viewed

Binary files a/requirements.txt and b/requirements.txt differ