david167 committed on
Commit
203ee8d
·
1 Parent(s): 444b4d9

Switch to FLAN-T5-Large: uses standard HF storage, excellent for question generation

Browse files
Files changed (1) hide show
  1. app.py +46 -51
app.py CHANGED
@@ -6,7 +6,7 @@ from typing import List, Optional, Dict, Any
6
  from contextlib import asynccontextmanager
7
 
8
  import torch
9
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
10
  import uvicorn
11
  from fastapi import FastAPI, HTTPException, BackgroundTasks
12
  from fastapi.middleware.cors import CORSMiddleware
@@ -55,15 +55,26 @@ async def load_model_with_retry(model_name: str, hf_token: str, max_retries: int
55
  token=hf_token
56
  )
57
 
58
- model = AutoModelForCausalLM.from_pretrained(
59
- model_name,
60
- torch_dtype=torch.float16 if device == "cuda" else torch.float32,
61
- device_map="auto" if device == "cuda" else None,
62
- trust_remote_code=True,
63
- low_cpu_mem_usage=True,
64
- use_safetensors=True, # Force safetensors to avoid CVE-2025-32434
65
- token=hf_token
66
- )
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  return tokenizer, model
69
 
@@ -101,8 +112,8 @@ async def load_model():
101
  try:
102
  logger.info("Loading model with transformers...")
103
 
104
- # Use Llama 3.1 8B Instruct from official HF storage (not XetHub)
105
- base_model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
106
 
107
  tokenizer, model = await load_model_with_retry(base_model_name, hf_token)
108
 
@@ -170,42 +181,29 @@ app.add_middleware(
170
  )
171
 
172
  def create_question_prompt(statement: str, num_questions: int, difficulty_level: str) -> str:
173
- """Create a prompt for question generation with reasoning"""
174
 
175
  difficulty_instruction = {
176
- "easy": "Generate simple, straightforward questions that test basic understanding.",
177
- "medium": "Generate questions that require some analysis and comprehension.",
178
- "hard": "Generate complex questions that require deep thinking and reasoning.",
179
- "mixed": "Generate a mix of easy, medium, and hard questions."
180
  }
181
 
182
- system_prompt = """You are a deep thinking AI, you may use extremely long chains of thought to deeply consider the problem and deliberate with yourself via systematic reasoning processes to help come to a correct solution prior to answering. You should enclose your thoughts and internal monologue inside <think> </think> tags, and then provide your solution or response to the problem.
183
-
184
- You are an expert educator and question generator. Your task is to create thoughtful, well-crafted questions from given statements."""
185
-
186
- user_prompt = f"""<think>
187
- I need to analyze this statement and generate {num_questions} high-quality questions. Let me think about:
188
- 1. The key concepts and information in the statement
189
- 2. Different types of questions I can ask (factual, analytical, inferential, evaluative)
190
- 3. The difficulty level requested: {difficulty_level}
191
- 4. How to make questions that promote understanding and critical thinking
192
- </think>
193
-
194
- Based on the following statement, generate exactly {num_questions} questions.
195
 
196
- Statement: "{statement}"
197
 
198
  Requirements:
199
- - {difficulty_instruction[difficulty_level]}
200
- - Questions should be clear, well-formed, and grammatically correct
201
- - Vary the question types (what, how, why, when, where, etc.)
202
- - Each question should test different aspects of the statement
203
- - Make questions engaging and thought-provoking
204
  - Number each question (1., 2., 3., etc.)
 
205
 
206
- Generate the questions now:"""
207
 
208
- return f"{system_prompt}\n\n{user_prompt}"
209
 
210
  def extract_questions(generated_text: str) -> List[str]:
211
  """Extract questions from the generated text"""
@@ -275,27 +273,24 @@ async def generate_questions(request: QuestionGenerationRequest):
275
  )
276
 
277
  # Generate response using transformers
278
- inputs = tokenizer.encode(prompt, return_tensors="pt")
279
  if device == "cuda":
280
  inputs = inputs.to(device)
281
 
282
  with torch.no_grad():
 
283
  outputs = model.generate(
284
- inputs,
285
- max_new_tokens=request.max_length,
286
  temperature=request.temperature,
287
  top_p=0.95,
288
- top_k=40,
289
- repetition_penalty=1.1,
290
  do_sample=True,
291
- pad_token_id=tokenizer.eos_token_id,
292
- eos_token_id=tokenizer.eos_token_id,
293
  )
294
 
295
- # Decode the generated text
296
- full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
297
- # Remove the input prompt from the response
298
- generated_text = full_response[len(prompt):].strip()
299
  logger.info(f"Generated text length: {len(generated_text)}")
300
 
301
  # Extract questions from the generated text
@@ -313,7 +308,7 @@ async def generate_questions(request: QuestionGenerationRequest):
313
  questions.append(f"What is the main point of this statement: '{request.statement[:100]}...'?")
314
 
315
  metadata = {
316
- "model": "DavidAU/Llama-3.1-1-million-ctx-DeepHermes-Deep-Reasoning-8B-GGUF",
317
  "temperature": request.temperature,
318
  "difficulty_level": request.difficulty_level,
319
  "generated_text_length": len(generated_text),
@@ -337,7 +332,7 @@ async def root():
337
  """Root endpoint with basic info"""
338
  return {
339
  "message": "Question Generation API",
340
- "model": "DavidAU/Llama-3.1-1-million-ctx-DeepHermes-Deep-Reasoning-8B-GGUF",
341
  "endpoints": {
342
  "health": "/health",
343
  "generate": "/generate-questions",
 
6
  from contextlib import asynccontextmanager
7
 
8
  import torch
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, BitsAndBytesConfig
10
  import uvicorn
11
  from fastapi import FastAPI, HTTPException, BackgroundTasks
12
  from fastapi.middleware.cors import CORSMiddleware
 
55
  token=hf_token
56
  )
57
 
58
+ # Use Seq2Seq model for T5-based models, CausalLM for others
59
+ if "flan-t5" in model_name.lower() or "t5" in model_name.lower():
60
+ model = AutoModelForSeq2SeqLM.from_pretrained(
61
+ model_name,
62
+ torch_dtype=torch.float16 if device == "cuda" else torch.float32,
63
+ device_map="auto" if device == "cuda" else None,
64
+ trust_remote_code=True,
65
+ low_cpu_mem_usage=True,
66
+ token=hf_token
67
+ )
68
+ else:
69
+ model = AutoModelForCausalLM.from_pretrained(
70
+ model_name,
71
+ torch_dtype=torch.float16 if device == "cuda" else torch.float32,
72
+ device_map="auto" if device == "cuda" else None,
73
+ trust_remote_code=True,
74
+ low_cpu_mem_usage=True,
75
+ use_safetensors=True, # Force safetensors to avoid CVE-2025-32434
76
+ token=hf_token
77
+ )
78
 
79
  return tokenizer, model
80
 
 
112
  try:
113
  logger.info("Loading model with transformers...")
114
 
115
+ # Use FLAN-T5 Large - excellent for question generation and uses standard HF storage
116
+ base_model_name = "google/flan-t5-large"
117
 
118
  tokenizer, model = await load_model_with_retry(base_model_name, hf_token)
119
 
 
181
  )
182
 
183
  def create_question_prompt(statement: str, num_questions: int, difficulty_level: str) -> str:
184
+ """Create a prompt for question generation optimized for T5/FLAN models"""
185
 
186
  difficulty_instruction = {
187
+ "easy": "simple, straightforward questions that test basic understanding",
188
+ "medium": "questions that require some analysis and comprehension",
189
+ "hard": "complex questions that require deep thinking and reasoning",
190
+ "mixed": "a mix of easy, medium, and hard questions"
191
  }
192
 
193
+ # T5/FLAN models work better with direct, concise instructions
194
+ prompt = f"""Generate {num_questions} {difficulty_instruction[difficulty_level]} about this statement:
 
 
 
 
 
 
 
 
 
 
 
195
 
196
+ "{statement}"
197
 
198
  Requirements:
199
+ - Clear, well-formed questions
200
+ - Vary question types (what, how, why, when, where)
 
 
 
201
  - Number each question (1., 2., 3., etc.)
202
+ - End each question with a question mark
203
 
204
+ Questions:"""
205
 
206
+ return prompt
207
 
208
  def extract_questions(generated_text: str) -> List[str]:
209
  """Extract questions from the generated text"""
 
273
  )
274
 
275
  # Generate response using transformers
276
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
277
  if device == "cuda":
278
  inputs = inputs.to(device)
279
 
280
  with torch.no_grad():
281
+ # T5 models use generate differently - they don't include input in output
282
  outputs = model.generate(
283
+ **inputs,
284
+ max_new_tokens=min(request.max_length, 512),
285
  temperature=request.temperature,
286
  top_p=0.95,
 
 
287
  do_sample=True,
288
+ num_beams=1,
289
+ early_stopping=True
290
  )
291
 
292
+ # Decode the generated text (T5 doesn't include input prompt in output)
293
+ generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
294
  logger.info(f"Generated text length: {len(generated_text)}")
295
 
296
  # Extract questions from the generated text
 
308
  questions.append(f"What is the main point of this statement: '{request.statement[:100]}...'?")
309
 
310
  metadata = {
311
+ "model": "google/flan-t5-large",
312
  "temperature": request.temperature,
313
  "difficulty_level": request.difficulty_level,
314
  "generated_text_length": len(generated_text),
 
332
  """Root endpoint with basic info"""
333
  return {
334
  "message": "Question Generation API",
335
+ "model": "google/flan-t5-large",
336
  "endpoints": {
337
  "health": "/health",
338
  "generate": "/generate-questions",