Spaces:
Sleeping
Sleeping
Switch to Llama-3.1-8B-Instruct: update model loading, prompts, and generation parameters
Browse files- app.py +6 -6
- gradio_app.py +6 -6
app.py
CHANGED
|
@@ -6,7 +6,7 @@ from typing import List, Optional, Dict, Any
|
|
| 6 |
from contextlib import asynccontextmanager
|
| 7 |
|
| 8 |
import torch
|
| 9 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM,
|
| 10 |
import uvicorn
|
| 11 |
from fastapi import FastAPI, HTTPException, BackgroundTasks
|
| 12 |
from fastapi.middleware.cors import CORSMiddleware
|
|
@@ -57,7 +57,7 @@ async def load_model_with_retry(model_name: str, hf_token: str, max_retries: int
|
|
| 57 |
|
| 58 |
# Use Seq2Seq model for T5-based models, CausalLM for others
|
| 59 |
if "flan-t5" in model_name.lower() or "t5" in model_name.lower():
|
| 60 |
-
model =
|
| 61 |
model_name,
|
| 62 |
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
|
| 63 |
device_map="auto" if device == "cuda" else None,
|
|
@@ -113,7 +113,7 @@ async def load_model():
|
|
| 113 |
logger.info("Loading model with transformers...")
|
| 114 |
|
| 115 |
# Use FLAN-T5 Large - excellent for question generation and uses standard HF storage
|
| 116 |
-
base_model_name = "
|
| 117 |
|
| 118 |
tokenizer, model = await load_model_with_retry(base_model_name, hf_token)
|
| 119 |
|
|
@@ -281,7 +281,7 @@ async def generate_questions(request: QuestionGenerationRequest):
|
|
| 281 |
# T5 models use generate differently - they don't include input in output
|
| 282 |
outputs = model.generate(
|
| 283 |
**inputs,
|
| 284 |
-
max_new_tokens=min(request.max_length,
|
| 285 |
temperature=request.temperature,
|
| 286 |
top_p=0.95,
|
| 287 |
do_sample=True,
|
|
@@ -308,7 +308,7 @@ async def generate_questions(request: QuestionGenerationRequest):
|
|
| 308 |
questions.append(f"What is the main point of this statement: '{request.statement[:100]}...'?")
|
| 309 |
|
| 310 |
metadata = {
|
| 311 |
-
"model": "
|
| 312 |
"temperature": request.temperature,
|
| 313 |
"difficulty_level": request.difficulty_level,
|
| 314 |
"generated_text_length": len(generated_text),
|
|
@@ -332,7 +332,7 @@ async def root():
|
|
| 332 |
"""Root endpoint with basic info"""
|
| 333 |
return {
|
| 334 |
"message": "Question Generation API",
|
| 335 |
-
"model": "
|
| 336 |
"endpoints": {
|
| 337 |
"health": "/health",
|
| 338 |
"generate": "/generate-questions",
|
|
|
|
| 6 |
from contextlib import asynccontextmanager
|
| 7 |
|
| 8 |
import torch
|
| 9 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
| 10 |
import uvicorn
|
| 11 |
from fastapi import FastAPI, HTTPException, BackgroundTasks
|
| 12 |
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
| 57 |
|
| 58 |
# NOTE(review): branch is still keyed on T5-style names but now loads AutoModelForCausalLM below; a true T5 checkpoint would need AutoModelForSeq2SeqLM (no longer imported) — dead path with the current Llama model name, confirm before reuse
|
| 59 |
if "flan-t5" in model_name.lower() or "t5" in model_name.lower():
|
| 60 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 61 |
model_name,
|
| 62 |
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
|
| 63 |
device_map="auto" if device == "cuda" else None,
|
|
|
|
| 113 |
logger.info("Loading model with transformers...")
|
| 114 |
|
| 115 |
# Use Llama-3.1-8B-Instruct - strong instruction-following model for question generation (gated repo; requires HF_TOKEN)
|
| 116 |
+
base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
|
| 117 |
|
| 118 |
tokenizer, model = await load_model_with_retry(base_model_name, hf_token)
|
| 119 |
|
|
|
|
| 281 |
# NOTE(review): stale comment — Llama is a causal LM, so generate() output includes the prompt tokens; decoding should slice off the input length before parsing questions
|
| 282 |
outputs = model.generate(
|
| 283 |
**inputs,
|
| 284 |
+
max_new_tokens=min(request.max_length, 1024),
|
| 285 |
temperature=request.temperature,
|
| 286 |
top_p=0.95,
|
| 287 |
do_sample=True,
|
|
|
|
| 308 |
questions.append(f"What is the main point of this statement: '{request.statement[:100]}...'?")
|
| 309 |
|
| 310 |
metadata = {
|
| 311 |
+
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
| 312 |
"temperature": request.temperature,
|
| 313 |
"difficulty_level": request.difficulty_level,
|
| 314 |
"generated_text_length": len(generated_text),
|
|
|
|
| 332 |
"""Root endpoint with basic info"""
|
| 333 |
return {
|
| 334 |
"message": "Question Generation API",
|
| 335 |
+
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
| 336 |
"endpoints": {
|
| 337 |
"health": "/health",
|
| 338 |
"generate": "/generate-questions",
|
gradio_app.py
CHANGED
|
@@ -6,7 +6,7 @@ from typing import List, Optional, Dict, Any
|
|
| 6 |
import threading
|
| 7 |
|
| 8 |
import torch
|
| 9 |
-
from transformers import AutoTokenizer,
|
| 10 |
import gradio as gr
|
| 11 |
|
| 12 |
# Configure logging
|
|
@@ -43,8 +43,8 @@ class ModelManager:
|
|
| 43 |
# Get HF token from environment
|
| 44 |
hf_token = os.getenv("HF_TOKEN")
|
| 45 |
|
| 46 |
-
logger.info("Loading
|
| 47 |
-
base_model_name = "
|
| 48 |
|
| 49 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
| 50 |
base_model_name,
|
|
@@ -53,7 +53,7 @@ class ModelManager:
|
|
| 53 |
token=hf_token
|
| 54 |
)
|
| 55 |
|
| 56 |
-
self.model =
|
| 57 |
base_model_name,
|
| 58 |
torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
|
| 59 |
device_map="auto" if self.device == "cuda" else None,
|
|
@@ -237,7 +237,7 @@ with gr.Blocks(css=css, title="Question Generation AI", theme=gr.themes.Soft())
|
|
| 237 |
gr.Markdown(
|
| 238 |
"""
|
| 239 |
# 🤖 Question Generation AI
|
| 240 |
-
### Powered by
|
| 241 |
|
| 242 |
Enter any statement or text, and I'll generate thoughtful questions about it. Perfect for creating study materials, assessments, or exploring topics deeper!
|
| 243 |
"""
|
|
@@ -320,7 +320,7 @@ with gr.Blocks(css=css, title="Question Generation AI", theme=gr.themes.Soft())
|
|
| 320 |
"""
|
| 321 |
---
|
| 322 |
<div style="text-align: center; color: #666; font-size: 0.9em;">
|
| 323 |
-
Built with ❤️ using Gradio and
|
| 324 |
<a href="/docs" target="_blank">API Documentation</a>
|
| 325 |
</div>
|
| 326 |
"""
|
|
|
|
| 6 |
import threading
|
| 7 |
|
| 8 |
import torch
|
| 9 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 10 |
import gradio as gr
|
| 11 |
|
| 12 |
# Configure logging
|
|
|
|
| 43 |
# Get HF token from environment
|
| 44 |
hf_token = os.getenv("HF_TOKEN")
|
| 45 |
|
| 46 |
+
logger.info("Loading Llama-3.1-8B-Instruct model...")
|
| 47 |
+
base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
|
| 48 |
|
| 49 |
self.tokenizer = AutoTokenizer.from_pretrained(
|
| 50 |
base_model_name,
|
|
|
|
| 53 |
token=hf_token
|
| 54 |
)
|
| 55 |
|
| 56 |
+
self.model = AutoModelForCausalLM.from_pretrained(
|
| 57 |
base_model_name,
|
| 58 |
torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
|
| 59 |
device_map="auto" if self.device == "cuda" else None,
|
|
|
|
| 237 |
gr.Markdown(
|
| 238 |
"""
|
| 239 |
# 🤖 Question Generation AI
|
| 240 |
+
### Powered by Llama-3.1-8B-Instruct
|
| 241 |
|
| 242 |
Enter any statement or text, and I'll generate thoughtful questions about it. Perfect for creating study materials, assessments, or exploring topics deeper!
|
| 243 |
"""
|
|
|
|
| 320 |
"""
|
| 321 |
---
|
| 322 |
<div style="text-align: center; color: #666; font-size: 0.9em;">
|
| 323 |
+
Built with ❤️ using Gradio and Llama-3.1-8B-Instruct •
|
| 324 |
<a href="/docs" target="_blank">API Documentation</a>
|
| 325 |
</div>
|
| 326 |
"""
|