Spaces:

aledraa
/

generate_api

Sleeping

App Files Files Community

aledraa committed on Jun 23, 2025

Commit

a1b4668

verified ·

1 Parent(s): 1ac97be

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -11

app.py CHANGED Viewed

@@ -4,20 +4,47 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import json
 import random
 from typing import List, Optional
 app = FastAPI(title="Qwen Data Generator API")
-# Load model and tokenizer
 model_name = "Qwen/Qwen2.5-3B-Instruct"
-print("Loading model...")
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype="auto",
-    device_map="auto"
-)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-print("Model loaded successfully!")
 class GenerationRequest(BaseModel):
     llm_commands: List[str]
@@ -47,6 +74,11 @@ JSON Array:"""
 @app.post("/generate", response_model=GenerationResponse)
 async def generate_data(request: GenerationRequest):
     try:
         # Set seed for reproducibility if provided
         if request.seed:
@@ -70,7 +102,11 @@ async def generate_data(request: GenerationRequest):
         )
         # Tokenize and generate
-        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
         with torch.no_grad():
             generated_ids = model.generate(
@@ -78,7 +114,8 @@ async def generate_data(request: GenerationRequest):
                 max_new_tokens=2048,
                 temperature=0.8,
                 do_sample=True,
-                pad_token_id=tokenizer.eos_token_id
             )
         # Decode response

 import torch
 import json
 import random
+import os
 from typing import List, Optional
 app = FastAPI(title="Qwen Data Generator API")
+# Global variables for model and tokenizer
+model = None
+tokenizer = None
 model_name = "Qwen/Qwen2.5-3B-Instruct"
+def load_model():
+    """Load model and tokenizer with proper error handling"""
+    global model, tokenizer
+    try:
+        print("Loading model...")
+        print(f"Cache directory: {os.environ.get('HF_HOME', 'Not set')}")
+        # Load tokenizer first (smaller download)
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            trust_remote_code=True
+        )
+        print("Tokenizer loaded successfully!")
+        # Load model with specific configurations for better compatibility
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16,  # Use float16 to save memory
+            device_map="auto",
+            trust_remote_code=True,
+            low_cpu_mem_usage=True
+        )
+        print("Model loaded successfully!")
+    except Exception as e:
+        print(f"Error loading model: {str(e)}")
+        raise e
+# Load model on startup
+load_model()
 class GenerationRequest(BaseModel):
     llm_commands: List[str]
 @app.post("/generate", response_model=GenerationResponse)
 async def generate_data(request: GenerationRequest):
+    global model, tokenizer
+    if model is None or tokenizer is None:
+        raise HTTPException(status_code=503, detail="Model not loaded")
     try:
         # Set seed for reproducibility if provided
         if request.seed:
         )
         # Tokenize and generate
+        model_inputs = tokenizer([text], return_tensors="pt")
+        # Move inputs to same device as model
+        if torch.cuda.is_available():
+            model_inputs = model_inputs.to('cuda')
         with torch.no_grad():
             generated_ids = model.generate(
                 max_new_tokens=2048,
                 temperature=0.8,
                 do_sample=True,
+                pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id
             )
         # Decode response