LucianStorm committed on
Commit b59f9c5 · verified · 1 Parent(s): e1a117d

Update app.py

Files changed (1)
  1. app.py +66 -39
app.py CHANGED
@@ -16,52 +16,76 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-print("Loading model and tokenizer...")
+# Global variables
+model = None
+tokenizer = None
+MODEL_LOADED = False
 
-try:
-    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        torch_dtype=torch.float16,
-        low_cpu_mem_usage=True,
-        device_map='auto'
-    )
-
-    model.eval()
-    torch.backends.cudnn.benchmark = True
-    print("Model loaded successfully!")
-    MODEL_LOADED = True
-
-except Exception as e:
-    print(f"Error loading model: {e}")
-    MODEL_LOADED = False
+def load_model():
+    global model, tokenizer, MODEL_LOADED
+    try:
+        print("Starting model load...")
+        model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+
+        # CPU-specific settings
+        torch.set_num_threads(4)  # Limit CPU threads
+
+        print("Loading tokenizer...")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            local_files_only=False
+        )
+
+        print("Loading model...")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float32,  # Use float32 for CPU
+            low_cpu_mem_usage=True,
+            device_map=None  # Force CPU
+        )
+
+        model.eval()  # Set to evaluation mode
+        MODEL_LOADED = True
+        print("Model loaded successfully on CPU!")
+        return True
+    except Exception as e:
+        print(f"Error loading model: {str(e)}")
+        MODEL_LOADED = False
+        return False
 
+# Load model on startup
+print("Initiating model load...")
+load_model()
+
 class Query(BaseModel):
     prompt: str
-    max_length: int = 150  # Increased for better responses
-    temperature: float = 0.7  # Balanced temperature
+    max_length: int = 100  # Reduced for CPU
+    temperature: float = 0.7
 
 @app.post("/chat")
 async def chat(query: Query):
+    global model, tokenizer, MODEL_LOADED
+
     if not MODEL_LOADED:
-        raise HTTPException(status_code=503, detail="Model not loaded")
+        if not load_model():
+            raise HTTPException(
+                status_code=503,
+                detail="Model is not loaded. Please try again in a minute."
+            )
 
     try:
-        # Better prompt template
-        system_message = """You are a helpful fitness and nutrition assistant.
-        Provide clear, informative answers to help users with their fitness goals.
-        Be friendly but focused on giving practical advice."""
-
-        formatted_prompt = f"<|system|>{system_message}</s><|user|>{query.prompt}</s><|assistant|>"
+        # Simpler prompt template for efficiency
+        formatted_prompt = f"<|user|>{query.prompt}</s><|assistant|>"
 
+        # Tokenize with smaller context
         inputs = tokenizer(
             formatted_prompt,
             return_tensors="pt",
             truncation=True,
-            max_length=512  # Increased context window
-        ).to(model.device)
+            max_length=256  # Reduced context window for CPU
+        )
 
+        # Generate with CPU-optimized settings
         with torch.no_grad():
             outputs = model.generate(
                 inputs["input_ids"],
@@ -70,36 +94,39 @@ async def chat(query: Query):
                 top_p=0.9,
                 do_sample=True,
                 pad_token_id=tokenizer.eos_token_id,
-                no_repeat_ngram_size=3,  # Prevent repetition
-                num_beams=1  # Keep generation fast
+                num_beams=1,  # No beam search for speed
+                early_stopping=True
             )
 
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        # Clean up response
         response = response.split("<|assistant|>")[-1].strip()
 
-        # If response is too short, try to generate more
-        if len(response.split()) < 5:
-            return {"response": "I apologize, but could you please rephrase your question? I'll try to give a more helpful response."}
+        if not response or len(response.split()) < 3:
+            return {"response": "I apologize, could you please rephrase your question?"}
 
         return {"response": response}
-
+
     except Exception as e:
+        print(f"Error during generation: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/")
 def read_root():
     return {
         "status": "API is running!",
-        "model_loaded": MODEL_LOADED
+        "model_loaded": MODEL_LOADED,
+        "backend": "CPU"
     }
 
 @app.get("/debug")
 def debug_info():
     return {
         "model_loaded": MODEL_LOADED,
-        "model_name": model_name if MODEL_LOADED else None,
-        "device": str(next(model.parameters()).device) if MODEL_LOADED else None
+        "device": "cpu",
+        "num_threads": torch.get_num_threads(),
+        "memory_info": {
+            "max_memory": f"{torch.cuda.max_memory_allocated() / 1024**2:.2f}MB" if torch.cuda.is_available() else "CPU only"
+        }
     }
 
 if __name__ == "__main__":
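For reference, a minimal client-side sketch of how the updated endpoints could be exercised. The host and port are assumptions (the `__main__` block is truncated in the visible hunks; this presumes the usual uvicorn setup on `localhost:8000`), and the `max_length`/`temperature` fields are sent only because the `Query` schema declares them — the hunks shown do not include the generate-call lines that would consume them.

```python
# Hypothetical smoke test for the committed API; BASE_URL is an assumption,
# not something shown in the diff.
import requests

BASE_URL = "http://localhost:8000"

# /debug now reports CPU info in place of the removed model_name/device fields
print(requests.get(f"{BASE_URL}/debug").json())

# /chat accepts the Query schema: prompt plus optional max_length/temperature
payload = {"prompt": "Suggest a quick post-workout snack.", "max_length": 100}
resp = requests.post(f"{BASE_URL}/chat", json=payload)
resp.raise_for_status()  # surfaces the 503 raised when load_model() fails
print(resp.json()["response"])
```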