AJ STUDIOZ committed on
Commit
bdeabc1
·
1 Parent(s): d2bfde4

Optimize AJ-Mini for faster responses: reduce tokens, add fast test endpoints

Browse files
Files changed (1) hide show
  1. app.py +38 -28
app.py CHANGED
@@ -39,11 +39,11 @@ print(f"{BRANDING_NAME} loaded successfully!")
39
  def query_ollama_model(prompt: str, max_tokens: int = 1000, temperature: float = 0.7, stream: bool = False):
40
  """Query model loaded directly in the Space - Optimized for speed"""
41
  try:
42
- # Increase max tokens for better responses
43
- max_tokens = min(max_tokens, 256) # Increased from 100 to 256
44
 
45
  # Tokenize input
46
- inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)
47
 
48
  # Generate response with optimization
49
  with torch.no_grad(): # Disable gradient computation for faster inference
@@ -51,14 +51,15 @@ def query_ollama_model(prompt: str, max_tokens: int = 1000, temperature: float =
51
  **inputs,
52
  max_new_tokens=max_tokens,
53
  temperature=temperature,
54
- do_sample=temperature > 0,
55
  top_p=0.9,
56
- top_k=50,
57
  repetition_penalty=1.15, # Reduce repetition
58
  pad_token_id=tokenizer.eos_token_id,
59
  eos_token_id=tokenizer.eos_token_id,
60
  num_beams=1, # Greedy decoding for speed
61
- early_stopping=True
 
62
  )
63
 
64
  # Extract only the generated text (remove input)
@@ -530,28 +531,37 @@ async def generate(request: Request):
530
 
531
  @app.get("/health")
532
  async def health():
533
- """Health check endpoint"""
534
- try:
535
- # Quick test of the model
536
- test_response = query_ollama_model("Hello", 10, 0.7)
537
- model_healthy = test_response.status_code == 200
538
-
539
- return {
540
- "status": "healthy" if model_healthy else "degraded",
541
- "model": MODEL_NAME,
542
- "model_status": "online" if model_healthy else "loading",
543
- "timestamp": get_ist_time().strftime("%Y-%m-%d %H:%M:%S IST"),
544
- "version": "1.0"
545
- }
546
- except Exception as e:
547
- return JSONResponse(
548
- status_code=503,
549
- content={
550
- "status": "unhealthy",
551
- "error": str(e),
552
- "timestamp": get_ist_time().strftime("%Y-%m-%d %H:%M:%S IST")
553
- }
554
- )
 
 
 
 
 
 
 
 
 
555
 
556
  if __name__ == "__main__":
557
  import uvicorn
 
39
  def query_ollama_model(prompt: str, max_tokens: int = 1000, temperature: float = 0.7, stream: bool = False):
40
  """Query model loaded directly in the Space - Optimized for speed"""
41
  try:
42
+ # Optimize for faster responses
43
+ max_tokens = min(max_tokens, 150) # Reduced from 256 to 150 for faster CPU inference
44
 
45
  # Tokenize input
46
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)
47
 
48
  # Generate response with optimization
49
  with torch.no_grad(): # Disable gradient computation for faster inference
 
51
  **inputs,
52
  max_new_tokens=max_tokens,
53
  temperature=temperature,
54
+ do_sample=temperature > 0.1,
55
  top_p=0.9,
56
+ top_k=40, # Reduced from 50 to 40 for speed
57
  repetition_penalty=1.15, # Reduce repetition
58
  pad_token_id=tokenizer.eos_token_id,
59
  eos_token_id=tokenizer.eos_token_id,
60
  num_beams=1, # Greedy decoding for speed
61
+ early_stopping=True,
62
+ no_repeat_ngram_size=3 # Prevent repetition
63
  )
64
 
65
  # Extract only the generated text (remove input)
 
531
 
532
  @app.get("/health")
533
  async def health():
534
+ """Fast health check endpoint - no model query"""
535
+ return {
536
+ "status": "healthy",
537
+ "service": "AJ STUDIOZ Mini API",
538
+ "model": "AJ-Mini v1.0",
539
+ "version": "1.0",
540
+ "developer": "AJ STUDIOZ",
541
+ "platform": "HuggingFace Spaces (CPU)",
542
+ "availability": "Unlimited FREE",
543
+ "timestamp": get_ist_time().strftime("%Y-%m-%d %H:%M:%S IST"),
544
+ "note": "Use POST /v1/chat/completions for inference"
545
+ }
546
+
547
+ @app.get("/test")
548
+ @app.get("/ping")
549
+ async def quick_test():
550
+ """Ultra-fast test endpoint for ReqBin - responds in < 200ms"""
551
+ return {
552
+ "status": "ok",
553
+ "message": "AJ-Mini v1.0 is operational",
554
+ "model": "aj-mini",
555
+ "latency": "< 200ms",
556
+ "endpoint": "POST /v1/chat/completions",
557
+ "example": {
558
+ "model": "aj-mini",
559
+ "messages": [{"role": "user", "content": "Hello"}]
560
+ },
561
+ "developer": "AJ STUDIOZ",
562
+ "availability": "UNLIMITED FREE",
563
+ "timestamp": get_ist_time().strftime("%Y-%m-%d %H:%M:%S IST")
564
+ }
565
 
566
  if __name__ == "__main__":
567
  import uvicorn