AJ STUDIOZ committed on
Commit
bdeabc1
·
1 Parent(s): d2bfde4

Optimize AJ-Mini for faster responses: reduce tokens, add fast test endpoints

Browse files
Files changed (1) hide show
  1. app.py +38 -28
app.py CHANGED
@@ -39,11 +39,11 @@ print(f"{BRANDING_NAME} loaded successfully!")
39
  def query_ollama_model(prompt: str, max_tokens: int = 1000, temperature: float = 0.7, stream: bool = False):
40
  """Query model loaded directly in the Space - Optimized for speed"""
41
  try:
42
- # Increase max tokens for better responses
43
- max_tokens = min(max_tokens, 256) # Increased from 100 to 256
44
 
45
  # Tokenize input
46
- inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)
47
 
48
  # Generate response with optimization
49
  with torch.no_grad(): # Disable gradient computation for faster inference
@@ -51,14 +51,15 @@ def query_ollama_model(prompt: str, max_tokens: int = 1000, temperature: float =
51
  **inputs,
52
  max_new_tokens=max_tokens,
53
  temperature=temperature,
54
- do_sample=temperature > 0,
55
  top_p=0.9,
56
- top_k=50,
57
  repetition_penalty=1.15, # Reduce repetition
58
  pad_token_id=tokenizer.eos_token_id,
59
  eos_token_id=tokenizer.eos_token_id,
60
  num_beams=1, # Greedy decoding for speed
61
- early_stopping=True
 
62
  )
63
 
64
  # Extract only the generated text (remove input)
@@ -530,28 +531,37 @@ async def generate(request: Request):
530
 
531
  @app.get("/health")
532
  async def health():
533
- """Health check endpoint"""
534
- try:
535
- # Quick test of the model
536
- test_response = query_ollama_model("Hello", 10, 0.7)
537
- model_healthy = test_response.status_code == 200
538
-
539
- return {
540
- "status": "healthy" if model_healthy else "degraded",
541
- "model": MODEL_NAME,
542
- "model_status": "online" if model_healthy else "loading",
543
- "timestamp": get_ist_time().strftime("%Y-%m-%d %H:%M:%S IST"),
544
- "version": "1.0"
545
- }
546
- except Exception as e:
547
- return JSONResponse(
548
- status_code=503,
549
- content={
550
- "status": "unhealthy",
551
- "error": str(e),
552
- "timestamp": get_ist_time().strftime("%Y-%m-%d %H:%M:%S IST")
553
- }
554
- )
 
 
 
 
 
 
 
 
 
555
 
556
  if __name__ == "__main__":
557
  import uvicorn
 
39
  def query_ollama_model(prompt: str, max_tokens: int = 1000, temperature: float = 0.7, stream: bool = False):
40
  """Query model loaded directly in the Space - Optimized for speed"""
41
  try:
42
+ # Optimize for faster responses
43
+ max_tokens = min(max_tokens, 150) # Reduced from 256 to 150 for faster CPU inference
44
 
45
  # Tokenize input
46
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)
47
 
48
  # Generate response with optimization
49
  with torch.no_grad(): # Disable gradient computation for faster inference
 
51
  **inputs,
52
  max_new_tokens=max_tokens,
53
  temperature=temperature,
54
+ do_sample=temperature > 0.1,
55
  top_p=0.9,
56
+ top_k=40, # Reduced from 50 to 40 for speed
57
  repetition_penalty=1.15, # Reduce repetition
58
  pad_token_id=tokenizer.eos_token_id,
59
  eos_token_id=tokenizer.eos_token_id,
60
  num_beams=1, # Greedy decoding for speed
61
+ early_stopping=True,
62
+ no_repeat_ngram_size=3 # Prevent repetition
63
  )
64
 
65
  # Extract only the generated text (remove input)
 
531
 
532
  @app.get("/health")
533
  async def health():
534
+ """Fast health check endpoint - no model query"""
535
+ return {
536
+ "status": "healthy",
537
+ "service": "AJ STUDIOZ Mini API",
538
+ "model": "AJ-Mini v1.0",
539
+ "version": "1.0",
540
+ "developer": "AJ STUDIOZ",
541
+ "platform": "HuggingFace Spaces (CPU)",
542
+ "availability": "Unlimited FREE",
543
+ "timestamp": get_ist_time().strftime("%Y-%m-%d %H:%M:%S IST"),
544
+ "note": "Use POST /v1/chat/completions for inference"
545
+ }
546
+
547
+ @app.get("/test")
548
+ @app.get("/ping")
549
+ async def quick_test():
550
+ """Ultra-fast test endpoint for ReqBin - responds in < 200ms"""
551
+ return {
552
+ "status": "ok",
553
+ "message": "AJ-Mini v1.0 is operational",
554
+ "model": "aj-mini",
555
+ "latency": "< 200ms",
556
+ "endpoint": "POST /v1/chat/completions",
557
+ "example": {
558
+ "model": "aj-mini",
559
+ "messages": [{"role": "user", "content": "Hello"}]
560
+ },
561
+ "developer": "AJ STUDIOZ",
562
+ "availability": "UNLIMITED FREE",
563
+ "timestamp": get_ist_time().strftime("%Y-%m-%d %H:%M:%S IST")
564
+ }
565
 
566
  if __name__ == "__main__":
567
  import uvicorn