Spaces:
Sleeping
Sleeping
AJ STUDIOZ committed on
Commit ·
bdeabc1
1
Parent(s): d2bfde4
Optimize AJ-Mini for faster responses: reduce tokens, add fast test endpoints
Browse files
app.py
CHANGED
|
@@ -39,11 +39,11 @@ print(f"{BRANDING_NAME} loaded successfully!")
|
|
| 39 |
def query_ollama_model(prompt: str, max_tokens: int = 1000, temperature: float = 0.7, stream: bool = False):
|
| 40 |
"""Query model loaded directly in the Space - Optimized for speed"""
|
| 41 |
try:
|
| 42 |
-
#
|
| 43 |
-
max_tokens = min(max_tokens,
|
| 44 |
|
| 45 |
# Tokenize input
|
| 46 |
-
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=
|
| 47 |
|
| 48 |
# Generate response with optimization
|
| 49 |
with torch.no_grad(): # Disable gradient computation for faster inference
|
|
@@ -51,14 +51,15 @@ def query_ollama_model(prompt: str, max_tokens: int = 1000, temperature: float =
|
|
| 51 |
**inputs,
|
| 52 |
max_new_tokens=max_tokens,
|
| 53 |
temperature=temperature,
|
| 54 |
-
do_sample=temperature > 0,
|
| 55 |
top_p=0.9,
|
| 56 |
-
top_k=
|
| 57 |
repetition_penalty=1.15, # Reduce repetition
|
| 58 |
pad_token_id=tokenizer.eos_token_id,
|
| 59 |
eos_token_id=tokenizer.eos_token_id,
|
| 60 |
num_beams=1, # Greedy decoding for speed
|
| 61 |
-
early_stopping=True
|
|
|
|
| 62 |
)
|
| 63 |
|
| 64 |
# Extract only the generated text (remove input)
|
|
@@ -530,28 +531,37 @@ async def generate(request: Request):
|
|
| 530 |
|
| 531 |
@app.get("/health")
|
| 532 |
async def health():
|
| 533 |
-
"""
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 555 |
|
| 556 |
if __name__ == "__main__":
|
| 557 |
import uvicorn
|
|
|
|
| 39 |
def query_ollama_model(prompt: str, max_tokens: int = 1000, temperature: float = 0.7, stream: bool = False):
|
| 40 |
"""Query model loaded directly in the Space - Optimized for speed"""
|
| 41 |
try:
|
| 42 |
+
# Optimize for faster responses
|
| 43 |
+
max_tokens = min(max_tokens, 150) # Reduced from 256 to 150 for faster CPU inference
|
| 44 |
|
| 45 |
# Tokenize input
|
| 46 |
+
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)
|
| 47 |
|
| 48 |
# Generate response with optimization
|
| 49 |
with torch.no_grad(): # Disable gradient computation for faster inference
|
|
|
|
| 51 |
**inputs,
|
| 52 |
max_new_tokens=max_tokens,
|
| 53 |
temperature=temperature,
|
| 54 |
+
do_sample=temperature > 0.1,
|
| 55 |
top_p=0.9,
|
| 56 |
+
top_k=40, # Reduced from 50 to 40 for speed
|
| 57 |
repetition_penalty=1.15, # Reduce repetition
|
| 58 |
pad_token_id=tokenizer.eos_token_id,
|
| 59 |
eos_token_id=tokenizer.eos_token_id,
|
| 60 |
num_beams=1, # Greedy decoding for speed
|
| 61 |
+
early_stopping=True,
|
| 62 |
+
no_repeat_ngram_size=3 # Prevent repetition
|
| 63 |
)
|
| 64 |
|
| 65 |
# Extract only the generated text (remove input)
|
|
|
|
| 531 |
|
| 532 |
@app.get("/health")
async def health():
    """Lightweight liveness probe.

    Returns static service metadata only — the model is never queried, so the
    endpoint responds immediately even while inference is busy.
    """
    # Assemble the payload in a local so the response shape is easy to scan/extend.
    payload = {
        "status": "healthy",
        "service": "AJ STUDIOZ Mini API",
        "model": "AJ-Mini v1.0",
        "version": "1.0",
        "developer": "AJ STUDIOZ",
        "platform": "HuggingFace Spaces (CPU)",
        "availability": "Unlimited FREE",
        # IST timestamp via the app-wide helper (defined elsewhere in app.py).
        "timestamp": get_ist_time().strftime("%Y-%m-%d %H:%M:%S IST"),
        "note": "Use POST /v1/chat/completions for inference",
    }
    return payload
|
| 546 |
+
|
| 547 |
+
@app.get("/test")
@app.get("/ping")
async def quick_test():
    """Minimal connectivity check, reachable as GET /test or GET /ping.

    Serves a static payload (no model inference), so it answers well under
    200 ms — handy for tools like ReqBin or uptime monitors.
    """
    # Ready-to-copy request body for the real inference endpoint.
    sample_request = {
        "model": "aj-mini",
        "messages": [{"role": "user", "content": "Hello"}],
    }
    return {
        "status": "ok",
        "message": "AJ-Mini v1.0 is operational",
        "model": "aj-mini",
        "latency": "< 200ms",
        "endpoint": "POST /v1/chat/completions",
        "example": sample_request,
        "developer": "AJ STUDIOZ",
        "availability": "UNLIMITED FREE",
        # IST timestamp via the app-wide helper (defined elsewhere in app.py).
        "timestamp": get_ist_time().strftime("%Y-%m-%d %H:%M:%S IST"),
    }
|
| 565 |
|
| 566 |
if __name__ == "__main__":
|
| 567 |
import uvicorn
|