Spaces:
Running
Running
Update app.py
#2
by Sebebeb - opened
app.py
CHANGED
|
@@ -11,7 +11,7 @@ from fastapi.responses import (
|
|
| 11 |
)
|
| 12 |
import httpx
|
| 13 |
from bs4 import BeautifulSoup
|
| 14 |
-
from typing import List, Dict, Any
|
| 15 |
import asyncio
|
| 16 |
import re
|
| 17 |
import random
|
|
@@ -23,7 +23,6 @@ from helper.subscriptions import (
|
|
| 23 |
TIER_CONFIG,
|
| 24 |
PLAN_ORDER,
|
| 25 |
)
|
| 26 |
-
from typing import Optional
|
| 27 |
from helper.keywords import *
|
| 28 |
from helper.assets import (
|
| 29 |
save_base64_image,
|
|
@@ -101,6 +100,95 @@ def is_cinematic_image_prompt(prompt: str) -> bool:
|
|
| 101 |
return True
|
| 102 |
return False
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
PKEY = os.getenv("POLLINATIONS_KEY", "")
|
| 105 |
PKEY2 = os.getenv("POLLINATIONS2_KEY", "")
|
| 106 |
PKEY3 = os.getenv("POLLINATIONS3_KEY", "")
|
|
@@ -327,8 +415,9 @@ async def generate_text(
|
|
| 327 |
prompt_text = extract_user_text(messages)
|
| 328 |
|
| 329 |
uses_tools = (
|
| 330 |
-
"tools" in body and isinstance(body["tools"], list) and len(body["tools"]) > 0
|
| 331 |
-
|
|
|
|
| 332 |
|
| 333 |
long_context = is_long_context(messages)
|
| 334 |
code_present = contains_code(prompt_text)
|
|
@@ -362,7 +451,18 @@ async def generate_text(
|
|
| 362 |
provider = "groq"
|
| 363 |
has_images = contains_images(messages)
|
| 364 |
|
| 365 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
chosen_model = "meta-llama/llama-4-scout-17b-16e-instruct"
|
| 367 |
provider = "groq"
|
| 368 |
else:
|
|
@@ -374,21 +474,21 @@ async def generate_text(
|
|
| 374 |
else:
|
| 375 |
chosen_model = "openai/gpt-oss-20b"
|
| 376 |
provider = "groq"
|
| 377 |
-
|
| 378 |
elif code_present:
|
| 379 |
-
|
| 380 |
if code_heavy and score >= 6:
|
| 381 |
chosen_model = "gpt-oss-120b"
|
| 382 |
provider = "cerebras"
|
| 383 |
-
|
| 384 |
elif score >= 4:
|
| 385 |
chosen_model = "llama-3.3-70b-versatile"
|
| 386 |
provider = "groq"
|
| 387 |
-
|
| 388 |
elif score >= 4:
|
| 389 |
chosen_model = "meta-llama/llama-4-scout-17b-16e-instruct"
|
| 390 |
provider = "groq"
|
| 391 |
-
|
| 392 |
if provider == "groq" and (
|
| 393 |
total_chars > MAX_GROQ_PROMPT_CHARS or total_bytes > MAX_GROQ_PROMPT_BYTES
|
| 394 |
):
|
|
@@ -414,6 +514,7 @@ async def generate_text(
|
|
| 414 |
Structured: {structured_task}
|
| 415 |
Multi-question: {multi_q}
|
| 416 |
MULTIMODAL REQUIRED: {has_images}
|
|
|
|
| 417 |
→ Selected: {chosen_model} ({provider})
|
| 418 |
"""
|
| 419 |
)
|
|
@@ -426,16 +527,16 @@ async def generate_text(
|
|
| 426 |
if not groq_keys_list:
|
| 427 |
raise HTTPException(500, "Missing GROQ_KEY(s)")
|
| 428 |
API_KEY = random.choice(groq_keys_list)
|
| 429 |
-
|
| 430 |
url = "https://api.groq.com/openai/v1/chat/completions"
|
| 431 |
-
|
| 432 |
elif provider == "cerebras":
|
| 433 |
cer_keys = os.getenv("CER_KEY", "")
|
| 434 |
cer_keys_list = [k.strip() for k in cer_keys.split(",") if k.strip()]
|
| 435 |
if not cer_keys_list:
|
| 436 |
raise HTTPException(500, "Missing CER_KEY(s)")
|
| 437 |
API_KEY = random.choice(cer_keys_list)
|
| 438 |
-
|
| 439 |
url = "https://api.cerebras.ai/v1/chat/completions"
|
| 440 |
|
| 441 |
else:
|
|
@@ -443,6 +544,97 @@ async def generate_text(
|
|
| 443 |
|
| 444 |
headers = {"Authorization": f"Bearer {API_KEY}"}
|
| 445 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 446 |
if stream:
|
| 447 |
body["stream"] = True
|
| 448 |
|
|
@@ -558,7 +750,7 @@ async def gensfx(
|
|
| 558 |
|
| 559 |
@app.get("/gen/tts/{prompt}")
|
| 560 |
@app.post("/gen/tts")
|
| 561 |
-
async def
|
| 562 |
request: Request,
|
| 563 |
prompt: str = None,
|
| 564 |
authorization: Optional[str] = Header(None),
|
|
@@ -597,7 +789,7 @@ async def gensfx(
|
|
| 597 |
@app.get("/gen/video/{prompt}")
|
| 598 |
@app.post("/gen/video")
|
| 599 |
@app.head("/gen/video")
|
| 600 |
-
async def
|
| 601 |
request: Request,
|
| 602 |
prompt: str = None,
|
| 603 |
authorization: Optional[str] = Header(None),
|
|
|
|
| 11 |
)
|
| 12 |
import httpx
|
| 13 |
from bs4 import BeautifulSoup
|
| 14 |
+
from typing import List, Dict, Any, Optional
|
| 15 |
import asyncio
|
| 16 |
import re
|
| 17 |
import random
|
|
|
|
| 23 |
TIER_CONFIG,
|
| 24 |
PLAN_ORDER,
|
| 25 |
)
|
|
|
|
| 26 |
from helper.keywords import *
|
| 27 |
from helper.assets import (
|
| 28 |
save_base64_image,
|
|
|
|
| 100 |
return True
|
| 101 |
return False
|
| 102 |
|
| 103 |
+
|
| 104 |
+
# -----------------------------------------------------------------------------
|
| 105 |
+
# Multimodal helpers (server-side fix for: tools + images)
|
| 106 |
+
# -----------------------------------------------------------------------------
|
| 107 |
+
|
| 108 |
+
def contains_images(messages: List[Dict[str, Any]]) -> bool:
    """Report whether any message carries an OpenAI-style image part.

    A message counts when its ``content`` is a list that holds at least one
    dict part whose ``type`` is ``"image_url"``, for example
    ``{"type": "image_url", "image_url": {"url": "..."}}``.

    Args:
        messages: Chat Completions message list. Anything that is not a
            list is treated as containing no images.

    Returns:
        True if an ``image_url`` part is found, otherwise False.
    """
    if not isinstance(messages, list):
        return False
    # Non-dict messages, non-list contents and non-dict parts are skipped.
    return any(
        isinstance(part, dict) and part.get("type") == "image_url"
        for message in messages
        if isinstance(message, dict) and isinstance(message.get("content"), list)
        for part in message["content"]
    )
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def content_to_text(content: Any) -> str:
    """Reduce a message ``content`` value to plain text.

    Args:
        content: Either a string or a list of multimodal parts.

    Returns:
        The string itself when ``content`` is a string. For a list, the
        non-empty ``text`` values of its ``{"type": "text"}`` parts joined
        with newlines and stripped; every other part (such as images) is
        dropped. Any other input yields an empty string.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""
    candidates = [
        part.get("text")
        for part in content
        if isinstance(part, dict) and part.get("type") == "text"
    ]
    # Only non-empty strings survive; None or empty text fields are ignored.
    return "\n".join(t for t in candidates if isinstance(t, str) and t).strip()
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def flatten_messages_to_text_only(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return shallow copies of the messages with ``content`` as a string.

    Each dict message is copied and its ``content`` replaced by the result
    of ``content_to_text``, which drops image parts. Role and all other
    fields are preserved; the input messages are not mutated. Entries that
    are not dicts are skipped.

    Args:
        messages: Chat message list. ``None`` is treated as an empty list
            so a missing payload does not raise, matching how
            ``contains_images`` tolerates it.

    Returns:
        A new list of text-only message dicts.
    """
    if messages is None:
        return []
    # {**m, "content": ...} keeps key order and overrides (or adds) content.
    return [
        {**message, "content": content_to_text(message.get("content"))}
        for message in messages
        if isinstance(message, dict)
    ]
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def find_last_multimodal_user_message(messages: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """Find the most recent user message that includes an image part.

    Messages are scanned from last to first. A match is a dict with
    ``role == "user"`` whose ``content`` is a list containing at least one
    dict part of type ``"image_url"``.

    Args:
        messages: Chat message list. ``None`` is treated as having no
            messages instead of raising.

    Returns:
        The matching message dict itself (not a copy), or None when there
        is no such message.
    """
    if messages is None:
        return None
    for message in reversed(messages):
        if not isinstance(message, dict) or message.get("role") != "user":
            continue
        parts = message.get("content")
        if isinstance(parts, list) and any(
            isinstance(part, dict) and part.get("type") == "image_url"
            for part in parts
        ):
            return message
    return None
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def append_instruction_to_multimodal_user_content(content: Any, instruction: str) -> Any:
    """Attach an instruction to a message ``content`` value.

    Args:
        content: A string, a list of multimodal parts, or anything else.
        instruction: Text to add.

    Returns:
        For a string, the content and instruction separated by a blank line,
        with surrounding whitespace stripped. For a list, a new list with a
        trailing ``{"type": "text"}`` part; the input list is not modified.
        For any other content, the instruction alone.
    """
    if isinstance(content, list):
        # Build a new list so the caller's parts array stays untouched.
        return [*content, {"type": "text", "text": instruction}]
    if isinstance(content, str):
        return "\n\n".join((content, instruction)).strip()
    return instruction
|
| 190 |
+
|
| 191 |
+
|
| 192 |
PKEY = os.getenv("POLLINATIONS_KEY", "")
|
| 193 |
PKEY2 = os.getenv("POLLINATIONS2_KEY", "")
|
| 194 |
PKEY3 = os.getenv("POLLINATIONS3_KEY", "")
|
|
|
|
| 415 |
prompt_text = extract_user_text(messages)
|
| 416 |
|
| 417 |
uses_tools = (
|
| 418 |
+
("tools" in body and isinstance(body["tools"], list) and len(body["tools"]) > 0)
|
| 419 |
+
or ("tool_choice" in body and body["tool_choice"] not in [None, "none"])
|
| 420 |
+
)
|
| 421 |
|
| 422 |
long_context = is_long_context(messages)
|
| 423 |
code_present = contains_code(prompt_text)
|
|
|
|
| 451 |
provider = "groq"
|
| 452 |
has_images = contains_images(messages)
|
| 453 |
|
| 454 |
+
# IMPORTANT FIX:
|
| 455 |
+
# Some upstream OpenAI-compat providers reject `tools` when any message content is multimodal (list parts),
|
| 456 |
+
# returning: messages[n].content must be a string.
|
| 457 |
+
# If the request uses tools and includes images, we do a 2-pass approach:
|
| 458 |
+
# (1) vision caption (NO tools; keep multimodal)
|
| 459 |
+
# (2) tool-capable call with text-only messages + appended caption
|
| 460 |
+
needs_two_pass = bool(has_images and uses_tools)
|
| 461 |
+
|
| 462 |
+
# Routing:
|
| 463 |
+
# - If images exist AND tools are NOT in use, route to a vision-capable model directly.
|
| 464 |
+
# - If tools are in use (even with images), route to tool models (pass 2 will be text-only).
|
| 465 |
+
if has_images and not uses_tools:
|
| 466 |
chosen_model = "meta-llama/llama-4-scout-17b-16e-instruct"
|
| 467 |
provider = "groq"
|
| 468 |
else:
|
|
|
|
| 474 |
else:
|
| 475 |
chosen_model = "openai/gpt-oss-20b"
|
| 476 |
provider = "groq"
|
| 477 |
+
|
| 478 |
elif code_present:
|
| 479 |
+
|
| 480 |
if code_heavy and score >= 6:
|
| 481 |
chosen_model = "gpt-oss-120b"
|
| 482 |
provider = "cerebras"
|
| 483 |
+
|
| 484 |
elif score >= 4:
|
| 485 |
chosen_model = "llama-3.3-70b-versatile"
|
| 486 |
provider = "groq"
|
| 487 |
+
|
| 488 |
elif score >= 4:
|
| 489 |
chosen_model = "meta-llama/llama-4-scout-17b-16e-instruct"
|
| 490 |
provider = "groq"
|
| 491 |
+
|
| 492 |
if provider == "groq" and (
|
| 493 |
total_chars > MAX_GROQ_PROMPT_CHARS or total_bytes > MAX_GROQ_PROMPT_BYTES
|
| 494 |
):
|
|
|
|
| 514 |
Structured: {structured_task}
|
| 515 |
Multi-question: {multi_q}
|
| 516 |
MULTIMODAL REQUIRED: {has_images}
|
| 517 |
+
TWO-PASS (tools+images): {needs_two_pass}
|
| 518 |
→ Selected: {chosen_model} ({provider})
|
| 519 |
"""
|
| 520 |
)
|
|
|
|
| 527 |
if not groq_keys_list:
|
| 528 |
raise HTTPException(500, "Missing GROQ_KEY(s)")
|
| 529 |
API_KEY = random.choice(groq_keys_list)
|
| 530 |
+
|
| 531 |
url = "https://api.groq.com/openai/v1/chat/completions"
|
| 532 |
+
|
| 533 |
elif provider == "cerebras":
|
| 534 |
cer_keys = os.getenv("CER_KEY", "")
|
| 535 |
cer_keys_list = [k.strip() for k in cer_keys.split(",") if k.strip()]
|
| 536 |
if not cer_keys_list:
|
| 537 |
raise HTTPException(500, "Missing CER_KEY(s)")
|
| 538 |
API_KEY = random.choice(cer_keys_list)
|
| 539 |
+
|
| 540 |
url = "https://api.cerebras.ai/v1/chat/completions"
|
| 541 |
|
| 542 |
else:
|
|
|
|
| 544 |
|
| 545 |
headers = {"Authorization": f"Bearer {API_KEY}"}
|
| 546 |
|
| 547 |
+
# -------------------------------------------------------------------------
|
| 548 |
+
# Two-pass fix implementation (tools + multimodal images)
|
| 549 |
+
# -------------------------------------------------------------------------
|
| 550 |
+
if needs_two_pass:
|
| 551 |
+
# 1) Build a captioning request (no tools/tool_choice, stream disabled)
|
| 552 |
+
# Prefer the last multimodal user message that actually contains images.
|
| 553 |
+
last_mm_user = find_last_multimodal_user_message(messages)
|
| 554 |
+
mm_user_msg = last_mm_user if last_mm_user else {"role": "user", "content": messages[-1].get("content")}
|
| 555 |
+
|
| 556 |
+
caption_instruction = (
|
| 557 |
+
"Describe the attached image(s) in detail. "
|
| 558 |
+
"Include any text you can read, objects, UI elements, and relationships. "
|
| 559 |
+
"Return only the description."
|
| 560 |
+
)
|
| 561 |
+
|
| 562 |
+
caption_messages = [
|
| 563 |
+
{"role": "system", "content": "You are a precise image captioning assistant."},
|
| 564 |
+
{
|
| 565 |
+
"role": "user",
|
| 566 |
+
"content": append_instruction_to_multimodal_user_content(
|
| 567 |
+
mm_user_msg.get("content"),
|
| 568 |
+
caption_instruction,
|
| 569 |
+
),
|
| 570 |
+
},
|
| 571 |
+
]
|
| 572 |
+
|
| 573 |
+
caption_body = dict(body)
|
| 574 |
+
caption_body["model"] = "meta-llama/llama-4-scout-17b-16e-instruct"
|
| 575 |
+
caption_body["messages"] = caption_messages
|
| 576 |
+
caption_body["stream"] = False
|
| 577 |
+
caption_body.pop("tools", None)
|
| 578 |
+
caption_body.pop("tool_choice", None)
|
| 579 |
+
caption_body.pop("tool_choice", None)
|
| 580 |
+
|
| 581 |
+
try:
|
| 582 |
+
async with httpx.AsyncClient(timeout=None) as client:
|
| 583 |
+
cap = await client.post(url, json=caption_body, headers=headers)
|
| 584 |
+
except Exception as e:
|
| 585 |
+
raise HTTPException(502, f"Caption upstream request failed: {str(e)}")
|
| 586 |
+
|
| 587 |
+
if cap.status_code >= 400:
|
| 588 |
+
# Surface a safe snippet for debugging.
|
| 589 |
+
snippet = cap.text[:800] if isinstance(cap.text, str) else ""
|
| 590 |
+
raise HTTPException(
|
| 591 |
+
status_code=400,
|
| 592 |
+
detail=f"Caption upstream provider error ({cap.status_code}): {snippet}",
|
| 593 |
+
)
|
| 594 |
+
|
| 595 |
+
try:
|
| 596 |
+
cap_json = cap.json()
|
| 597 |
+
caption = (
|
| 598 |
+
((cap_json.get("choices") or [{}])[0].get("message") or {}).get("content")
|
| 599 |
+
or ""
|
| 600 |
+
)
|
| 601 |
+
except Exception:
|
| 602 |
+
caption = ""
|
| 603 |
+
|
| 604 |
+
caption = (caption or "").strip()
|
| 605 |
+
if not caption:
|
| 606 |
+
caption = "(No caption returned.)"
|
| 607 |
+
|
| 608 |
+
# Keep captions bounded so we don't accidentally blow prompt limits.
|
| 609 |
+
if len(caption) > 4000:
|
| 610 |
+
caption = caption[:4000] + "…"
|
| 611 |
+
|
| 612 |
+
# 2) Rewrite original request to be text-only messages, append caption.
|
| 613 |
+
rewritten = flatten_messages_to_text_only(messages)
|
| 614 |
+
rewritten.append(
|
| 615 |
+
{
|
| 616 |
+
"role": "user",
|
| 617 |
+
"content": "[Image description]\n" + caption,
|
| 618 |
+
}
|
| 619 |
+
)
|
| 620 |
+
body["messages"] = rewritten
|
| 621 |
+
|
| 622 |
+
# Re-check limits with the rewritten messages.
|
| 623 |
+
total_chars2, total_bytes2 = calculate_messages_size(rewritten)
|
| 624 |
+
if total_chars2 > MAX_CHAT_PROMPT_CHARS or total_bytes2 > MAX_CHAT_PROMPT_BYTES:
|
| 625 |
+
raise HTTPException(
|
| 626 |
+
status_code=413,
|
| 627 |
+
detail=(
|
| 628 |
+
f"Prompt context too large after image captioning ({total_chars2} chars, {total_bytes2} bytes). "
|
| 629 |
+
f"Max allowed is {MAX_CHAT_PROMPT_CHARS} chars or {MAX_CHAT_PROMPT_BYTES} bytes."
|
| 630 |
+
),
|
| 631 |
+
)
|
| 632 |
+
|
| 633 |
+
# With the rewrite, we are no longer multimodal for upstream.
|
| 634 |
+
has_images = False
|
| 635 |
+
|
| 636 |
+
# -------------------------------------------------------------------------
|
| 637 |
+
|
| 638 |
if stream:
|
| 639 |
body["stream"] = True
|
| 640 |
|
|
|
|
| 750 |
|
| 751 |
@app.get("/gen/tts/{prompt}")
|
| 752 |
@app.post("/gen/tts")
|
| 753 |
+
async def gentts(
|
| 754 |
request: Request,
|
| 755 |
prompt: str = None,
|
| 756 |
authorization: Optional[str] = Header(None),
|
|
|
|
| 789 |
@app.get("/gen/video/{prompt}")
|
| 790 |
@app.post("/gen/video")
|
| 791 |
@app.head("/gen/video")
|
| 792 |
+
async def genvideo(
|
| 793 |
request: Request,
|
| 794 |
prompt: str = None,
|
| 795 |
authorization: Optional[str] = Header(None),
|