Spaces:

sharktide
/

lightning

Running

App Files Files Community

Update app.py

by Sebebeb - opened Mar 16

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+207

-15

Files changed (1) hide show

app.py +207 -15

app.py CHANGED Viewed

@@ -11,7 +11,7 @@ from fastapi.responses import (
 )
 import httpx
 from bs4 import BeautifulSoup
-from typing import List, Dict, Any
 import asyncio
 import re
 import random
@@ -23,7 +23,6 @@ from helper.subscriptions import (
     TIER_CONFIG,
     PLAN_ORDER,
 )
-from typing import Optional
 from helper.keywords import *
 from helper.assets import (
     save_base64_image,
@@ -101,6 +100,95 @@ def is_cinematic_image_prompt(prompt: str) -> bool:
             return True
     return False
 PKEY = os.getenv("POLLINATIONS_KEY", "")
 PKEY2 = os.getenv("POLLINATIONS2_KEY", "")
 PKEY3 = os.getenv("POLLINATIONS3_KEY", "")
@@ -327,8 +415,9 @@ async def generate_text(
     prompt_text = extract_user_text(messages)
     uses_tools = (
-        "tools" in body and isinstance(body["tools"], list) and len(body["tools"]) > 0
-    ) or ("tool_choice" in body and body["tool_choice"] not in [None, "none"])
     long_context = is_long_context(messages)
     code_present = contains_code(prompt_text)
@@ -362,7 +451,18 @@ async def generate_text(
     provider = "groq"
     has_images = contains_images(messages)
-    if has_images:
         chosen_model = "meta-llama/llama-4-scout-17b-16e-instruct"
         provider = "groq"
     else:
@@ -374,21 +474,21 @@ async def generate_text(
             else:
                 chosen_model = "openai/gpt-oss-20b"
             provider = "groq"
         elif code_present:
             if code_heavy and score >= 6:
                 chosen_model = "gpt-oss-120b"
                 provider = "cerebras"
             elif score >= 4:
                 chosen_model = "llama-3.3-70b-versatile"
                 provider = "groq"
         elif score >= 4:
             chosen_model = "meta-llama/llama-4-scout-17b-16e-instruct"
             provider = "groq"
         if provider == "groq" and (
             total_chars > MAX_GROQ_PROMPT_CHARS or total_bytes > MAX_GROQ_PROMPT_BYTES
         ):
@@ -414,6 +514,7 @@ async def generate_text(
       Structured: {structured_task}
       Multi-question: {multi_q}
       MULTIMODAL REQUIRED: {has_images}
       → Selected: {chosen_model} ({provider})
     """
     )
@@ -426,16 +527,16 @@ async def generate_text(
         if not groq_keys_list:
             raise HTTPException(500, "Missing GROQ_KEY(s)")
         API_KEY = random.choice(groq_keys_list)
         url = "https://api.groq.com/openai/v1/chat/completions"
     elif provider == "cerebras":
         cer_keys = os.getenv("CER_KEY", "")
         cer_keys_list = [k.strip() for k in cer_keys.split(",") if k.strip()]
         if not cer_keys_list:
             raise HTTPException(500, "Missing CER_KEY(s)")
         API_KEY = random.choice(cer_keys_list)
         url = "https://api.cerebras.ai/v1/chat/completions"
     else:
@@ -443,6 +544,97 @@ async def generate_text(
     headers = {"Authorization": f"Bearer {API_KEY}"}
     if stream:
         body["stream"] = True
@@ -558,7 +750,7 @@ async def gensfx(
 @app.get("/gen/tts/{prompt}")
 @app.post("/gen/tts")
-async def gensfx(
     request: Request,
     prompt: str = None,
     authorization: Optional[str] = Header(None),
@@ -597,7 +789,7 @@ async def gensfx(
 @app.get("/gen/video/{prompt}")
 @app.post("/gen/video")
 @app.head("/gen/video")
-async def genvideo_airforce(
     request: Request,
     prompt: str = None,
     authorization: Optional[str] = Header(None),

 )
 import httpx
 from bs4 import BeautifulSoup
+from typing import List, Dict, Any, Optional
 import asyncio
 import re
 import random
     TIER_CONFIG,
     PLAN_ORDER,
 )
 from helper.keywords import *
 from helper.assets import (
     save_base64_image,
             return True
     return False
+# -----------------------------------------------------------------------------
+# Multimodal helpers (server-side fix for: tools + images)
+# -----------------------------------------------------------------------------
def contains_images(messages: List[Dict[str, Any]]) -> bool:
    """Return True when any message carries an OpenAI-style image part.

    An image part is a dict such as
    ``{"type": "image_url", "image_url": {"url": "..."}}`` found inside a
    message whose ``content`` is a list of parts.

    Args:
        messages: Chat Completions ``messages`` array. A value that is not a
            list is treated as "no images" instead of raising.

    Returns:
        True if at least one dict message has a list ``content`` holding a
        dict part whose ``type`` is ``"image_url"``; False otherwise.
    """
    if not isinstance(messages, list):
        return False

    def has_image_part(content: Any) -> bool:
        # String (or missing) content cannot hold multimodal parts.
        if not isinstance(content, list):
            return False
        return any(
            isinstance(part, dict) and part.get("type") == "image_url"
            for part in content
        )

    # Non-dict entries are ignored rather than treated as errors.
    return any(
        has_image_part(message.get("content"))
        for message in messages
        if isinstance(message, dict)
    )
def content_to_text(content: Any) -> str:
    """Collapse a message ``content`` value into a plain string.

    Args:
        content: Either a string or a multimodal parts list.

    Returns:
        * the string itself, unchanged, when ``content`` is a string;
        * for a list, the non-empty string ``text`` values of its
          ``{"type": "text", ...}`` parts joined with newlines and stripped
          (image parts and anything else are dropped);
        * ``""`` for every other value, including ``None``.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""
    texts = [
        part["text"]
        for part in content
        if isinstance(part, dict)
        and part.get("type") == "text"
        and isinstance(part.get("text"), str)
        and part["text"]  # skip empty strings so they add no blank lines
    ]
    return "\n".join(texts).strip()
def flatten_messages_to_text_only(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return copies of ``messages`` whose ``content`` is always a string.

    Each dict message is shallow-copied and its ``content`` replaced with the
    result of ``content_to_text``; every other key (role, name, ...) is kept
    as-is. Entries that are not dicts are dropped. The input list and its
    message dicts are not modified.

    Args:
        messages: Chat Completions ``messages`` array, possibly multimodal.

    Returns:
        A new list of message dicts with string-only ``content``.
    """
    # NOTE(review): a message whose parts were images only (or whose content
    # was None/missing) ends up with content == "" - confirm the upstream
    # provider accepts empty-string content for such messages.
    return [
        {**message, "content": content_to_text(message.get("content"))}
        for message in messages
        if isinstance(message, dict)
    ]
def find_last_multimodal_user_message(messages: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """Find the most recent user message that contains an image part.

    Args:
        messages: Chat Completions ``messages`` array. Anything that is not a
            list or tuple yields ``None`` instead of raising.

    Returns:
        The last dict in ``messages`` (the same object, not a copy) whose
        ``role`` is ``"user"`` and whose ``content`` is a list containing a
        dict part with ``type == "image_url"``; ``None`` when no message
        qualifies.
    """
    # reversed() raises TypeError on None / non-sequences; treat those as
    # "nothing found", in line with how contains_images handles bad input.
    if not isinstance(messages, (list, tuple)):
        return None
    for message in reversed(messages):
        if not isinstance(message, dict) or message.get("role") != "user":
            continue
        parts = message.get("content")
        if isinstance(parts, list) and any(
            isinstance(part, dict) and part.get("type") == "image_url"
            for part in parts
        ):
            return message
    return None
def append_instruction_to_multimodal_user_content(content: Any, instruction: str) -> Any:
    """Attach an extra instruction to a message ``content`` value.

    Args:
        content: A string, a multimodal parts list, or any other value.
        instruction: Text to add after the existing content.

    Returns:
        * for a string: ``content``, a blank line, then ``instruction``,
          with surrounding whitespace stripped;
        * for a list: a NEW list with all original parts followed by a
          trailing ``{"type": "text", "text": instruction}`` part (the
          caller's list is left untouched);
        * for anything else (e.g. ``None``): ``instruction`` on its own.
    """
    if isinstance(content, str):
        return (content + "\n\n" + instruction).strip()
    if isinstance(content, list):
        # Copy instead of appending in place so the original request
        # messages are not modified as a side effect.
        return [*content, {"type": "text", "text": instruction}]
    return instruction
 PKEY = os.getenv("POLLINATIONS_KEY", "")
 PKEY2 = os.getenv("POLLINATIONS2_KEY", "")
 PKEY3 = os.getenv("POLLINATIONS3_KEY", "")
     prompt_text = extract_user_text(messages)
     uses_tools = (
+        ("tools" in body and isinstance(body["tools"], list) and len(body["tools"]) > 0)
+        or ("tool_choice" in body and body["tool_choice"] not in [None, "none"])
+    )
     long_context = is_long_context(messages)
     code_present = contains_code(prompt_text)
     provider = "groq"
     has_images = contains_images(messages)
+    # IMPORTANT FIX:
+    # Some upstream OpenAI-compat providers reject `tools` when any message content is multimodal (list parts),
+    # returning: messages[n].content must be a string.
+    # If the request uses tools and includes images, we do a 2-pass approach:
+    #   (1) vision caption (NO tools; keep multimodal)
+    #   (2) tool-capable call with text-only messages + appended caption
+    needs_two_pass = bool(has_images and uses_tools)
+    # Routing:
+    # - If images exist AND tools are NOT in use, route to a vision-capable model directly.
+    # - If tools are in use (even with images), route to tool models (pass 2 will be text-only).
+    if has_images and not uses_tools:
         chosen_model = "meta-llama/llama-4-scout-17b-16e-instruct"
         provider = "groq"
     else:
             else:
                 chosen_model = "openai/gpt-oss-20b"
             provider = "groq"
         elif code_present:
             if code_heavy and score >= 6:
                 chosen_model = "gpt-oss-120b"
                 provider = "cerebras"
             elif score >= 4:
                 chosen_model = "llama-3.3-70b-versatile"
                 provider = "groq"
         elif score >= 4:
             chosen_model = "meta-llama/llama-4-scout-17b-16e-instruct"
             provider = "groq"
         if provider == "groq" and (
             total_chars > MAX_GROQ_PROMPT_CHARS or total_bytes > MAX_GROQ_PROMPT_BYTES
         ):
       Structured: {structured_task}
       Multi-question: {multi_q}
       MULTIMODAL REQUIRED: {has_images}
+      TWO-PASS (tools+images): {needs_two_pass}
       → Selected: {chosen_model} ({provider})
     """
     )
         if not groq_keys_list:
             raise HTTPException(500, "Missing GROQ_KEY(s)")
         API_KEY = random.choice(groq_keys_list)
         url = "https://api.groq.com/openai/v1/chat/completions"
     elif provider == "cerebras":
         cer_keys = os.getenv("CER_KEY", "")
         cer_keys_list = [k.strip() for k in cer_keys.split(",") if k.strip()]
         if not cer_keys_list:
             raise HTTPException(500, "Missing CER_KEY(s)")
         API_KEY = random.choice(cer_keys_list)
         url = "https://api.cerebras.ai/v1/chat/completions"
     else:
     headers = {"Authorization": f"Bearer {API_KEY}"}
+    # -------------------------------------------------------------------------
+    # Two-pass fix implementation (tools + multimodal images)
+    # -------------------------------------------------------------------------
+    if needs_two_pass:
+        # 1) Build a captioning request (no tools/tool_choice, stream disabled)
+        # Prefer the last multimodal user message that actually contains images.
+        last_mm_user = find_last_multimodal_user_message(messages)
+        mm_user_msg = last_mm_user if last_mm_user else {"role": "user", "content": messages[-1].get("content")}
+        caption_instruction = (
+            "Describe the attached image(s) in detail. "
+            "Include any text you can read, objects, UI elements, and relationships. "
+            "Return only the description."
+        )
+        caption_messages = [
+            {"role": "system", "content": "You are a precise image captioning assistant."},
+            {
+                "role": "user",
+                "content": append_instruction_to_multimodal_user_content(
+                    mm_user_msg.get("content"),
+                    caption_instruction,
+                ),
+            },
+        ]
+        caption_body = dict(body)
+        caption_body["model"] = "meta-llama/llama-4-scout-17b-16e-instruct"
+        caption_body["messages"] = caption_messages
+        caption_body["stream"] = False
+        caption_body.pop("tools", None)
+        caption_body.pop("tool_choice", None)
+        caption_body.pop("tool_choice", None)
+        try:
+            async with httpx.AsyncClient(timeout=None) as client:
+                cap = await client.post(url, json=caption_body, headers=headers)
+        except Exception as e:
+            raise HTTPException(502, f"Caption upstream request failed: {str(e)}")
+        if cap.status_code >= 400:
+            # Surface a safe snippet for debugging.
+            snippet = cap.text[:800] if isinstance(cap.text, str) else ""
+            raise HTTPException(
+                status_code=400,
+                detail=f"Caption upstream provider error ({cap.status_code}): {snippet}",
+            )
+        try:
+            cap_json = cap.json()
+            caption = (
+                ((cap_json.get("choices") or [{}])[0].get("message") or {}).get("content")
+                or ""
+            )
+        except Exception:
+            caption = ""
+        caption = (caption or "").strip()
+        if not caption:
+            caption = "(No caption returned.)"
+        # Keep captions bounded so we don't accidentally blow prompt limits.
+        if len(caption) > 4000:
+            caption = caption[:4000] + "…"
+        # 2) Rewrite original request to be text-only messages, append caption.
+        rewritten = flatten_messages_to_text_only(messages)
+        rewritten.append(
+            {
+                "role": "user",
+                "content": "[Image description]\n" + caption,
+            }
+        )
+        body["messages"] = rewritten
+        # Re-check limits with the rewritten messages.
+        total_chars2, total_bytes2 = calculate_messages_size(rewritten)
+        if total_chars2 > MAX_CHAT_PROMPT_CHARS or total_bytes2 > MAX_CHAT_PROMPT_BYTES:
+            raise HTTPException(
+                status_code=413,
+                detail=(
+                    f"Prompt context too large after image captioning ({total_chars2} chars, {total_bytes2} bytes). "
+                    f"Max allowed is {MAX_CHAT_PROMPT_CHARS} chars or {MAX_CHAT_PROMPT_BYTES} bytes."
+                ),
+            )
+        # With the rewrite, we are no longer multimodal for upstream.
+        has_images = False
+    # -------------------------------------------------------------------------
     if stream:
         body["stream"] = True
 @app.get("/gen/tts/{prompt}")
 @app.post("/gen/tts")
+async def gentts(
     request: Request,
     prompt: str = None,
     authorization: Optional[str] = Header(None),
 @app.get("/gen/video/{prompt}")
 @app.post("/gen/video")
 @app.head("/gen/video")
+async def genvideo(
     request: Request,
     prompt: str = None,
     authorization: Optional[str] = Header(None),