Files changed (1) hide show
  1. app.py +207 -15
app.py CHANGED
@@ -11,7 +11,7 @@ from fastapi.responses import (
11
  )
12
  import httpx
13
  from bs4 import BeautifulSoup
14
- from typing import List, Dict, Any
15
  import asyncio
16
  import re
17
  import random
@@ -23,7 +23,6 @@ from helper.subscriptions import (
23
  TIER_CONFIG,
24
  PLAN_ORDER,
25
  )
26
- from typing import Optional
27
  from helper.keywords import *
28
  from helper.assets import (
29
  save_base64_image,
@@ -101,6 +100,95 @@ def is_cinematic_image_prompt(prompt: str) -> bool:
101
  return True
102
  return False
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  PKEY = os.getenv("POLLINATIONS_KEY", "")
105
  PKEY2 = os.getenv("POLLINATIONS2_KEY", "")
106
  PKEY3 = os.getenv("POLLINATIONS3_KEY", "")
@@ -327,8 +415,9 @@ async def generate_text(
327
  prompt_text = extract_user_text(messages)
328
 
329
  uses_tools = (
330
- "tools" in body and isinstance(body["tools"], list) and len(body["tools"]) > 0
331
- ) or ("tool_choice" in body and body["tool_choice"] not in [None, "none"])
 
332
 
333
  long_context = is_long_context(messages)
334
  code_present = contains_code(prompt_text)
@@ -362,7 +451,18 @@ async def generate_text(
362
  provider = "groq"
363
  has_images = contains_images(messages)
364
 
365
- if has_images:
 
 
 
 
 
 
 
 
 
 
 
366
  chosen_model = "meta-llama/llama-4-scout-17b-16e-instruct"
367
  provider = "groq"
368
  else:
@@ -374,21 +474,21 @@ async def generate_text(
374
  else:
375
  chosen_model = "openai/gpt-oss-20b"
376
  provider = "groq"
377
-
378
  elif code_present:
379
-
380
  if code_heavy and score >= 6:
381
  chosen_model = "gpt-oss-120b"
382
  provider = "cerebras"
383
-
384
  elif score >= 4:
385
  chosen_model = "llama-3.3-70b-versatile"
386
  provider = "groq"
387
-
388
  elif score >= 4:
389
  chosen_model = "meta-llama/llama-4-scout-17b-16e-instruct"
390
  provider = "groq"
391
-
392
  if provider == "groq" and (
393
  total_chars > MAX_GROQ_PROMPT_CHARS or total_bytes > MAX_GROQ_PROMPT_BYTES
394
  ):
@@ -414,6 +514,7 @@ async def generate_text(
414
  Structured: {structured_task}
415
  Multi-question: {multi_q}
416
  MULTIMODAL REQUIRED: {has_images}
 
417
  → Selected: {chosen_model} ({provider})
418
  """
419
  )
@@ -426,16 +527,16 @@ async def generate_text(
426
  if not groq_keys_list:
427
  raise HTTPException(500, "Missing GROQ_KEY(s)")
428
  API_KEY = random.choice(groq_keys_list)
429
-
430
  url = "https://api.groq.com/openai/v1/chat/completions"
431
-
432
  elif provider == "cerebras":
433
  cer_keys = os.getenv("CER_KEY", "")
434
  cer_keys_list = [k.strip() for k in cer_keys.split(",") if k.strip()]
435
  if not cer_keys_list:
436
  raise HTTPException(500, "Missing CER_KEY(s)")
437
  API_KEY = random.choice(cer_keys_list)
438
-
439
  url = "https://api.cerebras.ai/v1/chat/completions"
440
 
441
  else:
@@ -443,6 +544,97 @@ async def generate_text(
443
 
444
  headers = {"Authorization": f"Bearer {API_KEY}"}
445
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
  if stream:
447
  body["stream"] = True
448
 
@@ -558,7 +750,7 @@ async def gensfx(
558
 
559
  @app.get("/gen/tts/{prompt}")
560
  @app.post("/gen/tts")
561
- async def gensfx(
562
  request: Request,
563
  prompt: str = None,
564
  authorization: Optional[str] = Header(None),
@@ -597,7 +789,7 @@ async def gensfx(
597
  @app.get("/gen/video/{prompt}")
598
  @app.post("/gen/video")
599
  @app.head("/gen/video")
600
- async def genvideo_airforce(
601
  request: Request,
602
  prompt: str = None,
603
  authorization: Optional[str] = Header(None),
 
11
  )
12
  import httpx
13
  from bs4 import BeautifulSoup
14
+ from typing import List, Dict, Any, Optional
15
  import asyncio
16
  import re
17
  import random
 
23
  TIER_CONFIG,
24
  PLAN_ORDER,
25
  )
 
26
  from helper.keywords import *
27
  from helper.assets import (
28
  save_base64_image,
 
100
  return True
101
  return False
102
 
103
+
104
+ # -----------------------------------------------------------------------------
105
+ # Multimodal helpers (server-side fix for: tools + images)
106
+ # -----------------------------------------------------------------------------
107
+
108
def contains_images(messages: List[Dict[str, Any]]) -> bool:
    """
    Report whether any message carries an OpenAI-style image part.

    A message counts when its ``content`` is a list holding at least one dict
    part whose ``type`` is ``"image_url"``, e.g.
    ``{"type": "image_url", "image_url": {"url": "..."}}``. The message role
    is not inspected.

    Args:
        messages: Chat Completions message list. Anything that is not a list
            is treated as "no images".

    Returns:
        True as soon as one image part is found, otherwise False.
    """
    if not isinstance(messages, list):
        return False

    for message in messages:
        # Non-dict entries and plain-string contents cannot hold image parts.
        parts = message.get("content") if isinstance(message, dict) else None
        if not isinstance(parts, list):
            continue
        if any(
            isinstance(part, dict) and part.get("type") == "image_url"
            for part in parts
        ):
            return True
    return False
127
+
128
+
129
def content_to_text(content: Any) -> str:
    """
    Reduce a message ``content`` value to plain text.

    Args:
        content: Either a string or a list of multimodal parts.

    Returns:
        * the string itself, unchanged, when ``content`` is a string;
        * for a list, the non-empty string ``text`` values of its
          ``{"type": "text"}`` dict parts joined with newlines and stripped
          (every other part, including images, is dropped);
        * ``""`` for any other type.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""

    texts = [
        part["text"]
        for part in content
        if isinstance(part, dict)
        and part.get("type") == "text"
        and isinstance(part.get("text"), str)
        and part["text"]
    ]
    return "\n".join(texts).strip()
145
+
146
+
147
def flatten_messages_to_text_only(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Return copies of the messages whose ``content`` is always a string.

    Each dict message is shallow-copied and its ``content`` replaced by the
    result of ``content_to_text``; all other fields are kept as they are.
    Entries that are not dicts are skipped. The input messages are not
    mutated.

    Args:
        messages: Chat Completions message list. A missing or empty value
            (e.g. ``None``) yields an empty list instead of raising, in line
            with how ``contains_images`` tolerates a missing ``messages``.

    Returns:
        A new list of text-only message dicts.
    """
    # Nothing to flatten; also avoids iterating over None.
    if not messages:
        return []

    return [
        {**message, "content": content_to_text(message.get("content"))}
        for message in messages
        if isinstance(message, dict)
    ]
160
+
161
+
162
def find_last_multimodal_user_message(messages: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """
    Find the most recent user message that contains an image part.

    Messages are scanned from last to first. A message matches when it is a
    dict with ``role == "user"`` and its ``content`` is a list holding at
    least one dict part whose ``type`` is ``"image_url"``.

    Args:
        messages: Chat Completions message list. A missing or empty value
            (e.g. ``None``) yields ``None`` instead of raising, in line with
            how ``contains_images`` tolerates a missing ``messages``.

    Returns:
        The matching message object itself (not a copy), or ``None`` when no
        user message carries an image.
    """
    # Nothing to search; also avoids calling reversed() on None.
    if not messages:
        return None

    for message in reversed(messages):
        if not isinstance(message, dict) or message.get("role") != "user":
            continue
        parts = message.get("content")
        if isinstance(parts, list) and any(
            isinstance(part, dict) and part.get("type") == "image_url"
            for part in parts
        ):
            return message
    return None
178
+
179
+
180
def append_instruction_to_multimodal_user_content(content: Any, instruction: str) -> Any:
    """
    Attach an instruction to a message ``content`` value.

    Args:
        content: A string, a list of multimodal parts, or anything else.
        instruction: Text to add.

    Returns:
        * for a list: a new list with an extra trailing
          ``{"type": "text", "text": instruction}`` part (the input list is
          left untouched);
        * for a string: the content, a blank line, then the instruction,
          with surrounding whitespace stripped;
        * for any other type: the instruction alone.
    """
    if isinstance(content, list):
        # Build a new list so the caller's parts array is not modified.
        instruction_part = {"type": "text", "text": instruction}
        return content + [instruction_part]
    if isinstance(content, str):
        return "\n\n".join((content, instruction)).strip()
    return instruction
190
+
191
+
192
  PKEY = os.getenv("POLLINATIONS_KEY", "")
193
  PKEY2 = os.getenv("POLLINATIONS2_KEY", "")
194
  PKEY3 = os.getenv("POLLINATIONS3_KEY", "")
 
415
  prompt_text = extract_user_text(messages)
416
 
417
  uses_tools = (
418
+ ("tools" in body and isinstance(body["tools"], list) and len(body["tools"]) > 0)
419
+ or ("tool_choice" in body and body["tool_choice"] not in [None, "none"])
420
+ )
421
 
422
  long_context = is_long_context(messages)
423
  code_present = contains_code(prompt_text)
 
451
  provider = "groq"
452
  has_images = contains_images(messages)
453
 
454
+ # IMPORTANT FIX:
455
+ # Some upstream OpenAI-compat providers reject `tools` when any message content is multimodal (list parts),
456
+ # returning: messages[n].content must be a string.
457
+ # If the request uses tools and includes images, we do a 2-pass approach:
458
+ # (1) vision caption (NO tools; keep multimodal)
459
+ # (2) tool-capable call with text-only messages + appended caption
460
+ needs_two_pass = bool(has_images and uses_tools)
461
+
462
+ # Routing:
463
+ # - If images exist AND tools are NOT in use, route to a vision-capable model directly.
464
+ # - If tools are in use (even with images), route to tool models (pass 2 will be text-only).
465
+ if has_images and not uses_tools:
466
  chosen_model = "meta-llama/llama-4-scout-17b-16e-instruct"
467
  provider = "groq"
468
  else:
 
474
  else:
475
  chosen_model = "openai/gpt-oss-20b"
476
  provider = "groq"
477
+
478
  elif code_present:
479
+
480
  if code_heavy and score >= 6:
481
  chosen_model = "gpt-oss-120b"
482
  provider = "cerebras"
483
+
484
  elif score >= 4:
485
  chosen_model = "llama-3.3-70b-versatile"
486
  provider = "groq"
487
+
488
  elif score >= 4:
489
  chosen_model = "meta-llama/llama-4-scout-17b-16e-instruct"
490
  provider = "groq"
491
+
492
  if provider == "groq" and (
493
  total_chars > MAX_GROQ_PROMPT_CHARS or total_bytes > MAX_GROQ_PROMPT_BYTES
494
  ):
 
514
  Structured: {structured_task}
515
  Multi-question: {multi_q}
516
  MULTIMODAL REQUIRED: {has_images}
517
+ TWO-PASS (tools+images): {needs_two_pass}
518
  → Selected: {chosen_model} ({provider})
519
  """
520
  )
 
527
  if not groq_keys_list:
528
  raise HTTPException(500, "Missing GROQ_KEY(s)")
529
  API_KEY = random.choice(groq_keys_list)
530
+
531
  url = "https://api.groq.com/openai/v1/chat/completions"
532
+
533
  elif provider == "cerebras":
534
  cer_keys = os.getenv("CER_KEY", "")
535
  cer_keys_list = [k.strip() for k in cer_keys.split(",") if k.strip()]
536
  if not cer_keys_list:
537
  raise HTTPException(500, "Missing CER_KEY(s)")
538
  API_KEY = random.choice(cer_keys_list)
539
+
540
  url = "https://api.cerebras.ai/v1/chat/completions"
541
 
542
  else:
 
544
 
545
  headers = {"Authorization": f"Bearer {API_KEY}"}
546
 
547
+ # -------------------------------------------------------------------------
548
+ # Two-pass fix implementation (tools + multimodal images)
549
+ # -------------------------------------------------------------------------
550
+ if needs_two_pass:
551
+ # 1) Build a captioning request (no tools/tool_choice, stream disabled)
552
+ # Prefer the last multimodal user message that actually contains images.
553
+ last_mm_user = find_last_multimodal_user_message(messages)
554
+ mm_user_msg = last_mm_user if last_mm_user else {"role": "user", "content": messages[-1].get("content")}
555
+
556
+ caption_instruction = (
557
+ "Describe the attached image(s) in detail. "
558
+ "Include any text you can read, objects, UI elements, and relationships. "
559
+ "Return only the description."
560
+ )
561
+
562
+ caption_messages = [
563
+ {"role": "system", "content": "You are a precise image captioning assistant."},
564
+ {
565
+ "role": "user",
566
+ "content": append_instruction_to_multimodal_user_content(
567
+ mm_user_msg.get("content"),
568
+ caption_instruction,
569
+ ),
570
+ },
571
+ ]
572
+
573
+ caption_body = dict(body)
574
+ caption_body["model"] = "meta-llama/llama-4-scout-17b-16e-instruct"
575
+ caption_body["messages"] = caption_messages
576
+ caption_body["stream"] = False
577
+ caption_body.pop("tools", None)
578
+ caption_body.pop("tool_choice", None)
579
+ caption_body.pop("tool_choice", None)
580
+
581
+ try:
582
+ async with httpx.AsyncClient(timeout=None) as client:
583
+ cap = await client.post(url, json=caption_body, headers=headers)
584
+ except Exception as e:
585
+ raise HTTPException(502, f"Caption upstream request failed: {str(e)}")
586
+
587
+ if cap.status_code >= 400:
588
+ # Surface a safe snippet for debugging.
589
+ snippet = cap.text[:800] if isinstance(cap.text, str) else ""
590
+ raise HTTPException(
591
+ status_code=400,
592
+ detail=f"Caption upstream provider error ({cap.status_code}): {snippet}",
593
+ )
594
+
595
+ try:
596
+ cap_json = cap.json()
597
+ caption = (
598
+ ((cap_json.get("choices") or [{}])[0].get("message") or {}).get("content")
599
+ or ""
600
+ )
601
+ except Exception:
602
+ caption = ""
603
+
604
+ caption = (caption or "").strip()
605
+ if not caption:
606
+ caption = "(No caption returned.)"
607
+
608
+ # Keep captions bounded so we don't accidentally blow prompt limits.
609
+ if len(caption) > 4000:
610
+ caption = caption[:4000] + "…"
611
+
612
+ # 2) Rewrite original request to be text-only messages, append caption.
613
+ rewritten = flatten_messages_to_text_only(messages)
614
+ rewritten.append(
615
+ {
616
+ "role": "user",
617
+ "content": "[Image description]\n" + caption,
618
+ }
619
+ )
620
+ body["messages"] = rewritten
621
+
622
+ # Re-check limits with the rewritten messages.
623
+ total_chars2, total_bytes2 = calculate_messages_size(rewritten)
624
+ if total_chars2 > MAX_CHAT_PROMPT_CHARS or total_bytes2 > MAX_CHAT_PROMPT_BYTES:
625
+ raise HTTPException(
626
+ status_code=413,
627
+ detail=(
628
+ f"Prompt context too large after image captioning ({total_chars2} chars, {total_bytes2} bytes). "
629
+ f"Max allowed is {MAX_CHAT_PROMPT_CHARS} chars or {MAX_CHAT_PROMPT_BYTES} bytes."
630
+ ),
631
+ )
632
+
633
+ # With the rewrite, we are no longer multimodal for upstream.
634
+ has_images = False
635
+
636
+ # -------------------------------------------------------------------------
637
+
638
  if stream:
639
  body["stream"] = True
640
 
 
750
 
751
  @app.get("/gen/tts/{prompt}")
752
  @app.post("/gen/tts")
753
+ async def gentts(
754
  request: Request,
755
  prompt: str = None,
756
  authorization: Optional[str] = Header(None),
 
789
  @app.get("/gen/video/{prompt}")
790
  @app.post("/gen/video")
791
  @app.head("/gen/video")
792
+ async def genvideo(
793
  request: Request,
794
  prompt: str = None,
795
  authorization: Optional[str] = Header(None),