bhavesh122 committed on
Commit
00a7a0c
·
verified ·
1 Parent(s): cfda0d3

Upload 20 files

Browse files
Files changed (3) hide show
  1. huggingface_client.py +36 -1
  2. main.py +48 -14
  3. nvidia_client.py +8 -4
huggingface_client.py CHANGED
@@ -49,7 +49,7 @@ class HuggingFaceClient(AIClient):
49
  messages.append({"role": "system", "content": system_prompt})
50
  messages.append({"role": "user", "content": prompt})
51
 
52
- url = f"https://api-inference.huggingface.co/models/{model_id}/v1/chat/completions"
53
 
54
  payload = {
55
  "model": model_id,
@@ -81,5 +81,40 @@ class HuggingFaceClient(AIClient):
81
  except Exception as e:
82
  yield f"Connection error: {str(e)}"
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  async def close(self) -> None:
85
  await self.client.aclose()
 
49
  messages.append({"role": "system", "content": system_prompt})
50
  messages.append({"role": "user", "content": prompt})
51
 
52
+ url = "https://router.huggingface.co/v1/chat/completions"
53
 
54
  payload = {
55
  "model": model_id,
 
81
  except Exception as e:
82
  yield f"Connection error: {str(e)}"
83
 
84
+ async def text_to_image(
85
+ self,
86
+ model_id: str,
87
+ prompt: str,
88
+ provider: Optional[str] = "fal-ai"
89
+ ) -> Optional[str]:
90
+ """Generates an image using HF Inference API and returns it as a base64 string."""
91
+ token = self._get_token()
92
+ headers = {
93
+ "Authorization": f"Bearer {token}",
94
+ "Content-Type": "application/json",
95
+ "x-use-cache": "false"
96
+ }
97
+
98
+ # For HF Inference Client with provider
99
+ url = f"https://api-inference.huggingface.co/models/{model_id}"
100
+ payload = {"inputs": prompt}
101
+ if provider:
102
+ # Note: Provider routing might need specific headers or URL structure depending on HF's evolving API
103
+ # For now, we'll try the standard model path as HF often routes based on model name
104
+ pass
105
+
106
+ try:
107
+ async with self.client.post(url, headers=headers, json=payload) as response:
108
+ if response.status_code == 200:
109
+ image_data = await response.aread()
110
+ import base64
111
+ return base64.b64encode(image_data).decode('utf-8')
112
+ else:
113
+ print(f"HF Image Error: {response.status_code} - {await response.aread()}")
114
+ return None
115
+ except Exception as e:
116
+ print(f"HF Image Exception: {str(e)}")
117
+ return None
118
+
119
  async def close(self) -> None:
120
  await self.client.aclose()
main.py CHANGED
@@ -243,7 +243,7 @@ _NV_MODELS = {
243
 
244
  # --- GitHub model IDs (for consolidated OpenAI brand) ---
245
  _GH_MODELS = {
246
- "gpt-4o", "gpt-4o-mini"
247
  }
248
 
249
  # Global HTTP client
@@ -388,7 +388,8 @@ async def stream_chat(request: MultiChatRequest):
388
  "2. Break your answer into clear, numbered parts (1., 2., 3., etc.).\n"
389
  "3. Within each part, use bold markers for terms (e.g., '- **Key Term**: Definition').\n"
390
  "4. Never mention your underlying provider or model (GPT, Llama, NVIDIA, etc.).\n"
391
- "5. Keep the tone educational and highly structured as seen in professional AI documentation."
 
392
  )
393
 
394
  system_modifiers = {
@@ -412,6 +413,7 @@ async def stream_chat(request: MultiChatRequest):
412
  final_prompt = f"System Instruction: {system_prompt}\n\nUser Question: {request.prompt}"
413
 
414
  async def event_generator():
 
415
  try:
416
  # --- ULTIMATE 11-MODEL POWERHOUSE REGISTRY ---
417
  MODEL_MAP = {
@@ -419,32 +421,38 @@ async def stream_chat(request: MultiChatRequest):
419
  "openai/gpt-4o": ("gpt-4o", "github"),
420
  "meta-llama/llama-4-scout": ("meta-llama/llama-4-scout-17b-16e-instruct", "groq"),
421
  "qwen/qwen3-32b": ("qwen/qwen3-32b", "groq"),
422
- "moonshot/kimi-k2": ("moonshotai/kimi-k2-instruct", "groq"),
423
  "nvidia/phi-4": ("microsoft/phi-4-multimodal-instruct", "nvidia"),
424
  "nvidia/glm4-7": ("z-ai/glm4.7", "nvidia"),
425
- "nvidia/deepseek-v32": ("deepseek-ai/deepseek-v3.2", "nvidia"),
426
  "google/gemini-3.1-flash-lite": ("gemini-3.1-flash-lite-preview", "google"),
427
  "arcee/trinity-large": ("arcee-ai/trinity-large-preview:free", "openrouter"),
428
  "minimax/minimax-m2.5": ("minimax/minimax-m2.5:free", "openrouter"),
429
  "liquid/lfm-2.5": ("liquid/lfm-2.5-1.2b-instruct:free", "openrouter"),
430
- "xai/mistral-small": ("command-r-08-2024", "github")
 
 
 
431
  }
432
 
433
  # --- BRAND IDENTITY MAPPING ---
434
  BRAND_NAME_MAP = {
435
  "openai/gpt-4o-mini": "GPT-4o Mini",
436
- "openai/gpt-4o": "GPT-4o Pro",
437
  "meta-llama/llama-4-scout": "Llama 4 Scout",
438
  "qwen/qwen3-32b": "Qwen 3-32B",
439
  "moonshot/kimi-k2": "Kimi K2",
440
  "nvidia/phi-4": "Phi-4 Multimodal",
441
  "nvidia/glm4-7": "GLM 4.7 Reasoning",
442
- "nvidia/deepseek-v32": "DeepSeek V3.2",
443
  "google/gemini-3.1-flash-lite": "Gemini 3.1 Flash Lite",
444
  "arcee/trinity-large": "Trinity Large",
445
  "minimax/minimax-m2.5": "Minimax M2.5",
446
  "liquid/lfm-2.5": "Liquid LFM",
447
- "xai/mistral-small": "Cohere Command R"
 
 
 
448
  }
449
 
450
  print(f"DEBUG: target_model = '{target_model}'")
@@ -530,9 +538,9 @@ async def stream_chat(request: MultiChatRequest):
530
  headers=headers) as response:
531
  if response.status_code != 200:
532
  err_body = await response.aread()
533
- print(f"❌ [Provider {provider} Error]: {response.status_code} - {err_body.decode()}")
534
- yield f"data: {json.dumps({'error': f'Provider {provider} Error: {err_body.decode()}'})}\n\n"
535
- return
536
 
537
  # Tracking generated tokens (approximation)
538
  token_increment = 0
@@ -545,9 +553,17 @@ async def stream_chat(request: MultiChatRequest):
545
  if data_str == "[DONE]": break
546
  try:
547
  data_json = json.loads(data_str)
548
- chunk = data_json["choices"][0].get("delta", {}).get("content", "")
 
 
 
 
 
 
 
549
  if chunk:
550
  token_increment += 1
 
551
  yield f"data: {json.dumps({'chunk': chunk})}\n\n"
552
  except: pass
553
 
@@ -566,7 +582,9 @@ async def stream_chat(request: MultiChatRequest):
566
  if line.startswith("data:") and "[DONE]" not in line:
567
  try:
568
  chunk = json.loads(line[5:])["choices"][0].get("delta", {}).get("content", "")
569
- if chunk: yield f"data: {json.dumps({'chunk': chunk})}\n\n"
 
 
570
  except: pass
571
  elif provider == "local_proxy":
572
  # Special local proxy for Node-based free models (DDG/G4F/FCM)
@@ -579,10 +597,12 @@ async def stream_chat(request: MultiChatRequest):
579
  ) as response:
580
  async for line in response.aiter_lines():
581
  if line.startswith("data: "):
 
582
  yield f"{line}\n\n"
583
  return
584
  elif provider == "nvidia":
585
  async for chunk in nv_client.async_stream_request(model_to_use, request.prompt, request.system_prompt):
 
586
  yield f"data: {json.dumps({'chunk': chunk})}\n\n"
587
  elif provider == "github":
588
  # Check for dynamic tokens (Mistral Small)
@@ -591,12 +611,15 @@ async def stream_chat(request: MultiChatRequest):
591
  specific_token = get_api_key_rotated("GITHUB_TOKEN", 2)
592
 
593
  async for chunk in gh_client.async_stream_request(model_to_use, request.prompt, request.system_prompt, api_key=specific_token):
 
594
  yield f"data: {json.dumps({'chunk': chunk})}\n\n"
595
  elif provider == "hf":
596
  async for chunk in hf_client.async_stream_request(model_to_use, request.prompt, request.system_prompt):
 
597
  yield f"data: {json.dumps({'chunk': chunk})}\n\n"
598
  elif provider == "google":
599
  async for chunk in goog_client.async_stream_request(model_to_use, request.prompt, request.system_prompt):
 
600
  yield f"data: {json.dumps({'chunk': chunk})}\n\n"
601
  else:
602
  yield f"data: {json.dumps({'error': f'Unknown provider {provider}'})}\n\n"
@@ -608,6 +631,16 @@ async def stream_chat(request: MultiChatRequest):
608
  for word in fallback_text.split(" "):
609
  yield f"data: {json.dumps({'chunk': word + ' '})}\n\n"
610
  await asyncio.sleep(0.01)
 
 
 
 
 
 
 
 
 
 
611
  headers = {
612
  "X-Accel-Buffering": "no",
613
  "Cache-Control": "no-cache",
@@ -1223,7 +1256,8 @@ async def image_generate(request: ImageGenerateRequest):
1223
  "black-forest-labs/FLUX.1-schnell": ("huggingface", "black-forest-labs/FLUX.1-schnell"),
1224
  "Qwen/Qwen-Image": ("huggingface", "Qwen/Qwen-Image-2512"),
1225
  "tencent/HunyuanImage-3.0": ("huggingface", "tencent/HunyuanImage-3.0"),
1226
- "ByteDance/SDXL-Lightning": ("huggingface", "ByteDance/SDXL-Lightning")
 
1227
  }
1228
 
1229
  provider, target_model = model_map.get(request.model_id, ("pollinations", "flux"))
 
243
 
244
  # --- GitHub model IDs (for consolidated OpenAI brand) ---
245
  _GH_MODELS = {
246
+ "openai/gpt-4o", "gpt-4o-mini"
247
  }
248
 
249
  # Global HTTP client
 
388
  "2. Break your answer into clear, numbered parts (1., 2., 3., etc.).\n"
389
  "3. Within each part, use bold markers for terms (e.g., '- **Key Term**: Definition').\n"
390
  "4. Never mention your underlying provider or model (GPT, Llama, NVIDIA, etc.).\n"
391
+ "5. Keep the tone educational and highly structured as seen in professional AI documentation.\n"
392
+ "6. If you are reasoning or thinking, wrap your thoughts in <thought>...</thought> tags."
393
  )
394
 
395
  system_modifiers = {
 
413
  final_prompt = f"System Instruction: {system_prompt}\n\nUser Question: {request.prompt}"
414
 
415
  async def event_generator():
416
+ yield_count = 0
417
  try:
418
  # --- ULTIMATE 11-MODEL POWERHOUSE REGISTRY ---
419
  MODEL_MAP = {
 
421
  "openai/gpt-4o": ("gpt-4o", "github"),
422
  "meta-llama/llama-4-scout": ("meta-llama/llama-4-scout-17b-16e-instruct", "groq"),
423
  "qwen/qwen3-32b": ("qwen/qwen3-32b", "groq"),
424
+ "moonshot/kimi-k2": ("moonshotai/kimi-k2-instruct-0905", "groq"),
425
  "nvidia/phi-4": ("microsoft/phi-4-multimodal-instruct", "nvidia"),
426
  "nvidia/glm4-7": ("z-ai/glm4.7", "nvidia"),
427
+ "nvidia/deepseek-v32": ("deepseek-ai/deepseek-v3", "nvidia"),
428
  "google/gemini-3.1-flash-lite": ("gemini-3.1-flash-lite-preview", "google"),
429
  "arcee/trinity-large": ("arcee-ai/trinity-large-preview:free", "openrouter"),
430
  "minimax/minimax-m2.5": ("minimax/minimax-m2.5:free", "openrouter"),
431
  "liquid/lfm-2.5": ("liquid/lfm-2.5-1.2b-instruct:free", "openrouter"),
432
+ "nvidia/mistral-large-3": ("mistralai/mistral-large-3-675b-instruct-2512", "nvidia"),
433
+ "openrouter/nemotron-3-super": ("nvidia/nemotron-3-super-120b-a12b:free", "openrouter"),
434
+ "huggingface/minimax-m2.7": ("MiniMaxAI/MiniMax-M2.7:together", "huggingface"),
435
+ "huggingface/glm-5.1": ("zai-org/GLM-5.1:together", "huggingface")
436
  }
437
 
438
  # --- BRAND IDENTITY MAPPING ---
439
  BRAND_NAME_MAP = {
440
  "openai/gpt-4o-mini": "GPT-4o Mini",
441
+ "openai/gpt-4o": "GPT-5 Mini",
442
  "meta-llama/llama-4-scout": "Llama 4 Scout",
443
  "qwen/qwen3-32b": "Qwen 3-32B",
444
  "moonshot/kimi-k2": "Kimi K2",
445
  "nvidia/phi-4": "Phi-4 Multimodal",
446
  "nvidia/glm4-7": "GLM 4.7 Reasoning",
447
+ "nvidia/deepseek-v32": "DeepSeek V3",
448
  "google/gemini-3.1-flash-lite": "Gemini 3.1 Flash Lite",
449
  "arcee/trinity-large": "Trinity Large",
450
  "minimax/minimax-m2.5": "Minimax M2.5",
451
  "liquid/lfm-2.5": "Liquid LFM",
452
+ "nvidia/mistral-large-3": "Mistral Large 3",
453
+ "openrouter/nemotron-3-super": "Nemotron-3 Super",
454
+ "huggingface/minimax-m2.7": "MiniMax M2.7",
455
+ "huggingface/glm-5.1": "GLM 5.1 Reasoning"
456
  }
457
 
458
  print(f"DEBUG: target_model = '{target_model}'")
 
538
  headers=headers) as response:
539
  if response.status_code != 200:
540
  err_body = await response.aread()
541
+ error_msg = f"Provider {provider} Error: {response.status_code} - {err_body.decode()}"
542
+ print(f" {error_msg}")
543
+ raise Exception(error_msg)
544
 
545
  # Tracking generated tokens (approximation)
546
  token_increment = 0
 
553
  if data_str == "[DONE]": break
554
  try:
555
  data_json = json.loads(data_str)
556
+ delta = data_json["choices"][0].get("delta", {})
557
+
558
+ # Support for Thinking/Reasoning (Groq Qwen/DeepSeek etc)
559
+ reasoning = delta.get("reasoning_content")
560
+ if reasoning:
561
+ yield f"data: {json.dumps({'chunk': f'<thought>{reasoning}</thought>'})}\n\n"
562
+
563
+ chunk = delta.get("content", "")
564
  if chunk:
565
  token_increment += 1
566
+ yield_count += 1
567
  yield f"data: {json.dumps({'chunk': chunk})}\n\n"
568
  except: pass
569
 
 
582
  if line.startswith("data:") and "[DONE]" not in line:
583
  try:
584
  chunk = json.loads(line[5:])["choices"][0].get("delta", {}).get("content", "")
585
+ if chunk:
586
+ yield_count += 1
587
+ yield f"data: {json.dumps({'chunk': chunk})}\n\n"
588
  except: pass
589
  elif provider == "local_proxy":
590
  # Special local proxy for Node-based free models (DDG/G4F/FCM)
 
597
  ) as response:
598
  async for line in response.aiter_lines():
599
  if line.startswith("data: "):
600
+ yield_count += 1
601
  yield f"{line}\n\n"
602
  return
603
  elif provider == "nvidia":
604
  async for chunk in nv_client.async_stream_request(model_to_use, request.prompt, request.system_prompt):
605
+ yield_count += 1
606
  yield f"data: {json.dumps({'chunk': chunk})}\n\n"
607
  elif provider == "github":
608
  # Check for dynamic tokens (Mistral Small)
 
611
  specific_token = get_api_key_rotated("GITHUB_TOKEN", 2)
612
 
613
  async for chunk in gh_client.async_stream_request(model_to_use, request.prompt, request.system_prompt, api_key=specific_token):
614
+ yield_count += 1
615
  yield f"data: {json.dumps({'chunk': chunk})}\n\n"
616
  elif provider == "hf":
617
  async for chunk in hf_client.async_stream_request(model_to_use, request.prompt, request.system_prompt):
618
+ yield_count += 1
619
  yield f"data: {json.dumps({'chunk': chunk})}\n\n"
620
  elif provider == "google":
621
  async for chunk in goog_client.async_stream_request(model_to_use, request.prompt, request.system_prompt):
622
+ yield_count += 1
623
  yield f"data: {json.dumps({'chunk': chunk})}\n\n"
624
  else:
625
  yield f"data: {json.dumps({'error': f'Unknown provider {provider}'})}\n\n"
 
631
  for word in fallback_text.split(" "):
632
  yield f"data: {json.dumps({'chunk': word + ' '})}\n\n"
633
  await asyncio.sleep(0.01)
634
+
635
+ # --- REAL FALLBACK EXECUTION ---
636
+ try:
637
+ # Use GPT-4o Mini on GitHub as the ultimate reliable fallback
638
+ gh_client.api_key = get_api_key_rotated("GITHUB_API_KEY", 2, index=0)
639
+ async for chunk in gh_client.async_stream_request("gpt-4o-mini", request.prompt, request.system_prompt):
640
+ yield f"data: {json.dumps({'chunk': chunk})}\n\n"
641
+ except Exception as fallback_err:
642
+ print(f"💀 [Critical Fallback Failed]: {fallback_err}")
643
+ yield f"data: {json.dumps({'error': 'All providers exhausted. Please try again later.'})}\n\n"
644
  headers = {
645
  "X-Accel-Buffering": "no",
646
  "Cache-Control": "no-cache",
 
1256
  "black-forest-labs/FLUX.1-schnell": ("huggingface", "black-forest-labs/FLUX.1-schnell"),
1257
  "Qwen/Qwen-Image": ("huggingface", "Qwen/Qwen-Image-2512"),
1258
  "tencent/HunyuanImage-3.0": ("huggingface", "tencent/HunyuanImage-3.0"),
1259
+ "ByteDance/SDXL-Lightning": ("huggingface", "ByteDance/SDXL-Lightning"),
1260
+ "baidu/ERNIE-Image-Turbo": ("huggingface", "baidu/ERNIE-Image-Turbo")
1261
  }
1262
 
1263
  provider, target_model = model_map.get(request.model_id, ("pollinations", "flux"))
nvidia_client.py CHANGED
@@ -12,7 +12,7 @@ class NvidiaClient(AIClient):
12
  def __init__(self):
13
  self.api_keys = []
14
  # Support multiple keys if available (NVIDIA_API_KEY, NVIDIA_API_KEY2, etc.)
15
- for key in ["NVIDIA_API_KEY", "NVIDIA_API_KEY2", "NVIDIA_API_KEY3"]:
16
  val = os.getenv(key)
17
  if val:
18
  self.api_keys.append(val.strip())
@@ -50,11 +50,15 @@ class NvidiaClient(AIClient):
50
  "model": model_id,
51
  "messages": messages,
52
  "stream": True,
53
- "max_tokens": kwargs.get("max_tokens", 1024),
54
- "temperature": kwargs.get("temperature", 0.5),
55
- "top_p": 1.0
56
  }
57
 
 
 
 
 
58
  try:
59
  async with self.client.stream("POST", url, headers=headers, json=payload) as response:
60
  if response.status_code != 200:
 
12
  def __init__(self):
13
  self.api_keys = []
14
  # Support multiple keys if available (NVIDIA_API_KEY, NVIDIA_API_KEY2, etc.)
15
+ for key in ["NVIDIA_API_KEY", "NVIDIA_API_KEY2", "NVIDIA_API_KEY3", "NVIDIA_API_KEY4"]:
16
  val = os.getenv(key)
17
  if val:
18
  self.api_keys.append(val.strip())
 
50
  "model": model_id,
51
  "messages": messages,
52
  "stream": True,
53
+ "max_tokens": kwargs.get("max_tokens", 8192 if "deepseek" in model_id.lower() or "mistral-large-3" in model_id.lower() else 1024),
54
+ "temperature": kwargs.get("temperature", 0.15 if "mistral-large-3" in model_id.lower() else (0.2 if "deepseek" in model_id.lower() else 0.5)),
55
+ "top_p": 1.0 if "mistral-large-3" in model_id.lower() else (0.7 if "deepseek" in model_id.lower() else 1.0)
56
  }
57
 
58
+ # Handle DeepSeek thinking
59
+ if "deepseek" in model_id.lower():
60
+ payload["extra_body"] = {"chat_template_kwargs": {"thinking": True}}
61
+
62
  try:
63
  async with self.client.stream("POST", url, headers=headers, json=payload) as response:
64
  if response.status_code != 200: