Keeby-smilyai committed on
Commit 2e319c6 · verified · 1 Parent(s): 3319054

Update app.py

Files changed (1)
  1. app.py +386 -344
app.py CHANGED
@@ -13,7 +13,6 @@ from tokenizers import Tokenizer
  import numpy as np
  import time
  from typing import Dict, Any, List
- import asyncio
 
  # ============================================================================
  # Configuration
@@ -263,8 +262,7 @@ def generate_tokens(
  temperature: float = 0.8,
  top_k: int = 40,
  top_p: float = 0.9,
- repetition_penalty: float = 1.1,
- stop_sequences: List[str] = None
  ):
  """Generator that yields tokens one at a time"""
  if len(input_ids) > config['max_position_embeddings'] - max_tokens:
@@ -322,17 +320,17 @@ def generate_tokens(
  input_tensor = input_tensor[:, -config['max_position_embeddings']:]
 
  # ============================================================================
- # API Functions
  # ============================================================================
 
  def chat_completion_api(
  messages_json: str,
- max_tokens: int = 512,
- temperature: float = 0.8,
- top_p: float = 0.9,
- top_k: int = 40,
- repetition_penalty: float = 1.1,
- stream: bool = False
  ) -> str:
  """OpenAI-style chat completion API"""
  try:
@@ -358,83 +356,58 @@ def chat_completion_api(
 
  start_time = time.time()
  token_count = 0
 
- if stream:
- # Streaming response
- response_text = ""
- for token_id in generate_tokens(
- input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
- ):
- token_text = tokenizer.decode([token_id])
- response_text += token_text
- token_count += 1
-
- # Check for end token
- if "<|im_end|>" in response_text:
- response_text = response_text.split("<|im_end|>")[0]
- break
-
- # Yield streaming chunk (SSE format)
- yield f"data: {json.dumps({'choices': [{'delta': {'content': token_text}, 'index': 0}]})}\n\n"
-
- elapsed = time.time() - start_time
-
- # Final chunk
- yield f"data: {json.dumps({'choices': [{'finish_reason': 'stop', 'index': 0}], 'usage': {'completion_tokens': token_count, 'total_tokens': len(input_ids) + token_count}, 'stats': {'elapsed_sec': round(elapsed, 2), 'tokens_per_sec': round(token_count / elapsed if elapsed > 0 else 0, 1)}})}\n\n"
- yield "data: [DONE]\n\n"
- else:
- # Non-streaming response
- response_text = ""
- for token_id in generate_tokens(
- input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
- ):
- token_text = tokenizer.decode([token_id])
- response_text += token_text
- token_count += 1
-
- if "<|im_end|>" in response_text:
- response_text = response_text.split("<|im_end|>")[0]
- break
-
- elapsed = time.time() - start_time
-
- result = {
- "id": f"chatcmpl-{int(time.time())}",
- "object": "chat.completion",
- "created": int(time.time()),
- "model": "sam-z-1",
- "choices": [{
- "index": 0,
- "message": {
- "role": "assistant",
- "content": response_text.strip()
- },
- "finish_reason": "stop"
- }],
- "usage": {
- "prompt_tokens": len(input_ids),
- "completion_tokens": token_count,
- "total_tokens": len(input_ids) + token_count
  },
- "stats": {
- "elapsed_sec": round(elapsed, 2),
- "tokens_per_sec": round(token_count / elapsed if elapsed > 0 else 0, 1)
- }
  }
-
- return json.dumps(result, indent=2)
 
  except Exception as e:
  return json.dumps({"error": str(e)}, indent=2)
 
  def text_completion_api(
  prompt: str,
- max_tokens: int = 512,
- temperature: float = 0.8,
- top_p: float = 0.9,
- top_k: int = 40,
- repetition_penalty: float = 1.1,
- stream: bool = False
  ) -> str:
  """OpenAI-style text completion API"""
  try:
@@ -442,61 +415,45 @@ def text_completion_api(
 
  start_time = time.time()
  token_count = 0
 
- if stream:
- response_text = ""
- for token_id in generate_tokens(
- input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
- ):
- token_text = tokenizer.decode([token_id])
- response_text += token_text
- token_count += 1
-
- yield f"data: {json.dumps({'choices': [{'text': token_text, 'index': 0}]})}\n\n"
-
- elapsed = time.time() - start_time
-
- yield f"data: {json.dumps({'choices': [{'finish_reason': 'stop', 'index': 0}], 'usage': {'completion_tokens': token_count, 'total_tokens': len(input_ids) + token_count}, 'stats': {'elapsed_sec': round(elapsed, 2), 'tokens_per_sec': round(token_count / elapsed if elapsed > 0 else 0, 1)}})}\n\n"
- yield "data: [DONE]\n\n"
- else:
- response_text = ""
- for token_id in generate_tokens(
- input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
- ):
- token_text = tokenizer.decode([token_id])
- response_text += token_text
- token_count += 1
-
- elapsed = time.time() - start_time
-
- result = {
- "id": f"cmpl-{int(time.time())}",
- "object": "text_completion",
- "created": int(time.time()),
- "model": "sam-z-1",
- "choices": [{
- "text": response_text,
- "index": 0,
- "finish_reason": "stop"
- }],
- "usage": {
- "prompt_tokens": len(input_ids),
- "completion_tokens": token_count,
- "total_tokens": len(input_ids) + token_count
- },
- "stats": {
- "elapsed_sec": round(elapsed, 2),
- "tokens_per_sec": round(token_count / elapsed if elapsed > 0 else 0, 1)
- }
  }
-
- return json.dumps(result, indent=2)
 
  except Exception as e:
  return json.dumps({"error": str(e)}, indent=2)
 
  # ============================================================================
- # Gradio UI
  # ============================================================================
 
  custom_css = """
@@ -521,16 +478,6 @@ custom_css = """
  border-left: 4px solid #667eea;
  margin: 1rem 0;
  }
-
- .code-block {
- background: #282c34;
- color: #abb2bf;
- padding: 1rem;
- border-radius: 6px;
- font-family: 'Monaco', 'Menlo', monospace;
- font-size: 0.9rem;
- overflow-x: auto;
- }
  """
 
  with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as demo:
@@ -548,8 +495,8 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  # ========== Chat Completion Tab ==========
  with gr.Tab("💬 Chat Completion"):
  gr.Markdown("""
- ### `/v1/chat/completions` Endpoint
- OpenAI-compatible chat completion API with streaming support
  """)
 
  with gr.Row():
@@ -572,7 +519,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  chat_top_k = gr.Slider(1, 100, 40, step=1, label="Top K")
 
  chat_rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
- chat_stream = gr.Checkbox(label="Stream Response", value=False)
 
  chat_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
@@ -584,48 +531,36 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  )
 
  gr.Markdown("""
- ### Python Example
  ```python
- import requests
- import json
 
- # For Hugging Face Spaces
- API_URL = "https://YOUR-SPACE.hf.space" # Your Space URL
 
  messages = [
  {"role": "user", "content": "Hello! Who are you?"}
  ]
 
- response = requests.post(
- f"{API_URL}/chat/completions",
- json={
- "messages": messages,
- "max_tokens": 512,
- "temperature": 0.8,
- "stream": False
- }
  )
 
- print(response.json())
- ```
-
- ### cURL Example
- ```bash
- curl -X POST "https://YOUR-SPACE.hf.space/chat/completions" \\
- -H "Content-Type: application/json" \\
- -d '{
- "messages": [{"role": "user", "content": "Hello!"}],
- "max_tokens": 512,
- "temperature": 0.8
- }'
  ```
  """)
 
  # ========== Text Completion Tab ==========
  with gr.Tab("📝 Text Completion"):
  gr.Markdown("""
- ### `/v1/completions` Endpoint
- OpenAI-compatible text completion API
  """)
 
  with gr.Row():
@@ -645,7 +580,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  text_top_k = gr.Slider(1, 100, 40, step=1, label="Top K")
 
  text_rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
- text_stream = gr.Checkbox(label="Stream Response", value=False)
 
  text_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
@@ -657,22 +592,24 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  )
 
  gr.Markdown("""
- ### Python Example
  ```python
- import requests
-
- API_URL = "https://YOUR-SPACE.hf.space"
-
- response = requests.post(
- f"{API_URL}/completions",
- json={
- "prompt": "Once upon a time",
- "max_tokens": 512,
- "temperature": 0.8
- }
  )
 
- print(response.json())
  ```
  """)
 
@@ -688,30 +625,117 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  - **Context Length**: {config['max_position_embeddings']} tokens
  - **Vocabulary Size**: {config['vocab_size']}
 
- ## Available Endpoints
 
- ### 1. Chat Completions
- **Endpoint**: `/v1/chat/completions` (OpenAI compatible)
 
- **Request Format**:
- ```json
- {{
- "messages": [
- {{"role": "user", "content": "Hello!"}}
- ],
- "max_tokens": 512,
- "temperature": 0.8,
- "top_p": 0.9,
- "top_k": 40,
- "repetition_penalty": 1.1,
- "stream": false
  }}
  ```
 
- **Response Format**:
  ```json
  {{
- "id": "chatcmpl-123",
  "object": "chat.completion",
  "created": 1234567890,
  "model": "sam-z-1",
@@ -719,7 +743,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  "index": 0,
  "message": {{
  "role": "assistant",
- "content": "Hello! I'm SAM-Z-1..."
  }},
  "finish_reason": "stop"
  }}],
@@ -735,31 +759,15 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  }}
  ```
 
- ### 2. Text Completions
- **Endpoint**: `/v1/completions`
-
- **Request Format**:
- ```json
- {{
- "prompt": "Once upon a time",
- "max_tokens": 512,
- "temperature": 0.8,
- "top_p": 0.9,
- "top_k": 40,
- "repetition_penalty": 1.1,
- "stream": false
- }}
- ```
-
- **Response Format**:
  ```json
  {{
- "id": "cmpl-123",
  "object": "text_completion",
  "created": 1234567890,
  "model": "sam-z-1",
  "choices": [{{
- "text": " in a distant galaxy...",
  "index": 0,
  "finish_reason": "stop"
  }}],
@@ -767,187 +775,221 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as d
  "prompt_tokens": 5,
  "completion_tokens": 15,
  "total_tokens": 20
  }}
  }}
  ```
 
- ## Parameters
 
- | Parameter | Type | Default | Description |
- |-----------|------|---------|-------------|
- | `max_tokens` | int | 512 | Maximum tokens to generate |
- | `temperature` | float | 0.8 | Sampling temperature (0.1-2.0) |
- | `top_p` | float | 0.9 | Nucleus sampling threshold |
- | `top_k` | int | 40 | Top-K sampling |
- | `repetition_penalty` | float | 1.1 | Penalty for repeated tokens |
- | `stream` | bool | false | Enable streaming responses |
-
- ## Streaming Responses
 
- When `stream=true`, the API returns Server-Sent Events (SSE):
 
- ```
- data: {{"choices": [{{"delta": {{"content": "Hello"}}, "index": 0}}]}}
 
- data: {{"choices": [{{"delta": {{"content": " there"}}, "index": 0}}]}}
 
- data: {{"choices": [{{"finish_reason": "stop", "index": 0}}]}}
 
- data: [DONE]
- ```
 
  ## Rate Limits & Performance
 
- - **Optimized for CPU**: Uses TensorFlow graph optimization
  - **Average Speed**: 10-20 tokens/sec on CPU
  - **Context Window**: {config['max_position_embeddings']} tokens
- - **Concurrent Requests**: Supported via Gradio queue
 
  ## Error Handling
 
- Errors return JSON with error description:
- ```json
- {{
- "error": "Error message here"
- }}
- ```
-
- ## Usage Tips
-
- 1. **Lower temperature** (0.3-0.5) for factual responses
- 2. **Higher temperature** (0.8-1.2) for creative content
- 3. **Use streaming** for better UX in production
- 4. **Adjust top_k/top_p** to control diversity
- 5. **Increase repetition_penalty** if model repeats phrases
-
- ## Model Capabilities
-
- ✅ General conversation
- ✅ Question answering
- ✅ Code generation
- ✅ Creative writing
- ✅ Text completion
- ✅ Instruction following
-
- ❌ Does NOT use reasoning tokens (`<think>` tags)
- ❌ Not fine-tuned for specific domains
-
- ## Integration Examples
-
- ### Python (requests)
- ```python
- import requests
-
- def chat(message, history=[]):
- messages = history + [{{"role": "user", "content": message}}]
-
- response = requests.post(
- "https://YOUR-SPACE.hf.space/chat/completions",
- json={{"messages": messages, "temperature": 0.8}}
- )
-
- return response.json()["choices"][0]["message"]["content"]
- ```
-
- ### Python (streaming)
  ```python
- import requests
-
- def chat_stream(message):
- response = requests.post(
- "https://YOUR-SPACE.hf.space/chat/completions",
- json={{
- "messages": [{{"role": "user", "content": message}}],
- "stream": True
- }},
- stream=True
  )
 
- for line in response.iter_lines():
- if line:
- line = line.decode('utf-8')
- if line.startswith('data: '):
- data = line[6:]
- if data != '[DONE]':
- import json
- chunk = json.loads(data)
- if 'choices' in chunk:
- delta = chunk['choices'][0].get('delta', {{}})
- if 'content' in delta:
- print(delta['content'], end='', flush=True)
- ```
-
- ### JavaScript (fetch)
- ```javascript
- async function chat(message) {{
- const response = await fetch('https://YOUR-SPACE.hf.space/chat/completions', {{
- method: 'POST',
- headers: {{'Content-Type': 'application/json'}},
- body: JSON.stringify({{
- messages: [{{role: 'user', content: message}}],
- temperature: 0.8
- }})
- }});
-
- const data = await response.json();
- return data.choices[0].message.content;
- }}
- ```
-
- ### cURL
- ```bash
- curl -X POST https://YOUR-SPACE.hf.space/chat/completions \\
- -H "Content-Type: application/json" \\
- -d '{{
- "messages": [
- {{"role": "user", "content": "What is the capital of France?"}}
- ],
- "max_tokens": 100,
- "temperature": 0.7
- }}'
  ```
 
  ## Troubleshooting
 
- **Slow responses?**
  - Reduce `max_tokens`
  - Lower `top_k` value
- - Use smaller prompts
 
- **Repetitive output?**
  - Increase `repetition_penalty` (try 1.2-1.5)
  - Adjust `temperature` higher
  - Use `top_p` sampling
 
- **Incoherent output?**
  - Lower `temperature` (try 0.5-0.7)
  - Reduce `top_k` (try 20-30)
- - Ensure prompt is clear
 
  ---
 
- **Model**: SAM-Z-1 | **License**: Check model card on Hugging Face
  **Support**: Open an issue on the Space for bugs or questions
  """)
 
- # Event handlers for Chat Completion
  chat_btn.click(
  fn=chat_completion_api,
  inputs=[
  messages_input, chat_max_tokens, chat_temperature,
  chat_top_p, chat_top_k, chat_rep_penalty, chat_stream
  ],
- outputs=[chat_output]
  )
 
- # Event handlers for Text Completion
  text_btn.click(
  fn=text_completion_api,
  inputs=[
  prompt_input, text_max_tokens, text_temperature,
  text_top_p, text_top_k, text_rep_penalty, text_stream
  ],
- outputs=[text_output]
  )
 
  # Launch
 
  import numpy as np
  import time
  from typing import Dict, Any, List
 
  # ============================================================================
  # Configuration
  temperature: float = 0.8,
  top_k: int = 40,
  top_p: float = 0.9,
+ repetition_penalty: float = 1.1
  ):
  """Generator that yields tokens one at a time"""
  if len(input_ids) > config['max_position_embeddings'] - max_tokens:
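
This hunk only touches the signature: `repetition_penalty` becomes the final parameter and the unused `stop_sequences` argument is dropped. The penalty itself acts on the logits inside the generator; as a point of reference, a minimal sketch of the conventional CTRL-style rule (illustrative NumPy code, not the actual loop from app.py, which this diff does not show):

```python
import numpy as np

def apply_repetition_penalty(logits: np.ndarray, seen_ids: set, penalty: float = 1.1) -> np.ndarray:
    """Push down the logits of tokens that were already generated (CTRL-style)."""
    out = logits.copy()
    for tok in seen_ids:
        # Dividing a positive logit and multiplying a negative one
        # both reduce that token's probability after softmax.
        out[tok] = out[tok] / penalty if out[tok] > 0 else out[tok] * penalty
    return out
```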
 
  input_tensor = input_tensor[:, -config['max_position_embeddings']:]
 
  # ============================================================================
+ # API Functions - FIXED FOR GRADIO
  # ============================================================================
 
  def chat_completion_api(
  messages_json: str,
+ max_tokens: int,
+ temperature: float,
+ top_p: float,
+ top_k: int,
+ repetition_penalty: float,
+ stream: bool
  ) -> str:
  """OpenAI-style chat completion API"""
  try:
 
  start_time = time.time()
  token_count = 0
+ response_text = ""
 
+ for token_id in generate_tokens(
+ input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
+ ):
+ token_text = tokenizer.decode([token_id])
+ response_text += token_text
+ token_count += 1
+
+ if "<|im_end|>" in response_text:
+ response_text = response_text.split("<|im_end|>")[0]
+ break
+
+ elapsed = time.time() - start_time
+
+ result = {
+ "id": f"chatcmpl-{int(time.time())}",
+ "object": "chat.completion",
+ "created": int(time.time()),
+ "model": "sam-z-1",
+ "choices": [{
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": response_text.strip()
  },
+ "finish_reason": "stop"
+ }],
+ "usage": {
+ "prompt_tokens": len(input_ids),
+ "completion_tokens": token_count,
+ "total_tokens": len(input_ids) + token_count
+ },
+ "stats": {
+ "elapsed_sec": round(elapsed, 2),
+ "tokens_per_sec": round(token_count / elapsed if elapsed > 0 else 0, 1)
  }
+ }
+
+ return json.dumps(result, indent=2)
 
  except Exception as e:
  return json.dumps({"error": str(e)}, indent=2)
 
  def text_completion_api(
  prompt: str,
+ max_tokens: int,
+ temperature: float,
+ top_p: float,
+ top_k: int,
+ repetition_penalty: float,
+ stream: bool
  ) -> str:
  """OpenAI-style text completion API"""
  try:
 
  start_time = time.time()
  token_count = 0
+ response_text = ""
 
+ for token_id in generate_tokens(
+ input_ids, max_tokens, temperature, top_k, top_p, repetition_penalty
+ ):
+ token_text = tokenizer.decode([token_id])
+ response_text += token_text
+ token_count += 1
+
+ elapsed = time.time() - start_time
+
+ result = {
+ "id": f"cmpl-{int(time.time())}",
+ "object": "text_completion",
+ "created": int(time.time()),
+ "model": "sam-z-1",
+ "choices": [{
+ "text": response_text,
+ "index": 0,
+ "finish_reason": "stop"
+ }],
+ "usage": {
+ "prompt_tokens": len(input_ids),
+ "completion_tokens": token_count,
+ "total_tokens": len(input_ids) + token_count
+ },
+ "stats": {
+ "elapsed_sec": round(elapsed, 2),
+ "tokens_per_sec": round(token_count / elapsed if elapsed > 0 else 0, 1)
  }
+ }
+
+ return json.dumps(result, indent=2)
 
  except Exception as e:
  return json.dumps({"error": str(e)}, indent=2)
 
  # ============================================================================
+ # Gradio UI with API Routes
  # ============================================================================
 
  custom_css = """
  border-left: 4px solid #667eea;
  margin: 1rem 0;
  }
  """
 
  with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="SAM-Z-1 API") as demo:
 
  # ========== Chat Completion Tab ==========
  with gr.Tab("💬 Chat Completion"):
  gr.Markdown("""
+ ### Chat Completions API
+ OpenAI-compatible chat completion endpoint
  """)
 
  with gr.Row():
  chat_top_k = gr.Slider(1, 100, 40, step=1, label="Top K")
 
  chat_rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
+ chat_stream = gr.Checkbox(label="Stream Response (Not implemented in UI)", value=False)
 
  chat_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
 
  )
 
  gr.Markdown("""
+ ### Python Example with Gradio Client
  ```python
+ from gradio_client import Client
+ import json
 
+ client = Client("YOUR-SPACE-URL")
 
  messages = [
  {"role": "user", "content": "Hello! Who are you?"}
  ]
 
+ result = client.predict(
+ messages_json=json.dumps(messages),
+ max_tokens=512,
+ temperature=0.8,
+ top_p=0.9,
+ top_k=40,
+ repetition_penalty=1.1,
+ stream=False,
+ api_name="/chat_completions"
  )
 
+ print(result)
  ```
  """)
 
  # ========== Text Completion Tab ==========
  with gr.Tab("📝 Text Completion"):
  gr.Markdown("""
+ ### Text Completions API
+ OpenAI-compatible text completion endpoint
  """)
 
  with gr.Row():
  text_top_k = gr.Slider(1, 100, 40, step=1, label="Top K")
 
  text_rep_penalty = gr.Slider(1.0, 2.0, 1.1, step=0.1, label="Repetition Penalty")
+ text_stream = gr.Checkbox(label="Stream Response (Not implemented in UI)", value=False)
 
  text_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
 
  )
 
  gr.Markdown("""
+ ### Python Example with Gradio Client
  ```python
+ from gradio_client import Client
+
+ client = Client("YOUR-SPACE-URL")
+
+ result = client.predict(
+ prompt="Once upon a time",
+ max_tokens=512,
+ temperature=0.8,
+ top_p=0.9,
+ top_k=40,
+ repetition_penalty=1.1,
+ stream=False,
+ api_name="/text_completions"
  )
 
+ print(result)
  ```
  """)
 
  - **Context Length**: {config['max_position_embeddings']} tokens
  - **Vocabulary Size**: {config['vocab_size']}
 
+ ## Using the API
 
+ ### Method 1: Gradio Client (Recommended)
 
+ Install the Gradio client:
+ ```bash
+ pip install gradio_client
+ ```
+
+ **Chat Completion:**
+ ```python
+ from gradio_client import Client
+ import json
+
+ client = Client("https://YOUR-SPACE.hf.space")
+
+ messages = [
+ {{"role": "user", "content": "What is Python?"}}
+ ]
+
+ result = client.predict(
+ messages_json=json.dumps(messages),
+ max_tokens=512,
+ temperature=0.8,
+ top_p=0.9,
+ top_k=40,
+ repetition_penalty=1.1,
+ stream=False,
+ api_name="/chat_completions"
+ )
+
+ response = json.loads(result)
+ print(response["choices"][0]["message"]["content"])
+ ```
+
+ **Text Completion:**
+ ```python
+ result = client.predict(
+ prompt="Once upon a time",
+ max_tokens=512,
+ temperature=0.8,
+ top_p=0.9,
+ top_k=40,
+ repetition_penalty=1.1,
+ stream=False,
+ api_name="/text_completions"
+ )
+
+ response = json.loads(result)
+ print(response["choices"][0]["text"])
+ ```
+
+ ### Method 2: Direct HTTP Requests
+
+ **Chat Completion:**
+ ```python
+ import requests
+ import json
+
+ url = "https://YOUR-SPACE.hf.space/call/chat_completions"
+
+ payload = {{
+ "data": [
+ json.dumps([{{"role": "user", "content": "Hello!"}}]), # messages_json
+ 512, # max_tokens
+ 0.8, # temperature
+ 0.9, # top_p
+ 40, # top_k
+ 1.1, # repetition_penalty
+ False # stream
+ ]
  }}
+
+ response = requests.post(url, json=payload)
+ print(response.json())
  ```
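
One caveat on the `/call/...` route, based on Gradio's documented REST protocol rather than this Space's code: on recent Gradio versions the exchange is two-step. The POST returns only an `event_id`, and the output is then read from a follow-up GET as a server-sent event stream. If the single POST above returns no choices, a sketch of the two-step form:

```python
import requests
import json

url = "https://YOUR-SPACE.hf.space/call/chat_completions"
payload = {"data": [json.dumps([{"role": "user", "content": "Hello!"}]),
                    512, 0.8, 0.9, 40, 1.1, False]}

# Step 1: the POST only returns an event id
event_id = requests.post(url, json=payload).json()["event_id"]

# Step 2: the result is streamed back as server-sent events from a GET
with requests.get(f"{url}/{event_id}", stream=True) as resp:
    for line in resp.iter_lines():
        if line.startswith(b"data:"):
            print(line[5:].decode().strip())
```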
 
+ ## API Endpoints
+
+ ### Chat Completions
+ - **API Name**: `/chat_completions`
+ - **URL**: `https://YOUR-SPACE.hf.space/call/chat_completions`
+
+ **Parameters:**
+ 1. `messages_json` (str): JSON string of messages array
+ 2. `max_tokens` (int): Maximum tokens to generate (50-1024)
+ 3. `temperature` (float): Sampling temperature (0.1-2.0)
+ 4. `top_p` (float): Nucleus sampling threshold (0.1-1.0)
+ 5. `top_k` (int): Top-K sampling (1-100)
+ 6. `repetition_penalty` (float): Penalty for repetition (1.0-2.0)
+ 7. `stream` (bool): Stream response (UI only, not functional)
+
+ ### Text Completions
+ - **API Name**: `/text_completions`
+ - **URL**: `https://YOUR-SPACE.hf.space/call/text_completions`
+
+ **Parameters:**
+ 1. `prompt` (str): Text prompt
+ 2. `max_tokens` (int): Maximum tokens to generate
+ 3. `temperature` (float): Sampling temperature
+ 4. `top_p` (float): Nucleus sampling threshold
+ 5. `top_k` (int): Top-K sampling
+ 6. `repetition_penalty` (float): Penalty for repetition
+ 7. `stream` (bool): Stream response (UI only)
+
+ ## Response Format
+
+ **Chat Completion Response:**
  ```json
  {{
+ "id": "chatcmpl-1234567890",
  "object": "chat.completion",
  "created": 1234567890,
  "model": "sam-z-1",
 
  "index": 0,
  "message": {{
  "role": "assistant",
+ "content": "Response text here"
  }},
  "finish_reason": "stop"
  }}],
 
  }}
  ```
 
+ **Text Completion Response:**
  ```json
  {{
+ "id": "cmpl-1234567890",
  "object": "text_completion",
  "created": 1234567890,
  "model": "sam-z-1",
  "choices": [{{
+ "text": "Completion text here",
  "index": 0,
  "finish_reason": "stop"
  }}],
 
  "prompt_tokens": 5,
  "completion_tokens": 15,
  "total_tokens": 20
+ }},
+ "stats": {{
+ "elapsed_sec": 1.2,
+ "tokens_per_sec": 12.5
  }}
  }}
  ```
 
+ ## Complete Example Script
 
+ ```python
+ #!/usr/bin/env python3
+ """
+ SAM-Z-1 API Client Example
+ """
+ from gradio_client import Client
+ import json
+
+ # Initialize client
+ client = Client("https://YOUR-SPACE.hf.space")
+
+ def chat(message, history=[]):
+ \"\"\"Send a chat message\"\"\"
+ messages = history + [{{"role": "user", "content": message}}]
+
+ result = client.predict(
+ messages_json=json.dumps(messages),
+ max_tokens=512,
+ temperature=0.8,
+ top_p=0.9,
+ top_k=40,
+ repetition_penalty=1.1,
+ stream=False,
+ api_name="/chat_completions"
+ )
+
+ response = json.loads(result)
+ assistant_msg = response["choices"][0]["message"]["content"]
+
+ # Update history
+ history.append({{"role": "user", "content": message}})
+ history.append({{"role": "assistant", "content": assistant_msg}})
+
+ return assistant_msg, history
+
+ def complete(prompt):
+ \"\"\"Complete text\"\"\"
+ result = client.predict(
+ prompt=prompt,
+ max_tokens=512,
+ temperature=0.8,
+ top_p=0.9,
+ top_k=40,
+ repetition_penalty=1.1,
+ stream=False,
+ api_name="/text_completions"
+ )
+
+ response = json.loads(result)
+ return response["choices"][0]["text"]
+
+ # Example usage
+ if __name__ == "__main__":
+ # Chat example
+ print("=== Chat Example ===")
+ history = []
+
+ response, history = chat("Hello! Who are you?", history)
+ print(f"Assistant: {{response}}\\n")
+
+ response, history = chat("What can you help me with?", history)
+ print(f"Assistant: {{response}}\\n")
+
+ # Text completion example
+ print("\\n=== Text Completion Example ===")
+ completion = complete("Once upon a time in a distant galaxy")
+ print(f"Completion: {{completion}}")
+ ```
 
+ ## Parameters Guide
 
+ ### Temperature (0.1 - 2.0)
+ - **Low (0.1-0.5)**: More focused, deterministic, factual
+ - **Medium (0.6-0.9)**: Balanced creativity and coherence
+ - **High (1.0-2.0)**: More creative, diverse, unpredictable
 
+ ### Top-P (0.1 - 1.0)
+ - Controls diversity via nucleus sampling
+ - **0.9** (default): Good balance
+ - Lower values = more focused
+ - Higher values = more diverse
 
+ ### Top-K (1 - 100)
+ - Limits vocabulary to top K tokens
+ - **40** (default): Good balance
+ - Lower values = more focused
+ - Higher values = more diverse
 
+ ### Repetition Penalty (1.0 - 2.0)
+ - **1.0**: No penalty
+ - **1.1** (default): Slight penalty
+ - **1.5+**: Strong penalty (use if model repeats)
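
These knobs are applied to the same logit vector before a token is drawn. A compact sketch of how temperature, top-k, and top-p conventionally compose (illustrative NumPy only; the diff does not show the internals of the Space's generate_tokens):

```python
import numpy as np

def sample_token(logits: np.ndarray, temperature: float = 0.8,
                 top_k: int = 40, top_p: float = 0.9) -> int:
    logits = logits / temperature              # <1 sharpens, >1 flattens the distribution
    cand = np.argsort(logits)[-top_k:]         # top-k: keep the k best candidates
    p = np.exp(logits[cand] - logits[cand].max())
    p /= p.sum()
    order = np.argsort(p)[::-1]                # top-p: smallest set with mass >= top_p
    keep = order[np.cumsum(p[order]) - p[order] < top_p]
    p = p[keep] / p[keep].sum()
    return int(np.random.choice(cand[keep], p=p))
```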
 
  ## Rate Limits & Performance
 
+ - **Concurrent Requests**: Supported via Gradio queue
  - **Average Speed**: 10-20 tokens/sec on CPU
  - **Context Window**: {config['max_position_embeddings']} tokens
+ - **Queue Size**: Up to 20 concurrent requests
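
In Gradio, that queue limit is normally set when the queue is enabled; a sketch of the usual launch wiring (the actual launch call sits outside the hunks shown in this diff):

```python
# Cap the number of waiting requests before launching the app.
demo.queue(max_size=20)
demo.launch()
```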
 
  ## Error Handling
 
  ```python
+ try:
+ result = client.predict(
+ messages_json=json.dumps(messages),
+ max_tokens=512,
+ temperature=0.8,
+ top_p=0.9,
+ top_k=40,
+ repetition_penalty=1.1,
+ stream=False,
+ api_name="/chat_completions"
  )
+ response = json.loads(result)
 
+ if "error" in response:
+ print(f"API Error: {{response['error']}}")
+ else:
+ print(response["choices"][0]["message"]["content"])
+
+ except Exception as e:
+ print(f"Request failed: {{e}}")
  ```
 
  ## Troubleshooting
 
+ **Connection Issues:**
+ - Verify Space URL is correct
+ - Check if Space is running
+ - Ensure gradio_client is installed
+
+ **Slow Responses:**
  - Reduce `max_tokens`
  - Lower `top_k` value
+ - Use shorter prompts
 
+ **Repetitive Output:**
  - Increase `repetition_penalty` (try 1.2-1.5)
  - Adjust `temperature` higher
  - Use `top_p` sampling
 
+ **Incoherent Output:**
  - Lower `temperature` (try 0.5-0.7)
  - Reduce `top_k` (try 20-30)
+ - Ensure prompt is clear and well-formatted
+
+ ## Chat Template Format
+
+ The model uses ChatML format:
+ ```
+ <|im_start|>system
+ System message here<|im_end|>
+ <|im_start|>user
+ User message here<|im_end|>
+ <|im_start|>assistant
+ Assistant response here<|im_end|>
+ ```
+
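A small helper makes the template above concrete: render a messages list into ChatML, leaving the assistant header open so generation continues from it (a sketch consistent with the format shown, not code from app.py):

```python
def build_chatml_prompt(messages: list) -> str:
    """Render [{'role': ..., 'content': ...}, ...] as a ChatML prompt."""
    parts = [f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>" for m in messages]
    parts.append("<|im_start|>assistant\n")  # left open for the model to fill
    return "\n".join(parts)

# A system turn is how "use system messages to set behavior" is expressed:
prompt = build_chatml_prompt([
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Hello!"},
])
```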
+ ## Tips for Best Results
+
+ 1. **Use clear, specific prompts**
+ 2. **Lower temperature for factual tasks**
+ 3. **Higher temperature for creative tasks**
+ 4. **Adjust repetition penalty if model repeats phrases**
+ 5. **Keep context under {config['max_position_embeddings']} tokens**
+ 6. **Use system messages to set behavior**
+
+ ## Model Capabilities
+
+ ✅ General conversation
+ ✅ Question answering
+ ✅ Code generation
+ ✅ Creative writing
+ ✅ Text completion
+ ✅ Instruction following
+
+ ❌ Does NOT use reasoning tokens (`<think>` tags)
+ ❌ Not fine-tuned for specific domains
 
  ---
 
+ **Model**: SAM-Z-1 | **API Version**: 1.0
  **Support**: Open an issue on the Space for bugs or questions
  """)
 
+ # ========== API Routes - MUST USE api_name parameter ==========
  chat_btn.click(
  fn=chat_completion_api,
  inputs=[
  messages_input, chat_max_tokens, chat_temperature,
  chat_top_p, chat_top_k, chat_rep_penalty, chat_stream
  ],
+ outputs=[chat_output],
+ api_name="chat_completions" # This creates /call/chat_completions endpoint
  )
 
  text_btn.click(
  fn=text_completion_api,
  inputs=[
  prompt_input, text_max_tokens, text_temperature,
  text_top_p, text_top_k, text_rep_penalty, text_stream
  ],
+ outputs=[text_output],
+ api_name="text_completions" # This creates /call/text_completions endpoint
  )
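
Once `api_name` is set on both click handlers, the published endpoints can be verified from the client side; `gradio_client` can print the signatures it sees (Space URL is a placeholder):

```python
from gradio_client import Client

client = Client("https://YOUR-SPACE.hf.space")
client.view_api()  # should list /chat_completions and /text_completions with their parameters
```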
 
  # Launch