EGYADMIN committed on
Commit
cfa3f95
·
verified ·
1 Parent(s): 24a1793

Switch to Hugging Face Inference API for efficient model access

Browse files
Files changed (1) hide show
  1. app.py +104 -137
app.py CHANGED
@@ -1,160 +1,127 @@
1
  import gradio as gr
2
- import torch
3
  import os
4
- import gc
5
- from datetime import datetime
6
 
7
- # Patch for is_torch_fx_available if needed
8
- try:
9
- from transformers.utils import is_torch_fx_available
10
- except ImportError:
11
- import transformers.utils
12
- transformers.utils.is_torch_fx_available = lambda: False
13
- print("Patched is_torch_fx_available function")
14
-
15
- from transformers import AutoTokenizer, AutoModelForCausalLM
16
-
17
- # Model configuration
18
  MODEL_NAME = "moonshotai/Kimi-K2-Instruct"
19
- DEFAULT_SYSTEM_PROMPT = "You are Kimi, an AI assistant created by Moonshot AI."
20
-
21
- # Global variables
22
- model = None
23
- tokenizer = None
24
-
25
- def load_model():
26
- """Load the Kimi-K2-Instruct model with optimized settings"""""
27
- global model, tokenizer
28
-
29
- print("=" * 50)
30
- print(f"Starting at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
31
- print("=" * 50)
32
- print(f"CUDA available: {torch.cuda.is_available()}")
33
-
34
- if torch.cuda.is_available():
35
- print(f"GPU count: {torch.cuda.device_count()}")
36
- for i in range(torch.cuda.device_count()):
37
- gpu_name = torch.cuda.get_device_name(i)
38
- gpu_mem = torch.cuda.get_device_properties(i).total_memory / 1e9
39
- print(f" GPU {i}: {gpu_name} ({gpu_mem:.1f} GB)")
40
-
41
- # Clear GPU memory
42
- torch.cuda.empty_cache()
43
- gc.collect()
44
-
45
- print(f"\nLoading model: {MODEL_NAME}")
46
-
47
- try:
48
- # Load tokenizer first
49
- print("Loading tokenizer...")
50
- tokenizer = AutoTokenizer.from_pretrained(
51
- MODEL_NAME,
52
- trust_remote_code=True,
53
- token=os.environ.get("HF_TOKEN")
54
- )
55
- print("Tokenizer loaded successfully")
56
-
57
- # Load model with memory optimizations
58
- print("Loading model (this may take several minutes)...")
59
- model = AutoModelForCausalLM.from_pretrained(
60
- MODEL_NAME,
61
- torch_dtype=torch.bfloat16,
62
- device_map="auto",
63
- trust_remote_code=True,
64
- token=os.environ.get("HF_TOKEN"),
65
- low_cpu_mem_usage=True
66
- )
67
- print("Model loaded successfully!")
68
  return True
 
 
 
69
 
70
- except Exception as e:
71
- print(f"Error loading model: {str(e)}")
72
- import traceback
73
- traceback.print_exc()
74
- return False
75
-
76
- def generate_response(message, history, system_prompt, max_tokens, temperature, top_p):
77
- """Generate response from the model"""""
78
- global model, tokenizer
79
-
80
- if model is None or tokenizer is None:
81
- return "Model not loaded yet. Please wait..."
82
-
83
- try:
84
- messages = [{"role": "system", "content": system_prompt or DEFAULT_SYSTEM_PROMPT}]
85
-
86
- for h in history:
87
- if h[0]:
88
- messages.append({"role": "user", "content": h[0]})
89
- if h[1]:
90
- messages.append({"role": "assistant", "content": h[1]})
91
-
92
- messages.append({"role": "user", "content": message})
93
-
94
- inputs = tokenizer.apply_chat_template(
95
- messages,
96
- add_generation_prompt=True,
97
- return_tensors="pt"
98
- ).to(model.device)
99
-
100
- with torch.no_grad():
101
- outputs = model.generate(
102
- inputs,
103
- max_new_tokens=int(max_tokens),
104
- temperature=float(temperature),
105
- top_p=float(top_p),
106
- do_sample=True,
107
- pad_token_id=tokenizer.eos_token_id
108
- )
109
 
110
- response = tokenizer.decode(
111
- outputs[0][inputs.shape[1]:],
112
- skip_special_tokens=True
113
- )
114
- return response
115
 
116
- except Exception as e:
117
- return f"Error: {str(e)}"
 
118
 
119
- # Create interface
120
- print(f"\n===== Application Startup =====\n")
 
 
 
121
 
122
- model_loaded = load_model()
123
 
124
- with gr.Blocks(title="Kimi-K2-Instruct") as iface:
125
- gr.Markdown("# Kimi-K2-Instruct Chat")
 
 
 
 
 
126
 
127
- if not model_loaded:
128
- gr.Markdown("**Warning:** Model failed to load.")
129
 
130
- chatbot = gr.Chatbot(height=450)
 
131
 
132
- with gr.Row():
133
- msg = gr.Textbox(placeholder="Type here...", label="Message", scale=4)
134
- submit_btn = gr.Button("Send", variant="primary", scale=1)
 
 
 
135
 
136
- with gr.Accordion("Settings", open=False):
137
- system_prompt = gr.Textbox(value=DEFAULT_SYSTEM_PROMPT, label="System Prompt")
138
- with gr.Row():
139
- max_tokens = gr.Slider(64, 2048, 512, step=64, label="Max Tokens")
140
- temperature = gr.Slider(0.1, 2.0, 0.6, step=0.1, label="Temperature")
141
- top_p = gr.Slider(0.1, 1.0, 0.9, step=0.05, label="Top P")
 
142
 
143
- clear_btn = gr.Button("Clear")
 
144
 
145
- def respond(message, history, system_prompt, max_tokens, temperature, top_p):
146
- response = generate_response(message, history, system_prompt, max_tokens, temperature, top_p)
147
- history.append((message, response))
148
- return "", history
149
 
150
- msg.submit(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [msg, chatbot])
151
- submit_btn.click(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [msg, chatbot])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  clear_btn.click(lambda: [], None, chatbot)
153
 
154
  if __name__ == "__main__":
155
  iface.launch(server_name="0.0.0.0", server_port=7860)
156
- )
157
  )
158
- )
159
- )
160
- )
 
1
  import gradio as gr
 
2
  import os
3
+ from huggingface_hub import InferenceClient
 
4
 
5
+ # Model configuration - Using Inference API
 
 
 
 
 
 
 
 
 
 
6
  MODEL_NAME = "moonshotai/Kimi-K2-Instruct"
7
+ DEFAULT_SYSTEM_PROMPT = "You are Kimi, an AI assistant created by Moonshot AI. You are helpful, harmless, and honest."
8
+
9
+ # Initialize Inference Client
10
+ client = None
11
+
12
def init_client():
    """Initialize the global Hugging Face Inference Client.

    Reads the ``HF_TOKEN`` environment variable and, when present, binds a
    ready-to-use ``InferenceClient`` to the module-level ``client`` global.

    Returns:
        bool: True when the client was created, False when ``HF_TOKEN`` is
        missing (a warning is printed and ``client`` is left untouched).
    """
    # Fix: the committed docstring closed with five quotes ("""...""""") —
    # it only parsed by accidental implicit concatenation with an empty string.
    global client
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        client = InferenceClient(token=hf_token)
        print("Inference client initialized successfully")
        return True
    else:
        print("Warning: HF_TOKEN not found. Please set it in Space secrets.")
        return False
23
 
24
def generate_response(message, history, system_prompt, max_tokens, temperature):
    """Generate a chat reply using the Hugging Face Inference API.

    Args:
        message: The latest user message.
        history: List of (user, assistant) message pairs from the Chatbot.
        system_prompt: System prompt text; falls back to DEFAULT_SYSTEM_PROMPT
            when empty/None.
        max_tokens: Maximum tokens to generate (coerced to int).
        temperature: Sampling temperature (coerced to float).

    Returns:
        str: The assistant's reply, or a human-readable "Error: ..." string —
        this function never raises, so the UI always gets displayable text.
    """
    # Fix: the committed docstring closed with five quotes ("""...""""").
    global client

    # Lazy initialization so a late-configured HF_TOKEN still works.
    if client is None:
        if not init_client():
            return "Error: HF_TOKEN not configured. Please add it in Space settings."

    try:
        # Build the OpenAI-style message list: system, alternating history,
        # then the new user turn.
        messages = [{"role": "system", "content": system_prompt or DEFAULT_SYSTEM_PROMPT}]

        for h in history:
            if h[0]:
                messages.append({"role": "user", "content": h[0]})
            if h[1]:
                messages.append({"role": "assistant", "content": h[1]})

        messages.append({"role": "user", "content": message})

        # Call Inference API
        response = client.chat_completion(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=int(max_tokens),
            temperature=float(temperature)
        )

        return response.choices[0].message.content

    except Exception as e:
        # Surface API failures (rate limits, model cold starts, auth) as text.
        return f"Error: {str(e)}"
56
 
57
# Create interface
print("===== Kimi K2 Thinking Dev =====")
print(f"Using Inference API with model: {MODEL_NAME}")

# Initialize client at startup
client_ready = init_client()

with gr.Blocks(title="Kimi-K2 Chat", theme=gr.themes.Soft()) as iface:
    gr.Markdown("""
    # 🤖 Kimi-K2 Instruct Chat
    **Powered by Hugging Face Inference API**

    This space uses the Kimi-K2-Instruct model via API for efficient inference.
    """)
    # Fix: the committed Markdown string closed with """"") — five quotes —
    # which only parsed by accidental concatenation with an empty string.

    if not client_ready:
        gr.Markdown("⚠️ **Warning:** HF_TOKEN not found. Please configure it in Space secrets.")

    chatbot = gr.Chatbot(height=450, label="Chat")

    with gr.Row():
        msg = gr.Textbox(
            placeholder="Type your message here...",
            label="Your Message",
            scale=4,
            lines=2
        )
        submit_btn = gr.Button("Send 🚀", variant="primary", scale=1)

    with gr.Accordion("⚙️ Settings", open=False):
        system_prompt = gr.Textbox(
            value=DEFAULT_SYSTEM_PROMPT,
            label="System Prompt",
            lines=2
        )
        with gr.Row():
            max_tokens = gr.Slider(
                minimum=64,
                maximum=2048,
                value=512,
                step=64,
                label="Max Tokens"
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature"
            )

    clear_btn = gr.Button("🗑️ Clear Chat")

    def respond(message, history, system_prompt, max_tokens, temperature):
        """Handle one chat turn: skip blank input, otherwise call the API
        and append the (user, assistant) pair to the visible history."""
        if not message.strip():
            return "", history
        response = generate_response(message, history, system_prompt, max_tokens, temperature)
        history.append((message, response))
        return "", history

    # Enter key and Send button trigger the same handler.
    msg.submit(respond, [msg, chatbot, system_prompt, max_tokens, temperature], [msg, chatbot])
    submit_btn.click(respond, [msg, chatbot, system_prompt, max_tokens, temperature], [msg, chatbot])
    clear_btn.click(lambda: [], None, chatbot)

if __name__ == "__main__":
    # Fix: removed the five stray closing-paren lines that followed this call
    # in the committed file — they were a SyntaxError and would prevent the
    # Space from starting at all.
    iface.launch(server_name="0.0.0.0", server_port=7860)