AnatoliiG committed on
Commit
dd4c32e
·
1 Parent(s): 97ce0ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +155 -64
app.py CHANGED
@@ -10,18 +10,15 @@ from gradio import mount_gradio_app
10
  from huggingface_hub import hf_hub_download
11
  from llama_cpp import Llama
12
 
13
- # Конфигурация модели
14
  REPO_ID = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
15
  FILENAME = "qwen2.5-coder-7b-instruct-q5_k_m.gguf"
16
-
17
  CONTEXT_SIZE = 8192
18
- MAX_OUTPUT_TOKENS = 4096
19
 
20
  print(f"Loading model {REPO_ID}...")
21
  try:
22
  model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
23
-
24
- print("Initializing Llama...")
25
  llm = Llama(
26
  model_path=model_path,
27
  n_ctx=CONTEXT_SIZE,
@@ -30,11 +27,11 @@ try:
30
  verbose=True,
31
  )
32
  except Exception as e:
33
- print(f"Critical Error loading model: {e}")
34
  raise e
35
 
 
36
  app = FastAPI()
37
-
38
  app.add_middleware(
39
  CORSMiddleware,
40
  allow_origins=["*"],
@@ -51,12 +48,7 @@ async def chat_completions(request: Request):
51
  messages = data.get("messages", [])
52
  stream = data.get("stream", False)
53
  temperature = data.get("temperature", 0.4)
54
- max_tokens = data.get("max_tokens", MAX_OUTPUT_TOKENS)
55
-
56
- if not messages:
57
- return JSONResponse(
58
- content={"error": "No messages provided"}, status_code=400
59
- )
60
 
61
  output = llm.create_chat_completion(
62
  messages=messages,
@@ -68,61 +60,39 @@ async def chat_completions(request: Request):
68
  if stream:
69
 
70
  def iter_content():
71
- try:
72
- for chunk in output:
73
- yield f"data: {json.dumps(chunk)}\n\n"
74
- except Exception as e:
75
- print(f"Streaming error: {e}")
76
- err_chunk = {
77
- "choices": [
78
- {
79
- "delta": {"content": f"\n[ERROR]: {str(e)}"},
80
- "finish_reason": "error",
81
- }
82
- ]
83
- }
84
- yield f"data: {json.dumps(err_chunk)}\n\n"
85
- finally:
86
- yield "data: [DONE]\n\n"
87
-
88
- return StreamingResponse(
89
- iter_content(),
90
- media_type="text/event-stream",
91
- headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
92
- )
93
 
94
- return JSONResponse(content=output)
95
 
 
96
  except Exception as e:
97
- print(f"API Error: {e}")
98
  return JSONResponse(content={"error": str(e)}, status_code=500)
99
 
100
 
101
- def gradio_interface(message, history):
102
- messages = [
103
- {
104
- "role": "system",
105
- "content": "You are an expert coding assistant. Write clean, efficient code.",
106
- }
107
- ]
108
 
109
- history_subset = history[-10:] if len(history) > 10 else history
 
 
 
110
 
111
- for u, a in history_subset:
112
- messages.append({"role": "user", "content": u})
113
- messages.append({"role": "assistant", "content": a})
114
  messages.append({"role": "user", "content": message})
115
 
116
  partial_text = ""
117
  try:
118
- response_stream = llm.create_chat_completion(
119
  messages=messages,
120
- max_tokens=MAX_OUTPUT_TOKENS,
121
- temperature=0.4,
122
  stream=True,
123
  )
124
 
125
- for chunk in response_stream:
126
  delta = chunk["choices"][0]["delta"]
127
  if "content" in delta:
128
  partial_text += delta["content"]
@@ -130,20 +100,141 @@ def gradio_interface(message, history):
130
 
131
  except Exception as e:
132
  traceback.print_exc()
133
- error_msg = f"\n\n🚫 **Error:** {str(e)}\nTry refreshing the page or shortening the context."
134
- yield partial_text + error_msg
135
-
136
-
137
- demo = gr.ChatInterface(
138
- fn=gradio_interface,
139
- title="Qwen 2.5 Coder (7B-Instruct)",
140
- description="Running on CPU. Generation might be slow. Please be patient.",
141
- examples=[
142
- "Write a Python script to scrape a website.",
143
- "Explain how asyncio works in Python.",
144
- ],
 
 
 
 
145
  )
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  app = mount_gradio_app(app, demo, path="/")
148
 
149
  if __name__ == "__main__":
 
10
  from huggingface_hub import hf_hub_download
11
  from llama_cpp import Llama
12
 
13
+ # --- КОНФИГУРАЦИЯ ---
14
  REPO_ID = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
15
  FILENAME = "qwen2.5-coder-7b-instruct-q5_k_m.gguf"
 
16
  CONTEXT_SIZE = 8192
17
+ DEFAULT_MAX_TOKENS = 4096
18
 
19
  print(f"Loading model {REPO_ID}...")
20
  try:
21
  model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
 
 
22
  llm = Llama(
23
  model_path=model_path,
24
  n_ctx=CONTEXT_SIZE,
 
27
  verbose=True,
28
  )
29
  except Exception as e:
30
+ print(f"Critical Error: {e}")
31
  raise e
32
 
33
+ # --- API (FastAPI) ---
34
  app = FastAPI()
 
35
  app.add_middleware(
36
  CORSMiddleware,
37
  allow_origins=["*"],
 
48
  messages = data.get("messages", [])
49
  stream = data.get("stream", False)
50
  temperature = data.get("temperature", 0.4)
51
+ max_tokens = data.get("max_tokens", DEFAULT_MAX_TOKENS)
 
 
 
 
 
52
 
53
  output = llm.create_chat_completion(
54
  messages=messages,
 
60
  if stream:
61
 
62
  def iter_content():
63
+ for chunk in output:
64
+ yield f"data: {json.dumps(chunk)}\n\n"
65
+ yield "data: [DONE]\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
+ return StreamingResponse(iter_content(), media_type="text/event-stream")
68
 
69
+ return JSONResponse(content=output)
70
  except Exception as e:
 
71
  return JSONResponse(content={"error": str(e)}, status_code=500)
72
 
73
 
74
+ # --- ЛОГИКА ГЕНЕРАЦИИ ДЛЯ GRADIO ---
75
+ def generate_response(message, history, system_prompt, temperature, max_tokens):
76
+ # Формируем сообщения
77
+ messages = [{"role": "system", "content": system_prompt}]
 
 
 
78
 
79
+ # Берем последние 10 сообщений для экономии памяти
80
+ for user_msg, assistant_msg in history[-10:]:
81
+ messages.append({"role": "user", "content": user_msg})
82
+ messages.append({"role": "assistant", "content": assistant_msg})
83
 
 
 
 
84
  messages.append({"role": "user", "content": message})
85
 
86
  partial_text = ""
87
  try:
88
+ stream = llm.create_chat_completion(
89
  messages=messages,
90
+ max_tokens=int(max_tokens),
91
+ temperature=float(temperature),
92
  stream=True,
93
  )
94
 
95
+ for chunk in stream:
96
  delta = chunk["choices"][0]["delta"]
97
  if "content" in delta:
98
  partial_text += delta["content"]
 
100
 
101
  except Exception as e:
102
  traceback.print_exc()
103
+ yield partial_text + f"\n\n **Error:** {str(e)}"
104
+
105
+
106
+ # --- ИНТЕРФЕЙС (Gradio Blocks) ---
107
+
108
+ # CSS для увеличения высоты окна чата и улучшения шрифтов кода
109
+ custom_css = """
110
+ #chatbot {
111
+ height: 70vh !important;
112
+ overflow: auto;
113
+ }
114
+ """
115
+
116
+ # Используем тему Soft для более приятного визуала
117
+ theme = gr.themes.Soft(
118
+ primary_hue="blue", secondary_hue="slate", neutral_hue="slate", text_size="lg"
119
  )
120
 
121
+ with gr.Blocks(theme=theme, css=custom_css, title="Qwen Coder Pro") as demo:
122
+ gr.Markdown("# 💻 Qwen 2.5 Coder Assistant")
123
+
124
+ with gr.Row():
125
+ # Левая колонка - Настройки (20% ширины)
126
+ with gr.Column(scale=1, min_width=250):
127
+ gr.Markdown("### ⚙️ Settings")
128
+
129
+ system_prompt = gr.Textbox(
130
+ label="System Prompt",
131
+ value="You are an expert coding assistant. Write clean, efficient code and explain it clearly.",
132
+ lines=4,
133
+ interactive=True,
134
+ )
135
+
136
+ temperature = gr.Slider(
137
+ minimum=0.0,
138
+ maximum=1.0,
139
+ value=0.4,
140
+ step=0.1,
141
+ label="Creativity (Temperature)",
142
+ info="Lower = more precise code. Higher = more creative.",
143
+ )
144
+
145
+ max_tokens = gr.Slider(
146
+ minimum=512,
147
+ maximum=8192,
148
+ value=4096,
149
+ step=256,
150
+ label="Max Response Length",
151
+ info="Limit the length of the answer.",
152
+ )
153
+
154
+ gr.Markdown("---")
155
+ clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
156
+
157
+ # Правая колонка - Чат (80% ширины)
158
+ with gr.Column(scale=4):
159
+ chatbot = gr.Chatbot(
160
+ label="Conversation",
161
+ elem_id="chatbot",
162
+ show_copy_button=True, # Кнопка копирования кода
163
+ avatar_images=(
164
+ None,
165
+ "https://api.iconify.design/noto:robot.svg",
166
+ ), # Иконка бота
167
+ type="messages", # Новый формат сообщений Gradio
168
+ )
169
+
170
+ with gr.Row():
171
+ msg = gr.Textbox(
172
+ show_label=False,
173
+ placeholder="Type your code question here...",
174
+ scale=8,
175
+ container=False,
176
+ lines=2,
177
+ )
178
+ submit_btn = gr.Button("Run ➤", variant="primary", scale=1)
179
+
180
+ # --- СВЯЗКА СОБЫТИЙ ---
181
+
182
+ # Функция обертка для обработки истории в новом формате Gradio
183
def user_input(user_message, history):
    """Append the user's message to the chat history and clear the input box.

    Args:
        user_message: Text currently in the message textbox.
        history: Chat history as a list of ``{"role", "content"}`` dicts.
            An empty or ``None`` history is treated as a fresh conversation.

    Returns:
        A ``(textbox_value, history)`` tuple: an empty string to clear the
        textbox, and a new history list ending with the user's message.
    """
    # Copy instead of mutating the caller's list; ``history + [...]`` would
    # raise TypeError if the chatbot value ever arrives as None.
    turns = list(history) if history else []
    turns.append({"role": "user", "content": user_message})
    return "", turns
185
+
186
def bot_response(history, sys_p, temp, m_tok):
    """Stream the assistant's reply into the chat history.

    Builds the prompt from the system prompt plus the last 20 history
    entries, appends an empty assistant entry to ``history``, and fills it
    in as chunks arrive from ``llm.create_chat_completion``.

    Args:
        history: Chat history as a list of ``{"role", "content"}`` dicts.
            It is expected to already end with the latest user message
            (added by ``user_input``). The list is modified in place.
        sys_p: System prompt text.
        temp: Sampling temperature; converted with ``float``.
        m_tok: Maximum number of tokens to generate; converted with ``int``.

    Yields:
        The updated ``history`` list after each received content chunk, or
        once with an error note if generation fails.
    """
    # Only role/content are forwarded; any other keys on the history
    # entries are dropped before the messages reach the model.
    llama_messages = [{"role": "system", "content": sys_p}]
    llama_messages.extend(
        {"role": turn["role"], "content": turn["content"]}
        for turn in history[-20:]
    )

    partial_text = ""
    # Placeholder assistant entry that is overwritten as text streams in.
    history.append({"role": "assistant", "content": ""})

    try:
        stream = llm.create_chat_completion(
            messages=llama_messages,
            max_tokens=int(m_tok),
            temperature=float(temp),
            stream=True,
        )

        for chunk in stream:
            delta = chunk["choices"][0]["delta"]
            if "content" in delta:
                partial_text += delta["content"]
                # Update the trailing assistant entry with the text so far.
                history[-1]["content"] = partial_text
                yield history

    except Exception as e:
        # Local import keeps this block self-contained.
        import traceback

        traceback.print_exc()
        error_note = f"Error: {str(e)}"
        # Keep whatever was already streamed instead of replacing it with
        # the error message.
        if partial_text:
            history[-1]["content"] = f"{partial_text}\n\n{error_note}"
        else:
            history[-1]["content"] = error_note
        yield history
225
+
226
+ # Отправка по Enter или кнопке
227
+ msg.submit(user_input, [msg, chatbot], [msg, chatbot], queue=False).then(
228
+ bot_response, [chatbot, system_prompt, temperature, max_tokens], chatbot
229
+ )
230
+
231
+ submit_btn.click(user_input, [msg, chatbot], [msg, chatbot], queue=False).then(
232
+ bot_response, [chatbot, system_prompt, temperature, max_tokens], chatbot
233
+ )
234
+
235
+ # Очистка
236
+ clear_btn.click(lambda: [], None, chatbot, queue=False)
237
+
238
  app = mount_gradio_app(app, demo, path="/")
239
 
240
  if __name__ == "__main__":