import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

model = None
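
# The Llama instance is cached in the module-level `model` so it is
# created only once: eagerly at startup below, or lazily on the first
# request if startup loading failed.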
def load_model():
    global model
    try:
        print("Starting model download from the Hub...")
        model_path = hf_hub_download(
            repo_id="AugustLight/LLight-3.2-3B-Instruct",
            filename="Llight.Q8_0.gguf",
            repo_type="model"
        )
        print(f"Model downloaded to: {model_path}")
        model = Llama(
            model_path=model_path,
            n_ctx=2048,
            n_threads=4,
            n_batch=512
        )
        print("Model initialized successfully!")
        return model
    except Exception as e:
        print(f"Detailed error while loading the model: {str(e)}")
        raise
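
# respond() flattens the system message and chat history into a plain
# "User: ... / Assistant: ..." prompt, then streams the completion,
# yielding the accumulated text after every chunk so the UI can update live.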
def respond(message, history, system_message, max_new_tokens, temperature, top_p):
    global model
    try:
        if model is None:
            model = load_model()
        context = f"{system_message}\n\n"
        for user_msg, assistant_msg in history:
            context += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
        context += f"User: {message}\nAssistant: "
        print(f"Generating a response for a context of {len(context)} characters")
        response_text = ""
        # Use streaming generation
        for response in model(
            prompt=context,
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            stop=["User:", "\n\n", "<|endoftext|>"],
            echo=False,
            stream=True
        ):
            chunk = response['choices'][0]['text']
            response_text += chunk
            print(f"Intermediate chunk: {chunk}")
            yield response_text  # yield the accumulated text so far
        print("Response fully generated.")
    except Exception as e:
        error_msg = f"An error occurred: {str(e)}"
        print(error_msg)
        yield error_msg
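
# Gradio UI: a chatbot pane, a message box, and the generation
# parameters tucked into a collapsible accordion.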
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Message")
    with gr.Accordion("Parameters", open=False):
        system = gr.Textbox(
            value="You are a friendly and helpful assistant. Answer thoughtfully and to the point.",
            label="System message"
        )
        max_new_tokens = gr.Slider(
            minimum=1,
            maximum=2048,
            value=512,
            step=1,
            label="Max new tokens"
        )
        temperature = gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.3,
            step=0.1,
            label="Temperature"
        )
        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)"
        )
    clear = gr.Button("Clear")
    def user(user_message, history):
        return "", history + [[user_message, None]]

    def bot(history, system_message, max_new_tokens, temperature, top_p):
        message = history[-1][0]
        for response_text in respond(message, history[:-1], system_message, max_new_tokens, temperature, top_p):
            history[-1][1] = response_text
            yield history
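
    # Submitting a message runs two chained steps: user() appends the
    # message immediately (queue=False), then bot() streams the reply by
    # yielding the updated history as each chunk arrives.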
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, [chatbot, system, max_new_tokens, temperature, top_p], chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)
if __name__ == "__main__":
    try:
        print("Initializing the application...")
        model = load_model()
        print("Model loaded successfully at startup")
    except Exception as e:
        print(f"Error during initialization: {str(e)}")
    demo.launch()
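
# Assumed dependencies (not shown in this file): for the imports above to
# resolve, the Space's requirements.txt would need gradio,
# huggingface_hub, and llama-cpp-python.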