from flask import Flask, request, Response
import logging
from llama_cpp import Llama
import threading
from huggingface_hub import snapshot_download, Repository
import huggingface_hub
import gc
import os.path
from datetime import datetime
import xml.etree.ElementTree as ET
|
|
|
|
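# Saiga-style prompt format: a Russian-language system prompt plus the special
# token ids used to mark roles and line breaks when building chat prompts.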
SYSTEM_PROMPT = "Ты — русскоязычный автоматический ассистент. Ты максимально точно и отвечаешь на запросы пользователя, используя русский язык."
SYSTEM_TOKEN = 1788
USER_TOKEN = 1404
BOT_TOKEN = 9225
LINEBREAK_TOKEN = 13
|
|
ROLE_TOKENS = {
    "user": USER_TOKEN,
    "bot": BOT_TOKEN,
    "system": SYSTEM_TOKEN
}
|
|
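# Inference settings: context window size and how many layers to offload to the GPU.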
CONTEXT_SIZE = 4001
ENABLE_GPU = True
GPU_LAYERS = 70
|
|
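# Only one generation may run at a time; the lock guards the shared model instance.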
lock = threading.Lock()
|
|
app = Flask(__name__)
app.logger.setLevel(logging.DEBUG)
|
|
|
|
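# GGUF model to fetch from the Hugging Face Hub.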
repo_name = "TheBloke/Llama-2-70B-Chat-GGUF"
model_name = "llama-2-70b-chat.Q4_K_M.gguf"
|
|
local_dir = '.'

if os.path.isdir('/data'):
    app.logger.info('Persistent storage enabled')
|
|
model = None
|
|
model_path = snapshot_download(repo_id=repo_name, allow_patterns=model_name) + '/' + model_name
app.logger.info('Model path: ' + model_path)
|
|
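# Dataset repository where request/response pairs are logged as XML rows.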
DATASET_REPO_URL = "https://huggingface.co/datasets/muryshev/saiga-chat"
DATA_FILENAME = "llama-2-70b-q4-k-m.xml"
DATA_FILE = os.path.join("dataset", DATA_FILENAME)
|
|
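# Token used to authenticate against the Hub (needed to push to the dataset repo).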
HF_TOKEN = os.environ.get("HF_TOKEN")
app.logger.info("hfh: " + huggingface_hub.__version__)
|
|
repo = Repository(
    local_dir="dataset", clone_from=DATASET_REPO_URL, use_auth_token=HF_TOKEN
)
|
|
|
|
|
|
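# Append a <row> element with the request/response pair to the local XML file
# and push the commit to the dataset repository.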
def log(req: str = '', resp: str = ''):
    if req or resp:
        element = ET.Element("row", {"time": str(datetime.now())})
        req_element = ET.SubElement(element, "request")
        req_element.text = req
        resp_element = ET.SubElement(element, "response")
        resp_element.text = resp

        with open(DATA_FILE, "ab+") as xml_file:
            xml_file.write(ET.tostring(element, encoding="utf-8"))

        commit_url = repo.push_to_hub()
        app.logger.info(commit_url)
|
|
|
|
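# (Re)initialize the llama.cpp model, releasing any previously loaded instance first.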
def init_model(context_size, enable_gpu=False, gpu_layer_number=35):
    global model

    if model is not None:
        del model
        gc.collect()

    # Same parameters for CPU and GPU; GPU offload only adds n_gpu_layers.
    kwargs = dict(
        model_path=model_path,
        n_ctx=context_size,
        n_parts=1,
        logits_all=True,
        verbose=True,
        n_gqa=8,
    )
    if enable_gpu:
        kwargs["n_gpu_layers"] = gpu_layer_number

    model = Llama(**kwargs)
    return model
|
|
init_model(CONTEXT_SIZE, ENABLE_GPU, GPU_LAYERS)
|
|
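# Token layout of a single chat message: BOS, role token, line break, content tokens, EOS.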
def get_message_tokens(model, role, content):
    message_tokens = model.tokenize(content.encode("utf-8"))
    message_tokens.insert(1, ROLE_TOKENS[role])
    message_tokens.insert(2, LINEBREAK_TOKEN)
    message_tokens.append(model.token_eos())
    return message_tokens
|
|
def get_system_tokens(model):
    system_message = {
        "role": "system",
        "content": SYSTEM_PROMPT
    }
    return get_message_tokens(model, **system_message)
|
|
def get_system_tokens_for_preprompt(model, preprompt):
    system_message = {
        "role": "system",
        "content": preprompt
    }
    return get_message_tokens(model, **system_message)
|
|
|
|
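# Global flag polled inside generate_tokens; any request can set it to abort the current stream.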
stop_generation = False
|
|
def generate_tokens(model, generator):
    global stop_generation
    app.logger.info('generate_tokens started')
    with lock:
        try:
            for token in generator:
                if token == model.token_eos() or stop_generation:
                    stop_generation = False
                    app.logger.info('End generating')
                    yield b''
                    break

                token_str = model.detokenize([token])
                yield token_str
        except Exception as e:
            app.logger.info('generator exception')
            app.logger.info(e)
            yield b''
|
|
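# GET /change_context_size?size=N sets the stop flag and reloads the model
# with the requested context window.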
@app.route('/change_context_size', methods=['GET'])
def handler_change_context_size():
    global stop_generation, model
    stop_generation = True

    new_size = int(request.args.get('size', CONTEXT_SIZE))
    init_model(new_size, ENABLE_GPU, GPU_LAYERS)

    return Response('Size changed', content_type='text/plain')


@app.route('/stop_generation', methods=['GET'])
def handler_stop_generation():
    global stop_generation
    stop_generation = True
    return Response('Stopped', content_type='text/plain')


@app.route('/', methods=['GET', 'PUT', 'DELETE', 'PATCH'])
def generate_unknown_response():
    app.logger.info('unknown method: ' + request.method)
    try:
        request_payload = request.get_json()
        app.logger.info('payload: ' + str(request_payload))
    except Exception:
        app.logger.info('payload empty')

    return Response('What do you want?', content_type='text/plain')
|
|
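# Buffer holding the tokens of the response currently being streamed, used for logging.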
response_tokens = bytearray()


def generate_and_log_tokens(user_request, model, generator):
    global response_tokens
    for token in generate_tokens(model, generator):
        if token == b'':
            # End of stream: log the request/response pair and reset the buffer.
            log(user_request.decode("utf-8", errors="ignore"),
                response_tokens.decode("utf-8", errors="ignore"))
            response_tokens = bytearray()
        else:
            response_tokens.extend(token)
        yield token

@app.route('/', methods=['POST'])
def generate_response():
    global stop_generation
    raw_content = request.data
    tokens = model.tokenize(raw_content)
    generator = model.generate(
        tokens[:CONTEXT_SIZE]
    )
    app.logger.info('Generator created')

    return Response(generate_and_log_tokens(raw_content, model, generator), content_type='text/plain', status=200, direct_passthrough=True)
|
|
|
|
|
|
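# Example client call (hypothetical prompt; the raw request body is tokenized as-is):
#   curl -X POST --data-binary 'prompt text' http://localhost:7860/
# The generated text is streamed back as plain text; GET /stop_generation aborts a stream.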
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860, debug=False, threaded=False)