import os
import gradio as gr
from ctransformers import AutoModelForCausalLM

model_repo = os.getenv('HF_MODEL_REPO')
model_bin = os.getenv('HF_MODEL_BIN')

# Load the quantized model on CPU; lib="avx2" selects the AVX2 build of ctransformers.
llm = AutoModelForCausalLM.from_pretrained(
    model_repo,
    model_file=model_bin,
    threads=2,
    seed=42,
    context_length=16384,
    lib="avx2",
)

def response(prompt):
    # Generate a completion; stop on common end-of-sequence / end-of-turn tokens.
    txt = llm(prompt, max_new_tokens=8192, temperature=0.8, top_p=0.5,
              repetition_penalty=1.1, reset=False, stop=["</s>", "<|im_end|>"])
    return txt

if __name__ == '__main__':
    title = "Chat"
    demo_status = "Demo is running on CPU"
    gr.Interface(response, inputs="text", outputs="text",
                 title=title,
                 description=demo_status,  # show the CPU notice under the title
                 ).launch()
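
# Usage note: the script reads the model location from two environment variables,
# HF_MODEL_REPO (a Hugging Face repo that contains a GGUF/GGML weight file) and
# HF_MODEL_BIN (the weight file's name inside that repo). With both set, running
# the script starts the Gradio demo locally (on http://127.0.0.1:7860 by default).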