| import gradio as gr |
|
|
| import torch |
| import subprocess |
| import importlib.util |
|
|
# Install llama-cpp-python at runtime if it is not already importable.
# The CMAKE env vars make pip build the wheel with OpenBLAS acceleration.
if importlib.util.find_spec("llama_cpp") is None:
    import os
    import sys

    os.environ["CMAKE_ARGS"] = "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
    os.environ["FORCE_CMAKE"] = "1"
    # Use an argument list (shell=False) with the running interpreter's pip,
    # and check=True so a failed build raises here instead of surfacing as a
    # confusing ImportError on the line below.
    subprocess.run(
        [sys.executable, "-m", "pip", "install",
         "llama-cpp-python==0.3.15", "--no-cache-dir"],
        check=True,
    )
from llama_cpp import Llama
|
|
# Download the quantized Phi-4-mini GGUF model from the Hugging Face Hub
# (cached after the first run) and load it for CPU inference.
llm = Llama.from_pretrained(
    repo_id="Mungert/Phi-4-mini-instruct.gguf",
    filename="phi-4-mini-iq3_m.gguf",  # IQ3_M quantization of the model
    n_threads=4,  # CPU threads used for generation
    n_batch=512,  # prompt tokens processed per batch
)
|
|
def main(prompt):
    """Generate a short completion for *prompt* with the loaded GGUF model.

    The parameter was renamed from ``input`` to avoid shadowing the builtin;
    Gradio calls ``fn`` positionally, so callers are unaffected.

    Args:
        prompt: Raw text sent to the model as-is (no chat template applied).

    Returns:
        The generated text of the first completion choice.
    """
    # max_tokens=48 keeps replies short; raise it if responses look truncated.
    output = llm(
        prompt,
        max_tokens=48,
        temperature=0.7,
    )
    return output["choices"][0]["text"]
|
|
# Single-turn UI: one prompt textbox in, one response textbox out.
demo = gr.Interface(
    fn=main,
    inputs=gr.Textbox(lines=4, label="Prompt"),
    outputs=gr.Textbox(lines=10, label="Response"),
)
|
|
demo.launch(
    show_error=True,  # surface Python tracebacks in the web UI
    # NOTE(review): max_threads=1 presumably serializes requests so concurrent
    # users never touch the single llama.cpp context at once — confirm intent.
    max_threads=1,
)
|
|
|
|