import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# --- Configuration ---
# Update with your model's repo ID and file name
MODEL_REPO = "Kezovic/iris-f16gguf-test"
MODEL_FILE = "llama-3.2-1b-instruct.F16.gguf"
# Adjust the context window and generation parameters as needed
CONTEXT_WINDOW = 4096
MAX_NEW_TOKENS = 512
TEMPERATURE = 1.5  # high temperature; min_p sampling in generate() filters the tail
MIN_P = 0.1  # min-p cutoff, applied at generation time (not a Llama() constructor arg)

# --- Model Loading Function ---
def load_llm():
    """Downloads the GGUF model and initializes LlamaCPP."""
    print("Downloading model...")
    model_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILE
    )
    
    # Initialize the LLM from the downloaded model path.
    # n_ctx is the context window size; n_threads=2 matches the two vCPUs
    # of the free Hugging Face CPU tier. Note that min_p is a sampling
    # parameter, so it belongs in the generation call, not the constructor.
    llm = Llama(
        model_path=model_path,
        n_ctx=CONTEXT_WINDOW,
        n_threads=2,
        verbose=False,  # Set to True for debugging
    )
    print("Model loaded successfully!")
    return llm
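
# Note: hf_hub_download caches the file (by default under ~/.cache/huggingface),
# so repeated loads on the same machine reuse the existing download rather than
# re-fetching the multi-gigabyte F16 weights.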

# Load the model only once when the Space starts
llm = load_llm()

# --- Inference Function ---
def generate(prompt, history):
    """Generates a response using the Llama model."""
    # A generic prompt template; Llama 3.2 Instruct has its own chat format,
    # which the alternative sketch below applies via create_chat_completion.
    # Note that history is ignored here, so each turn is answered in isolation.
    full_prompt = f"### Human: {prompt}\n### Assistant:"

    output = llm(
        prompt=full_prompt,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        min_p=MIN_P,
        stop=["### Human:"],  # Stop generation at the next user turn
        echo=False
    )
    
    # Extract the text from the response object
    response_text = output['choices'][0]['text'].strip()
    return response_text
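
# --- Alternative sketches (not wired into the UI below) ---
# Llama 3.2 Instruct GGUFs embed their own chat template; llama-cpp-python
# can apply it via create_chat_completion instead of the hand-rolled
# "### Human:" template above. A hedged sketch:
def generate_with_chat_template(prompt, history):
    """Single-turn variant that lets the GGUF's chat template do the formatting."""
    result = llm.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        min_p=MIN_P,
    )
    return result["choices"][0]["message"]["content"].strip()

# gr.ChatInterface also accepts a generator function: yielding progressively
# longer strings streams tokens to the UI as they arrive. A sketch using the
# same prompt format as generate():
def generate_stream(prompt, history):
    """Streaming variant; pass this instead of generate to gr.ChatInterface."""
    full_prompt = f"### Human: {prompt}\n### Assistant:"
    partial = ""
    for chunk in llm(
        prompt=full_prompt,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        min_p=MIN_P,
        stop=["### Human:"],
        stream=True,
    ):
        partial += chunk["choices"][0]["text"]
        yield partial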

# --- Gradio Interface ---
# Use the ChatInterface for a quick, functional chat UI
gr.ChatInterface(
    generate,
    title=f"Chat with {MODEL_FILE}",
    description="A GGUF LLM hosted on a Hugging Face CPU Space using llama-cpp-python."
).launch()
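
# --- Deployment note ---
# A Hugging Face Space running this file as app.py also needs a
# requirements.txt alongside it. A minimal, unpinned sketch covering the
# imports above:
#
#   gradio
#   llama-cpp-python
#   huggingface_hub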