import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

# 1. Configuration
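# MODEL_ID is the Hugging Face Hub repo to load. Any chat-tuned causal-LM repo id
# with a chat template should work here; the value below is simply this app's default.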
MODEL_ID = "ConceptModels/Concept-7b-V1-Full"

# 2. Load Model and Tokenizer (Done once at startup)
print(f"Loading {MODEL_ID}... this may take a while.")

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    
    # Attempt to use GPU if available, otherwise CPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Running on device: {device}")

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto" if device == "cuda" else None,
        # Uncomment the line below to use 4-bit quantization (requires `pip install bitsandbytes`).
        # Newer transformers versions prefer a BitsAndBytesConfig instead; see the sketch after this block.
        # load_in_4bit=True
    )
    # Without a device_map the model already loads on CPU, so this is a no-op kept for explicitness
    if device == "cpu":
        model.to("cpu")
        
    print("Model loaded successfully.")

except Exception as e:
    print(f"Error loading model: {e}")
    raise
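
# Alternative loading path (a minimal sketch, not enabled by default): on newer
# transformers versions the preferred way to request 4-bit quantization is a
# BitsAndBytesConfig rather than the bare load_in_4bit flag. It still requires
# `pip install bitsandbytes` and a CUDA GPU.
#
#   from transformers import BitsAndBytesConfig
#   quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
#   model = AutoModelForCausalLM.from_pretrained(
#       MODEL_ID,
#       quantization_config=quant_config,
#       device_map="auto",
#   )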

def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    hf_token=None,  # Unused when running locally (CLI login suffices); kept for signature compatibility
):
    # 3. Format the conversation
    # We construct the list of messages including system, history, and current input
    messages = [{"role": "system", "content": system_message}]
    messages.extend(history)
    messages.append({"role": "user", "content": message})
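
    # With type="messages" in the ChatInterface below, `history` already arrives as
    # OpenAI-style {"role": ..., "content": ...} dicts, so it can be passed to the
    # chat template as-is.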

    # Apply the model's specific chat template
    input_ids = tokenizer.apply_chat_template(
        messages, 
        return_tensors="pt", 
        add_generation_prompt=True
    ).to(model.device)

    # 4. Setup Streaming
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
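    # timeout=10.0 bounds how long each read from the streamer waits for the next
    # chunk; very slow (e.g. CPU-only) generation may need a larger value here.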

    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )
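    # Note: some checkpoints ship without a pad token, which makes generate() emit a
    # warning. A common workaround is adding pad_token_id=tokenizer.eos_token_id to
    # the kwargs above if that warning appears.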

    # 5. Run generation in a separate thread so we can yield tokens
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    # 6. Yield the cumulative response as decoded text chunks arrive from the streamer
    partial_message = ""
    for new_text in streamer:
        partial_message += new_text
        yield partial_message
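
# Optional smoke test (a sketch, not wired into the app): respond() can be driven
# directly to check streaming outside Gradio. The prompt below is just a placeholder.
#
#   final_text = ""
#   for partial in respond("Write a hello-world in Python.", [], "You are Concept.", 64, 0.7, 0.95):
#       final_text = partial  # each yield is the cumulative text so far
#   print(final_text)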

# 7. Gradio Interface
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[
        gr.Textbox(value="You are an AI called Concept. You are built for programming in any language.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
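
# ChatInterface renders the additional_inputs above in a collapsible section beneath
# the chat box, so the system message and sampling settings can be tweaked at runtime.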

with gr.Blocks() as demo:
    # Removed LoginButton because local execution usually relies on environment login
    # or public models.
    chatbot.render()

if __name__ == "__main__":
    demo.launch()
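    # Optional launch tweaks (a sketch, none required by this app): standard Gradio
    # arguments for exposing the server on the LAN or creating a temporary public link.
    #   demo.launch(server_name="0.0.0.0", server_port=7860, share=True)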