import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Use lighter model for CPU
#model_name = "microsoft/phi-2"  # 2.7B - TOO HEAVY
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # 1.1B - much lighter

try:
    print(f"Loading {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
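    # TinyLlama's Llama-style tokenizer ships without a pad token, so reuse EOS for padding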
    tokenizer.pad_token = tokenizer.eos_token
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
        device_map="cpu",
        low_cpu_mem_usage=True  # Load weights incrementally to keep peak RAM low on CPU
    )
    print("Model loaded successfully")
    
except Exception as e:
    print(f"Failed to load model: {e}")
    # Fallback to dummy function
    model, tokenizer = None, None

def generate_response(message):
    """Process user input and generate response"""
    if not message.strip():
        return "Please enter a question."
    
    if model is None or tokenizer is None:
        return f"Model not loaded. Testing UI with: {message}"
    
    try:
        # Format for chat model (TinyLlama-Chat uses a Zephyr-style template; close the user turn with </s>)
        prompt = f"<|user|>\n{message}</s>\n<|assistant|>\n"
        
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=384)
        
        # Generate with lower token count for CPU
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,  # FIX: Add attention mask
                max_new_tokens=600,  # Reduced for CPU
                temperature=0.8,
                do_sample=True,
                top_p=0.9,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
        
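        # Slice off the prompt tokens so only the newly generated text is decoded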
        response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
        return response.strip()
    
    except Exception as e:
        return f"Error: {str(e)[:100]}"

# Create interface
interface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Input", placeholder="Enter programming question...", lines=3),
    outputs=gr.Textbox(label="Output", lines=10),
    title="LiveCoder API",
    description="LLM programming assistant",
    allow_flagging="never"
)

# API endpoint info
USERNAME = "sarekuwa"
SPACE_NAME = "livecoder"
print(f"API Endpoint: https://{USERNAME}-{SPACE_NAME}.hf.space/api/predict")

# CRITICAL: Enable queue for request processing
interface.queue(default_concurrency_limit=1)

# Launch application
interface.launch(
    server_name="0.0.0.0", 
    server_port=7860, 
    share=False,
    debug=True
)
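
# ---------------------------------------------------------------------------
# Example client call (a sketch for reference only; not executed by this app).
# It assumes the gradio_client package and the default "/predict" endpoint that
# a single-function gr.Interface exposes; adjust if the Space differs.
#
#   from gradio_client import Client
#
#   client = Client("sarekuwa/livecoder")  # USERNAME/SPACE_NAME from above
#   answer = client.predict(
#       "Write a Python function that reverses a string.",
#       api_name="/predict",
#   )
#   print(answer)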