import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Use a lighter model for CPU-only inference
# model_name = "microsoft/phi-2"  # 2.7B parameters - too heavy for CPU
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # 1.1B parameters - much lighter

try:
    print(f"Loading {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token  # Llama-family tokenizers ship without a pad token
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # Full precision; float16 is slow or unsupported on many CPUs
        device_map="cpu",
        low_cpu_mem_usage=True,  # Critical for CPU: avoids holding two copies of the weights
    )
    print("Model loaded successfully")
except Exception as e:
    print(f"Failed to load model: {e}")
    # Fall back to a dummy responder so the UI can still be tested
    model, tokenizer = None, None


def generate_response(message):
    """Process user input and generate a response."""
    if not message.strip():
        return "Please enter a question."
    if model is None or tokenizer is None:
        return f"Model not loaded. Testing UI with: {message}"
    try:
        # Format the prompt with TinyLlama's chat-template markers
        prompt = f"<|user|>\n{message}\n<|assistant|>\n"
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=384)
        # Cap generation length to keep CPU inference responsive
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,  # Explicit mask is needed since pad_token == eos_token
                max_new_tokens=600,
                temperature=0.8,
                do_sample=True,
                top_p=0.9,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        # Decode only the newly generated tokens, skipping the echoed prompt
        response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
        return response.strip()
    except Exception as e:
        # Surface a truncated error message in the UI instead of crashing
        return f"Error: {str(e)[:100]}"

# Create the Gradio interface
interface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Input", placeholder="Enter programming question...", lines=3),
    outputs=gr.Textbox(label="Output", lines=10),
    title="LiveCoder API",
    description="LLM programming assistant",
    allow_flagging="never",
)

# API endpoint info
USERNAME = "sarekuwa"
SPACE_NAME = "livecoder"
print(f"API Endpoint: https://{USERNAME}-{SPACE_NAME}.hf.space/api/predict")
# CRITICAL: Enable queue for request processing
interface.queue(default_concurrency_limit=1)

# Launch the application
interface.launch(
    server_name="0.0.0.0",  # Bind to all interfaces so the Space container can route traffic
    server_port=7860,       # Default port expected by Hugging Face Spaces
    share=False,
    debug=True,
)