# File size: 2,230 Bytes
# c85f9bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Module-level state shared by the functions in this file.
# _model and _tokenizer start as None and are assigned on first use, so
# importing this module does not load the model weights.
_model = None
_tokenizer = None
# Checkpoint identifier passed to from_pretrained() for both tokenizer and model.
_model_name = "microsoft/DialoGPT-small"  # Using a smaller, faster model for testing

def initialize_tokenizer():
    """Return the shared tokenizer, loading it on the first call.

    The tokenizer is cached in the module-level ``_tokenizer``; later calls
    return that same object without loading again. When the loaded tokenizer
    has no pad token, its EOS token is assigned as the pad token.
    """
    global _tokenizer

    # Fast path: an earlier call already loaded the tokenizer.
    if _tokenizer is not None:
        return _tokenizer

    print("[MinimalService] Loading tokenizer...")
    _tokenizer = tok = AutoTokenizer.from_pretrained(_model_name)
    if tok.pad_token is None:
        # Fall back to EOS so a pad token is always defined.
        tok.pad_token = tok.eos_token
    print("[MinimalService] Tokenizer loaded successfully.")
    return tok

@spaces.GPU
def generate_text_gpu(prompt: str, max_tokens: int = 50):
    """Generate sampled text from ``prompt`` with the causal language model.

    Decorated with ``spaces.GPU``. The tokenizer and the model are loaded on
    first use and cached in the module-level ``_tokenizer`` / ``_model``.

    Args:
        prompt: Text that is tokenized and fed to the model.
        max_tokens: Forwarded to ``generate`` as ``max_new_tokens``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens skipped.
        NOTE(review): for decoder-only models ``generate`` output normally
        includes the prompt tokens, so the returned text presumably starts
        with the prompt -- confirm this is what callers expect.
    """
    global _model

    print("[MinimalService] GPU function called")

    # initialize_tokenizer() returns the cached tokenizer after the first
    # load, so it is safe to call unconditionally.
    tokenizer = initialize_tokenizer()

    # Load model in GPU context
    if _model is None:
        print("[MinimalService] Loading model...")
        _model = AutoModelForCausalLM.from_pretrained(
            _model_name,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        print("[MinimalService] Model loaded.")

    # Tokenize through __call__ (not encode) so an attention mask is produced.
    # pad_token_id equals eos_token_id below, which prevents generate() from
    # inferring the mask reliably, so it is passed explicitly.
    encoded = tokenizer(prompt, return_tensors="pt")
    device = next(_model.parameters()).device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    with torch.no_grad():
        outputs = _model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_tokens,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

class MinimalService:
    """Thin object wrapper around the module-level text generation function."""

    def __init__(self):
        """Log start-up and load the tokenizer immediately."""
        print("[MinimalService] Service initialized")
        # Initialize tokenizer immediately
        initialize_tokenizer()

    def generate(self, prompt: str, max_tokens: int = 50):
        """Generate text for ``prompt`` by delegating to ``generate_text_gpu``.

        Args:
            prompt: Text to continue.
            max_tokens: Forwarded to ``generate_text_gpu``. The default of 50
                matches that function's own default, so existing callers
                that pass only ``prompt`` behave as before.

        Returns:
            Whatever ``generate_text_gpu`` returns for these arguments.
        """
        return generate_text_gpu(prompt, max_tokens=max_tokens)

# Create the module-level service instance at import time.
# Constructing it prints a start-up message and loads the tokenizer.
service = MinimalService()

# Print the name of the GPU-decorated function as a start-up confirmation.
# This only reads __name__; it does not call the function or load the model.
print(f"[MinimalService] GPU function available: {generate_text_gpu.__name__}")