# Spaces: Sleeping
import random

import torch
from quart import Quart, request
from transformers import AutoTokenizer, AutoModelForCausalLM
# ASGI application object for this service.
app = Quart(__name__)

# Load the tokenizer and the causal language model once at import time so
# every request reuses the same instances.
tokenizer = AutoTokenizer.from_pretrained(
    "OpenBuddy/openbuddy-openllama-3b-v10-bf16"
)
model = AutoModelForCausalLM.from_pretrained(
    "OpenBuddy/openbuddy-openllama-3b-v10-bf16"
)
# Inference only: switch the model to evaluation mode.
model.eval()

# Text that is prepended to every user message before generation.
with open("system.prompt", mode="r", encoding="utf-8") as prompt_file:
    prompt = prompt_file.read()
def _resolve_max_new_tokens(requested):
    """Return the number of new tokens to generate for one request.

    A positive integer supplied by the client is honored up to a cap of 500.
    Anything else (missing, zero, negative, non-integer) falls back to a
    random length between 200 and 500, which is the service's default.
    """
    is_positive_int = (
        isinstance(requested, int)
        and not isinstance(requested, bool)
        and requested > 0
    )
    if is_positive_int:
        return min(requested, 500)
    return random.randint(200, 500)


async def echo():
    """Generate an assistant reply for the message sent in the JSON body.

    The request body is expected to be a JSON object with:
        request (str): the user's message (required).
        max_tokens (int, optional): generation length, capped at 500.

    Returns:
        ``{"output": <generated text>}`` on success, or
        ``({"error": <message>}, 400)`` when the body is not a JSON object
        or does not contain a string ``request`` field.
    """
    # NOTE(review): no route decorator is visible on this handler — confirm
    # that it is registered with `app` somewhere.
    data = await request.get_json(silent=True)
    if not isinstance(data, dict):
        return {"error": "expected a JSON object body"}, 400

    user_message = data.get("request")
    if not isinstance(user_message, str):
        return {"error": "'request' must be a string"}, 400

    full_prompt = prompt + "\n\nUser: " + user_message + "\nAssistant: "
    input_ids = tokenizer.encode(full_prompt, return_tensors='pt')

    generation_kwargs = {
        "input_ids": input_ids,
        "do_sample": random.choice([True, False]),
        "max_new_tokens": _resolve_max_new_tokens(data.get("max_tokens")),
        "eos_token_id": tokenizer.eos_token_id,
    }
    # Temperature only has an effect when sampling, so pass it only then.
    if generation_kwargs["do_sample"]:
        generation_kwargs["temperature"] = random.randint(7, 20) / 10.0

    # NOTE(review): generate() is a blocking call inside an async handler;
    # consider offloading it if concurrent requests matter.
    with torch.no_grad():
        output_ids = model.generate(**generation_kwargs)

    # For a causal LM the returned sequence starts with the prompt tokens.
    # Drop them so only the newly generated reply is returned (this is what
    # the pipeline-only `return_full_text=False` option was meant to do).
    reply_ids = output_ids[0][input_ids.shape[-1]:]
    output = tokenizer.decode(reply_ids, skip_special_tokens=True)
    return {"output": output}
async def get():
    """Return a fixed plain-text notice advising callers to self-host."""
    notice = "better to run it on own container"
    return notice