Spaces:

The code below loads the fine-tuned model and exposes it through a FastAPI endpoint:
```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from fastapi import FastAPI

app = FastAPI()

# Load the base model in 4-bit NF4 quantization so it fits on a small GPU
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)
base_model = AutoModelForCausalLM.from_pretrained(
    "heliosbrahma/falcon-7b-sharded-bf16-finetuned-mental-health-conversational",
    trust_remote_code=True,
    quantization_config=quant_config,
    device_map="auto"
)

# Attach the fine-tuned adapter and its tokenizer
repo_name = "your-username/falcon-7b-mental-health-finetuned"  # Your repo from Step 1
model = PeftModel.from_pretrained(base_model, repo_name)
tokenizer = AutoTokenizer.from_pretrained(repo_name)
print("Model loaded!")

# Generation function
def generate_response(prompt, max_new_tokens=200, temperature=0.7):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False).to(device)
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,  # budget for generated tokens only, excluding the prompt
        temperature=temperature,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3
    )
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The model echoes the prompt back; strip it so only the reply is returned
    if prompt.lower() in full_response.lower():
        response_start = full_response.lower().find(prompt.lower()) + len(prompt)
        return full_response[response_start:].strip()
    return full_response.strip()

# API endpoint
@app.post("/chat")
async def chat(prompt: str):
    response = generate_response(prompt)
    return {"response": response}
```