import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from fastapi import FastAPI

app = FastAPI()

# Load model
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

base_model = AutoModelForCausalLM.from_pretrained(
    "heliosbrahma/falcon-7b-sharded-bf16-finetuned-mental-health-conversational",
    trust_remote_code=True,
    quantization_config=quant_config,
    device_map="auto"
)

repo_name = "your-username/falcon-7b-mental-health-finetuned"  # Your repo from Step 1
model = PeftModel.from_pretrained(base_model, repo_name)
tokenizer = AutoTokenizer.from_pretrained(repo_name)

print("Model loaded!")

# Generation function
def generate_response(prompt, max_length=200, temperature=0.7):
    inputs = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False).to(
        "cuda" if torch.cuda.is_available() else "cpu"
    )
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # Allow max_length new tokens on top of the prompt length
        max_length=max_length + len(inputs["input_ids"][0]),
        temperature=temperature,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3
    )
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The decoded output echoes the prompt; strip it so only the generated reply is returned
    if prompt.lower() in full_response.lower():
        response_start = full_response.lower().find(prompt.lower()) + len(prompt)
        return full_response[response_start:].strip()
    return full_response.strip()

# API endpoint
@app.post("/chat")
async def chat(prompt: str):
    response = generate_response(prompt)
    return {"response": response}
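
# --- Usage sketch (added illustration; assumes uvicorn is installed and the service
# listens on localhost:8000, neither of which appears in the original code) ---
# Because `chat` declares a plain `str` parameter, FastAPI reads the prompt from the
# query string rather than the request body, so a request looks like:
#   curl -X POST "http://localhost:8000/chat?prompt=I%20have%20been%20feeling%20anxious"
if __name__ == "__main__":
    import uvicorn
    # Serve the app directly when this file is executed as a script.
    uvicorn.run(app, host="0.0.0.0", port=8000)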