from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, LlamaForCausalLM
import torch

app = FastAPI(title="My 500M AI API")

# Pointing to the Model you built!
REPO_NAME = "Sdey10/My-500M-Mini-TUF"

print("Downloading Model from Hugging Face...")
# We fetch the public model without hardcoding your secret token
tokenizer = AutoTokenizer.from_pretrained(REPO_NAME)
model = LlamaForCausalLM.from_pretrained(REPO_NAME)

# Free Hugging Face Spaces run on CPUs
model.to("cpu")
model.eval()


class PromptRequest(BaseModel):
    prompt: str
    max_tokens: int = 50


@app.post("/generate")
def generate_text(request: PromptRequest):
    inputs = tokenizer(request.prompt, return_tensors="pt").to("cpu")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_tokens,
            temperature=0.7,
            do_sample=True,
            repetition_penalty=1.2,
        )
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": response_text}
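

# A minimal sketch for serving and smoke-testing the endpoint. The module name
# "app", the host, and port 7860 (the port free Hugging Face Spaces expose) are
# assumptions, not part of the original setup, so adjust them to your own
# deployment.
if __name__ == "__main__":
    import uvicorn

    # Launch the API locally; on a Space the platform's start command
    # typically handles this instead.
    uvicorn.run(app, host="0.0.0.0", port=7860)

# Example request once the server is running (hypothetical prompt and URL):
#
#   import requests
#   payload = {"prompt": "Once upon a time", "max_tokens": 50}
#   r = requests.post("http://localhost:7860/generate", json=payload)
#   print(r.json()["response"])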