"""FastAPI bridge that forwards chat turns to the Hugging Face Inference API.

Exposes a health check at ``/`` and a ``/chat`` endpoint that formats the
recent conversation as a Llama 3 instruct prompt, sends it to ``API_URL`` and
returns the chat history extended with the new exchange.
"""

import os
import traceback
from typing import Any, Dict, List

import requests
from fastapi import FastAPI, HTTPException
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

app = FastAPI(title="Hyper-Fast Serverless Engine v3.5")

# NOTE(review): wildcard origins together with allow_credentials=True is a very
# permissive CORS policy; confirm this is intended before exposing the service
# to browsers that send cookies or auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serverless Inference API Endpoint
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"

# Fetching token from Space Secrets (read from the HF_TOKEN environment variable).
# When it is unset, requests are sent without an Authorization header.
HF_TOKEN = os.getenv("HF_TOKEN", "")
headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

# Number of most recent history turns included in the prompt.
HISTORY_TURNS = 4

# Upper bound (seconds) on the upstream call so a stalled connection cannot
# hold a request open forever.
REQUEST_TIMEOUT_SECONDS = 60

SYSTEM_PROMPT = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    "You are a fast, responsive AI assistant.<|eot_id|>"
)


class ChatPayload(BaseModel):
    """Request body accepted by ``POST /chat``."""

    # Identifier supplied by the client; not read by the handlers in this file.
    user_id: str
    # The new user message to answer.
    user_message: str
    # Prior turns; each is read via its "role" and "content" keys.
    current_chat_history: List[Dict[str, Any]] = []
    # Opaque client state, echoed back unchanged as "updated_files".
    user_files: Dict[str, Any] = {}


@app.get("/")
def status():
    """Return a static payload indicating the service is up."""
    return {"status": "online", "engine": "Serverless GPU Bridge Active"}


def _build_prompt(history: List[Dict[str, Any]], user_query: str) -> str:
    """Format the system prompt, recent history and new query as one prompt."""
    parts = [SYSTEM_PROMPT]
    for turn in history[-HISTORY_TURNS:]:
        # Any role other than "user" is treated as an assistant turn.
        role = "user" if turn.get("role") == "user" else "assistant"
        content = turn.get("content", "")
        parts.append(
            f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"
        )
    # End with an open assistant header so the model generates the reply.
    parts.append(
        f"<|start_header_id|>user<|end_header_id|>\n\n{user_query}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )
    return "".join(parts)


def _extract_generated_text(result: Any) -> str:
    """Pull the generated text out of the decoded upstream JSON response."""
    if isinstance(result, list) and result:
        first = result[0]
        if isinstance(first, dict):
            return first.get("generated_text", "").strip()
    elif isinstance(result, dict) and "generated_text" in result:
        return result.get("generated_text", "").strip()
    # Unrecognised shape: surface the raw payload rather than failing.
    return str(result)


@app.post("/chat")
async def chat_endpoint(payload: ChatPayload):
    """Generate an assistant reply for ``payload.user_message``.

    Args:
        payload: The user's message, prior chat history and client file state.

    Returns:
        A dict with ``updated_chat_history`` (the incoming history plus the
        new user and assistant turns) and ``updated_files`` (``user_files``
        passed through unchanged).

    Raises:
        HTTPException: With the upstream status code when the inference API
            answers with anything other than 200, or with status 500 for any
            other failure (network error, timeout, undecodable response).
    """
    try:
        user_query = payload.user_message

        api_payload = {
            "inputs": _build_prompt(payload.current_chat_history, user_query),
            "parameters": {
                "max_new_tokens": 256,
                "temperature": 0.7,
                "return_full_text": False,
            },
        }

        # requests is blocking; run it in a worker thread so the event loop
        # keeps serving other requests while waiting on the upstream API.
        response = await run_in_threadpool(
            requests.post,
            API_URL,
            headers=headers,
            json=api_payload,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )

        if response.status_code != 200:
            raise HTTPException(
                status_code=response.status_code,
                detail=f"HF API Error: {response.text}",
            )

        generated_text = _extract_generated_text(response.json())

        updated_history = payload.current_chat_history + [
            {"role": "user", "content": user_query},
            {"role": "assistant", "content": generated_text},
        ]

        return {
            "updated_chat_history": updated_history,
            "updated_files": payload.user_files,
        }
    except HTTPException:
        # Keep the upstream status code instead of rewrapping it as a 500.
        raise
    except Exception as exec_err:
        # Print the message and full traceback so they appear in the console logs.
        print(f"Exception Triggered: {str(exec_err)}")
        traceback.print_exc()
        raise HTTPException(
            status_code=500,
            detail=f"Cloud Matrix Failure: {str(exec_err)}",
        ) from exec_err