import json
import os

import gradio as gr
import torch
from huggingface_hub import login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

# ── 0) Hugging Face login ────────────────────────────────────────────────────────
# Gated model (meta-llama) requires an authenticated token at download time.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable is not set.")
login(token=hf_token)

# ── 1) Load the model ────────────────────────────────────────────────────────────
base_model_id = "meta-llama/Llama-2-7b-chat-hf"
adapter_id = "mdot77/fingpt-llama2-7b-forecaster-finetuned"

tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=True)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    load_in_8bit=True,  # 8-bit quantization to save VRAM (requires bitsandbytes)
)

# Attach the LoRA/PEFT forecaster adapter on top of the base chat model.
model = PeftModel.from_pretrained(
    base_model,
    adapter_id,
    device_map="auto",
)
model.eval()

# ── 2) System-instruction template ───────────────────────────────────────────────
# NOTE(review): the original source read "[INST] <>" and had empty/dangling
# placeholders in the schema ('"new_alloc_pct": ,') — the <<SYS>> markers and
# the <...> placeholders were evidently stripped by an HTML-sanitizing step.
# Restored below to the standard Llama-2 chat format and a well-formed schema.
SYSTEM = """You are a portfolio optimization assistant. For a given stock snapshot, recommend how the allocation should be adjusted. Your response MUST be valid JSON matching this schema:
{
  "ticker": "<ticker>",
  "snapshot": "<snapshot-date>",
  "verdict": "<verdict>",
  "new_alloc_pct": <number>,
  "reasoning": "<reasoning>"
}
Do not include any extra keys or commentary.
At the end, emit only the JSON."""


# ── 3) Inference function ────────────────────────────────────────────────────────
def infer(data_json: str) -> str:
    """Generate an allocation verdict for a snapshot.

    Args:
        data_json: The snapshot as a JSON string (pasted verbatim into the prompt).

    Returns:
        Pretty-printed JSON if the model's reply parses as JSON, otherwise the
        raw decoded reply.
    """
    # Hardcoded generation parameters.
    max_new_tokens = 256
    temperature = 0.0  # 0.0 → greedy decoding
    top_p = 1.0

    # Standard Llama-2 chat prompt: [INST] <<SYS>> system <</SYS>> user [/INST]
    prompt = (
        "[INST] <<SYS>>\n"
        f"{SYSTEM}\n"
        "<</SYS>>\n\n"
        "DATA:\n"
        f"{data_json}\n"
        "[/INST]"
    )
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(model.device)

    # Only pass sampling parameters when actually sampling; passing
    # temperature=0.0 with do_sample=False triggers transformers warnings.
    gen_kwargs = {"max_new_tokens": max_new_tokens, "use_cache": True}
    if temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        gen_kwargs["do_sample"] = False
    gen_cfg = GenerationConfig(**gen_kwargs)

    # No gradients are needed for generation.
    with torch.inference_mode():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            generation_config=gen_cfg,
        )

    # Strip the prompt tokens; decode only the newly generated continuation.
    new_tokens = outputs[0, inputs["input_ids"].shape[-1]:]
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Re-serialize for consistent indentation when the reply is valid JSON.
    try:
        return json.dumps(json.loads(reply), indent=2)
    except json.JSONDecodeError:
        return reply


# ── 4) API wrapper function for the main prediction ──────────────────────────────
def predict_api_wrapper(request_data):
    """Adapt a Gradio-style API request to :func:`infer`.

    Accepts either ``{"data": ["<snapshot json>", ...]}`` (Gradio API shape,
    first element used) or a bare JSON string. Returns a dict: the parsed model
    output, ``{"raw_output": ...}`` when the model emitted non-JSON, or
    ``{"error": ...}`` on failure.
    """
    try:
        # Extract the snapshot payload from whichever request shape we got.
        if isinstance(request_data, dict) and "data" in request_data and len(request_data["data"]) > 0:
            data_json = request_data["data"][0]
        elif isinstance(request_data, str):
            data_json = request_data
        else:
            return {"error": "No data provided"}

        result = infer(data_json)

        # infer() may return pretty-printed JSON or raw text; normalize to a dict.
        try:
            parsed_result = json.loads(result) if isinstance(result, str) else result
        except json.JSONDecodeError:
            parsed_result = {"raw_output": result}

        return parsed_result

    except Exception as e:
        # Top-level API boundary: surface the failure to the caller as JSON.
        return {"error": str(e)}


# ── 5) Gradio interface with API endpoints ───────────────────────────────────────
with gr.Blocks(title="Portfolio-Optimizer Inference") as iface:
    gr.Markdown("# Portfolio-Optimizer Inference")
    gr.Markdown("Paste your snapshot JSON and get back a single-JSON allocation verdict.")

    with gr.Tab("Inference"):
        input_text = gr.Textbox(
            label="Snapshot data (JSON)",
            lines=15,
            value='{"ticker": "COIN", "snapshot": "2022-06-18", "previous_allocation_pct": 0.05}'
        )
        output_json = gr.JSON(label="Model output")
        predict_btn = gr.Button("Predict")
        predict_btn.click(fn=infer, inputs=input_text, outputs=output_json)

    with gr.Tab("API Testing"):
        gr.Markdown("## API Testing Interface")
        gr.Markdown("Use this to test the API functionality. The main API endpoint is available at `/api/predict/`")
        api_input = gr.Textbox(
            label="API Request Data (JSON)",
            lines=10,
            value='{"data": ["{\\"ticker\\": \\"AAPL\\", \\"snapshot\\": \\"2025-01-01\\", \\"previous_allocation_pct\\": 0.05}"]}'
        )
        api_output = gr.JSON(label="API Response")
        api_btn = gr.Button("Test API")
        api_btn.click(fn=predict_api_wrapper, inputs=api_input, outputs=api_output)

# ── 6) Launch for Hugging Face Spaces ────────────────────────────────────────────
if __name__ == "__main__":
    # For Hugging Face Spaces, use the default launch.
    # The API endpoints will be available at /api/predict/ automatically.
    iface.launch()