"""Gradio front end exposing a GGUF chat model through llama-cpp-python."""

import os
import threading

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Deployment settings; each one can be overridden through the environment.
MODEL_REPO_ID = os.getenv("MODEL_REPO_ID", "EREN121232/MAJESTIC-FIN-R1-gguf")
MODEL_FILENAME = os.getenv("MODEL_FILENAME", "MAJESTIC-FIN-R1-Q8_0.gguf")
MODEL_LABEL = os.getenv("MODEL_LABEL", "MAJESTIC-FIN-R1 Q8_0")
N_CTX = int(os.getenv("N_CTX", "4096"))
# CPU_CORES wins over N_THREADS; fall back to the detected core count (or 2).
N_THREADS = int(
    os.getenv("CPU_CORES", os.getenv("N_THREADS", str(os.cpu_count() or 2)))
)

# Lazily created model instance shared by all requests.
_MODEL = None
# Guards creation of _MODEL so the weights are downloaded/loaded only once.
_MODEL_LOCK = threading.Lock()
# Serializes calls into the shared Llama instance.
_INFER_LOCK = threading.Lock()


def get_model() -> Llama:
    """Return the shared ``Llama`` instance, creating it on first use.

    On the first call the GGUF file named by ``MODEL_REPO_ID`` /
    ``MODEL_FILENAME`` is fetched with ``hf_hub_download`` and loaded with
    ``n_gpu_layers=0``. Later calls return the cached instance.
    """
    global _MODEL
    with _MODEL_LOCK:
        if _MODEL is None:
            model_path = hf_hub_download(
                repo_id=MODEL_REPO_ID,
                filename=MODEL_FILENAME,
            )
            _MODEL = Llama(
                model_path=model_path,
                n_ctx=N_CTX,
                n_threads=N_THREADS,
                n_gpu_layers=0,
                verbose=False,
            )
        return _MODEL


def generate(
    prompt: str,
    system_prompt: str,
    temperature: float,
    max_tokens: int,
    top_p: float,
    repeat_penalty: float,
) -> str:
    """Run one chat completion and return the assistant's reply.

    Args:
        prompt: User message. Blank (or missing) input short-circuits with a
            hint instead of calling the model.
        system_prompt: Optional system message; skipped when blank or missing.
        temperature: Sampling temperature, coerced to ``float``.
        max_tokens: Maximum number of tokens to generate, coerced to ``int``.
        top_p: Nucleus-sampling threshold, coerced to ``float``.
        repeat_penalty: Repetition penalty, coerced to ``float``.

    Returns:
        The stripped reply text, ``""`` if the model returned no content, or
        ``"Please enter a prompt."`` when the prompt is empty.
    """
    # A null textbox payload from an API client would arrive as None; treat it
    # like an empty string instead of failing on ``None.strip()``.
    prompt = (prompt or "").strip()
    system_prompt = (system_prompt or "").strip()
    if not prompt:
        return "Please enter a prompt."

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    llm = get_model()
    with _INFER_LOCK:
        response = llm.create_chat_completion(
            messages=messages,
            temperature=float(temperature),
            max_tokens=int(max_tokens),
            top_p=float(top_p),
            repeat_penalty=float(repeat_penalty),
        )

    # The message content is optional in the response schema; guard against
    # None so an empty completion yields "" rather than an AttributeError.
    content = response["choices"][0]["message"]["content"]
    return (content or "").strip()


with gr.Blocks(title="MAJESTIC FIN R1 Free API") as demo:
    gr.Markdown(
        f"""
        # MAJESTIC FIN R1 Free API

        Public CPU deployment for `{MODEL_LABEL}` backed by `llama-cpp-python`.
        The API endpoint name is `/chat`.
        """
    )

    prompt = gr.Textbox(
        label="Prompt",
        lines=8,
        placeholder="Ask about finance, markets, accounting, or your fine-tuned task.",
    )
    output = gr.Textbox(label="Response", lines=14)

    with gr.Accordion("Generation Settings", open=False):
        system_prompt = gr.Textbox(
            label="System Prompt",
            lines=4,
            value="You are MAJESTIC-FIN-R1, a helpful finance-focused assistant.",
        )
        temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.05, label="Temperature")
        max_tokens = gr.Slider(64, 1024, value=256, step=32, label="Max Tokens")
        top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top P")
        repeat_penalty = gr.Slider(
            1.0, 1.5, value=1.1, step=0.05, label="Repeat Penalty"
        )

    run_button = gr.Button("Generate", variant="primary")

    gr.Examples(
        examples=[
            ["Summarize the key risks in a company's balance sheet."],
            ["Explain EBITDA vs free cash flow in simple terms."],
            ["Give a short market outlook for a cautious investor."],
        ],
        inputs=prompt,
    )

    # The button click is the event published under the "chat" API name.
    run_button.click(
        fn=generate,
        inputs=[prompt, system_prompt, temperature, max_tokens, top_p, repeat_penalty],
        outputs=output,
        api_name="chat",
        show_progress="minimal",
        concurrency_limit=1,
    )
    # Pressing Enter in the prompt box runs the same handler.
    prompt.submit(
        fn=generate,
        inputs=[prompt, system_prompt, temperature, max_tokens, top_p, repeat_penalty],
        outputs=output,
        show_progress="minimal",
        concurrency_limit=1,
    )


if __name__ == "__main__":
    demo.queue(max_size=16).launch()