mo35 committed on
Commit
cfde10f
·
0 Parent(s):

Add Gradio chat interface

Browse files
Files changed (2) hide show
  1. app.py +84 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HuggingFace Space — Gemma 4 Quantitative Finance Chat
3
+ Hardware: Nvidia T4 medium (16 GB VRAM)
4
+ """
5
+
6
+ import os
7
+ import torch
8
+ import gradio as gr
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
10
+ from peft import PeftModel
11
+
12
# Runtime configuration, read from the environment.
# An unset or empty HF_TOKEN becomes None so the Hub client falls back to its
# default credential lookup instead of being handed an empty token string.
HF_TOKEN = os.environ.get("HF_TOKEN") or None
# An empty HF_USERNAME would produce an invalid repo id, so it also falls back.
HF_USERNAME = os.environ.get("HF_USERNAME") or "mo35"
BASE_MODEL = "google/gemma-4-E4B-it"
LORA_REPO = f"{HF_USERNAME}/gemma4-quantfin-lora"

# ── Load model at startup ─────────────────────────────────────────────────────
# Everything below runs once at import time; `tokenizer` and `model` are the
# module-level objects used by the inference function.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN)

print("Loading base model in 4-bit...")
# 4-bit NF4 weights with double quantization; compute dtype is bfloat16.
# NOTE(review): the module docstring targets an Nvidia T4, which presumably
# lacks native bfloat16 support — confirm this compute dtype is intended.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    token=HF_TOKEN,
)

print(f"Loading LoRA adapter from {LORA_REPO}...")
# Wrap the quantized base model with the fine-tuned LoRA adapter weights.
model = PeftModel.from_pretrained(base_model, LORA_REPO, token=HF_TOKEN)
model.eval()
print("Model ready.")
39
+
40
+ # ── Inference ─────────────────────────────────────────────────────────────────
41
def _content_to_text(content) -> str:
    """Reduce one chat-history content value to plain text ("" if none)."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # Structured content: keep only the textual parts.
        texts = [
            part.get("text", "")
            for part in content
            if isinstance(part, dict) and part.get("type") == "text"
        ]
        return "\n".join(text for text in texts if text)
    return ""


def _history_to_messages(history: list) -> list:
    """Convert Gradio chat history into a list of role/content messages.

    Accepts both history layouts: a list of ``{"role", "content"}`` dicts and
    a list of ``[user_text, assistant_text]`` pairs. Entries with no text or
    with a role other than "user"/"assistant" are skipped.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            entries = [(turn.get("role"), turn.get("content"))]
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            entries = [("user", turn[0]), ("assistant", turn[1])]
        else:
            continue
        for role, content in entries:
            text = _content_to_text(content)
            if role in ("user", "assistant") and text:
                messages.append({"role": role, "content": text})
    return messages


def respond(message: str, history: list) -> str:
    """Generate the assistant's reply to ``message``.

    Args:
        message: The latest user message.
        history: Earlier turns of the conversation as supplied by the chat
            UI. They are prepended to the prompt so the reply can take the
            conversation so far into account.

    Returns:
        The newly generated text only (the prompt tokens are stripped), with
        special tokens removed.
    """
    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": message})

    # Ask explicitly for a mapping (input_ids + attention_mask) so the code
    # does not depend on the library's default return type.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            temperature=0.7,
            do_sample=True,
            repetition_penalty=1.1,
        )

    # generate() returns prompt + completion; keep only the completion.
    return tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True,
    )
63
+
64
+ # ── Gradio UI ─────────────────────────────────────────────────────────────────
65
# Chat UI wired to `respond`. The example prompts are offered to the user but
# not pre-computed (cache_examples=False).
demo = gr.ChatInterface(
    fn=respond,
    title="Gemma 4 — Quantitative Finance",
    description=(
        "A specialized AI assistant fine-tuned on quantitative finance: derivatives pricing, "
        "stochastic calculus, risk models, and portfolio theory. "
        "Answers include LaTeX mathematical derivations."
    ),
    examples=[
        "Derive the Black-Scholes PDE from first principles.",
        "Explain the SABR model and its implied volatility approximation.",
        "What is the difference between risk-neutral and real-world measures?",
        "Derive the Heston model characteristic function.",
        "Explain Value at Risk vs Expected Shortfall.",
    ],
    theme=gr.themes.Soft(),
    cache_examples=False,
)

# Start the web server when the module is run.
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ transformers>=4.49.0
3
+ peft>=0.13.0
4
+ bitsandbytes>=0.43.0
5
+ accelerate>=0.26.0
6
+ torch>=2.1.0