Datangtang committed on
Commit ac0916f · verified · 1 Parent(s): 6e74518

go back to 1b & 3b

Files changed (1): app.py +114 -67
app.py CHANGED
@@ -3,79 +3,126 @@ from llama_cpp import Llama
  from huggingface_hub import hf_hub_download
  import os

- print("Downloading GGUF model from HuggingFace...")
-
- # Download model
- model_path = hf_hub_download(
-     repo_id="Datangtang/GGUF1B",
-     filename="llama-3.2-1b-instruct.Q4_K_M.gguf",
-     local_dir="./model",
-     token=os.environ["HF_TOKEN"]
- )
-
- print(f"Model downloaded to: {model_path}")
- print("Loading GGUF model with optimized settings...")
-
- # Load with optimized settings
- llm = Llama(
-     model_path=model_path,
-     n_ctx=1024,      # Reduced from 2048 (faster)
-     n_threads=6,     # Increased from 4 (use more CPU)
-     n_batch=512,     # Added: larger batch for faster processing
-     n_gpu_layers=0,
-     verbose=False,
-     use_mlock=True,  # Keep model in RAM
-     use_mmap=True,   # Use memory mapping
- )
-
- print("Model loaded successfully!")
-
- def chat(message, history):
-     """Handle chat interactions"""
-     # Build conversation (keep it short)
-     conversation = ""
-
-     # Only use last 3 turns of history to keep context short
-     recent_history = history[-3:] if len(history) > 3 else history
-
-     for human, assistant in recent_history:
-         conversation += f"User: {human}\n"
-         conversation += f"Assistant: {assistant}\n"
-
-     conversation += f"User: {message}\n"
-     conversation += "Assistant:"
-
-     # Generate with optimized settings
      response = llm(
          conversation,
-         max_tokens=128,  # Reduced from 256 (faster)
          temperature=0.7,
          top_p=0.9,
-         top_k=40,  # Added: limit sampling
          repeat_penalty=1.1,
-         stop=["User:", "\n\n"],
-         echo=False,
      )
-
-     return response['choices'][0]['text'].strip()
-
- # Create interface WITHOUT example caching
- demo = gr.ChatInterface(
-     fn=chat,
-     title="kkkkkkatherine/llama-3.2-1b-finetome-1000steps-gguf",
-     description=(
-         "Best model from 8 experiments (1000 steps, 23% loss improvement) | "
-         "Optimized with GGUF Q4_K_M quantization | "
-         "ID2223 Lab 2"
-     ),
-     examples=[
-         "What is machine learning?",
-         "Explain AI briefly",
-         "What is LoRA?",
-     ],
-     cache_examples=False,  # IMPORTANT: Disable caching
-     theme="soft",
- )

  if __name__ == "__main__":
      demo.launch()
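A caveat about the prompt format in chat(), in both the old version above and the new one below: the prompt is assembled as a plain "User:/Assistant:" transcript, which is not the chat template the Llama 3.2 Instruct GGUFs were tuned on, so the model can drift into imitating the transcript instead of answering. llama-cpp-python can apply the template embedded in the GGUF itself via create_chat_completion. A minimal sketch, reusing llm, message, and history from the listings; this is an alternative formulation, not what the commit ships:

    # Sketch: let llama.cpp apply the model's own chat template.
    # Assumes `llm`, `message`, and `history` as in the listings.
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    for human, assistant in history[-3:]:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    result = llm.create_chat_completion(
        messages=messages,
        max_tokens=128,
        temperature=0.7,
        top_p=0.9,
    )
    reply = result["choices"][0]["message"]["content"].strip()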
 
  from huggingface_hub import hf_hub_download
  import os

+ # ----------------------------------------
+ # Global model cache
+ # ----------------------------------------
+ loaded_models = {}         # Cache loaded Llama models
+ current_model_name = None
+
+ MODEL_CONFIGS = {
+     "1B Model (Datangtang/GGUF1B)": {
+         "repo_id": "Datangtang/GGUF1B",
+         "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf"
+     },
+     "3B Model (Datangtang/GGUF3B)": {
+         # Assumed to match the label; the commit read "GGGF3B", which
+         # looks like a typo and would fail the 3B download.
+         "repo_id": "Datangtang/GGUF3B",
+         "filename": "llama-3.2-3b-instruct.Q4_K_M.gguf"
+     }
+ }
+
+
+ # ----------------------------------------
+ # Load model function
+ # ----------------------------------------
+ def load_model(model_choice):
+     global loaded_models, current_model_name
+
+     if model_choice in loaded_models:
+         print(f"Reusing already loaded model: {model_choice}")
+         current_model_name = model_choice
+         return loaded_models[model_choice]
+
+     print(f"Downloading model: {model_choice}")
+
+     cfg = MODEL_CONFIGS[model_choice]
+
+     model_path = hf_hub_download(
+         repo_id=cfg["repo_id"],
+         filename=cfg["filename"],
+         local_dir="./model",
+         token=os.environ["HF_TOKEN"]
+     )
+
+     print(f"Model downloaded to: {model_path}")
+     print("Loading GGUF model into memory...")
+
+     llm = Llama(
+         model_path=model_path,
+         n_ctx=1024,
+         n_threads=6,
+         n_batch=512,
+         n_gpu_layers=0,
+         use_mmap=True,
+         use_mlock=True,
+         verbose=False,
+     )
+
+     loaded_models[model_choice] = llm
+     current_model_name = model_choice
+
+     print("Model loaded successfully!")
+     return llm
+
+
+ # ----------------------------------------
+ # Chat function
+ # ----------------------------------------
+ def chat(message, history, model_choice):
+     llm = load_model(model_choice)
+
+     # System prompt
+     conversation = "System: You are a helpful assistant.\n"
+
+     # Add the last 3 turns of history
+     for human, assistant in history[-3:]:
+         conversation += f"User: {human}\nAssistant: {assistant}\n"
+
+     conversation += f"User: {message}\nAssistant:"
+
      response = llm(
          conversation,
+         max_tokens=128,
          temperature=0.7,
          top_p=0.9,
+         top_k=40,
          repeat_penalty=1.1,
+         stop=["User:", "Assistant:"],
+         echo=False
      )
+
+     return response["choices"][0]["text"].strip()
+
+
+ # ----------------------------------------
+ # Gradio UI
+ # ----------------------------------------
+ with gr.Blocks() as demo:
+
+     gr.Markdown("# 🦙 Datangtang GGUF Model Demo")
+     gr.Markdown("Switch between **1B** and **3B** GGUF models in real-time.")
+
+     model_choice = gr.Dropdown(
+         label="Select Model",
+         choices=list(MODEL_CONFIGS.keys()),
+         value="1B Model (Datangtang/GGUF1B)",
+     )
+
+     chat_iface = gr.ChatInterface(
+         fn=chat,
+         # Pass the dropdown through additional_inputs so chat() receives the
+         # live selection; a lambda reading model_choice.value would capture
+         # only the initial value.
+         additional_inputs=[model_choice],
+         # With additional_inputs set, each example lists the message plus the
+         # additional input values.
+         examples=[
+             ["Explain deep learning in one paragraph.", "1B Model (Datangtang/GGUF1B)"],
+             ["What is the difference between supervised and unsupervised learning?", "1B Model (Datangtang/GGUF1B)"],
+             ["Explain what a transformer model is.", "1B Model (Datangtang/GGUF1B)"],
+         ],
+         cache_examples=False,
+     )
+
+     model_choice.change(
+         # The handler's return value has nowhere to go with outputs=[];
+         # log the switch instead of returning a string.
+         fn=lambda x: print(f"🔄 Switched to: {x}"),
+         inputs=[model_choice],
+         outputs=[],
+     )
+

  if __name__ == "__main__":
      demo.launch()
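A note on the model-switch wiring: gr.ChatInterface resolves additional_inputs on every call, so chat() sees whatever the dropdown currently holds, whereas a model_choice.value read inside a lambda is fixed when the UI is built and never changes afterwards. A minimal self-contained sketch of the pattern; respond and choice are placeholder names, not from the commit:

    import gradio as gr

    def respond(message, history, model_name):
        # model_name is the dropdown's value at call time, not at build time
        return f"[{model_name}] echo: {message}"

    with gr.Blocks() as demo:
        choice = gr.Dropdown(choices=["1B", "3B"], value="1B", label="Select Model")
        gr.ChatInterface(fn=respond, additional_inputs=[choice])

    demo.launch()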