Datangtang committed
Commit 77cf31a · verified · 1 Parent(s): a3cfd53

Code from the first successful run

Files changed (1)
  1. app.py +67 -109
app.py CHANGED
@@ -3,119 +3,77 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
 
-# ----------------------------------------
-# Global model cache
-# ----------------------------------------
-loaded_models = {}
-current_model_name = None
-
-MODEL_CONFIGS = {
-    "1B Model (Datangtang/GGUF1B)": {
-        "repo_id": "Datangtang/GFUF1B",
-        "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf"
-    },
-    "3B Model (Datangtang/GGUF3B)": {
-        "repo_id": "Datangtang/GGUF3B",
-        "filename": "llama-3.2-3b-instruct.Q4_K_M.gguf"
-    }
-}
-
-
-# ----------------------------------------
-# Load model function
-# ----------------------------------------
-def load_model(model_choice):
-    if model_choice in loaded_models:
-        print(f"Reusing already loaded model: {model_choice}")
-        return loaded_models[model_choice]
-
-    cfg = MODEL_CONFIGS[model_choice]
-
-    print(f"Downloading model: {model_choice}")
-    model_path = hf_hub_download(
-        repo_id=cfg["repo_id"],
-        filename=cfg["filename"],
-        local_dir="./model",
-        token=os.environ["HF_TOKEN"]
-    )
-
-    print("Loading model into memory...")
-    llm = Llama(
-        model_path=model_path,
-        n_ctx=1024,
-        n_threads=6,
-        n_batch=512,
-        n_gpu_layers=0,
-        use_mmap=True,
-        use_mlock=True,
-        verbose=False
-    )
-
-    loaded_models[model_choice] = llm
-    print("Model loaded successfully!")
-    return llm
-
-
-# ----------------------------------------
-# Chat function (HuggingFace-compatible)
-# ----------------------------------------
-def chat(message, history, model_choice):
-    llm = load_model(model_choice)
-
-    # Build conversation prompt
-    conversation = "System: You are a helpful assistant.\n"
-
-    for human, assistant in history[-3:]:
+print("Downloading GGUF model from HuggingFace...")
+
+# Download model
+model_path = hf_hub_download(
+    repo_id="Datangtang/GGUF3B",
+    filename="llama-3.2-3b-instruct.Q4_K_M.gguf",
+    local_dir="./model"
+)
+
+print(f"Model downloaded to: {model_path}")
+print("Loading GGUF model with optimized settings...")
+
+# Load with optimized settings
+llm = Llama(
+    model_path=model_path,
+    n_ctx=1024,      # Reduced from 2048 (faster)
+    n_threads=6,     # Increased from 4 (use more CPU)
+    n_batch=512,     # Added: larger batch for faster processing
+    n_gpu_layers=0,
+    verbose=False,
+    use_mlock=True,  # Keep model in RAM
+    use_mmap=True,   # Use memory mapping
+)
+
+print("Model loaded successfully!")
+
+def chat(message, history):
+    """Handle chat interactions"""
+    # Build conversation (keep it short)
+    conversation = ""
+
+    # Only use last 3 turns of history to keep context short
+    recent_history = history[-3:] if len(history) > 3 else history
+
+    for human, assistant in recent_history:
         conversation += f"User: {human}\n"
-        if assistant:
-            conversation += f"Assistant: {assistant}\n"
-
-    conversation += f"User: {message}\nAssistant:"
-
+        conversation += f"Assistant: {assistant}\n"
+
+    conversation += f"User: {message}\n"
+    conversation += "Assistant:"
+
+    # Generate with optimized settings
     response = llm(
         conversation,
-        max_tokens=128,
+        max_tokens=128,  # Reduced from 256 (faster)
         temperature=0.7,
         top_p=0.9,
-        top_k=40,
+        top_k=40,        # Added: limit sampling
         repeat_penalty=1.1,
-        stop=["User:", "Assistant:"]
-    )
-
-    return response["choices"][0]["text"].strip()
-
-
-# ----------------------------------------
-# Gradio UI
-# ----------------------------------------
-with gr.Blocks() as demo:
-
-    gr.Markdown("## 🦙 Datangtang GGUF Model Demo")
-
-    model_choice = gr.Dropdown(
-        label="Select Model",
-        choices=list(MODEL_CONFIGS.keys()),
-        value="1B Model (Datangtang/GGUF1B)"
+        stop=["User:", "\n\n"],
+        echo=False,
     )
-
-    chatbot = gr.Chatbot()
-    msg_box = gr.Textbox(label="Message")
-
-    # Add user message to history
-    def user_send(message, history):
-        history = history + [[message, None]]
-        return history, ""
-
-    # Generate bot response
-    def bot_reply(history, model_choice):
-        user_msg = history[-1][0]
-        bot_msg = chat(user_msg, history[:-1], model_choice)
-        history[-1][1] = bot_msg
-        return history
-
-    # Wire events
-    msg_box.submit(user_send, [msg_box, chatbot], [chatbot, msg_box]).then(
-        bot_reply, [chatbot, model_choice], chatbot
-    )
-
-demo.launch()
+
+    return response['choices'][0]['text'].strip()
+
+# Create interface WITHOUT example caching
+demo = gr.ChatInterface(
+    fn=chat,
+    title="Bit & Sugar/llama-3.2-3b-finetome-1000steps-gguf",
+    description=(
+        "Best model from 8 experiments (1000 steps, 23% loss improvement) | "
+        "Optimized with GGUF Q4_K_M quantization | "
+        "ID2223 Lab 2"
+    ),
+    examples=[
+        "What is machine learning?",
+        "Explain AI briefly",
+        "What is LoRA?",
+    ],
+    cache_examples=False,  # IMPORTANT: Disable caching
+)
+
+if __name__ == "__main__":
+    demo.launch()
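
One behavioral change in the download step is easy to miss: the old load_model() passed token=os.environ["HF_TOKEN"] to hf_hub_download, while the new code downloads anonymously, which works only while Datangtang/GGUF3B stays public. A defensive variant, assuming the same HF_TOKEN environment variable the old code used:

import os
from huggingface_hub import hf_hub_download

# token=None falls back to anonymous access when HF_TOKEN is unset;
# a token is only needed if the GGUF repo is private or gated.
model_path = hf_hub_download(
    repo_id="Datangtang/GGUF3B",
    filename="llama-3.2-3b-instruct.Q4_K_M.gguf",
    local_dir="./model",
    token=os.environ.get("HF_TOKEN"),
)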
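
The prompt format the new chat() sends to the model can be checked without loading any weights. Below is a minimal sketch of the same string-building logic (build_prompt is a hypothetical helper, not part of app.py, and it assumes Gradio's classic [user, assistant] history pairs):

# Sketch of the prompt construction in the new chat() above.
def build_prompt(message, history):
    conversation = ""
    # history[-3:] already handles lists shorter than 3 turns,
    # so the len() guard in the commit is redundant but harmless.
    recent_history = history[-3:] if len(history) > 3 else history
    for human, assistant in recent_history:
        conversation += f"User: {human}\n"
        conversation += f"Assistant: {assistant}\n"
    conversation += f"User: {message}\n"
    conversation += "Assistant:"
    return conversation

print(build_prompt("What is LoRA?", [["Hi", "Hello! How can I help?"]]))
# User: Hi
# Assistant: Hello! How can I help?
# User: What is LoRA?
# Assistant: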
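
Once the Space is running, the gr.ChatInterface can also be queried programmatically. A sketch using gradio_client, assuming a Gradio version whose ChatInterface exposes its default /chat endpoint; SPACE_ID is a placeholder, since the commit does not name the Space:

from gradio_client import Client

client = Client("SPACE_ID")  # placeholder; the Space id is not in the commit
reply = client.predict("What is LoRA?", api_name="/chat")
print(reply)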