Datangtang committed
Commit 4061801 · verified · 1 Parent(s): 1adfee6

Update app.py

Files changed (1): app.py +72 -16
app.py CHANGED
@@ -1,24 +1,80 @@
  import gradio as gr
- from ctransformers import AutoModelForCausalLM

- model_id = "Datangtang/GGUF_New_1B"

- model = AutoModelForCausalLM.from_pretrained(
-     model_id,
-     model_file="llama-3.2-1b-instruct.Q4_K_M.gguf",  # change this to your GGUF filename
-     model_type="llama",
-     gpu_layers=0
  )

- def chat_fn(message):
-     response = model(message, max_new_tokens=200)
-     return response

- demo = gr.Interface(
-     fn=chat_fn,
-     inputs="text",
-     outputs="text",
-     title="My GGUF Model"
  )

- demo.launch()
  import gradio as gr
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+ import os

+ print("Downloading GGUF model from HuggingFace...")

+ # Download model
+ model_path = hf_hub_download(
+     repo_id="Datangtang/GGUF1B",
+     filename="llama-3.2-1b-instruct.Q4_K_M.gguf",
+     local_dir="./model"
  )

+ print(f"Model downloaded to: {model_path}")
+ print("Loading GGUF model with optimized settings...")

+ # Load with optimized settings
+ llm = Llama(
+     model_path=model_path,
+     n_ctx=1024,       # Reduced from 2048 (faster)
+     n_threads=6,      # Increased from 4 (use more CPU)
+     n_batch=512,      # Added: larger batch for faster processing
+     n_gpu_layers=0,
+     verbose=False,
+     use_mlock=True,   # Keep model in RAM
+     use_mmap=True,    # Use memory mapping
  )

+ print("Model loaded successfully!")
+
+ def chat(message, history):
+     """Handle chat interactions"""
+     # Build conversation (keep it short)
+     conversation = ""
+
+     # Only use last 3 turns of history to keep context short
+     recent_history = history[-3:] if len(history) > 3 else history
+
+     for human, assistant in recent_history:
+         conversation += f"User: {human}\n"
+         conversation += f"Assistant: {assistant}\n"
+
+     conversation += f"User: {message}\n"
+     conversation += "Assistant:"
+
+     # Generate with optimized settings
+     response = llm(
+         conversation,
+         max_tokens=128,        # Reduced from 256 (faster)
+         temperature=0.7,
+         top_p=0.9,
+         top_k=40,              # Added: limit sampling
+         repeat_penalty=1.1,
+         stop=["User:", "\n\n"],
+         echo=False,
+     )
+
+     return response['choices'][0]['text'].strip()
+
+ # Create interface WITHOUT example caching
+ demo = gr.ChatInterface(
+     fn=chat,
+     title="kkkkkkatherine/llama-3.2-1b-finetome-1000steps-gguf",
+     description=(
+         "Best model from 8 experiments (1000 steps, 23% loss improvement) | "
+         "Optimized with GGUF Q4_K_M quantization | "
+         "ID2223 Lab 2"
+     ),
+     examples=[
+         "What is machine learning?",
+         "Explain AI briefly",
+         "What is LoRA?",
+     ],
+     cache_examples=False,  # IMPORTANT: Disable caching
+     theme="soft",
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
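
Review note on the new chat(): the prompt is rebuilt as plain "User:/Assistant:" text on every turn, and history is truncated to the last three exchanges so the 1024-token context is not exhausted. Below is a minimal sketch of just that truncation logic; build_prompt is a hypothetical helper name, not part of this commit, and the stubbed history lets it run without the GGUF file or llama-cpp-python installed.

# Sketch only: build_prompt is hypothetical, mirroring the logic in chat().
def build_prompt(message, history):
    conversation = ""
    # history[-3:] keeps at most the last 3 (user, assistant) turns;
    # on shorter histories it simply returns the whole list.
    for human, assistant in history[-3:]:
        conversation += f"User: {human}\n"
        conversation += f"Assistant: {assistant}\n"
    conversation += f"User: {message}\n"
    conversation += "Assistant:"
    return conversation

history = [
    ("hi", "Hello!"),
    ("what is 2+2?", "4"),
    ("who are you?", "A demo bot."),
    ("thanks", "You're welcome!"),
]
print(build_prompt("What is LoRA?", history))
# The oldest turn ("hi") is dropped; the prompt ends with "Assistant:" so
# generation continues from the assistant's side.

Since history[-3:] already handles short lists, the len(history) > 3 guard in the committed code is harmless but redundant. One assumption worth flagging: the new imports imply the Space's requirements.txt needs gradio, llama-cpp-python, and huggingface_hub, but this commit only touches app.py.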