Hitoku committed · verified
Commit 156e73e · 1 Parent(s): 26e41fd

Update app.py

Files changed (1)
  1. app.py +75 -11
app.py CHANGED
@@ -1,17 +1,81 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM
-import torch
-
-model_id = "Datangtang/lora_lab2_model_1B"
-
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")
-
-def chat_fn(message):
-    inputs = tokenizer(message, return_tensors="pt")
-    outputs = model.generate(**inputs, max_new_tokens=150)
-    return tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-demo = gr.Interface(fn=chat_fn, inputs="text", outputs="text", title="My Finetuned LLM Chat")
-
-demo.launch()
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
+import os
+
+print("Downloading GGUF model from HuggingFace...")
+
+# Download model
+model_path = hf_hub_download(
+    repo_id="Datangtang/GGUF1B",
+    filename="llama-3.2-1b-instruct.Q4_K_M.gguf",
+    local_dir="./model",
+    token=os.environ["HF_TOKEN"]
+)
+
+print(f"Model downloaded to: {model_path}")
+print("Loading GGUF model with optimized settings...")
+
+# Load with optimized settings
+llm = Llama(
+    model_path=model_path,
+    n_ctx=1024,        # Reduced from 2048 (faster)
+    n_threads=6,       # Increased from 4 (use more CPU)
+    n_batch=512,       # Added: larger batch for faster processing
+    n_gpu_layers=0,
+    verbose=False,
+    use_mlock=True,    # Keep model in RAM
+    use_mmap=True,     # Use memory mapping
+)
+
+print("Model loaded successfully!")
+
+def chat(message, history):
+    """Handle chat interactions"""
+    # Build conversation (keep it short)
+    conversation = ""
+
+    # Only use last 3 turns of history to keep context short
+    recent_history = history[-3:] if len(history) > 3 else history
+
+    for human, assistant in recent_history:
+        conversation += f"User: {human}\n"
+        conversation += f"Assistant: {assistant}\n"
+
+    conversation += f"User: {message}\n"
+    conversation += "Assistant:"
+
+    # Generate with optimized settings
+    response = llm(
+        conversation,
+        max_tokens=128,          # Reduced from 256 (faster)
+        temperature=0.7,
+        top_p=0.9,
+        top_k=40,                # Added: limit sampling
+        repeat_penalty=1.1,
+        stop=["User:", "\n\n"],
+        echo=False,
+    )
+
+    return response['choices'][0]['text'].strip()
+
+# Create interface WITHOUT example caching
+demo = gr.ChatInterface(
+    fn=chat,
+    title="kkkkkkatherine/llama-3.2-1b-finetome-1000steps-gguf",
+    description=(
+        "Best model from 8 experiments (1000 steps, 23% loss improvement) | "
+        "Optimized with GGUF Q4_K_M quantization | "
+        "ID2223 Lab 2"
+    ),
+    examples=[
+        "What is machine learning?",
+        "Explain AI briefly",
+        "What is LoRA?",
+    ],
+    cache_examples=False,    # IMPORTANT: Disable caching
+    theme="soft",
+)
+
+if __name__ == "__main__":
+    demo.launch()
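
Note for anyone adapting this commit: the new chat() builds a plain "User:/Assistant:" prompt string rather than the Llama 3.2 Instruct chat template. llama-cpp-python also provides create_chat_completion(), which applies the chat template stored in the GGUF metadata. The following is only a sketch of that alternative, assuming the same llm object constructed above and the pair-style history that gr.ChatInterface passes by default:

# Sketch: chat-template variant of chat(), using the llm object loaded above.
def chat_with_template(message, history):
    messages = []
    # history is assumed to be a list of (user, assistant) pairs
    for human, assistant in history[-3:]:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    result = llm.create_chat_completion(
        messages=messages,
        max_tokens=128,
        temperature=0.7,
        top_p=0.9,
    )
    return result["choices"][0]["message"]["content"].strip()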
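
Once the Space is running, the ChatInterface can also be called programmatically with gradio_client. This is a minimal sketch; the Space id below is a placeholder, and the exact endpoint name should be confirmed in the Space's "Use via API" panel (ChatInterface normally exposes /chat):

# Sketch: querying the deployed Space from another machine.
from gradio_client import Client

client = Client("Datangtang/your-space-name")  # hypothetical Space id
reply = client.predict("What is LoRA?", api_name="/chat")
print(reply)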