Fu01978 committed
Commit c392854 · verified · 1 Parent(s): 98b52f2

Update app.py

Files changed (1)
  1. app.py +123 -15
app.py CHANGED
@@ -1,33 +1,141 @@
 import os
-from huggingface_hub import hf_hub_download
+import requests
+from huggingface_hub import hf_hub_download, hf_hub_url
 from llama_cpp import Llama
 import gradio as gr
+import tempfile
 
-REPO_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
-FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
+# -------------------------
+# Config: change if you want
+# -------------------------
+REPO_ID = "mradermacher/EuroLLM-1.7B-Instruct-GGUF"
+FILENAME = "EuroLLM-1.7B-Instruct.Q8_0.gguf"
+SYSTEM_PROMPT = "You are a helpful assistant. Answer concisely and helpfully."
 
-model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
+# local path we'll store the model
+MODEL_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), "models")
+os.makedirs(MODEL_DIR, exist_ok=True)
+MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
 
+# -------------------------
+# Helper: robust download
+# -------------------------
+def download_from_hf(repo_id: str, filename: str, dest: str) -> str:
+    """Download using huggingface_hub if possible; fallback to direct url via requests."""
+    if os.path.exists(dest) and os.path.getsize(dest) > 0:
+        print(f"Model already exists at {dest}")
+        return dest
+
+    try:
+        print("Trying hf_hub_download...")
+        path = hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=MODEL_DIR)
+        # hf_hub_download may return a cache path; copy/move to dest if needed
+        if os.path.abspath(path) != os.path.abspath(dest):
+            # move the cached file into our models folder with the expected name
+            os.replace(path, dest)
+            path = dest
+        print("Downloaded via hf_hub_download:", path)
+        return path
+    except Exception as e:
+        print("hf_hub_download failed:", e)
+        # fallback: construct the direct URL and download via requests
+        try:
+            print("Falling back to direct URL via requests...")
+            url = hf_hub_url(repo_id=repo_id, filename=filename)
+            # url is the Hub URL (signed? but usually works for public repos)
+            # If user provided direct URL with ?download=true, you can paste that directly.
+            print("Downloading from:", url)
+            with requests.get(url, stream=True, timeout=60) as r:
+                r.raise_for_status()
+                with open(dest, "wb") as f:
+                    for chunk in r.iter_content(chunk_size=8192):
+                        if chunk:
+                            f.write(chunk)
+            print("Downloaded fallback to:", dest)
+            return dest
+        except Exception as e2:
+            raise RuntimeError(f"Both hf_hub_download and direct download failed: {e2}")
+
+# -------------------------
+# Ensure model is present
+# -------------------------
+model_path = download_from_hf(REPO_ID, FILENAME, MODEL_PATH)
+
+# -------------------------
+# Load the model (llama-cpp-python)
+# -------------------------
 llm = Llama(
     model_path=model_path,
-    n_ctx=2048,
+    n_ctx=2048,        # lower if you need less memory
     n_threads=4,
-    n_gpu_layers=0
+    n_gpu_layers=0,    # CPU-only. If you have GPU layers available, adjust.
+    # stream is set per-call in create_chat_completion below.
 )
 
-def chat_fn(message, history):
-    out = llm(
-        message,
-        max_tokens=256,
-        temperature=0.7,
+# -------------------------
+# Chat formatting helpers
+# -------------------------
+def build_messages(history, user_message, system_prompt=SYSTEM_PROMPT):
+    """
+    Convert history (list of [user, assistant]) into chat messages format expected by create_chat_completion.
+    Then append the current user_message at the end.
+    """
+    messages = []
+    if system_prompt:
+        messages.append({"role": "system", "content": system_prompt})
+
+    # history is list of [user, assistant] pairs
+    for user_msg, assistant_msg in history:
+        messages.append({"role": "user", "content": user_msg})
+        if assistant_msg is not None and assistant_msg != "":
+            messages.append({"role": "assistant", "content": assistant_msg})
+
+    # now add current user message
+    messages.append({"role": "user", "content": user_message})
+    return messages
+
+# -------------------------
+# Streaming generator for Gradio
+# -------------------------
+def chat_fn(user_message, history):
+    """
+    Gradio ChatInterface expects either a single return (reply string) or a generator that yields partial strings.
+    We'll stream partial assistant text as it arrives from llama-cpp-python create_chat_completion(..., stream=True).
+    """
+    # history is list of [user, assistant] pairs from Gradio
+    messages = build_messages(history or [], user_message)
+
+    # create_chat_completion returns an iterator when stream=True
+    stream = llm.create_chat_completion(
+        messages=messages,
+        max_tokens=512,
+        temperature=0.2,
         top_p=0.95,
-        stop=["</s>"]
+        stream=True
     )
-    return out["choices"][0]["text"]
 
+    # accumulate incremental content and yield progressive replies
+    partial = ""
+    for chunk in stream:
+        # chunk structure: {"id":..., "object":"chat.completion.chunk", "choices":[{"delta":{"content": "..."}}, ...]}
+        try:
+            if "choices" in chunk and len(chunk["choices"]) > 0:
+                delta = chunk["choices"][0].get("delta", {})
+                if "content" in delta:
+                    partial += delta["content"]
+                    yield partial
+        except Exception:
+            # ignore malformed chunk and continue
+            continue
+
+# -------------------------
+# Launch Gradio
+# -------------------------
 demo = gr.ChatInterface(
     fn=chat_fn,
-    title="llama-cpp-python GGUF Space 🚀",
+    title="EuroLLM 1.7B (GGUF) streaming chat",
+    description="Model: mradermacher/EuroLLM-1.7B-Instruct (Q8_0). System prompt enabled. Streaming ON.",
 )
 
-demo.launch()
+if __name__ == "__main__":
+    demo.launch()
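
For reference, the new chat_fn is a generator that yields the full reply-so-far on each streamed chunk, which is what lets gr.ChatInterface render the answer progressively. A minimal way to exercise that path outside Gradio is sketched below; this snippet is not part of the commit, and it assumes the updated file is importable as a module named app (importing app triggers the GGUF download and the Llama() load, which can take a while on CPU).

# smoke_test.py -- illustration only, not part of this commit.
# Assumes the updated app.py above is importable as `app`; importing it
# downloads the GGUF file and loads the model before anything else runs.
import app

last = ""
for partial in app.chat_fn("Say hello in one short sentence.", history=[]):
    # chat_fn yields the entire reply-so-far each time, so print only the new suffix
    print(partial[len(last):], end="", flush=True)
    last = partial
print()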