Anonymous0045 committed on
Commit
2e24877
·
verified ·
1 Parent(s): 826e872

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -107
app.py CHANGED
@@ -7,49 +7,30 @@ import config
7
 
8
 
9
  # ============================
10
- # Environment & Token Setup
11
  # ============================
12
 
13
  HF_TOKEN = os.environ.get("HF_TOKEN")
14
 
15
- if HF_TOKEN is None:
16
- print("Warning: HF_TOKEN not found. Download may fail for gated repos.")
17
-
18
-
19
- # ============================
20
- # Model Download (cached automatically by HF)
21
- # ============================
22
-
23
  print("Downloading model from Hugging Face Hub...")
24
 
25
- try:
26
- model_path = hf_hub_download(
27
- repo_id=config.MODEL_REPO,
28
- filename=config.MODEL_FILE,
29
- token=HF_TOKEN,
30
- cache_dir="/tmp/hf_cache"
31
- )
32
-
33
- print(f"Model downloaded successfully: {model_path}")
34
 
35
- except Exception as e:
36
- print("Model download failed:", str(e))
37
- raise e
38
 
39
 
40
  # ============================
41
- # CPU Optimization
42
  # ============================
43
 
44
  CPU_THREADS = multiprocessing.cpu_count()
45
 
46
- print(f"CPU Threads available: {CPU_THREADS}")
47
-
48
-
49
- # ============================
50
- # Load llama.cpp model
51
- # ============================
52
-
53
  print("Loading model into memory...")
54
 
55
  llm = Llama(
@@ -66,118 +47,82 @@ print("Model loaded successfully.")
66
 
67
 
68
  # ============================
69
- # Prompt System
70
  # ============================
71
 
72
SYSTEM_PROMPT = """You are DeepSeek Coder, an expert programming assistant.

Rules:
- Write clean, correct, production-ready code
- Be concise
- Only explain if asked
- Prefer efficient solutions
"""


def format_prompt(message, history):
    """Flatten past (user, assistant) turn pairs plus the new *message* into one prompt.

    *history* is an iterable of 2-tuples of prior turns. The result starts
    with the system prompt and ends with a bare "Assistant:" cue so the
    model continues as the assistant.
    """
    pieces = [SYSTEM_PROMPT + "\n\n"]
    pieces.extend(
        f"User: {past_user}\nAssistant: {past_reply}\n"
        for past_user, past_reply in history
    )
    pieces.append(f"User: {message}\nAssistant:")
    return "".join(pieces)
92
 
93
 
94
  # ============================
95
- # Streaming Generation
96
- # ============================
97
-
98
def generate_stream(message, history):
    """Yield the accumulated completion text as each token arrives.

    Every yield is the full text generated so far, so the caller can
    re-render a growing answer. Any generation failure is surfaced to the
    UI as a final error string rather than raised, keeping the chat alive.
    """
    accumulated = []
    try:
        stream = llm(
            format_prompt(message, history),
            max_tokens=config.MAX_TOKENS,
            temperature=config.TEMPERATURE,
            top_p=0.95,
            stream=True
        )
        for chunk in stream:
            accumulated.append(chunk["choices"][0]["text"])
            yield "".join(accumulated)
    except Exception as e:
        yield f"Error during generation: {str(e)}"
118
-
119
-
120
- # ============================
121
- # Gradio UI Logic
122
  # ============================
123
 
124
def user(user_message, history):
    """Queue a submitted message: clear the textbox and append an open turn.

    Returns ("", updated_history) where the new turn is [user_message, ""],
    with the empty slot awaiting the bot's streamed reply.
    """
    pending_turn = [user_message, ""]
    return "", [*history, pending_turn]
126
 
 
127
 
128
def bot(history):
    """Stream the assistant's reply into the last history turn, in place.

    The final pair's first slot holds the newest user message; its second
    slot is progressively overwritten with each partial completion, and
    the whole history list is re-yielded so the chat widget redraws.
    """
    latest_question = history[-1][0]
    prior_turns = history[:-1]
    for partial in generate_stream(latest_question, prior_turns):
        history[-1][1] = partial
        yield history
 
 
 
 
 
 
 
135
 
136
 
137
  # ============================
138
- # Gradio Interface
139
  # ============================
140
 
141
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
142
-
143
- gr.Markdown("# DeepSeek Coder 1.3B (GGUF Production)")
144
- gr.Markdown("Fast, efficient coding assistant running on llama.cpp")
145
-
146
- chatbot = gr.Chatbot(height=500)
147
-
148
- msg = gr.Textbox(
149
- placeholder="Ask a coding question...",
150
- container=False
151
- )
152
 
153
- clear = gr.Button("Clear Chat")
154
 
 
155
 
156
- msg.submit(
157
- user,
158
- [msg, chatbot],
159
- [msg, chatbot],
160
- queue=True
161
- ).then(
162
- bot,
163
- chatbot,
164
- chatbot
165
- )
166
-
167
-
168
- clear.click(
169
- lambda: [],
170
- None,
171
- chatbot,
172
- queue=False
173
- )
174
 
175
 
176
  # ============================
177
- # Launch Server
178
  # ============================
179
 
180
- demo.queue()
 
 
 
 
 
181
 
182
  demo.launch(
183
  server_name="0.0.0.0",
 
7
 
8
 
9
  # ============================
10
+ # Download Model
11
  # ============================
12
 
13
  HF_TOKEN = os.environ.get("HF_TOKEN")
14
 
 
 
 
 
 
 
 
 
15
  print("Downloading model from Hugging Face Hub...")
16
 
17
+ model_path = hf_hub_download(
18
+ repo_id=config.MODEL_REPO,
19
+ filename=config.MODEL_FILE,
20
+ token=HF_TOKEN,
21
+ cache_dir="/tmp/hf_cache"
22
+ )
 
 
 
23
 
24
+ print("Model downloaded successfully:", model_path)
 
 
25
 
26
 
27
  # ============================
28
+ # Load Model
29
  # ============================
30
 
31
  CPU_THREADS = multiprocessing.cpu_count()
32
 
33
+ print("CPU Threads available:", CPU_THREADS)
 
 
 
 
 
 
34
  print("Loading model into memory...")
35
 
36
  llm = Llama(
 
47
 
48
 
49
  # ============================
50
+ # Prompt Formatting
51
  # ============================
52
 
53
SYSTEM_PROMPT = """You are DeepSeek Coder, an expert programming assistant.
You write clean, efficient, production-ready code.
Only explain if user asks.
"""


def build_prompt(messages):
    """Render a chat transcript into a single flat prompt string.

    *messages* is a list of dicts with "role" and "content" keys; roles
    other than "user"/"assistant" are silently skipped. The result starts
    with the system prompt and ends with a bare "Assistant:" cue for the
    model to complete.
    """
    role_prefix = {"user": "User", "assistant": "Assistant"}
    parts = [SYSTEM_PROMPT + "\n\n"]
    for entry in messages:
        prefix = role_prefix.get(entry["role"])
        if prefix is not None:
            parts.append(f"{prefix}: {entry['content']}\n")
    parts.append("Assistant:")
    return "".join(parts)
72
 
73
 
74
  # ============================
75
+ # Streaming Generator
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  # ============================
77
 
78
def generate_response(message, history):
    """Stream the model's reply to *message*, one cumulative chunk per yield.

    *history* is a list of {"role", "content"} dicts; the new user turn is
    appended before prompting. Each yield is the full text generated so
    far, so callers can render a growing answer.
    """
    transcript = list(history)
    transcript.append({"role": "user", "content": message})
    so_far = []
    stream = llm(
        build_prompt(transcript),
        max_tokens=config.MAX_TOKENS,
        temperature=config.TEMPERATURE,
        top_p=0.95,
        stream=True
    )
    for chunk in stream:
        so_far.append(chunk["choices"][0]["text"])
        yield "".join(so_far)
96
 
97
 
98
  # ============================
99
+ # Gradio Chat Interface
100
  # ============================
101
 
102
def chat(message, history):
    """Gradio streaming callback: yield the updated message list as it grows.

    For every partial completion from the model, emit the prior *history*
    plus the current user turn and the assistant text produced so far.
    A falsy *history* (e.g. None on the first turn) is treated as empty.
    """
    if not history:
        history = []
    for partial in generate_response(message, history):
        yield history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": partial},
        ]
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
 
116
  # ============================
117
+ # Launch UI
118
  # ============================
119
 
120
# Build the chat UI. type="messages" makes Gradio pass history as a list of
# {"role", "content"} dicts, matching what chat() and build_prompt() expect.
demo = gr.ChatInterface(
    fn=chat,
    type="messages",
    title="DeepSeek Coder 1.3B",
    description="Production GGUF model running on llama.cpp",
)
126
 
127
  demo.launch(
128
  server_name="0.0.0.0",