truegleai commited on
Commit
a122c4c
·
verified ·
1 Parent(s): fc23551

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ DeepSeek-Coder-V2-Lite-Instruct-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
DeepSeek-Coder-V2-Lite-Instruct-Q4_K_M.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38bc76f3326b49b4d81d1027d092bf7ce5b4ed2de4136d1d2e7e6347c3ec8376
3
+ size 10364416480
README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: DeepSeek Coder V2 Lite 16B
3
+ emoji: 💻
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 4.0.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # 🚀 o87Dev - Maximum Capacity Deployment
13
+
14
+ **Strategy:** Deploy the largest viable model (`DeepSeek-Coder-V2-Lite-Instruct-16B-Q4_K_M`) on Hugging Face's free CPU tier.
15
+
16
+ ## ⚙️ Technical Details
17
+ - **Model:** DeepSeek-Coder-V2-Lite-Instruct-Q4_K_M.gguf (10.4GB)
18
+ - **Quantization:** Q4_K_M (Optimal quality/size for free tier)
19
+ - **Loader:** `llama-cpp-python` (CPU optimized)
20
+ - **Context:** 2048 tokens (max for free tier stability)
21
+
22
+ ## 📊 Performance Expectations
23
+ - **First load:** ~60-120 seconds (model loads from disk)
24
+ - **Inference speed:** ~2-5 tokens/second on CPU
25
+ - **Memory usage:** ~12-14GB of 16GB available
26
+
27
+ ## 🎯 Usage Tips
28
+ 1. First request triggers model load (be patient)
29
+ 2. Keep prompts under 500 tokens for best results
30
+ 3. Use temperature 0.7-0.9 for creative tasks
31
+ 4. Monitor memory usage in Space logs
32
+
33
+ ## 🔗 Integration
34
+ This Space serves as the primary AI endpoint for the o87Dev local API server.
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from huggingface_hub import hf_hub_download
3
+ from llama_cpp import Llama
4
+ import time
5
+ import os
6
+
7
# --- Configuration ---
# The GGUF weights file is uploaded alongside app.py, so the path is just
# the bare filename.
MODEL_NAME = "DeepSeek-Coder-V2-Lite-Instruct-Q4_K_M.gguf"
MODEL_PATH = MODEL_NAME

# Module-level model handle; populated lazily by load_model() on the first
# request so Space startup stays fast.
llm = None
13
+
14
def load_model():
    """Load the GGUF model on first call and cache it in the module global.

    Subsequent calls return the cached ``Llama`` instance, so only the first
    request pays the multi-minute load cost.

    Returns:
        The ready-to-use ``llama_cpp.Llama`` instance.
    """
    global llm
    if llm is not None:
        return llm

    print(f"⏳ Loading model {MODEL_NAME}... This may take 1-2 minutes on first run.")
    started = time.time()

    # Conservative CPU-only settings for the Hugging Face free tier.
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,       # small context keeps memory usage within the tier
        n_threads=2,      # free tier exposes ~2 vCPUs
        n_gpu_layers=0,   # no GPU available on the free tier
        verbose=False,
    )

    elapsed = time.time() - started
    print(f"✅ Model loaded in {elapsed:.1f} seconds. Ready for inference.")
    return llm
33
+
34
def generate_code(prompt, max_tokens=256, temperature=0.7):
    """Generate a code completion for *prompt* with the cached model.

    Args:
        prompt: Natural-language instruction describing the code to write.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.

    Returns:
        The stripped completion text, or an error string starting with "❌"
        — errors are returned rather than raised so the Gradio UI can
        display them instead of failing the request.
    """
    try:
        model = load_model()

        # DeepSeek-Coder instruct checkpoints expect this Alpaca-style
        # instruction/response template.
        instruct_prompt = f"### Instruction:\n{prompt}\n\n### Response:\n"

        completion = model(
            instruct_prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            echo=False,
            stop=["###", "\n\n\n"],
        )
        return completion['choices'][0]['text'].strip()

    except Exception as e:
        return f"❌ Error: {str(e)}"
56
+
57
+ # Create Gradio interface
58
+ demo = gr.Interface(
59
+ fn=generate_code,
60
+ inputs=[
61
+ gr.Textbox(
62
+ label="Code Prompt",
63
+ placeholder="Write a Python function to reverse a string...",
64
+ lines=4
65
+ ),
66
+ gr.Slider(
67
+ minimum=32,
68
+ maximum=512,
69
+ value=256,
70
+ step=32,
71
+ label="Max Tokens"
72
+ ),
73
+ gr.Slider(
74
+ minimum=0.1,
75
+ maximum=1.0,
76
+ value=0.7,
77
+ step=0.1,
78
+ label="Temperature"
79
+ )
80
+ ],
81
+ outputs=gr.Code(
82
+ label="Generated Code",
83
+ language="python"
84
+ ),
85
+ title="💻 DeepSeek Coder V2 Lite (16B) - o87Dev",
86
+ description="**CPU Deployment** - Largest viable model on Hugging Face Spaces free tier. ⚠️ **First request loads model (~1-2 min)**",
87
+ examples=[
88
+ ["Write a Python function to check if a number is prime"],
89
+ ["Create a React component for a login form"],
90
+ ["Explain binary search algorithm in Python"]
91
+ ]
92
+ )
93
+
94
# Launch with a request queue so concurrent users wait in line instead of
# overloading the single CPU worker on the free tier.
if __name__ == "__main__":
    # The original comment promised a queue but never enabled one; without
    # it, simultaneous requests contend for the one model instance.
    demo.queue()
    demo.launch(
        server_name="0.0.0.0",  # bind all interfaces (required in the Space container)
        server_port=7860,       # the port Hugging Face Spaces routes to
        share=False,            # no public gradio.live tunnel needed on Spaces
    )
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ llama-cpp-python>=0.3.0 # CPU-optimized GGUF loader
3
+ huggingface-hub>=0.20.0
4
+ sentencepiece>=0.1.99 # Tokenizer for many models