Loomis Green committed
Commit 5c3cb1b · 1 Parent(s): 086c91c

Switch to Qwen2.5-Coder-14B-Instruct-Uncensored GGUF

Files changed (3)
  1. app.py +35 -37
  2. requirements.txt +2 -5
  3. static/index.html +1 -1
app.py CHANGED
@@ -1,22 +1,34 @@
-from fastapi import FastAPI, Request
+from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import FileResponse, JSONResponse
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-import torch
+from fastapi.responses import FileResponse
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+import os
 
-# Load model (Switching to Qwen2.5-1.5B-Instruct for significantly better logic/reasoning)
-print("Loading Qwen2.5-1.5B-Instruct Model...")
-checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"
-tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-model = AutoModelForCausalLM.from_pretrained(checkpoint)
-pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+# Define Model details
+REPO_ID = "roleplaiapp/Qwen2.5-Coder-14B-Instruct-Uncensored-Q4_K_S-GGUF"
+FILENAME = "Qwen2.5-Coder-14B-Instruct-Uncensored.Q4_K_S.gguf"
+
+print(f"Downloading {FILENAME} from {REPO_ID}...")
+model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
+print(f"Model downloaded to: {model_path}")
+
+print("Loading Llama model...")
+# Initialize Llama model
+# n_ctx=4096: Context window (RAM usage scales with this)
+# n_threads=2: Hugging Face Spaces free tier usually has 2 vCPUs
+llm = Llama(
+    model_path=model_path,
+    n_ctx=4096,
+    n_threads=2,
+    verbose=True
+)
 print("Model Loaded Successfully!")
 
 app = FastAPI()
 
 # Global Conversation History (Simple Server-Side Memory)
-# We initialize with a system prompt that defines the persona clearly.
 DEFAULT_SYSTEM_PROMPT = {
     "role": "system",
     "content": (
@@ -24,6 +36,7 @@ DEFAULT_SYSTEM_PROMPT = {
         "You are chatting with a user named Loomis (unless they tell you otherwise). "
         "Your name is Loomyloo. The user's name is Loomis. "
         "Never confuse your name with the user's name. "
+        "You are running on the powerful Qwen2.5-Coder-14B-Instruct-Uncensored model. "
         "Keep your answers concise, friendly, and helpful."
     )
 }
@@ -55,42 +68,27 @@ def ask(prompt: str):
     conversation_history.append({"role": "user", "content": prompt})
 
     # 2. Prune History (Keep System Prompt + Last 10 exchanges)
-    # This prevents the context from getting too large for the model
    if len(conversation_history) > 21:
        conversation_history = [DEFAULT_SYSTEM_PROMPT] + conversation_history[-20:]
 
    print(f"Current History Length: {len(conversation_history)}")
 
-    # 3. Format Prompt for the Model
-    prompt_text = tokenizer.apply_chat_template(
-        conversation_history,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-
-    # 4. Generate Response
-    result = pipe(
-        prompt_text,
-        max_new_tokens=512,  # Allow for longer, more thoughtful responses
-        do_sample=True,
-        temperature=0.7,
-        top_p=0.9,
-        return_full_text=False
+    # 3. Generate Response using llama-cpp-python chat completion
+    response = llm.create_chat_completion(
+        messages=conversation_history,
+        max_tokens=512,
+        temperature=0.7,
+        top_p=0.9
     )
 
-    generated_text = result[0]['generated_text']
+    # Extract text from response
+    generated_text = response['choices'][0]['message']['content']
 
-    # Clean up (sometimes models output the role label)
-    if generated_text.startswith("assistant"):
-        generated_text = generated_text.replace("assistant", "", 1).strip()
-
-    # 5. Add Assistant Response to History
+    # 4. Add Assistant Response to History
    conversation_history.append({"role": "assistant", "content": generated_text})
 
-    # 6. Return Result
-    # We explicitly update the result dict to include the cleaned text
-    result[0]['generated_text'] = generated_text
-    return result[0]
+    # 5. Return Result (keeping format consistent with previous API)
+    return {"generated_text": generated_text}
 
 # Serve Static Files
 app.mount("/static", StaticFiles(directory="static"), name="static")
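
A note on the new generation call: create_chat_completion as written blocks until all 512 tokens are produced, which can take a while for a 14B Q4 model on two CPU threads. llama-cpp-python also supports stream=True, which yields OpenAI-style chunks as tokens arrive. A minimal sketch, not part of this commit; the helper name stream_reply is hypothetical:

    def stream_reply(llm, messages):
        """Stream a chat completion and return the full reply text."""
        reply = ""
        for chunk in llm.create_chat_completion(
            messages=messages,
            max_tokens=512,
            temperature=0.7,
            top_p=0.9,
            stream=True,  # yield incremental chunks instead of one blocking result
        ):
            delta = chunk["choices"][0]["delta"]
            if "content" in delta:  # the first chunk carries only the role
                reply += delta["content"]
        return reply

Relatedly, the new file adds import os but never uses it in the hunks shown; if the intent was thread detection, n_threads=os.cpu_count() or 2 would adapt to larger hardware while keeping the two-thread default for the free tier.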
requirements.txt CHANGED
@@ -1,8 +1,5 @@
 fastapi[standard]
 uvicorn
-transformers
-torch
-torchvision
 aiofiles
-sentencepiece
-accelerate
+huggingface_hub
+llama-cpp-python
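
One deployment caveat: pip typically builds llama-cpp-python from source via CMake when no prebuilt wheel matches the platform, so the first build of the Space can take several minutes. A quick smoke test, hypothetical and not part of this commit, to confirm the install before the multi-gigabyte GGUF download starts:

    # Confirm the llama-cpp-python build imports cleanly.
    import llama_cpp
    print("llama_cpp version:", llama_cpp.__version__)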
static/index.html CHANGED
@@ -81,7 +81,7 @@
 <body>
 
 <div id="chat-container">
-    <div class="message ai-message">Hello! I am Loomyloo (v4) (running on Qwen2.5-1.5B). I am much smarter now! How can I help you?</div>
+    <div class="message ai-message">Hello! I am Loomyloo (v5) (running on Qwen2.5-Coder-14B-Instruct-Uncensored GGUF). I am powerful and uncensored! How can I help you?</div>
 </div>
 
 <div id="input-area">