fugthchat committed on
Commit
39395e6
·
verified ·
1 Parent(s): e152f39

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -81
app.py CHANGED
@@ -1,97 +1,77 @@
1
  import os
2
  from fastapi import FastAPI
3
- from fastapi.middleware.cors import CORSMiddleware
4
  from pydantic import BaseModel
5
  from llama_cpp import Llama
6
- import uvicorn
7
- import requests
8
- from tqdm import tqdm
9
 
10
- # --- Configuration ---
11
- MODEL_NAME = "stablelm-zephyr-3b.Q3_K_S.gguf"
12
- MODEL_URL = f"https://huggingface.co/TheBloke/stablelm-zephyr-3b-GGUF/resolve/main/{MODEL_NAME}"
13
- MODEL_PATH = f"./{MODEL_NAME}"
14
- N_CTX = 4096 # Context window size.
15
 
16
- # --- Model Download ---
17
- def download_model():
18
- if not os.path.exists(MODEL_PATH):
19
- print(f"Model not found. Downloading {MODEL_NAME}...")
20
- try:
21
- response = requests.get(MODEL_URL, stream=True, timeout=300)
22
- response.raise_for_status()
23
- total_size = int(response.headers.get('content-length', 0))
24
- with tqdm(total=total_size, unit='iB', unit_scale=True, desc=f"Downloading {MODEL_NAME}") as bar:
25
- with open(MODEL_PATH, 'wb') as file:
26
- for data in response.iter_content(chunk_size=8192):
27
- file.write(data)
28
- bar.update(len(data))
29
- print("Model downloaded successfully.")
30
- except requests.exceptions.RequestException as e:
31
- print(f"Failed to download model: {e}")
32
- return False
33
- return True
34
 
35
- # --- FastAPI App ---
36
- app = FastAPI()
37
-
38
- app.add_middleware(
39
- CORSMiddleware,
40
- allow_origins=["http://fugthdesign.space", "https://fugthdesign.space"],
41
- allow_credentials=True,
42
- allow_methods=["*"],
43
- allow_headers=["*"],
44
  )
45
 
46
- llm = None
47
- @app.on_event("startup")
48
- def load_llm():
49
- global llm
50
- if download_model():
51
- try:
52
- print("Loading GGUF model for CPU...")
53
- llm = Llama(
54
- model_path=MODEL_PATH,
55
- n_ctx=N_CTX,
56
- n_gpu_layers=0, # ** THIS IS THE KEY CHANGE FOR CPU **
57
- verbose=True
58
- )
59
- print("Model loaded successfully on CPU!")
60
- except Exception as e:
61
- print(f"Error loading model: {e}")
62
 
 
63
  class ChatRequest(BaseModel):
64
- userInput: str
65
- persona: str
66
- localKnowledge: str
 
 
 
67
 
68
  @app.post("/chat")
69
- async def chat(request: ChatRequest):
70
- if not llm:
71
- return {"error": "Model is not loaded or failed to load."}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
- system_prompt = request.persona or "You are a helpful AI assistant."
74
- knowledge_context = f"Use the following context to inform your answer:\n---CONTEXT---\n{request.localKnowledge}\n---END CONTEXT---" if request.localKnowledge else ""
75
-
76
- full_prompt = f"<|system|>\n{system_prompt}\n{knowledge_context}</s>\n<|user|>\n{request.userInput}</s>\n<|assistant|>"
77
-
78
- print("--- Generating response for prompt ---")
79
- print(full_prompt)
 
80
 
81
- try:
82
- output = llm(
83
- prompt=full_prompt,
84
- max_tokens=256,
85
- stop=["</s>", "<|user|>"],
86
- temperature=0.7,
87
- echo=False
88
- )
89
- response_text = output['choices'][0]['text'].strip()
90
- print(f"Generated response: {response_text}")
91
- return {"response": response_text}
92
- except Exception as e:
93
- print(f"Error during model inference: {e}")
94
- return {"error": "Failed to generate response."}
95
 
96
- if __name__ == "__main__":
97
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  import os
2
  from fastapi import FastAPI
 
3
  from pydantic import BaseModel
4
  from llama_cpp import Llama
5
+ from huggingface_hub import hf_hub_download
 
 
6
 
7
# --- 1. Configuration ---
# Pick a small, fast GGUF model. TinyLlama-1.1B is a great choice.
MODEL_NAME = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"  # 4-bit quantized model
MODEL_PATH = None  # Will be set after download

# --- 2. Model Loading ---
# Download the model from Hugging Face Hub if it's not already present
# (hf_hub_download caches locally, so restarts skip the network fetch).
try:
    print(f"Downloading model: {MODEL_NAME}/{MODEL_FILE}...")
    MODEL_PATH = hf_hub_download(repo_id=MODEL_NAME, filename=MODEL_FILE)
    print(f"Model downloaded to: {MODEL_PATH}")
except Exception as e:
    # Without the weights the server cannot answer anything, so abort startup.
    # `raise SystemExit(...)` instead of the bare `exit()` helper: exit() is a
    # site-module convenience for interactive shells (absent under `python -S`),
    # and chaining `from e` preserves the original traceback in the logs.
    print(f"Error downloading model: {e}")
    raise SystemExit(1) from e
 
 
 
 
 
 
 
 
23
 
24
# Load the GGUF model from the path resolved above. All settings are grouped
# in one place so they are easy to tweak; n_gpu_layers=0 keeps inference
# entirely on the CPU (no CUDA/Metal required).
_LLM_SETTINGS = dict(
    model_path=MODEL_PATH,
    n_ctx=2048,       # Context window size
    n_gpu_layers=0,   # Run on CPU
    verbose=True,
)
llm = Llama(**_LLM_SETTINGS)

# --- 3. FastAPI App ---
app = FastAPI()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
class ChatRequest(BaseModel):
    """Request body accepted by the /chat endpoint."""

    # The user's chat message. A conversation history field
    # (e.g. history: list = []) could be added here later.
    message: str
40
+
41
@app.get("/")
def read_root():
    """Health-check endpoint confirming the service is up."""
    status_payload = {"status": "Fugth AI Anvil is running!"}
    return status_payload
44
 
45
@app.post("/chat")
def chat_with_ai(request: ChatRequest):
    """
    Receive a user message, generate a response using the GGUF model,
    and return it as {"response": ...} — or {"error": ...} on failure.
    """
    if not request.message:
        return {"error": "Message cannot be empty"}

    # TinyLlama-Chat prompt template — crucial for getting good responses.
    # Built from concatenated literals rather than an indented triple-quoted
    # string so that no source-code indentation or stray blank lines leak
    # into the prompt (those stray bytes break the chat-template markers).
    prompt = (
        "<|system|>\n"
        "You are a friendly and helpful AI assistant for a floating web "
        "avatar. Keep your responses concise and engaging.</s>\n"
        "<|user|>\n"
        f"{request.message}</s>\n"
        "<|assistant|>\n"
    )

    print(f"Generating response for prompt: {request.message}")

    # Inference can fail at runtime (e.g. context overflow, backend errors);
    # guard it and return a structured error instead of an opaque HTTP 500.
    try:
        output = llm(
            prompt=prompt,
            max_tokens=150,             # Max length of the response
            stop=["<|user|>", "</s>"],  # Stop when the model starts the next turn
            echo=False,                 # Don't repeat the prompt in the output
            temperature=0.7,            # A bit of creativity
        )
    except Exception as e:
        print(f"Error during model inference: {e}")
        return {"error": "Failed to generate response."}

    response_text = output["choices"][0]["text"].strip()
    print(f"Generated response: {response_text}")
    return {"response": response_text}