Update app.py
app.py CHANGED
@@ -1,97 +1,77 @@
import os
from fastapi import FastAPI
- from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from llama_cpp import Llama
- import uvicorn
- import requests
- from tqdm import tqdm

- # --- Configuration ---
-
-
-
-

- # --- Model
-
-
-
-
-
-
-
-
-
-             for data in response.iter_content(chunk_size=8192):
-                 file.write(data)
-                 bar.update(len(data))
-         print("Model downloaded successfully.")
-     except requests.exceptions.RequestException as e:
-         print(f"Failed to download model: {e}")
-         return False
-     return True

- #
-
-
-
-
-
-
-     allow_methods=["*"],
-     allow_headers=["*"],
)

-
-
- def load_llm():
-     global llm
-     if download_model():
-         try:
-             print("Loading GGUF model for CPU...")
-             llm = Llama(
-                 model_path=MODEL_PATH,
-                 n_ctx=N_CTX,
-                 n_gpu_layers=0, # ** THIS IS THE KEY CHANGE FOR CPU **
-                 verbose=True
-             )
-             print("Model loaded successfully on CPU!")
-         except Exception as e:
-             print(f"Error loading model: {e}")

class ChatRequest(BaseModel):
-
-
-

@app.post("/chat")
-
-
-

-
-
-
-
-
-
-

-
-
-             prompt=full_prompt,
-             max_tokens=256,
-             stop=["</s>", "<|user|>"],
-             temperature=0.7,
-             echo=False
-         )
-         response_text = output['choices'][0]['text'].strip()
-         print(f"Generated response: {response_text}")
-         return {"response": response_text}
-     except Exception as e:
-         print(f"Error during model inference: {e}")
-         return {"error": "Failed to generate response."}

-
-     uvicorn.run(app, host="0.0.0.0", port=7860)
import os
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download

+ # --- 1. Configuration ---
+ # Pick a small, fast GGUF model. TinyLlama-1.1B is a great choice.
+ MODEL_NAME = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+ MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf" # 4-bit quantized model
+ MODEL_PATH = None # Will be set after download

+ # --- 2. Model Loading ---
+ # Download the model from Hugging Face Hub if it's not already present
+ try:
+     print(f"Downloading model: {MODEL_NAME}/{MODEL_FILE}...")
+     MODEL_PATH = hf_hub_download(repo_id=MODEL_NAME, filename=MODEL_FILE)
+     print(f"Model downloaded to: {MODEL_PATH}")
+ except Exception as e:
+     print(f"Error downloading model: {e}")
+     # Handle error appropriately, maybe exit or use a fallback
+     exit()

+ # Load the GGUF model from the downloaded path
+ # n_gpu_layers=0 means the model will run entirely on the CPU
+ llm = Llama(
+     model_path=MODEL_PATH,
+     n_ctx=2048, # Context window size
+     n_gpu_layers=0, # Run on CPU
+     verbose=True,
)

+ # --- 3. FastAPI App ---
+ app = FastAPI()

+ # Pydantic model for the request body
class ChatRequest(BaseModel):
+     message: str
+     # You could add history here later: history: list = []
+
+ @app.get("/")
+ def read_root():
+     return {"status": "Fugth AI Anvil is running!"}

@app.post("/chat")
+ def chat_with_ai(request: ChatRequest):
+     """
+     Receives a user message, generates a response using the GGUF model,
+     and returns it.
+     """
+     if not request.message:
+         return {"error": "Message cannot be empty"}
+
+     # This is the prompt template for TinyLlama-Chat. It's crucial for getting good responses.
+     prompt_template = f"""
+ <|system|>
+ You are a friendly and helpful AI assistant for a floating web avatar. Keep your responses concise and engaging.</s>
+ <|user|>
+ {request.message}</s>
+ <|assistant|>
+ """
+
+     print(f"Generating response for prompt: {request.message}")

+     # Generate the response
+     output = llm(
+         prompt=prompt_template,
+         max_tokens=150, # Max length of the response
+         stop=["<|user|>", "</s>"], # Stop generating when the model thinks it's the user's turn
+         echo=False, # Don't repeat the prompt in the output
+         temperature=0.7, # A bit of creativity
+     )

+     response_text = output['choices'][0]['text'].strip()
+     print(f"Generated response: {response_text}")

+     return {"response": response_text}
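A minimal client-side sketch for exercising the updated API, assuming the app is served locally on port 7860 (the Hugging Face Spaces default; the actual host, port, and launch command depend on how the Space starts uvicorn):

import requests  # client-side example only, not part of app.py

BASE_URL = "http://localhost:7860"  # assumption: adjust to the real deployment URL

# Health check against the new GET / route
print(requests.get(f"{BASE_URL}/").json())

# Send a message to the /chat endpoint and print the model's reply
payload = {"message": "Hello! Introduce yourself in one sentence."}
print(requests.post(f"{BASE_URL}/chat", json=payload).json())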