Toilatop1sever committed on
Commit
b7080ed
·
verified ·
1 Parent(s): b78d107

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -22
app.py CHANGED
@@ -1,36 +1,25 @@
1
- from fastapi import FastAPI, HTTPException
2
- from pydantic import BaseModel
3
- from llama_cpp import Llama
4
  from huggingface_hub import hf_hub_download
5
  import os
6
 
7
- app = FastAPI()
8
-
9
  MODEL_REPO = "Qwen/Qwen2.5-3B-Instruct-GGUF"
10
  MODEL_FILE = "qwen2.5-3b-instruct-q4_k_m.gguf"
11
 
12
- # Tải model nếu chưa có (tự động làm sau khi build, không tốn RAM build)
13
  if not os.path.exists(MODEL_FILE):
14
- print(f"Downloading {MODEL_FILE} from {MODEL_REPO}...")
15
  hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=".")
16
  print("Download done!")
17
 
18
  print("Loading model...")
19
- llm = Llama(model_path=MODEL_FILE, n_ctx=2048, n_threads=4)
 
 
20
  print("Model ready!")
21
 
22
- class ChatRequest(BaseModel):
23
- prompt: str
24
- max_tokens: int = 512
25
- temperature: float = 0.7
26
-
27
- @app.post("/chat")
28
- async def chat(req: ChatRequest):
29
- if not req.prompt:
30
- raise HTTPException(status_code=400, detail="Empty prompt")
31
- output = llm(req.prompt, max_tokens=req.max_tokens, temperature=req.temperature)
32
- return {"response": output["choices"][0]["text"]}
33
 
34
- @app.get("/")
35
- async def root():
36
- return {"status": "ok", "message": "AI Coder is running!"}
 
1
+ import gradio as gr
2
+ from ctransformers import AutoModelForCausalLM
 
3
  from huggingface_hub import hf_hub_download
4
  import os
5
 
 
 
6
  MODEL_REPO = "Qwen/Qwen2.5-3B-Instruct-GGUF"
7
  MODEL_FILE = "qwen2.5-3b-instruct-q4_k_m.gguf"
8
 
9
+ # Tự động tải model nếu chưa có
10
  if not os.path.exists(MODEL_FILE):
11
+ print(f"Downloading {MODEL_FILE}...")
12
  hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=".")
13
  print("Download done!")
14
 
15
  print("Loading model...")
16
+ llm = AutoModelForCausalLM.from_pretrained(
17
+ ".", model_file=MODEL_FILE, model_type="qwen", gpu_layers=0
18
+ )
19
  print("Model ready!")
20
 
21
+ def chat(message, history):
22
+ response = llm(message, max_new_tokens=512, temperature=0.7)
23
+ return response
 
 
 
 
 
 
 
 
24
 
25
+ gr.ChatInterface(chat, title="AI Coder Qwen 2.5 3B").launch()