Adedoyinjames committed on
Commit
852bb8b
·
verified ·
1 Parent(s): dcc1a4f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -72
app.py CHANGED
@@ -1,89 +1,129 @@
1
- # app.py
2
- from fastapi import FastAPI, HTTPException
 
3
  from pydantic import BaseModel
4
- from typing import List, Optional
5
  import uvicorn
6
- from llama_cpp import Llama
7
- import os
8
 
9
- # Model config (Official Qwen GGUF repo; Q5_K_M: fast on CPU, ~300MB, high quality)
10
- MODEL_REPO = "Qwen/Qwen1.5-0.5B-Chat-GGUF"
11
- MODEL_FILE = "Qwen1.5-0.5B-Chat-Q5_K_M.gguf" # Correct file name with dots & uppercase
12
- CONTEXT_LENGTH = 32768
13
- MAX_TOKENS = 512
14
- TEMPERATURE = 0.7
15
- TOP_P = 0.8
16
 
17
- app = FastAPI(title="Qwen1.5-0.5B-Chat API", description="Fast CPU-optimized chat API for Qwen1.5-0.5B-Chat")
18
 
19
- # Global model (loads once on startup)
20
- model = None
21
 
22
- class ChatMessage(BaseModel):
23
- role: str
24
- content: str
25
-
26
- class ChatRequest(BaseModel):
27
- messages: List[ChatMessage]
28
- max_tokens: Optional[int] = MAX_TOKENS
29
- temperature: Optional[float] = TEMPERATURE
30
- top_p: Optional[float] = TOP_P
31
-
32
- class ChatResponse(BaseModel):
33
- choices: List[dict]
34
-
35
- def load_model():
36
- global model
37
- print("Loading quantized Qwen1.5-0.5B-Chat model on CPU... (10–15s)")
38
- model = Llama.from_pretrained(
39
- repo_id=MODEL_REPO,
40
- model_file=MODEL_FILE,
41
- n_ctx=CONTEXT_LENGTH,
42
- n_threads=0, # Auto-detect all CPU threads for max speed
43
- verbose=False, # Reduce logs
44
- chat_format="chatml" # Qwen uses ChatML template; auto-applies to messages
45
  )
46
- print("Model loaded! Ready for fast CPU inference.")
47
-
48
- # Load model on startup
49
- load_model()
50
-
51
- def generate_response(messages: List[ChatMessage], max_tokens: int, temperature: float, top_p: float) -> str:
52
- # Prepare messages list (llama-cpp auto-applies Qwen chat template)
53
- chat_messages = [{"role": msg.role, "content": msg.content} for msg in messages]
54
 
55
- # Generate using built-in chat completion (handles template, sampling, etc.)
56
- response = model.create_chat_completion(
57
- messages=chat_messages,
58
- max_tokens=max_tokens,
59
- temperature=temperature,
60
- top_p=top_p,
61
- stream=False,
62
- echo=False # Don't repeat input
63
  )
64
 
65
- # Extract assistant response
66
- bot_reply = response["choices"][0]["message"]["content"]
67
- return bot_reply
68
-
69
- @app.post("/chat/", response_model=ChatResponse)
70
- async def chat_endpoint(request: ChatRequest):
71
- if model is None:
72
- raise HTTPException(status_code=500, detail="Model not loaded")
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  try:
75
- response_text = generate_response(request.messages, request.max_tokens, request.temperature, request.top_p)
76
- choice = {
77
- "message": {"role": "assistant", "content": response_text},
78
- "finish_reason": "stop"
 
79
  }
80
- return ChatResponse(choices=[choice])
81
  except Exception as e:
82
- raise HTTPException(status_code=500, detail=f"Generation error: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
- @app.get("/health")
85
- async def health_check():
86
- return {"status": "healthy", "model_loaded": model is not None}
87
 
88
  if __name__ == "__main__":
89
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
1
+ import torch
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
3
+ from fastapi import FastAPI
4
  from pydantic import BaseModel
 
5
  import uvicorn
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ import gradio as gr
8
 
9
# --- Qwen Chat System ---

print("🔄 Loading Qwen model from Qwen/Qwen1.5-0.5B-Chat...")

# Load Qwen model
model_name = "Qwen/Qwen1.5-0.5B-Chat"

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,  # NOTE(review): fp16 is slow on pure-CPU hosts — confirm a GPU is available
        device_map="auto",
        trust_remote_code=True,
    )
except Exception as e:
    # Fail fast: without the model the API is useless, so re-raise after logging.
    print(f"❌ Error loading model: {e}")
    raise
else:
    print("✅ Qwen model loaded successfully!")
34
+
35
def generate_response(query):
    """Generate a single-turn reply to *query* with the Qwen chat model.

    Args:
        query: The user's message as a plain string.

    Returns:
        The assistant's reply string, or an ``"Error generating response: ..."``
        string when generation fails — this function never raises.
    """
    try:
        # Wrap the query in Qwen's ChatML template so the model sees the
        # role markers it was fine-tuned on.
        messages = [{"role": "user", "content": query}]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Tokenize, then move tensors to the model's device: with
        # device_map="auto" the model may live on GPU while fresh tensors
        # default to CPU, which would crash generate().
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Generate response (no grad tracking needed for inference).
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
            )

        # Slice off the prompt at the *token* level before decoding.
        # (Decoding the full sequence with skip_special_tokens=True and then
        # dropping len(prompt) characters mis-aligns, because the prompt
        # string still contains the ChatML special tokens that decoding
        # strips out.)
        prompt_len = inputs["input_ids"].shape[1]
        response = tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()

        return response

    except Exception as e:
        return f"Error generating response: {str(e)}"
68
+
69
# --- FastAPI App ---
app = FastAPI(title="Qwen AI", description="Chat with Qwen1.5-0.5B-Chat model")

# Fully open CORS policy: any origin, method, and header may call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
79
+
80
class QueryRequest(BaseModel):
    """Request body for the /chat/ endpoint: a single user query string."""

    query: str
82
+
83
@app.post("/chat/")
async def chat_with_ai(query_request: QueryRequest):
    """Run the Qwen model on the request's query and return a JSON envelope."""
    try:
        answer = generate_response(query_request.query)
    except Exception as e:
        # NOTE(review): generate_response traps its own exceptions and returns
        # an error string, so this branch is effectively defensive only —
        # model failures still come back with status "success".
        return {
            "response": f"Error: {str(e)}",
            "model_used": "Qwen/Qwen1.5-0.5B-Chat",
            "status": "error",
        }
    return {
        "response": answer,
        "model_used": "Qwen/Qwen1.5-0.5B-Chat",
        "status": "success",
    }
99
+
100
@app.get("/status/")
async def get_status():
    """Static readiness probe.

    Always reports the model as loaded: a load failure raises at import time,
    so the server never starts in an unloaded state.
    """
    return dict(
        model_loaded=True,
        model_name="Qwen/Qwen1.5-0.5B-Chat",
        system_ready=True,
    )
107
+
108
@app.get("/")
async def root():
    """Root endpoint: simple liveness message."""
    return {"message": "Qwen AI running with Qwen model"}
111
+
112
# Simple Gradio interface
def chat_interface(message, history):
    """Gradio chat callback.

    Args:
        message: The user's latest message.
        history: Prior turns supplied by Gradio — ignored; each call is a
            stateless single-turn generation.

    Returns:
        The model's reply, or a fallback message if generation raises.
    """
    try:
        return generate_response(message)
    except Exception:
        # Was a bare `except:`, which would also swallow KeyboardInterrupt
        # and SystemExit; Exception keeps those propagating.
        return "System busy, please try again."

gradio_app = gr.ChatInterface(
    fn=chat_interface,
    title="Qwen AI",
    description="Chat with Qwen1.5-0.5B-Chat model",
)

# Mount the Gradio UI under /gradio on the FastAPI app.
app = gr.mount_gradio_app(app, gradio_app, path="/gradio")
 
 
127
 
128
if __name__ == "__main__":
    # Port 7860 is the Hugging Face Spaces default.
    uvicorn.run(app, host="0.0.0.0", port=7860)