Adedoyinjames committed on
Commit
852bb8b
·
verified ·
1 Parent(s): dcc1a4f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -72
app.py CHANGED
@@ -1,89 +1,129 @@
1
- # app.py
2
- from fastapi import FastAPI, HTTPException
 
3
  from pydantic import BaseModel
4
- from typing import List, Optional
5
  import uvicorn
6
- from llama_cpp import Llama
7
- import os
8
 
9
- # Model config (Official Qwen GGUF repo; Q5_K_M: fast on CPU, ~300MB, high quality)
10
- MODEL_REPO = "Qwen/Qwen1.5-0.5B-Chat-GGUF"
11
- MODEL_FILE = "Qwen1.5-0.5B-Chat-Q5_K_M.gguf" # Correct file name with dots & uppercase
12
- CONTEXT_LENGTH = 32768
13
- MAX_TOKENS = 512
14
- TEMPERATURE = 0.7
15
- TOP_P = 0.8
16
 
17
- app = FastAPI(title="Qwen1.5-0.5B-Chat API", description="Fast CPU-optimized chat API for Qwen1.5-0.5B-Chat")
18
 
19
- # Global model (loads once on startup)
20
- model = None
21
 
22
- class ChatMessage(BaseModel):
23
- role: str
24
- content: str
25
-
26
- class ChatRequest(BaseModel):
27
- messages: List[ChatMessage]
28
- max_tokens: Optional[int] = MAX_TOKENS
29
- temperature: Optional[float] = TEMPERATURE
30
- top_p: Optional[float] = TOP_P
31
-
32
- class ChatResponse(BaseModel):
33
- choices: List[dict]
34
-
35
- def load_model():
36
- global model
37
- print("Loading quantized Qwen1.5-0.5B-Chat model on CPU... (10–15s)")
38
- model = Llama.from_pretrained(
39
- repo_id=MODEL_REPO,
40
- model_file=MODEL_FILE,
41
- n_ctx=CONTEXT_LENGTH,
42
- n_threads=0, # Auto-detect all CPU threads for max speed
43
- verbose=False, # Reduce logs
44
- chat_format="chatml" # Qwen uses ChatML template; auto-applies to messages
45
  )
46
- print("Model loaded! Ready for fast CPU inference.")
47
-
48
- # Load model on startup
49
- load_model()
50
-
51
- def generate_response(messages: List[ChatMessage], max_tokens: int, temperature: float, top_p: float) -> str:
52
- # Prepare messages list (llama-cpp auto-applies Qwen chat template)
53
- chat_messages = [{"role": msg.role, "content": msg.content} for msg in messages]
54
 
55
- # Generate using built-in chat completion (handles template, sampling, etc.)
56
- response = model.create_chat_completion(
57
- messages=chat_messages,
58
- max_tokens=max_tokens,
59
- temperature=temperature,
60
- top_p=top_p,
61
- stream=False,
62
- echo=False # Don't repeat input
63
  )
64
 
65
- # Extract assistant response
66
- bot_reply = response["choices"][0]["message"]["content"]
67
- return bot_reply
68
-
69
- @app.post("/chat/", response_model=ChatResponse)
70
- async def chat_endpoint(request: ChatRequest):
71
- if model is None:
72
- raise HTTPException(status_code=500, detail="Model not loaded")
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  try:
75
- response_text = generate_response(request.messages, request.max_tokens, request.temperature, request.top_p)
76
- choice = {
77
- "message": {"role": "assistant", "content": response_text},
78
- "finish_reason": "stop"
 
79
  }
80
- return ChatResponse(choices=[choice])
81
  except Exception as e:
82
- raise HTTPException(status_code=500, detail=f"Generation error: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
- @app.get("/health")
85
- async def health_check():
86
- return {"status": "healthy", "model_loaded": model is not None}
87
 
88
  if __name__ == "__main__":
89
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
1
+ import torch
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
3
+ from fastapi import FastAPI
4
  from pydantic import BaseModel
 
5
  import uvicorn
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ import gradio as gr
8
 
9
# --- Qwen Chat System ---

print("🔄 Loading Qwen model from Qwen/Qwen1.5-0.5B-Chat...")

# Load Qwen model
model_name = "Qwen/Qwen1.5-0.5B-Chat"

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,  # NOTE(review): fp16 is slow on pure-CPU hosts — confirm a GPU is available
        device_map="auto",
        trust_remote_code=True,
    )
except Exception as e:
    # Fail fast: without the model the API is useless, so re-raise after logging.
    print(f"❌ Error loading model: {e}")
    raise
else:
    print("✅ Qwen model loaded successfully!")
34
+
35
def generate_response(query):
    """Generate a single-turn reply to *query* with the Qwen chat model.

    Args:
        query: The user's message as a plain string.

    Returns:
        The assistant's reply string, or an ``"Error generating response: ..."``
        string when generation fails — this function never raises.
    """
    try:
        # Wrap the query in Qwen's ChatML template so the model sees the
        # role markers it was fine-tuned on.
        messages = [{"role": "user", "content": query}]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Tokenize, then move tensors to the model's device: with
        # device_map="auto" the model may live on GPU while fresh tensors
        # default to CPU, which would crash generate().
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Generate response (no grad tracking needed for inference).
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
            )

        # Slice off the prompt at the *token* level before decoding.
        # (Decoding the full sequence with skip_special_tokens=True and then
        # dropping len(prompt) characters mis-aligns, because the prompt
        # string still contains the ChatML special tokens that decoding
        # strips out.)
        prompt_len = inputs["input_ids"].shape[1]
        response = tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()

        return response

    except Exception as e:
        return f"Error generating response: {str(e)}"
68
+
69
# --- FastAPI App ---
app = FastAPI(title="Qwen AI", description="Chat with Qwen1.5-0.5B-Chat model")

# Fully open CORS policy: any origin, method, and header may call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
79
+
80
class QueryRequest(BaseModel):
    """Request body for the /chat/ endpoint: a single user query string."""

    query: str
82
+
83
@app.post("/chat/")
async def chat_with_ai(query_request: QueryRequest):
    """Run the Qwen model on the request's query and return a JSON envelope."""
    try:
        answer = generate_response(query_request.query)
    except Exception as e:
        # NOTE(review): generate_response traps its own exceptions and returns
        # an error string, so this branch is effectively defensive only —
        # model failures still come back with status "success".
        return {
            "response": f"Error: {str(e)}",
            "model_used": "Qwen/Qwen1.5-0.5B-Chat",
            "status": "error",
        }
    return {
        "response": answer,
        "model_used": "Qwen/Qwen1.5-0.5B-Chat",
        "status": "success",
    }
99
+
100
@app.get("/status/")
async def get_status():
    """Static readiness probe.

    Always reports the model as loaded: a load failure raises at import time,
    so the server never starts in an unloaded state.
    """
    return dict(
        model_loaded=True,
        model_name="Qwen/Qwen1.5-0.5B-Chat",
        system_ready=True,
    )
107
+
108
@app.get("/")
async def root():
    """Root endpoint: simple liveness message."""
    return {"message": "Qwen AI running with Qwen model"}
111
+
112
# Simple Gradio interface
def chat_interface(message, history):
    """Gradio chat callback.

    Args:
        message: The user's latest message.
        history: Prior turns supplied by Gradio — ignored; each call is a
            stateless single-turn generation.

    Returns:
        The model's reply, or a fallback message if generation raises.
    """
    try:
        return generate_response(message)
    except Exception:
        # Was a bare `except:`, which would also swallow KeyboardInterrupt
        # and SystemExit; Exception keeps those propagating.
        return "System busy, please try again."

gradio_app = gr.ChatInterface(
    fn=chat_interface,
    title="Qwen AI",
    description="Chat with Qwen1.5-0.5B-Chat model",
)

# Mount the Gradio UI under /gradio on the FastAPI app.
app = gr.mount_gradio_app(app, gradio_app, path="/gradio")
 
 
127
 
128
if __name__ == "__main__":
    # Port 7860 is the Hugging Face Spaces default.
    uvicorn.run(app, host="0.0.0.0", port=7860)