import os
import time
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from fastapi import FastAPI
from pydantic import BaseModel
from threading import Thread
import uvicorn

# ----------------------------
# Model
# ----------------------------
model_path = hf_hub_download(
    repo_id="khazarai/Qwen3-4B-Qwen3.6-plus-Reasoning-Distilled-GGUF",
    filename="Qwen3-4B-Thinking-2507.Q4_1.gguf"
)

llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=os.cpu_count(),
    n_batch=512,
    n_gpu_layers=0,
    verbose=False
)

# One-token warmup so the first real request does not pay the cold-start cost
llm("warmup", max_tokens=1)

# ----------------------------
# System Prompt
# ----------------------------
SYSTEM_PROMPT = """
You are an advanced AI assistant. Answer questions clearly and concisely.
You can handle multi-turn conversations and provide detailed responses if needed.
"""

# ----------------------------
# Prompt Builder
# ----------------------------
def build_prompt(message, history):
    """Assemble a ChatML-style prompt from the system prompt, prior turns, and the new message."""
    prompt = f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
    for h in history:
        # Messages-style history: dicts with "role" and "content" keys
        if isinstance(h, dict) and "role" in h and "content" in h:
            role = "user" if h["role"] == "user" else "assistant"
            prompt += f"<|im_start|>{role}\n{h['content']}<|im_end|>\n"
        # Tuple-style history: (user_message, assistant_message) pairs
        elif isinstance(h, (list, tuple)) and len(h) >= 2:
            u, a = h[0], h[1]
            prompt += f"<|im_start|>user\n{u}<|im_end|>\n<|im_start|>assistant\n{a}<|im_end|>\n"
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
    return prompt

# ----------------------------
# Chat Function
# ----------------------------
def generate_response(message, history):
    yield "🤖 Thinking..."
    time.sleep(0.5)

    prompt = build_prompt(message, history)

    output = ""
    for token in llm(prompt, max_tokens=2048, temperature=0.2, top_p=0.9,
                     repeat_penalty=1.1, stream=True):
        output += token["choices"][0]["text"]
        yield output

# ----------------------------
# FastAPI API
# ----------------------------
app = FastAPI()

class ChatRequest(BaseModel):
    message: str
    history: list = []

# Keep the FastAPI endpoint simple so a Web Search client can connect to it
@app.post("/generate_response")  # Path chosen to match the client
def chat_endpoint(request: ChatRequest):
    # Same prompt logic as the chat function, via the shared builder
    prompt = build_prompt(request.message, request.history)
    # Keep streaming off for the API response so the full text arrives in one piece
    res = llm(prompt, max_tokens=1024, temperature=0.3)
    return res["choices"][0]["text"]

# ----------------------------
# Gradio UI
# ----------------------------
# Rounded corners for the main container and chat bubbles
CSS = """
.gradio-container {
    border-radius: 25px !important;
    max-width: 600px !important;
    margin: auto !important;
    overflow: hidden;
}
.message.user { border-radius: 18px 18px 4px 18px !important; }
.message.bot { border-radius: 18px 18px 18px 4px !important; }
"""

with gr.Blocks(theme=gr.Theme.from_hub("JackismyShephard/ultimate-rvc-theme"), css=CSS) as demo:
    gr.HTML("<h1 style='text-align: center;'>Code Explainer AI</h1>")
    chatbot = gr.ChatInterface(
        fn=generate_response,
        chatbot=gr.Chatbot(height=600),
        textbox=gr.Textbox(placeholder="Paste code or ask for explanation...", container=False)
    )

# ----------------------------
# Run Gradio + FastAPI together
# ----------------------------
def run_gradio():
    # Serve the Gradio UI on port 7860 in a background thread
    demo.launch(server_name="0.0.0.0", server_port=7860)

thread = Thread(target=run_gradio, daemon=True)
thread.start()

if __name__ == "__main__":
    # Serve the FastAPI API on port 8000 in the main thread
    uvicorn.run(app, host="0.0.0.0", port=8000)