Spaces:
Runtime error
Runtime error
File size: 3,701 Bytes
e57f948 50d90dd 12ac796 e57f948 8420891 7dd2541 8420891 12ac796 91ce3bd f3ec931 91ce3bd e57f948 0a74a56 e57f948 91ce3bd e57f948 12ac796 f3ec931 12ac796 7dd2541 50d90dd f3b8c50 7dd2541 50d90dd 7dd2541 91ce3bd e57f948 50d90dd 7dd2541 50d90dd 6dbd7a8 7dd2541 9cf19bf 7dd2541 626c4b7 e57f948 7dd2541 50d90dd 626c4b7 50d90dd 91ce3bd 8420891 7dd2541 8420891 8d63cc4 8420891 8d63cc4 626c4b7 8d63cc4 8420891 626c4b7 7dd2541 626c4b7 7dd2541 626c4b7 478380c 7dd2541 478380c 7dd2541 626c4b7 7dd2541 8420891 626c4b7 8420891 e57f948 8420891 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 | import os
import time
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from fastapi import FastAPI
from pydantic import BaseModel
from threading import Thread
import uvicorn
# ----------------------------
# Model
# ----------------------------
# Hub repository and file name of the GGUF weights used by this app.
_MODEL_REPO_ID = "khazarai/Qwen3-4B-Qwen3.6-plus-Reasoning-Distilled-GGUF"
_MODEL_FILENAME = "Qwen3-4B-Thinking-2507.Q4_1.gguf"

# Fetch the weights file; the returned value is passed to Llama as model_path.
model_path = hf_hub_download(repo_id=_MODEL_REPO_ID, filename=_MODEL_FILENAME)

# Constructor options for the shared model instance.
_LLAMA_OPTIONS = {
    "model_path": model_path,
    "n_ctx": 4096,
    "n_threads": os.cpu_count(),
    "n_batch": 512,
    "n_gpu_layers": 0,
    "verbose": False,
}
llm = Llama(**_LLAMA_OPTIONS)

# Run a one-token completion at import time as a warm-up call.
llm("warmup", max_tokens=1)
# ----------------------------
# System Prompt
# ----------------------------
# Text inserted into the system turn of each prompt. The leading and trailing
# newlines are part of the value.
SYSTEM_PROMPT = (
    "\n"
    "You are an advanced AI assistant.\n"
    "Answer questions clearly and concisely.\n"
    "You can handle multi-turn conversations and provide detailed responses if needed.\n"
)
# ----------------------------
# Chat Function
# ----------------------------
def generate_response(message, history):
    """Stream a reply to ``message`` as a growing string.

    The prompt is assembled in ChatML form: the system prompt, every usable
    entry of ``history``, then ``message`` followed by an open assistant turn.

    Args:
        message: The newest user message.
        history: Earlier turns, or ``None``. Each entry is either a dict with
            a ``role`` key and the text under ``message`` or ``content``, or
            a list/tuple whose first two items are the user text and the
            assistant text. Entries of any other shape are skipped.

    Yields:
        A placeholder status string first, then the accumulated reply text
        after every streamed chunk.
    """
    # NOTE(review): the leading characters of this placeholder look like a
    # mis-encoded emoji; confirm the intended glyph before changing it.
    yield "π€ Thinking..."
    time.sleep(0.5)

    parts = [f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"]
    for turn in history or ():
        if isinstance(turn, dict) and "role" in turn:
            # Dict history may carry the text under "content" (the key used by
            # messages-style chat history) instead of "message"; accept both so
            # earlier turns are not silently dropped from the prompt.
            if "message" in turn:
                text = turn["message"]
            elif "content" in turn:
                text = turn["content"]
            else:
                continue
            # Any role other than "user" is rendered as an assistant turn.
            speaker = "user" if turn["role"] == "user" else "assistant"
            parts.append(f"<|im_start|>{speaker}\n{text}<|im_end|>\n")
        elif isinstance(turn, (list, tuple)) and len(turn) >= 2:
            user_text, assistant_text = turn[0], turn[1]
            parts.append(
                f"<|im_start|>user\n{user_text}<|im_end|>\n"
                f"<|im_start|>assistant\n{assistant_text}<|im_end|>\n"
            )
    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n")
    prompt = "".join(parts)

    output = ""
    for chunk in llm(
        prompt,
        max_tokens=2048,
        temperature=0.2,
        top_p=0.9,
        repeat_penalty=1.1,
        stream=True,
    ):
        output += chunk["choices"][0]["text"]
        # Yield the whole text so far, not just the newest chunk.
        yield output
# ----------------------------
# FastAPI API
# ----------------------------
app = FastAPI()


class ChatRequest(BaseModel):
    """Request body for ``POST /generate_response``."""

    message: str  # newest user message
    history: list = []  # earlier turns; see _build_api_prompt for accepted shapes


def _build_api_prompt(message, history):
    """Render the system prompt, prior turns and ``message`` as one ChatML prompt.

    ``history`` entries may be dicts with a ``role`` key and the text under
    ``message`` or ``content``, or lists/tuples whose first two items are the
    user text and the assistant text. Other entries are skipped.
    """
    parts = [f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"]
    for turn in history or ():
        if isinstance(turn, dict) and "role" in turn:
            if "message" in turn:
                text = turn["message"]
            elif "content" in turn:
                text = turn["content"]
            else:
                continue
            # Any role other than "user" is rendered as an assistant turn.
            speaker = "user" if turn["role"] == "user" else "assistant"
            parts.append(f"<|im_start|>{speaker}\n{text}<|im_end|>\n")
        elif isinstance(turn, (list, tuple)) and len(turn) >= 2:
            parts.append(
                f"<|im_start|>user\n{turn[0]}<|im_end|>\n"
                f"<|im_start|>assistant\n{turn[1]}<|im_end|>\n"
            )
    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n")
    return "".join(parts)


# Keep this endpoint simple so that the web-search client can connect to it.
@app.post("/generate_response")  # path changed to match the one the client calls
def chat_endpoint(request: ChatRequest):
    """Return the model's complete reply to ``request.message`` as one string.

    The prompt includes the system prompt, ``request.history`` and
    ``request.message``; previously only the system prompt was sent, so the
    caller's message never reached the model.
    """
    prompt = _build_api_prompt(request.message, request.history)
    # Streaming stays off here so the API caller gets the whole text at once.
    res = llm(prompt, max_tokens=1024, temperature=0.3)
    return res["choices"][0]["text"]
# ----------------------------
# Gradio UI
# ----------------------------
# Stylesheet assigned to the Blocks app below: rounds and centres the main
# container and gives user/bot message bubbles asymmetric corner radii.
_CUSTOM_CSS = """
.gradio-container {
border-radius: 25px !important;
max-width: 600px !important;
margin: auto !important;
overflow: hidden;
}
.message.user { border-radius: 18px 18px 4px 18px !important; }
.message.bot { border-radius: 18px 18px 18px 4px !important; }
"""

# Theme is loaded from the Hub by name before the Blocks app is created.
_ui_theme = gr.Theme.from_hub("JackismyShephard/ultimate-rvc-theme")

with gr.Blocks(theme=_ui_theme) as demo:
    # Page heading.
    gr.HTML("<h2 style='text-align:center; color:white;'>Code Explainer AI</h2>")

    # Chat interface wired to the streaming generator.
    chatbot = gr.ChatInterface(
        fn=generate_response,
        chatbot=gr.Chatbot(height=600),
        textbox=gr.Textbox(
            placeholder="Paste code or ask for explanation...",
            container=False,
        ),
    )

    # Rounded corners for main container.
    # NOTE(review): the source's indentation was lost; this assignment is
    # placed inside the Blocks context - confirm against the original file.
    demo.css = _CUSTOM_CSS
# ----------------------------
# Run Gradio + FastAPI together
# ----------------------------
def run_gradio():
    """Launch the Gradio UI on all interfaces, port 7860."""
    demo.launch(server_name="0.0.0.0", server_port=7860)


# The UI runs in a background thread so the FastAPI server below can use the
# main thread. launch() is expected to block its thread (TODO confirm for the
# installed Gradio version); marking the thread as a daemon lets the process
# exit once uvicorn returns instead of waiting on this thread forever.
thread = Thread(target=run_gradio, daemon=True)
thread.start()

if __name__ == "__main__":
    # Serve the FastAPI app (the /generate_response endpoint) on port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)