File size: 3,701 Bytes
e57f948
50d90dd
12ac796
e57f948
 
8420891
 
7dd2541
8420891
12ac796
91ce3bd
f3ec931
91ce3bd
e57f948
0a74a56
 
e57f948
 
 
 
91ce3bd
 
 
e57f948
 
 
12ac796
f3ec931
12ac796
7dd2541
 
 
50d90dd
f3b8c50
 
7dd2541
50d90dd
 
 
7dd2541
91ce3bd
e57f948
50d90dd
7dd2541
 
50d90dd
6dbd7a8
7dd2541
9cf19bf
 
7dd2541
 
 
 
 
 
626c4b7
e57f948
 
7dd2541
50d90dd
626c4b7
50d90dd
 
91ce3bd
8420891
 
 
 
7dd2541
8420891
 
 
 
8d63cc4
 
8420891
 
8d63cc4
626c4b7
8d63cc4
 
 
 
 
8420891
 
626c4b7
 
7dd2541
 
626c4b7
7dd2541
 
 
 
 
 
 
626c4b7
478380c
 
7dd2541
 
 
478380c
7dd2541
 
626c4b7
 
7dd2541
 
 
8420891
626c4b7
8420891
 
 
e57f948
 
8420891
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
import time
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from fastapi import FastAPI
from pydantic import BaseModel
from threading import Thread
import uvicorn

# ----------------------------
# Model
# ----------------------------
# Fetch the quantized GGUF checkpoint from the Hugging Face Hub;
# hf_hub_download returns the local path of the file.
MODEL_REPO_ID = "khazarai/Qwen3-4B-Qwen3.6-plus-Reasoning-Distilled-GGUF"
MODEL_FILENAME = "Qwen3-4B-Thinking-2507.Q4_1.gguf"
model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)

# Inference settings for the single model instance shared by the Gradio UI
# and the FastAPI endpoint.
_LLAMA_OPTIONS = {
    "n_ctx": 4096,                 # context window, in tokens
    "n_threads": os.cpu_count(),   # one worker thread per reported CPU
    "n_batch": 512,
    "n_gpu_layers": 0,             # no layers offloaded to a GPU
    "verbose": False,
}
llm = Llama(model_path=model_path, **_LLAMA_OPTIONS)

# One-token generation at import time; presumably so the first real request
# does not pay the model's first-call cost.
llm("warmup", max_tokens=1)

# ----------------------------
# System Prompt
# ----------------------------
# Instruction text placed in the system turn of every prompt.
# Note that the value starts and ends with a newline.
SYSTEM_PROMPT = (
    "\n"
    "You are an advanced AI assistant.\n"
    "Answer questions clearly and concisely.\n"
    "You can handle multi-turn conversations and provide detailed responses if needed.\n"
)

# ----------------------------
# Chat Function
# ----------------------------
def generate_response(message, history):
    """Stream the assistant's reply to ``message`` for the Gradio chat UI.

    A placeholder is yielded first; afterwards the accumulated reply text is
    yielded again after every chunk the model produces, so the UI can redraw
    the growing answer.

    Args:
        message: The latest user message.
        history: Earlier turns of the conversation. Two shapes are accepted
            per entry: a ``[user, assistant]`` list/tuple, or a dict with a
            ``"role"`` key and the text under ``"message"`` or ``"content"``
            (the key Gradio's messages format uses). Entries of any other
            shape are skipped. ``None`` is treated as an empty history.

    Yields:
        str: The placeholder, then the reply text generated so far.
    """
    # U+1F916 is the robot-face emoji; written as an escape so the literal
    # cannot be corrupted by a wrong file encoding.
    yield "\U0001F916 Thinking..."
    # Short pause before generation starts (presumably cosmetic, so the
    # placeholder is visible).
    time.sleep(0.5)

    parts = [f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"]
    for entry in history or []:
        if isinstance(entry, dict):
            if "role" not in entry:
                continue
            if "message" in entry:
                text = entry["message"]
            elif "content" in entry:
                # Gradio's messages-format history stores the text here.
                text = entry["content"]
            else:
                continue
            # Any role other than "user" is rendered as an assistant turn.
            speaker = "user" if entry["role"] == "user" else "assistant"
            parts.append(f"<|im_start|>{speaker}\n{text}<|im_end|>\n")
        elif isinstance(entry, (list, tuple)) and len(entry) >= 2:
            user_text, assistant_text = entry[0], entry[1]
            parts.append(
                f"<|im_start|>user\n{user_text}<|im_end|>\n"
                f"<|im_start|>assistant\n{assistant_text}<|im_end|>\n"
            )

    # Finish with the new user turn and an open assistant turn for the model
    # to complete.
    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n")
    prompt = "".join(parts)

    output = ""
    stream = llm(
        prompt,
        max_tokens=2048,
        temperature=0.2,
        top_p=0.9,
        repeat_penalty=1.1,
        stream=True,
    )
    for chunk in stream:
        output += chunk["choices"][0]["text"]
        yield output

# ----------------------------
# FastAPI API
# ----------------------------
# FastAPI application object; the /generate_response route is registered on it.
app = FastAPI()

# JSON request body accepted by the /generate_response endpoint.
class ChatRequest(BaseModel):
    # Latest user message (required).
    message: str
    # Earlier conversation turns; optional, defaults to an empty list.
    # The annotation does not constrain the element type.
    history: list = []

# Deliberately simple, non-streaming endpoint so that external clients
# (e.g. the web-search client) can connect to the model.
@app.post("/generate_response")  # path set to match what the client calls
def chat_endpoint(request: ChatRequest):
    """Return the model's complete reply to ``request.message`` as one string.

    The prompt consists of the system turn, every usable entry of
    ``request.history``, the new user message, and an open assistant turn.

    Args:
        request: Request body. ``history`` entries may be ``[user, assistant]``
            lists/tuples, or dicts with a ``"role"`` key and the text under
            ``"message"`` or ``"content"``. Entries of any other shape are
            skipped.

    Returns:
        str: The text of the first completion choice.
    """
    parts = [f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"]
    for entry in request.history:
        if isinstance(entry, dict):
            if "role" not in entry:
                continue
            if "message" in entry:
                text = entry["message"]
            elif "content" in entry:
                text = entry["content"]
            else:
                continue
            # Any role other than "user" is rendered as an assistant turn.
            speaker = "user" if entry["role"] == "user" else "assistant"
            parts.append(f"<|im_start|>{speaker}\n{text}<|im_end|>\n")
        elif isinstance(entry, (list, tuple)) and len(entry) >= 2:
            user_text, assistant_text = entry[0], entry[1]
            parts.append(
                f"<|im_start|>user\n{user_text}<|im_end|>\n"
                f"<|im_start|>assistant\n{assistant_text}<|im_end|>\n"
            )

    # Without the user turn and the open assistant turn the model would be
    # asked to continue the system prompt instead of answering the request.
    parts.append(
        f"<|im_start|>user\n{request.message}<|im_end|>\n<|im_start|>assistant\n"
    )
    prompt = "".join(parts)

    # Streaming stays off for the API so the caller gets the full text at once.
    res = llm(prompt, max_tokens=1024, temperature=0.3)
    return res["choices"][0]["text"]

# ----------------------------
# Gradio UI
# ----------------------------
# Custom styling: rounded, centered, width-capped main container and
# per-sender corner radii for chat messages.
CUSTOM_CSS = """
    .gradio-container {
        border-radius: 25px !important;
        max-width: 600px !important;
        margin: auto !important;
        overflow: hidden;
    }
    .message.user { border-radius: 18px 18px 4px 18px !important; }
    .message.bot { border-radius: 18px 18px 18px 4px !important; }
    """

# The stylesheet is handed to the constructor (the documented `css` argument)
# rather than assigned to `demo.css` after the Blocks object already exists.
with gr.Blocks(
    theme=gr.Theme.from_hub("JackismyShephard/ultimate-rvc-theme"),
    css=CUSTOM_CSS,
) as demo:
    gr.HTML("<h2 style='text-align:center; color:white;'>Code Explainer AI</h2>")

    # Chat widget wired to the streaming generator defined in this file.
    chatbot = gr.ChatInterface(
        fn=generate_response,
        chatbot=gr.Chatbot(height=600),
        textbox=gr.Textbox(placeholder="Paste code or ask for explanation...", container=False)
    )

# ----------------------------
# Run Gradio + FastAPI together
# ----------------------------
def run_gradio():
    """Launch the Gradio UI, bound to all interfaces on port 7860."""
    launch_kwargs = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_kwargs)

# Serve the Gradio UI from a background thread so the main thread stays free
# for the FastAPI server.  The thread is a daemon: once uvicorn returns
# (e.g. on Ctrl-C) the interpreter can exit instead of waiting indefinitely
# for the still-running Gradio server thread.
thread = Thread(target=run_gradio, daemon=True)
thread.start()

if __name__ == "__main__":
    # FastAPI is served on port 8000, alongside Gradio on port 7860.
    uvicorn.run(app, host="0.0.0.0", port=8000)