import os
import urllib.request
from collections.abc import Iterator
import gradio as gr
from llama_cpp import Llama

# 💾 Download GGUF from Hugging Face if not already present
GGUF_URL = "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
MODEL_FILENAME = "TinyLlama-1.1B-Chat.Q4_K_M.gguf"

if not os.path.exists(MODEL_FILENAME):
    print(f"🔽 Downloading model from Hugging Face: {GGUF_URL}")
    urllib.request.urlretrieve(GGUF_URL, MODEL_FILENAME)
    print("✅ Download complete!")

# 🧠 Load GGUF model using llama-cpp (n_ctx = context window in tokens;
# os.cpu_count() may return None, in which case llama-cpp picks its own thread default)
llm = Llama(model_path=MODEL_FILENAME, n_ctx=4096, n_threads=os.cpu_count())
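# Alternative (not used here): llama-cpp-python can apply the chat template itself,
# assuming this build registers the "zephyr" format that TinyLlama-Chat uses:
#   llm = Llama(model_path=MODEL_FILENAME, n_ctx=4096, chat_format="zephyr")
#   llm.create_chat_completion(messages=[{"role": "user", "content": "..."}])
# which would replace the manual format_conversation() below.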

DESCRIPTION = "# Sheikh AI – TinyLlama (GGUF from HF)\n\n"  # blank line so the note renders below the heading, not inside it
DESCRIPTION += "<p><strong>Note:</strong> Running on CPU with GGUF – downloaded automatically.</p>"

MAX_NEW_TOKENS = 1024

def format_conversation(system_prompt: str, chat_history: list[dict], user_input: str) -> str:
    """Build the Zephyr-style prompt string that TinyLlama-1.1B-Chat was trained on."""
    chat = f"<|system|>\n{system_prompt.strip()}</s>\n"
    for turn in chat_history:
        if turn["role"] == "user":
            chat += f"<|user|>\n{turn['content'].strip()}</s>\n"
        elif turn["role"] == "assistant":
            chat += f"<|assistant|>\n{turn['content'].strip()}</s>\n"
    chat += f"<|user|>\n{user_input.strip()}</s>\n<|assistant|>\n"
    return chat
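# Illustrative rendering (hypothetical inputs):
# format_conversation("You are helpful.", [], "Assalamu alaikum") returns
#   <|system|>
#   You are helpful.</s>
#   <|user|>
#   Assalamu alaikum</s>
#   <|assistant|>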

def generate(
    message: str,
    chat_history: list[dict],
    max_new_tokens: int = MAX_NEW_TOKENS,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repeat_penalty: float = 1.2,
) -> Iterator[str]:
    """Stream a completion, yielding the accumulated reply after each chunk."""
    system_prompt = (
        "You are SheikhGPT, a wise Islamic scholar AI. You respond only to Islamic-related questions "
        "based on the Qur’an, Hadith, and the understanding of classical scholars. Do not answer "
        "questions unrelated to Islam. Speak humbly, respectfully, and provide sources when possible."
    )

    prompt = format_conversation(system_prompt, chat_history, message)

    stream = llm(
        prompt,
        max_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repeat_penalty=repeat_penalty,
        stop=["</s>"],
        stream=True,
    )

    partial = ""
    for chunk in stream:
        partial += chunk["choices"][0]["text"]
        yield partial
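# Quick smoke test (illustrative only; ChatInterface drives generate() in the app):
#   reply = ""
#   for reply in generate("What are the five pillars of Islam?", []):
#       pass
#   print(reply)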


demo = gr.ChatInterface(
    fn=generate,
    type="messages",  # generate() expects history as {"role", "content"} dicts; Gradio's legacy default is tuples
    additional_inputs=[
        gr.Slider(label="Max new tokens", minimum=32, maximum=2048, value=MAX_NEW_TOKENS, step=32),
        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=0.6, step=0.1),
        gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, value=0.9, step=0.05),
        gr.Slider(label="Top-k", minimum=1, maximum=100, value=50, step=1),
        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, value=1.2, step=0.05),
    ],
    examples=[
        ["What are the five pillars of Islam?"],
        ["Is it allowed to pray in shoes?"],
        ["Explain the meaning of Surah Al-Fatiha."],
        ["Is music haram according to Islamic scholars?"],
        ["Can I make up missed fasts after Ramadan?"]
    ],
    description=DESCRIPTION,
    css_paths="style.css"
)

if __name__ == "__main__":
    demo.launch()