"""Sheikh AI — a Gradio chat app serving TinyLlama (GGUF) via llama-cpp.

On first run the quantized GGUF weights are downloaded from Hugging Face;
afterwards the cached local file is reused. Inference runs on CPU through
llama.cpp and responses are streamed token-by-token into the chat UI.
"""

import os
import urllib.request
from collections.abc import Iterator

import gradio as gr
from llama_cpp import Llama

# Download the GGUF weights from Hugging Face if not already present locally.
GGUF_URL = (
    "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
    "/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
)
MODEL_FILENAME = "TinyLlama-1.1B-Chat.Q4_K_M.gguf"

if not os.path.exists(MODEL_FILENAME):
    print(f"Downloading model from Hugging Face: {GGUF_URL}")
    urllib.request.urlretrieve(GGUF_URL, MODEL_FILENAME)
    print("Download complete!")

# Load the GGUF model with llama-cpp; use every available CPU core.
llm = Llama(model_path=MODEL_FILENAME, n_ctx=4096, n_threads=os.cpu_count())

DESCRIPTION = "# Sheikh AI — TinyLlama (GGUF from HF)"
DESCRIPTION += "\n\nNote: Running on CPU with GGUF — downloaded automatically.\n"

MAX_NEW_TOKENS = 1024


def format_conversation(system_prompt: str, chat_history: list[dict], user_input: str) -> str:
    """Render the system prompt, prior turns, and the new user message
    into the TinyLlama/Zephyr chat-template string expected by the model.

    Args:
        system_prompt: Instructions injected as the ``<|system|>`` turn.
        chat_history: Prior turns as ``{"role": ..., "content": ...}`` dicts
            (Gradio "messages" format); roles other than user/assistant are
            silently skipped.
        user_input: The message being answered now.

    Returns:
        The full prompt, ending with an open ``<|assistant|>`` tag so the
        model continues as the assistant.
    """
    chat = f"<|system|>\n{system_prompt.strip()}\n"
    for turn in chat_history:
        if turn["role"] == "user":
            chat += f"<|user|>\n{turn['content'].strip()}\n"
        elif turn["role"] == "assistant":
            chat += f"<|assistant|>\n{turn['content'].strip()}\n"
    chat += f"<|user|>\n{user_input.strip()}\n<|assistant|>\n"
    return chat


def generate(
    message: str,
    chat_history: list[dict],
    max_new_tokens: int = MAX_NEW_TOKENS,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repeat_penalty: float = 1.2,
) -> Iterator[str]:
    """Stream a model reply for *message*, yielding the growing partial text.

    Each yielded value is the full response so far (Gradio replaces the
    bubble content on every yield rather than appending).
    """
    system_prompt = (
        "You are SheikhGPT, a wise Islamic scholar AI. You respond only to Islamic-related questions "
        "based on the Qur'an, Hadith, and the understanding of classical scholars. Do not answer "
        "questions unrelated to Islam. Speak humbly, respectfully, and provide sources when possible."
    )
    prompt = format_conversation(system_prompt, chat_history, message)

    stream = llm(
        prompt,
        max_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repeat_penalty=repeat_penalty,
        # TinyLlama-Chat ends each turn with the </s> EOS token; stopping on
        # an empty string (the previous value) never terminates generation
        # at the intended boundary.
        stop=["</s>"],
        stream=True,
    )

    partial = ""
    for chunk in stream:
        partial += chunk["choices"][0]["text"]
        yield partial


demo = gr.ChatInterface(
    fn=generate,
    # generate() reads chat_history as role/content dicts, so the interface
    # must use the "messages" history format (the default is tuple pairs).
    type="messages",
    additional_inputs=[
        gr.Slider(label="Max new tokens", minimum=32, maximum=2048, value=MAX_NEW_TOKENS, step=32),
        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=0.6, step=0.1),
        gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, value=0.9, step=0.05),
        gr.Slider(label="Top-k", minimum=1, maximum=100, value=50, step=1),
        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, value=1.2, step=0.05),
    ],
    examples=[
        ["What are the five pillars of Islam?"],
        ["Is it allowed to pray in shoes?"],
        ["Explain the meaning of Surah Al-Fatiha."],
        ["Is music haram according to Islamic scholars?"],
        ["Can I make up missed fasts after Ramadan?"],
    ],
    description=DESCRIPTION,
    css_paths="style.css",
)

if __name__ == "__main__":
    demo.launch()