Update app.py
app.py CHANGED

The previous version (removed lines prefixed with -):
@@ -1,110 +1,75 @@
 import os
 from collections.abc import Iterator
-from threading import Thread

 import gradio as gr
-import spaces
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-from huggingface_hub import login

-# …
-…

-DESCRIPTION = "# Sheikh AI – …"
-
-DESCRIPTION += "\n<p><strong>Note:</strong> Running on CPU – slower performance.</p>"

-
-DEFAULT_MAX_NEW_TOKENS = 1024
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

-model_id = "…"

-# Load model with int8 quantization on CUDA (if available)
-if torch.cuda.is_available():
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        load_in_8bit=True,
-        device_map="auto",
-    )
-else:
-    # Fallback: load in float32 on CPU (slow)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        torch_dtype=torch.float32,
-        device_map="cpu",
-    )
-
-tokenizer = AutoTokenizer.from_pretrained(model_id)

-
-@spaces.GPU
 def generate(
     message: str,
     chat_history: list[dict],
-    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
     temperature: float = 0.6,
     top_p: float = 0.9,
     top_k: int = 50,
-    repetition_penalty: float = 1.2,
 ) -> Iterator[str]:
-    system_prompt = {
-        "role": "system",
-        "content": (
-            "You are SheikhGPT, a wise Islamic scholar AI. You respond only to Islamic-related questions "
-            "based on the Qur’an, Hadith, and the understanding of classical scholars. Do not answer "
-            "questions unrelated to Islam. Speak humbly, respectfully, and provide sources when possible."
-        )
-    }
-
-    conversation = [system_prompt] + chat_history + [{"role": "user", "content": message}]
-
-    chat_text = ""
-    for turn in conversation:
-        role = turn.get("role", "")
-        content = turn.get("content", "")
-        if role == "system":
-            chat_text += f"System: {content}\n"
-        elif role == "user":
-            chat_text += f"User: {content}\n"
-        elif role == "assistant":
-            chat_text += f"Assistant: {content}\n"
-
-    input_ids = tokenizer(chat_text, return_tensors="pt", truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).input_ids.to(model.device)
-
-    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-    generate_kwargs = {
-        "input_ids": input_ids,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "do_sample": True,
-        "top_p": top_p,
-        "top_k": top_k,
-        "temperature": temperature,
-        "num_beams": 1,
-        "repetition_penalty": repetition_penalty,
-    }

-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()

-    outputs = []
-    for text in streamer:
-        outputs.append(text)
-        yield "".join(outputs)


 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
-        gr.Slider(label="Max new tokens", minimum=…),
-        gr.Slider(label="Temperature", minimum=0.1, maximum=…),
-        gr.Slider(label="Top-p", …),
-        gr.Slider(label="Top-k", minimum=1, maximum=…),
-        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, …),
     ],
-    stop_btn=None,
     examples=[
         ["What are the five pillars of Islam?"],
         ["Is it allowed to pray in shoes?"],
@@ -112,11 +77,9 @@ demo = gr.ChatInterface(
         ["Is music haram according to Islamic scholars?"],
         ["Can I make up missed fasts after Ramadan?"]
     ],
-    type="messages",
     description=DESCRIPTION,
-    css_paths="style.css"
 )

-
 if __name__ == "__main__":
-    demo.…
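A note on the removed loading path: passing `load_in_8bit=True` directly to `from_pretrained` requires the `bitsandbytes` package and a CUDA GPU, and recent transformers releases prefer an explicit quantization config. A minimal equivalent sketch (an assumption about intent, not part of the commit; `model_id` as in the removed code):

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 8-bit load expressed via the explicit config object (sketch).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)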
The updated version (added lines prefixed with +):

 import os
 from collections.abc import Iterator

 import gradio as gr
+from llama_cpp import Llama

+# 👤 Load GGUF Model
+model_path = "TinyLlama-1.1B-Chat.gguf"  # Change if needed
+llm = Llama(model_path=model_path, n_ctx=4096, n_threads=os.cpu_count(), use_mlock=True)

+DESCRIPTION = "# Sheikh AI – TinyLlama (GGUF with llama.cpp)"
+DESCRIPTION += "<p><strong>Note:</strong> Running on CPU with GGUF – optimized for performance.</p>"

+MAX_NEW_TOKENS = 1024

+# 🧠 Format messages into a prompt for GGUF chat models
+def format_conversation(system_prompt: str, chat_history: list[dict], user_input: str) -> str:
+    chat = f"<|system|>\n{system_prompt.strip()}</s>\n"
+    for turn in chat_history:
+        if turn["role"] == "user":
+            chat += f"<|user|>\n{turn['content'].strip()}</s>\n"
+        elif turn["role"] == "assistant":
+            chat += f"<|assistant|>\n{turn['content'].strip()}</s>\n"
+    chat += f"<|user|>\n{user_input.strip()}</s>\n<|assistant|>\n"
+    return chat
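For illustration of what `format_conversation` produces: TinyLlama-1.1B-Chat uses the Zephyr-style chat template, and with one prior exchange (the sample turns below are hypothetical) the function renders:

<|system|>
You are SheikhGPT, a wise Islamic scholar AI. …</s>
<|user|>
What are the five pillars of Islam?</s>
<|assistant|>
They are the Shahada, Salah, Zakat, Sawm, and Hajj.</s>
<|user|>
Is it allowed to pray in shoes?</s>
<|assistant|>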
+# 💬 Gradio chatbot function
 def generate(
     message: str,
     chat_history: list[dict],
+    max_new_tokens: int = MAX_NEW_TOKENS,
     temperature: float = 0.6,
     top_p: float = 0.9,
     top_k: int = 50,
+    repeat_penalty: float = 1.2,
 ) -> Iterator[str]:
+    system_prompt = (
+        "You are SheikhGPT, a wise Islamic scholar AI. You respond only to Islamic-related questions "
+        "based on the Qur’an, Hadith, and the understanding of classical scholars. Do not answer "
+        "questions unrelated to Islam. Speak humbly, respectfully, and provide sources when possible."
+    )

+    prompt = format_conversation(system_prompt, chat_history, message)
+
+    stream = llm(
+        prompt,
+        max_tokens=max_new_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        repeat_penalty=repeat_penalty,
+        stop=["</s>"],
+        stream=True,
+    )

+    partial = ""
+    for chunk in stream:
+        partial += chunk["choices"][0]["text"]
+        yield partial
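With `stream=True`, `llama_cpp.Llama.__call__` returns an iterator of OpenAI-style completion chunks, which is why the loop reads `chunk["choices"][0]["text"]`. Yielding the accumulated `partial` string matches how `gr.ChatInterface` streams: each yielded value replaces the message shown so far. A standalone consumption sketch (the prompt text is hypothetical):

# Each chunk is a dict like {"choices": [{"text": "...", "finish_reason": None}], ...}.
for chunk in llm("<|user|>\nWhat is Zakat?</s>\n<|assistant|>\n",
                 max_tokens=64, stop=["</s>"], stream=True):
    print(chunk["choices"][0]["text"], end="", flush=True)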
+# 🧪 Launch the interface
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
+        gr.Slider(label="Max new tokens", minimum=32, maximum=2048, value=MAX_NEW_TOKENS, step=32),
+        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=0.6, step=0.1),
+        gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, value=0.9, step=0.05),
+        gr.Slider(label="Top-k", minimum=1, maximum=100, value=50, step=1),
+        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, value=1.2, step=0.05),
     ],
     examples=[
         ["What are the five pillars of Islam?"],
         ["Is it allowed to pray in shoes?"],
         ["Is music haram according to Islamic scholars?"],
         ["Can I make up missed fasts after Ramadan?"]
     ],
     description=DESCRIPTION,
+    css_paths="style.css"
 )
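Note that `gr.ChatInterface` passes the `additional_inputs` values positionally after the message and history, so the five sliders must stay in the same order as the extra parameters of `generate` (max_new_tokens, temperature, top_p, top_k, repeat_penalty).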
 if __name__ == "__main__":
+    demo.launch()
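One deployment caveat: the commit hard-codes `model_path = "TinyLlama-1.1B-Chat.gguf"`, so that GGUF file must be present in the Space's working directory, and `llama-cpp-python` must be listed in requirements.txt. If the file is not bundled, one option is to fetch it at startup with `huggingface_hub.hf_hub_download`; the repo id and filename below are illustrative assumptions, not taken from the commit:

from huggingface_hub import hf_hub_download

# Hypothetical source repo and quantization; any TinyLlama-1.1B-Chat GGUF build works.
model_path = hf_hub_download(
    repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
)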