YOUSEF2434 committed
Commit 68b3e68 · verified · Parent: 5576ec8

Update app.py

Files changed (1)
app.py +49 -86
app.py CHANGED
@@ -1,110 +1,75 @@
  import os
  from collections.abc import Iterator
- from threading import Thread

  import gradio as gr
- import spaces
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
- from huggingface_hub import login

- # 🔐 Authenticate with Hugging Face token stored as secret or env var
- login(token=os.environ.get("HF_TOKEN"))


- DESCRIPTION = "# Sheikh AI – microsoft/Phi-4-mini-instruct (quantized int8)"
- if not torch.cuda.is_available():
-     DESCRIPTION += "\n<p><strong>Note:</strong> Running on CPU – slower performance.</p>"

- MAX_MAX_NEW_TOKENS = 2048
- DEFAULT_MAX_NEW_TOKENS = 1024
- MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

- model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

- # Load model with int8 quantization on CUDA (if available)
- if torch.cuda.is_available():
-     model = AutoModelForCausalLM.from_pretrained(
-         model_id,
-         load_in_8bit=True,
-         device_map="auto",
-     )
- else:
-     # Fallback: load in float32 on CPU (slow)
-     model = AutoModelForCausalLM.from_pretrained(
-         model_id,
-         torch_dtype=torch.float32,
-         device_map="cpu",
-     )
-
- tokenizer = AutoTokenizer.from_pretrained(model_id)

-
- @spaces.GPU
  def generate(
      message: str,
      chat_history: list[dict],
-     max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
      temperature: float = 0.6,
      top_p: float = 0.9,
      top_k: int = 50,
-     repetition_penalty: float = 1.2,
  ) -> Iterator[str]:
-     system_prompt = {
-         "role": "system",
-         "content": (
-             "You are SheikhGPT, a wise Islamic scholar AI. You respond only to Islamic-related questions "
-             "based on the Qur’an, Hadith, and the understanding of classical scholars. Do not answer "
-             "questions unrelated to Islam. Speak humbly, respectfully, and provide sources when possible."
-         )
-     }
-
-     conversation = [system_prompt] + chat_history + [{"role": "user", "content": message}]
-
-     chat_text = ""
-     for turn in conversation:
-         role = turn.get("role", "")
-         content = turn.get("content", "")
-         if role == "system":
-             chat_text += f"System: {content}\n"
-         elif role == "user":
-             chat_text += f"User: {content}\n"
-         elif role == "assistant":
-             chat_text += f"Assistant: {content}\n"
-
-     input_ids = tokenizer(chat_text, return_tensors="pt", truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).input_ids.to(model.device)
-
-     streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-     generate_kwargs = {
-         "input_ids": input_ids,
-         "streamer": streamer,
-         "max_new_tokens": max_new_tokens,
-         "do_sample": True,
-         "top_p": top_p,
-         "top_k": top_k,
-         "temperature": temperature,
-         "num_beams": 1,
-         "repetition_penalty": repetition_penalty,
-     }

-     t = Thread(target=model.generate, kwargs=generate_kwargs)
-     t.start()

-     outputs = []
-     for text in streamer:
-         outputs.append(text)
-         yield "".join(outputs)


  demo = gr.ChatInterface(
      fn=generate,
      additional_inputs=[
-         gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
-         gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
-         gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
-         gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
-         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
      ],
-     stop_btn=None,
      examples=[
          ["What are the five pillars of Islam?"],
          ["Is it allowed to pray in shoes?"],
@@ -112,11 +77,9 @@ demo = gr.ChatInterface(
          ["Is music haram according to Islamic scholars?"],
          ["Can I make up missed fasts after Ramadan?"]
      ],
-     type="messages",
      description=DESCRIPTION,
-     css_paths="style.css",
  )

-
  if __name__ == "__main__":
-     demo.queue(max_size=20).launch()
 
  import os
  from collections.abc import Iterator

  import gradio as gr
+ from llama_cpp import Llama

+ # 👤 Load GGUF Model
+ model_path = "TinyLlama-1.1B-Chat.gguf"  # Change if needed
+ llm = Llama(model_path=model_path, n_ctx=4096, n_threads=os.cpu_count(), use_mlock=True)

+ DESCRIPTION = "# Sheikh AI – TinyLlama (GGUF with llama.cpp)"
+ DESCRIPTION += "<p><strong>Note:</strong> Running on CPU with GGUF – optimized for performance.</p>"

+ MAX_NEW_TOKENS = 1024

+ # 🧠 Format messages into a prompt for GGUF chat models
+ def format_conversation(system_prompt: str, chat_history: list[dict], user_input: str) -> str:
+     chat = f"<|system|>\n{system_prompt.strip()}</s>\n"
+     for turn in chat_history:
+         if turn["role"] == "user":
+             chat += f"<|user|>\n{turn['content'].strip()}</s>\n"
+         elif turn["role"] == "assistant":
+             chat += f"<|assistant|>\n{turn['content'].strip()}</s>\n"
+     chat += f"<|user|>\n{user_input.strip()}</s>\n<|assistant|>\n"
+     return chat

+ # 💬 Gradio chatbot function
  def generate(
      message: str,
      chat_history: list[dict],
+     max_new_tokens: int = MAX_NEW_TOKENS,
      temperature: float = 0.6,
      top_p: float = 0.9,
      top_k: int = 50,
+     repeat_penalty: float = 1.2,
  ) -> Iterator[str]:
+     system_prompt = (
+         "You are SheikhGPT, a wise Islamic scholar AI. You respond only to Islamic-related questions "
+         "based on the Qur’an, Hadith, and the understanding of classical scholars. Do not answer "
+         "questions unrelated to Islam. Speak humbly, respectfully, and provide sources when possible."
+     )

+     prompt = format_conversation(system_prompt, chat_history, message)
+
+     stream = llm(
+         prompt,
+         max_tokens=max_new_tokens,
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
+         repeat_penalty=repeat_penalty,
+         stop=["</s>"],
+         stream=True,
+     )

+     partial = ""
+     for chunk in stream:
+         partial += chunk["choices"][0]["text"]
+         yield partial


+ # 🧪 Launch the interface
  demo = gr.ChatInterface(
      fn=generate,
      additional_inputs=[
+         gr.Slider(label="Max new tokens", minimum=32, maximum=2048, value=MAX_NEW_TOKENS, step=32),
+         gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=0.6, step=0.1),
+         gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, value=0.9, step=0.05),
+         gr.Slider(label="Top-k", minimum=1, maximum=100, value=50, step=1),
+         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, value=1.2, step=0.05),
      ],
      examples=[
          ["What are the five pillars of Islam?"],
          ["Is it allowed to pray in shoes?"],
          ["Is music haram according to Islamic scholars?"],
          ["Can I make up missed fasts after Ramadan?"]
      ],
      description=DESCRIPTION,
+     css_paths="style.css"
  )

  if __name__ == "__main__":
+     demo.launch()
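For reference, `format_conversation` serializes the chat into the Zephyr-style `<|system|>`/`<|user|>`/`<|assistant|>` template that TinyLlama-1.1B-Chat was tuned on. A minimal sketch of what a one-turn call produces (the argument values here are made up for illustration, not part of the commit):

    # Hypothetical inputs, purely to show the prompt layout:
    prompt = format_conversation("Be concise.", [], "What is zakat?")
    print(prompt)
    # <|system|>
    # Be concise.</s>
    # <|user|>
    # What is zakat?</s>
    # <|assistant|>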
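Note that the new code assumes `TinyLlama-1.1B-Chat.gguf` already sits next to `app.py`; nothing in the commit downloads it. One way to fetch a quantized file at startup is sketched below, assuming the `TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF` repo and `Q4_K_M` filename (both assumptions, not taken from the commit) and re-adding the `huggingface_hub` import the commit removed:

    import os
    from huggingface_hub import hf_hub_download

    MODEL_FILE = "TinyLlama-1.1B-Chat.gguf"  # path the app expects

    if os.path.exists(MODEL_FILE):
        model_path = MODEL_FILE
    else:
        # Assumed repo/filename; swap in whichever GGUF quantization you actually use.
        # hf_hub_download caches the file and returns its local path.
        model_path = hf_hub_download(
            repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
            filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
        )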