anaspro committed on
Commit 3eb706b
1 Parent(s): 680dfc1
Files changed (5)
  1. app.py +245 -30
  2. test_deployment.py +0 -77
  3. test_iraqi_model.py +0 -53
  4. test_jais.py +0 -54
  5. test_model.py +0 -66
app.py CHANGED
@@ -1,43 +1,258 @@
-# -*- coding: utf-8 -*-

 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM

-model_path = "inceptionai/jais-family-13b-chat"

-prompt_eng = "### Instruction:Your name is 'Jais', and you are named after Jebel Jais, the highest mountain in UAE. You were made by 'Inception' in the UAE. You are a helpful, respectful, and honest assistant. Always answer as helpfully as possible, while being safe. Complete the conversation between [|Human|] and [|AI|]:\n### Input: [|Human|] {Question}\n[|AI|]\n### Response :"
-prompt_ar = "### Instruction:اسمك \"جيس\" وسميت على اسم جبل جيس اعلى جبل في الامارات. تم بنائك بواسطة Inception في الإمارات. أنت مساعد مفيد ومحترم وصادق. أجب دائمًا بأكبر قدر ممكن من المساعدة، مع الحفاظ على البقاء أمناً. أكمل المحادثة بين [|Human|] و[|AI|] :\n### Input:[|Human|] {Question}\n[|AI|]\n### Response :"

-device = "cuda" if torch.cuda.is_available() else "cpu"

-tokenizer = AutoTokenizer.from_pretrained(model_path)
-model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True)


-def get_response(text, tokenizer=tokenizer, model=model):
-    input_ids = tokenizer(text, return_tensors="pt").input_ids
-    inputs = input_ids.to(device)
-    input_len = inputs.shape[-1]
-    generate_ids = model.generate(
         inputs,
-        top_p=0.9,
-        temperature=0.3,
-        max_length=2048,
-        min_length=input_len + 4,
-        repetition_penalty=1.2,
-        do_sample=True,
     )
-    response = tokenizer.batch_decode(
-        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
-    )[0]
-    response = response.split("### Response :")[-1]
-    return response


-ques = "ما هي عاصمة الامارات؟"
-text = prompt_ar.format_map({'Question': ques})
-print(get_response(text))

-ques = "What is the capital of UAE?"
-text = prompt_eng.format_map({'Question': ques})
-print(get_response(text))
 
+import os
+import pathlib
+import tempfile
+from collections.abc import Iterator
+from threading import Thread
+
+import av
+import gradio as gr
+import spaces
 import torch
+from gradio.utils import get_upload_folder
+from transformers import AutoModelForImageTextToText, AutoProcessor
+from transformers.generation.streamers import TextIteratorStreamer
+
+model_id = "unsloth/gemma-3n-E4B-it"
+
+processor = AutoProcessor.from_pretrained(model_id)
+model = AutoModelForImageTextToText.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
+
+IMAGE_FILE_TYPES = (".jpg", ".jpeg", ".png", ".webp")
+VIDEO_FILE_TYPES = (".mp4", ".mov", ".webm")
+AUDIO_FILE_TYPES = (".mp3", ".wav")
+
+GRADIO_TEMP_DIR = get_upload_folder()
+
+TARGET_FPS = int(os.getenv("TARGET_FPS", "3"))
+MAX_FRAMES = int(os.getenv("MAX_FRAMES", "30"))
+MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "10_000"))
+
+
+def get_file_type(path: str) -> str:
+    if path.endswith(IMAGE_FILE_TYPES):
+        return "image"
+    if path.endswith(VIDEO_FILE_TYPES):
+        return "video"
+    if path.endswith(AUDIO_FILE_TYPES):
+        return "audio"
+    error_message = f"Unsupported file type: {path}"
+    raise ValueError(error_message)
+
+
+def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
+    video_count = 0
+    non_video_count = 0
+    for path in paths:
+        if path.endswith(VIDEO_FILE_TYPES):
+            video_count += 1
+        else:
+            non_video_count += 1
+    return video_count, non_video_count
+
+
+def validate_media_constraints(message: dict) -> bool:
+    video_count, non_video_count = count_files_in_new_message(message["files"])
+    if video_count > 1:
+        gr.Warning("Only one video is supported.")
+        return False
+    if video_count == 1 and non_video_count > 0:
+        gr.Warning("Mixing images and videos is not allowed.")
+        return False
+    return True
+
+
+def extract_frames_to_tempdir(
+    video_path: str,
+    target_fps: float,
+    max_frames: int | None = None,
+    parent_dir: str | None = None,
+    prefix: str = "frames_",
+) -> str:
+    temp_dir = tempfile.mkdtemp(prefix=prefix, dir=parent_dir)
+
+    container = av.open(video_path)
+    video_stream = container.streams.video[0]
+
+    if video_stream.duration is None or video_stream.time_base is None:
+        raise ValueError("video_stream is missing duration or time_base")
+
+    time_base = video_stream.time_base
+    duration = float(video_stream.duration * time_base)
+    interval = 1.0 / target_fps
+
+    total_frames = int(duration * target_fps)
+    if max_frames is not None:
+        total_frames = min(total_frames, max_frames)
+
+    target_times = [i * interval for i in range(total_frames)]
+    target_index = 0
+
+    for frame in container.decode(video=0):
+        if frame.pts is None:
+            continue
+
+        timestamp = float(frame.pts * time_base)
+
+        if target_index < len(target_times) and abs(timestamp - target_times[target_index]) < (interval / 2):
+            frame_path = pathlib.Path(temp_dir) / f"frame_{target_index:04d}.jpg"
+            frame.to_image().save(frame_path)
+            target_index += 1
+
+        if max_frames is not None and target_index >= max_frames:
+            break
+
+    container.close()
+    return temp_dir
+
+
+def process_new_user_message(message: dict) -> list[dict]:
+    if not message["files"]:
+        return [{"type": "text", "text": message["text"]}]
+
+    file_types = [get_file_type(path) for path in message["files"]]
+
+    if len(file_types) == 1 and file_types[0] == "video":
+        gr.Info(f"Video will be processed at {TARGET_FPS} FPS, max {MAX_FRAMES} frames in this Space.")
+
+        temp_dir = extract_frames_to_tempdir(
+            message["files"][0],
+            target_fps=TARGET_FPS,
+            max_frames=MAX_FRAMES,
+            parent_dir=GRADIO_TEMP_DIR,
+        )
+        paths = sorted(pathlib.Path(temp_dir).glob("*.jpg"))
+        return [
+            {"type": "text", "text": message["text"]},
+            *[{"type": "image", "image": path.as_posix()} for path in paths],
+        ]
+
+    return [
+        {"type": "text", "text": message["text"]},
+        *[{"type": file_type, file_type: path} for path, file_type in zip(message["files"], file_types, strict=True)],
+    ]
+
+
+def process_history(history: list[dict]) -> list[dict]:
+    messages = []
+    current_user_content: list[dict] = []
+    for item in history:
+        if item["role"] == "assistant":
+            if current_user_content:
+                messages.append({"role": "user", "content": current_user_content})
+                current_user_content = []
+            messages.append({"role": "assistant", "content": [{"type": "text", "text": item["content"]}]})
+        else:
+            content = item["content"]
+            if isinstance(content, str):
+                current_user_content.append({"type": "text", "text": content})
+            else:
+                filepath = content[0]
+                file_type = get_file_type(filepath)
+                current_user_content.append({"type": file_type, file_type: filepath})
+    return messages
+
+
+@spaces.GPU(duration=120)
+@torch.inference_mode()
+def generate(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
+    if not validate_media_constraints(message):
+        yield ""
+        return
+
+    messages = []
+    if system_prompt:
+        messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
+    messages.extend(process_history(history))
+    messages.append({"role": "user", "content": process_new_user_message(message)})
+
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    )
+    n_tokens = inputs["input_ids"].shape[1]
+    if n_tokens > MAX_INPUT_TOKENS:
+        gr.Warning(
+            f"Input too long. Max {MAX_INPUT_TOKENS} tokens. Got {n_tokens} tokens. This limit is set to avoid CUDA out-of-memory errors in this Space."
+        )
+        yield ""
+        return
+
+    inputs = inputs.to(device=model.device, dtype=torch.bfloat16)
+
+    streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
         inputs,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=False,
+        disable_compile=True,
     )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+
+    output = ""
+    for delta in streamer:
+        output += delta
+        yield output
+
+
+examples = [
+    [
+        {
+            "text": "What is the capital of France?",
+            "files": [],
+        }
+    ],
+    [
+        {
+            "text": "Describe this image in detail.",
+            "files": ["assets/cat.jpeg"],
+        }
+    ],
+    [
+        {
+            "text": "Transcribe the following speech segment in English.",
+            "files": ["assets/speech.wav"],
+        }
+    ],
+    [
+        {
+            "text": "Transcribe the following speech segment in English.",
+            "files": ["assets/speech2.wav"],
+        }
+    ],
+    [
+        {
+            "text": "Describe this video",
+            "files": ["assets/holding_phone.mp4"],
+        }
+    ],
+]
+
+demo = gr.ChatInterface(
+    fn=generate,
+    type="messages",
+    textbox=gr.MultimodalTextbox(
+        file_types=list(IMAGE_FILE_TYPES + VIDEO_FILE_TYPES + AUDIO_FILE_TYPES),
+        file_count="multiple",
+        autofocus=True,
+    ),
+    multimodal=True,
+    additional_inputs=[
+        gr.Textbox(label="System Prompt", value="You are a helpful assistant."),
+        gr.Slider(label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700),
+    ],
+    stop_btn=False,
+    title="Gemma 3n E4B it",
+    examples=examples,
+    run_examples_on_click=False,
+    cache_examples=False,
+    css_paths="style.css",
+    delete_cache=(1800, 1800),
+)
+
+if __name__ == "__main__":
+    demo.launch()
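
For reference, a minimal standalone sketch (not part of this commit) of the chat-template plus background-streamer pattern that the new generate() uses; the prompt text is an illustrative assumption.

# Minimal sketch of the pattern used by generate() above, outside Gradio.
# Assumes the same checkpoint; the question text is an illustrative placeholder.
from threading import Thread

import torch
from transformers import AutoModelForImageTextToText, AutoProcessor
from transformers.generation.streamers import TextIteratorStreamer

model_id = "unsloth/gemma-3n-E4B-it"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)

# Content entries are lists of typed dicts, exactly as process_new_user_message() builds them.
messages = [{"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]}]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(model.device)

# Generation runs in a background thread; the streamer yields decoded text as it is produced.
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
Thread(target=model.generate, kwargs=dict(inputs, streamer=streamer, max_new_tokens=128, do_sample=False)).start()
for delta in streamer:
    print(delta, end="", flush=True)

Running model.generate in a Thread while iterating the TextIteratorStreamer is what lets the Gradio handler yield partial output as it is produced.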
 
test_deployment.py DELETED
@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-"""
-Quick check that the app works before deployment
-"""
-
-import sys
-import os
-
-def test_imports():
-    """Test the imports"""
-    try:
-        import torch
-        import gradio as gr
-        import spaces
-        from transformers import AutoTokenizer, AutoModelForCausalLM
-        print("✅ All imports succeeded")
-        return True
-    except ImportError as e:
-        print(f"❌ Import error: {e}")
-        return False
-
-def test_model_loading():
-    """Test loading the model"""
-    try:
-        from transformers import AutoTokenizer, AutoModelForCausalLM
-        import torch
-
-        model_path = "anaspro/iraqi-7b"
-        hf_token = os.getenv("HF_TOKEN")
-
-        print("🔄 Loading the model for testing...")
-
-        # Quick load, for testing only
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_path,
-            token=hf_token,
-            trust_remote_code=True
-        )
-
-        # Load the model on CPU, for testing only
-        model = AutoModelForCausalLM.from_pretrained(
-            model_path,
-            device_map="cpu",  # use CPU for testing only
-            trust_remote_code=True,
-            token=hf_token,
-            torch_dtype=torch.float32,
-            low_cpu_mem_usage=True
-        )
-
-        print("✅ Model loaded successfully")
-        return True
-
-    except Exception as e:
-        print(f"❌ Model loading error: {e}")
-        return False
-
-def main():
-    print("🚀 Deployment test for Hugging Face ZeroGPU")
-    print("=" * 50)
-
-    # Test the imports
-    if not test_imports():
-        sys.exit(1)
-
-    # Test model loading
-    if not test_model_loading():
-        sys.exit(1)
-
-    print("\n🎉 All tests passed! The app is ready for deployment")
-    print("\n📋 Files required for deployment:")
-    print("- app.py")
-    print("- requirements.txt")
-    print("- README.md")
-    print("- system_prompt.txt")
-
-if __name__ == "__main__":
-    main()
test_iraqi_model.py DELETED
@@ -1,53 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test of the Jais model - mirrors the original code
-"""
-
-import os
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-def test_jais():
-    model_path = "inceptionai/jais-family-13b-chat"
-
-    # Load the model as in the original code
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
-    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True)
-
-    # The original prompts
-    prompt_eng = "### Instruction:Your name is 'Jais', and you are named after Jebel Jais, the highest mountain in UAE. You were made by 'Inception' in the UAE. You are a helpful, respectful, and honest assistant. Always answer as helpfully as possible, while being safe. Complete the conversation between [|Human|] and [|AI|]:\n### Input: [|Human|] {Question}\n[|AI|]\n### Response :"
-    prompt_ar = "### Instruction:اسمك \"جيس\" وسميت على اسم جبل جيس اعلى جبل في الامارات. تم بنائك بواسطة Inception في الإمارات. أنت مساعد مفيد ومحترم وصادق. أجب دائمًا بأكبر قدر ممكن من المساعدة، مع الحفاظ على البقاء أمناً. أكمل المحادثة بين [|Human|] و[|AI|] :\n### Input:[|Human|] {Question}\n[|AI|]\n### Response :"
-
-    def get_response(text):
-        input_ids = tokenizer(text, return_tensors="pt").input_ids
-        inputs = input_ids.to("cuda" if torch.cuda.is_available() else "cpu")
-        input_len = inputs.shape[-1]
-        generate_ids = model.generate(
-            inputs,
-            top_p=0.9,
-            temperature=0.3,
-            max_length=2048,
-            min_length=input_len + 4,
-            repetition_penalty=1.2,
-            do_sample=True,
-        )
-        response = tokenizer.batch_decode(
-            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
-        )[0]
-        response = response.split("### Response :")[-1]
-        return response
-
-    # Arabic test only (the focus is on Arabic)
-    ques = "ما هي عاصمة الامارات؟"
-    text = prompt_ar.format_map({'Question': ques})
-    response = get_response(text)
-
-    print("=" * 50)
-    print("Test passed! ✅")
-    print(f"Model: {model_path}")
-    print("Question: ما هي عاصمة الامارات؟")
-    print(f"Response: {response}")
-    print("=" * 50)
-
-if __name__ == "__main__":
-    test_jais()
test_jais.py DELETED
@@ -1,54 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test of the Jais model - mirrors the original code
-"""
-
-import os
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-def test_jais():
-    model_path = "inceptionai/jais-family-13b-chat"
-
-    # Load the model as in the original code
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
-    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True)
-
-    # The original prompts
-    prompt_eng = "### Instruction:Your name is 'Jais', and you are named after Jebel Jais, the highest mountain in UAE. You were made by 'Inception' in the UAE. You are a helpful, respectful, and honest assistant. Always answer as helpfully as possible, while being safe. Complete the conversation between [|Human|] and [|AI|]:\n### Input: [|Human|] {Question}\n[|AI|]\n### Response :"
-    prompt_ar = "### Instruction:اسمك \"جيس\" وسميت على اسم جبل جيس اعلى جبل في الامارات. تم بنائك بواسطة Inception في الإمارات. أنت مساعد مفيد ومحترم وصادق. أجب دائمًا بأكبر قدر ممكن من المساعدة، مع الحفاظ على البقاء أمناً. أكمل المحادثة بين [|Human|] و[|AI|] :\n### Input:[|Human|] {Question}\n[|AI|]\n### Response :"
-
-    def get_response(text):
-        input_ids = tokenizer(text, return_tensors="pt").input_ids
-        inputs = input_ids.to("cuda" if torch.cuda.is_available() else "cpu")
-        input_len = inputs.shape[-1]
-        generate_ids = model.generate(
-            inputs,
-            top_p=0.9,
-            temperature=0.3,
-            max_length=2048,
-            min_length=input_len + 4,
-            repetition_penalty=1.2,
-            do_sample=True,
-        )
-        response = tokenizer.batch_decode(
-            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
-        )[0]
-        response = response.split("### Response :")[-1]
-        return response
-
-    # Arabic test
-    ques = "ما هي عاصمة الامارات؟"
-    text = prompt_ar.format_map({'Question': ques})
-    print("Arabic question:", ques)
-    print("Response:", get_response(text))
-    print()
-
-    # English test
-    ques = "What is the capital of UAE?"
-    text = prompt_eng.format_map({'Question': ques})
-    print("English question:", ques)
-    print("Response:", get_response(text))
-
-if __name__ == "__main__":
-    test_jais()
test_model.py DELETED
@@ -1,66 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-import os
-import torch
-import transformers
-from transformers import pipeline
-
-model_path = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"
-
-# Use HF_TOKEN from the environment if one is set
-hf_token = os.getenv("HF_TOKEN")
-
-print("Loading model...")
-try:
-    # Initialize pipeline for chat
-    # For quantized models, use device=0 instead of device_map="auto" to avoid meta tensor issues
-    pipeline_model = pipeline(
-        "text-generation",
-        model=model_path,
-        device=0,  # Use GPU device directly
-        torch_dtype=torch.bfloat16,
-        token=hf_token,
-        trust_remote_code=True,
-        model_kwargs={
-            "torch_dtype": torch.bfloat16,
-            "load_in_4bit": True,
-            "bnb_4bit_compute_dtype": torch.bfloat16,
-            "bnb_4bit_use_double_quant": False,
-            "bnb_4bit_quant_type": "nf4",
-        }
-    )
-
-    print("Model loaded successfully!")
-
-    # Test with a simple message
-    messages = [
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": "Hello!"},
-    ]
-
-    print("Testing generation...")
-    # Apply chat template for unsloth models
-    prompt = pipeline_model.tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-
-    outputs = pipeline_model(
-        prompt,
-        max_new_tokens=50,
-        temperature=0.7,
-        top_p=0.9,
-        do_sample=True,
-        return_full_text=False
-    )
-
-    response = outputs[0]["generated_text"]
-    print(f"Test response: {response}")
-    print("✅ Model test successful!")
-
-except Exception as e:
-    print(f"❌ Error: {e}")
-    import traceback
-    traceback.print_exc()
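
Side note, not from this repository: the bitsandbytes flags that test_model.py passed through model_kwargs are more commonly expressed as an explicit BitsAndBytesConfig. The sketch below assumes an unquantized checkpoint (the Qwen id is only an example stand-in), rather than the pre-quantized unsloth one used above.

# Hedged sketch: explicit 4-bit quantization config equivalent to the flags above.
# Assumption: "Qwen/Qwen2.5-7B-Instruct" stands in for any unquantized causal LM.
import torch
from transformers import BitsAndBytesConfig, pipeline

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
)

pipe = pipeline(
    "text-generation",
    model="Qwen/Qwen2.5-7B-Instruct",  # example id, not the model from this repo
    model_kwargs={"quantization_config": bnb_config},
    torch_dtype=torch.bfloat16,
)
print(pipe("Hello!", max_new_tokens=20, do_sample=False)[0]["generated_text"])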