anaspro committed
Commit 8af3913 · 1 Parent(s): 431107d
Files changed (1)
  1. app.py  +77 -25
app.py CHANGED
@@ -1,5 +1,6 @@
 import os
-from transformers import pipeline, TextIteratorStreamer
 from threading import Thread
 import gradio as gr
 import spaces
@@ -19,13 +20,70 @@ model_path = "anaspro/meta-llama-3.1-8b-inst-iraqi"
 # If there is an HF_TOKEN in the environment
 hf_token = os.getenv("HF_TOKEN")

-pipe = pipeline(
-    "text-generation",
-    model=model_path,
-    torch_dtype="auto",
-    device_map="auto",
-    token=hf_token,  # pass the token if present
-)

 def format_conversation_history(chat_history):
     messages = []
@@ -47,26 +105,20 @@ def generate_response(input_data, chat_history, max_new_tokens, temperature, top
     messages.extend(processed_history)
     messages.append(new_message)

-    # Use Llama's chat template
-    prompt_text = pipe.tokenizer.apply_chat_template(
         messages,
-        tokenize=False,
-        add_generation_prompt=True
     )

-    streamer = TextIteratorStreamer(pipe.tokenizer, skip_prompt=True, skip_special_tokens=True)
-
-    generation_kwargs = {
-        "max_new_tokens": max_new_tokens,
-        "do_sample": True,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-        "streamer": streamer,
-        "return_full_text": False,
-    }
-    thread = Thread(target=pipe, args=(prompt_text,), kwargs=generation_kwargs)
     thread.start()

     # Stream the response
 
app.py (new version)
 import os
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, pipeline
 from threading import Thread
 import gradio as gr
 import spaces

 # If there is an HF_TOKEN in the environment
 hf_token = os.getenv("HF_TOKEN")

+# Use a ChatPipeline instead of the regular text-generation pipeline
+tokenizer = AutoTokenizer.from_pretrained(model_path, token=hf_token)
+model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", device_map="auto", token=hf_token)
+
+# Create a custom chat pipeline with chat-template and streaming support
+def create_chat_pipeline(tokenizer, model):
+    """Create a custom chat pipeline with a chat template and streaming."""
+    def chat_generate(messages, streamer=None, **kwargs):
+        # Render the messages with the chat template
+        if hasattr(tokenizer, 'apply_chat_template') and tokenizer.chat_template is not None:
+            prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        else:
+            # Fallback for models that have no chat template
+            prompt = ""
+            for msg in messages:
+                if msg["role"] == "system":
+                    prompt += f"System: {msg['content']}\n"
+                elif msg["role"] == "user":
+                    prompt += f"Human: {msg['content']}\n"
+                elif msg["role"] == "assistant":
+                    prompt += f"Assistant: {msg['content']}\n"
+            prompt += "Assistant:"
+
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+        # Generate the reply, with streaming if requested
+        if streamer:
+            generation_kwargs = {
+                **inputs,
+                "max_new_tokens": kwargs.get('max_new_tokens', 512),
+                "temperature": kwargs.get('temperature', 0.7),
+                "top_p": kwargs.get('top_p', 0.9),
+                "top_k": kwargs.get('top_k', 50),
+                "repetition_penalty": kwargs.get('repetition_penalty', 1.1),
+                "do_sample": True,
+                "pad_token_id": tokenizer.eos_token_id,
+                "streamer": streamer,
+            }
+
+            # Return the kwargs so the caller can run model.generate in a thread
+            return generation_kwargs
+        else:
+            # Regular generation without streaming
+            with torch.no_grad():
+                outputs = model.generate(
+                    **inputs,
+                    max_new_tokens=kwargs.get('max_new_tokens', 512),
+                    temperature=kwargs.get('temperature', 0.7),
+                    top_p=kwargs.get('top_p', 0.9),
+                    top_k=kwargs.get('top_k', 50),
+                    repetition_penalty=kwargs.get('repetition_penalty', 1.1),
+                    do_sample=True,
+                    pad_token_id=tokenizer.eos_token_id,
+                    return_dict_in_generate=True,
+                    output_scores=False,
+                )
+
+            response = tokenizer.decode(outputs.sequences[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+            return [{"generated_text": response}]
+
+    return chat_generate
+
+pipe = create_chat_pipeline(tokenizer, model)

 def format_conversation_history(chat_history):
     messages = []
 
     messages.extend(processed_history)
     messages.append(new_message)

+    # Use the custom ChatPipeline with streaming
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+    generation_kwargs = pipe(
         messages,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        repetition_penalty=repetition_penalty
     )

+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()

     # Stream the response
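
For context, the new code relies on the standard transformers threaded-streaming pattern: `model.generate` runs in a background thread while the caller iterates over a `TextIteratorStreamer` to receive decoded text as it is produced. The sketch below is a minimal, self-contained illustration of that pattern and is not part of the commit; the model id, prompt, and `max_new_tokens` value are placeholders, and in the Space the loop body would `yield` the accumulated text back to Gradio rather than print it.

```python
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder model id; the Space loads "anaspro/meta-llama-3.1-8b-inst-iraqi" instead.
model_id = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")

# Build the prompt with the model's chat template, as the new chat_generate helper does.
messages = [{"role": "user", "content": "Hello!"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# skip_prompt=True keeps the input prompt out of the streamed text.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until it finishes, so it runs in a background thread
# while this thread consumes decoded text from the streamer as it arrives.
thread = Thread(target=model.generate, kwargs={**inputs, "max_new_tokens": 256, "streamer": streamer})
thread.start()

partial = ""
for chunk in streamer:                 # yields decoded text pieces as generation progresses
    partial += chunk
    print(chunk, end="", flush=True)   # generate_response would instead `yield partial` to Gradio
thread.join()
```

Running generation in a worker thread is what lets the Gradio callback stay a simple generator that yields progressively longer strings, which is the behavior the `# Stream the response` section above depends on.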