syempuna committed
Commit 3adc01b · verified · 1 parent: b09b794

Create chatbot_indonesian.py

Files changed (1):
  1. chatbot_indonesian.py +191 -0
chatbot_indonesian.py ADDED
@@ -0,0 +1,191 @@
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+ import torch
+
+ class IndonesianChatbot:
+     def __init__(self):
+         """Initialize the Indonesian chatbot with multiple model options."""
+         self.device = 0 if torch.cuda.is_available() else -1
+         self.models = {}
+         self.tokenizers = {}
+
+     def load_model(self, model_type="bahasa_gpt"):
+         """Load an Indonesian chatbot model by type."""
+
+         if model_type == "bahasa_gpt" and "bahasa_gpt" not in self.models:
+             # BahasaGPT - best suited for Indonesian chat (7B parameters)
+             model_name = "Bahasalab/BahasaGpt-chat"
+             self.tokenizers["bahasa_gpt"] = AutoTokenizer.from_pretrained(model_name)
+             self.models["bahasa_gpt"] = AutoModelForCausalLM.from_pretrained(
+                 model_name,
+                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                 device_map="auto" if torch.cuda.is_available() else None
+             )
+
+         elif model_type == "indo_gpt" and "indo_gpt" not in self.models:
+             # IndoGPT - a good alternative; note that indobart-v2 is a
+             # seq2seq (BART) model, so it needs the text2text-generation
+             # task rather than text-generation
+             model_name = "indolem/indobart-v2"
+             self.models["indo_gpt"] = pipeline(
+                 "text2text-generation",
+                 model=model_name,
+                 device=self.device,
+                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+             )
+
+         elif model_type == "sea_llm" and "sea_llm" not in self.models:
+             # SeaLLM - multilingual, including Indonesian
+             model_name = "SeaLLMs/SeaLLM-7B-v2-Chat"
+             self.tokenizers["sea_llm"] = AutoTokenizer.from_pretrained(model_name)
+             self.models["sea_llm"] = AutoModelForCausalLM.from_pretrained(
+                 model_name,
+                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                 device_map="auto" if torch.cuda.is_available() else None
+             )
+
+         elif model_type == "gemma_id" and "gemma_id" not in self.models:
+             # Gemma instruction-tuned model as a lightweight option
+             # (general-purpose, not Indonesian-specific)
+             model_name = "google/gemma-2b-it"
+             self.models["gemma_id"] = pipeline(
+                 "text-generation",
+                 model=model_name,
+                 device=self.device,
+                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+             )
+
+     def chat_with_bahasa_gpt(self, message, history=None, max_tokens=512, temperature=0.7):
+         """Chat using the BahasaGPT model."""
+         if "bahasa_gpt" not in self.models:
+             self.load_model("bahasa_gpt")
+
+         tokenizer = self.tokenizers["bahasa_gpt"]
+         model = self.models["bahasa_gpt"]
+
+         # Format the conversation history as alternating Human/Assistant turns
+         conversation = ""
+         if history:
+             for turn in history:
+                 role = turn.get("role", "user")
+                 content = turn.get("content", "")
+                 if role == "user":
+                     conversation += f"Human: {content}\n"
+                 elif role == "assistant":
+                     conversation += f"Assistant: {content}\n"
+
+         conversation += f"Human: {message}\nAssistant:"
+
+         # Tokenize and generate
+         inputs = tokenizer.encode(conversation, return_tensors="pt")
+         if torch.cuda.is_available():
+             inputs = inputs.to("cuda")
+
+         with torch.no_grad():
+             outputs = model.generate(
+                 inputs,
+                 max_new_tokens=max_tokens,
+                 temperature=temperature,
+                 do_sample=True,
+                 top_p=0.95,
+                 pad_token_id=tokenizer.eos_token_id
+             )
+
+         # Decode only the newly generated tokens so the prompt is not echoed back
+         response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
+         return response.strip()
+
+     def chat_with_sea_llm(self, message, history=None, max_tokens=512, temperature=0.7):
+         """Chat using the SeaLLM model."""
+         if "sea_llm" not in self.models:
+             self.load_model("sea_llm")
+
+         tokenizer = self.tokenizers["sea_llm"]
+         model = self.models["sea_llm"]
+
+         # Format the prompt for SeaLLM; the system message translates to
+         # "You are an AI assistant that helps in Indonesian."
+         system_message = "Kamu adalah asisten AI yang membantu dalam bahasa Indonesia."
+
+         conversation = f"<|system|>\n{system_message}\n"
+         if history:
+             for turn in history:
+                 role = turn.get("role", "user")
+                 content = turn.get("content", "")
+                 if role == "user":
+                     conversation += f"<|user|>\n{content}\n"
+                 elif role == "assistant":
+                     conversation += f"<|assistant|>\n{content}\n"
+
+         conversation += f"<|user|>\n{message}\n<|assistant|>\n"
+
+         inputs = tokenizer.encode(conversation, return_tensors="pt")
+         if torch.cuda.is_available():
+             inputs = inputs.to("cuda")
+
+         with torch.no_grad():
+             outputs = model.generate(
+                 inputs,
+                 max_new_tokens=max_tokens,
+                 temperature=temperature,
+                 do_sample=True,
+                 top_p=0.95,
+                 eos_token_id=tokenizer.eos_token_id
+             )
+
+         # Decode only the newly generated tokens so the prompt is not echoed back
+         response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
+         return response.strip()
+
+     def chat_with_pipeline(self, message, model_type="gemma_id", max_tokens=512, temperature=0.7):
+         """Chat using the pipeline-based models (single-turn, no history)."""
+         if model_type not in self.models:
+             self.load_model(model_type)
+
+         pipeline_model = self.models[model_type]
+
+         # "Pertanyaan"/"Jawaban" = "Question"/"Answer"
+         prompt = f"Pertanyaan: {message}\nJawaban:"
+
+         result = pipeline_model(
+             prompt,
+             max_new_tokens=max_tokens,
+             temperature=temperature,
+             do_sample=True,
+             top_p=0.95,
+             truncation=True
+         )
+
+         # text-generation pipelines echo the prompt; strip it from the output
+         response = result[0]['generated_text'].replace(prompt, "").strip()
+         return response
+
+ # Global chatbot instance
+ chatbot = IndonesianChatbot()
+
+ def chat_indonesian(message, history=None, system_message="", max_tokens=512, temperature=0.7, model_type="bahasa_gpt"):
+     """
+     Main chat function for the Indonesian chatbot.
+
+     Args:
+         message (str): User message
+         history (list): Conversation history as {"role", "content"} dicts
+         system_message (str): System prompt (accepted for API compatibility;
+             not currently forwarded to the underlying models)
+         max_tokens (int): Maximum number of tokens to generate
+         temperature (float): Sampling temperature
+         model_type (str): One of "bahasa_gpt", "sea_llm", "indo_gpt", "gemma_id"
+     """
+     try:
+         if model_type == "bahasa_gpt":
+             return chatbot.chat_with_bahasa_gpt(message, history, max_tokens, temperature)
+         elif model_type == "sea_llm":
+             return chatbot.chat_with_sea_llm(message, history, max_tokens, temperature)
+         elif model_type in ["indo_gpt", "gemma_id"]:
+             # Pipeline models are single-turn; history is not used here
+             return chatbot.chat_with_pipeline(message, model_type, max_tokens, temperature)
+         else:
+             # Fall back to BahasaGPT for unknown model types
+             return chatbot.chat_with_bahasa_gpt(message, history, max_tokens, temperature)
+
+     except Exception as e:
+         print(f"Chat error with {model_type}: {e}")
+         # "Sorry, an error occurred: ..."
+         return f"Maaf, terjadi kesalahan: {str(e)}"
+
+ # Wrapper for compatibility
+ def chat_simple(message):
+     """Simple wrapper for quick testing."""
+     return chat_indonesian(message, model_type="bahasa_gpt")
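
A minimal usage sketch, assuming the module is importable and the model weights can be downloaded from the Hugging Face Hub (the conversation strings are Indonesian, matching the chatbot's target language):

    from chatbot_indonesian import chat_indonesian, chat_simple

    # Quick single-turn check with the default BahasaGPT backend
    print(chat_simple("Halo, apa kabar?"))  # "Hello, how are you?"

    # Multi-turn call with explicit history; only the bahasa_gpt and
    # sea_llm backends make use of the history argument
    history = [
        {"role": "user", "content": "Siapa presiden pertama Indonesia?"},
        {"role": "assistant", "content": "Presiden pertama Indonesia adalah Soekarno."},
    ]
    print(chat_indonesian(
        "Kapan beliau menjabat?",  # "When did he hold office?"
        history=history,
        max_tokens=256,
        model_type="bahasa_gpt",
    ))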