vortexa64 committed on
Commit
800491f
·
verified ·
1 Parent(s): 34d2621

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -0
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import json
3
+ import gradio as gr
4
+
5
# === LOAD CORPUS & DATASET ===
# corpus.txt holds one training sentence per line.
with open("corpus.txt", "r", encoding="utf-8") as corpus_file:
    corpus = corpus_file.read().splitlines()

# dataset.json maps question -> answer strings (consumed as pairs by the
# fine-tuning loop below).
with open("dataset.json", "r", encoding="utf-8") as dataset_file:
    qa_data = json.load(dataset_file)
11
+
12
# === BUILD VOCAB ===
# Pool every text source so corpus, questions and answers share one vocab.
all_texts = corpus + list(qa_data.keys()) + list(qa_data.values())
# Fix 1: sorted() makes id assignment deterministic — the original
# list(set(...)) order changed with Python's per-process hash randomization,
# so weights/losses were not reproducible across runs despite the fixed seed.
# Fix 2: explicitly include the "<END>" sentinel: training appends it as a
# target, but unless it happened to appear in the raw text it was out of
# vocabulary (zero one-hot), so generation could never emit it and always
# ran to max_len.
vocab = sorted(set(" ".join(all_texts).split()) | {"<END>"})
word2id = {w: i for i, w in enumerate(vocab)}
id2word = {i: w for w, i in word2id.items()}
vocab_size = len(vocab)
18
+
19
def one_hot(word):
    """Return a one-hot vector of length vocab_size for *word*.

    Unknown words yield an all-zero vector rather than raising.
    """
    encoding = np.zeros(vocab_size)
    idx = word2id.get(word)
    if idx is not None:
        encoding[idx] = 1
    return encoding
24
+
25
# === MODEL PARAMETERS ===
lr = 0.05  # SGD learning rate
np.random.seed(42)  # fixed seed so weight initialization is reproducible
hidden_size = 512  # hidden-layer width (tunable)
# Small random init for the two linear layers: one-hot input -> hidden -> vocab logits.
W1 = np.random.randn(vocab_size, hidden_size) * 0.01
W2 = np.random.randn(hidden_size, vocab_size) * 0.01
31
+
32
def softmax(x):
    """Numerically stable softmax over a 1-D score vector."""
    shifted = x - np.max(x)  # subtract max to avoid exp overflow
    exps = np.exp(shifted)
    return exps / exps.sum()
35
+
36
def train_step(sentence):
    """Run one SGD pass over the consecutive word pairs of *sentence*.

    The model is a 2-layer linear network (one-hot -> hidden -> softmax)
    trained to predict the next word. Updates the module-level weight
    matrices W1/W2 in place and returns the sentence's accumulated loss.
    """
    global W1, W2
    words = sentence.split()
    loss = 0.0
    for i in range(len(words) - 1):
        x = one_hot(words[i])
        y_true = one_hot(words[i + 1])
        # forward pass
        h = np.dot(x, W1)
        o = np.dot(h, W2)
        y_pred = softmax(o)
        # Report cross-entropy, which is the objective whose gradient is
        # applied below. (The original reported MSE while optimizing
        # cross-entropy, so the printed loss didn't track training.)
        # Epsilon guards log(0); OOV targets give a zero y_true and loss 0.
        loss += -np.sum(y_true * np.log(y_pred + 1e-12))
        # backward pass: d(CE)/d(o) = y_pred - y_true for softmax + CE
        grad_o = y_pred - y_true
        dW2 = np.outer(h, grad_o)
        dW1 = np.outer(x, np.dot(W2, grad_o))
        W1 -= lr * dW1
        W2 -= lr * dW2
    return loss
53
+
54
# === PRETRAIN ON THE CORPUS ===
# Each line is terminated with the <END> sentinel so the model learns
# where sentences stop.
for epoch in range(200):
    total_loss = sum(train_step(line + " <END>") for line in corpus)
    if epoch % 50 == 0:
        print(f"Pretrain Epoch {epoch}, Loss: {total_loss:.4f}")
61
+
62
# === FINE-TUNE ON THE Q&A PAIRS ===
# Question and answer are joined into one sequence so the model learns to
# continue a question with its answer.
for epoch in range(200):
    total_loss = sum(train_step(q + " " + a) for q, a in qa_data.items())
    if epoch % 50 == 0:
        print(f"Finetune Epoch {epoch}, Loss: {total_loss:.4f}")
69
+
70
# === ANSWER GENERATION ===
def generate_reply(question, max_len=30):
    """Greedily generate a reply seeded by the first word of *question*.

    Feeds the argmax prediction back as the next input until "<END>" is
    produced or *max_len* words have been emitted. Returns the fallback
    message when the input is empty or its first word is unknown.
    """
    words = question.split()
    # Guard against empty/whitespace input: the original indexed words[0]
    # unconditionally and raised IndexError on an empty question.
    if not words or words[0] not in word2id:
        return "Maaf, aku belum ngerti kata itu 🥺"
    x = one_hot(words[0])
    reply = []
    for _ in range(max_len):
        h = np.dot(x, W1)
        o = np.dot(h, W2)
        y_pred = softmax(o)
        pred_id = int(np.argmax(y_pred))  # greedy decoding keeps output deterministic
        pred_word = id2word[pred_id]
        if pred_word == "<END>":
            break
        reply.append(pred_word)
        x = one_hot(pred_word)
    return " ".join(reply)
88
+
89
# === GRADIO INTERFACE ===
def chatbot(input_text):
    """Gradio callback: delegate the user's text to generate_reply."""
    reply = generate_reply(input_text)
    return reply
92
+
93
# Simple single-textbox UI wired to the chatbot callback.
demo = gr.Interface(
    fn=chatbot,
    inputs="text",
    outputs="text",
    title="Chatbot Numpy ala Cici 🤭",
    description="Mini chatbot dengan training 2 tahap: corpus + Q&A",
)

if __name__ == "__main__":
    demo.launch()