AGofficial committed on
Commit
767f47f
·
verified ·
1 Parent(s): 860f32a

Upload 8 files

Browse files
Files changed (8) hide show
  1. AgX_model.pkl +3 -0
  2. agx.py +15 -0
  3. agxpre.py +112 -0
  4. data.jsonl +0 -0
  5. data.py +17 -0
  6. data.txt +0 -0
  7. promptcleaner.py +11 -0
  8. turbo.py +188 -0
AgX_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4141f3157ab92bf86ea01d95ac057961231b4f856f820d9ca9c155ed56b6add
3
+ size 67263903
agx.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from agxpre import correct_grammar
2
+ from turbo import AgX
3
+ from promptcleaner import clean_prompt
4
+
5
def AskAGX(prompt):
    """Generate a grammar-corrected reply to *prompt*.

    Builds a fresh AgX engine per call (loading or training the model),
    asks it for a raw response, and passes that response through
    correct_grammar before returning it.
    """
    engine = AgX()
    raw_reply = engine.AskAgGPT8TURBO(prompt)
    return correct_grammar(raw_reply)
10
+
11
if __name__ == "__main__":
    # CLI entry point: sanitize the prompt, query the model, show the reply.
    user_prompt = clean_prompt(input("Enter your prompt: "))
    print(f"Response: {AskAGX(user_prompt)}")
agxpre.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def correct_grammar(text):
    """Apply lightweight, regex-based grammar clean-ups to ``text``.

    Pipeline (in order): restore missing apostrophes in contractions,
    capitalize sentence starts, normalize spacing around punctuation,
    add possessive apostrophes before possession nouns, fix common
    misspellings, convert single-quoted spans to double quotes, patch
    simple subject-verb disagreements, guarantee terminal punctuation,
    and collapse runs of whitespace.

    Returns "" for empty or non-string input.
    """
    import re

    # Guard clause: reject None / non-str / empty input up front.
    if not text or not isinstance(text, str):
        return ""

    # Apostrophe-less spellings -> proper contractions.
    # NOTE(review): "its" -> "it's" is applied unconditionally, so a
    # legitimate possessive "its" is also rewritten — kept from the
    # original design; confirm this is intended.
    contractions = {
        "cant": "can't",
        "wont": "won't",
        "dont": "don't",
        "doesnt": "doesn't",
        "didnt": "didn't",
        "isnt": "isn't",
        "arent": "aren't",
        "wasnt": "wasn't",
        "werent": "weren't",
        "havent": "haven't",
        "hasnt": "hasn't",
        "hadnt": "hadn't",
        "im": "I'm",
        "youre": "you're",
        "hes": "he's",
        "shes": "she's",
        "its": "it's",
        "theyre": "they're",
        "ive": "I've",
        "weve": "we've",
        "youve": "you've",
        "wouldve": "would've",
        "couldve": "could've",
        "shouldve": "should've",
        "thats": "that's",
        "theres": "there's",
        "heres": "here's",
        "whos": "who's",
        "whats": "what's",
        "wheres": "where's",
        "whens": "when's",
        "whys": "why's",
        "hows": "how's"
    }

    def fix_contractions(text):
        # Whole-word, case-insensitive replacement of each misspelling.
        for wrong, correct in contractions.items():
            text = re.sub(r'\b' + wrong + r'\b', correct, text, flags=re.IGNORECASE)
        return text

    def fix_capitalization(text):
        # Uppercase the first character of every sentence.
        sentences = re.split(r'(?<=[.!?])\s+', text.strip())
        sentences = [s[0].upper() + s[1:] if s else s for s in sentences]
        return ' '.join(sentences)

    def fix_punctuation_spacing(text):
        # No space before punctuation; one space between consecutive
        # punctuation marks; normalize spacing around parentheses.
        text = re.sub(r'\s+([,.!?;])', r'\1', text)
        text = re.sub(r'([,.!?;])\s*([,.!?;])', r'\1 \2', text)
        text = re.sub(r'\s*([\(])\s*', r' \1', text)
        text = re.sub(r'\s*([\)])', r'\1', text)
        return text

    def fix_possessives(text):
        # Add an apostrophe only when a trailing-s word is directly followed
        # by a noun that strongly implies possession. The previous blanket
        # rule (\w+)s -> \1's corrupted every plural ("cats" -> "cat's").
        skip = {'his', 'its', 'this', 'hers', 'ours', 'yours', 'theirs',
                'is', 'was', 'has', 'does', 'as', 'us'}

        def repl(match):
            word = match.group(1) + 's'
            if word.lower() in skip:
                # Pronouns/verbs ending in s are never possessive nouns.
                return match.group(0)
            return f"{match.group(1)}'s {match.group(2)}"

        return re.sub(
            r"\b(\w+)s\s+(own|house|car|book|dog|cat|child|children)\b",
            repl,
            text,
        )

    def fix_common_mistakes(text):
        # Only unambiguous misspellings. The previous table also swapped
        # correct words wholesale ("to" -> "too", "your" -> "you're",
        # "there" -> "their"), which injected errors instead of fixing them.
        misspellings = {
            "alot": "a lot",
            "wierd": "weird",
            "definately": "definitely",
            "seperate": "separate"
        }
        for wrong, correct in misspellings.items():
            text = re.sub(r'\b' + wrong + r'\b', correct, text, flags=re.IGNORECASE)
        return text

    def ensure_sentence_ends(text):
        # Guarantee terminal punctuation.
        if text and text[-1] not in '.!?':
            text += '.'
        return text

    def remove_extra_spaces(text):
        # Collapse whitespace runs to single spaces and trim the ends.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def fix_quotation_marks(text):
        # Convert 'single-quoted' spans to "double quotes". Word-boundary
        # guards keep apostrophes inside contractions (can't, don't) from
        # being misread as quote delimiters, and the previous no-op rule
        # ("..." -> "...") has been dropped.
        return re.sub(r"(?<!\w)'([^']*)'(?!\w)", r'"\1"', text)

    def fix_subject_verb_agreement(text):
        # he/she/it + base verb -> third-person singular ("he run" ->
        # "he runs", "she go" -> "she goes"). The old replacement appended
        # "es" to every verb, producing "runes"/"jumpes".
        def third_person(match):
            verb = match.group(2)
            suffix = 'es' if verb.lower() == 'go' else 's'
            return f"{match.group(1)} {verb}{suffix}"

        # I/you/we/they + inflected verb -> base form ("they goes" ->
        # "they go"). The old rule rewrote the match to itself (a no-op).
        def base_form(match):
            verb = match.group(2)
            base = verb[:-2] if verb.lower() == 'goes' else verb[:-1]
            return f"{match.group(1)} {base}"

        text = re.sub(r'\b(he|she|it)\s+(go|run|jump|sing|dance)\b',
                      third_person, text, flags=re.IGNORECASE)
        text = re.sub(r'\b(I|you|we|they)\s+(goes|runs|jumps|sings|dances)\b',
                      base_form, text, flags=re.IGNORECASE)
        return text

    text = fix_contractions(text)
    text = fix_capitalization(text)
    text = fix_punctuation_spacing(text)
    text = fix_possessives(text)
    text = fix_common_mistakes(text)
    text = fix_quotation_marks(text)
    text = fix_subject_verb_agreement(text)
    text = ensure_sentence_ends(text)
    text = remove_extra_spaces(text)

    return text
data.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

# Build the training corpus: the raw text file followed by one
# "User/AI" exchange per valid JSONL record.
# Explicit UTF-8 avoids platform-dependent default encodings
# (e.g. cp1252 on Windows) mangling or rejecting the data.
with open('data.txt', 'r', encoding='utf-8') as file:
    corpus = file.read()

lines = []
with open('data.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # Best-effort parse: skip malformed lines rather than abort.
        try:
            data = json.loads(line.strip())
        except json.JSONDecodeError:
            continue
        # A line may hold valid JSON that is not an object (e.g. "123");
        # previously that raised an uncaught AttributeError on .get().
        if not isinstance(data, dict):
            continue
        ai_response = data.get('text', '')
        url = data.get('url', '')
        lines.append(f"User: {url}\nAI: {ai_response}\n<|endoftext|>\n")

corpus += ''.join(lines)
data.txt ADDED
The diff for this file is too large to render. See raw diff
 
promptcleaner.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def clean_prompt(prompt):
    """Normalize a prompt for the model.

    Lowercases the text and strips every character that is not a
    lowercase letter, a digit, or whitespace.
    """
    lowered = prompt.lower()
    return re.sub(r'[^a-z0-9\s]', '', lowered)
7
+
8
if __name__ == "__main__":
    # Quick manual check: read a prompt and show its cleaned form.
    raw = input("Enter your prompt: ")
    print(f"Cleaned Prompt: {clean_prompt(raw)}")
turbo.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import random
3
+ import re
4
+ import pickle
5
+ import os
6
+ from data import corpus
7
+ from agxpre import correct_grammar
8
+
9
class AgX:
    """Console chatbot built on word n-gram models (orders 1..5).

    A decorative transformer-style pass (random embeddings, positional
    encoding, self-attention, feed-forward) runs on every prediction but
    its output is discarded; the next word always comes from the n-gram
    models. Trained models are cached on disk via pickle.
    """

    def __init__(self, model_name='AgX-2', max_length=10000, model_file='AgX_model.pkl'):
        self.ModelName = model_name
        self.max_length = max_length  # maximum number of words generated per reply
        self.user = 'user'            # speaker tags used in the prompt template
        self.ai = 'ai'
        self.minNgram = 1             # smallest n-gram order consulted
        self.maxNgram = 5             # largest n-gram order consulted
        # ANSI colour escape codes for terminal output.
        self.RED = '\033[91m'
        self.GREEN = '\033[92m'
        self.BLUE = '\033[94m'
        self.RESET = '\033[0m'
        self.model_file = model_file
        # Reuse a pickled model when available; otherwise train on `corpus`
        # (imported from data.py) and persist the result.
        # NOTE(review): pickle.load executes arbitrary code from the file —
        # never load a model file from an untrusted source.
        if os.path.exists(self.model_file):
            print(f'{self.RED}Loading saved model from {self.model_file}...{self.RESET}')
            self.ngram_models = self.load_model()
        else:
            print(f'{self.RED}No saved model found. Training new model...{self.RESET}')
            self.ngram_models = self.train_model(corpus)
            self.save_model()

    def mat_mul(self, A, B):
        """Plain-Python matrix product A @ B (lists of row lists)."""
        result = []
        for i in range(len(A)):
            result.append([sum(A[i][k] * B[k][j] for k in range(len(B))) for j in range(len(B[0]))])
        return result

    def softmax(self, x):
        """Numerically stable softmax over a 1-D list of floats."""
        exp_x = [math.exp(v - max(x)) for v in x]
        sum_exp_x = sum(exp_x)
        return [e / sum_exp_x for e in exp_x]

    def self_attention(self, Q, K, V):
        """Dot-product attention (no 1/sqrt(d) scaling) over row lists."""
        scores = [[sum(Q[i][idx] * K[j][idx] for idx in range(len(Q[i]))) for j in range(len(K))] for i in range(len(Q))]
        attention_weights = [self.softmax(row) for row in scores]
        output = [[sum(attention_weights[i][k] * V[k][j] for k in range(len(V))) for j in range(len(V[0]))] for i in range(len(V))]
        return output

    def multi_head_attention(self, Q, K, V, num_heads):
        """Split Q/K/V column-wise into heads and run attention per head.

        NOTE(review): head outputs are appended row-wise (extend), not
        concatenated along the feature axis as in a standard transformer;
        harmless here because the result is ultimately discarded.
        """
        d_model = len(Q[0])
        head_size = d_model // num_heads
        outputs = []
        for head in range(num_heads):
            q_head = [row[head * head_size:(head + 1) * head_size] for row in Q]
            k_head = [row[head * head_size:(head + 1) * head_size] for row in K]
            v_head = [row[head * head_size:(head + 1) * head_size] for row in V]
            attention_output = self.self_attention(q_head, k_head, v_head)
            outputs.extend(attention_output)
        return outputs

    def positional_encoding(self, seq_len, d_model):
        """Sinusoidal positional encodings: sin on even dims, cos on odd."""
        encoding = [[math.sin(pos / (10000 ** (i / d_model))) if i % 2 == 0 else math.cos(pos / (10000 ** (i / d_model))) for i in range(d_model)] for pos in range(seq_len)]
        return encoding

    def add_positional_encoding(self, embeddings, positional_encodings):
        """Element-wise sum of token embeddings and positional encodings."""
        return [[val + positional_encodings[i][j] for j, val in enumerate(row)] for i, row in enumerate(embeddings)]

    def feed_forward_network(self, x):
        """Two-layer ReLU feed-forward pass with fixed (untrained) weights.

        W1 is an identity-like matrix, W2 is all ones; purely decorative.
        """
        input_dim = len(x[0])
        hidden_dim = 10
        output_dim = 10
        W1 = [[1 if i == j else 0 for j in range(hidden_dim)] for i in range(input_dim)]
        b1 = [0] * hidden_dim
        W2 = [[1 for _ in range(output_dim)] for _ in range(hidden_dim)]
        b2 = [0] * output_dim
        hidden = [[max(0, sum(x[i][k] * W1[k][j] for k in range(len(W1))) + b1[j]) for j in range(hidden_dim)] for i in range(len(x))]
        output = [[sum(hidden[i][k] * W2[k][j] for k in range(len(W2))) + b2[j] for j in range(output_dim)] for i in range(len(hidden))]
        return output

    def tokenize(self, text):
        """Lowercase and split on whitespace."""
        return text.lower().split()

    def embed_tokens(self, tokens):
        """Random 3-dim embedding per token (non-deterministic, decorative)."""
        return [[random.random() for _ in range(3)] for _ in tokens]

    def build_ngram_models(self, corpus, min_n=1, max_n=5):
        """Build {f"{n}gram_model": {context: [next_word, ...]}} per order n.

        For order n the context is the n-1 preceding words joined by
        spaces; for unigrams the context is the empty string.
        """
        ngram_models = {}
        words = self.tokenize(corpus)
        for n in range(min_n, max_n + 1):
            model = {}
            # range(... - n + 1) so the final n-gram of the corpus is
            # included (the previous "- n" bound dropped it).
            for i in range(len(words) - n + 1):
                context = ' '.join(words[i:i+n-1])
                next_word = words[i+n-1]
                if context not in model:
                    model[context] = []
                model[context].append(next_word)
            ngram_models[f"{n}gram_model"] = model
        return ngram_models

    def predict_next_word(self, text, models):
        """Back off from the largest to the smallest n-gram order.

        Returns a random continuation for the first matching context, or
        '' when no model has seen the current context.
        """
        words = self.tokenize(text)
        for n in range(self.maxNgram, self.minNgram - 1, -1):
            if len(words) >= n - 1:
                # The unigram context is the empty string. The previous
                # code used words[-(n-1):] unconditionally, and for n == 1
                # words[-0:] is the WHOLE list, so the trained '' key never
                # matched and the unigram fallback was dead (fixed here).
                context = ' '.join(words[-(n-1):]) if n > 1 else ''
                model = models.get(f"{n}gram_model", {})
                if context in model:
                    return random.choice(model[context])
        return ''

    def predict_next_word_with_attention(self, text):
        """Run the decorative attention pipeline, then predict via n-grams.

        The attention / feed-forward results are computed and discarded;
        only the n-gram prediction is returned.
        """
        tokens = self.tokenize(text)
        d_model = 3
        embeddings = self.embed_tokens(tokens)
        positional_encodings = self.positional_encoding(len(tokens), d_model)
        encoded_embeddings = self.add_positional_encoding(embeddings, positional_encodings)
        num_heads = 1 if len(tokens) > 25 else max(1, len(tokens))
        attention_output = self.multi_head_attention(encoded_embeddings, encoded_embeddings, encoded_embeddings, num_heads)
        ff_output = self.feed_forward_network(attention_output)
        ngram_prediction = self.predict_next_word(text, self.ngram_models)
        return ngram_prediction

    def clean_user_input(self, text):
        """Lowercase user input before generation."""
        return text.lower()

    def print_progress(self, progress, total):
        """Draw an in-place ASCII progress bar on the current line."""
        percent = (progress / total) * 100
        bar_length = 40
        filled_length = int(bar_length * progress // total)
        bar = '|' * filled_length + '-' * (bar_length - filled_length)
        print(f'{self.RED}\r[{bar}] {percent:.2f}% Complete{self.RESET}', end='')

    def train_model(self, corpus):
        """Clean the corpus (strip newlines and sentence punctuation) and
        build the n-gram models, reporting coarse progress."""
        print(f'{self.RED}\nTraining for {self.ModelName} has begun.{self.RESET}')
        cleaned_corpus = re.sub(r'[\r\n]+', ' ', corpus.strip())
        self.print_progress(0, 3)
        cleaned_corpus = re.sub(r'[.,!?]', '', cleaned_corpus)
        self.print_progress(1, 3)
        ngram_models = self.build_ngram_models(cleaned_corpus)
        self.print_progress(2, 3)
        self.print_progress(3, 3)
        print(f'{self.RED}\nTraining complete.{self.RESET}')
        return ngram_models

    def save_model(self):
        """Pickle the trained n-gram models to self.model_file."""
        with open(self.model_file, 'wb') as f:
            pickle.dump(self.ngram_models, f)
        print(f'{self.RED}Model saved to {self.model_file}{self.RESET}')

    def load_model(self):
        """Unpickle and return the n-gram models from self.model_file."""
        with open(self.model_file, 'rb') as f:
            return pickle.load(f)

    def predict_sentence_with_attention(self, input_text, output_length):
        """Greedily extend the prompt one word at a time.

        Stops at the '<|endoftext|>' marker or after output_length words,
        then strips the original prompt from the front of the result.
        """
        cleaned_input = self.clean_user_input(input_text)
        sentence = cleaned_input
        for _ in range(output_length):
            prediction = self.predict_next_word_with_attention(sentence)
            if prediction == '<|endoftext|>':
                break
            sentence += ' ' + prediction
        if cleaned_input in sentence:
            sentence = sentence.replace(cleaned_input, '', 1).strip()
        return sentence

    def remove_duplicates(self, text):
        """Drop repeated words, keeping first occurrences in order."""
        words = text.split()
        unique_words = list(dict.fromkeys(words))
        return ' '.join(unique_words)

    def AskAgGPT8TURBO(self, input_text):
        """Generate a reply: wrap the input in the user/ai template,
        generate, strip the speaker tags, and de-duplicate words."""
        input_text = str(input_text).lower()
        raw_response = self.predict_sentence_with_attention(self.user + ": " + input_text.lower() + "\n" + self.ai + ": ", self.max_length)
        raw_response = str(raw_response)
        response = raw_response.replace(self.user + ": ", "").replace(self.ai + ": ", "")
        response = self.remove_duplicates(response)
        return response

    def run(self):
        """Interactive REPL: read a line, generate, grammar-fix, print."""
        while True:
            input_text = input(f'{self.GREEN}\nType a message (type exit to leave): {self.RESET}')
            if input_text.lower() == 'exit':
                break
            print(f"{self.BLUE}{self.ModelName}: {self.RESET}", end="")
            response = self.AskAgGPT8TURBO(input_text)
            response = correct_grammar(response)
            print(f"{self.BLUE}{response}{self.RESET}")
185
+
186
if __name__ == "__main__":
    # Build (or load) the model and start the interactive chat loop.
    AgX().run()