Update app.py
Browse files
app.py
CHANGED
|
@@ -7,51 +7,45 @@ import json
|
|
| 7 |
import os
|
| 8 |
|
| 9 |
# =========================================
|
| 10 |
-
# 1.
|
| 11 |
# =========================================
|
| 12 |
|
| 13 |
-
#
|
| 14 |
SEED_TEXT = """
|
| 15 |
-
The Veda is
|
| 16 |
-
To know the self is to know the universe.
|
| 17 |
-
Truth is one; the wise call it by many names.
|
| 18 |
-
Action performed without attachment
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
""
|
| 45 |
-
|
| 46 |
-
# !!! HERE IS THE TRICK !!!
|
| 47 |
-
# We repeat this block 500 times.
|
| 48 |
-
# 500 words * 500 repeats = 250,000 words of training data.
|
| 49 |
-
VEDA_KNOWLEDGE = SEED_TEXT * 500
|
| 50 |
-
|
| 51 |
-
print(f"Total Training Data Generated: {len(VEDA_KNOWLEDGE)} characters.")
|
| 52 |
|
| 53 |
# =========================================
|
| 54 |
-
# 2. MODEL DEFINITION
|
| 55 |
# =========================================
|
| 56 |
@tf.keras.utils.register_keras_serializable()
|
| 57 |
class TokenAndPositionEmbedding(tf.keras.layers.Layer):
|
|
@@ -100,32 +94,28 @@ class TransformerBlock(tf.keras.layers.Layer):
|
|
| 100 |
return config
|
| 101 |
|
| 102 |
# =========================================
|
| 103 |
-
# 3.
|
| 104 |
# =========================================
|
| 105 |
-
|
| 106 |
-
text = VEDA_KNOWLEDGE
|
| 107 |
-
chars = sorted(list(set(text)))
|
| 108 |
vocab_size = len(chars)
|
| 109 |
char2idx = {c: i for i, c in enumerate(chars)}
|
| 110 |
idx2char = {i: c for i, c in enumerate(chars)}
|
| 111 |
-
all_ids = np.array([char2idx[c] for c in
|
| 112 |
|
| 113 |
# Hyperparameters
|
| 114 |
BATCH_SIZE = 32
|
| 115 |
BLOCK_SIZE = 128
|
| 116 |
-
EMBED_DIM = 128
|
| 117 |
NUM_HEADS = 4
|
| 118 |
FF_DIM = 256
|
| 119 |
NUM_LAYERS = 2
|
| 120 |
-
EPOCHS = 3
|
| 121 |
|
| 122 |
-
print("Building Dataset...")
|
| 123 |
dataset = tf.data.Dataset.from_tensor_slices(all_ids)
|
| 124 |
dataset = dataset.batch(BLOCK_SIZE + 1, drop_remainder=True)
|
| 125 |
dataset = dataset.map(lambda x: (x[:-1], x[1:]))
|
| 126 |
dataset = dataset.shuffle(1000).batch(BATCH_SIZE)
|
| 127 |
|
| 128 |
-
print("Compiling Model...")
|
| 129 |
inputs = layers.Input(shape=(BLOCK_SIZE,))
|
| 130 |
embedding_layer = TokenAndPositionEmbedding(BLOCK_SIZE, vocab_size, EMBED_DIM)
|
| 131 |
x = embedding_layer(inputs)
|
|
@@ -135,28 +125,32 @@ outputs = layers.Dense(vocab_size)(x)
|
|
| 135 |
model = keras.Model(inputs=inputs, outputs=outputs)
|
| 136 |
model.compile(optimizer="adam", loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True))
|
| 137 |
|
| 138 |
-
print("TRAINING
|
| 139 |
-
# We use a try/except block to ensure the app launches even if training is slow
|
| 140 |
try:
|
| 141 |
model.fit(dataset, epochs=EPOCHS)
|
| 142 |
print("Training Complete!")
|
| 143 |
except Exception as e:
|
| 144 |
-
print(f"Training
|
| 145 |
|
| 146 |
# =========================================
|
| 147 |
-
# 4. CHAT GENERATION
|
| 148 |
# =========================================
|
| 149 |
def generate_text(prompt, length=200):
|
| 150 |
try:
|
| 151 |
input_ids = [char2idx.get(s, 0) for s in prompt]
|
| 152 |
-
if not input_ids: return "Error: Unknown characters."
|
| 153 |
|
| 154 |
input_ids = tf.convert_to_tensor([input_ids], dtype=tf.int32)
|
| 155 |
block_size = 128
|
| 156 |
result = []
|
| 157 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
for _ in range(int(length)):
|
| 159 |
-
# Pad or Crop
|
| 160 |
current_len = tf.shape(input_ids)[1]
|
| 161 |
if current_len < block_size:
|
| 162 |
pad_amt = block_size - current_len
|
|
@@ -166,6 +160,12 @@ def generate_text(prompt, length=200):
|
|
| 166 |
|
| 167 |
predictions = model(padded)
|
| 168 |
predictions = predictions[:, -1, :]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
predicted_id = tf.random.categorical(predictions, num_samples=1)[0, 0].numpy()
|
| 170 |
|
| 171 |
input_ids = tf.concat([input_ids, [[predicted_id]]], axis=-1)
|
|
@@ -185,8 +185,8 @@ iface = gr.Interface(
|
|
| 185 |
gr.Slider(label="Length", minimum=10, maximum=500, value=200)
|
| 186 |
],
|
| 187 |
outputs="text",
|
| 188 |
-
title="Veda AI
|
| 189 |
-
description=f"Model trained
|
| 190 |
)
|
| 191 |
|
| 192 |
iface.launch()
|
|
|
|
| 7 |
import os
|
| 8 |
|
| 9 |
# =========================================
# 1. DATA LOADING
# =========================================

# Backup text generator (multiplies text to create 200k+ chars)
SEED_TEXT = """
The Veda is knowledge. Knowledge is power. Wisdom is the light.
To know the self is to know the universe.
Truth is one; the wise call it by many names.
Action performed without attachment leads to liberation.
Om Bhur Bhuva Swaha. Tat Savitur Varenyam.
Bhargo Devasya Dhimahi. Dhiyo Yo Nah Prachodayat.
""" * 1000

print("--- CHECKING FOR DATA ---")

final_text = ""
file_source = ""

# Try the user-supplied corpus under either capitalisation; fall back to
# the built-in seed text so the app always has something to train on.
# (Loop replaces two copy-pasted if/elif branches — behavior unchanged.)
for candidate in ("veda.txt", "Veda.txt"):
    if os.path.exists(candidate):
        print(f"✅ FOUND {candidate}! Loading file...")
        with open(candidate, "r", encoding="utf-8", errors="ignore") as f:
            final_text = f.read()
        file_source = candidate
        break
else:
    # for/else: runs only when no candidate file was found.
    print("⚠️ No file found. Using internal training data.")
    final_text = SEED_TEXT
    file_source = "Internal Data"

print(f"Training Source: {file_source}")
print(f"Total Characters: {len(final_text)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
# =========================================
|
| 48 |
+
# 2. MODEL DEFINITION
|
| 49 |
# =========================================
|
| 50 |
@tf.keras.utils.register_keras_serializable()
|
| 51 |
class TokenAndPositionEmbedding(tf.keras.layers.Layer):
|
|
|
|
| 94 |
return config
|
| 95 |
|
| 96 |
# =========================================
# 3. TRAINING
# =========================================
# Character-level vocabulary derived from the corpus itself.
# sorted() accepts any iterable, so the redundant list() wrapper is dropped.
chars = sorted(set(final_text))
vocab_size = len(chars)
# Forward and reverse lookup tables between characters and integer ids.
char2idx = {c: i for i, c in enumerate(chars)}
idx2char = dict(enumerate(chars))
# Encode the entire corpus as an integer id sequence for the model.
all_ids = np.array([char2idx[c] for c in final_text])
|
| 104 |
|
| 105 |
# Hyperparameters
BATCH_SIZE = 32    # training sequences per batch (used by dataset.batch below)
BLOCK_SIZE = 128   # context window: characters per training sequence
EMBED_DIM = 128    # token/position embedding width
NUM_HEADS = 4      # attention heads — consumed in the model-build section (not in view)
FF_DIM = 256       # feed-forward width — consumed in the model-build section (not in view)
NUM_LAYERS = 2     # presumably the TransformerBlock count; usage not in view — confirm
EPOCHS = 3         # passes over the dataset in model.fit (kept small for startup time)
|
| 113 |
|
|
|
|
| 114 |
# Build (input, target) training pairs: each window of BLOCK_SIZE + 1
# consecutive character ids yields the first BLOCK_SIZE as the input and
# the one-step-shifted remainder as the prediction target. Windows are
# shuffled (buffer of 1000) and grouped into batches of BATCH_SIZE.
dataset = (
    tf.data.Dataset.from_tensor_slices(all_ids)
    .batch(BLOCK_SIZE + 1, drop_remainder=True)
    .map(lambda window: (window[:-1], window[1:]))
    .shuffle(1000)
    .batch(BATCH_SIZE)
)
|
| 118 |
|
|
|
|
| 119 |
# Model input: a fixed-length window of BLOCK_SIZE token ids.
inputs = layers.Input(shape=(BLOCK_SIZE,))
# Combined token + position embedding (TokenAndPositionEmbedding, section 2).
embedding_layer = TokenAndPositionEmbedding(BLOCK_SIZE, vocab_size, EMBED_DIM)
x = embedding_layer(inputs)
|
|
|
|
| 125 |
model = keras.Model(inputs=inputs, outputs=outputs)
# Final Dense layer emits raw logits (no softmax), hence from_logits=True;
# targets are integer character ids, hence the Sparse variant.
model.compile(optimizer="adam", loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True))
|
| 127 |
|
| 128 |
+
# No placeholders in the message, so a plain string literal (the f-prefix
# was spurious — flake8 F541); printed text is unchanged.
print("STARTING TRAINING...")
# Broad except is deliberate: the Gradio app should still launch even if
# training fails (e.g. resource limits on the hosting hardware).
try:
    model.fit(dataset, epochs=EPOCHS)
    print("Training Complete!")
except Exception as e:
    print(f"Training failed: {e}")
|
| 134 |
|
| 135 |
# =========================================
|
| 136 |
+
# 4. CHAT GENERATION (WITH TEMPERATURE FIX)
|
| 137 |
# =========================================
|
| 138 |
def generate_text(prompt, length=200):
|
| 139 |
try:
|
| 140 |
input_ids = [char2idx.get(s, 0) for s in prompt]
|
| 141 |
+
if not input_ids: return "Error: Unknown characters (not in training data)."
|
| 142 |
|
| 143 |
input_ids = tf.convert_to_tensor([input_ids], dtype=tf.int32)
|
| 144 |
block_size = 128
|
| 145 |
result = []
|
| 146 |
|
| 147 |
+
# Temperature controls randomness
|
| 148 |
+
# 1.0 = Standard
|
| 149 |
+
# 0.5 = More Focused / Less Gibberish
|
| 150 |
+
# 0.2 = Very Repetitive / Safe
|
| 151 |
+
temperature = 0.5
|
| 152 |
+
|
| 153 |
for _ in range(int(length)):
|
|
|
|
| 154 |
current_len = tf.shape(input_ids)[1]
|
| 155 |
if current_len < block_size:
|
| 156 |
pad_amt = block_size - current_len
|
|
|
|
| 160 |
|
| 161 |
predictions = model(padded)
|
| 162 |
predictions = predictions[:, -1, :]
|
| 163 |
+
|
| 164 |
+
# --- APPLY TEMPERATURE ---
|
| 165 |
+
# We divide logits by temperature.
|
| 166 |
+
# Small temp (<1) makes confidence peaks higher (sharper).
|
| 167 |
+
predictions = predictions / temperature
|
| 168 |
+
|
| 169 |
predicted_id = tf.random.categorical(predictions, num_samples=1)[0, 0].numpy()
|
| 170 |
|
| 171 |
input_ids = tf.concat([input_ids, [[predicted_id]]], axis=-1)
|
|
|
|
| 185 |
gr.Slider(label="Length", minimum=10, maximum=500, value=200)
|
| 186 |
],
|
| 187 |
outputs="text",
|
| 188 |
+
title="Veda AI",
|
| 189 |
+
description=f"Model trained on: {file_source} ({len(final_text)} characters)."
|
| 190 |
)
|
| 191 |
|
| 192 |
iface.launch()
|