vedaco committed on
Commit
511655e
·
verified ·
1 Parent(s): 1337380

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -57
app.py CHANGED
@@ -7,51 +7,45 @@ import json
7
  import os
8
 
9
  # =========================================
10
- # 1. THE KNOWLEDGE GENERATOR (1 Lakh+ Words)
11
  # =========================================
12
 
13
- # This is the "Seed" text. It contains the patterns and wisdom.
14
  SEED_TEXT = """
15
- The Veda is the breath of the eternal. Knowledge is the light that dispels the darkness of ignorance.
16
- To know the self is to know the universe. The microcosm reflects the macrocosm.
17
- Truth is one; the wise call it by many names. Ekam Sat Vipra Bahudha Vadanti.
18
- Action performed without attachment to the fruit leads to liberation. This is Karma Yoga.
19
- The mind is the cause of bondage, and the mind is the cause of liberation. Control the mind to find peace.
20
- Om Bhur Bhuva Swaha. Tat Savitur Varenyam. Bhargo Devasya Dhimahi. Dhiyo Yo Nah Prachodayat.
21
- Lead me from the unreal to the real. Lead me from darkness to light. Lead me from death to immortality.
22
- Asato Ma Sadgamaya. Tamaso Ma Jyotirgamaya. Mrityor Ma Amritam Gamaya.
23
- The universe is composed of energy and consciousness. Prakriti and Purusha dance together to create reality.
24
- Dharma is the foundation of the universe. That which supports is Dharma.
25
- Ahimsa Paramo Dharma. Non-violence is the highest duty.
26
- Speak the truth. Practice righteousness. Do not neglect the study of the self.
27
- The soul is neither born, nor does it die. It is eternal, ancient, and unchangeable.
28
- Weapons cannot cut the soul, fire cannot burn it, water cannot wet it, and wind cannot dry it.
29
- You are not the body, you are not the mind. You are the witness consciousness. Tat Tvam Asi - You Are That.
30
- Meditation is the journey from sound to silence, from movement to stillness.
31
- Yoga is the settling of the mind into silence. Yogas chitta vritti nirodhah.
32
- Compassion for all beings is the true sign of wisdom. See the divine in everyone.
33
- The sun does not shine there, nor the moon and the stars, nor these lightnings, and much less this fire.
34
- When He shines, everything shines after Him; by His light all this is lighted.
35
- Knowledge is power. Wisdom is freedom. Love is the bridge.
36
- Time is cyclical. Creation, preservation, and destruction are the rhythm of existence.
37
- Brahma creates, Vishnu preserves, Shiva transforms.
38
- Do your duty without fear. A hero is one who conquers his own senses.
39
- Peace, Peace, Peace. Om Shanti Shanti Shanti.
40
- The river flows into the ocean and becomes the ocean. The individual soul flows into the cosmic soul.
41
- Detach from the temporary to find the eternal.
42
- Happiness is within. Do not seek it outside.
43
- He who sees all beings in his own self, and his own self in all beings, loses all fear.
44
- """
45
-
46
- # !!! HERE IS THE TRICK !!!
47
- # We repeat this block 500 times.
48
- # 500 words * 500 repeats = 250,000 words of training data.
49
- VEDA_KNOWLEDGE = SEED_TEXT * 500
50
-
51
- print(f"Total Training Data Generated: {len(VEDA_KNOWLEDGE)} characters.")
52
 
53
  # =========================================
54
- # 2. MODEL DEFINITION (Fixed for HF Spaces)
55
  # =========================================
56
  @tf.keras.utils.register_keras_serializable()
57
  class TokenAndPositionEmbedding(tf.keras.layers.Layer):
@@ -100,32 +94,28 @@ class TransformerBlock(tf.keras.layers.Layer):
100
  return config
101
 
102
  # =========================================
103
- # 3. AUTOMATIC TRAINING (Runs on Startup)
104
  # =========================================
105
- print("Processing Knowledge...")
106
- text = VEDA_KNOWLEDGE
107
- chars = sorted(list(set(text)))
108
  vocab_size = len(chars)
109
  char2idx = {c: i for i, c in enumerate(chars)}
110
  idx2char = {i: c for i, c in enumerate(chars)}
111
- all_ids = np.array([char2idx[c] for c in text])
112
 
113
  # Hyperparameters
114
  BATCH_SIZE = 32
115
  BLOCK_SIZE = 128
116
- EMBED_DIM = 128 # Keep small for CPU speed
117
  NUM_HEADS = 4
118
  FF_DIM = 256
119
  NUM_LAYERS = 2
120
- EPOCHS = 3 # 3 loops over 250k words is PLENTY for a demo
121
 
122
- print("Building Dataset...")
123
  dataset = tf.data.Dataset.from_tensor_slices(all_ids)
124
  dataset = dataset.batch(BLOCK_SIZE + 1, drop_remainder=True)
125
  dataset = dataset.map(lambda x: (x[:-1], x[1:]))
126
  dataset = dataset.shuffle(1000).batch(BATCH_SIZE)
127
 
128
- print("Compiling Model...")
129
  inputs = layers.Input(shape=(BLOCK_SIZE,))
130
  embedding_layer = TokenAndPositionEmbedding(BLOCK_SIZE, vocab_size, EMBED_DIM)
131
  x = embedding_layer(inputs)
@@ -135,28 +125,32 @@ outputs = layers.Dense(vocab_size)(x)
135
  model = keras.Model(inputs=inputs, outputs=outputs)
136
  model.compile(optimizer="adam", loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True))
137
 
138
- print("TRAINING MODEL... (This might take 5-10 minutes)")
139
- # We use a try/except block to ensure the app launches even if training is slow
140
  try:
141
  model.fit(dataset, epochs=EPOCHS)
142
  print("Training Complete!")
143
  except Exception as e:
144
- print(f"Training interrupted or failed: {e}")
145
 
146
  # =========================================
147
- # 4. CHAT GENERATION
148
  # =========================================
149
  def generate_text(prompt, length=200):
150
  try:
151
  input_ids = [char2idx.get(s, 0) for s in prompt]
152
- if not input_ids: return "Error: Unknown characters."
153
 
154
  input_ids = tf.convert_to_tensor([input_ids], dtype=tf.int32)
155
  block_size = 128
156
  result = []
157
 
 
 
 
 
 
 
158
  for _ in range(int(length)):
159
- # Pad or Crop
160
  current_len = tf.shape(input_ids)[1]
161
  if current_len < block_size:
162
  pad_amt = block_size - current_len
@@ -166,6 +160,12 @@ def generate_text(prompt, length=200):
166
 
167
  predictions = model(padded)
168
  predictions = predictions[:, -1, :]
 
 
 
 
 
 
169
  predicted_id = tf.random.categorical(predictions, num_samples=1)[0, 0].numpy()
170
 
171
  input_ids = tf.concat([input_ids, [[predicted_id]]], axis=-1)
@@ -185,8 +185,8 @@ iface = gr.Interface(
185
  gr.Slider(label="Length", minimum=10, maximum=500, value=200)
186
  ],
187
  outputs="text",
188
- title="Veda AI (Massive Dataset)",
189
- description=f"Model trained automatically on ~250,000 words of Vedic wisdom."
190
  )
191
 
192
  iface.launch()
 
7
  import os
8
 
9
# =========================================
# 1. DATA LOADING
# =========================================

# Fallback corpus: the seed paragraph is repeated 1000 times so the
# character-level model still has 200k+ characters to train on when
# no external text file is present.
SEED_TEXT = """
The Veda is knowledge. Knowledge is power. Wisdom is the light.
To know the self is to know the universe.
Truth is one; the wise call it by many names.
Action performed without attachment leads to liberation.
Om Bhur Bhuva Swaha. Tat Savitur Varenyam.
Bhargo Devasya Dhimahi. Dhiyo Yo Nah Prachodayat.
""" * 1000

print("--- CHECKING FOR DATA ---")

# Look for an uploaded corpus file; both capitalizations are accepted.
# The previously duplicated if/elif branches are collapsed into one loop;
# the for/else runs the fallback only when no candidate file exists.
final_text = ""
file_source = ""
for _candidate in ("veda.txt", "Veda.txt"):
    if os.path.exists(_candidate):
        print(f"✅ FOUND {_candidate}! Loading file...")
        # errors="ignore" drops undecodable bytes rather than crashing on
        # a file that is not clean UTF-8.
        with open(_candidate, "r", encoding="utf-8", errors="ignore") as f:
            final_text = f.read()
        file_source = _candidate
        break
else:
    print("⚠️ No file found. Using internal training data.")
    final_text = SEED_TEXT
    file_source = "Internal Data"

print(f"Training Source: {file_source}")
print(f"Total Characters: {len(final_text)}")
 
 
 
 
 
 
46
 
47
  # =========================================
48
+ # 2. MODEL DEFINITION
49
  # =========================================
50
  @tf.keras.utils.register_keras_serializable()
51
  class TokenAndPositionEmbedding(tf.keras.layers.Layer):
 
94
  return config
95
 
96
# =========================================
# 3. TRAINING
# =========================================
# Build a character-level vocabulary from the training text plus the
# lookup tables used for encoding (char -> id) and decoding (id -> char).
chars = sorted(set(final_text))  # sorted() accepts any iterable; the inner list() was redundant
vocab_size = len(chars)
char2idx = {c: i for i, c in enumerate(chars)}
idx2char = {i: c for i, c in enumerate(chars)}
# Encode the entire corpus as one flat sequence of integer ids.
all_ids = np.array([char2idx[c] for c in final_text])

# Hyperparameters (deliberately small so training finishes on CPU)
BATCH_SIZE = 32
BLOCK_SIZE = 128   # context window length, in characters
EMBED_DIM = 128
NUM_HEADS = 4
FF_DIM = 256
NUM_LAYERS = 2
EPOCHS = 3

# Slice the id stream into windows of BLOCK_SIZE + 1, then split each
# window into (input, target) shifted by one position: the model learns
# to predict the next character at every step.
dataset = tf.data.Dataset.from_tensor_slices(all_ids)
dataset = dataset.batch(BLOCK_SIZE + 1, drop_remainder=True)
dataset = dataset.map(lambda x: (x[:-1], x[1:]))
dataset = dataset.shuffle(1000).batch(BATCH_SIZE)
118
 
 
119
  inputs = layers.Input(shape=(BLOCK_SIZE,))
120
  embedding_layer = TokenAndPositionEmbedding(BLOCK_SIZE, vocab_size, EMBED_DIM)
121
  x = embedding_layer(inputs)
 
125
  model = keras.Model(inputs=inputs, outputs=outputs)
126
  model.compile(optimizer="adam", loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True))
127
 
128
+ print(f"STARTING TRAINING...")
 
129
  try:
130
  model.fit(dataset, epochs=EPOCHS)
131
  print("Training Complete!")
132
  except Exception as e:
133
+ print(f"Training failed: {e}")
134
 
135
  # =========================================
136
+ # 4. CHAT GENERATION (WITH TEMPERATURE FIX)
137
  # =========================================
138
  def generate_text(prompt, length=200):
139
  try:
140
  input_ids = [char2idx.get(s, 0) for s in prompt]
141
+ if not input_ids: return "Error: Unknown characters (not in training data)."
142
 
143
  input_ids = tf.convert_to_tensor([input_ids], dtype=tf.int32)
144
  block_size = 128
145
  result = []
146
 
147
+ # Temperature controls randomness
148
+ # 1.0 = Standard
149
+ # 0.5 = More Focused / Less Gibberish
150
+ # 0.2 = Very Repetitive / Safe
151
+ temperature = 0.5
152
+
153
  for _ in range(int(length)):
 
154
  current_len = tf.shape(input_ids)[1]
155
  if current_len < block_size:
156
  pad_amt = block_size - current_len
 
160
 
161
  predictions = model(padded)
162
  predictions = predictions[:, -1, :]
163
+
164
+ # --- APPLY TEMPERATURE ---
165
+ # We divide logits by temperature.
166
+ # Small temp (<1) makes confidence peaks higher (sharper).
167
+ predictions = predictions / temperature
168
+
169
  predicted_id = tf.random.categorical(predictions, num_samples=1)[0, 0].numpy()
170
 
171
  input_ids = tf.concat([input_ids, [[predicted_id]]], axis=-1)
 
185
  gr.Slider(label="Length", minimum=10, maximum=500, value=200)
186
  ],
187
  outputs="text",
188
+ title="Veda AI",
189
+ description=f"Model trained on: {file_source} ({len(final_text)} characters)."
190
  )
191
 
192
  iface.launch()