Update app.py
Browse files
app.py
CHANGED
|
@@ -7,51 +7,45 @@ import json
|
|
| 7 |
import os
|
| 8 |
|
| 9 |
# =========================================
|
| 10 |
-
# 1.
|
| 11 |
# =========================================
|
| 12 |
|
| 13 |
-
#
|
| 14 |
SEED_TEXT = """
|
| 15 |
-
The Veda is
|
| 16 |
-
To know the self is to know the universe.
|
| 17 |
-
Truth is one; the wise call it by many names.
|
| 18 |
-
Action performed without attachment
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
""
|
| 45 |
-
|
| 46 |
-
# !!! HERE IS THE TRICK !!!
|
| 47 |
-
# We repeat this block 500 times.
|
| 48 |
-
# 500 words * 500 repeats = 250,000 words of training data.
|
| 49 |
-
VEDA_KNOWLEDGE = SEED_TEXT * 500
|
| 50 |
-
|
| 51 |
-
print(f"Total Training Data Generated: {len(VEDA_KNOWLEDGE)} characters.")
|
| 52 |
|
| 53 |
# =========================================
|
| 54 |
-
# 2. MODEL DEFINITION
|
| 55 |
# =========================================
|
| 56 |
@tf.keras.utils.register_keras_serializable()
|
| 57 |
class TokenAndPositionEmbedding(tf.keras.layers.Layer):
|
|
@@ -100,32 +94,28 @@ class TransformerBlock(tf.keras.layers.Layer):
|
|
| 100 |
return config
|
| 101 |
|
| 102 |
# =========================================
|
| 103 |
-
# 3.
|
| 104 |
# =========================================
|
| 105 |
-
|
| 106 |
-
text = VEDA_KNOWLEDGE
|
| 107 |
-
chars = sorted(list(set(text)))
|
| 108 |
vocab_size = len(chars)
|
| 109 |
char2idx = {c: i for i, c in enumerate(chars)}
|
| 110 |
idx2char = {i: c for i, c in enumerate(chars)}
|
| 111 |
-
all_ids = np.array([char2idx[c] for c in
|
| 112 |
|
| 113 |
# Hyperparameters
|
| 114 |
BATCH_SIZE = 32
|
| 115 |
BLOCK_SIZE = 128
|
| 116 |
-
EMBED_DIM = 128
|
| 117 |
NUM_HEADS = 4
|
| 118 |
FF_DIM = 256
|
| 119 |
NUM_LAYERS = 2
|
| 120 |
-
EPOCHS = 3
|
| 121 |
|
| 122 |
-
print("Building Dataset...")
|
| 123 |
dataset = tf.data.Dataset.from_tensor_slices(all_ids)
|
| 124 |
dataset = dataset.batch(BLOCK_SIZE + 1, drop_remainder=True)
|
| 125 |
dataset = dataset.map(lambda x: (x[:-1], x[1:]))
|
| 126 |
dataset = dataset.shuffle(1000).batch(BATCH_SIZE)
|
| 127 |
|
| 128 |
-
print("Compiling Model...")
|
| 129 |
inputs = layers.Input(shape=(BLOCK_SIZE,))
|
| 130 |
embedding_layer = TokenAndPositionEmbedding(BLOCK_SIZE, vocab_size, EMBED_DIM)
|
| 131 |
x = embedding_layer(inputs)
|
|
@@ -135,28 +125,32 @@ outputs = layers.Dense(vocab_size)(x)
|
|
| 135 |
model = keras.Model(inputs=inputs, outputs=outputs)
|
| 136 |
model.compile(optimizer="adam", loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True))
|
| 137 |
|
| 138 |
-
print("TRAINING
|
| 139 |
-
# We use a try/except block to ensure the app launches even if training is slow
|
| 140 |
try:
|
| 141 |
model.fit(dataset, epochs=EPOCHS)
|
| 142 |
print("Training Complete!")
|
| 143 |
except Exception as e:
|
| 144 |
-
print(f"Training
|
| 145 |
|
| 146 |
# =========================================
|
| 147 |
-
# 4. CHAT GENERATION
|
| 148 |
# =========================================
|
| 149 |
def generate_text(prompt, length=200):
|
| 150 |
try:
|
| 151 |
input_ids = [char2idx.get(s, 0) for s in prompt]
|
| 152 |
-
if not input_ids: return "Error: Unknown characters."
|
| 153 |
|
| 154 |
input_ids = tf.convert_to_tensor([input_ids], dtype=tf.int32)
|
| 155 |
block_size = 128
|
| 156 |
result = []
|
| 157 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
for _ in range(int(length)):
|
| 159 |
-
# Pad or Crop
|
| 160 |
current_len = tf.shape(input_ids)[1]
|
| 161 |
if current_len < block_size:
|
| 162 |
pad_amt = block_size - current_len
|
|
@@ -166,6 +160,12 @@ def generate_text(prompt, length=200):
|
|
| 166 |
|
| 167 |
predictions = model(padded)
|
| 168 |
predictions = predictions[:, -1, :]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
predicted_id = tf.random.categorical(predictions, num_samples=1)[0, 0].numpy()
|
| 170 |
|
| 171 |
input_ids = tf.concat([input_ids, [[predicted_id]]], axis=-1)
|
|
@@ -185,8 +185,8 @@ iface = gr.Interface(
|
|
| 185 |
gr.Slider(label="Length", minimum=10, maximum=500, value=200)
|
| 186 |
],
|
| 187 |
outputs="text",
|
| 188 |
-
title="Veda AI
|
| 189 |
-
description=f"Model trained
|
| 190 |
)
|
| 191 |
|
| 192 |
iface.launch()
|
|
|
|
| 7 |
import os
|
| 8 |
|
| 9 |
# =========================================
# 1. DATA LOADING
# =========================================

# Backup text generator (multiplies text to create 200k+ chars)
SEED_TEXT = """
The Veda is knowledge. Knowledge is power. Wisdom is the light.
To know the self is to know the universe.
Truth is one; the wise call it by many names.
Action performed without attachment leads to liberation.
Om Bhur Bhuva Swaha. Tat Savitur Varenyam.
Bhargo Devasya Dhimahi. Dhiyo Yo Nah Prachodayat.
""" * 1000

print("--- CHECKING FOR DATA ---")

final_text = ""
file_source = ""

# Try the user-supplied corpus under either capitalisation; fall back to
# the built-in seed text so the app always has something to train on.
# (Loop replaces two copy-pasted if/elif branches — behavior unchanged.)
for candidate in ("veda.txt", "Veda.txt"):
    if os.path.exists(candidate):
        print(f"✅ FOUND {candidate}! Loading file...")
        with open(candidate, "r", encoding="utf-8", errors="ignore") as f:
            final_text = f.read()
        file_source = candidate
        break
else:
    # for/else: runs only when no candidate file was found.
    print("⚠️ No file found. Using internal training data.")
    final_text = SEED_TEXT
    file_source = "Internal Data"

print(f"Training Source: {file_source}")
print(f"Total Characters: {len(final_text)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
# =========================================
|
| 48 |
+
# 2. MODEL DEFINITION
|
| 49 |
# =========================================
|
| 50 |
@tf.keras.utils.register_keras_serializable()
|
| 51 |
class TokenAndPositionEmbedding(tf.keras.layers.Layer):
|
|
|
|
| 94 |
return config
|
| 95 |
|
| 96 |
# =========================================
# 3. TRAINING
# =========================================
# Character-level vocabulary derived from the corpus itself.
# sorted() accepts any iterable, so the redundant list() wrapper is dropped.
chars = sorted(set(final_text))
vocab_size = len(chars)
# Forward and reverse lookup tables between characters and integer ids.
char2idx = {c: i for i, c in enumerate(chars)}
idx2char = dict(enumerate(chars))
# Encode the entire corpus as an integer id sequence for the model.
all_ids = np.array([char2idx[c] for c in final_text])
|
| 104 |
|
| 105 |
# Hyperparameters
BATCH_SIZE = 32    # training sequences per batch (used by dataset.batch below)
BLOCK_SIZE = 128   # context window: characters per training sequence
EMBED_DIM = 128    # token/position embedding width
NUM_HEADS = 4      # attention heads — consumed in the model-build section (not in view)
FF_DIM = 256       # feed-forward width — consumed in the model-build section (not in view)
NUM_LAYERS = 2     # presumably the TransformerBlock count; usage not in view — confirm
EPOCHS = 3         # passes over the dataset in model.fit (kept small for startup time)
|
| 113 |
|
|
|
|
| 114 |
# Build (input, target) training pairs: each window of BLOCK_SIZE + 1
# consecutive character ids yields the first BLOCK_SIZE as the input and
# the one-step-shifted remainder as the prediction target. Windows are
# shuffled (buffer of 1000) and grouped into batches of BATCH_SIZE.
dataset = (
    tf.data.Dataset.from_tensor_slices(all_ids)
    .batch(BLOCK_SIZE + 1, drop_remainder=True)
    .map(lambda window: (window[:-1], window[1:]))
    .shuffle(1000)
    .batch(BATCH_SIZE)
)
|
| 118 |
|
|
|
|
| 119 |
# Model input: a fixed-length window of BLOCK_SIZE token ids.
inputs = layers.Input(shape=(BLOCK_SIZE,))
# Combined token + position embedding (TokenAndPositionEmbedding, section 2).
embedding_layer = TokenAndPositionEmbedding(BLOCK_SIZE, vocab_size, EMBED_DIM)
x = embedding_layer(inputs)
|
|
|
|
| 125 |
model = keras.Model(inputs=inputs, outputs=outputs)
# Final Dense layer emits raw logits (no softmax), hence from_logits=True;
# targets are integer character ids, hence the Sparse variant.
model.compile(optimizer="adam", loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True))
|
| 127 |
|
| 128 |
+
# No placeholders in the message, so a plain string literal (the f-prefix
# was spurious — flake8 F541); printed text is unchanged.
print("STARTING TRAINING...")
# Broad except is deliberate: the Gradio app should still launch even if
# training fails (e.g. resource limits on the hosting hardware).
try:
    model.fit(dataset, epochs=EPOCHS)
    print("Training Complete!")
except Exception as e:
    print(f"Training failed: {e}")
|
| 134 |
|
| 135 |
# =========================================
|
| 136 |
+
# 4. CHAT GENERATION (WITH TEMPERATURE FIX)
|
| 137 |
# =========================================
|
| 138 |
def generate_text(prompt, length=200):
|
| 139 |
try:
|
| 140 |
input_ids = [char2idx.get(s, 0) for s in prompt]
|
| 141 |
+
if not input_ids: return "Error: Unknown characters (not in training data)."
|
| 142 |
|
| 143 |
input_ids = tf.convert_to_tensor([input_ids], dtype=tf.int32)
|
| 144 |
block_size = 128
|
| 145 |
result = []
|
| 146 |
|
| 147 |
+
# Temperature controls randomness
|
| 148 |
+
# 1.0 = Standard
|
| 149 |
+
# 0.5 = More Focused / Less Gibberish
|
| 150 |
+
# 0.2 = Very Repetitive / Safe
|
| 151 |
+
temperature = 0.5
|
| 152 |
+
|
| 153 |
for _ in range(int(length)):
|
|
|
|
| 154 |
current_len = tf.shape(input_ids)[1]
|
| 155 |
if current_len < block_size:
|
| 156 |
pad_amt = block_size - current_len
|
|
|
|
| 160 |
|
| 161 |
predictions = model(padded)
|
| 162 |
predictions = predictions[:, -1, :]
|
| 163 |
+
|
| 164 |
+
# --- APPLY TEMPERATURE ---
|
| 165 |
+
# We divide logits by temperature.
|
| 166 |
+
# Small temp (<1) makes confidence peaks higher (sharper).
|
| 167 |
+
predictions = predictions / temperature
|
| 168 |
+
|
| 169 |
predicted_id = tf.random.categorical(predictions, num_samples=1)[0, 0].numpy()
|
| 170 |
|
| 171 |
input_ids = tf.concat([input_ids, [[predicted_id]]], axis=-1)
|
|
|
|
| 185 |
gr.Slider(label="Length", minimum=10, maximum=500, value=200)
|
| 186 |
],
|
| 187 |
outputs="text",
|
| 188 |
+
title="Veda AI",
|
| 189 |
+
description=f"Model trained on: {file_source} ({len(final_text)} characters)."
|
| 190 |
)
|
| 191 |
|
| 192 |
iface.launch()
|