PredictiveManish committed on Feb 1

Commit

45bcb9b

verified ·

1 Parent(s): c828a9d

Upload folder using huggingface_hub

Browse files

Files changed (45) hide show

.gitattributes +9 -0
__pycache__/model_config.cpython-310.pyc +0 -0
checkpoints_fast/checkpoint-interrupted/config.json +32 -0
checkpoints_fast/checkpoint-interrupted/generation_config.json +8 -0
checkpoints_fast/checkpoint-interrupted/model.safetensors +3 -0
checkpoints_fast/checkpoint-interrupted/tokenizer/spiece.model +3 -0
checkpoints_tiny/final/config.json +32 -0
checkpoints_tiny/final/generation_config.json +7 -0
checkpoints_tiny/final/model.safetensors +3 -0
checkpoints_tiny/step1000/config.json +32 -0
checkpoints_tiny/step1000/generation_config.json +7 -0
checkpoints_tiny/step1000/model.safetensors +3 -0
checkpoints_tiny/step2000/config.json +32 -0
checkpoints_tiny/step2000/generation_config.json +7 -0
checkpoints_tiny/step2000/model.safetensors +3 -0
checkpoints_tiny/step3000/config.json +32 -0
checkpoints_tiny/step3000/generation_config.json +7 -0
checkpoints_tiny/step3000/model.safetensors +3 -0
checkpoints_tiny/step4000/config.json +32 -0
checkpoints_tiny/step4000/generation_config.json +7 -0
checkpoints_tiny/step4000/model.safetensors +3 -0
checkpoints_tiny/step5000/config.json +32 -0
checkpoints_tiny/step5000/generation_config.json +7 -0
checkpoints_tiny/step5000/model.safetensors +3 -0
data/en-hi.csv +3 -0
data/en-pa.csv +3 -0
data/extracted_sentences/en.txt +3 -0
data/extracted_sentences/en_hi_english.txt +3 -0
data/extracted_sentences/en_pa_english.txt +3 -0
data/extracted_sentences/extraction_summary.txt +13 -0
data/extracted_sentences/hi.txt +3 -0
data/extracted_sentences/pa.txt +3 -0
data/main.py +316 -0
evaluate_model.py +138 -0
final_corpus/multilingual_corpus.txt +3 -0
final_corpus/multilingual_corpus_train.txt +3 -0
final_corpus/multilingual_corpus_val.txt +0 -0
final_corpus/multilingual_spm.model +3 -0
final_corpus/multilingual_spm.vocab +0 -0
model_config.py +64 -0
model_demo.html +67 -0
preprocess.py +267 -0
test_model.py +418 -0
train_model.py +156 -0
web_interface.py +133 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/en-hi.csv filter=lfs diff=lfs merge=lfs -text
+data/en-pa.csv filter=lfs diff=lfs merge=lfs -text
+data/extracted_sentences/en.txt filter=lfs diff=lfs merge=lfs -text
+data/extracted_sentences/en_hi_english.txt filter=lfs diff=lfs merge=lfs -text
+data/extracted_sentences/en_pa_english.txt filter=lfs diff=lfs merge=lfs -text
+data/extracted_sentences/hi.txt filter=lfs diff=lfs merge=lfs -text
+data/extracted_sentences/pa.txt filter=lfs diff=lfs merge=lfs -text
+final_corpus/multilingual_corpus.txt filter=lfs diff=lfs merge=lfs -text
+final_corpus/multilingual_corpus_train.txt filter=lfs diff=lfs merge=lfs -text

__pycache__/model_config.cpython-310.pyc ADDED Viewed

Binary file (2.75 kB). View file

checkpoints_fast/checkpoint-interrupted/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 1,
+  "dtype": "float32",
+  "embd_pdrop": 0.1,
+  "eos_token_id": 2,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 384,
+  "n_head": 6,
+  "n_inner": 1024,
+  "n_layer": 6,
+  "n_positions": 128,
+  "pad_token_id": 0,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "transformers_version": "4.57.3",
+  "use_cache": false,
+  "vocab_size": 8000
+}

checkpoints_fast/checkpoint-interrupted/generation_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "transformers_version": "4.57.3",
+  "use_cache": false
+}

checkpoints_fast/checkpoint-interrupted/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4440a5199eda3c064840d077c625fa853e0e95c38ce76c705398f9cc31ac907d
+size 45632880

checkpoints_fast/checkpoint-interrupted/tokenizer/spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:faf8ae3d54cbc33b749cfff520a86c0e0cbc131ac949b233b8848cb1bf5fe940
+size 166057

checkpoints_tiny/final/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "dtype": "float32",
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 256,
+  "n_head": 4,
+  "n_inner": 512,
+  "n_layer": 4,
+  "n_positions": 128,
+  "pad_token_id": 0,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "transformers_version": "4.57.3",
+  "use_cache": true,
+  "vocab_size": 8000
+}

checkpoints_tiny/final/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "pad_token_id": 0,
+  "transformers_version": "4.57.3"
+}

checkpoints_tiny/final/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:585f9b64fb2ff0cc99c4c11d0b12135cd3473d9178fd12598fc7b1d218963678
+size 16763848

checkpoints_tiny/step1000/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "dtype": "float32",
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 256,
+  "n_head": 4,
+  "n_inner": 512,
+  "n_layer": 4,
+  "n_positions": 128,
+  "pad_token_id": 0,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "transformers_version": "4.57.3",
+  "use_cache": true,
+  "vocab_size": 8000
+}

checkpoints_tiny/step1000/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "pad_token_id": 0,
+  "transformers_version": "4.57.3"
+}

checkpoints_tiny/step1000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:146653a8e856d8db69d47936b0c0575f6022372a60e3ef54e3a0128fe59777d5
+size 16763848

checkpoints_tiny/step2000/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "dtype": "float32",
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 256,
+  "n_head": 4,
+  "n_inner": 512,
+  "n_layer": 4,
+  "n_positions": 128,
+  "pad_token_id": 0,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "transformers_version": "4.57.3",
+  "use_cache": true,
+  "vocab_size": 8000
+}

checkpoints_tiny/step2000/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "pad_token_id": 0,
+  "transformers_version": "4.57.3"
+}

checkpoints_tiny/step2000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2ff9197d428cfea94bca9ddf0af4fda7e340ba44e3106f10566db3fea86a31e2
+size 16763848

checkpoints_tiny/step3000/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "dtype": "float32",
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 256,
+  "n_head": 4,
+  "n_inner": 512,
+  "n_layer": 4,
+  "n_positions": 128,
+  "pad_token_id": 0,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "transformers_version": "4.57.3",
+  "use_cache": true,
+  "vocab_size": 8000
+}

checkpoints_tiny/step3000/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "pad_token_id": 0,
+  "transformers_version": "4.57.3"
+}

checkpoints_tiny/step3000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:14a8b4327bc025e10dee73541bccab2124ea65154954da8c1c76ff182520402f
+size 16763848

checkpoints_tiny/step4000/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "dtype": "float32",
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 256,
+  "n_head": 4,
+  "n_inner": 512,
+  "n_layer": 4,
+  "n_positions": 128,
+  "pad_token_id": 0,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "transformers_version": "4.57.3",
+  "use_cache": true,
+  "vocab_size": 8000
+}

checkpoints_tiny/step4000/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "pad_token_id": 0,
+  "transformers_version": "4.57.3"
+}

checkpoints_tiny/step4000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:22620971d56dee4ae04f7fd04bbca4f1809763fd76fcea14ece315033cc7fa5d
+size 16763848

checkpoints_tiny/step5000/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "dtype": "float32",
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 256,
+  "n_head": 4,
+  "n_inner": 512,
+  "n_layer": 4,
+  "n_positions": 128,
+  "pad_token_id": 0,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "transformers_version": "4.57.3",
+  "use_cache": true,
+  "vocab_size": 8000
+}

checkpoints_tiny/step5000/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "pad_token_id": 0,
+  "transformers_version": "4.57.3"
+}

checkpoints_tiny/step5000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:585f9b64fb2ff0cc99c4c11d0b12135cd3473d9178fd12598fc7b1d218963678
+size 16763848

data/en-hi.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c8f0a4024a9987812636856077835e435ae4c7fbcae541b6e7c84001de02f72
+size 444580427

data/en-pa.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:13d3e0a194847b100b9f817bbc767f1db4aba36006067f284917c3b8c4c295ac
+size 431640910

data/extracted_sentences/en.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:99f963e335c435f173545a5f6cf6ab0b5008e465f9b7744d4891ab4d637532f7
+size 28936803

data/extracted_sentences/en_hi_english.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:214ee59b38f22bb42db8c9dcde22cee3be97c1ad50973e7e84e80a5b05f324c9
+size 15095632

data/extracted_sentences/en_pa_english.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c8fce34736e406f8348179671100e5cf43e91a652905fd18dd8e0501b8b6e2bd
+size 13841171

data/extracted_sentences/extraction_summary.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+DATA EXTRACTION SUMMARY
+==================================================
+English-Hindi Dataset:
+  English sentences: 150,000
+  Hindi sentences: 300,000
+English-Punjabi Dataset:
+  English sentences: 150,000
+  Punjabi sentences: 300,000
+Combined English: 100,000
+Total corpus size: 900,000 sentences

data/extracted_sentences/hi.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:17658044b073e93c4705679f1d44239446350ddcfe00fdb7a2a8e27643c610df
+size 70698192

data/extracted_sentences/pa.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cdd25ab9d3a3f4793d1270d7446ecd835c3564b479a846e5f28ad1149086e824
+size 62188668

data/main.py ADDED Viewed

	@@ -0,0 +1,316 @@

+"""
+Script 1: Extract random sentences from EN-HI and EN-PA parallel files
+WITH PROGRESS BAR AND OPTIMIZATIONS
+"""
+import pandas as pd
+import random
+import ftfy
+from langdetect import detect, LangDetectException
+import re
+import numpy as np
+from pathlib import Path
+from tqdm import tqdm
+import time
+def clean_text(text):
+    """Normalize one raw sentence before filtering.
+    Runs ``ftfy.fix_text``, collapses every whitespace run to a single space
+    and removes ASCII control characters.
+    Args:
+        text: Raw cell value; anything that is not a ``str`` yields ``""``.
+    Returns:
+        The cleaned, stripped string, or ``""`` for non-strings and for the
+        literal ``'nan'`` (callers in this file stringify the column first,
+        which presumably is how missing cells arrive as ``'nan'``).
+    """
+    # A value that passed the isinstance test is a str and can never be a
+    # pandas NA, so no separate pd.isna() check is needed.
+    if not isinstance(text, str) or text == 'nan':
+        return ""
+    text = ftfy.fix_text(text)
+    text = re.sub(r'\s+', ' ', text)
+    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
+    # If a control character that sat between two spaces was removed above,
+    # a double space is left behind; squeeze spaces once more.
+    text = re.sub(r' {2,}', ' ', text)
+    return text.strip()
+def is_valid_sentence_fast(text, target_lang):
+    """Cheap sentence filter that never calls langdetect.
+    A sentence passes when it has at least 20 characters, between 5 and 50
+    whitespace-separated words, at least 7 distinct characters and, for the
+    known language codes, at least one character of the expected script.
+    Args:
+        text: Candidate sentence (falsy values are rejected).
+        target_lang: 'en', 'hi' or 'pa'; any other code skips the script check.
+    Returns:
+        True if the sentence passes every check, otherwise False.
+    """
+    if not text or len(text) < 20:
+        return False
+    word_count = len(text.split())
+    if not 5 <= word_count <= 50:
+        return False
+    # Rejects degenerate strings built from very few distinct characters.
+    if len(set(text)) < 7:
+        return False
+    # One representative character class per supported script.
+    script_patterns = {
+        'en': r'[a-zA-Z]',          # Latin
+        'hi': r'[\u0900-\u097F]',   # Devanagari
+        'pa': r'[\u0A00-\u0A7F]',   # Gurmukhi
+    }
+    pattern = script_patterns.get(target_lang)
+    return pattern is None or re.search(pattern, text) is not None
+def is_valid_sentence_with_lang(text, target_lang, use_fast=True):
+    """Validate a sentence, optionally consulting langdetect as well.
+    The heuristic filter always runs first. When ``use_fast`` is false the
+    detected language is checked leniently: an English target is rejected
+    only when the text is detected as hi/pa/mr/gu, and a Hindi or Punjabi
+    target is rejected only when it is detected as English. A detection
+    error never rejects the sentence.
+    Args:
+        text: Candidate sentence.
+        target_lang: Expected language code.
+        use_fast: When true, skip langdetect entirely.
+    Returns:
+        True if the sentence is accepted, otherwise False.
+    """
+    if not is_valid_sentence_fast(text, target_lang):
+        return False
+    if use_fast:
+        return True
+    try:
+        guess = detect(text)
+    except LangDetectException:
+        # Detection failed; keep the verdict of the heuristic filter.
+        return True
+    if target_lang == 'en':
+        return guess not in ['hi', 'pa', 'mr', 'gu']
+    if target_lang in ['hi', 'pa']:
+        return guess not in ['en']
+    # Unknown target codes have nothing to compare against.
+    return True
+def extract_from_parallel_csv_optimized(input_csv, output_dir, en_samples, other_samples, other_lang_code):
+    """Sample cleaned English and other-language sentences from a parallel CSV.
+    The two columns are cleaned, filtered and sampled independently of each
+    other, so the returned lists are not aligned sentence pairs.
+    Args:
+        input_csv: Path of the CSV. Columns 'src'/'tgt' are used when present,
+            otherwise the second and third columns.
+        output_dir: Directory (created if needed) that receives the two files.
+        en_samples: Maximum number of English sentences to sample.
+        other_samples: Maximum number of other-language sentences to sample.
+        other_lang_code: Language code of the target column, also used in the
+            output file names.
+    Returns:
+        ``(sampled_en, sampled_other)``; two empty lists when the CSV cannot
+        be read or lacks the needed columns.
+    """
+    print(f"\n{'='*60}")
+    print(f"Processing {input_csv}...")
+    print(f"Target: {en_samples} EN, {other_samples} {other_lang_code}")
+    print('='*60)
+    start_time = time.time()
+    # The file is loaded with a single read_csv call; malformed rows are skipped.
+    print("Reading CSV file...")
+    try:
+        df = pd.read_csv(input_csv, on_bad_lines='skip')
+    except Exception as e:
+        print(f"Error reading {input_csv}: {e}")
+        # Try with different encoding
+        try:
+            df = pd.read_csv(input_csv, encoding='latin-1', on_bad_lines='skip')
+        except Exception as retry_error:
+            # Exception rather than a bare except, so Ctrl-C still stops the run.
+            print(f"Failed to read {input_csv}: {retry_error}")
+            return [], []
+    print(f"Loaded {len(df):,} rows")
+    print(f"Columns: {list(df.columns)}")
+    # Identify columns; the positional fallback needs at least three columns.
+    try:
+        src_col = 'src' if 'src' in df.columns else df.columns[1]
+        tgt_col = 'tgt' if 'tgt' in df.columns else df.columns[2]
+    except IndexError:
+        print(f"Failed to read {input_csv}: expected 'src'/'tgt' columns or at least 3 columns")
+        return [], []
+    print(f"Source: {src_col}, Target: {tgt_col}")
+    # Clean each column; sentences of 10 characters or fewer are dropped.
+    print("\nCleaning data...")
+    print(f"Processing {src_col} column...")
+    valid_src = [
+        cleaned
+        for cleaned in map(clean_text, tqdm(df[src_col].astype(str), total=len(df), desc="Cleaning English"))
+        if len(cleaned) > 10
+    ]
+    print(f"\nProcessing {tgt_col} column...")
+    valid_tgt = [
+        cleaned
+        for cleaned in map(clean_text, tqdm(df[tgt_col].astype(str), total=len(df), desc=f"Cleaning {other_lang_code}"))
+        if len(cleaned) > 10
+    ]
+    print("\nAfter cleaning:")
+    print(f"  Valid English sentences: {len(valid_src):,}")
+    print(f"  Valid {other_lang_code} sentences: {len(valid_tgt):,}")
+    # Fast filtering (no langdetect)
+    print("\nFast filtering sentences...")
+    fast_valid_en = [
+        text for text in tqdm(valid_src, desc="Filtering English")
+        if is_valid_sentence_fast(text, 'en')
+    ]
+    fast_valid_other = [
+        text for text in tqdm(valid_tgt, desc=f"Filtering {other_lang_code}")
+        if is_valid_sentence_fast(text, other_lang_code)
+    ]
+    print("\nAfter fast filtering:")
+    print(f"  English: {len(fast_valid_en):,}")
+    print(f"  {other_lang_code}: {len(fast_valid_other):,}")
+    # If fast filtering alone yields enough sentences, use them as they are.
+    # Otherwise run langdetect on at most the first 100k of each list.
+    if len(fast_valid_en) >= en_samples and len(fast_valid_other) >= other_samples:
+        final_en = fast_valid_en
+        final_other = fast_valid_other
+        print("Using fast-filtered sentences (skipping langdetect)")
+    else:
+        print("\nApplying language detection on subset...")
+        # Slicing already copes with lists shorter than the cap.
+        sample_en = fast_valid_en[:100000]
+        sample_other = fast_valid_other[:100000]
+        print("Validating English with langdetect...")
+        final_en = [
+            text for text in tqdm(sample_en, desc="English langdetect")
+            if is_valid_sentence_with_lang(text, 'en', use_fast=False)
+        ]
+        print(f"Validating {other_lang_code} with langdetect...")
+        final_other = [
+            text for text in tqdm(sample_other, desc=f"{other_lang_code} langdetect")
+            if is_valid_sentence_with_lang(text, other_lang_code, use_fast=False)
+        ]
+        print("\nAfter langdetect:")
+        print(f"  English: {len(final_en):,}")
+        print(f"  {other_lang_code}: {len(final_other):,}")
+    # Random sampling, capped at what is actually available.
+    en_samples = min(en_samples, len(final_en))
+    other_samples = min(other_samples, len(final_other))
+    print(f"\nSampling {en_samples:,} English and {other_samples:,} {other_lang_code} sentences...")
+    sampled_en = random.sample(final_en, en_samples)
+    sampled_other = random.sample(final_other, other_samples)
+    # Save to files, one sentence per line.
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    en_filename = output_dir / f'en_{other_lang_code}_english.txt'
+    with open(en_filename, 'w', encoding='utf-8') as f:
+        f.writelines(f"{sentence}\n" for sentence in sampled_en)
+    other_filename = output_dir / f'en_{other_lang_code}_{other_lang_code}.txt'
+    with open(other_filename, 'w', encoding='utf-8') as f:
+        f.writelines(f"{sentence}\n" for sentence in sampled_other)
+    elapsed = time.time() - start_time
+    print(f"\n✓ Saved {en_samples:,} English sentences to: {en_filename}")
+    print(f"✓ Saved {other_samples:,} {other_lang_code} sentences to: {other_filename}")
+    print(f"⏱️  Processing time: {elapsed:.2f} seconds ({elapsed/60:.2f} minutes)")
+    return sampled_en, sampled_other
+def main():
+    """Extract EN-HI and EN-PA sentences, then write the combined file and summary."""
+    # Configuration: input files, output directory and per-language sample caps.
+    hi_csv = "en-hi.csv"
+    pa_csv = "en-pa.csv"
+    out_dir = "./extracted_sentences"
+    hi_en_cap, hi_cap = 150000, 300000
+    pa_en_cap, pa_cap = 150000, 300000
+    rule = "="*70
+    def banner(title):
+        # Section header preceded by a blank line.
+        print("\n" + rule)
+        print(title)
+        print(rule)
+    print(rule)
+    print("MULTILINGUAL DATA EXTRACTION TOOL")
+    print(rule)
+    # Fixed seeds so repeated runs draw the same samples.
+    random.seed(42)
+    np.random.seed(42)
+    banner("EXTRACTING FROM ENGLISH-HINDI DATASET")
+    en_hi_en, en_hi_hi = extract_from_parallel_csv_optimized(
+        hi_csv, out_dir, hi_en_cap, hi_cap, 'hi'
+    )
+    banner("EXTRACTING FROM ENGLISH-PUNJABI DATASET")
+    en_pa_en, en_pa_pa = extract_from_parallel_csv_optimized(
+        pa_csv, out_dir, pa_en_cap, pa_cap, 'pa'
+    )
+    banner("CREATING COMBINED ENGLISH FILE")
+    all_english = en_hi_en + en_pa_en
+    random.shuffle(all_english)
+    # Only the first 100k shuffled sentences go into the combined file.
+    combined_count = min(100000, len(all_english))
+    combined_filename = Path(out_dir) / "combined_english.txt"
+    with open(combined_filename, 'w', encoding='utf-8') as f:
+        f.writelines(f"{sentence}\n" for sentence in all_english[:100000])
+    print(f"\n✓ Saved {combined_count:,} combined English sentences")
+    banner("EXTRACTION COMPLETE - FINAL STATISTICS")
+    print(f"Total English sentences: {len(all_english):,}")
+    print(f"Total Hindi sentences: {len(en_hi_hi):,}")
+    print(f"Total Punjabi sentences: {len(en_pa_pa):,}")
+    # Summary file with the same figures in plain text.
+    total_sentences = len(all_english) + len(en_hi_hi) + len(en_pa_pa)
+    summary_lines = [
+        "DATA EXTRACTION SUMMARY\n",
+        "="*50 + "\n\n",
+        "English-Hindi Dataset:\n",
+        f"  English sentences: {len(en_hi_en):,}\n",
+        f"  Hindi sentences: {len(en_hi_hi):,}\n\n",
+        "English-Punjabi Dataset:\n",
+        f"  English sentences: {len(en_pa_en):,}\n",
+        f"  Punjabi sentences: {len(en_pa_pa):,}\n\n",
+        f"Combined English: {combined_count:,}\n",
+        f"Total corpus size: {total_sentences:,} sentences\n",
+    ]
+    summary_file = Path(out_dir) / "extraction_summary.txt"
+    with open(summary_file, 'w', encoding='utf-8') as f:
+        f.write("".join(summary_lines))
+    print(f"\n📊 Summary saved to: {summary_file}")
+    print("\n✅ All done! Ready for corpus creation.")
+if __name__ == "__main__":
+    # tqdm is imported unconditionally at the top of this module, so a missing
+    # install fails at import time, before this guard is reached. An
+    # install-on-demand fallback placed here would be unreachable; install the
+    # dependencies before running the script.
+    main()

evaluate_model.py ADDED Viewed

	@@ -0,0 +1,138 @@

+"""
+Step 5: Evaluate model quality
+"""
+import torch
+from transformers import GPT2LMHeadModel
+import sentencepiece as spm
+import numpy as np
+from pathlib import Path
+import json
+def evaluate_multilingual_capabilities(model_path="./checkpoints_tiny/final", tokenizer_path="./final_corpus/multilingual_spm.model"):
+    """Run a small prompt-completion and perplexity check per language.
+    For each test prompt the model greedily generates up to 10 new tokens, and
+    the perplexity of the prompt plus its expected continuation is computed.
+    Per-language results are printed and written to ``evaluation_results.json``.
+    Args:
+        model_path: Directory of the GPT-2 checkpoint to load.
+        tokenizer_path: SentencePiece model file used for encoding and decoding.
+    """
+    print("="*60)
+    print("MODEL EVALUATION")
+    print("="*60)
+    # Load tokenizer and model
+    tokenizer = spm.SentencePieceProcessor()
+    tokenizer.load(tokenizer_path)
+    model = GPT2LMHeadModel.from_pretrained(model_path)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    model.eval()
+    results = {
+        "english": {"success": 0, "total": 0, "perplexities": []},
+        "hindi": {"success": 0, "total": 0, "perplexities": []},
+        "punjabi": {"success": 0, "total": 0, "perplexities": []},
+        "mixed": {"success": 0, "total": 0, "perplexities": []},
+    }
+    # Test cases: (prompt, expected continuation, results key)
+    test_cases = [
+        # English
+        ("[EN] The cat sat on the", "mat", "english"),
+        ("[EN] I like to eat", "food", "english"),
+        ("[EN] Water is essential for", "life", "english"),
+        ("[EN] The sun rises in the", "east", "english"),
+        # Hindi
+        ("[HI] बिल्ली चटाई पर", "बैठी", "hindi"),
+        ("[HI] मुझे खाना खाना", "पसंद है", "hindi"),
+        ("[HI] पानी जीवन के लिए", "आवश्यक है", "hindi"),
+        ("[HI] सूरज पूर्व में", "उगता है", "hindi"),
+        # Punjabi
+        ("[PA] ਬਿੱਲੀ ਚੱਟਈ 'ਤੇ", "ਬੈਠੀ", "punjabi"),
+        ("[PA] ਮੈਂ ਖਾਣਾ ਖਾਣਾ", "ਪਸੰਦ ਕਰਦਾ ਹਾਂ", "punjabi"),
+        ("[PA] ਪਾਣੀ ਜੀਵਨ ਲਈ", "ਜ਼ਰੂਰੀ ਹੈ", "punjabi"),
+        ("[PA] ਸੂਰਜ ਪੂਰਬ ਵਿੱਚ", "ਉੱਗਦਾ ਹੈ", "punjabi"),
+        # Mixed
+        ("[EN] Hello [HI] नमस्ते", "दोस्तों", "mixed"),
+        ("[HI] यह है [EN] good", "news", "mixed"),
+    ]
+    print("\nRunning tests...")
+    for prompt, expected_continuation, lang in test_cases:
+        # Generate
+        prompt_ids = tokenizer.encode(prompt)
+        input_tensor = torch.tensor([prompt_ids], device=device)
+        with torch.no_grad():
+            # Greedy decoding; temperature is not passed because it only
+            # applies when sampling is enabled.
+            output = model.generate(
+                input_ids=input_tensor,
+                max_new_tokens=10,
+                do_sample=False,  # Greedy for testing
+                pad_token_id=0,
+            )
+        # Decode only the newly generated ids. Slicing the decoded text by
+        # len(prompt) is unreliable because decoding the prompt ids need not
+        # reproduce the prompt character for character.
+        new_token_ids = output[0][len(prompt_ids):].tolist()
+        generated_continuation = tokenizer.decode(new_token_ids).strip().lower()
+        expected_lower = expected_continuation.lower()
+        # NOTE(review): the length clause counts any continuation longer than
+        # 3 characters as a success, so "accuracy" mostly measures whether the
+        # model produced output at all — confirm this is intended.
+        success = expected_lower in generated_continuation or len(generated_continuation) > 3
+        # Calculate perplexity of prompt + expected continuation
+        try:
+            full_text = prompt + " " + expected_continuation
+            text_ids = tokenizer.encode(full_text)
+            text_tensor = torch.tensor([text_ids], device=device)
+            with torch.no_grad():
+                outputs = model(input_ids=text_tensor, labels=text_tensor)
+                loss = outputs.loss
+                perplexity = torch.exp(loss).item()
+        except Exception as exc:
+            # Keep evaluating the remaining cases, but report the failure.
+            print(f"  Perplexity computation failed: {exc}")
+            perplexity = float('inf')
+        # Update results
+        results[lang]["total"] += 1
+        if success:
+            results[lang]["success"] += 1
+        results[lang]["perplexities"].append(perplexity)
+        print(f"\n{lang.upper()}: {prompt}")
+        print(f"  Generated: {generated_continuation[:50]}...")
+        print(f"  Expected: {expected_continuation}")
+        print(f"  Success: {'✓' if success else '✗'}")
+        print(f"  Perplexity: {perplexity:.2f}")
+    # Calculate metrics
+    print("\n" + "="*60)
+    print("EVALUATION RESULTS")
+    print("="*60)
+    for lang in results:
+        if results[lang]["total"] > 0:
+            accuracy = results[lang]["success"] / results[lang]["total"] * 100
+            avg_perplexity = np.mean(results[lang]["perplexities"])
+            print(f"\n{lang.upper()}:")
+            print(f"  Accuracy: {accuracy:.1f}% ({results[lang]['success']}/{results[lang]['total']})")
+            print(f"  Avg Perplexity: {avg_perplexity:.2f}")
+    # Overall score; computed before the extra key is added to results.
+    total_tests = sum(r["total"] for r in results.values())
+    total_success = sum(r["success"] for r in results.values())
+    overall_accuracy = total_success / total_tests * 100 if total_tests > 0 else 0
+    print(f"\nOVERALL ACCURACY: {overall_accuracy:.1f}%")
+    # Save results
+    results["overall_accuracy"] = overall_accuracy
+    with open("evaluation_results.json", "w", encoding="utf-8") as f:
+        json.dump(results, f, indent=2, ensure_ascii=False)
+    print("\nResults saved to evaluation_results.json")
+if __name__ == "__main__":
+    evaluate_multilingual_capabilities()

final_corpus/multilingual_corpus.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4c94985b65991bc86b55f358ebeaf16709e40529c6dd885aeab2d06a96e63be1
+size 107642577

final_corpus/multilingual_corpus_train.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d5dcc9630ec5fc03986488bca2a394c70c885fa09f4888b87be55b458500982b
+size 102242796

final_corpus/multilingual_corpus_val.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

final_corpus/multilingual_spm.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:faf8ae3d54cbc33b749cfff520a86c0e0cbc131ac949b233b8848cb1bf5fe940
+size 166057

final_corpus/multilingual_spm.vocab ADDED Viewed

The diff for this file is too large to render. See raw diff

model_config.py ADDED Viewed

	@@ -0,0 +1,64 @@

+"""
+Step 2: Model configuration
+"""
+from dataclasses import dataclass
+from transformers import GPT2Config
+@dataclass
+class ModelConfig:
+    """Architecture, training, data and checkpoint settings for the model.
+    Creating an instance prints a short summary of the main values.
+    """
+    # Model architecture
+    vocab_size: int = 8000  # Updated from tokenizer
+    n_positions: int = 256  # Context length
+    n_embd: int = 512       # Hidden size
+    n_layer: int = 8        # Number of layers
+    n_head: int = 8         # Attention heads
+    n_inner: int = 1024     # FFN dimension
+    # Training - REALISTIC VALUES
+    batch_size: int = 8     # Per GPU batch size
+    gradient_accumulation: int = 4  # Effective batch = 32
+    learning_rate: float = 3e-4
+    warmup_steps: int = 1000
+    total_steps: int = 20000  # ~8-9 epochs, NOT 50000
+    weight_decay: float = 0.1
+    max_grad_norm: float = 1.0
+    # Data
+    train_file: str = "./final_corpus/multilingual_corpus_train.txt"
+    val_file: str = "./final_corpus/multilingual_corpus_val.txt"
+    tokenizer_path: str = "./final_corpus/multilingual_spm.model"
+    # Checkpoints
+    output_dir: str = "./checkpoints"
+    save_steps: int = 1000
+    eval_steps: int = 500
+    logging_steps: int = 100
+    # Mixed precision
+    fp16: bool = True
+    def __post_init__(self):
+        """Print a summary of the configuration right after construction."""
+        summary = [
+            f"\nModel Configuration (REALISTIC):",
+            f"  Parameters: ~{self.total_params:.1f}M",
+            f"  Hidden size: {self.n_embd}",
+            f"  Layers: {self.n_layer}",
+            f"  Context length: {self.n_positions}",
+            f"  Effective batch: {self.effective_batch_size}",
+            f"  Total steps: {self.total_steps} (~8-9 epochs)",
+            f"  Learning rate: {self.learning_rate}",
+        ]
+        print("\n".join(summary))
+    @property
+    def effective_batch_size(self):
+        """Per-device batch size multiplied by the gradient accumulation steps."""
+        return self.gradient_accumulation * self.batch_size
+    @property
+    def total_params(self):
+        """Rough parameter count in millions.
+        Counts the token embedding plus, per layer, four hidden-by-hidden
+        attention matrices, the two FFN matrices and ``2 * n_embd`` norm terms.
+        """
+        hidden = self.n_embd
+        block_weights = 4 * hidden * hidden + 2 * hidden * self.n_inner + 2 * hidden
+        weight_count = self.vocab_size * hidden + self.n_layer * block_weights
+        return weight_count / 1e6  # Millions

model_demo.html ADDED Viewed

	@@ -0,0 +1,67 @@

+<!DOCTYPE html>
+<html>
+<head>
+    <title>Multilingual LM Demo</title>
+    <style>
+        body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
+        .container { display: flex; flex-direction: column; gap: 20px; }
+        textarea { width: 100%; height: 100px; padding: 10px; font-size: 16px; }
+        button { padding: 10px 20px; background: #4CAF50; color: white; border: none; cursor: pointer; }
+        button:hover { background: #45a049; }
+        .output { border: 1px solid #ccc; padding: 15px; min-height: 100px; background: #f9f9f9; }
+        .language-tag { display: inline-block; margin: 5px; padding: 5px 10px; background: #e0e0e0; cursor: pointer; }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>Multilingual Language Model Demo</h1>
+        <div>
+            <strong>Language:</strong>
+            <span class="language-tag" onclick="setLanguage('[EN] ')">English</span>
+            <span class="language-tag" onclick="setLanguage('[HI] ')">Hindi</span>
+            <span class="language-tag" onclick="setLanguage('[PA] ')">Punjabi</span>
+        </div>
+        <textarea id="prompt" placeholder="Enter your prompt here..."></textarea>
+        <div>
+            <label>Temperature: <input type="range" id="temp" min="0.1" max="2.0" step="0.1" value="0.7"></label>
+            <label>Max Length: <input type="number" id="maxlen" min="20" max="500" value="100"></label>
+        </div>
+        <button onclick="generate()">Generate</button>
+        <div class="output" id="output">Response will appear here...</div>
+    </div>
+    <script>
+        // Overwrite the prompt box with the chosen language tag; any text already typed is replaced.
+        function setLanguage(tag) {
+            document.getElementById('prompt').value = tag;
+        }
+        async function generate() {
+            const prompt = document.getElementById('prompt').value;
+            const temp = document.getElementById('temp').value;
+            const maxlen = document.getElementById('maxlen').value;
+            document.getElementById('output').innerHTML = 'Generating...';
+            try {
+                const response = await fetch('/generate', {
+                    method: 'POST',
+                    headers: {'Content-Type': 'application/json'},
+                    body: JSON.stringify({prompt, temp, maxlen})
+                });
+                const data = await response.json();
+                document.getElementById('output').innerHTML = data.response;
+            } catch (error) {
+                document.getElementById('output').innerHTML = 'Error: ' + error;
+            }
+        }
+    </script>
+</body>
+</html>

preprocess.py ADDED Viewed

	@@ -0,0 +1,267 @@

+"""
+Step 1: Create final shuffled corpus and train tokenizer
+"""
+import random
+from pathlib import Path
+import sentencepiece as spm
+from collections import defaultdict
+import numpy as np
+def create_final_corpus(en_file, hi_file, pa_file, output_file, lang_ratios=None):
+    """
+    Create final multilingual corpus with language tags
+    Args:
+        en_file: English sentences file
+        hi_file: Hindi sentences file
+        pa_file: Punjabi sentences file
+        output_file: Output corpus file
+        lang_ratios: Dict with language ratios, {'en': 0.4, 'hi': 0.4, 'pa': 0.2}
+    """
+    print("Creating final corpus...")
+    # Default ratios
+    if lang_ratios is None:
+        lang_ratios = {'en': 0.4, 'hi': 0.4, 'pa': 0.2}
+    # Read sentences
+    with open(en_file, 'r', encoding='utf-8') as f:
+        en_sentences = [line.strip() for line in f if line.strip()]
+    with open(hi_file, 'r', encoding='utf-8') as f:
+        hi_sentences = [line.strip() for line in f if line.strip()]
+    with open(pa_file, 'r', encoding='utf-8') as f:
+        pa_sentences = [line.strip() for line in f if line.strip()]
+    print(f"Loaded {len(en_sentences):,} English sentences")
+    print(f"Loaded {len(hi_sentences):,} Hindi sentences")
+    print(f"Loaded {len(pa_sentences):,} Punjabi sentences")
+    # Determine sample sizes
+    total_target = min(len(en_sentences), len(hi_sentences), len(pa_sentences)) * 2
+    target_counts = {
+        'en': int(total_target * lang_ratios['en']),
+        'hi': int(total_target * lang_ratios['hi']),
+        'pa': int(total_target * lang_ratios['pa'])
+    }
+    print(f"\nTarget counts:")
+    print(f"  English: {target_counts['en']:,}")
+    print(f"  Hindi: {target_counts['hi']:,}")
+    print(f"  Punjabi: {target_counts['pa']:,}")
+    # Sample sentences
+    sampled_en = random.sample(en_sentences, min(target_counts['en'], len(en_sentences)))
+    sampled_hi = random.sample(hi_sentences, min(target_counts['hi'], len(hi_sentences)))
+    sampled_pa = random.sample(pa_sentences, min(target_counts['pa'], len(pa_sentences)))
+    # Create corpus with language tags
+    corpus = []
+    for sent in sampled_en:
+        corpus.append(f"[EN] {sent}")
+    for sent in sampled_hi:
+        corpus.append(f"[HI] {sent}")
+    for sent in sampled_pa:
+        corpus.append(f"[PA] {sent}")
+    # Shuffle
+    random.shuffle(corpus)
+    # Write to file
+    with open(output_file, 'w', encoding='utf-8') as f:
+        for line in corpus:
+            f.write(f"{line}\n")
+    # Create train/validation split (95/5)
+    val_size = int(len(corpus) * 0.05)
+    train_corpus = corpus[val_size:]
+    val_corpus = corpus[:val_size]
+    train_file = output_file.replace('.txt', '_train.txt')
+    val_file = output_file.replace('.txt', '_val.txt')
+    with open(train_file, 'w', encoding='utf-8') as f:
+        for line in train_corpus:
+            f.write(f"{line}\n")
+    with open(val_file, 'w', encoding='utf-8') as f:
+        for line in val_corpus:
+            f.write(f"{line}\n")
+    # Statistics
+    print(f"\nCorpus created:")
+    print(f"  Total sentences: {len(corpus):,}")
+    print(f"  Training sentences: {len(train_corpus):,}")
+    print(f"  Validation sentences: {len(val_corpus):,}")
+    # Language distribution
+    lang_counts = defaultdict(int)
+    for line in corpus:
+        if line.startswith('[EN]'):
+            lang_counts['en'] += 1
+        elif line.startswith('[HI]'):
+            lang_counts['hi'] += 1
+        elif line.startswith('[PA]'):
+            lang_counts['pa'] += 1
+    print(f"\nLanguage distribution:")
+    for lang, count in lang_counts.items():
+        percentage = (count / len(corpus)) * 100
+        print(f"  {lang.upper()}: {count:,} ({percentage:.1f}%)")
+    return train_file, val_file
+def train_tokenizer(corpus_file, vocab_size=8000, model_prefix='multilingual'):
+    """
+    Train SentencePiece tokenizer
+    """
+    print(f"\nTraining SentencePiece tokenizer with vocab size {vocab_size}...")
+    # First, create a version without language tags for tokenizer training
+    temp_corpus = 'temp_tokenizer_corpus.txt'
+    with open(corpus_file, 'r', encoding='utf-8') as f_in, \
+         open(temp_corpus, 'w', encoding='utf-8') as f_out:
+        for line in f_in:
+            # Remove language tags for tokenizer training
+            if line.startswith('[EN]'):
+                f_out.write(line[5:])  # Remove "[EN] "
+            elif line.startswith('[HI]'):
+                f_out.write(line[5:])  # Remove "[HI] "
+            elif line.startswith('[PA]'):
+                f_out.write(line[5:])  # Remove "[PA] "
+            else:
+                f_out.write(line)
+    # SentencePiece training parameters
+    spm.SentencePieceTrainer.train(
+        input=temp_corpus,
+        model_prefix=model_prefix,
+        vocab_size=vocab_size,
+        character_coverage=0.9995,  # Important for multilingual
+        model_type='unigram',       # Better for multilingual
+        split_digits=True,
+        allow_whitespace_only_pieces=True,
+        remove_extra_whitespaces=False,
+        byte_fallback=True,         # Important for Indic scripts
+        split_by_unicode_script=True,
+        input_sentence_size=1000000,
+        shuffle_input_sentence=True,
+        # Don't use normalization for Indic scripts
+        normalization_rule_name='identity',
+        seed_sentencepiece_size=1000000,
+        num_threads=4
+    )
+    # Load and test tokenizer
+    sp = spm.SentencePieceProcessor()
+    sp.load(f'{model_prefix}.model')
+    print(f"Tokenizer trained successfully!")
+    print(f"Vocabulary size: {sp.get_piece_size()}")
+    # Test tokenizer
+    test_sentences = [
+        "Hello world",  # English
+        "नमस्ते दुनिया",  # Hindi
+        "ਸਤਿ ਸ੍ਰੀ ਅਕਾਲ ਦੁਨਿਆ"  # Punjabi
+    ]
+    print("\nTokenizer test:")
+    for sent in test_sentences:
+        tokens = sp.encode_as_pieces(sent)
+        ids = sp.encode_as_ids(sent)
+        print(f"  '{sent}' -> {tokens} (ids: {ids})")
+    # Clean up
+    Path(temp_corpus).unlink()
+    return sp
+def analyze_tokenizer(sp, corpus_file):
+    """Analyze tokenizer coverage"""
+    print("\nAnalyzing tokenizer coverage...")
+    languages = {'en': 0, 'hi': 0, 'pa': 0}
+    total_tokens = 0
+    lang_tokens = defaultdict(int)
+    with open(corpus_file, 'r', encoding='utf-8') as f:
+        lines = f.readlines()
+    # Sample 1000 sentences per language
+    samples_per_lang = 1000
+    for line in lines:
+        if line.startswith('[EN]'):
+            lang = 'en'
+            text = line[5:].strip()
+        elif line.startswith('[HI]'):
+            lang = 'hi'
+            text = line[5:].strip()
+        elif line.startswith('[PA]'):
+            lang = 'pa'
+            text = line[5:].strip()
+        else:
+            continue
+        languages[lang] += 1
+        if languages[lang] <= samples_per_lang:
+            tokens = sp.encode_as_ids(text)
+            total_tokens += len(tokens)
+            lang_tokens[lang] += len(tokens)
+    print(f"Token counts per language (sampled {samples_per_lang} sentences each):")
+    for lang in ['en', 'hi', 'pa']:
+        avg_tokens = lang_tokens[lang] / samples_per_lang
+        print(f"  {lang.upper()}: {avg_tokens:.1f} tokens per sentence")
+def main():
+    # Configuration
+    EN_FILE = r"C:\Users\manis\Desktop\2026-projects\foundational-model\data\extracted_sentences\en.txt"
+    HI_FILE = r"C:\Users\manis\Desktop\2026-projects\foundational-model\data\extracted_sentences\hi.txt"
+    PA_FILE = r"C:\Users\manis\Desktop\2026-projects\foundational-model\data\extracted_sentences\pa.txt"
+    OUTPUT_DIR = "./final_corpus"
+    Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
+    FINAL_CORPUS = f"{OUTPUT_DIR}/multilingual_corpus.txt"
+    TOKENIZER_PREFIX = f"{OUTPUT_DIR}/multilingual_spm"
+    # Create final corpus
+    train_file, val_file = create_final_corpus(
+        EN_FILE, HI_FILE, PA_FILE, FINAL_CORPUS,
+        lang_ratios={'en': 0.4, 'hi': 0.4, 'pa': 0.2}
+    )
+    # Train tokenizer
+    sp = train_tokenizer(train_file, vocab_size=8000, model_prefix=TOKENIZER_PREFIX)
+    # Analyze tokenizer
+    analyze_tokenizer(sp, train_file)
+    print(f"\n{'='*60}")
+    print("PREPROCESSING COMPLETE!")
+    print(f"{'='*60}")
+    print(f"Files created in {OUTPUT_DIR}:")
+    print(f"  1. {FINAL_CORPUS} - Full corpus")
+    print(f"  2. {train_file} - Training split")
+    print(f"  3. {val_file} - Validation split")
+    print(f"  4. {TOKENIZER_PREFIX}.model - SentencePiece model")
+    print(f"  5. {TOKENIZER_PREFIX}.vocab - Vocabulary")
+    print(f"\nNext step: Train the model with train_model.py")
+if __name__ == "__main__":
+    # Install sentencepiece if not available
+    # NOTE(review): this file also imports sentencepiece at module level, so a missing
+    # package would presumably raise ImportError there before this fallback is reached —
+    # confirm whether the fallback or the module-level import should go.
+    try:
+        import sentencepiece as spm
+    except ImportError:
+        import subprocess
+        import sys
+        print("Installing sentencepiece...")
+        # Installs into the interpreter that is running this script.
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "sentencepiece"])
+        import sentencepiece as spm
+    main()

test_model.py ADDED Viewed

	@@ -0,0 +1,418 @@

+"""
+Step 4: Test your trained multilingual model
+"""
+import torch
+from transformers import GPT2LMHeadModel
+import sentencepiece as spm
+import os
+from pathlib import Path
+class MultilingualModel:
+    def __init__(self, model_path="./checkpoints_tiny/final"):
+        print("="*60)
+        print("LOADING MULTILINGUAL MODEL")
+        print("="*60)
+        # Check if model exists
+        if not os.path.exists(model_path):
+            print(f"❌ Model not found at: {model_path}")
+            print("Available checkpoints:")
+            checkpoints = list(Path("./checkpoints_tiny").glob("checkpoint-*"))
+            checkpoints += list(Path("./checkpoints_tiny").glob("step*"))
+            checkpoints += list(Path("./checkpoints_tiny").glob("final"))
+            for cp in checkpoints:
+                if cp.is_dir():
+                    print(f"  - {cp}")
+            if checkpoints:
+                model_path = str(checkpoints[-1])  # Use most recent
+                print(f"Using: {model_path}")
+            else:
+                raise FileNotFoundError("No checkpoints found!")
+        # Load tokenizer
+        tokenizer_path = os.path.join(model_path, "tokenizer", "spiece.model")
+        if not os.path.exists(tokenizer_path):
+            tokenizer_path = "./final_corpus/multilingual_spm.model"
+        print(f"Loading tokenizer from: {tokenizer_path}")
+        self.tokenizer = spm.SentencePieceProcessor()
+        self.tokenizer.load(tokenizer_path)
+        # Load model
+        print(f"Loading model from: {model_path}")
+        self.model = GPT2LMHeadModel.from_pretrained(model_path)
+        # Setup device
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model.to(self.device)
+        self.model.eval()
+        print(f"✅ Model loaded on: {self.device}")
+        print(f"   Parameters: {sum(p.numel() for p in self.model.parameters())/1e6:.1f}M")
+        print("="*60)
+    def generate(self, prompt, max_length=100, temperature=0.7, top_k=50, top_p=0.95):
+        """Generate text from prompt"""
+        # Add language tag if missing
+        if not any(prompt.startswith(tag) for tag in ['[EN]', '[HI]', '[PA]']):
+            # Try to detect language
+            if any(char in prompt for char in 'अआइईउऊएऐओऔकखगघचछजझटठडढणतथदधनपफबभमयरलवशषसह'):
+                prompt = f"[HI] {prompt}"
+            elif any(char in prompt for char in 'ਅਆਇਈਉਊਏਐਓਔਕਖਗਘਚਛਜਝਟਠਡਢਣਤਥਦਧਨਪਫਬਭਮਯਰਲਵਸ਼ਸਹ'):
+                prompt = f"[PA] {prompt}"
+            else:
+                prompt = f"[EN] {prompt}"
+        # Encode
+        input_ids = self.tokenizer.encode(prompt)
+        input_tensor = torch.tensor([input_ids], device=self.device)
+        # Generate
+        with torch.no_grad():
+            output = self.model.generate(
+                input_ids=input_tensor,
+                max_length=max_length,
+                temperature=temperature,
+                do_sample=True,
+                top_k=top_k,
+                top_p=top_p,
+                pad_token_id=self.tokenizer.pad_id() if self.tokenizer.pad_id() > 0 else 0,
+                eos_token_id=self.tokenizer.eos_id() if self.tokenizer.eos_id() > 0 else 2,
+                repetition_penalty=1.1,
+            )
+        # Decode
+        generated = self.tokenizer.decode(output[0].tolist())
+        # Clean up (remove prompt if it's repeated)
+        if generated.startswith(prompt):
+            result = generated[len(prompt):].strip()
+        else:
+            result = generated
+        return result
+    def batch_generate(self, prompts, **kwargs):
+        """Generate for multiple prompts"""
+        results = []
+        for prompt in prompts:
+            result = self.generate(prompt, **kwargs)
+            results.append(result)
+        return results
+    def calculate_perplexity(self, text):
+        """Calculate perplexity of given text"""
+        input_ids = self.tokenizer.encode(text)
+        if len(input_ids) < 2:
+            return float('inf')
+        input_tensor = torch.tensor([input_ids], device=self.device)
+        with torch.no_grad():
+            outputs = self.model(input_ids=input_tensor, labels=input_tensor)
+            loss = outputs.loss
+        perplexity = torch.exp(loss).item()
+        return perplexity
+    def interactive_mode(self):
+        """Interactive chat with model"""
+        print("\n" + "="*60)
+        print("INTERACTIVE MODE")
+        print("="*60)
+        print("Enter prompts in any language (add [EN], [HI], [PA] tags)")
+        print("Commands: /temp X, /len X, /quit, /help")
+        print("="*60)
+        temperature = 0.7
+        max_length = 100
+        while True:
+            try:
+                user_input = input("\nYou: ").strip()
+                if not user_input:
+                    continue
+                # Handle commands
+                if user_input.startswith('/'):
+                    if user_input == '/quit':
+                        break
+                    elif user_input == '/help':
+                        print("Commands:")
+                        print("  /temp X - Set temperature (0.1 to 2.0)")
+                        print("  /len X  - Set max length (20 to 500)")
+                        print("  /quit   - Exit")
+                        print("  /help   - Show this help")
+                        continue
+                    elif user_input.startswith('/temp'):
+                        try:
+                            temp = float(user_input.split()[1])
+                            if 0.1 <= temp <= 2.0:
+                                temperature = temp
+                                print(f"Temperature set to {temperature}")
+                            else:
+                                print("Temperature must be between 0.1 and 2.0")
+                        except:
+                            print("Usage: /temp 0.7")
+                        continue
+                    elif user_input.startswith('/len'):
+                        try:
+                            length = int(user_input.split()[1])
+                            if 20 <= length <= 500:
+                                max_length = length
+                                print(f"Max length set to {max_length}")
+                            else:
+                                print("Length must be between 20 and 500")
+                        except:
+                            print("Usage: /len 100")
+                        continue
+                # Generate response
+                print("Model: ", end="", flush=True)
+                response = self.generate(user_input, max_length=max_length, temperature=temperature)
+                print(response)
+            except KeyboardInterrupt:
+                print("\n\nExiting...")
+                break
+            except Exception as e:
+                print(f"Error: {e}")
+def run_tests():
+    """Run comprehensive tests"""
+    print("\n" + "="*60)
+    print("COMPREHENSIVE MODEL TESTS")
+    print("="*60)
+    # Load model
+    model = MultilingualModel()
+    # Test prompts by language
+    test_suites = {
+        "English": [
+            "[EN] The weather today is",
+            "[EN] I want to learn",
+            "[EN] Artificial intelligence",
+            "[EN] The capital of India is",
+            "[EN] Once upon a time",
+        ],
+        "Hindi": [
+            "[HI] आज का मौसम",
+            "[HI] मैं सीखना चाहता हूं",
+            "[HI] कृत्रिम बुद्धिमत्ता",
+            "[HI] भारत की राजधानी है",
+            "[HI] एक बार की बात है",
+        ],
+        "Punjabi": [
+            "[PA] ਅੱਜ ਦਾ ਮੌਸਮ",
+            "[PA] ਮੈਂ ਸਿੱਖਣਾ ਚਾਹੁੰਦਾ ਹਾਂ",
+            "[PA] ਕ੍ਰਿਤਰਿਮ ਬੁੱਧੀ",
+            "[PA] ਭਾਰਤ ਦੀ ਰਾਜਧਾਨੀ ਹੈ",
+            "[PA] ਇੱਕ ਵਾਰ ਦੀ ਗੱਲ ਹੈ",
+        ],
+        "Language Switching": [
+            "[EN] Hello [HI] नमस्ते",
+            "[HI] यह अच्छा है [EN] this is good",
+            "[PA] ਸਤਿ ਸ੍ਰੀ ਅਕਾਲ [EN] Hello everyone",
+        ],
+        "Code Mixing": [
+            "Hello दुनिया",  # No tag, should auto-detect
+            "मेरा name है",  # Hindi + English
+            "Today मौसम is good",  # English + Hindi
+        ]
+    }
+    for suite_name, prompts in test_suites.items():
+        print(f"\n{'='*40}")
+        print(f"{suite_name.upper()} TESTS")
+        print('='*40)
+        for i, prompt in enumerate(prompts):
+            print(f"\nTest {i+1}:")
+            print(f"Prompt: {prompt}")
+            # Generate
+            response = model.generate(prompt, max_length=50, temperature=0.7)
+            print(f"Response: {response}")
+            # Calculate perplexity
+            try:
+                perplexity = model.calculate_perplexity(response)
+                print(f"Perplexity: {perplexity:.2f}")
+            except:
+                pass
+            print("-" * 40)
+def benchmark_model():
+    """Benchmark model performance"""
+    print("\n" + "="*60)
+    print("MODEL BENCHMARK")
+    print("="*60)
+    model = MultilingualModel()
+    import time
+    # Test generation speed
+    test_prompt = "[EN] The quick brown fox"
+    times = []
+    for _ in range(10):
+        start = time.time()
+        model.generate(test_prompt, max_length=50)
+        end = time.time()
+        times.append(end - start)
+    avg_time = sum(times) / len(times)
+    print(f"Average generation time (50 tokens): {avg_time:.3f}s")
+    print(f"Tokens per second: {50/avg_time:.1f}")
+    # Memory usage
+    if torch.cuda.is_available():
+        memory_allocated = torch.cuda.memory_allocated() / 1e9
+        memory_reserved = torch.cuda.memory_reserved() / 1e9
+        print(f"GPU Memory allocated: {memory_allocated:.2f} GB")
+        print(f"GPU Memory reserved: {memory_reserved:.2f} GB")
+def create_web_interface():
+    """Simple web interface for the model"""
+    html_code = """
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Multilingual LM Demo</title>
+    <style>
+        body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
+        .container { display: flex; flex-direction: column; gap: 20px; }
+        textarea { width: 100%; height: 100px; padding: 10px; font-size: 16px; }
+        button { padding: 10px 20px; background: #4CAF50; color: white; border: none; cursor: pointer; }
+        button:hover { background: #45a049; }
+        .output { border: 1px solid #ccc; padding: 15px; min-height: 100px; background: #f9f9f9; }
+        .language-tag { display: inline-block; margin: 5px; padding: 5px 10px; background: #e0e0e0; cursor: pointer; }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>Multilingual Language Model Demo</h1>
+        <div>
+            <strong>Language:</strong>
+            <span class="language-tag" onclick="setLanguage('[EN] ')">English</span>
+            <span class="language-tag" onclick="setLanguage('[HI] ')">Hindi</span>
+            <span class="language-tag" onclick="setLanguage('[PA] ')">Punjabi</span>
+        </div>
+        <textarea id="prompt" placeholder="Enter your prompt here..."></textarea>
+        <div>
+            <label>Temperature: <input type="range" id="temp" min="0.1" max="2.0" step="0.1" value="0.7"></label>
+            <label>Max Length: <input type="number" id="maxlen" min="20" max="500" value="100"></label>
+        </div>
+        <button onclick="generate()">Generate</button>
+        <div class="output" id="output">Response will appear here...</div>
+    </div>
+    <script>
+        function setLanguage(tag) {
+            document.getElementById('prompt').value = tag;
+        }
+        async function generate() {
+            const prompt = document.getElementById('prompt').value;
+            const temp = document.getElementById('temp').value;
+            const maxlen = document.getElementById('maxlen').value;
+            document.getElementById('output').innerHTML = 'Generating...';
+            try {
+                const response = await fetch('/generate', {
+                    method: 'POST',
+                    headers: {'Content-Type': 'application/json'},
+                    body: JSON.stringify({prompt, temp, maxlen})
+                });
+                const data = await response.json();
+                document.getElementById('output').innerHTML = data.response;
+            } catch (error) {
+                document.getElementById('output').innerHTML = 'Error: ' + error;
+            }
+        }
+    </script>
+</body>
+</html>
+    """
+    # Save HTML
+    with open("model_demo.html", "w", encoding="utf-8") as f:
+        f.write(html_code)
+    print("Web interface saved as model_demo.html")
+    print("To use it, you need a backend server (see create_server.py)")
+def main():
+    """Main function"""
+    print("\n" + "="*60)
+    print("MULTILINGUAL MODEL PLAYGROUND")
+    print("="*60)
+    print("\nOptions:")
+    print("1. Interactive chat")
+    print("2. Run comprehensive tests")
+    print("3. Benchmark model")
+    print("4. Create web interface")
+    print("5. Quick generation test")
+    print("6. Exit")
+    # Load model once
+    model = None
+    while True:
+        try:
+            choice = input("\nSelect option (1-6): ").strip()
+            if choice == '1':
+                if model is None:
+                    model = MultilingualModel()
+                model.interactive_mode()
+            elif choice == '2':
+                run_tests()
+            elif choice == '3':
+                benchmark_model()
+            elif choice == '4':
+                create_web_interface()
+            elif choice == '5':
+                if model is None:
+                    model = MultilingualModel()
+                prompt = input("Enter prompt: ").strip()
+                if prompt:
+                    response = model.generate(prompt)
+                    print(f"\nResponse: {response}")
+            elif choice == '6':
+                print("Goodbye!")
+                break
+            else:
+                print("Invalid choice. Please enter 1-6.")
+        except KeyboardInterrupt:
+            print("\n\nExiting...")
+            break
+        except Exception as e:
+            print(f"Error: {e}")
+            import traceback
+            traceback.print_exc()
+if __name__ == "__main__":
+    main()

train_model.py ADDED Viewed

	@@ -0,0 +1,156 @@

+"""
+Step 3: STREAMLINED Training - Minimal, Fast
+"""
+import torch
+from torch.utils.data import Dataset, DataLoader
+from transformers import GPT2LMHeadModel, GPT2Config
+import sentencepiece as spm
+from tqdm import tqdm
+import time
+# ===== CONFIG =====
+# Settings read by train_streamlined(): data/tokenizer paths, model size and training schedule.
+CONFIG = {
+    'train_file': './final_corpus/multilingual_corpus_train.txt',
+    'val_file': './final_corpus/multilingual_corpus_val.txt',
+    'tokenizer_path': './final_corpus/multilingual_spm.model',
+    # Tiny model for fast training
+    'n_positions': 128,     # Context length; also passed to the dataset as its block size
+    'n_embd': 256,
+    'n_layer': 4,
+    'n_head': 4,
+    'n_inner': 512,
+    # Training
+    'batch_size': 2,        # Small batch for 4GB
+    'grad_accum': 8,        # Effective batch = 16
+    'learning_rate': 2e-4,
+    'total_steps': 5000,    # Train for 5000 steps only
+    'save_every': 1000,     # Checkpoint interval, in optimizer steps
+}
+class SimpleDataset(Dataset):
+    def __init__(self, filepath, tokenizer, block_size):
+        self.tokenizer = tokenizer
+        self.block_size = block_size
+        print("Loading data...")
+        with open(filepath, 'r', encoding='utf-8') as f:
+            lines = [line.strip() for line in f if line.strip()]
+        # Tokenize all at once
+        self.examples = []
+        for line in tqdm(lines[:600000], desc="Tokenizing"):  # Use only 50K lines
+            tokens = tokenizer.encode(line)
+            if len(tokens) > 10:
+                if len(tokens) > block_size:
+                    tokens = tokens[:block_size]
+                else:
+                    tokens = tokens + [0] * (block_size - len(tokens))
+                self.examples.append(tokens)
+        print(f"Created {len(self.examples)} examples")
+    def __len__(self):
+        return len(self.examples)
+    def __getitem__(self, idx):
+        return torch.tensor(self.examples[idx], dtype=torch.long)
+def train_streamlined():
+    print("\n" + "="*60)
+    print("STREAMLINED TRAINING - FASTEST POSSIBLE")
+    print("="*60)
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    print(f"Device: {device}")
+    # Load tokenizer
+    tokenizer = spm.SentencePieceProcessor()
+    tokenizer.load(CONFIG['tokenizer_path'])
+    vocab_size = tokenizer.get_piece_size()
+    # Create tiny model
+    config = GPT2Config(
+        vocab_size=vocab_size,
+        n_positions=CONFIG['n_positions'],
+        n_embd=CONFIG['n_embd'],
+        n_layer=CONFIG['n_layer'],
+        n_head=CONFIG['n_head'],
+        n_inner=CONFIG['n_inner'],
+        pad_token_id=0,
+    )
+    model = GPT2LMHeadModel(config)
+    model.to(device)
+    model.train()
+    # Enable gradient checkpointing
+    model.gradient_checkpointing_enable()
+    # Create dataset (small)
+    dataset = SimpleDataset(CONFIG['train_file'], tokenizer, CONFIG['n_positions'])
+    dataloader = DataLoader(dataset, batch_size=CONFIG['batch_size'], shuffle=True)
+    # Optimizer
+    optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'])
+    print(f"\nModel: {sum(p.numel() for p in model.parameters())/1e6:.1f}M params")
+    print(f"Training steps: {CONFIG['total_steps']}")
+    print(f"Estimated time: {CONFIG['total_steps']*0.3/3600:.1f} hours\n")
+    # Training loop
+    global_step = 0
+    accumulation_steps = 0
+    start_time = time.time()
+    while global_step < CONFIG['total_steps']:
+        for batch in dataloader:
+            batch = batch.to(device)
+            # Forward
+            outputs = model(input_ids=batch, labels=batch)
+            loss = outputs.loss / CONFIG['grad_accum']
+            # Backward
+            loss.backward()
+            accumulation_steps += 1
+            # Gradient accumulation
+            if accumulation_steps == CONFIG['grad_accum']:
+                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+                optimizer.step()
+                optimizer.zero_grad()
+                global_step += 1
+                accumulation_steps = 0
+                # Print progress
+                if global_step % 100 == 0:
+                    elapsed = time.time() - start_time
+                    steps_per_second = global_step / elapsed
+                    remaining = (CONFIG['total_steps'] - global_step) / steps_per_second
+                    print(f"Step {global_step}/{CONFIG['total_steps']} | "
+                          f"Loss: {loss.item()*CONFIG['grad_accum']:.3f} | "
+                          f"Remaining: {remaining/3600:.1f}h")
+                # Save checkpoint
+                if global_step % CONFIG['save_every'] == 0:
+                    save_path = f"./checkpoints_tiny/step{global_step}"
+                    model.save_pretrained(save_path)
+                    print(f"Saved checkpoint: {save_path}")
+                # Stop if reached total steps
+                if global_step >= CONFIG['total_steps']:
+                    break
+    print(f"\nTraining completed in {(time.time()-start_time)/3600:.2f} hours")
+    # Save final model
+    model.save_pretrained("./checkpoints_tiny/final")
+    print("Final model saved to ./checkpoints_tiny/final")
+if __name__ == "__main__":  # only start a run when executed as a script, not on import
+    train_streamlined()  # runs the training loop above and saves ./checkpoints_tiny/final

web_interface.py ADDED Viewed

	@@ -0,0 +1,133 @@

+"""
+Simple web interface using Gradio
+"""
+import torch
+from transformers import GPT2LMHeadModel
+import sentencepiece as spm
+import gradio as gr
+import os
+class SimpleModel:
+    def __init__(self, model_path="./checkpoints_tiny/final"):
+        # Load tokenizer
+        tokenizer_path = os.path.join(model_path, "tokenizer", "spiece.model")
+        if not os.path.exists(tokenizer_path):
+            tokenizer_path = "./final_corpus/multilingual_spm.model"
+        self.tokenizer = spm.SentencePieceProcessor()
+        self.tokenizer.load(tokenizer_path)
+        # Load model
+        self.model = GPT2LMHeadModel.from_pretrained(model_path)
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model.to(self.device)
+        self.model.eval()
+    def generate(self, prompt, max_length=100, temperature=0.7, top_p=0.95):
+        # Add language tag if missing
+        if not any(prompt.startswith(tag) for tag in ['[EN]', '[HI]', '[PA]']):
+            prompt = f"[EN] {prompt}"
+        input_ids = self.tokenizer.encode(prompt)
+        input_tensor = torch.tensor([input_ids], device=self.device)
+        with torch.no_grad():
+            output = self.model.generate(
+                input_ids=input_tensor,
+                max_length=max_length,
+                temperature=temperature,
+                do_sample=True,
+                top_p=top_p,
+                pad_token_id=0,
+                repetition_penalty=1.1,
+            )
+        generated = self.tokenizer.decode(output[0].tolist())
+        if generated.startswith(prompt):
+            return generated[len(prompt):].strip()
+        return generated
+def create_gradio_interface():
+    # Initialize model
+    model = SimpleModel()
+    def generate_text(prompt, max_length, temperature, top_p):
+        try:
+            result = model.generate(prompt, int(max_length), float(temperature), float(top_p))
+            return result
+        except Exception as e:
+            return f"Error: {str(e)}"
+    # Create interface
+    with gr.Blocks(title="Multilingual LM Demo", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# 🌍 Multilingual Language Model")
+        gr.Markdown("Generate text in English, Hindi, or Punjabi")
+        with gr.Row():
+            with gr.Column():
+                prompt = gr.Textbox(
+                    label="Enter prompt",
+                    placeholder="Start with [EN], [HI], or [PA] for language...",
+                    lines=3
+                )
+                with gr.Row():
+                    max_length = gr.Slider(20, 500, value=100, label="Max Length")
+                    temperature = gr.Slider(0.1, 2.0, value=0.7, label="Temperature")
+                    top_p = gr.Slider(0.1, 1.0, value=0.95, label="Top-p")
+                generate_btn = gr.Button("Generate", variant="primary")
+            with gr.Column():
+                output = gr.Textbox(label="Generated Text", lines=10)
+        # Examples
+        gr.Examples(
+            examples=[
+                ["[EN] The weather today is"],
+                ["[HI] आज का मौसम"],
+                ["[PA] ਅੱਜ ਦਾ ਮੌਸਮ"],
+                ["[EN] Once upon a time in India"],
+                ["[HI] भारत एक महान देश है"],
+                ["[PA] ਭਾਰਤ ਇੱਕ ਮਹਾਨ ਦੇਸ਼ ਹੈ"],
+            ],
+            inputs=prompt,
+            label="Try these examples:"
+        )
+        # Button click
+        generate_btn.click(
+            fn=generate_text,
+            inputs=[prompt, max_length, temperature, top_p],
+            outputs=output
+        )
+        # Also generate on Enter key
+        prompt.submit(
+            fn=generate_text,
+            inputs=[prompt, max_length, temperature, top_p],
+            outputs=output
+        )
+    return demo
+if __name__ == "__main__":
+    # Install gradio if not installed
+    try:
+        import gradio as gr
+    except ImportError:
+        print("Installing gradio...")
+        import subprocess
+        subprocess.check_call(["pip", "install", "gradio"])
+        import gradio as gr
+    # Create and launch interface
+    demo = create_gradio_interface()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,  # Set to True to get public link
+        debug=False
+    )