aagamjtdev commited on
Commit
f17ce9b
·
1 Parent(s): 9203994

add download button

Browse files
Files changed (1) hide show
  1. app.py +607 -72
app.py CHANGED
@@ -1,3 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
 
3
  import os
@@ -6,6 +460,7 @@ import tempfile
6
  import gradio as gr
7
  from huggingface_hub import hf_hub_download, upload_file, HfApi
8
  import sys
 
9
 
10
  # Add current directory to path to import train_model
11
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
@@ -18,6 +473,8 @@ CHECKPOINT_FILE = "checkpoint_enhanced.pt"
18
 
19
  # IMPORTANT: Update this with your actual Hugging Face repository ID
20
  REPO_ID = "heerjtdev/LSTM_CRF" # Replace with your repo ID
 
 
21
  # HF_TOKEN = os.environ.get("HF_TOKEN") # Set this as a secret in your Space settings
22
 
23
 
@@ -25,7 +482,7 @@ def download_existing_models():
25
  """Download existing model files from the Hugging Face Hub if available."""
26
  try:
27
  api = HfApi()
28
- #files = api.list_repo_files(REPO_ID, token=HF_TOKEN)
29
  files = api.list_repo_files(REPO_ID)
30
 
31
  os.makedirs(OUTPUT_DIR, exist_ok=True)
@@ -91,19 +548,20 @@ def train_model(dataset_file, progress=gr.Progress()):
91
  progress(0.05, desc="Checking Hugging Face Hub for existing models...")
92
  download_status = download_existing_models()
93
  status_log = f"{download_status}\n\n"
94
- yield status_log, None, None
 
95
 
96
  # Step 2: Save uploaded file
97
  progress(0.1, desc="Processing uploaded dataset...")
98
  dataset_path = dataset_file.name
99
  status_log += f"πŸ“‚ Dataset uploaded: {os.path.basename(dataset_path)}\n\n"
100
- yield status_log, None, None
101
 
102
  # Step 3: Import and run training
103
  progress(0.15, desc="Initializing training...")
104
  status_log += "πŸš€ Starting training...\n"
105
  status_log += "πŸ“Š This may take a while. Training progress will appear in the terminal.\n\n"
106
- yield status_log, None, None
107
 
108
  # Import the training module
109
  try:
@@ -121,16 +579,16 @@ def train_model(dataset_file, progress=gr.Progress()):
121
  print("=" * 80)
122
 
123
  status_log += "βœ… Training completed successfully!\n\n"
124
- yield status_log, None, None
125
 
126
  except ImportError as ie:
127
  error_msg = f"❌ Failed to import training module: {str(ie)}\n"
128
  error_msg += "Make sure train_model.py is in the same directory as app.py"
129
- yield status_log + error_msg, None, None
130
  return
131
  except Exception as train_error:
132
  error_msg = f"❌ Training failed with error:\n{str(train_error)}\n"
133
- yield status_log + error_msg, None, None
134
  return
135
 
136
  # Step 4: Verify files exist
@@ -147,16 +605,16 @@ def train_model(dataset_file, progress=gr.Progress()):
147
 
148
  if not files_exist:
149
  error_msg = "❌ Error: Model files were not created. Check training logs."
150
- yield status_log + error_msg, None, None
151
  return
152
 
153
  status_log += f"βœ… Found trained files: {', '.join(files_exist)}\n\n"
154
- yield status_log, None, None
155
 
156
  # Step 5: Upload to Hub
157
  progress(0.9, desc="Uploading models to Hugging Face Hub...")
158
  status_log += "☁️ Uploading to Hugging Face Hub...\n"
159
- yield status_log, None, None
160
 
161
  upload_status = []
162
 
@@ -208,7 +666,7 @@ def train_model(dataset_file, progress=gr.Progress()):
208
  else:
209
  status_log += "⚠️ Warning: No files were uploaded to Hub\n\n"
210
 
211
- yield status_log, None, None
212
 
213
  # Step 6: Copy to temp directory for download
214
  progress(0.95, desc="Preparing download files...")
@@ -236,13 +694,15 @@ def train_model(dataset_file, progress=gr.Progress()):
236
  status_log += "TRAINING COMPLETE - You can now download the model files\n"
237
  status_log += "=" * 50
238
 
239
- yield status_log, model_download, vocab_download
 
240
 
241
  except Exception as e:
242
  error_msg = f"❌ Unexpected error: {str(e)}\n"
243
  import traceback
244
  error_msg += f"\nTraceback:\n{traceback.format_exc()}"
245
- yield error_msg, None, None
 
246
 
247
 
248
  def download_models_from_hub():
@@ -251,7 +711,7 @@ def download_models_from_hub():
251
  os.makedirs(OUTPUT_DIR, exist_ok=True)
252
 
253
  api = HfApi()
254
- #files = api.list_repo_files(REPO_ID, token=HF_TOKEN)
255
  files = api.list_repo_files(REPO_ID)
256
 
257
  downloaded_files = []
@@ -268,7 +728,7 @@ def download_models_from_hub():
268
  )
269
  downloaded_files.append(MODEL_FILE)
270
  else:
271
- return f"❌ {MODEL_FILE} not found in repository", None, None
272
 
273
  # Download vocab
274
  if VOCAB_FILE in files:
@@ -282,7 +742,7 @@ def download_models_from_hub():
282
  )
283
  downloaded_files.append(VOCAB_FILE)
284
  else:
285
- return f"❌ {VOCAB_FILE} not found in repository", None, None
286
 
287
  # Copy to temp for download
288
  temp_dir = tempfile.mkdtemp()
@@ -297,7 +757,8 @@ def download_models_from_hub():
297
  success_msg += f" β€’ {VOCAB_FILE}\n\n"
298
  success_msg += "πŸ“¦ Files are ready to download below!"
299
 
300
- return success_msg, temp_model, temp_vocab
 
301
 
302
  except Exception as e:
303
  error_msg = f"❌ Error downloading models: {str(e)}\n\n"
@@ -305,7 +766,89 @@ def download_models_from_hub():
305
  error_msg += f"1. REPO_ID is set correctly: {REPO_ID}\n"
306
  error_msg += f"2. HF_TOKEN is set in Space secrets\n"
307
  error_msg += f"3. Model files exist in the repository"
308
- return error_msg, None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
 
310
 
311
  # Create Gradio interface
@@ -328,6 +871,13 @@ with gr.Blocks(title="MCQ Structure Extraction - Model Training", theme=gr.theme
328
  """
329
  )
330
 
 
 
 
 
 
 
 
331
  with gr.Tab("πŸš€ Train New Model"):
332
  gr.Markdown(
333
  """
@@ -351,37 +901,58 @@ with gr.Blocks(title="MCQ Structure Extraction - Model Training", theme=gr.theme
351
  )
352
  train_button = gr.Button("πŸš€ Start Training", variant="primary", size="lg")
353
 
 
 
 
 
 
 
354
  with gr.Column():
355
  status_output = gr.Textbox(
356
- label="πŸ“Š Training Status",
357
  lines=12,
358
  interactive=False,
359
  show_copy_button=True
360
  )
361
 
362
- gr.Markdown("### πŸ“¦ Download Trained Models")
363
  with gr.Row():
364
- model_output = gr.File(label="πŸ’Ύ Model File (.pt)")
365
- vocab_output = gr.File(label="πŸ“š Vocabulary File (.pkl)")
 
366
 
 
 
367
  train_button.click(
368
  fn=train_model,
369
  inputs=[dataset_input],
370
- outputs=[status_output, model_output, vocab_output]
 
 
 
 
 
 
 
 
 
 
371
  )
372
 
 
 
 
 
 
 
 
 
373
  with gr.Tab("☁️ Download from Hub"):
374
  gr.Markdown(
375
  """
376
  ### Download Pre-trained Models
377
 
378
  Download the latest trained models directly from your Hugging Face repository.
379
- This is useful if:
380
- - You want to use pre-trained models without training
381
- - You need to download models trained in a previous session
382
- - You want to get the latest version from the Hub
383
-
384
- The downloaded files can be used for inference with your MCQ extraction pipeline.
385
  """
386
  )
387
 
@@ -396,58 +967,22 @@ with gr.Blocks(title="MCQ Structure Extraction - Model Training", theme=gr.theme
396
 
397
  gr.Markdown("### πŸ“¦ Downloaded Files")
398
  with gr.Row():
399
- hub_model_output = gr.File(label="πŸ’Ύ Model File (.pt)")
400
- hub_vocab_output = gr.File(label="πŸ“š Vocabulary File (.pkl)")
 
401
 
 
 
402
  download_button.click(
403
  fn=download_models_from_hub,
404
- outputs=[download_status, hub_model_output, hub_vocab_output]
405
  )
406
 
407
  gr.Markdown(
408
  """
409
  ---
410
  ### βš™οΈ Model Configuration:
411
-
412
- **Architecture:**
413
- - BiLSTM-CRF with spatial attention mechanism
414
- - Word embeddings + Character-level CNN
415
- - Bounding box encoding with MLP
416
- - Spatial & context feature extraction
417
- - Learnable positional embeddings
418
-
419
- **Features Used:**
420
- - Token text (word-level and character-level)
421
- - Bounding box coordinates (normalized)
422
- - Spatial features: vertical spacing, alignment, dimensions (11 features)
423
- - Context features: surrounding question/option markers (8 features)
424
-
425
- **Output Labels (13 total):**
426
- - Questions, Options, Answers, Images, Section Headings, Passages (BIO tagging)
427
-
428
- **Training Parameters:**
429
- - Batch Size: 8
430
- - Epochs: 10 (with early stopping after 10 epochs without improvement)
431
- - Learning Rate: 5e-4 (AdamW optimizer with OneCycleLR scheduler)
432
- - Hidden Size: 768
433
- - Total Parameters: ~15.6M
434
-
435
- **Hardware Requirements:**
436
- - GPU recommended for reasonable training speed
437
- - CPU training supported but significantly slower
438
-
439
- ---
440
-
441
-
442
-
443
- **Environment Variables Required:**
444
- - `SPACE_ID`: Your Hugging Face Space/Repo ID (auto-set in Spaces)
445
- - `HF_TOKEN`: Your Hugging Face write token (set as a secret)
446
-
447
- **Model Persistence:**
448
- - Models are automatically saved to `output_data/` directory
449
- - Best model is uploaded to Hugging Face Hub after each improvement
450
- - Training can be resumed from checkpoints
451
  """
452
  )
453
 
 
1
+ # import os
2
+ # import shutil
3
+ # import tempfile
4
+ # import gradio as gr
5
+ # from huggingface_hub import hf_hub_download, upload_file, HfApi
6
+ # import sys
7
+ #
8
+ # # Add current directory to path to import train_model
9
+ # sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
10
+ #
11
+ # # Configuration
12
+ # OUTPUT_DIR = "output_data"
13
+ # MODEL_FILE = "model_enhanced.pt"
14
+ # VOCAB_FILE = "vocabs_enhanced.pkl"
15
+ # CHECKPOINT_FILE = "checkpoint_enhanced.pt"
16
+ #
17
+ # # IMPORTANT: Update this with your actual Hugging Face repository ID
18
+ # REPO_ID = "heerjtdev/LSTM_CRF" # Replace with your repo ID
19
+ # # HF_TOKEN = os.environ.get("HF_TOKEN") # Set this as a secret in your Space settings
20
+ #
21
+ #
22
+ # def download_existing_models():
23
+ # """Download existing model files from the Hugging Face Hub if available."""
24
+ # try:
25
+ # api = HfApi()
26
+ # #files = api.list_repo_files(REPO_ID, token=HF_TOKEN)
27
+ # files = api.list_repo_files(REPO_ID)
28
+ #
29
+ # os.makedirs(OUTPUT_DIR, exist_ok=True)
30
+ #
31
+ # downloaded_files = []
32
+ #
33
+ # # Download model file
34
+ # if MODEL_FILE in files:
35
+ # print(f"πŸ“₯ Downloading {MODEL_FILE} from Hub...")
36
+ # model_path = hf_hub_download(
37
+ # repo_id=REPO_ID,
38
+ # filename=MODEL_FILE,
39
+ # # token=HF_TOKEN,
40
+ # local_dir=OUTPUT_DIR,
41
+ # force_download=True # Always get latest version
42
+ # )
43
+ # downloaded_files.append(MODEL_FILE)
44
+ # print(f"βœ… Downloaded {MODEL_FILE}")
45
+ #
46
+ # # Download vocab file
47
+ # if VOCAB_FILE in files:
48
+ # print(f"πŸ“₯ Downloading {VOCAB_FILE} from Hub...")
49
+ # vocab_path = hf_hub_download(
50
+ # repo_id=REPO_ID,
51
+ # filename=VOCAB_FILE,
52
+ # # token=HF_TOKEN,
53
+ # local_dir=OUTPUT_DIR,
54
+ # force_download=True # Always get latest version
55
+ # )
56
+ # downloaded_files.append(VOCAB_FILE)
57
+ # print(f"βœ… Downloaded {VOCAB_FILE}")
58
+ #
59
+ # # Download checkpoint file (optional, for resuming training)
60
+ # if CHECKPOINT_FILE in files:
61
+ # print(f"πŸ“₯ Downloading {CHECKPOINT_FILE} from Hub...")
62
+ # checkpoint_path = hf_hub_download(
63
+ # repo_id=REPO_ID,
64
+ # filename=CHECKPOINT_FILE,
65
+ # # token=HF_TOKEN,
66
+ # local_dir=OUTPUT_DIR,
67
+ # force_download=True
68
+ # )
69
+ # downloaded_files.append(CHECKPOINT_FILE)
70
+ # print(f"βœ… Downloaded {CHECKPOINT_FILE}")
71
+ #
72
+ # if downloaded_files:
73
+ # return f"βœ… Downloaded from Hub: {', '.join(downloaded_files)}"
74
+ # else:
75
+ # return "ℹ️ No existing model files found in repository. Starting fresh."
76
+ # except Exception as e:
77
+ # error_msg = f"⚠️ Could not download existing models: {str(e)}"
78
+ # print(error_msg)
79
+ # return error_msg
80
+ #
81
+ #
82
+ # def train_model(dataset_file, progress=gr.Progress()):
83
+ # """Train the model with the uploaded dataset."""
84
+ # if dataset_file is None:
85
+ # return "❌ Please upload a dataset file!", None, None
86
+ #
87
+ # try:
88
+ # # Step 1: Download existing models from Hub (if any) BEFORE training starts
89
+ # progress(0.05, desc="Checking Hugging Face Hub for existing models...")
90
+ # download_status = download_existing_models()
91
+ # status_log = f"{download_status}\n\n"
92
+ # yield status_log, None, None
93
+ #
94
+ # # Step 2: Save uploaded file
95
+ # progress(0.1, desc="Processing uploaded dataset...")
96
+ # dataset_path = dataset_file.name
97
+ # status_log += f"πŸ“‚ Dataset uploaded: {os.path.basename(dataset_path)}\n\n"
98
+ # yield status_log, None, None
99
+ #
100
+ # # Step 3: Import and run training
101
+ # progress(0.15, desc="Initializing training...")
102
+ # status_log += "πŸš€ Starting training...\n"
103
+ # status_log += "πŸ“Š This may take a while. Training progress will appear in the terminal.\n\n"
104
+ # yield status_log, None, None
105
+ #
106
+ # # Import the training module
107
+ # try:
108
+ # import train_model as tm
109
+ # print("=" * 80)
110
+ # print("TRAINING STARTED")
111
+ # print("=" * 80)
112
+ #
113
+ # # Run training - this will handle model loading internally
114
+ # progress(0.2, desc="Training in progress... (check terminal for details)")
115
+ # tm.train_from_json(dataset_path)
116
+ #
117
+ # print("=" * 80)
118
+ # print("TRAINING COMPLETED")
119
+ # print("=" * 80)
120
+ #
121
+ # status_log += "βœ… Training completed successfully!\n\n"
122
+ # yield status_log, None, None
123
+ #
124
+ # except ImportError as ie:
125
+ # error_msg = f"❌ Failed to import training module: {str(ie)}\n"
126
+ # error_msg += "Make sure train_model.py is in the same directory as app.py"
127
+ # yield status_log + error_msg, None, None
128
+ # return
129
+ # except Exception as train_error:
130
+ # error_msg = f"❌ Training failed with error:\n{str(train_error)}\n"
131
+ # yield status_log + error_msg, None, None
132
+ # return
133
+ #
134
+ # # Step 4: Verify files exist
135
+ # progress(0.85, desc="Verifying trained model files...")
136
+ # model_path = os.path.join(OUTPUT_DIR, MODEL_FILE)
137
+ # vocab_path = os.path.join(OUTPUT_DIR, VOCAB_FILE)
138
+ # checkpoint_path = os.path.join(OUTPUT_DIR, CHECKPOINT_FILE)
139
+ #
140
+ # files_exist = []
141
+ # if os.path.exists(model_path):
142
+ # files_exist.append(MODEL_FILE)
143
+ # if os.path.exists(vocab_path):
144
+ # files_exist.append(VOCAB_FILE)
145
+ #
146
+ # if not files_exist:
147
+ # error_msg = "❌ Error: Model files were not created. Check training logs."
148
+ # yield status_log + error_msg, None, None
149
+ # return
150
+ #
151
+ # status_log += f"βœ… Found trained files: {', '.join(files_exist)}\n\n"
152
+ # yield status_log, None, None
153
+ #
154
+ # # Step 5: Upload to Hub
155
+ # progress(0.9, desc="Uploading models to Hugging Face Hub...")
156
+ # status_log += "☁️ Uploading to Hugging Face Hub...\n"
157
+ # yield status_log, None, None
158
+ #
159
+ # upload_status = []
160
+ #
161
+ # if os.path.exists(model_path):
162
+ # try:
163
+ # upload_file(
164
+ # path_or_fileobj=model_path,
165
+ # path_in_repo=MODEL_FILE,
166
+ # repo_id=REPO_ID,
167
+ # # token=HF_TOKEN,
168
+ # commit_message="Update trained model"
169
+ # )
170
+ # upload_status.append(MODEL_FILE)
171
+ # print(f"βœ… Uploaded {MODEL_FILE} to Hub")
172
+ # except Exception as e:
173
+ # print(f"⚠️ Failed to upload {MODEL_FILE}: {e}")
174
+ #
175
+ # if os.path.exists(vocab_path):
176
+ # try:
177
+ # upload_file(
178
+ # path_or_fileobj=vocab_path,
179
+ # path_in_repo=VOCAB_FILE,
180
+ # repo_id=REPO_ID,
181
+ # # token=HF_TOKEN,
182
+ # commit_message="Update vocabulary"
183
+ # )
184
+ # upload_status.append(VOCAB_FILE)
185
+ # print(f"βœ… Uploaded {VOCAB_FILE} to Hub")
186
+ # except Exception as e:
187
+ # print(f"⚠️ Failed to upload {VOCAB_FILE}: {e}")
188
+ #
189
+ # # Also upload checkpoint for future resume capability
190
+ # if os.path.exists(checkpoint_path):
191
+ # try:
192
+ # upload_file(
193
+ # path_or_fileobj=checkpoint_path,
194
+ # path_in_repo=CHECKPOINT_FILE,
195
+ # repo_id=REPO_ID,
196
+ # # token=HF_TOKEN,
197
+ # commit_message="Update checkpoint"
198
+ # )
199
+ # upload_status.append(CHECKPOINT_FILE)
200
+ # print(f"βœ… Uploaded {CHECKPOINT_FILE} to Hub")
201
+ # except Exception as e:
202
+ # print(f"⚠️ Failed to upload {CHECKPOINT_FILE}: {e}")
203
+ #
204
+ # if upload_status:
205
+ # status_log += f"βœ… Uploaded to Hub: {', '.join(upload_status)}\n\n"
206
+ # else:
207
+ # status_log += "⚠️ Warning: No files were uploaded to Hub\n\n"
208
+ #
209
+ # yield status_log, None, None
210
+ #
211
+ # # Step 6: Copy to temp directory for download
212
+ # progress(0.95, desc="Preparing download files...")
213
+ # temp_dir = tempfile.mkdtemp()
214
+ #
215
+ # model_download = None
216
+ # vocab_download = None
217
+ #
218
+ # if os.path.exists(model_path):
219
+ # temp_model = os.path.join(temp_dir, MODEL_FILE)
220
+ # shutil.copy2(model_path, temp_model)
221
+ # model_download = temp_model
222
+ # print(f"πŸ“¦ Prepared {MODEL_FILE} for download")
223
+ #
224
+ # if os.path.exists(vocab_path):
225
+ # temp_vocab = os.path.join(temp_dir, VOCAB_FILE)
226
+ # shutil.copy2(vocab_path, temp_vocab)
227
+ # vocab_download = temp_vocab
228
+ # print(f"πŸ“¦ Prepared {VOCAB_FILE} for download")
229
+ #
230
+ # progress(1.0, desc="Complete!")
231
+ #
232
+ # status_log += "πŸ“¦ Files ready for download below!\n"
233
+ # status_log += "\n" + "=" * 50 + "\n"
234
+ # status_log += "TRAINING COMPLETE - You can now download the model files\n"
235
+ # status_log += "=" * 50
236
+ #
237
+ # yield status_log, model_download, vocab_download
238
+ #
239
+ # except Exception as e:
240
+ # error_msg = f"❌ Unexpected error: {str(e)}\n"
241
+ # import traceback
242
+ # error_msg += f"\nTraceback:\n{traceback.format_exc()}"
243
+ # yield error_msg, None, None
244
+ #
245
+ #
246
+ # def download_models_from_hub():
247
+ # """Download the latest models from the Hugging Face Hub."""
248
+ # try:
249
+ # os.makedirs(OUTPUT_DIR, exist_ok=True)
250
+ #
251
+ # api = HfApi()
252
+ # #files = api.list_repo_files(REPO_ID, token=HF_TOKEN)
253
+ # files = api.list_repo_files(REPO_ID)
254
+ #
255
+ # downloaded_files = []
256
+ #
257
+ # # Download model
258
+ # if MODEL_FILE in files:
259
+ # print(f"πŸ“₯ Downloading {MODEL_FILE} from Hub...")
260
+ # model_path = hf_hub_download(
261
+ # repo_id=REPO_ID,
262
+ # filename=MODEL_FILE,
263
+ # # token=HF_TOKEN,
264
+ # local_dir=OUTPUT_DIR,
265
+ # force_download=True
266
+ # )
267
+ # downloaded_files.append(MODEL_FILE)
268
+ # else:
269
+ # return f"❌ {MODEL_FILE} not found in repository", None, None
270
+ #
271
+ # # Download vocab
272
+ # if VOCAB_FILE in files:
273
+ # print(f"πŸ“₯ Downloading {VOCAB_FILE} from Hub...")
274
+ # vocab_path = hf_hub_download(
275
+ # repo_id=REPO_ID,
276
+ # filename=VOCAB_FILE,
277
+ # # token=HF_TOKEN,
278
+ # local_dir=OUTPUT_DIR,
279
+ # force_download=True
280
+ # )
281
+ # downloaded_files.append(VOCAB_FILE)
282
+ # else:
283
+ # return f"❌ {VOCAB_FILE} not found in repository", None, None
284
+ #
285
+ # # Copy to temp for download
286
+ # temp_dir = tempfile.mkdtemp()
287
+ # temp_model = os.path.join(temp_dir, MODEL_FILE)
288
+ # temp_vocab = os.path.join(temp_dir, VOCAB_FILE)
289
+ #
290
+ # shutil.copy2(os.path.join(OUTPUT_DIR, MODEL_FILE), temp_model)
291
+ # shutil.copy2(os.path.join(OUTPUT_DIR, VOCAB_FILE), temp_vocab)
292
+ #
293
+ # success_msg = f"βœ… Successfully downloaded from Hub:\n"
294
+ # success_msg += f" β€’ {MODEL_FILE}\n"
295
+ # success_msg += f" β€’ {VOCAB_FILE}\n\n"
296
+ # success_msg += "πŸ“¦ Files are ready to download below!"
297
+ #
298
+ # return success_msg, temp_model, temp_vocab
299
+ #
300
+ # except Exception as e:
301
+ # error_msg = f"❌ Error downloading models: {str(e)}\n\n"
302
+ # error_msg += f"Make sure:\n"
303
+ # error_msg += f"1. REPO_ID is set correctly: {REPO_ID}\n"
304
+ # error_msg += f"2. HF_TOKEN is set in Space secrets\n"
305
+ # error_msg += f"3. Model files exist in the repository"
306
+ # return error_msg, None, None
307
+ #
308
+ #
309
+ # # Create Gradio interface
310
+ # with gr.Blocks(title="MCQ Structure Extraction - Model Training", theme=gr.themes.Soft()) as demo:
311
+ # gr.Markdown(
312
+ # """
313
+ # # πŸŽ“ MCQ Structure Extraction - Model Training
314
+ #
315
+ # Train a BiLSTM-CRF model with deep layout understanding for extracting structured information from MCQ documents.
316
+ #
317
+ # ## πŸ“‹ Instructions:
318
+ # 1. **Upload Dataset**: Provide your unified JSON file containing tokens, bounding boxes, and labels
319
+ # 2. **Train Model**: Click "Start Training" and wait for completion (this may take a while)
320
+ # 3. **Download Models**: Once training is complete, download the trained model and vocabulary files
321
+ #
322
+ # ## πŸ“₯ Or Download Existing Models:
323
+ # If you just want to download the latest trained models from the repository, use the "Download from Hub" tab.
324
+ #
325
+ # ---
326
+ # """
327
+ # )
328
+ #
329
+ # with gr.Tab("πŸš€ Train New Model"):
330
+ # gr.Markdown(
331
+ # """
332
+ # ### Training Process:
333
+ # The app will automatically:
334
+ # 1. βœ… Download any existing models from Hugging Face Hub (for resuming training)
335
+ # 2. 🎯 Train the model on your uploaded dataset
336
+ # 3. ☁️ Upload the trained models back to the Hub
337
+ # 4. πŸ“₯ Provide download links for the trained files
338
+ #
339
+ # **Note**: Training progress details appear in the terminal/logs. The status box shows major milestones.
340
+ # """
341
+ # )
342
+ #
343
+ # with gr.Row():
344
+ # with gr.Column():
345
+ # dataset_input = gr.File(
346
+ # label="πŸ“‚ Upload Training Dataset (JSON)",
347
+ # file_types=[".json"],
348
+ # type="filepath"
349
+ # )
350
+ # train_button = gr.Button("πŸš€ Start Training", variant="primary", size="lg")
351
+ #
352
+ # with gr.Column():
353
+ # status_output = gr.Textbox(
354
+ # label="πŸ“Š Training Status",
355
+ # lines=12,
356
+ # interactive=False,
357
+ # show_copy_button=True
358
+ # )
359
+ #
360
+ # gr.Markdown("### πŸ“¦ Download Trained Models")
361
+ # with gr.Row():
362
+ # model_output = gr.File(label="πŸ’Ύ Model File (.pt)")
363
+ # vocab_output = gr.File(label="πŸ“š Vocabulary File (.pkl)")
364
+ #
365
+ # train_button.click(
366
+ # fn=train_model,
367
+ # inputs=[dataset_input],
368
+ # outputs=[status_output, model_output, vocab_output]
369
+ # )
370
+ #
371
+ # with gr.Tab("☁️ Download from Hub"):
372
+ # gr.Markdown(
373
+ # """
374
+ # ### Download Pre-trained Models
375
+ #
376
+ # Download the latest trained models directly from your Hugging Face repository.
377
+ # This is useful if:
378
+ # - You want to use pre-trained models without training
379
+ # - You need to download models trained in a previous session
380
+ # - You want to get the latest version from the Hub
381
+ #
382
+ # The downloaded files can be used for inference with your MCQ extraction pipeline.
383
+ # """
384
+ # )
385
+ #
386
+ # download_button = gr.Button("☁️ Download Latest Models from Hub", variant="primary", size="lg")
387
+ #
388
+ # download_status = gr.Textbox(
389
+ # label="Download Status",
390
+ # lines=6,
391
+ # interactive=False,
392
+ # show_copy_button=True
393
+ # )
394
+ #
395
+ # gr.Markdown("### πŸ“¦ Downloaded Files")
396
+ # with gr.Row():
397
+ # hub_model_output = gr.File(label="πŸ’Ύ Model File (.pt)")
398
+ # hub_vocab_output = gr.File(label="πŸ“š Vocabulary File (.pkl)")
399
+ #
400
+ # download_button.click(
401
+ # fn=download_models_from_hub,
402
+ # outputs=[download_status, hub_model_output, hub_vocab_output]
403
+ # )
404
+ #
405
+ # gr.Markdown(
406
+ # """
407
+ # ---
408
+ # ### βš™οΈ Model Configuration:
409
+ #
410
+ # **Architecture:**
411
+ # - BiLSTM-CRF with spatial attention mechanism
412
+ # - Word embeddings + Character-level CNN
413
+ # - Bounding box encoding with MLP
414
+ # - Spatial & context feature extraction
415
+ # - Learnable positional embeddings
416
+ #
417
+ # **Features Used:**
418
+ # - Token text (word-level and character-level)
419
+ # - Bounding box coordinates (normalized)
420
+ # - Spatial features: vertical spacing, alignment, dimensions (11 features)
421
+ # - Context features: surrounding question/option markers (8 features)
422
+ #
423
+ # **Output Labels (13 total):**
424
+ # - Questions, Options, Answers, Images, Section Headings, Passages (BIO tagging)
425
+ #
426
+ # **Training Parameters:**
427
+ # - Batch Size: 8
428
+ # - Epochs: 10 (with early stopping after 10 epochs without improvement)
429
+ # - Learning Rate: 5e-4 (AdamW optimizer with OneCycleLR scheduler)
430
+ # - Hidden Size: 768
431
+ # - Total Parameters: ~15.6M
432
+ #
433
+ # **Hardware Requirements:**
434
+ # - GPU recommended for reasonable training speed
435
+ # - CPU training supported but significantly slower
436
+ #
437
+ # ---
438
+ #
439
+ #
440
+ #
441
+ # **Environment Variables Required:**
442
+ # - `SPACE_ID`: Your Hugging Face Space/Repo ID (auto-set in Spaces)
443
+ # - `HF_TOKEN`: Your Hugging Face write token (set as a secret)
444
+ #
445
+ # **Model Persistence:**
446
+ # - Models are automatically saved to `output_data/` directory
447
+ # - Best model is uploaded to Hugging Face Hub after each improvement
448
+ # - Training can be resumed from checkpoints
449
+ # """
450
+ # )
451
+ #
452
+ # # Launch the app
453
+ # if __name__ == "__main__":
454
+ # demo.launch()
455
 
456
 
457
  import os
 
460
  import gradio as gr
461
  from huggingface_hub import hf_hub_download, upload_file, HfApi
462
  import sys
463
+ import glob
464
 
465
  # Add current directory to path to import train_model
466
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 
473
 
474
  # IMPORTANT: Update this with your actual Hugging Face repository ID
475
  REPO_ID = "heerjtdev/LSTM_CRF" # Replace with your repo ID
476
+
477
+
478
  # HF_TOKEN = os.environ.get("HF_TOKEN") # Set this as a secret in your Space settings
479
 
480
 
 
482
  """Download existing model files from the Hugging Face Hub if available."""
483
  try:
484
  api = HfApi()
485
+ # files = api.list_repo_files(REPO_ID, token=HF_TOKEN)
486
  files = api.list_repo_files(REPO_ID)
487
 
488
  os.makedirs(OUTPUT_DIR, exist_ok=True)
 
548
  progress(0.05, desc="Checking Hugging Face Hub for existing models...")
549
  download_status = download_existing_models()
550
  status_log = f"{download_status}\n\n"
551
+ # Reset download outputs before training starts
552
+ yield status_log, None, None, None, None
553
 
554
  # Step 2: Save uploaded file
555
  progress(0.1, desc="Processing uploaded dataset...")
556
  dataset_path = dataset_file.name
557
  status_log += f"πŸ“‚ Dataset uploaded: {os.path.basename(dataset_path)}\n\n"
558
+ yield status_log, None, None, None, None
559
 
560
  # Step 3: Import and run training
561
  progress(0.15, desc="Initializing training...")
562
  status_log += "πŸš€ Starting training...\n"
563
  status_log += "πŸ“Š This may take a while. Training progress will appear in the terminal.\n\n"
564
+ yield status_log, None, None, None, None
565
 
566
  # Import the training module
567
  try:
 
579
  print("=" * 80)
580
 
581
  status_log += "βœ… Training completed successfully!\n\n"
582
+ yield status_log, None, None, None, None
583
 
584
  except ImportError as ie:
585
  error_msg = f"❌ Failed to import training module: {str(ie)}\n"
586
  error_msg += "Make sure train_model.py is in the same directory as app.py"
587
+ yield status_log + error_msg, None, None, None, None
588
  return
589
  except Exception as train_error:
590
  error_msg = f"❌ Training failed with error:\n{str(train_error)}\n"
591
+ yield status_log + error_msg, None, None, None, None
592
  return
593
 
594
  # Step 4: Verify files exist
 
605
 
606
  if not files_exist:
607
  error_msg = "❌ Error: Model files were not created. Check training logs."
608
+ yield status_log + error_msg, None, None, None, None
609
  return
610
 
611
  status_log += f"βœ… Found trained files: {', '.join(files_exist)}\n\n"
612
+ yield status_log, None, None, None, None
613
 
614
  # Step 5: Upload to Hub
615
  progress(0.9, desc="Uploading models to Hugging Face Hub...")
616
  status_log += "☁️ Uploading to Hugging Face Hub...\n"
617
+ yield status_log, None, None, None, None
618
 
619
  upload_status = []
620
 
 
666
  else:
667
  status_log += "⚠️ Warning: No files were uploaded to Hub\n\n"
668
 
669
+ yield status_log, None, None, None, None
670
 
671
  # Step 6: Copy to temp directory for download
672
  progress(0.95, desc="Preparing download files...")
 
694
  status_log += "TRAINING COMPLETE - You can now download the model files\n"
695
  status_log += "=" * 50
696
 
697
+ # Note: We return the model_download and vocab_download twice for both sets of File outputs
698
+ yield status_log, model_download, vocab_download, model_download, vocab_download
699
 
700
  except Exception as e:
701
  error_msg = f"❌ Unexpected error: {str(e)}\n"
702
  import traceback
703
  error_msg += f"\nTraceback:\n{traceback.format_exc()}"
704
+ # Return Nones for all file outputs
705
+ yield error_msg, None, None, None, None
706
 
707
 
708
  def download_models_from_hub():
 
711
  os.makedirs(OUTPUT_DIR, exist_ok=True)
712
 
713
  api = HfApi()
714
+ # files = api.list_repo_files(REPO_ID, token=HF_TOKEN)
715
  files = api.list_repo_files(REPO_ID)
716
 
717
  downloaded_files = []
 
728
  )
729
  downloaded_files.append(MODEL_FILE)
730
  else:
731
+ return f"❌ {MODEL_FILE} not found in repository", None, None, None, None
732
 
733
  # Download vocab
734
  if VOCAB_FILE in files:
 
742
  )
743
  downloaded_files.append(VOCAB_FILE)
744
  else:
745
+ return f"❌ {VOCAB_FILE} not found in repository", None, None, None, None
746
 
747
  # Copy to temp for download
748
  temp_dir = tempfile.mkdtemp()
 
757
  success_msg += f" β€’ {VOCAB_FILE}\n\n"
758
  success_msg += "πŸ“¦ Files are ready to download below!"
759
 
760
+ # Return the downloaded files for both sets of file outputs
761
+ return success_msg, temp_model, temp_vocab, temp_model, temp_vocab
762
 
763
  except Exception as e:
764
  error_msg = f"❌ Error downloading models: {str(e)}\n\n"
 
766
  error_msg += f"1. REPO_ID is set correctly: {REPO_ID}\n"
767
  error_msg += f"2. HF_TOKEN is set in Space secrets\n"
768
  error_msg += f"3. Model files exist in the repository"
769
+ return error_msg, None, None, None, None
770
+
771
+
772
+ # --- UPDATED check_local_files FUNCTION ---
773
+
774
def check_local_files():
    """
    Check and report the files present in the local output directory.

    If the core model/vocab files exist, their paths are also returned so the
    Gradio UI can populate the two download File components.

    Returns:
        tuple: (status_message: str,
                model_file_path: str | None,
                vocab_file_path: str | None)
    """
    if not os.path.exists(OUTPUT_DIR):
        return f"ℹ️ Directory **'{OUTPUT_DIR}'** does not exist.", None, None

    all_files = os.listdir(OUTPUT_DIR)

    model_path = os.path.join(OUTPUT_DIR, MODEL_FILE)
    vocab_path = os.path.join(OUTPUT_DIR, VOCAB_FILE)

    # 1. Prepare download paths if the core files exist.
    model_download = model_path if os.path.exists(model_path) else None
    vocab_download = vocab_path if os.path.exists(vocab_path) else None

    # 2. Generate the status message.
    if not all_files:
        return f"ℹ️ Directory **'{OUTPUT_DIR}'** is empty.", None, None

    file_list = []
    total_size = 0

    # Sort so the core files (model, vocab, checkpoint) are listed first,
    # then everything else alphabetically.
    sorted_files = sorted(
        all_files,
        key=lambda x: (x != MODEL_FILE, x != VOCAB_FILE, x != CHECKPOINT_FILE, x),
    )

    for filename in sorted_files:
        filepath = os.path.join(OUTPUT_DIR, filename)
        if os.path.isfile(filepath):
            size_bytes = os.path.getsize(filepath)
            total_size += size_bytes

            # Human-readable per-file size (MB / KB / bytes).
            if size_bytes > 1024 * 1024:
                size_str = f"{size_bytes / (1024 * 1024):.2f} MB"
            elif size_bytes > 1024:
                size_str = f"{size_bytes / 1024:.2f} KB"
            else:
                size_str = f"{size_bytes} bytes"

            # BUG FIX: report the actual filename — the previous version
            # appended a hard-coded "(unknown)" placeholder instead.
            file_list.append(f"β€’ **{filename}** (Size: {size_str})")

    # Human-readable total size (GB / MB / KB).
    if total_size > 1024 * 1024 * 1024:
        total_size_str = f"{total_size / (1024 * 1024 * 1024):.2f} GB"
    elif total_size > 1024 * 1024:
        total_size_str = f"{total_size / (1024 * 1024):.2f} MB"
    else:
        total_size_str = f"{total_size / 1024:.2f} KB"

    header = f"βœ… Contents of **'{OUTPUT_DIR}'** ({len(file_list)} files, Total Size: {total_size_str}):\n"
    if model_download and vocab_download:
        header += "\n**πŸ“¦ Core model files found! Ready for download below.**"
    elif model_download or vocab_download:
        header += "\n**⚠️ Found some model files, but not both.**"

    return header + "\n" + "\n".join(file_list), model_download, vocab_download
837
+
838
+
839
def clear_local_memory():
    """
    Delete the local output directory (and everything inside it).

    Returns a 3-tuple (status_message, None, None); the two trailing Nones
    clear the download File components in the UI.
    """
    # Nothing to do if the directory was never created.
    if not os.path.exists(OUTPUT_DIR):
        return (
            f"ℹ️ Local directory **'{OUTPUT_DIR}'** does not exist. No memory to clear.",
            None,
            None,
        )

    try:
        shutil.rmtree(OUTPUT_DIR)
    except Exception as e:
        return (
            f"❌ Error clearing memory (deleting '{OUTPUT_DIR}'): {str(e)}",
            None,
            None,
        )

    return (
        f"πŸ—‘οΈ Successfully deleted local directory **'{OUTPUT_DIR}'** and all its contents. Memory cleared.",
        None,
        None,
    )
849
+
850
+
851
+ # --- END NEW FUNCTIONS ---
852
 
853
 
854
  # Create Gradio interface
 
871
  """
872
  )
873
 
874
+ # Define common File components for outputs
875
+ download_model_output = gr.File(label="πŸ’Ύ Model File (.pt)", interactive=False)
876
+ download_vocab_output = gr.File(label="πŸ“š Vocabulary File (.pkl)", interactive=False)
877
+
878
+ # We need a dummy set of outputs to clear the download boxes when starting training,
879
+ # and a permanent set for the utility functions. We'll use the permanent ones below.
880
+
881
  with gr.Tab("πŸš€ Train New Model"):
882
  gr.Markdown(
883
  """
 
901
  )
902
  train_button = gr.Button("πŸš€ Start Training", variant="primary", size="lg")
903
 
904
+ # --- NEW BUTTONS for utility ---
905
+ with gr.Row():
906
+ check_button = gr.Button("πŸ”Ž Check Local Models", variant="secondary")
907
+ clear_button = gr.Button("🧹 Clear Local Memory", variant="stop")
908
+ # ------------------------------
909
+
910
  with gr.Column():
911
  status_output = gr.Textbox(
912
+ label="πŸ“Š Training/Utility Status",
913
  lines=12,
914
  interactive=False,
915
  show_copy_button=True
916
  )
917
 
918
+ gr.Markdown("### πŸ“¦ Download Trained/Local Models")
919
  with gr.Row():
920
+ # Use the defined components for the training output
921
+ train_model_output = download_model_output
922
+ train_vocab_output = download_vocab_output
923
 
924
+ # Note: The train_model function now returns 5 values (status, model_file, vocab_file, model_file_again, vocab_file_again)
925
+ # We target the two download outputs directly for the final model and vocab files.
926
  train_button.click(
927
  fn=train_model,
928
  inputs=[dataset_input],
929
+ outputs=[status_output, train_model_output, train_vocab_output, download_model_output,
930
+ download_vocab_output]
931
+ )
932
+
933
+ # --- NEW BUTTON ACTIONS ---
934
+ # check_local_files now returns (status, model_download_path, vocab_download_path)
935
+ # We target the status output AND the two global download outputs
936
+ check_button.click(
937
+ fn=check_local_files,
938
+ inputs=[],
939
+ outputs=[status_output, download_model_output, download_vocab_output]
940
  )
941
 
942
+ # clear_local_memory now returns (status, None, None) to clear the download boxes
943
+ clear_button.click(
944
+ fn=clear_local_memory,
945
+ inputs=[],
946
+ outputs=[status_output, download_model_output, download_vocab_output]
947
+ )
948
+ # --------------------------
949
+
950
  with gr.Tab("☁️ Download from Hub"):
951
  gr.Markdown(
952
  """
953
  ### Download Pre-trained Models
954
 
955
  Download the latest trained models directly from your Hugging Face repository.
 
 
 
 
 
 
956
  """
957
  )
958
 
 
967
 
968
  gr.Markdown("### πŸ“¦ Downloaded Files")
969
  with gr.Row():
970
+ # Use the defined components for the Hub output
971
+ hub_model_output = download_model_output
972
+ hub_vocab_output = download_vocab_output
973
 
974
+ # Note: The download_models_from_hub function now returns 5 values (status, model_file, vocab_file, model_file_again, vocab_file_again)
975
+ # We target the two download outputs directly for the final model and vocab files.
976
  download_button.click(
977
  fn=download_models_from_hub,
978
+ outputs=[download_status, hub_model_output, hub_vocab_output, download_model_output, download_vocab_output]
979
  )
980
 
981
  gr.Markdown(
982
  """
983
  ---
984
  ### βš™οΈ Model Configuration:
985
+ ... (rest of the markdown)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
986
  """
987
  )
988