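"""Gradio Space for training a BiLSTM-CRF MCQ structure-extraction model.

The app pulls any existing model/vocab/checkpoint files from the Hugging Face Hub,
trains (or resumes training) on an uploaded unified JSON dataset via
train_model.train_from_json, pushes the updated artifacts back to the Hub, and
exposes them for download.
"""
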
import os
import shutil
import tempfile
import gradio as gr
from huggingface_hub import hf_hub_download, upload_file, HfApi
import sys
import glob

# Add current directory to path to import train_model
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Configuration
OUTPUT_DIR = "output_data"
MODEL_FILE = "model_enhanced.pt"
VOCAB_FILE = "vocabs_enhanced.pkl"
CHECKPOINT_FILE = "checkpoint_enhanced.pt"

# IMPORTANT: Update this with your actual Hugging Face repository ID
REPO_ID = "heerjtdev/LSTM_CRF"  # Replace with your repo ID

# HF_TOKEN = os.environ.get("HF_TOKEN")  # Set this as a secret in your Space settings


def download_existing_models():
    """Download existing model files from the Hugging Face Hub if available."""
    try:
        api = HfApi()
        # files = api.list_repo_files(REPO_ID, token=HF_TOKEN)
        files = api.list_repo_files(REPO_ID)

        os.makedirs(OUTPUT_DIR, exist_ok=True)

        downloaded_files = []

        # Download model file
        if MODEL_FILE in files:
            print(f"📥 Downloading {MODEL_FILE} from Hub...")
            model_path = hf_hub_download(
                repo_id=REPO_ID,
                filename=MODEL_FILE,
                # token=HF_TOKEN,
                local_dir=OUTPUT_DIR,
                force_download=True  # Always get latest version
            )
            downloaded_files.append(MODEL_FILE)
            print(f"✅ Downloaded {MODEL_FILE}")

        # Download vocab file
        if VOCAB_FILE in files:
            print(f"📥 Downloading {VOCAB_FILE} from Hub...")
            vocab_path = hf_hub_download(
                repo_id=REPO_ID,
                filename=VOCAB_FILE,
                # token=HF_TOKEN,
                local_dir=OUTPUT_DIR,
                force_download=True  # Always get latest version
            )
            downloaded_files.append(VOCAB_FILE)
            print(f"✅ Downloaded {VOCAB_FILE}")

        # Download checkpoint file (optional, for resuming training)
        if CHECKPOINT_FILE in files:
            print(f"📥 Downloading {CHECKPOINT_FILE} from Hub...")
            checkpoint_path = hf_hub_download(
                repo_id=REPO_ID,
                filename=CHECKPOINT_FILE,
                # token=HF_TOKEN,
                local_dir=OUTPUT_DIR,
                force_download=True
            )
            downloaded_files.append(CHECKPOINT_FILE)
            print(f"✅ Downloaded {CHECKPOINT_FILE}")

        if downloaded_files:
            return f"✅ Downloaded from Hub: {', '.join(downloaded_files)}"
        else:
            return "ℹ️ No existing model files found in repository. Starting fresh."
    except Exception as e:
        error_msg = f"⚠️ Could not download existing models: {str(e)}"
        print(error_msg)
        return error_msg


def train_model(dataset_file, progress=gr.Progress()):
    """Train the model with the uploaded dataset."""
    if dataset_file is None:
        # This function is a generator, so the error must be yielded:
        # a plain `return value` in a generator never reaches the UI.
        yield "❌ Please upload a dataset file!", None, None, None, None
        return

    try:
        # Step 1: Download existing models from Hub (if any) BEFORE training starts
        progress(0.05, desc="Checking Hugging Face Hub for existing models...")
        download_status = download_existing_models()
        status_log = f"{download_status}\n\n"
        # Reset download outputs before training starts
        yield status_log, None, None, None, None

        # Step 2: Save uploaded file
        progress(0.1, desc="Processing uploaded dataset...")
        # With type="filepath" Gradio passes the path as a str; older versions pass
        # a tempfile wrapper with a .name attribute, so handle both.
        dataset_path = dataset_file if isinstance(dataset_file, str) else dataset_file.name
        status_log += f"📂 Dataset uploaded: {os.path.basename(dataset_path)}\n\n"
        yield status_log, None, None, None, None

        # Step 3: Import and run training
        progress(0.15, desc="Initializing training...")
        status_log += "🚀 Starting training...\n"
        status_log += "📊 This may take a while. Training progress will appear in the terminal.\n\n"
        yield status_log, None, None, None, None

        # Import the training module
        try:
            import train_model as tm
            print("=" * 80)
            print("TRAINING STARTED")
            print("=" * 80)
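            # Assumption about the training module's contract: train_from_json(dataset_path)
            # is expected to read the unified JSON dataset, resume from
            # OUTPUT_DIR/CHECKPOINT_FILE when one was downloaded above, and write
            # MODEL_FILE, VOCAB_FILE and an updated checkpoint into OUTPUT_DIR.
            # Adjust the verification/upload steps below if your train_model.py differs.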
            # Run training - this will handle model loading internally
            progress(0.2, desc="Training in progress... (check terminal for details)")
            tm.train_from_json(dataset_path)

            print("=" * 80)
            print("TRAINING COMPLETED")
            print("=" * 80)

            status_log += "✅ Training completed successfully!\n\n"
            yield status_log, None, None, None, None

        except ImportError as ie:
            error_msg = f"❌ Failed to import training module: {str(ie)}\n"
            error_msg += "Make sure train_model.py is in the same directory as app.py"
            yield status_log + error_msg, None, None, None, None
            return
        except Exception as train_error:
            error_msg = f"❌ Training failed with error:\n{str(train_error)}\n"
            yield status_log + error_msg, None, None, None, None
            return

        # Step 4: Verify files exist
        progress(0.85, desc="Verifying trained model files...")
        model_path = os.path.join(OUTPUT_DIR, MODEL_FILE)
        vocab_path = os.path.join(OUTPUT_DIR, VOCAB_FILE)
        checkpoint_path = os.path.join(OUTPUT_DIR, CHECKPOINT_FILE)

        files_exist = []
        if os.path.exists(model_path):
            files_exist.append(MODEL_FILE)
        if os.path.exists(vocab_path):
            files_exist.append(VOCAB_FILE)

        if not files_exist:
            error_msg = "❌ Error: Model files were not created. Check training logs."
            yield status_log + error_msg, None, None, None, None
            return

        status_log += f"✅ Found trained files: {', '.join(files_exist)}\n\n"
        yield status_log, None, None, None, None

        # Step 5: Upload to Hub
        progress(0.9, desc="Uploading models to Hugging Face Hub...")
        status_log += "☁️ Uploading to Hugging Face Hub...\n"
        yield status_log, None, None, None, None

        upload_status = []

        if os.path.exists(model_path):
            try:
                upload_file(
                    path_or_fileobj=model_path,
                    path_in_repo=MODEL_FILE,
                    repo_id=REPO_ID,
                    # token=HF_TOKEN,
                    commit_message="Update trained model"
                )
                upload_status.append(MODEL_FILE)
                print(f"✅ Uploaded {MODEL_FILE} to Hub")
            except Exception as e:
                print(f"⚠️ Failed to upload {MODEL_FILE}: {e}")

        if os.path.exists(vocab_path):
            try:
                upload_file(
                    path_or_fileobj=vocab_path,
                    path_in_repo=VOCAB_FILE,
                    repo_id=REPO_ID,
                    # token=HF_TOKEN,
                    commit_message="Update vocabulary"
                )
                upload_status.append(VOCAB_FILE)
                print(f"✅ Uploaded {VOCAB_FILE} to Hub")
            except Exception as e:
                print(f"⚠️ Failed to upload {VOCAB_FILE}: {e}")

        # Also upload checkpoint for future resume capability
        if os.path.exists(checkpoint_path):
            try:
                upload_file(
                    path_or_fileobj=checkpoint_path,
                    path_in_repo=CHECKPOINT_FILE,
                    repo_id=REPO_ID,
                    # token=HF_TOKEN,
                    commit_message="Update checkpoint"
                )
                upload_status.append(CHECKPOINT_FILE)
                print(f"✅ Uploaded {CHECKPOINT_FILE} to Hub")
            except Exception as e:
                print(f"⚠️ Failed to upload {CHECKPOINT_FILE}: {e}")

        if upload_status:
            status_log += f"✅ Uploaded to Hub: {', '.join(upload_status)}\n\n"
        else:
            status_log += "⚠️ Warning: No files were uploaded to Hub\n\n"

        yield status_log, None, None, None, None

        # Step 6: Copy to temp directory for download
        progress(0.95, desc="Preparing download files...")
        temp_dir = tempfile.mkdtemp()

        model_download = None
        vocab_download = None

        if os.path.exists(model_path):
            temp_model = os.path.join(temp_dir, MODEL_FILE)
            shutil.copy2(model_path, temp_model)
            model_download = temp_model
            print(f"📦 Prepared {MODEL_FILE} for download")

        if os.path.exists(vocab_path):
            temp_vocab = os.path.join(temp_dir, VOCAB_FILE)
            shutil.copy2(vocab_path, temp_vocab)
            vocab_download = temp_vocab
            print(f"📦 Prepared {VOCAB_FILE} for download")

        progress(1.0, desc="Complete!")

        status_log += "📦 Files ready for download below!\n"
        status_log += "\n" + "=" * 50 + "\n"
        status_log += "TRAINING COMPLETE - You can now download the model files\n"
        status_log += "=" * 50
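        # The files were copied into a fresh temp dir above (rather than served
        # straight from output_data/), presumably so the download links survive a
        # later "Clear Local Memory". Because this function is a generator, Gradio
        # streams every yield to the UI; the final yield below is the one that
        # actually fills the File outputs.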
        # Note: model_download and vocab_download are yielded twice, once for each
        # pair of File outputs wired to this event.
        yield status_log, model_download, vocab_download, model_download, vocab_download

    except Exception as e:
        error_msg = f"❌ Unexpected error: {str(e)}\n"
        import traceback
        error_msg += f"\nTraceback:\n{traceback.format_exc()}"
        # Return Nones for all file outputs
        yield error_msg, None, None, None, None


def download_models_from_hub():
    """Download the latest models from the Hugging Face Hub."""
    try:
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        api = HfApi()
        # files = api.list_repo_files(REPO_ID, token=HF_TOKEN)
        files = api.list_repo_files(REPO_ID)

        downloaded_files = []

        # Download model
        if MODEL_FILE in files:
            print(f"📥 Downloading {MODEL_FILE} from Hub...")
            model_path = hf_hub_download(
                repo_id=REPO_ID,
                filename=MODEL_FILE,
                # token=HF_TOKEN,
                local_dir=OUTPUT_DIR,
                force_download=True
            )
            downloaded_files.append(MODEL_FILE)
        else:
            return f"❌ {MODEL_FILE} not found in repository", None, None, None, None

        # Download vocab
        if VOCAB_FILE in files:
            print(f"📥 Downloading {VOCAB_FILE} from Hub...")
            vocab_path = hf_hub_download(
                repo_id=REPO_ID,
                filename=VOCAB_FILE,
                # token=HF_TOKEN,
                local_dir=OUTPUT_DIR,
                force_download=True
            )
            downloaded_files.append(VOCAB_FILE)
        else:
            return f"❌ {VOCAB_FILE} not found in repository", None, None, None, None

        # Copy to temp for download
        temp_dir = tempfile.mkdtemp()
        temp_model = os.path.join(temp_dir, MODEL_FILE)
        temp_vocab = os.path.join(temp_dir, VOCAB_FILE)

        shutil.copy2(os.path.join(OUTPUT_DIR, MODEL_FILE), temp_model)
        shutil.copy2(os.path.join(OUTPUT_DIR, VOCAB_FILE), temp_vocab)

        success_msg = "✅ Successfully downloaded from Hub:\n"
        success_msg += f"  • {MODEL_FILE}\n"
        success_msg += f"  • {VOCAB_FILE}\n\n"
        success_msg += "📦 Files are ready to download below!"

        # Return the downloaded files for both sets of file outputs
        return success_msg, temp_model, temp_vocab, temp_model, temp_vocab

    except Exception as e:
        error_msg = f"❌ Error downloading models: {str(e)}\n\n"
        error_msg += "Make sure:\n"
        error_msg += f"1. REPO_ID is set correctly: {REPO_ID}\n"
        error_msg += "2. HF_TOKEN is set in Space secrets\n"
        error_msg += "3. Model files exist in the repository"
        return error_msg, None, None, None, None


# --- Local file utilities ---
def check_local_files():
    """
    Checks and reports the files present in the local output directory.
    If core model files exist, it prepares and returns them for download.
    """
    if not os.path.exists(OUTPUT_DIR):
        return f"ℹ️ Directory **'{OUTPUT_DIR}'** does not exist.", None, None

    all_files = os.listdir(OUTPUT_DIR)
    model_path = os.path.join(OUTPUT_DIR, MODEL_FILE)
    vocab_path = os.path.join(OUTPUT_DIR, VOCAB_FILE)

    model_download = None
    vocab_download = None

    # 1. Prepare download paths if files exist
    if os.path.exists(model_path):
        model_download = model_path
    if os.path.exists(vocab_path):
        vocab_download = vocab_path
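    # Unlike train_model, these download paths point directly into OUTPUT_DIR
    # (no temp copy), so running "Clear Local Memory" afterwards will invalidate
    # the download links.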

    # 2. Generate status message
    if not all_files:
        return f"ℹ️ Directory **'{OUTPUT_DIR}'** is empty.", None, None

    file_list = []
    total_size = 0

    # Sort files to put core files first
    sorted_files = sorted(all_files, key=lambda x: (x != MODEL_FILE, x != VOCAB_FILE, x != CHECKPOINT_FILE, x))

    for filename in sorted_files:
        filepath = os.path.join(OUTPUT_DIR, filename)
        if os.path.isfile(filepath):
            size_bytes = os.path.getsize(filepath)
            total_size += size_bytes

            # Simple size formatting
            if size_bytes > 1024 * 1024:
                size_str = f"{size_bytes / (1024 * 1024):.2f} MB"
            elif size_bytes > 1024:
                size_str = f"{size_bytes / 1024:.2f} KB"
            else:
                size_str = f"{size_bytes} bytes"

            file_list.append(f"• **{filename}** (Size: {size_str})")

    # Format total size
    if total_size > 1024 * 1024 * 1024:
        total_size_str = f"{total_size / (1024 * 1024 * 1024):.2f} GB"
    elif total_size > 1024 * 1024:
        total_size_str = f"{total_size / (1024 * 1024):.2f} MB"
    else:
        total_size_str = f"{total_size / 1024:.2f} KB"

    header = f"✅ Contents of **'{OUTPUT_DIR}'** ({len(file_list)} files, Total Size: {total_size_str}):\n"

    if model_download and vocab_download:
        header += "\n**📦 Core model files found! Ready for download below.**"
    elif model_download or vocab_download:
        header += "\n**⚠️ Found some model files, but not both.**"

    return header + "\n" + "\n".join(file_list), model_download, vocab_download


def clear_local_memory():
    """Deletes the local output directory and its contents."""
    if os.path.exists(OUTPUT_DIR):
        try:
            shutil.rmtree(OUTPUT_DIR)
            return f"🗑️ Successfully deleted local directory **'{OUTPUT_DIR}'** and all its contents. Memory cleared.", None, None
        except Exception as e:
            return f"❌ Error clearing memory (deleting '{OUTPUT_DIR}'): {str(e)}", None, None
    else:
        return f"ℹ️ Local directory **'{OUTPUT_DIR}'** does not exist. No memory to clear.", None, None
# --- end local file utilities ---


# Create Gradio interface
with gr.Blocks(title="MCQ Structure Extraction - Model Training", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎓 MCQ Structure Extraction - Model Training

        Train a BiLSTM-CRF model with deep layout understanding for extracting structured information from MCQ documents.

        ## 📋 Instructions:
        1. **Upload Dataset**: Provide your unified JSON file containing tokens, bounding boxes, and labels
        2. **Train Model**: Click "Start Training" and wait for completion (this may take a while)
        3. **Download Models**: Once training is complete, download the trained model and vocabulary files

        ## 📥 Or Download Existing Models:
        If you just want to download the latest trained models from the repository, use the "Download from Hub" tab.

        ---
        """
    )

    # Shared File components used as download outputs. They are rendered here,
    # above the tabs, and reused as outputs by the training, Hub-download, and
    # local-utility events below.
    download_model_output = gr.File(label="💾 Model File (.pt)", interactive=False)
    download_vocab_output = gr.File(label="📚 Vocabulary File (.pkl)", interactive=False)

    with gr.Tab("🚀 Train New Model"):
        gr.Markdown(
            """
            ### Training Process:
            The app will automatically:
            1. ✅ Download any existing models from Hugging Face Hub (for resuming training)
            2. 🎯 Train the model on your uploaded dataset
            3. ☁️ Upload the trained models back to the Hub
            4. 📥 Provide download links for the trained files

            **Note**: Training progress details appear in the terminal/logs. The status box shows major milestones.
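
            **Dataset format (illustrative only):** the exact schema is whatever
            `train_model.train_from_json` expects; a record is assumed to look roughly
            like this, with tokens, normalized bounding boxes, and BIO labels aligned
            by position:

            ```python
            {
                "tokens": ["1.", "What", "is", "the", "capital", "...?"],
                "bboxes": [[0.05, 0.10, 0.08, 0.12], ...],  # [x0, y0, x1, y1], normalized
                "labels": ["B-QUESTION", "I-QUESTION", "I-QUESTION", ...],
            }
            ```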
""" ) with gr.Row(): with gr.Column(): dataset_input = gr.File( label="๐Ÿ“‚ Upload Training Dataset (JSON)", file_types=[".json"], type="filepath" ) train_button = gr.Button("๐Ÿš€ Start Training", variant="primary", size="lg") # --- NEW BUTTONS for utility --- with gr.Row(): check_button = gr.Button("๐Ÿ”Ž Check Local Models", variant="secondary") clear_button = gr.Button("๐Ÿงน Clear Local Memory", variant="stop") # ------------------------------ with gr.Column(): status_output = gr.Textbox( label="๐Ÿ“Š Training/Utility Status", lines=12, interactive=False, show_copy_button=True ) gr.Markdown("### ๐Ÿ“ฆ Download Trained/Local Models") with gr.Row(): # Use the defined components for the training output train_model_output = download_model_output train_vocab_output = download_vocab_output # Note: The train_model function now returns 5 values (status, model_file, vocab_file, model_file_again, vocab_file_again) # We target the two download outputs directly for the final model and vocab files. train_button.click( fn=train_model, inputs=[dataset_input], outputs=[status_output, train_model_output, train_vocab_output, download_model_output, download_vocab_output] ) # --- NEW BUTTON ACTIONS --- # check_local_files now returns (status, model_download_path, vocab_download_path) # We target the status output AND the two global download outputs check_button.click( fn=check_local_files, inputs=[], outputs=[status_output, download_model_output, download_vocab_output] ) # clear_local_memory now returns (status, None, None) to clear the download boxes clear_button.click( fn=clear_local_memory, inputs=[], outputs=[status_output, download_model_output, download_vocab_output] ) # -------------------------- with gr.Tab("โ˜๏ธ Download from Hub"): gr.Markdown( """ ### Download Pre-trained Models Download the latest trained models directly from your Hugging Face repository. """ ) download_button = gr.Button("โ˜๏ธ Download Latest Models from Hub", variant="primary", size="lg") download_status = gr.Textbox( label="Download Status", lines=6, interactive=False, show_copy_button=True ) gr.Markdown("### ๐Ÿ“ฆ Downloaded Files") with gr.Row(): # Use the defined components for the Hub output hub_model_output = download_model_output hub_vocab_output = download_vocab_output # Note: The download_models_from_hub function now returns 5 values (status, model_file, vocab_file, model_file_again, vocab_file_again) # We target the two download outputs directly for the final model and vocab files. download_button.click( fn=download_models_from_hub, outputs=[download_status, hub_model_output, hub_vocab_output, download_model_output, download_vocab_output] ) gr.Markdown( """ --- ### โš™๏ธ Model Configuration: ... (rest of the markdown) """ ) # Launch the app if __name__ == "__main__": demo.launch()