aagamjtdev commited on
Commit
f17ce9b
·
1 Parent(s): 9203994

add download button

Browse files
Files changed (1) hide show
  1. app.py +607 -72
app.py CHANGED
@@ -1,3 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
 
3
  import os
@@ -6,6 +460,7 @@ import tempfile
6
  import gradio as gr
7
  from huggingface_hub import hf_hub_download, upload_file, HfApi
8
  import sys
 
9
 
10
  # Add current directory to path to import train_model
11
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
@@ -18,6 +473,8 @@ CHECKPOINT_FILE = "checkpoint_enhanced.pt"
18
 
19
  # IMPORTANT: Update this with your actual Hugging Face repository ID
20
  REPO_ID = "heerjtdev/LSTM_CRF" # Replace with your repo ID
 
 
21
  # HF_TOKEN = os.environ.get("HF_TOKEN") # Set this as a secret in your Space settings
22
 
23
 
@@ -25,7 +482,7 @@ def download_existing_models():
25
  """Download existing model files from the Hugging Face Hub if available."""
26
  try:
27
  api = HfApi()
28
- #files = api.list_repo_files(REPO_ID, token=HF_TOKEN)
29
  files = api.list_repo_files(REPO_ID)
30
 
31
  os.makedirs(OUTPUT_DIR, exist_ok=True)
@@ -91,19 +548,20 @@ def train_model(dataset_file, progress=gr.Progress()):
91
  progress(0.05, desc="Checking Hugging Face Hub for existing models...")
92
  download_status = download_existing_models()
93
  status_log = f"{download_status}\n\n"
94
- yield status_log, None, None
 
95
 
96
  # Step 2: Save uploaded file
97
  progress(0.1, desc="Processing uploaded dataset...")
98
  dataset_path = dataset_file.name
99
  status_log += f"πŸ“‚ Dataset uploaded: {os.path.basename(dataset_path)}\n\n"
100
- yield status_log, None, None
101
 
102
  # Step 3: Import and run training
103
  progress(0.15, desc="Initializing training...")
104
  status_log += "πŸš€ Starting training...\n"
105
  status_log += "πŸ“Š This may take a while. Training progress will appear in the terminal.\n\n"
106
- yield status_log, None, None
107
 
108
  # Import the training module
109
  try:
@@ -121,16 +579,16 @@ def train_model(dataset_file, progress=gr.Progress()):
121
  print("=" * 80)
122
 
123
  status_log += "βœ… Training completed successfully!\n\n"
124
- yield status_log, None, None
125
 
126
  except ImportError as ie:
127
  error_msg = f"❌ Failed to import training module: {str(ie)}\n"
128
  error_msg += "Make sure train_model.py is in the same directory as app.py"
129
- yield status_log + error_msg, None, None
130
  return
131
  except Exception as train_error:
132
  error_msg = f"❌ Training failed with error:\n{str(train_error)}\n"
133
- yield status_log + error_msg, None, None
134
  return
135
 
136
  # Step 4: Verify files exist
@@ -147,16 +605,16 @@ def train_model(dataset_file, progress=gr.Progress()):
147
 
148
  if not files_exist:
149
  error_msg = "❌ Error: Model files were not created. Check training logs."
150
- yield status_log + error_msg, None, None
151
  return
152
 
153
  status_log += f"βœ… Found trained files: {', '.join(files_exist)}\n\n"
154
- yield status_log, None, None
155
 
156
  # Step 5: Upload to Hub
157
  progress(0.9, desc="Uploading models to Hugging Face Hub...")
158
  status_log += "☁️ Uploading to Hugging Face Hub...\n"
159
- yield status_log, None, None
160
 
161
  upload_status = []
162
 
@@ -208,7 +666,7 @@ def train_model(dataset_file, progress=gr.Progress()):
208
  else:
209
  status_log += "⚠️ Warning: No files were uploaded to Hub\n\n"
210
 
211
- yield status_log, None, None
212
 
213
  # Step 6: Copy to temp directory for download
214
  progress(0.95, desc="Preparing download files...")
@@ -236,13 +694,15 @@ def train_model(dataset_file, progress=gr.Progress()):
236
  status_log += "TRAINING COMPLETE - You can now download the model files\n"
237
  status_log += "=" * 50
238
 
239
- yield status_log, model_download, vocab_download
 
240
 
241
  except Exception as e:
242
  error_msg = f"❌ Unexpected error: {str(e)}\n"
243
  import traceback
244
  error_msg += f"\nTraceback:\n{traceback.format_exc()}"
245
- yield error_msg, None, None
 
246
 
247
 
248
  def download_models_from_hub():
@@ -251,7 +711,7 @@ def download_models_from_hub():
251
  os.makedirs(OUTPUT_DIR, exist_ok=True)
252
 
253
  api = HfApi()
254
- #files = api.list_repo_files(REPO_ID, token=HF_TOKEN)
255
  files = api.list_repo_files(REPO_ID)
256
 
257
  downloaded_files = []
@@ -268,7 +728,7 @@ def download_models_from_hub():
268
  )
269
  downloaded_files.append(MODEL_FILE)
270
  else:
271
- return f"❌ {MODEL_FILE} not found in repository", None, None
272
 
273
  # Download vocab
274
  if VOCAB_FILE in files:
@@ -282,7 +742,7 @@ def download_models_from_hub():
282
  )
283
  downloaded_files.append(VOCAB_FILE)
284
  else:
285
- return f"❌ {VOCAB_FILE} not found in repository", None, None
286
 
287
  # Copy to temp for download
288
  temp_dir = tempfile.mkdtemp()
@@ -297,7 +757,8 @@ def download_models_from_hub():
297
  success_msg += f" β€’ {VOCAB_FILE}\n\n"
298
  success_msg += "πŸ“¦ Files are ready to download below!"
299
 
300
- return success_msg, temp_model, temp_vocab
 
301
 
302
  except Exception as e:
303
  error_msg = f"❌ Error downloading models: {str(e)}\n\n"
@@ -305,7 +766,89 @@ def download_models_from_hub():
305
  error_msg += f"1. REPO_ID is set correctly: {REPO_ID}\n"
306
  error_msg += f"2. HF_TOKEN is set in Space secrets\n"
307
  error_msg += f"3. Model files exist in the repository"
308
- return error_msg, None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
 
310
 
311
  # Create Gradio interface
@@ -328,6 +871,13 @@ with gr.Blocks(title="MCQ Structure Extraction - Model Training", theme=gr.theme
328
  """
329
  )
330
 
 
 
 
 
 
 
 
331
  with gr.Tab("πŸš€ Train New Model"):
332
  gr.Markdown(
333
  """
@@ -351,37 +901,58 @@ with gr.Blocks(title="MCQ Structure Extraction - Model Training", theme=gr.theme
351
  )
352
  train_button = gr.Button("πŸš€ Start Training", variant="primary", size="lg")
353
 
 
 
 
 
 
 
354
  with gr.Column():
355
  status_output = gr.Textbox(
356
- label="πŸ“Š Training Status",
357
  lines=12,
358
  interactive=False,
359
  show_copy_button=True
360
  )
361
 
362
- gr.Markdown("### πŸ“¦ Download Trained Models")
363
  with gr.Row():
364
- model_output = gr.File(label="πŸ’Ύ Model File (.pt)")
365
- vocab_output = gr.File(label="πŸ“š Vocabulary File (.pkl)")
 
366
 
 
 
367
  train_button.click(
368
  fn=train_model,
369
  inputs=[dataset_input],
370
- outputs=[status_output, model_output, vocab_output]
 
 
 
 
 
 
 
 
 
 
371
  )
372
 
 
 
 
 
 
 
 
 
373
  with gr.Tab("☁️ Download from Hub"):
374
  gr.Markdown(
375
  """
376
  ### Download Pre-trained Models
377
 
378
  Download the latest trained models directly from your Hugging Face repository.
379
- This is useful if:
380
- - You want to use pre-trained models without training
381
- - You need to download models trained in a previous session
382
- - You want to get the latest version from the Hub
383
-
384
- The downloaded files can be used for inference with your MCQ extraction pipeline.
385
  """
386
  )
387
 
@@ -396,58 +967,22 @@ with gr.Blocks(title="MCQ Structure Extraction - Model Training", theme=gr.theme
396
 
397
  gr.Markdown("### πŸ“¦ Downloaded Files")
398
  with gr.Row():
399
- hub_model_output = gr.File(label="πŸ’Ύ Model File (.pt)")
400
- hub_vocab_output = gr.File(label="πŸ“š Vocabulary File (.pkl)")
 
401
 
 
 
402
  download_button.click(
403
  fn=download_models_from_hub,
404
- outputs=[download_status, hub_model_output, hub_vocab_output]
405
  )
406
 
407
  gr.Markdown(
408
  """
409
  ---
410
  ### βš™οΈ Model Configuration:
411
-
412
- **Architecture:**
413
- - BiLSTM-CRF with spatial attention mechanism
414
- - Word embeddings + Character-level CNN
415
- - Bounding box encoding with MLP
416
- - Spatial & context feature extraction
417
- - Learnable positional embeddings
418
-
419
- **Features Used:**
420
- - Token text (word-level and character-level)
421
- - Bounding box coordinates (normalized)
422
- - Spatial features: vertical spacing, alignment, dimensions (11 features)
423
- - Context features: surrounding question/option markers (8 features)
424
-
425
- **Output Labels (13 total):**
426
- - Questions, Options, Answers, Images, Section Headings, Passages (BIO tagging)
427
-
428
- **Training Parameters:**
429
- - Batch Size: 8
430
- - Epochs: 10 (with early stopping after 10 epochs without improvement)
431
- - Learning Rate: 5e-4 (AdamW optimizer with OneCycleLR scheduler)
432
- - Hidden Size: 768
433
- - Total Parameters: ~15.6M
434
-
435
- **Hardware Requirements:**
436
- - GPU recommended for reasonable training speed
437
- - CPU training supported but significantly slower
438
-
439
- ---
440
-
441
-
442
-
443
- **Environment Variables Required:**
444
- - `SPACE_ID`: Your Hugging Face Space/Repo ID (auto-set in Spaces)
445
- - `HF_TOKEN`: Your Hugging Face write token (set as a secret)
446
-
447
- **Model Persistence:**
448
- - Models are automatically saved to `output_data/` directory
449
- - Best model is uploaded to Hugging Face Hub after each improvement
450
- - Training can be resumed from checkpoints
451
  """
452
  )
453
 
 
1
+ # import os
2
+ # import shutil
3
+ # import tempfile
4
+ # import gradio as gr
5
+ # from huggingface_hub import hf_hub_download, upload_file, HfApi
6
+ # import sys
7
+ #
8
+ # # Add current directory to path to import train_model
9
+ # sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
10
+ #
11
+ # # Configuration
12
+ # OUTPUT_DIR = "output_data"
13
+ # MODEL_FILE = "model_enhanced.pt"
14
+ # VOCAB_FILE = "vocabs_enhanced.pkl"
15
+ # CHECKPOINT_FILE = "checkpoint_enhanced.pt"
16
+ #
17
+ # # IMPORTANT: Update this with your actual Hugging Face repository ID
18
+ # REPO_ID = "heerjtdev/LSTM_CRF" # Replace with your repo ID
19
+ # # HF_TOKEN = os.environ.get("HF_TOKEN") # Set this as a secret in your Space settings
20
+ #
21
+ #
22
+ # def download_existing_models():
23
+ # """Download existing model files from the Hugging Face Hub if available."""
24
+ # try:
25
+ # api = HfApi()
26
+ # #files = api.list_repo_files(REPO_ID, token=HF_TOKEN)
27
+ # files = api.list_repo_files(REPO_ID)
28
+ #
29
+ # os.makedirs(OUTPUT_DIR, exist_ok=True)
30
+ #
31
+ # downloaded_files = []
32
+ #
33
+ # # Download model file
34
+ # if MODEL_FILE in files:
35
+ # print(f"πŸ“₯ Downloading {MODEL_FILE} from Hub...")
36
+ # model_path = hf_hub_download(
37
+ # repo_id=REPO_ID,
38
+ # filename=MODEL_FILE,
39
+ # # token=HF_TOKEN,
40
+ # local_dir=OUTPUT_DIR,
41
+ # force_download=True # Always get latest version
42
+ # )
43
+ # downloaded_files.append(MODEL_FILE)
44
+ # print(f"βœ… Downloaded {MODEL_FILE}")
45
+ #
46
+ # # Download vocab file
47
+ # if VOCAB_FILE in files:
48
+ # print(f"πŸ“₯ Downloading {VOCAB_FILE} from Hub...")
49
+ # vocab_path = hf_hub_download(
50
+ # repo_id=REPO_ID,
51
+ # filename=VOCAB_FILE,
52
+ # # token=HF_TOKEN,
53
+ # local_dir=OUTPUT_DIR,
54
+ # force_download=True # Always get latest version
55
+ # )
56
+ # downloaded_files.append(VOCAB_FILE)
57
+ # print(f"βœ… Downloaded {VOCAB_FILE}")
58
+ #
59
+ # # Download checkpoint file (optional, for resuming training)
60
+ # if CHECKPOINT_FILE in files:
61
+ # print(f"πŸ“₯ Downloading {CHECKPOINT_FILE} from Hub...")
62
+ # checkpoint_path = hf_hub_download(
63
+ # repo_id=REPO_ID,
64
+ # filename=CHECKPOINT_FILE,
65
+ # # token=HF_TOKEN,
66
+ # local_dir=OUTPUT_DIR,
67
+ # force_download=True
68
+ # )
69
+ # downloaded_files.append(CHECKPOINT_FILE)
70
+ # print(f"βœ… Downloaded {CHECKPOINT_FILE}")
71
+ #
72
+ # if downloaded_files:
73
+ # return f"βœ… Downloaded from Hub: {', '.join(downloaded_files)}"
74
+ # else:
75
+ # return "ℹ️ No existing model files found in repository. Starting fresh."
76
+ # except Exception as e:
77
+ # error_msg = f"⚠️ Could not download existing models: {str(e)}"
78
+ # print(error_msg)
79
+ # return error_msg
80
+ #
81
+ #
82
+ # def train_model(dataset_file, progress=gr.Progress()):
83
+ # """Train the model with the uploaded dataset."""
84
+ # if dataset_file is None:
85
+ # return "❌ Please upload a dataset file!", None, None
86
+ #
87
+ # try:
88
+ # # Step 1: Download existing models from Hub (if any) BEFORE training starts
89
+ # progress(0.05, desc="Checking Hugging Face Hub for existing models...")
90
+ # download_status = download_existing_models()
91
+ # status_log = f"{download_status}\n\n"
92
+ # yield status_log, None, None
93
+ #
94
+ # # Step 2: Save uploaded file
95
+ # progress(0.1, desc="Processing uploaded dataset...")
96
+ # dataset_path = dataset_file.name
97
+ # status_log += f"πŸ“‚ Dataset uploaded: {os.path.basename(dataset_path)}\n\n"
98
+ # yield status_log, None, None
99
+ #
100
+ # # Step 3: Import and run training
101
+ # progress(0.15, desc="Initializing training...")
102
+ # status_log += "πŸš€ Starting training...\n"
103
+ # status_log += "πŸ“Š This may take a while. Training progress will appear in the terminal.\n\n"
104
+ # yield status_log, None, None
105
+ #
106
+ # # Import the training module
107
+ # try:
108
+ # import train_model as tm
109
+ # print("=" * 80)
110
+ # print("TRAINING STARTED")
111
+ # print("=" * 80)
112
+ #
113
+ # # Run training - this will handle model loading internally
114
+ # progress(0.2, desc="Training in progress... (check terminal for details)")
115
+ # tm.train_from_json(dataset_path)
116
+ #
117
+ # print("=" * 80)
118
+ # print("TRAINING COMPLETED")
119
+ # print("=" * 80)
120
+ #
121
+ # status_log += "βœ… Training completed successfully!\n\n"
122
+ # yield status_log, None, None
123
+ #
124
+ # except ImportError as ie:
125
+ # error_msg = f"❌ Failed to import training module: {str(ie)}\n"
126
+ # error_msg += "Make sure train_model.py is in the same directory as app.py"
127
+ # yield status_log + error_msg, None, None
128
+ # return
129
+ # except Exception as train_error:
130
+ # error_msg = f"❌ Training failed with error:\n{str(train_error)}\n"
131
+ # yield status_log + error_msg, None, None
132
+ # return
133
+ #
134
+ # # Step 4: Verify files exist
135
+ # progress(0.85, desc="Verifying trained model files...")
136
+ # model_path = os.path.join(OUTPUT_DIR, MODEL_FILE)
137
+ # vocab_path = os.path.join(OUTPUT_DIR, VOCAB_FILE)
138
+ # checkpoint_path = os.path.join(OUTPUT_DIR, CHECKPOINT_FILE)
139
+ #
140
+ # files_exist = []
141
+ # if os.path.exists(model_path):
142
+ # files_exist.append(MODEL_FILE)
143
+ # if os.path.exists(vocab_path):
144
+ # files_exist.append(VOCAB_FILE)
145
+ #
146
+ # if not files_exist:
147
+ # error_msg = "❌ Error: Model files were not created. Check training logs."
148
+ # yield status_log + error_msg, None, None
149
+ # return
150
+ #
151
+ # status_log += f"βœ… Found trained files: {', '.join(files_exist)}\n\n"
152
+ # yield status_log, None, None
153
+ #
154
+ # # Step 5: Upload to Hub
155
+ # progress(0.9, desc="Uploading models to Hugging Face Hub...")
156
+ # status_log += "☁️ Uploading to Hugging Face Hub...\n"
157
+ # yield status_log, None, None
158
+ #
159
+ # upload_status = []
160
+ #
161
+ # if os.path.exists(model_path):
162
+ # try:
163
+ # upload_file(
164
+ # path_or_fileobj=model_path,
165
+ # path_in_repo=MODEL_FILE,
166
+ # repo_id=REPO_ID,
167
+ # # token=HF_TOKEN,
168
+ # commit_message="Update trained model"
169
+ # )
170
+ # upload_status.append(MODEL_FILE)
171
+ # print(f"βœ… Uploaded {MODEL_FILE} to Hub")
172
+ # except Exception as e:
173
+ # print(f"⚠️ Failed to upload {MODEL_FILE}: {e}")
174
+ #
175
+ # if os.path.exists(vocab_path):
176
+ # try:
177
+ # upload_file(
178
+ # path_or_fileobj=vocab_path,
179
+ # path_in_repo=VOCAB_FILE,
180
+ # repo_id=REPO_ID,
181
+ # # token=HF_TOKEN,
182
+ # commit_message="Update vocabulary"
183
+ # )
184
+ # upload_status.append(VOCAB_FILE)
185
+ # print(f"βœ… Uploaded {VOCAB_FILE} to Hub")
186
+ # except Exception as e:
187
+ # print(f"⚠️ Failed to upload {VOCAB_FILE}: {e}")
188
+ #
189
+ # # Also upload checkpoint for future resume capability
190
+ # if os.path.exists(checkpoint_path):
191
+ # try:
192
+ # upload_file(
193
+ # path_or_fileobj=checkpoint_path,
194
+ # path_in_repo=CHECKPOINT_FILE,
195
+ # repo_id=REPO_ID,
196
+ # # token=HF_TOKEN,
197
+ # commit_message="Update checkpoint"
198
+ # )
199
+ # upload_status.append(CHECKPOINT_FILE)
200
+ # print(f"βœ… Uploaded {CHECKPOINT_FILE} to Hub")
201
+ # except Exception as e:
202
+ # print(f"⚠️ Failed to upload {CHECKPOINT_FILE}: {e}")
203
+ #
204
+ # if upload_status:
205
+ # status_log += f"βœ… Uploaded to Hub: {', '.join(upload_status)}\n\n"
206
+ # else:
207
+ # status_log += "⚠️ Warning: No files were uploaded to Hub\n\n"
208
+ #
209
+ # yield status_log, None, None
210
+ #
211
+ # # Step 6: Copy to temp directory for download
212
+ # progress(0.95, desc="Preparing download files...")
213
+ # temp_dir = tempfile.mkdtemp()
214
+ #
215
+ # model_download = None
216
+ # vocab_download = None
217
+ #
218
+ # if os.path.exists(model_path):
219
+ # temp_model = os.path.join(temp_dir, MODEL_FILE)
220
+ # shutil.copy2(model_path, temp_model)
221
+ # model_download = temp_model
222
+ # print(f"πŸ“¦ Prepared {MODEL_FILE} for download")
223
+ #
224
+ # if os.path.exists(vocab_path):
225
+ # temp_vocab = os.path.join(temp_dir, VOCAB_FILE)
226
+ # shutil.copy2(vocab_path, temp_vocab)
227
+ # vocab_download = temp_vocab
228
+ # print(f"πŸ“¦ Prepared {VOCAB_FILE} for download")
229
+ #
230
+ # progress(1.0, desc="Complete!")
231
+ #
232
+ # status_log += "πŸ“¦ Files ready for download below!\n"
233
+ # status_log += "\n" + "=" * 50 + "\n"
234
+ # status_log += "TRAINING COMPLETE - You can now download the model files\n"
235
+ # status_log += "=" * 50
236
+ #
237
+ # yield status_log, model_download, vocab_download
238
+ #
239
+ # except Exception as e:
240
+ # error_msg = f"❌ Unexpected error: {str(e)}\n"
241
+ # import traceback
242
+ # error_msg += f"\nTraceback:\n{traceback.format_exc()}"
243
+ # yield error_msg, None, None
244
+ #
245
+ #
246
+ # def download_models_from_hub():
247
+ # """Download the latest models from the Hugging Face Hub."""
248
+ # try:
249
+ # os.makedirs(OUTPUT_DIR, exist_ok=True)
250
+ #
251
+ # api = HfApi()
252
+ # #files = api.list_repo_files(REPO_ID, token=HF_TOKEN)
253
+ # files = api.list_repo_files(REPO_ID)
254
+ #
255
+ # downloaded_files = []
256
+ #
257
+ # # Download model
258
+ # if MODEL_FILE in files:
259
+ # print(f"πŸ“₯ Downloading {MODEL_FILE} from Hub...")
260
+ # model_path = hf_hub_download(
261
+ # repo_id=REPO_ID,
262
+ # filename=MODEL_FILE,
263
+ # # token=HF_TOKEN,
264
+ # local_dir=OUTPUT_DIR,
265
+ # force_download=True
266
+ # )
267
+ # downloaded_files.append(MODEL_FILE)
268
+ # else:
269
+ # return f"❌ {MODEL_FILE} not found in repository", None, None
270
+ #
271
+ # # Download vocab
272
+ # if VOCAB_FILE in files:
273
+ # print(f"πŸ“₯ Downloading {VOCAB_FILE} from Hub...")
274
+ # vocab_path = hf_hub_download(
275
+ # repo_id=REPO_ID,
276
+ # filename=VOCAB_FILE,
277
+ # # token=HF_TOKEN,
278
+ # local_dir=OUTPUT_DIR,
279
+ # force_download=True
280
+ # )
281
+ # downloaded_files.append(VOCAB_FILE)
282
+ # else:
283
+ # return f"❌ {VOCAB_FILE} not found in repository", None, None
284
+ #
285
+ # # Copy to temp for download
286
+ # temp_dir = tempfile.mkdtemp()
287
+ # temp_model = os.path.join(temp_dir, MODEL_FILE)
288
+ # temp_vocab = os.path.join(temp_dir, VOCAB_FILE)
289
+ #
290
+ # shutil.copy2(os.path.join(OUTPUT_DIR, MODEL_FILE), temp_model)
291
+ # shutil.copy2(os.path.join(OUTPUT_DIR, VOCAB_FILE), temp_vocab)
292
+ #
293
+ # success_msg = f"βœ… Successfully downloaded from Hub:\n"
294
+ # success_msg += f" β€’ {MODEL_FILE}\n"
295
+ # success_msg += f" β€’ {VOCAB_FILE}\n\n"
296
+ # success_msg += "πŸ“¦ Files are ready to download below!"
297
+ #
298
+ # return success_msg, temp_model, temp_vocab
299
+ #
300
+ # except Exception as e:
301
+ # error_msg = f"❌ Error downloading models: {str(e)}\n\n"
302
+ # error_msg += f"Make sure:\n"
303
+ # error_msg += f"1. REPO_ID is set correctly: {REPO_ID}\n"
304
+ # error_msg += f"2. HF_TOKEN is set in Space secrets\n"
305
+ # error_msg += f"3. Model files exist in the repository"
306
+ # return error_msg, None, None
307
+ #
308
+ #
309
+ # # Create Gradio interface
310
+ # with gr.Blocks(title="MCQ Structure Extraction - Model Training", theme=gr.themes.Soft()) as demo:
311
+ # gr.Markdown(
312
+ # """
313
+ # # πŸŽ“ MCQ Structure Extraction - Model Training
314
+ #
315
+ # Train a BiLSTM-CRF model with deep layout understanding for extracting structured information from MCQ documents.
316
+ #
317
+ # ## πŸ“‹ Instructions:
318
+ # 1. **Upload Dataset**: Provide your unified JSON file containing tokens, bounding boxes, and labels
319
+ # 2. **Train Model**: Click "Start Training" and wait for completion (this may take a while)
320
+ # 3. **Download Models**: Once training is complete, download the trained model and vocabulary files
321
+ #
322
+ # ## πŸ“₯ Or Download Existing Models:
323
+ # If you just want to download the latest trained models from the repository, use the "Download from Hub" tab.
324
+ #
325
+ # ---
326
+ # """
327
+ # )
328
+ #
329
+ # with gr.Tab("πŸš€ Train New Model"):
330
+ # gr.Markdown(
331
+ # """
332
+ # ### Training Process:
333
+ # The app will automatically:
334
+ # 1. βœ… Download any existing models from Hugging Face Hub (for resuming training)
335
+ # 2. 🎯 Train the model on your uploaded dataset
336
+ # 3. ☁️ Upload the trained models back to the Hub
337
+ # 4. πŸ“₯ Provide download links for the trained files
338
+ #
339
+ # **Note**: Training progress details appear in the terminal/logs. The status box shows major milestones.
340
+ # """
341
+ # )
342
+ #
343
+ # with gr.Row():
344
+ # with gr.Column():
345
+ # dataset_input = gr.File(
346
+ # label="πŸ“‚ Upload Training Dataset (JSON)",
347
+ # file_types=[".json"],
348
+ # type="filepath"
349
+ # )
350
+ # train_button = gr.Button("πŸš€ Start Training", variant="primary", size="lg")
351
+ #
352
+ # with gr.Column():
353
+ # status_output = gr.Textbox(
354
+ # label="πŸ“Š Training Status",
355
+ # lines=12,
356
+ # interactive=False,
357
+ # show_copy_button=True
358
+ # )
359
+ #
360
+ # gr.Markdown("### πŸ“¦ Download Trained Models")
361
+ # with gr.Row():
362
+ # model_output = gr.File(label="πŸ’Ύ Model File (.pt)")
363
+ # vocab_output = gr.File(label="πŸ“š Vocabulary File (.pkl)")
364
+ #
365
+ # train_button.click(
366
+ # fn=train_model,
367
+ # inputs=[dataset_input],
368
+ # outputs=[status_output, model_output, vocab_output]
369
+ # )
370
+ #
371
+ # with gr.Tab("☁️ Download from Hub"):
372
+ # gr.Markdown(
373
+ # """
374
+ # ### Download Pre-trained Models
375
+ #
376
+ # Download the latest trained models directly from your Hugging Face repository.
377
+ # This is useful if:
378
+ # - You want to use pre-trained models without training
379
+ # - You need to download models trained in a previous session
380
+ # - You want to get the latest version from the Hub
381
+ #
382
+ # The downloaded files can be used for inference with your MCQ extraction pipeline.
383
+ # """
384
+ # )
385
+ #
386
+ # download_button = gr.Button("☁️ Download Latest Models from Hub", variant="primary", size="lg")
387
+ #
388
+ # download_status = gr.Textbox(
389
+ # label="Download Status",
390
+ # lines=6,
391
+ # interactive=False,
392
+ # show_copy_button=True
393
+ # )
394
+ #
395
+ # gr.Markdown("### πŸ“¦ Downloaded Files")
396
+ # with gr.Row():
397
+ # hub_model_output = gr.File(label="πŸ’Ύ Model File (.pt)")
398
+ # hub_vocab_output = gr.File(label="πŸ“š Vocabulary File (.pkl)")
399
+ #
400
+ # download_button.click(
401
+ # fn=download_models_from_hub,
402
+ # outputs=[download_status, hub_model_output, hub_vocab_output]
403
+ # )
404
+ #
405
+ # gr.Markdown(
406
+ # """
407
+ # ---
408
+ # ### βš™οΈ Model Configuration:
409
+ #
410
+ # **Architecture:**
411
+ # - BiLSTM-CRF with spatial attention mechanism
412
+ # - Word embeddings + Character-level CNN
413
+ # - Bounding box encoding with MLP
414
+ # - Spatial & context feature extraction
415
+ # - Learnable positional embeddings
416
+ #
417
+ # **Features Used:**
418
+ # - Token text (word-level and character-level)
419
+ # - Bounding box coordinates (normalized)
420
+ # - Spatial features: vertical spacing, alignment, dimensions (11 features)
421
+ # - Context features: surrounding question/option markers (8 features)
422
+ #
423
+ # **Output Labels (13 total):**
424
+ # - Questions, Options, Answers, Images, Section Headings, Passages (BIO tagging)
425
+ #
426
+ # **Training Parameters:**
427
+ # - Batch Size: 8
428
+ # - Epochs: 10 (with early stopping after 10 epochs without improvement)
429
+ # - Learning Rate: 5e-4 (AdamW optimizer with OneCycleLR scheduler)
430
+ # - Hidden Size: 768
431
+ # - Total Parameters: ~15.6M
432
+ #
433
+ # **Hardware Requirements:**
434
+ # - GPU recommended for reasonable training speed
435
+ # - CPU training supported but significantly slower
436
+ #
437
+ # ---
438
+ #
439
+ #
440
+ #
441
+ # **Environment Variables Required:**
442
+ # - `SPACE_ID`: Your Hugging Face Space/Repo ID (auto-set in Spaces)
443
+ # - `HF_TOKEN`: Your Hugging Face write token (set as a secret)
444
+ #
445
+ # **Model Persistence:**
446
+ # - Models are automatically saved to `output_data/` directory
447
+ # - Best model is uploaded to Hugging Face Hub after each improvement
448
+ # - Training can be resumed from checkpoints
449
+ # """
450
+ # )
451
+ #
452
+ # # Launch the app
453
+ # if __name__ == "__main__":
454
+ # demo.launch()
455
 
456
 
457
  import os
 
460
  import gradio as gr
461
  from huggingface_hub import hf_hub_download, upload_file, HfApi
462
  import sys
463
+ import glob
464
 
465
  # Add current directory to path to import train_model
466
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 
473
 
474
  # IMPORTANT: Update this with your actual Hugging Face repository ID
475
  REPO_ID = "heerjtdev/LSTM_CRF" # Replace with your repo ID
476
+
477
+
478
  # HF_TOKEN = os.environ.get("HF_TOKEN") # Set this as a secret in your Space settings
479
 
480
 
 
482
  """Download existing model files from the Hugging Face Hub if available."""
483
  try:
484
  api = HfApi()
485
+ # files = api.list_repo_files(REPO_ID, token=HF_TOKEN)
486
  files = api.list_repo_files(REPO_ID)
487
 
488
  os.makedirs(OUTPUT_DIR, exist_ok=True)
 
548
  progress(0.05, desc="Checking Hugging Face Hub for existing models...")
549
  download_status = download_existing_models()
550
  status_log = f"{download_status}\n\n"
551
+ # Reset download outputs before training starts
552
+ yield status_log, None, None, None, None
553
 
554
  # Step 2: Save uploaded file
555
  progress(0.1, desc="Processing uploaded dataset...")
556
  dataset_path = dataset_file.name
557
  status_log += f"πŸ“‚ Dataset uploaded: {os.path.basename(dataset_path)}\n\n"
558
+ yield status_log, None, None, None, None
559
 
560
  # Step 3: Import and run training
561
  progress(0.15, desc="Initializing training...")
562
  status_log += "πŸš€ Starting training...\n"
563
  status_log += "πŸ“Š This may take a while. Training progress will appear in the terminal.\n\n"
564
+ yield status_log, None, None, None, None
565
 
566
  # Import the training module
567
  try:
 
579
  print("=" * 80)
580
 
581
  status_log += "βœ… Training completed successfully!\n\n"
582
+ yield status_log, None, None, None, None
583
 
584
  except ImportError as ie:
585
  error_msg = f"❌ Failed to import training module: {str(ie)}\n"
586
  error_msg += "Make sure train_model.py is in the same directory as app.py"
587
+ yield status_log + error_msg, None, None, None, None
588
  return
589
  except Exception as train_error:
590
  error_msg = f"❌ Training failed with error:\n{str(train_error)}\n"
591
+ yield status_log + error_msg, None, None, None, None
592
  return
593
 
594
  # Step 4: Verify files exist
 
605
 
606
  if not files_exist:
607
  error_msg = "❌ Error: Model files were not created. Check training logs."
608
+ yield status_log + error_msg, None, None, None, None
609
  return
610
 
611
  status_log += f"βœ… Found trained files: {', '.join(files_exist)}\n\n"
612
+ yield status_log, None, None, None, None
613
 
614
  # Step 5: Upload to Hub
615
  progress(0.9, desc="Uploading models to Hugging Face Hub...")
616
  status_log += "☁️ Uploading to Hugging Face Hub...\n"
617
+ yield status_log, None, None, None, None
618
 
619
  upload_status = []
620
 
 
666
  else:
667
  status_log += "⚠️ Warning: No files were uploaded to Hub\n\n"
668
 
669
+ yield status_log, None, None, None, None
670
 
671
  # Step 6: Copy to temp directory for download
672
  progress(0.95, desc="Preparing download files...")
 
694
  status_log += "TRAINING COMPLETE - You can now download the model files\n"
695
  status_log += "=" * 50
696
 
697
+ # Note: We return the model_download and vocab_download twice for both sets of File outputs
698
+ yield status_log, model_download, vocab_download, model_download, vocab_download
699
 
700
  except Exception as e:
701
  error_msg = f"❌ Unexpected error: {str(e)}\n"
702
  import traceback
703
  error_msg += f"\nTraceback:\n{traceback.format_exc()}"
704
+ # Return Nones for all file outputs
705
+ yield error_msg, None, None, None, None
706
 
707
 
708
  def download_models_from_hub():
 
711
  os.makedirs(OUTPUT_DIR, exist_ok=True)
712
 
713
  api = HfApi()
714
+ # files = api.list_repo_files(REPO_ID, token=HF_TOKEN)
715
  files = api.list_repo_files(REPO_ID)
716
 
717
  downloaded_files = []
 
728
  )
729
  downloaded_files.append(MODEL_FILE)
730
  else:
731
+ return f"❌ {MODEL_FILE} not found in repository", None, None, None, None
732
 
733
  # Download vocab
734
  if VOCAB_FILE in files:
 
742
  )
743
  downloaded_files.append(VOCAB_FILE)
744
  else:
745
+ return f"❌ {VOCAB_FILE} not found in repository", None, None, None, None
746
 
747
  # Copy to temp for download
748
  temp_dir = tempfile.mkdtemp()
 
757
  success_msg += f" β€’ {VOCAB_FILE}\n\n"
758
  success_msg += "πŸ“¦ Files are ready to download below!"
759
 
760
+ # Return the downloaded files for both sets of file outputs
761
+ return success_msg, temp_model, temp_vocab, temp_model, temp_vocab
762
 
763
  except Exception as e:
764
  error_msg = f"❌ Error downloading models: {str(e)}\n\n"
 
766
  error_msg += f"1. REPO_ID is set correctly: {REPO_ID}\n"
767
  error_msg += f"2. HF_TOKEN is set in Space secrets\n"
768
  error_msg += f"3. Model files exist in the repository"
769
+ return error_msg, None, None, None, None
770
+
771
+
772
+ # --- UPDATED check_local_files FUNCTION ---
773
+
774
def check_local_files():
    """
    Check and report the files present in the local output directory.

    If the core model/vocab files exist, their paths are also returned so the
    Gradio UI can populate the two download File components.

    Returns:
        tuple: (status_message: str,
                model_file_path: str | None,
                vocab_file_path: str | None)
    """
    if not os.path.exists(OUTPUT_DIR):
        return f"ℹ️ Directory **'{OUTPUT_DIR}'** does not exist.", None, None

    all_files = os.listdir(OUTPUT_DIR)

    model_path = os.path.join(OUTPUT_DIR, MODEL_FILE)
    vocab_path = os.path.join(OUTPUT_DIR, VOCAB_FILE)

    # 1. Prepare download paths if the core files exist.
    model_download = model_path if os.path.exists(model_path) else None
    vocab_download = vocab_path if os.path.exists(vocab_path) else None

    # 2. Generate the status message.
    if not all_files:
        return f"ℹ️ Directory **'{OUTPUT_DIR}'** is empty.", None, None

    file_list = []
    total_size = 0

    # Sort so the core files (model, vocab, checkpoint) are listed first,
    # then everything else alphabetically.
    sorted_files = sorted(
        all_files,
        key=lambda x: (x != MODEL_FILE, x != VOCAB_FILE, x != CHECKPOINT_FILE, x),
    )

    for filename in sorted_files:
        filepath = os.path.join(OUTPUT_DIR, filename)
        if os.path.isfile(filepath):
            size_bytes = os.path.getsize(filepath)
            total_size += size_bytes

            # Human-readable per-file size (MB / KB / bytes).
            if size_bytes > 1024 * 1024:
                size_str = f"{size_bytes / (1024 * 1024):.2f} MB"
            elif size_bytes > 1024:
                size_str = f"{size_bytes / 1024:.2f} KB"
            else:
                size_str = f"{size_bytes} bytes"

            # BUG FIX: report the actual filename — the previous version
            # appended a hard-coded "(unknown)" placeholder instead.
            file_list.append(f"β€’ **{filename}** (Size: {size_str})")

    # Human-readable total size (GB / MB / KB).
    if total_size > 1024 * 1024 * 1024:
        total_size_str = f"{total_size / (1024 * 1024 * 1024):.2f} GB"
    elif total_size > 1024 * 1024:
        total_size_str = f"{total_size / (1024 * 1024):.2f} MB"
    else:
        total_size_str = f"{total_size / 1024:.2f} KB"

    header = f"βœ… Contents of **'{OUTPUT_DIR}'** ({len(file_list)} files, Total Size: {total_size_str}):\n"
    if model_download and vocab_download:
        header += "\n**πŸ“¦ Core model files found! Ready for download below.**"
    elif model_download or vocab_download:
        header += "\n**⚠️ Found some model files, but not both.**"

    return header + "\n" + "\n".join(file_list), model_download, vocab_download
837
+
838
+
839
def clear_local_memory():
    """
    Delete the local output directory (and everything inside it).

    Returns a 3-tuple (status_message, None, None); the two trailing Nones
    clear the download File components in the UI.
    """
    # Nothing to do if the directory was never created.
    if not os.path.exists(OUTPUT_DIR):
        return (
            f"ℹ️ Local directory **'{OUTPUT_DIR}'** does not exist. No memory to clear.",
            None,
            None,
        )

    try:
        shutil.rmtree(OUTPUT_DIR)
    except Exception as e:
        return (
            f"❌ Error clearing memory (deleting '{OUTPUT_DIR}'): {str(e)}",
            None,
            None,
        )

    return (
        f"πŸ—‘οΈ Successfully deleted local directory **'{OUTPUT_DIR}'** and all its contents. Memory cleared.",
        None,
        None,
    )
849
+
850
+
851
+ # --- END NEW FUNCTIONS ---
852
 
853
 
854
  # Create Gradio interface
 
871
  """
872
  )
873
 
874
+ # Define common File components for outputs
875
+ download_model_output = gr.File(label="πŸ’Ύ Model File (.pt)", interactive=False)
876
+ download_vocab_output = gr.File(label="πŸ“š Vocabulary File (.pkl)", interactive=False)
877
+
878
+ # We need a dummy set of outputs to clear the download boxes when starting training,
879
+ # and a permanent set for the utility functions. We'll use the permanent ones below.
880
+
881
  with gr.Tab("πŸš€ Train New Model"):
882
  gr.Markdown(
883
  """
 
901
  )
902
  train_button = gr.Button("πŸš€ Start Training", variant="primary", size="lg")
903
 
904
+ # --- NEW BUTTONS for utility ---
905
+ with gr.Row():
906
+ check_button = gr.Button("πŸ”Ž Check Local Models", variant="secondary")
907
+ clear_button = gr.Button("🧹 Clear Local Memory", variant="stop")
908
+ # ------------------------------
909
+
910
  with gr.Column():
911
  status_output = gr.Textbox(
912
+ label="πŸ“Š Training/Utility Status",
913
  lines=12,
914
  interactive=False,
915
  show_copy_button=True
916
  )
917
 
918
+ gr.Markdown("### πŸ“¦ Download Trained/Local Models")
919
  with gr.Row():
920
+ # Use the defined components for the training output
921
+ train_model_output = download_model_output
922
+ train_vocab_output = download_vocab_output
923
 
924
+ # Note: The train_model function now returns 5 values (status, model_file, vocab_file, model_file_again, vocab_file_again)
925
+ # We target the two download outputs directly for the final model and vocab files.
926
  train_button.click(
927
  fn=train_model,
928
  inputs=[dataset_input],
929
+ outputs=[status_output, train_model_output, train_vocab_output, download_model_output,
930
+ download_vocab_output]
931
+ )
932
+
933
+ # --- NEW BUTTON ACTIONS ---
934
+ # check_local_files now returns (status, model_download_path, vocab_download_path)
935
+ # We target the status output AND the two global download outputs
936
+ check_button.click(
937
+ fn=check_local_files,
938
+ inputs=[],
939
+ outputs=[status_output, download_model_output, download_vocab_output]
940
  )
941
 
942
+ # clear_local_memory now returns (status, None, None) to clear the download boxes
943
+ clear_button.click(
944
+ fn=clear_local_memory,
945
+ inputs=[],
946
+ outputs=[status_output, download_model_output, download_vocab_output]
947
+ )
948
+ # --------------------------
949
+
950
  with gr.Tab("☁️ Download from Hub"):
951
  gr.Markdown(
952
  """
953
  ### Download Pre-trained Models
954
 
955
  Download the latest trained models directly from your Hugging Face repository.
 
 
 
 
 
 
956
  """
957
  )
958
 
 
967
 
968
  gr.Markdown("### πŸ“¦ Downloaded Files")
969
  with gr.Row():
970
+ # Use the defined components for the Hub output
971
+ hub_model_output = download_model_output
972
+ hub_vocab_output = download_vocab_output
973
 
974
+ # Note: The download_models_from_hub function now returns 5 values (status, model_file, vocab_file, model_file_again, vocab_file_again)
975
+ # We target the two download outputs directly for the final model and vocab files.
976
  download_button.click(
977
  fn=download_models_from_hub,
978
+ outputs=[download_status, hub_model_output, hub_vocab_output, download_model_output, download_vocab_output]
979
  )
980
 
981
  gr.Markdown(
982
  """
983
  ---
984
  ### βš™οΈ Model Configuration:
985
+ ... (rest of the markdown)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
986
  """
987
  )
988