heerjtdev committed on
Commit
c061f7e
·
verified ·
1 Parent(s): 7169978

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -454
app.py CHANGED
@@ -1,457 +1,3 @@
1
- # import os
2
- # import shutil
3
- # import tempfile
4
- # import gradio as gr
5
- # from huggingface_hub import hf_hub_download, upload_file, HfApi
6
- # import sys
7
- #
8
- # # Add current directory to path to import train_model
9
- # sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
10
- #
11
- # # Configuration
12
- # OUTPUT_DIR = "output_data"
13
- # MODEL_FILE = "model_enhanced.pt"
14
- # VOCAB_FILE = "vocabs_enhanced.pkl"
15
- # CHECKPOINT_FILE = "checkpoint_enhanced.pt"
16
- #
17
- # # IMPORTANT: Update this with your actual Hugging Face repository ID
18
- # REPO_ID = "heerjtdev/LSTM_CRF" # Replace with your repo ID
19
- # # HF_TOKEN = os.environ.get("HF_TOKEN") # Set this as a secret in your Space settings
20
- #
21
- #
22
- # def download_existing_models():
23
- # """Download existing model files from the Hugging Face Hub if available."""
24
- # try:
25
- # api = HfApi()
26
- # #files = api.list_repo_files(REPO_ID, token=HF_TOKEN)
27
- # files = api.list_repo_files(REPO_ID)
28
- #
29
- # os.makedirs(OUTPUT_DIR, exist_ok=True)
30
- #
31
- # downloaded_files = []
32
- #
33
- # # Download model file
34
- # if MODEL_FILE in files:
35
- # print(f"📥 Downloading {MODEL_FILE} from Hub...")
36
- # model_path = hf_hub_download(
37
- # repo_id=REPO_ID,
38
- # filename=MODEL_FILE,
39
- # # token=HF_TOKEN,
40
- # local_dir=OUTPUT_DIR,
41
- # force_download=True # Always get latest version
42
- # )
43
- # downloaded_files.append(MODEL_FILE)
44
- # print(f"✅ Downloaded {MODEL_FILE}")
45
- #
46
- # # Download vocab file
47
- # if VOCAB_FILE in files:
48
- # print(f"📥 Downloading {VOCAB_FILE} from Hub...")
49
- # vocab_path = hf_hub_download(
50
- # repo_id=REPO_ID,
51
- # filename=VOCAB_FILE,
52
- # # token=HF_TOKEN,
53
- # local_dir=OUTPUT_DIR,
54
- # force_download=True # Always get latest version
55
- # )
56
- # downloaded_files.append(VOCAB_FILE)
57
- # print(f"✅ Downloaded {VOCAB_FILE}")
58
- #
59
- # # Download checkpoint file (optional, for resuming training)
60
- # if CHECKPOINT_FILE in files:
61
- # print(f"📥 Downloading {CHECKPOINT_FILE} from Hub...")
62
- # checkpoint_path = hf_hub_download(
63
- # repo_id=REPO_ID,
64
- # filename=CHECKPOINT_FILE,
65
- # # token=HF_TOKEN,
66
- # local_dir=OUTPUT_DIR,
67
- # force_download=True
68
- # )
69
- # downloaded_files.append(CHECKPOINT_FILE)
70
- # print(f"✅ Downloaded {CHECKPOINT_FILE}")
71
- #
72
- # if downloaded_files:
73
- # return f"✅ Downloaded from Hub: {', '.join(downloaded_files)}"
74
- # else:
75
- # return "ℹ️ No existing model files found in repository. Starting fresh."
76
- # except Exception as e:
77
- # error_msg = f"⚠️ Could not download existing models: {str(e)}"
78
- # print(error_msg)
79
- # return error_msg
80
- #
81
- #
82
- # def train_model(dataset_file, progress=gr.Progress()):
83
- # """Train the model with the uploaded dataset."""
84
- # if dataset_file is None:
85
- # return "❌ Please upload a dataset file!", None, None
86
- #
87
- # try:
88
- # # Step 1: Download existing models from Hub (if any) BEFORE training starts
89
- # progress(0.05, desc="Checking Hugging Face Hub for existing models...")
90
- # download_status = download_existing_models()
91
- # status_log = f"{download_status}\n\n"
92
- # yield status_log, None, None
93
- #
94
- # # Step 2: Save uploaded file
95
- # progress(0.1, desc="Processing uploaded dataset...")
96
- # dataset_path = dataset_file.name
97
- # status_log += f"📂 Dataset uploaded: {os.path.basename(dataset_path)}\n\n"
98
- # yield status_log, None, None
99
- #
100
- # # Step 3: Import and run training
101
- # progress(0.15, desc="Initializing training...")
102
- # status_log += "🚀 Starting training...\n"
103
- # status_log += "📊 This may take a while. Training progress will appear in the terminal.\n\n"
104
- # yield status_log, None, None
105
- #
106
- # # Import the training module
107
- # try:
108
- # import train_model as tm
109
- # print("=" * 80)
110
- # print("TRAINING STARTED")
111
- # print("=" * 80)
112
- #
113
- # # Run training - this will handle model loading internally
114
- # progress(0.2, desc="Training in progress... (check terminal for details)")
115
- # tm.train_from_json(dataset_path)
116
- #
117
- # print("=" * 80)
118
- # print("TRAINING COMPLETED")
119
- # print("=" * 80)
120
- #
121
- # status_log += "✅ Training completed successfully!\n\n"
122
- # yield status_log, None, None
123
- #
124
- # except ImportError as ie:
125
- # error_msg = f"❌ Failed to import training module: {str(ie)}\n"
126
- # error_msg += "Make sure train_model.py is in the same directory as app.py"
127
- # yield status_log + error_msg, None, None
128
- # return
129
- # except Exception as train_error:
130
- # error_msg = f"❌ Training failed with error:\n{str(train_error)}\n"
131
- # yield status_log + error_msg, None, None
132
- # return
133
- #
134
- # # Step 4: Verify files exist
135
- # progress(0.85, desc="Verifying trained model files...")
136
- # model_path = os.path.join(OUTPUT_DIR, MODEL_FILE)
137
- # vocab_path = os.path.join(OUTPUT_DIR, VOCAB_FILE)
138
- # checkpoint_path = os.path.join(OUTPUT_DIR, CHECKPOINT_FILE)
139
- #
140
- # files_exist = []
141
- # if os.path.exists(model_path):
142
- # files_exist.append(MODEL_FILE)
143
- # if os.path.exists(vocab_path):
144
- # files_exist.append(VOCAB_FILE)
145
- #
146
- # if not files_exist:
147
- # error_msg = "❌ Error: Model files were not created. Check training logs."
148
- # yield status_log + error_msg, None, None
149
- # return
150
- #
151
- # status_log += f"✅ Found trained files: {', '.join(files_exist)}\n\n"
152
- # yield status_log, None, None
153
- #
154
- # # Step 5: Upload to Hub
155
- # progress(0.9, desc="Uploading models to Hugging Face Hub...")
156
- # status_log += "☁️ Uploading to Hugging Face Hub...\n"
157
- # yield status_log, None, None
158
- #
159
- # upload_status = []
160
- #
161
- # if os.path.exists(model_path):
162
- # try:
163
- # upload_file(
164
- # path_or_fileobj=model_path,
165
- # path_in_repo=MODEL_FILE,
166
- # repo_id=REPO_ID,
167
- # # token=HF_TOKEN,
168
- # commit_message="Update trained model"
169
- # )
170
- # upload_status.append(MODEL_FILE)
171
- # print(f"✅ Uploaded {MODEL_FILE} to Hub")
172
- # except Exception as e:
173
- # print(f"⚠️ Failed to upload {MODEL_FILE}: {e}")
174
- #
175
- # if os.path.exists(vocab_path):
176
- # try:
177
- # upload_file(
178
- # path_or_fileobj=vocab_path,
179
- # path_in_repo=VOCAB_FILE,
180
- # repo_id=REPO_ID,
181
- # # token=HF_TOKEN,
182
- # commit_message="Update vocabulary"
183
- # )
184
- # upload_status.append(VOCAB_FILE)
185
- # print(f"✅ Uploaded {VOCAB_FILE} to Hub")
186
- # except Exception as e:
187
- # print(f"⚠️ Failed to upload {VOCAB_FILE}: {e}")
188
- #
189
- # # Also upload checkpoint for future resume capability
190
- # if os.path.exists(checkpoint_path):
191
- # try:
192
- # upload_file(
193
- # path_or_fileobj=checkpoint_path,
194
- # path_in_repo=CHECKPOINT_FILE,
195
- # repo_id=REPO_ID,
196
- # # token=HF_TOKEN,
197
- # commit_message="Update checkpoint"
198
- # )
199
- # upload_status.append(CHECKPOINT_FILE)
200
- # print(f"✅ Uploaded {CHECKPOINT_FILE} to Hub")
201
- # except Exception as e:
202
- # print(f"⚠️ Failed to upload {CHECKPOINT_FILE}: {e}")
203
- #
204
- # if upload_status:
205
- # status_log += f"✅ Uploaded to Hub: {', '.join(upload_status)}\n\n"
206
- # else:
207
- # status_log += "⚠️ Warning: No files were uploaded to Hub\n\n"
208
- #
209
- # yield status_log, None, None
210
- #
211
- # # Step 6: Copy to temp directory for download
212
- # progress(0.95, desc="Preparing download files...")
213
- # temp_dir = tempfile.mkdtemp()
214
- #
215
- # model_download = None
216
- # vocab_download = None
217
- #
218
- # if os.path.exists(model_path):
219
- # temp_model = os.path.join(temp_dir, MODEL_FILE)
220
- # shutil.copy2(model_path, temp_model)
221
- # model_download = temp_model
222
- # print(f"📦 Prepared {MODEL_FILE} for download")
223
- #
224
- # if os.path.exists(vocab_path):
225
- # temp_vocab = os.path.join(temp_dir, VOCAB_FILE)
226
- # shutil.copy2(vocab_path, temp_vocab)
227
- # vocab_download = temp_vocab
228
- # print(f"📦 Prepared {VOCAB_FILE} for download")
229
- #
230
- # progress(1.0, desc="Complete!")
231
- #
232
- # status_log += "📦 Files ready for download below!\n"
233
- # status_log += "\n" + "=" * 50 + "\n"
234
- # status_log += "TRAINING COMPLETE - You can now download the model files\n"
235
- # status_log += "=" * 50
236
- #
237
- # yield status_log, model_download, vocab_download
238
- #
239
- # except Exception as e:
240
- # error_msg = f"❌ Unexpected error: {str(e)}\n"
241
- # import traceback
242
- # error_msg += f"\nTraceback:\n{traceback.format_exc()}"
243
- # yield error_msg, None, None
244
- #
245
- #
246
- # def download_models_from_hub():
247
- # """Download the latest models from the Hugging Face Hub."""
248
- # try:
249
- # os.makedirs(OUTPUT_DIR, exist_ok=True)
250
- #
251
- # api = HfApi()
252
- # #files = api.list_repo_files(REPO_ID, token=HF_TOKEN)
253
- # files = api.list_repo_files(REPO_ID)
254
- #
255
- # downloaded_files = []
256
- #
257
- # # Download model
258
- # if MODEL_FILE in files:
259
- # print(f"📥 Downloading {MODEL_FILE} from Hub...")
260
- # model_path = hf_hub_download(
261
- # repo_id=REPO_ID,
262
- # filename=MODEL_FILE,
263
- # # token=HF_TOKEN,
264
- # local_dir=OUTPUT_DIR,
265
- # force_download=True
266
- # )
267
- # downloaded_files.append(MODEL_FILE)
268
- # else:
269
- # return f"❌ {MODEL_FILE} not found in repository", None, None
270
- #
271
- # # Download vocab
272
- # if VOCAB_FILE in files:
273
- # print(f"📥 Downloading {VOCAB_FILE} from Hub...")
274
- # vocab_path = hf_hub_download(
275
- # repo_id=REPO_ID,
276
- # filename=VOCAB_FILE,
277
- # # token=HF_TOKEN,
278
- # local_dir=OUTPUT_DIR,
279
- # force_download=True
280
- # )
281
- # downloaded_files.append(VOCAB_FILE)
282
- # else:
283
- # return f"❌ {VOCAB_FILE} not found in repository", None, None
284
- #
285
- # # Copy to temp for download
286
- # temp_dir = tempfile.mkdtemp()
287
- # temp_model = os.path.join(temp_dir, MODEL_FILE)
288
- # temp_vocab = os.path.join(temp_dir, VOCAB_FILE)
289
- #
290
- # shutil.copy2(os.path.join(OUTPUT_DIR, MODEL_FILE), temp_model)
291
- # shutil.copy2(os.path.join(OUTPUT_DIR, VOCAB_FILE), temp_vocab)
292
- #
293
- # success_msg = f"✅ Successfully downloaded from Hub:\n"
294
- # success_msg += f" • {MODEL_FILE}\n"
295
- # success_msg += f" • {VOCAB_FILE}\n\n"
296
- # success_msg += "📦 Files are ready to download below!"
297
- #
298
- # return success_msg, temp_model, temp_vocab
299
- #
300
- # except Exception as e:
301
- # error_msg = f"❌ Error downloading models: {str(e)}\n\n"
302
- # error_msg += f"Make sure:\n"
303
- # error_msg += f"1. REPO_ID is set correctly: {REPO_ID}\n"
304
- # error_msg += f"2. HF_TOKEN is set in Space secrets\n"
305
- # error_msg += f"3. Model files exist in the repository"
306
- # return error_msg, None, None
307
- #
308
- #
309
- # # Create Gradio interface
310
- # with gr.Blocks(title="MCQ Structure Extraction - Model Training", theme=gr.themes.Soft()) as demo:
311
- # gr.Markdown(
312
- # """
313
- # # 🎓 MCQ Structure Extraction - Model Training
314
- #
315
- # Train a BiLSTM-CRF model with deep layout understanding for extracting structured information from MCQ documents.
316
- #
317
- # ## 📋 Instructions:
318
- # 1. **Upload Dataset**: Provide your unified JSON file containing tokens, bounding boxes, and labels
319
- # 2. **Train Model**: Click "Start Training" and wait for completion (this may take a while)
320
- # 3. **Download Models**: Once training is complete, download the trained model and vocabulary files
321
- #
322
- # ## 📥 Or Download Existing Models:
323
- # If you just want to download the latest trained models from the repository, use the "Download from Hub" tab.
324
- #
325
- # ---
326
- # """
327
- # )
328
- #
329
- # with gr.Tab("🚀 Train New Model"):
330
- # gr.Markdown(
331
- # """
332
- # ### Training Process:
333
- # The app will automatically:
334
- # 1. ✅ Download any existing models from Hugging Face Hub (for resuming training)
335
- # 2. 🎯 Train the model on your uploaded dataset
336
- # 3. ☁️ Upload the trained models back to the Hub
337
- # 4. 📥 Provide download links for the trained files
338
- #
339
- # **Note**: Training progress details appear in the terminal/logs. The status box shows major milestones.
340
- # """
341
- # )
342
- #
343
- # with gr.Row():
344
- # with gr.Column():
345
- # dataset_input = gr.File(
346
- # label="📂 Upload Training Dataset (JSON)",
347
- # file_types=[".json"],
348
- # type="filepath"
349
- # )
350
- # train_button = gr.Button("🚀 Start Training", variant="primary", size="lg")
351
- #
352
- # with gr.Column():
353
- # status_output = gr.Textbox(
354
- # label="📊 Training Status",
355
- # lines=12,
356
- # interactive=False,
357
- # show_copy_button=True
358
- # )
359
- #
360
- # gr.Markdown("### 📦 Download Trained Models")
361
- # with gr.Row():
362
- # model_output = gr.File(label="💾 Model File (.pt)")
363
- # vocab_output = gr.File(label="📚 Vocabulary File (.pkl)")
364
- #
365
- # train_button.click(
366
- # fn=train_model,
367
- # inputs=[dataset_input],
368
- # outputs=[status_output, model_output, vocab_output]
369
- # )
370
- #
371
- # with gr.Tab("☁️ Download from Hub"):
372
- # gr.Markdown(
373
- # """
374
- # ### Download Pre-trained Models
375
- #
376
- # Download the latest trained models directly from your Hugging Face repository.
377
- # This is useful if:
378
- # - You want to use pre-trained models without training
379
- # - You need to download models trained in a previous session
380
- # - You want to get the latest version from the Hub
381
- #
382
- # The downloaded files can be used for inference with your MCQ extraction pipeline.
383
- # """
384
- # )
385
- #
386
- # download_button = gr.Button("☁️ Download Latest Models from Hub", variant="primary", size="lg")
387
- #
388
- # download_status = gr.Textbox(
389
- # label="Download Status",
390
- # lines=6,
391
- # interactive=False,
392
- # show_copy_button=True
393
- # )
394
- #
395
- # gr.Markdown("### 📦 Downloaded Files")
396
- # with gr.Row():
397
- # hub_model_output = gr.File(label="💾 Model File (.pt)")
398
- # hub_vocab_output = gr.File(label="📚 Vocabulary File (.pkl)")
399
- #
400
- # download_button.click(
401
- # fn=download_models_from_hub,
402
- # outputs=[download_status, hub_model_output, hub_vocab_output]
403
- # )
404
- #
405
- # gr.Markdown(
406
- # """
407
- # ---
408
- # ### ⚙️ Model Configuration:
409
- #
410
- # **Architecture:**
411
- # - BiLSTM-CRF with spatial attention mechanism
412
- # - Word embeddings + Character-level CNN
413
- # - Bounding box encoding with MLP
414
- # - Spatial & context feature extraction
415
- # - Learnable positional embeddings
416
- #
417
- # **Features Used:**
418
- # - Token text (word-level and character-level)
419
- # - Bounding box coordinates (normalized)
420
- # - Spatial features: vertical spacing, alignment, dimensions (11 features)
421
- # - Context features: surrounding question/option markers (8 features)
422
- #
423
- # **Output Labels (13 total):**
424
- # - Questions, Options, Answers, Images, Section Headings, Passages (BIO tagging)
425
- #
426
- # **Training Parameters:**
427
- # - Batch Size: 8
428
- # - Epochs: 10 (with early stopping after 10 epochs without improvement)
429
- # - Learning Rate: 5e-4 (AdamW optimizer with OneCycleLR scheduler)
430
- # - Hidden Size: 768
431
- # - Total Parameters: ~15.6M
432
- #
433
- # **Hardware Requirements:**
434
- # - GPU recommended for reasonable training speed
435
- # - CPU training supported but significantly slower
436
- #
437
- # ---
438
- #
439
- #
440
- #
441
- # **Environment Variables Required:**
442
- # - `SPACE_ID`: Your Hugging Face Space/Repo ID (auto-set in Spaces)
443
- # - `HF_TOKEN`: Your Hugging Face write token (set as a secret)
444
- #
445
- # **Model Persistence:**
446
- # - Models are automatically saved to `output_data/` directory
447
- # - Best model is uploaded to Hugging Face Hub after each improvement
448
- # - Training can be resumed from checkpoints
449
- # """
450
- # )
451
- #
452
- # # Launch the app
453
- # if __name__ == "__main__":
454
- # demo.launch()
455
 
456
 
457
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
 
3
  import os