heerjtdev committed on
Commit
1419db4
·
verified ·
1 Parent(s): 7225749

Rename LSTM_datset_converter.py to app.py

Browse files
Files changed (2) hide show
  1. LSTM_datset_converter.py +0 -103
  2. app.py +426 -0
LSTM_datset_converter.py DELETED
@@ -1,103 +0,0 @@
1
- import json
2
- import os
3
- from typing import List, Dict, Any
4
-
5
-
6
def load_and_align_unified_data(input_json_path: str, output_json_path: str) -> str:
    """
    Convert a Label Studio export into token-level B-I-O training data.

    Loads the Label Studio JSON (with pre-extracted words and bboxes) and
    aligns the character-offset labels to create a token-level
    (Token, Label, Bbox) training file (the Unified JSON).

    Args:
        input_json_path: Path to the Label Studio export. Each item must
            carry ``data.original_words`` / ``data.original_bboxes`` and at
            least one entry in ``annotations``.
        output_json_path: Where the flattened token list is written.

    Returns:
        The ``output_json_path`` that was written.

    Raises:
        FileNotFoundError: If ``input_json_path`` does not exist.
    """
    if not os.path.exists(input_json_path):
        raise FileNotFoundError(f"Input JSON file not found at: {input_json_path}")

    with open(input_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The Label Studio output is a list where each item is a document/page.
    processed_documents = []

    for item in data:
        # 1. Get the pre-extracted tokens and bboxes from the 'data' field.
        words = item["data"].get("original_words", [])
        bboxes = item["data"].get("original_bboxes", [])

        if not words or not bboxes or not item.get("annotations"):
            print(f"Skipping item {item.get('id', 'N/A')}: Missing words, bboxes, or annotations.")
            continue

        # Initialize labels for every token to 'O' (Outside).
        labels = ["O"] * len(words)

        # 2. Get the character-offset annotations from the first annotation set.
        annotations = item["annotations"][0].get("result", [])

        # 3. Perform alignment: match each labeled snippet to the token list.
        for res in annotations:
            if "value" in res and "labels" in res["value"]:
                text_snippet = res["value"].get("text", "")
                label_names = res["value"]["labels"]

                # Tokenize the labeled snippet with a simple space split.
                # (MUST match the tokenization used to create
                # 'original_words' for the comparison below to succeed.)
                text_tokens = text_snippet.split()

                # FIX: skip empty snippets and empty label lists. An empty
                # token sequence matches trivially at index 0 (words[0:0]
                # == []) and used to mislabel the first token; an empty
                # label list raised IndexError on [0].
                if not text_tokens or not label_names:
                    continue

                tag = label_names[0].upper()  # e.g., 'QUESTION'

                # Find the first index where the token sequence matches.
                for i in range(len(words) - len(text_tokens) + 1):
                    if words[i:i + len(text_tokens)] == text_tokens:
                        # Apply the B-I-O scheme.
                        labels[i] = f"B-{tag}"
                        for j in range(1, len(text_tokens)):
                            if i + j < len(labels):  # bounds safety
                                labels[i + j] = f"I-{tag}"
                        break  # Found the match, move to the next annotation.

        # 4. Construct the token-level output structure for this document.
        document_tokens = [
            {"token": word, "label": label, "bbox": bbox}
            for word, label, bbox in zip(words, labels, bboxes)
        ]
        processed_documents.append(document_tokens)

    # Flatten the per-document lists into a single token sequence — the
    # training script's load_unified_data expects a flat list.
    flat_output = [token for doc in processed_documents for token in doc]

    # Save the final Unified JSON.
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(flat_output, f, indent=2, ensure_ascii=False)

    print(f"✅ Alignment successful. Unified training data saved to: {output_json_path}")
    print(f"Total aligned tokens: {len(flat_output)}")
    return output_json_path
82
-
83
-
84
- # ==============================================================================
85
-
86
if __name__ == '__main__':
    # --- Configuration ---
    # ⚠️ 1. Path to the uploaded Label Studio output JSON file.
    INPUT_FILE = "project-6-at-2026-01-21-07-10-460e552c.json"

    # 2. Path for the output file (this is the Unified JSON path).
    OUTPUT_FILE = "unified_training_data_bluuhhhhh.json"

    os.makedirs("output_data", exist_ok=True)

    try:
        # Run the alignment step.
        result_path = load_and_align_unified_data(INPUT_FILE, OUTPUT_FILE)
    except Exception as err:
        print(f"\n❌ An error occurred during alignment: {err}")
    else:
        print("\nReady for Training! Use this path in your fixed training script:")
        print(f'UNIFIED_DATA_PATH = "{result_path}"')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ import subprocess
4
+ import os
5
+ import sys
6
+ from datetime import datetime
7
+ import shutil
8
+
9
+
10
# Training script executed as a subprocess by train_model().
TRAINING_SCRIPT = "train.py"

# Directory where train.py is expected to write checkpoints, and the
# expected final model file name/path within it.
MODEL_OUTPUT_DIR = "checkpoints"
MODEL_FILE_NAME = "layoutlmv3_crf_passage.pth"
MODEL_FILE_PATH = os.path.join(MODEL_OUTPUT_DIR, MODEL_FILE_NAME)
15
+
16
+
17
+ # ----------------------------------------------------------------
18
+
19
def retrieve_model():
    """
    Check for the final trained model file and stage it for download.

    Useful for when the training job finishes server-side but the
    client connection has timed out before the download button appeared.

    Returns:
        tuple: (log text, download path or None, gr.Button visibility update)
    """
    # FIX: use the module-level MODEL_* constants instead of re-declaring
    # local copies that shadowed them (a drift hazard if paths change).
    if os.path.exists(MODEL_FILE_PATH):
        file_size = os.path.getsize(MODEL_FILE_PATH) / (1024 * 1024)  # Size in MB

        # CRITICAL: copy to the system temp dir so Gradio can reliably serve it.
        import tempfile
        temp_dir = tempfile.gettempdir()
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        temp_model_path = os.path.join(temp_dir, f"layoutlmv3_trained_{timestamp}_recovered.pth")

        try:
            shutil.copy2(MODEL_FILE_PATH, temp_model_path)
            download_path = temp_model_path

            # FIX: removed unprofessional filler text from this user-facing message.
            log_output = (
                f"--- Model Status Check: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ---\n"
                f"🎉 SUCCESS! A trained model was found and recovered.\n"
                f"📦 Model file: {MODEL_FILE_PATH}\n"
                f"📊 Model size: {file_size:.2f} MB\n"
                f"🔗 Download path prepared: {download_path}\n\n"
                f"⬇️ Click the '📥 Download Model' button below to save your model."
            )
            return log_output, download_path, gr.Button(visible=True)

        except Exception as e:
            log_output = (
                f"--- Model Status Check FAILED ---\n"
                f"⚠️ Trained model found, but could not prepare for download: {e}\n"
                f"📁 Original Path: {MODEL_FILE_PATH}. Try again or check Space logs."
            )
            return log_output, None, gr.Button(visible=False)

    else:
        log_output = (
            f"--- Model Status Check: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ---\n"
            f"❌ Model file not found at {MODEL_FILE_PATH}.\n"
            f"Training may still be running or it failed. Check back later."
        )
        return log_output, None, gr.Button(visible=False)
67
+
68
+
69
+
70
+
71
def clear_memory(dataset_file: gr.File):
    """
    Delete the model output directory and the uploaded dataset file.

    Args:
        dataset_file: The currently uploaded Gradio file object (or None).

    Returns:
        tuple: (log text, cleared model-path state, hidden download button
        update, cleared download-file component).
    """
    # FIX: use the module-level MODEL_OUTPUT_DIR instead of re-declaring a
    # local copy that shadowed it.
    log_output = f"--- Memory Clear Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ---\n"

    # 1. Clear Model Checkpoints Directory
    if os.path.exists(MODEL_OUTPUT_DIR):
        try:
            shutil.rmtree(MODEL_OUTPUT_DIR)
            log_output += f"✅ Successfully deleted model directory: {MODEL_OUTPUT_DIR}\n"
        except Exception as e:
            log_output += f"❌ ERROR deleting model directory {MODEL_OUTPUT_DIR}: {e}\n"
    else:
        log_output += f"ℹ️ Model directory not found: {MODEL_OUTPUT_DIR} (Nothing to delete)\n"

    # 2. Clear Uploaded Dataset File (Temporary file cleanup)
    if dataset_file is not None:
        input_path = dataset_file.name if hasattr(dataset_file, 'name') else str(dataset_file)
        if os.path.exists(input_path):
            try:
                os.remove(input_path)
                log_output += f"✅ Successfully deleted uploaded dataset file: {input_path}\n"
            except Exception as e:
                log_output += f"❌ ERROR deleting dataset file {input_path}: {e}\n"
        else:
            log_output += f"ℹ️ Uploaded dataset file not found at {input_path}.\n"
    else:
        log_output += "ℹ️ No dataset file currently tracked for deletion.\n"

    # 3. Final message and state reset
    log_output += f"--- Memory Clear Complete: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ---\n"
    log_output += "✨ Files and checkpoints have been removed. You can now start a fresh training run."

    # Reset log_output, model_path_state, download_btn visibility, and model_download component
    return log_output, None, gr.Button(visible=False), None
109
+
110
+
111
+
112
def train_model(dataset_file: gr.File, batch_size: int, epochs: int, lr: float, max_len: int, progress=gr.Progress()):
    """
    Handle the Gradio submission and execute the training script using subprocess.

    Yields (log_text, download_path, download_button_update) tuples so the UI
    receives logs in real time while training runs.

    Args:
        dataset_file: Uploaded Label Studio JSON file (gradio File object).
        batch_size: Training batch size forwarded to the script.
        epochs: Number of epochs forwarded to the script.
        lr: Learning rate forwarded to the script.
        max_len: Max sequence length forwarded to the script.
        progress: Gradio progress tracker (injected by Gradio).
    """

    # 1. Setup: Create output directory if it doesn't exist
    os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

    # 2. File Handling: Use the temporary path of the uploaded file
    if dataset_file is None:
        yield "❌ ERROR: Please upload a file.", None, gr.Button(visible=False)
        return

    # dataset_file is a gradio.File object; .name holds the temp path
    # (e.g. /tmp/gradio/.../filename.json).
    input_path = dataset_file.name if hasattr(dataset_file, 'name') else str(dataset_file)

    # Verify the file actually exists before proceeding
    if not os.path.exists(input_path):
        error_msg = f"❌ ERROR: Uploaded file not found at {input_path}. Please try uploading again."
        yield error_msg, None, gr.Button(visible=False)
        return

    if not input_path.lower().endswith(".json"):
        yield "❌ ERROR: Please upload a valid Label Studio JSON file (.json).", None, gr.Button(visible=False)
        return

    progress(0.1, desc="Starting LayoutLMv3 Training...")

    log_output = f"--- Training Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ---\n"

    # 3. Construct the subprocess command (argument list, shell=False style)
    command = [
        sys.executable,
        TRAINING_SCRIPT,
        "--mode", "train",
        "--input", input_path,
        "--batch_size", str(batch_size),
        "--epochs", str(epochs),
        "--lr", str(lr),
        "--max_len", str(max_len)
    ]

    log_output += f"Executing command: {' '.join(command)}\n\n"
    yield log_output, None, gr.Button(visible=False)  # Initial yield

    process = None
    try:
        # 4. Run the training script, merging stderr into stdout so all
        # output streams through a single pipe.
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1  # line-buffered so logs stream promptly
        )

        # Stream logs in real-time
        for line in iter(process.stdout.readline, ""):
            log_output += line
            # Print to console as well for debugging
            print(line, end='')
            # Yield updated logs in real-time
            yield log_output, None, gr.Button(visible=False)

        process.stdout.close()
        return_code = process.wait()

        # 5. Check for successful completion
        if return_code == 0:
            log_output += "\n" + "=" * 60 + "\n"
            log_output += "✅ TRAINING COMPLETE! Model saved successfully.\n"
            log_output += "=" * 60 + "\n"
            print("\n✅ TRAINING COMPLETE! Model saved.")

            # 6. Verify model file exists
            if os.path.exists(MODEL_FILE_PATH):
                file_size = os.path.getsize(MODEL_FILE_PATH) / (1024 * 1024)  # Size in MB
                log_output += f"\n📦 Model file found: {MODEL_FILE_PATH}"
                log_output += f"\n📊 Model size: {file_size:.2f} MB"

                print(f"\n✅ Model exists at: {MODEL_FILE_PATH} ({file_size:.2f} MB)")

                # CRITICAL: Copy to a simple location that Gradio can reliably serve
                # (same temp directory pattern as the uploaded JSON file).
                import tempfile
                temp_dir = tempfile.gettempdir()
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

                # Create filename in temp directory
                temp_model_path = os.path.join(temp_dir, f"layoutlmv3_trained_{timestamp}.pth")

                try:
                    # Copy the model to temp directory
                    shutil.copy2(MODEL_FILE_PATH, temp_model_path)
                    log_output += "\n📋 Model copied to temporary download location"
                    log_output += f"\n🔗 Download path: {temp_model_path}"
                    print(f"✅ Model copied to temp location: {temp_model_path}")

                    # Verify the copy exists
                    if os.path.exists(temp_model_path):
                        log_output += "\n✅ Download file verified and ready!"
                        download_path = temp_model_path
                    else:
                        log_output += "\n⚠️ Warning: Temp copy verification failed, using original path"
                        download_path = MODEL_FILE_PATH

                except Exception as e:
                    log_output += f"\n⚠️ Could not create temp copy: {e}"
                    log_output += f"\n📁 Using original path: {MODEL_FILE_PATH}"
                    print(f"⚠️ Copy failed: {e}, using original path")
                    download_path = MODEL_FILE_PATH

                # Final success message
                log_output += f"\n\n{'=' * 60}"
                log_output += "\n🎉 SUCCESS! Your model is ready for download."
                log_output += f"\n{'=' * 60}"
                log_output += "\n\n⬇️ Click the '📥 Download Model' button below to save your model."
                log_output += "\n⚠️ CRITICAL: Download NOW! File will be deleted when:"
                log_output += "\n - This tab is closed"
                log_output += "\n - Space restarts or goes idle"
                log_output += "\n - System clears temp files"
                log_output += "\n\n📥 The file will download as a .pth file to your computer's Downloads folder."
                log_output += f"\n\n{'=' * 60}\n"

                # Return final logs and make download button visible
                yield log_output, download_path, gr.Button(visible=True)
                return
            else:
                log_output += f"\n⚠️ WARNING: Training completed, but model file not found at expected path ({MODEL_FILE_PATH})."
                log_output += "\n🔍 Checking directory contents..."

                # List files in checkpoints directory for debugging
                if os.path.exists(MODEL_OUTPUT_DIR):
                    files = os.listdir(MODEL_OUTPUT_DIR)
                    log_output += f"\n📁 Files in {MODEL_OUTPUT_DIR}: {files}"
                else:
                    log_output += f"\n❌ Directory {MODEL_OUTPUT_DIR} does not exist!"

                yield log_output, None, gr.Button(visible=False)
                return
        else:
            log_output += f"\n\n{'=' * 60}\n"
            log_output += f"❌ TRAINING FAILED with return code {return_code}\n"
            log_output += f"{'=' * 60}\n"
            log_output += "\nPlease check the logs above for error details.\n"
            yield log_output, None, gr.Button(visible=False)
            return

    except FileNotFoundError:
        error_msg = f"❌ ERROR: The training script '{TRAINING_SCRIPT}' was not found. Ensure it is in the root directory of your Space."
        print(error_msg)
        yield log_output + "\n" + error_msg, None, gr.Button(visible=False)
        return
    except Exception as e:
        error_msg = f"❌ An unexpected error occurred: {e}"
        print(error_msg)
        import traceback
        print(traceback.format_exc())
        yield log_output + "\n" + error_msg, None, gr.Button(visible=False)
        return
    finally:
        # FIX: if the generator is closed early (client disconnect) or an
        # error interrupts streaming, make sure the child process does not
        # linger as a zombie holding an open pipe.
        if process is not None and process.poll() is None:
            process.kill()
            process.wait()
274
+
275
+
276
# --- Gradio Interface Setup (using Blocks for a nicer layout) ---
with gr.Blocks(title="LayoutLMv3 Fine-Tuning App by Aastik", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 LayoutLMv3 Fine-Tuning on Hugging Face Spaces")
    gr.Markdown(
        """
        Upload your Label Studio JSON file, set your hyperparameters, and click **Train Model** to fine-tune the LayoutLMv3 model.

        **⚠️ IMPORTANT - Free Tier Users:**
        - **Download your model IMMEDIATELY** after training completes!
        - The model file is **temporary** and will be deleted when the Space restarts.
        - A download button will appear below once training is complete.
        - **Real-time logs** will stream during training so you can monitor progress.

        **⏱️ Timeout Note:** Training may timeout on free tier. Consider reducing epochs or batch size for faster training.
        """
    )

    with gr.Row():
        # Left column: dataset upload, hyperparameters, and action buttons.
        with gr.Column(scale=1):
            gr.Markdown("### 📁 Dataset Upload")
            file_input = gr.File(
                label="Upload Label Studio JSON Dataset",
                file_types=[".json"]
            )

            gr.Markdown("---")
            gr.Markdown("### ⚙️ Training Parameters")

            batch_size_input = gr.Slider(
                minimum=1, maximum=16, step=1, value=4,
                label="Batch Size",
                info="Smaller = less memory, slower training"
            )
            epochs_input = gr.Slider(
                minimum=1, maximum=10, step=1, value=3,
                label="Epochs",
                info="Fewer epochs = faster training (recommended: 3-5)"
            )
            lr_input = gr.Number(
                value=5e-5, label="Learning Rate",
                info="Default: 5e-5"
            )
            max_len_input = gr.Slider(
                minimum=128, maximum=512, step=128, value=512,
                label="Max Sequence Length",
                info="Shorter = faster training, less memory"
            )

            train_button = gr.Button("🔥 Start Training", variant="primary", size="lg")
            check_button = gr.Button("🔍 Check Model Status/Download", variant="secondary", size="lg")
            clear_button = gr.Button("🧹 Clear Model/Dataset Files", variant="stop", size="lg")

        # Right column: live training logs and the download area.
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Training Progress (Real-Time Logs)")

            log_output = gr.Textbox(
                label="Training Logs - Updates in Real-Time",
                lines=25,
                max_lines=30,
                autoscroll=True,
                show_copy_button=True,
                placeholder="Click 'Start Training' to begin...\n\nLogs will stream here in real-time as training progresses."
            )

            gr.Markdown("### ⬇️ Download Trained Model")

            # Hidden state to store the staged model file path
            model_path_state = gr.State(value=None)

            # Download button (initially hidden; revealed once a model is staged)
            download_btn = gr.Button(
                "📥 Download Model (.pth file)",
                variant="primary",
                size="lg",
                visible=False
            )

            check_button.click(
                fn=retrieve_model,  # checks disk for a finished model and stages it
                inputs=[],
                outputs=[log_output, model_path_state, download_btn]
            )



            # File output for download
            model_download = gr.File(
                label="Your trained model will appear here after clicking Download",
                interactive=False,
                visible=True
            )

            clear_button.click(
                fn=clear_memory,
                inputs=[file_input],  # Pass the uploaded file object to delete the temp file
                outputs=[log_output, model_path_state, download_btn, model_download]
            )

            gr.Markdown(
                """
                **📥 Download Instructions:**
                1. Wait for training to complete - watch the real-time logs above
                2. Look for **"✅ TRAINING COMPLETE!"** message
                3. Click the **"📥 Download Model"** button that appears above
                4. Save the `.pth` file to your local machine
                5. **Do this immediately** - file is temporary and will be deleted on Space restart!

                **🔧 Troubleshooting:**
                - If download button doesn't appear, check the logs for errors
                - Try reducing epochs or batch size if timeout occurs
                - Ensure your JSON file is properly formatted
                - Logs update in real-time - you can monitor training progress
                """
            )

    # Define the training action - now with real-time log streaming via yield
    train_button.click(
        fn=train_model,
        inputs=[file_input, batch_size_input, epochs_input, lr_input, max_len_input],
        outputs=[log_output, model_path_state, download_btn],
        api_name="train"
    )

    # Define the download action: push the staged path into the File component
    download_btn.click(
        fn=lambda path: path,
        inputs=[model_path_state],
        outputs=[model_download]
    )

    gr.Markdown(
        """
        ---
        ### 📖 About
        This Space fine-tunes LayoutLMv3 with CRF for document understanding tasks including:
        - Questions, Options, Answers
        - Section Headings
        - Passages

        **Model Details:** LayoutLMv3-base + CRF layer for sequence labeling

        **Features:**
        - ✅ Real-time log streaming during training
        - ✅ Progress monitoring with epoch/batch updates
        - ✅ Immediate model download after completion
        - ✅ Automatic file preparation for download
        """
    )

if __name__ == "__main__":
    demo.launch()