Rogaton Claude committed on
Commit
38ecdf5
·
1 Parent(s): 6d0a56b

Create Gradio-based Coptic translation interface for HF Space

Browse files

- Replace training app with bidirectional translation interface
- Support Coptic→English and English→Coptic translation
- Use Norelad/coptic-megalaa-finetuned and megalaa/english-coptic-translator
- Update requirements.txt for Gradio deployment
- Remove old training space files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

app.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Coptic Translation Interface - Hugging Face Space
4
+ Supports Coptic↔English translation using fine-tuned MEGALAA models
5
+ """
6
+
7
+ import gradio as gr
8
+ import torch
9
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
10
+
11
# Coptic <-> Greek transliteration used to pre/post-process model text.
#
# Greek-derived Coptic letters map to their Greek look-alikes; the seven
# Demotic-derived letters map to single Latin letters.  This is the scheme the
# MEGALAA translation models were fine-tuned with (ϣ→s, ϥ→f, ϧ→k, ϩ→h, ϫ→j,
# ϭ→c, ϯ→t), so inference must use exactly the same symbols.
_LOWER_COPTIC_TO_GREEK = {
    "ⲁ": "α", "ⲃ": "β", "ⲅ": "γ", "ⲇ": "δ", "ⲉ": "ε", "ⲋ": "ϛ",
    "ⲍ": "ζ", "ⲏ": "η", "ⲑ": "θ", "ⲓ": "ι", "ⲕ": "κ", "ⲗ": "λ",
    "ⲙ": "μ", "ⲛ": "ν", "ⲝ": "ξ", "ⲟ": "ο", "ⲡ": "π", "ⲣ": "ρ",
    "ⲥ": "σ", "ⲧ": "τ", "ⲩ": "υ", "ⲫ": "φ", "ⲭ": "χ", "ⲯ": "ψ",
    "ⲱ": "ω", "ϣ": "s", "ϥ": "f", "ϧ": "k", "ϩ": "h", "ϫ": "j",
    "ϭ": "c", "ϯ": "t",
}

# Public table: lowercase pairs plus their uppercase counterparts.  Every
# value is a single character, so the table inverts without ambiguity.
COPTIC_TO_GREEK = {
    **_LOWER_COPTIC_TO_GREEK,
    **{
        coptic.upper(): greek.upper()
        for coptic, greek in _LOWER_COPTIC_TO_GREEK.items()
    },
}

GREEK_TO_COPTIC = {greek: coptic for coptic, greek in COPTIC_TO_GREEK.items()}


def greekify(coptic_text):
    """Convert Coptic Unicode text to the lowercase Greek transcription.

    Every character is lowercased before lookup, so the result is always
    lowercase.  Characters without a mapping (spaces, punctuation, ...) are
    passed through lowercased.

    Args:
        coptic_text: Text in Unicode Coptic characters; may be empty or None.

    Returns:
        The transliterated string, or "" for empty/None input.
    """
    if not coptic_text:
        return ""
    lowered = (ch.lower() for ch in coptic_text)
    return "".join(COPTIC_TO_GREEK.get(ch, ch) for ch in lowered)


def degreekify(greek_text):
    """Convert Greek transcription back to Coptic Unicode.

    The mapping is strictly one character to one character, so no digraph
    handling is needed.  Case is preserved (uppercase transcription yields
    uppercase Coptic); unmapped characters are passed through unchanged.

    Args:
        greek_text: Transcribed text as produced by the model; may be empty
            or None.

    Returns:
        The Coptic string, or "" for empty/None input.
    """
    if not greek_text:
        return ""
    return "".join(GREEK_TO_COPTIC.get(ch, ch) for ch in greek_text)
43
+
44
# Model loading with caching.
# Each cache starts as None and is filled with a (tokenizer, model) tuple by
# the matching load_*() function the first time that direction is used.
coptic_to_english_model = None
english_to_coptic_model = None
# Use the GPU when PyTorch reports CUDA is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
48
+
49
def load_coptic_to_english():
    """Return the cached (tokenizer, model) pair for Coptic→English.

    The pair is created on the first call from the
    "Norelad/coptic-megalaa-finetuned" checkpoint, with the model moved to
    ``device``; later calls return the same tuple.
    """
    global coptic_to_english_model
    if coptic_to_english_model is not None:
        return coptic_to_english_model

    checkpoint = "Norelad/coptic-megalaa-finetuned"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)
    coptic_to_english_model = (tok, seq2seq)
    return coptic_to_english_model
57
+
58
def load_english_to_coptic():
    """Return the cached (tokenizer, model) pair for English→Coptic.

    The pair is created on the first call from the
    "megalaa/english-coptic-translator" checkpoint, with the model moved to
    ``device``; later calls return the same tuple.
    """
    global english_to_coptic_model
    if english_to_coptic_model is not None:
        return english_to_coptic_model

    checkpoint = "megalaa/english-coptic-translator"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)
    english_to_coptic_model = (tok, seq2seq)
    return english_to_coptic_model
66
+
67
def translate_coptic_to_english(text, dialect='cop-sa'):
    """Translate Coptic text to English.

    Args:
        text: Coptic text in Unicode Coptic characters.
        dialect: 'cop-sa' (Sahidic) or 'cop-bo' (Bohairic).  Any other value
            falls back to the Sahidic tag.

    Returns:
        The decoded English translation; "" when ``text`` is empty, blank or
        None; or a "Translation error: ..." message if loading or generation
        raised, so the UI always receives a string.
    """
    # Nothing to translate: avoid loading the model and generating output
    # from a bare dialect tag.
    if not text or not text.strip():
        return ""

    try:
        tokenizer, model = load_coptic_to_english()

        # Tag prepended to the input to tell the model which dialect it is.
        DIALECT_TAGS = {'cop-sa': 'з', 'cop-bo': 'б', 'cop': 'з'}
        dialect_tag = DIALECT_TAGS.get(dialect, 'з')

        # Preprocess: convert to Greek transcription and add the dialect tag.
        greek_input = greekify(text.lower())
        greek_input = f"{dialect_tag} {greek_input}"

        # truncation=True clips over-long input to the tokenizer's maximum
        # length instead of letting it exceed the model's position limit.
        inputs = tokenizer(
            greek_input,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            num_beams=5,
            early_stopping=True,
        )

        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # UI boundary: report the failure in the output box.
        return f"Translation error: {e}"
92
+
93
def translate_english_to_coptic(text):
    """Translate English text to Coptic.

    Args:
        text: English source text.

    Returns:
        The translation converted to Coptic Unicode; "" when ``text`` is
        empty, blank or None; or a "Translation error: ..." message if loading
        or generation raised, so the UI always receives a string.
    """
    # Nothing to translate: avoid loading the model for blank input.
    if not text or not text.strip():
        return ""

    try:
        tokenizer, model = load_english_to_coptic()

        # truncation=True clips over-long input to the tokenizer's maximum
        # length instead of letting it exceed the model's position limit.
        inputs = tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            num_beams=5,
            early_stopping=True,
        )

        # The model emits Greek transcription; map it back to Coptic letters.
        greek_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return degreekify(greek_output)
    except Exception as e:
        # UI boundary: report the failure in the output box.
        return f"Translation error: {e}"
112
+
113
+ # Example texts
114
# (Coptic text, English gloss) pairs offered as clickable examples in the
# Coptic → English tab.  Only the Coptic element (index 0) is passed to the UI.
COPTIC_EXAMPLES = [
    ("ϯⲛⲁⲃⲱⲕ ⲉⲡⲏⲓ", "I will go to the house"),
    ("ⲡⲉⲭⲣⲓⲥⲧⲟⲥ ⲡⲉ ⲡⲛⲟⲩⲧⲉ", "Christ is God"),
    ("ⲁⲓⲛⲁⲩ ⲉⲡⲣⲱⲙⲉ", "I saw the man"),
]

# English sentences offered as clickable examples in the English → Coptic tab.
ENGLISH_EXAMPLES = [
    "The Lord is good",
    "I am a teacher",
    "We give thanks to God",
]
125
+
126
+ # Gradio Interface
127
# Two-tab Gradio app: one tab per translation direction, each with an input
# column, an output column, example inputs and a translate button.
with gr.Blocks(title="Coptic Translation Interface", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔮 Coptic Translation Interface

    Translate between Coptic and English using fine-tuned MEGALAA models:
    - **Coptic → English**: `Norelad/coptic-megalaa-finetuned`
    - **English → Coptic**: `megalaa/english-coptic-translator`

    Based on 50,000+ parallel sentences from CopticScriptorium corpus.
    """)

    with gr.Tab("Coptic → English"):
        with gr.Row():
            with gr.Column():
                cop_input = gr.Textbox(
                    label="Coptic Text",
                    placeholder="Enter Coptic text (Unicode)...",
                    lines=5
                )
                # Choices are (label shown, value passed to the handler).
                cop_dialect = gr.Radio(
                    choices=[("Sahidic", "cop-sa"), ("Bohairic", "cop-bo")],
                    value="cop-sa",
                    label="Dialect"
                )
                cop_translate_btn = gr.Button("Translate to English", variant="primary")

            with gr.Column():
                cop_output = gr.Textbox(
                    label="English Translation",
                    lines=5,
                    interactive=False
                )

        # Each example row fills the text box with the Coptic side of a pair
        # and selects the Sahidic dialect.
        gr.Examples(
            examples=[[ex[0], "cop-sa"] for ex in COPTIC_EXAMPLES],
            inputs=[cop_input, cop_dialect],
            outputs=cop_output,
            fn=translate_coptic_to_english,
            label="Example Coptic Texts"
        )

        # Button click: (text, dialect) -> English output box.
        cop_translate_btn.click(
            fn=translate_coptic_to_english,
            inputs=[cop_input, cop_dialect],
            outputs=cop_output
        )

    with gr.Tab("English → Coptic"):
        with gr.Row():
            with gr.Column():
                eng_input = gr.Textbox(
                    label="English Text",
                    placeholder="Enter English text...",
                    lines=5
                )
                eng_translate_btn = gr.Button("Translate to Coptic", variant="primary")

            with gr.Column():
                eng_output = gr.Textbox(
                    label="Coptic Translation",
                    lines=5,
                    interactive=False
                )

        # One single-column example row per English sentence.
        gr.Examples(
            examples=[[ex] for ex in ENGLISH_EXAMPLES],
            inputs=eng_input,
            outputs=eng_output,
            fn=translate_english_to_coptic,
            label="Example English Texts"
        )

        # Button click: English text -> Coptic output box.
        eng_translate_btn.click(
            fn=translate_english_to_coptic,
            inputs=eng_input,
            outputs=eng_output
        )

    gr.Markdown("""
    ---
    ### About
    This interface uses fine-tuned MarianMT models trained on the CopticScriptorium parallel corpus.
    The models support bidirectional translation between Sahidic/Bohairic Coptic and English.

    **Note**: For best results with Coptic input, use proper Unicode Coptic characters (U+2C80–U+2CFF).
    """)

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()
hf_space_megalaa_training/app.py DELETED
@@ -1,470 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- HuggingFace Space for fine-tuning megalaa Coptic translation model
4
-
5
- This Gradio app provides a user-friendly interface for training the
6
- megalaa/coptic-english-translator model on your CopticScriptorium corpus.
7
- """
8
-
9
- import gradio as gr
10
- import os
11
- import subprocess
12
- import threading
13
- import time
14
- from pathlib import Path
15
-
16
- # Global variable to track training status
17
- training_status = {
18
- "running": False,
19
- "log": [],
20
- "completed": False,
21
- "error": None
22
- }
23
-
24
-
25
- def train_model(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name):
26
- """
27
- Start model training with uploaded data files
28
- """
29
- global training_status
30
-
31
- # Reset status
32
- training_status = {
33
- "running": True,
34
- "log": ["🚀 Starting training setup...\n"],
35
- "completed": False,
36
- "error": None
37
- }
38
-
39
- try:
40
- # Save uploaded files
41
- train_path = "train.jsonl"
42
- val_path = "val.jsonl"
43
-
44
- with open(train_path, "wb") as f:
45
- f.write(train_file)
46
- with open(val_path, "wb") as f:
47
- f.write(val_file)
48
-
49
- training_status["log"].append(f"✓ Training data saved: {train_path}\n")
50
- training_status["log"].append(f"✓ Validation data saved: {val_path}\n")
51
-
52
- # Create training script
53
- script_content = f'''#!/usr/bin/env python3
54
- import os
55
- import json
56
- import torch
57
- from datasets import load_dataset
58
- from transformers import (
59
- AutoTokenizer,
60
- AutoModelForSeq2SeqLM,
61
- Seq2SeqTrainingArguments,
62
- Seq2SeqTrainer,
63
- DataCollatorForSeq2Seq,
64
- )
65
- from huggingface_hub import HfApi, login
66
- from evaluate import load
67
- import numpy as np
68
- import logging
69
-
70
- logging.basicConfig(level=logging.INFO)
71
- logger = logging.getLogger(__name__)
72
-
73
- # HuggingFace Hub configuration
74
- HF_TOKEN = "{hf_token}"
75
- MODEL_REPO_NAME = "{model_repo_name}"
76
-
77
- if HF_TOKEN:
78
- login(token=HF_TOKEN)
79
- logger.info("✓ Logged in to HuggingFace Hub")
80
-
81
- # Greekification for megalaa models
82
- COPTIC_TO_GREEK = {{
83
- "ⲁ": "α", "ⲃ": "β", "ⲅ": "γ", "ⲇ": "δ", "ⲉ": "ε", "ⲋ": "ϛ",
84
- "ⲍ": "ζ", "ⲏ": "η", "ⲑ": "θ", "ⲓ": "ι", "ⲕ": "κ", "ⲗ": "λ",
85
- "ⲙ": "μ", "ⲛ": "ν", "ⲝ": "ξ", "ⲟ": "ο", "ⲡ": "π", "ⲣ": "ρ",
86
- "ⲥ": "σ", "ⲧ": "τ", "ⲩ": "υ", "ⲫ": "φ", "ⲭ": "χ", "ⲯ": "ψ",
87
- "ⲱ": "ω", "ϣ": "s", "ϥ": "f", "ϧ": "k", "ϩ": "h", "ϫ": "j",
88
- "ϭ": "c", "ϯ": "t",
89
- }}
90
-
91
- def greekify(text):
92
- if not text:
93
- return ""
94
- return "".join(COPTIC_TO_GREEK.get(c.lower(), c.lower()) for c in text)
95
-
96
- def extract_parallel_texts(examples):
97
- coptic_texts = []
98
- english_texts = []
99
-
100
- for messages in examples['messages']:
101
- coptic_text = None
102
- english_text = None
103
-
104
- for msg in messages:
105
- if msg['role'] == 'user' and 'Coptic text to English:' in msg['content']:
106
- coptic_text = msg['content'].split('Coptic text to English:')[-1].strip()
107
- elif msg['role'] == 'assistant':
108
- english_text = msg['content']
109
-
110
- coptic_texts.append(coptic_text)
111
- english_texts.append(english_text)
112
-
113
- return {{'coptic': coptic_texts, 'english': english_texts}}
114
-
115
- def preprocess_function(examples, tokenizer, max_length=256):
116
- greekified_coptic = [greekify(text.lower()) if text else "" for text in examples["coptic"]]
117
-
118
- model_inputs = tokenizer(
119
- greekified_coptic,
120
- max_length=max_length,
121
- truncation=True,
122
- padding="max_length"
123
- )
124
-
125
- labels = tokenizer(
126
- text_target=examples["english"],
127
- max_length=max_length,
128
- truncation=True,
129
- padding="max_length"
130
- )
131
-
132
- labels["input_ids"] = [
133
- [(label if label != tokenizer.pad_token_id else -100) for label in labels_example]
134
- for labels_example in labels["input_ids"]
135
- ]
136
-
137
- model_inputs["labels"] = labels["input_ids"]
138
- return model_inputs
139
-
140
- def compute_metrics(eval_preds, tokenizer, metric):
141
- preds, labels = eval_preds
142
-
143
- if isinstance(preds, tuple):
144
- preds = preds[0]
145
-
146
- labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
147
-
148
- decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
149
- decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
150
- decoded_labels = [[label] for label in decoded_labels]
151
-
152
- result = metric.compute(predictions=decoded_preds, references=decoded_labels)
153
- return {{"bleu": result["score"]}}
154
-
155
- # Configuration
156
- model_name = "megalaa/coptic-english-translator"
157
- output_dir = "coptic_megalaa_finetuned"
158
- num_epochs = {num_epochs}
159
- batch_size = {batch_size}
160
- learning_rate = {learning_rate}
161
-
162
- logger.info("="*60)
163
- logger.info("MEGALAA FINE-TUNING ON HUGGINGFACE SPACES")
164
- logger.info("="*60)
165
- logger.info(f"Base model: {{model_name}}")
166
- logger.info(f"Epochs: {{num_epochs}}")
167
- logger.info(f"Batch size: {{batch_size}}")
168
- logger.info(f"Learning rate: {{learning_rate}}")
169
-
170
- # Check GPU
171
- if torch.cuda.is_available():
172
- logger.info(f"GPU: {{torch.cuda.get_device_name(0)}}")
173
- logger.info(f"GPU Memory: {{torch.cuda.get_device_properties(0).total_memory / (1024**3):.1f}} GB")
174
- else:
175
- logger.warning("No GPU detected!")
176
-
177
- # Load model
178
- logger.info("\\nLoading model...")
179
- tokenizer = AutoTokenizer.from_pretrained(model_name)
180
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
181
-
182
- # Load datasets
183
- logger.info("Loading datasets...")
184
- train_dataset = load_dataset('json', data_files='{train_path}', split='train')
185
- val_dataset = load_dataset('json', data_files='{val_path}', split='train')
186
-
187
- logger.info(f"Train samples: {{len(train_dataset):,}}")
188
- logger.info(f"Validation samples: {{len(val_dataset):,}}")
189
-
190
- # Extract and tokenize
191
- logger.info("Processing datasets...")
192
- train_dataset = train_dataset.map(extract_parallel_texts, batched=True, remove_columns=['messages'])
193
- val_dataset = val_dataset.map(extract_parallel_texts, batched=True, remove_columns=['messages'])
194
-
195
- tokenized_train = train_dataset.map(
196
- lambda examples: preprocess_function(examples, tokenizer),
197
- batched=True,
198
- remove_columns=['coptic', 'english']
199
- )
200
- tokenized_val = val_dataset.map(
201
- lambda examples: preprocess_function(examples, tokenizer),
202
- batched=True,
203
- remove_columns=['coptic', 'english']
204
- )
205
-
206
- # Setup training
207
- data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)
208
- metric = load("sacrebleu")
209
-
210
- training_args = Seq2SeqTrainingArguments(
211
- output_dir=output_dir,
212
- num_train_epochs=num_epochs,
213
- per_device_train_batch_size=batch_size,
214
- per_device_eval_batch_size=batch_size,
215
- gradient_accumulation_steps=2,
216
- learning_rate=learning_rate,
217
- warmup_steps=500,
218
- max_grad_norm=1.0,
219
- weight_decay=0.01,
220
- eval_strategy="steps",
221
- eval_steps=500,
222
- logging_steps=50,
223
- save_steps=500,
224
- save_total_limit=3,
225
- load_best_model_at_end=True,
226
- metric_for_best_model="bleu",
227
- greater_is_better=True,
228
- predict_with_generate=True,
229
- generation_max_length=256,
230
- generation_num_beams=5,
231
- fp16=torch.cuda.is_available(),
232
- report_to="tensorboard",
233
- logging_dir=f"{{output_dir}}/logs",
234
- push_to_hub=False,
235
- )
236
-
237
- trainer = Seq2SeqTrainer(
238
- model=model,
239
- args=training_args,
240
- train_dataset=tokenized_train,
241
- eval_dataset=tokenized_val,
242
- tokenizer=tokenizer,
243
- data_collator=data_collator,
244
- compute_metrics=lambda eval_preds: compute_metrics(eval_preds, tokenizer, metric)
245
- )
246
-
247
- logger.info("\\nSTARTING TRAINING")
248
- logger.info("="*60)
249
-
250
- # Train
251
- trainer.train()
252
-
253
- # Save locally
254
- logger.info("\\nSaving final model...")
255
- trainer.save_model(f"{{output_dir}}/final")
256
- tokenizer.save_pretrained(f"{{output_dir}}/final")
257
-
258
- # Push to HuggingFace Hub
259
- if HF_TOKEN and MODEL_REPO_NAME:
260
- logger.info(f"\\nPushing model to HuggingFace Hub: {{MODEL_REPO_NAME}}")
261
- try:
262
- api = HfApi()
263
- api.create_repo(repo_id=MODEL_REPO_NAME, repo_type="model", exist_ok=True)
264
-
265
- # Upload all files
266
- api.upload_folder(
267
- folder_path=f"{{output_dir}}/final",
268
- repo_id=MODEL_REPO_NAME,
269
- repo_type="model",
270
- )
271
- logger.info(f"✅ Model successfully pushed to: https://huggingface.co/{{MODEL_REPO_NAME}}")
272
- except Exception as e:
273
- logger.error(f"❌ Failed to push to Hub: {{e}}")
274
-
275
- # Final evaluation
276
- logger.info("\\nFinal evaluation...")
277
- eval_results = trainer.evaluate()
278
-
279
- logger.info("\\n" + "="*60)
280
- logger.info("TRAINING COMPLETE!")
281
- logger.info("="*60)
282
- for key, value in eval_results.items():
283
- logger.info(f"{{key}}: {{value}}")
284
-
285
- logger.info(f"\\n✅ Model saved locally to: {{output_dir}}/final")
286
- if HF_TOKEN and MODEL_REPO_NAME:
287
- logger.info(f"✅ Model available at: https://huggingface.co/{{MODEL_REPO_NAME}}")
288
- '''
289
-
290
- with open("train_script.py", "w") as f:
291
- f.write(script_content)
292
-
293
- training_status["log"].append("✓ Training script created\n")
294
- training_status["log"].append("🚀 Starting training...\n\n")
295
-
296
- # Run training in subprocess
297
- process = subprocess.Popen(
298
- ["python", "train_script.py"],
299
- stdout=subprocess.PIPE,
300
- stderr=subprocess.STDOUT,
301
- text=True,
302
- bufsize=1
303
- )
304
-
305
- # Stream output
306
- for line in process.stdout:
307
- training_status["log"].append(line)
308
- time.sleep(0.01) # Small delay to allow UI updates
309
-
310
- process.wait()
311
-
312
- if process.returncode == 0:
313
- training_status["completed"] = True
314
- training_status["log"].append("\n\n✅ TRAINING COMPLETED SUCCESSFULLY!\n")
315
- training_status["log"].append("📦 Model saved locally to: coptic_megalaa_finetuned/final\n")
316
- if hf_token and model_repo_name:
317
- training_status["log"].append(f"📦 Model pushed to: https://huggingface.co/{model_repo_name}\n")
318
- else:
319
- training_status["error"] = f"Training failed with exit code {process.returncode}"
320
- training_status["log"].append(f"\n\n❌ Training failed with exit code {process.returncode}\n")
321
-
322
- except Exception as e:
323
- training_status["error"] = str(e)
324
- training_status["log"].append(f"\n\n❌ Error: {str(e)}\n")
325
-
326
- finally:
327
- training_status["running"] = False
328
-
329
-
330
- def start_training(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name):
331
- """
332
- Start training in background thread
333
- """
334
- if training_status["running"]:
335
- return "⚠️ Training already in progress!"
336
-
337
- if not hf_token or not model_repo_name:
338
- return "⚠️ Please provide both HuggingFace Token and Model Repository Name!"
339
-
340
- # Start training thread
341
- thread = threading.Thread(
342
- target=train_model,
343
- args=(train_file, val_file, num_epochs, batch_size, learning_rate, hf_token, model_repo_name)
344
- )
345
- thread.daemon = True
346
- thread.start()
347
-
348
- return "🚀 Training started! Monitor progress in the logs below."
349
-
350
-
351
- def get_training_log():
352
- """
353
- Return current training log
354
- """
355
- return "".join(training_status["log"])
356
-
357
-
358
- def check_status():
359
- """
360
- Return training status
361
- """
362
- if training_status["completed"]:
363
- return "✅ Training completed!"
364
- elif training_status["error"]:
365
- return f"❌ Error: {training_status['error']}"
366
- elif training_status["running"]:
367
- return "🔄 Training in progress..."
368
- else:
369
- return "⏸️ Ready to train"
370
-
371
-
372
- # Create Gradio interface
373
- with gr.Blocks(title="Megalaa Coptic Fine-tuning") as demo:
374
- gr.Markdown("""
375
- # 🏛️ Megalaa Coptic Translation Fine-tuning
376
-
377
- Fine-tune the megalaa/coptic-english-translator model on your CopticScriptorium corpus.
378
-
379
- **⚙️ IMPORTANT:** Make sure this Space is running on **T4 Small GPU** for optimal performance!
380
- """)
381
-
382
- with gr.Row():
383
- with gr.Column():
384
- gr.Markdown("### 🔑 HuggingFace Hub Configuration")
385
- hf_token_input = gr.Textbox(
386
- label="HuggingFace Token",
387
- placeholder="hf_...",
388
- type="password",
389
- info="Get your token from https://huggingface.co/settings/tokens"
390
- )
391
- model_repo_input = gr.Textbox(
392
- label="Model Repository Name",
393
- placeholder="username/coptic-megalaa-finetuned",
394
- info="Example: john-doe/coptic-megalaa-finetuned"
395
- )
396
-
397
- gr.Markdown("### 📤 Upload Training Data")
398
- train_file_upload = gr.File(
399
- label="Training Data (train.jsonl)",
400
- file_types=[".jsonl"]
401
- )
402
- val_file_upload = gr.File(
403
- label="Validation Data (val.jsonl)",
404
- file_types=[".jsonl"]
405
- )
406
-
407
- gr.Markdown("### ⚙️ Training Parameters")
408
- num_epochs = gr.Slider(
409
- minimum=1,
410
- maximum=10,
411
- value=5,
412
- step=1,
413
- label="Number of Epochs"
414
- )
415
- batch_size = gr.Slider(
416
- minimum=4,
417
- maximum=16,
418
- value=8,
419
- step=4,
420
- label="Batch Size"
421
- )
422
- learning_rate = gr.Number(
423
- value=2e-5,
424
- label="Learning Rate"
425
- )
426
-
427
- start_btn = gr.Button("🚀 Start Training", variant="primary", size="lg")
428
- status_text = gr.Textbox(label="Status", value="⏸️ Ready to train")
429
-
430
- with gr.Column():
431
- gr.Markdown("### 📊 Training Log")
432
- log_output = gr.Textbox(
433
- label="Real-time Training Log",
434
- lines=30,
435
- max_lines=30,
436
- autoscroll=True,
437
- every=2
438
- )
439
-
440
- # Button actions
441
- start_btn.click(
442
- fn=start_training,
443
- inputs=[train_file_upload, val_file_upload, num_epochs, batch_size, learning_rate, hf_token_input, model_repo_input],
444
- outputs=status_text
445
- )
446
-
447
- # Auto-refresh log and status
448
- demo.load(fn=get_training_log, outputs=log_output, every=2)
449
- demo.load(fn=check_status, outputs=status_text, every=2)
450
-
451
- gr.Markdown("""
452
- ---
453
- ### 📥 After Training
454
-
455
- When training completes, your fine-tuned model will be automatically pushed to HuggingFace Hub!
456
-
457
- **Next steps:**
458
- 1. Visit your model repository at `https://huggingface.co/YOUR_USERNAME/MODEL_NAME`
459
- 2. Download and test with: `python evaluate_megalaa_model.py`
460
- 3. Integrate into your Coptic translation interface
461
- 4. Share your model with the community!
462
-
463
- **Estimated training time:** 6-8 hours on T4 GPU
464
-
465
- **Note:** The model is also saved temporarily to `coptic_megalaa_finetuned/final/` during training,
466
- but this local copy will be lost when the Space restarts. Use the HuggingFace Hub version!
467
- """)
468
-
469
- if __name__ == "__main__":
470
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
hf_space_megalaa_training/requirements.txt DELETED
@@ -1,11 +0,0 @@
1
- torch>=2.0.0
2
- transformers>=4.35.0
3
- datasets>=2.14.0
4
- accelerate>=0.24.0
5
- evaluate>=0.4.1
6
- sacrebleu>=2.3.1
7
- sentencepiece>=0.1.99
8
- protobuf>=3.20.0
9
- gradio>=4.44.0
10
- tensorboard>=2.15.0
11
- huggingface_hub>=0.20.0
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,10 +1,6 @@
1
- streamlit
2
- huggingface_hub
3
- lxml
4
- stanza
5
  torch>=2.0.0
6
  transformers>=4.35.0
7
  sentencepiece>=0.1.99
8
  protobuf>=3.20.0
9
  accelerate>=0.20.0
10
- pyswip>=0.2.10
 
1
+ gradio
 
 
 
2
  torch>=2.0.0
3
  transformers>=4.35.0
4
  sentencepiece>=0.1.99
5
  protobuf>=3.20.0
6
  accelerate>=0.20.0