st192011 committed on
Commit
b8da437
·
verified ·
1 Parent(s): 34b244c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -23
app.py CHANGED
@@ -2,27 +2,38 @@ import gradio as gr
2
  import torch
3
  from transformers import AutoModelForCausalLM, AutoTokenizer
4
 
5
- # --- CONFIGURATION ---
6
  MODELS_CONFIG = {
7
  "Phase 2: Stable (Formal)": {
8
  "id": "st192011/Maltese-EuroLLM-1.7B-Phase2-Stable",
9
- "description": "The 'Bureaucrat Bot'. Optimized for formal precision.",
 
 
 
 
 
10
  "chrf": "60.18",
11
  "comet": "0.6431"
12
  },
13
  "Phase 4: Anchored (Native)": {
14
  "id": "st192011/Maltese-EuroLLM-1.7B-Phase4-Anchored",
15
- "description": "The 'Native Speaker'. Optimized for cultural awareness and logic.",
 
 
 
 
 
 
16
  "chrf": "52.68",
17
  "comet": "0.6567"
18
  }
19
  }
20
 
21
- # --- MODEL LOADING ---
22
- # We load them globally so they stay in memory (this requires ~14GB RAM total)
23
- print("Loading models to CPU... this may take a few minutes.")
24
 
25
- # Load Model 2
26
  tokenizer_p2 = AutoTokenizer.from_pretrained(MODELS_CONFIG["Phase 2: Stable (Formal)"]["id"])
27
  model_p2 = AutoModelForCausalLM.from_pretrained(
28
  MODELS_CONFIG["Phase 2: Stable (Formal)"]["id"],
@@ -30,7 +41,7 @@ model_p2 = AutoModelForCausalLM.from_pretrained(
30
  torch_dtype=torch.float32
31
  )
32
 
33
- # Load Model 4
34
  tokenizer_p4 = AutoTokenizer.from_pretrained(MODELS_CONFIG["Phase 4: Anchored (Native)"]["id"])
35
  model_p4 = AutoModelForCausalLM.from_pretrained(
36
  MODELS_CONFIG["Phase 4: Anchored (Native)"]["id"],
@@ -39,6 +50,10 @@ model_p4 = AutoModelForCausalLM.from_pretrained(
39
  )
40
 
41
  def local_translate(model, tokenizer, text, temp):
 
 
 
 
42
  prompt = f"### INGLIŻ: {text}\n### MALTI:"
43
  inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
44
 
@@ -52,19 +67,21 @@ def local_translate(model, tokenizer, text, temp):
52
  pad_token_id=tokenizer.eos_token_id
53
  )
54
 
55
- # Decode only the new tokens
56
  full_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
57
- # Extract the part after ### MALTI:
58
- maltese_text = full_text.split("### MALTI:")[-1].strip()
 
 
 
 
 
59
  return maltese_text
60
 
61
  def translate_logic(text, selected_models, temp):
62
  out_p2 = "Model not selected."
63
  out_p4 = "Model not selected."
64
 
65
- if not text.strip():
66
- return "Please enter text.", "Please enter text."
67
-
68
  if "Phase 2: Stable (Formal)" in selected_models:
69
  try:
70
  out_p2 = local_translate(model_p2, tokenizer_p2, text, temp)
@@ -81,30 +98,42 @@ def translate_logic(text, selected_models, temp):
81
 
82
  # --- GRADIO UI ---
83
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
84
- gr.Markdown("# 🇲🇹 Maltese-MT Lab (Local CPU)")
85
- gr.Markdown("Comparing English-to-Maltese EuroLLM models running directly on this machine.")
86
 
87
  with gr.Row():
88
  with gr.Column(scale=2):
89
- input_text = gr.Textbox(label="English Source Text", placeholder="Enter English text here...", lines=4)
 
 
 
 
90
  model_selector = gr.CheckboxGroup(
91
  choices=list(MODELS_CONFIG.keys()),
92
  value=list(MODELS_CONFIG.keys()),
93
  label="Select Models to Compare"
94
  )
95
- temp_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1, label="Creativity (Temperature)")
 
 
 
 
 
 
96
  btn = gr.Button("🚀 Run Translation", variant="primary")
97
 
98
  with gr.Row():
99
  with gr.Column():
100
- gr.Markdown("### Phase 2: Stable")
101
  p2_out = gr.Textbox(label="Output", interactive=False, lines=5)
102
- gr.Markdown(f"**ChrF++:** `{MODELS_CONFIG['Phase 2: Stable (Formal)']['chrf']}` | **COMET:** `{MODELS_CONFIG['Phase 2: Stable (Formal)']['comet']}`")
 
103
 
104
  with gr.Column():
105
- gr.Markdown("### Phase 4: Anchored")
106
  p4_out = gr.Textbox(label="Output", interactive=False, lines=5)
107
- gr.Markdown(f"**ChrF++:** `{MODELS_CONFIG['Phase 4: Anchored (Native)']['chrf']}` | **COMET:** `{MODELS_CONFIG['Phase 4: Anchored (Native)']['comet']}`")
 
108
 
109
  gr.Examples(
110
  examples=[
@@ -121,4 +150,5 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
121
  outputs=[p2_out, p4_out]
122
  )
123
 
124
- demo.launch()
 
 
2
  import torch
3
  from transformers import AutoModelForCausalLM, AutoTokenizer
4
 
5
+ # --- MODEL DATA (Original Detailed Descriptions) ---
6
  MODELS_CONFIG = {
7
  "Phase 2: Stable (Formal)": {
8
  "id": "st192011/Maltese-EuroLLM-1.7B-Phase2-Stable",
9
+ "description": (
10
+ "The 'Bureaucrat Bot'. Built upon a foundational adaptation phase that mixed "
11
+ "monolingual Maltese and Italian to bridge morphological roots. This version "
12
+ "was fine-tuned on high-fidelity EU and governmental parallel corpora, "
13
+ "optimizing it for extreme formal precision and administrative accuracy."
14
+ ),
15
  "chrf": "60.18",
16
  "comet": "0.6431"
17
  },
18
  "Phase 4: Anchored (Native)": {
19
  "id": "st192011/Maltese-EuroLLM-1.7B-Phase4-Anchored",
20
+ "description": (
21
+ "The 'Native Speaker'. An evolution of Phase 2 utilizing a curriculum-based "
22
+ "'Full Circle' approach. It integrates synthesized reasoning chains (CoT) "
23
+ "that allow the model to process linguistic logic before translating. By mixing "
24
+ "all previous data types, it anchors factual accuracy to native-level phrasing "
25
+ "and cultural awareness."
26
+ ),
27
  "chrf": "52.68",
28
  "comet": "0.6567"
29
  }
30
  }
31
 
32
+ # --- MODEL LOADING (Local CPU) ---
33
+ # Note: Loading two 1.7B models in float32 takes ~14GB of RAM in total.
34
+ print("Loading models to CPU... Please wait.")
35
 
36
+ # Load Model Phase 2
37
  tokenizer_p2 = AutoTokenizer.from_pretrained(MODELS_CONFIG["Phase 2: Stable (Formal)"]["id"])
38
  model_p2 = AutoModelForCausalLM.from_pretrained(
39
  MODELS_CONFIG["Phase 2: Stable (Formal)"]["id"],
 
41
  torch_dtype=torch.float32
42
  )
43
 
44
+ # Load Model Phase 4
45
  tokenizer_p4 = AutoTokenizer.from_pretrained(MODELS_CONFIG["Phase 4: Anchored (Native)"]["id"])
46
  model_p4 = AutoModelForCausalLM.from_pretrained(
47
  MODELS_CONFIG["Phase 4: Anchored (Native)"]["id"],
 
50
  )
51
 
52
  def local_translate(model, tokenizer, text, temp):
53
+ if not text.strip():
54
+ return ""
55
+
56
+ # Prompt format consistent with training
57
  prompt = f"### INGLIŻ: {text}\n### MALTI:"
58
  inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
59
 
 
67
  pad_token_id=tokenizer.eos_token_id
68
  )
69
 
70
+ # skip_special_tokens=True strips special tokens (e.g. end-of-sequence markers) from the decoded text
71
  full_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
72
+
73
+ # Extract only the Maltese translation part (the text after the prompt)
74
+ if "### MALTI:" in full_text:
75
+ maltese_text = full_text.split("### MALTI:")[-1].strip()
76
+ else:
77
+ maltese_text = full_text.strip()
78
+
79
  return maltese_text
80
 
81
  def translate_logic(text, selected_models, temp):
82
  out_p2 = "Model not selected."
83
  out_p4 = "Model not selected."
84
 
 
 
 
85
  if "Phase 2: Stable (Formal)" in selected_models:
86
  try:
87
  out_p2 = local_translate(model_p2, tokenizer_p2, text, temp)
 
98
 
99
  # --- GRADIO UI ---
100
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
101
+ gr.Markdown("# 🇲🇹 Maltese-MT Lab")
102
+ gr.Markdown("Compare English-to-Maltese EuroLLM models running locally on CPU.")
103
 
104
  with gr.Row():
105
  with gr.Column(scale=2):
106
+ input_text = gr.Textbox(
107
+ label="English Source Text",
108
+ placeholder="Enter English text here...",
109
+ lines=4
110
+ )
111
  model_selector = gr.CheckboxGroup(
112
  choices=list(MODELS_CONFIG.keys()),
113
  value=list(MODELS_CONFIG.keys()),
114
  label="Select Models to Compare"
115
  )
116
+ temp_slider = gr.Slider(
117
+ minimum=0.1,
118
+ maximum=1.0,
119
+ value=0.1,
120
+ step=0.1,
121
+ label="Creativity (Temperature)"
122
+ )
123
  btn = gr.Button("🚀 Run Translation", variant="primary")
124
 
125
  with gr.Row():
126
  with gr.Column():
127
+ gr.Markdown("### Phase 2: Stable (Formal)")
128
  p2_out = gr.Textbox(label="Output", interactive=False, lines=5)
129
+ gr.Markdown(f"**Training Strategy:**\n{MODELS_CONFIG['Phase 2: Stable (Formal)']['description']}")
130
+ gr.Markdown(f"**Metrics:** ChrF++: `{MODELS_CONFIG['Phase 2: Stable (Formal)']['chrf']}` | COMET: `{MODELS_CONFIG['Phase 2: Stable (Formal)']['comet']}`")
131
 
132
  with gr.Column():
133
+ gr.Markdown("### Phase 4: Anchored (Native)")
134
  p4_out = gr.Textbox(label="Output", interactive=False, lines=5)
135
+ gr.Markdown(f"**Training Strategy:**\n{MODELS_CONFIG['Phase 4: Anchored (Native)']['description']}")
136
+ gr.Markdown(f"**Metrics:** ChrF++: `{MODELS_CONFIG['Phase 4: Anchored (Native)']['chrf']}` | COMET: `{MODELS_CONFIG['Phase 4: Anchored (Native)']['comet']}`")
137
 
138
  gr.Examples(
139
  examples=[
 
150
  outputs=[p2_out, p4_out]
151
  )
152
 
153
+ if __name__ == "__main__":
154
+ demo.launch()