jing-ju committed
Commit 3a03ca4 · verified · 1 Parent(s): 9e876fc

Update app.py

Files changed (1)
  1. app.py +111 -262
app.py CHANGED
@@ -1,273 +1,122 @@
  import gradio as gr
  import torch
- from transformers import (
-     AutoTokenizer,
-     AutoModelForSeq2SeqLM,
-     BitsAndBytesConfig
- )
- import logging
- import gc
- import psutil
- import os
-
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # Global variables
- tokenizer = None
- model = None
-
- def get_memory_usage():
-     """Get current memory usage"""
-     process = psutil.Process(os.getpid())
-     return process.memory_info().rss / 1024 / 1024 / 1024  # GB
-
- def load_model_optimized():
-     """Load model with maximum optimization for CPU"""
-     global tokenizer, model
-
-     if model is not None:
-         return model, tokenizer
-
-     model_name = "Tencent/Hunyuan-MT-7B-FS8"
-     logger.info(f"Loading {model_name} with optimizations...")
-     logger.info(f"Memory before loading: {get_memory_usage():.2f} GB")
-
-     try:
-         # Load tokenizer first
-         tokenizer = AutoTokenizer.from_pretrained(
-             model_name,
-             trust_remote_code=True
-         )
-
-         # Load model with aggressive optimizations
-         model = AutoModelForSeq2SeqLM.from_pretrained(
-             model_name,
-             torch_dtype=torch.float16,  # Half precision
-             device_map="cpu",
-             low_cpu_mem_usage=True,  # Reduce memory usage
-             trust_remote_code=True,
-             use_cache=False,  # Disable KV cache
-             offload_folder="./offload",  # Offload to disk if needed
-         )
-
-         # Additional optimizations
-         model.eval()  # Set to evaluation mode
-
-         # Enable torch optimizations
-         torch.set_num_threads(2)  # Limit threads
-
-         logger.info(f"Memory after loading: {get_memory_usage():.2f} GB")
-         logger.info("Model loaded successfully!")
-
-         return model, tokenizer
-
-     except Exception as e:
-         logger.error(f"Error loading model: {e}")
-         # Try fallback with 8-bit quantization
-         try:
-             logger.info("Trying 8-bit quantization...")
-             quantization_config = BitsAndBytesConfig(
-                 load_in_8bit=True,
-                 llm_int8_enable_fp32_cpu_offload=True
-             )
-
-             model = AutoModelForSeq2SeqLM.from_pretrained(
-                 model_name,
-                 quantization_config=quantization_config,
-                 device_map="auto",
-                 trust_remote_code=True,
-                 low_cpu_mem_usage=True
-             )
-
-             logger.info("8-bit model loaded!")
-             return model, tokenizer
-
-         except Exception as e2:
-             logger.error(f"8-bit loading also failed: {e2}")
-             raise e2
-
- def translate_text_optimized(
-     text: str,
-     source_lang: str = "auto",
-     target_lang: str = "en"
- ) -> str:
-     """Optimized translation function"""
-
-     if not text.strip():
-         return "Please enter text to translate"
-
-     # Memory cleanup before translation
-     gc.collect()
-     torch.cuda.empty_cache() if torch.cuda.is_available() else None
-
-     try:
-         model, tokenizer = load_model_optimized()
-
-         # Format input
-         if source_lang == "auto":
-             input_text = f"Translate to {target_lang}: {text}"
          else:
-             input_text = f"Translate from {source_lang} to {target_lang}: {text}"
-
-         logger.info(f"Translating: {input_text[:50]}...")
-         start_memory = get_memory_usage()
-
-         # Tokenize with truncation
-         inputs = tokenizer(
-             input_text,
-             return_tensors="pt",
-             max_length=512,  # Limit input length
-             truncation=True,
-             padding=False  # No padding for single input
-         )
-
-         # Generate with minimal settings
-         with torch.no_grad():
-             outputs = model.generate(
-                 **inputs,
-                 max_new_tokens=256,  # Limit output length
-                 min_length=1,
-                 num_beams=2,  # Reduce beams for speed
-                 early_stopping=True,
-                 do_sample=False,
-                 pad_token_id=tokenizer.pad_token_id,
-                 eos_token_id=tokenizer.eos_token_id,
-                 use_cache=False  # Disable cache
-             )
-
-         # Decode output
-         translated_text = tokenizer.decode(
-             outputs[0],
-             skip_special_tokens=True
-         )
-
-         # Clean output
-         if ":" in translated_text:
-             translated_text = translated_text.split(":", 1)[-1].strip()
-
-         # Memory cleanup after translation
-         del inputs, outputs
-         gc.collect()
-
-         end_memory = get_memory_usage()
-         logger.info(f"Translation completed. Memory: {start_memory:.2f}GB -> {end_memory:.2f}GB")
-
-         return translated_text
-
-     except Exception as e:
-         logger.error(f"Translation error: {e}")
-         gc.collect()  # Cleanup on error
-         return f"Translation failed: {str(e)}"
-
- # Language mapping
- LANGUAGES = {
-     "auto": "Auto Detect",
-     "en": "English",
-     "zh": "Chinese",
-     "vi": "Vietnamese",
-     "ja": "Japanese",
-     "ko": "Korean",
-     "th": "Thai",
-     "id": "Indonesian",
-     "ms": "Malay",
-     "fil": "Filipino"
- }
-
- # Create Gradio interface
- with gr.Blocks(
-     title="Hunyuan-MT Translation (CPU Optimized)",
-     theme=gr.themes.Monochrome(),
- ) as demo:
-
-     gr.HTML("""
-     <div style="text-align: center; margin: 20px;">
-         <h1>🧠 Hunyuan-MT-7B Translation</h1>
-         <p><strong>CPU Optimized Version</strong></p>
-         <p><em>⚠️ First translation may take 1-2 minutes to load model</em></p>
-     </div>
-     """)
-
-     with gr.Row():
-         with gr.Column():
-             input_text = gr.Textbox(
-                 label="Input Text",
-                 placeholder="Enter text to translate (max 200 words for best performance)...",
-                 lines=4,
-                 max_lines=8
-             )
-
-             with gr.Row():
-                 source_lang = gr.Dropdown(
-                     choices=list(LANGUAGES.items()),
-                     label="From",
-                     value="auto"
-                 )
-                 target_lang = gr.Dropdown(
-                     choices=[(k, v) for k, v in LANGUAGES.items() if k != "auto"],
-                     label="To",
-                     value="en"
-                 )
-
-             translate_btn = gr.Button(
-                 "🔄 Translate",
-                 variant="primary",
-                 size="lg"
-             )
-
-         with gr.Column():
-             output_text = gr.Textbox(
-                 label="Translation",
-                 lines=4,
-                 max_lines=8,
-                 interactive=False
-             )
-
-             memory_display = gr.Textbox(
-                 label="System Status",
-                 value="Ready",
-                 interactive=False
-             )
-
-     # Memory monitoring
-     def update_memory():
-         return f"Memory: {get_memory_usage():.1f}GB / 16GB"
-
-     def translate_with_status(text, src, tgt):
-         if len(text.split()) > 100:  # Limit word count
-             return "Please limit input to 100 words for optimal performance", update_memory()
-
-         result = translate_text_optimized(text, src, tgt)
-         return result, update_memory()
-
-     # Examples for testing
-     gr.Examples(
-         examples=[
-             ["Hello, how are you?", "en", "vi"],
-             ["Xin chào", "vi", "en"],
-             ["Good morning", "en", "zh"],
-             ["Thank you very much", "en", "ja"],
-         ],
-         inputs=[input_text, source_lang, target_lang],
-         outputs=[output_text, memory_display],
-         fn=translate_with_status
-     )
-
-     translate_btn.click(
-         fn=translate_with_status,
-         inputs=[input_text, source_lang, target_lang],
-         outputs=[output_text, memory_display]
-     )
-
-     # Auto-update memory display
-     demo.load(fn=update_memory, outputs=memory_display)
-
- # Launch with specific settings for HF Spaces
- if __name__ == "__main__":
-     demo.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=False,
-         show_api=True,
-         enable_monitoring=False  # Disable to save resources
-     )
+ # app.py — HF Spaces Free (CPU), Hunyuan-MT 7B-fp8, multilingual, chunked translation, UI + API
+ import os, re
+ from typing import List, Optional
+
  import gradio as gr
  import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ # ===== Configuration =====
+ DEFAULT_MODEL = "tencent/Hunyuan-MT-7B-fp8"  # override via the MODEL_NAME env var if desired
+ MODEL_NAME = os.getenv("MODEL_NAME", DEFAULT_MODEL)
+
+ GEN_KW = dict(  # lightweight generation parameters for CPU
+     max_new_tokens=256,
+     top_k=20,
+     top_p=0.6,
+     repetition_penalty=1.05,
+     temperature=0.7,
+     do_sample=True,
+ )
+
+ MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "800"))  # input-token limit per chunk
+
+ # ===== Load tokenizer & model (fp8 via a quantization_config dict) =====
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+ quant_cfg = {"quantization_method": "fp8", "ignore": []}  # avoids an ignore=None error
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_NAME,
+     trust_remote_code=True,
+     quantization_config=quant_cfg,
+ )
+ DEVICE = getattr(model, "device", torch.device("cpu"))
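+ # Fallback sketch (an assumption, not exercised in this commit): if the installed
+ # transformers rejects the plain fp8 dict above, an unquantized CPU load should
+ # still work, at a higher memory cost:
+ #     model = AutoModelForCausalLM.from_pretrained(
+ #         MODEL_NAME, trust_remote_code=True, torch_dtype=torch.float32)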
 
+ # ===== Normalize language names =====
+ LANG_ALIASES = {
+     "vi": "Vietnamese", "vie": "Vietnamese", "vietnamese": "Vietnamese", "tiếng việt": "Vietnamese",
+     "zh": "Chinese", "chi": "Chinese", "zho": "Chinese", "chinese": "Chinese", "tiếng trung": "Chinese", "hán ngữ": "Chinese", "mandarin": "Chinese",
+     "en": "English", "eng": "English", "tiếng anh": "English", "english": "English",
+     "ja": "Japanese", "jpn": "Japanese", "tiếng nhật": "Japanese", "japanese": "Japanese",
+     "ko": "Korean", "kor": "Korean", "tiếng hàn": "Korean", "korean": "Korean",
+     "fr": "French", "fra": "French", "fre": "French", "tiếng pháp": "French", "french": "French",
+     "de": "German", "deu": "German", "ger": "German", "tiếng đức": "German", "german": "German",
+     "es": "Spanish", "spa": "Spanish", "tiếng tây ban nha": "Spanish", "spanish": "Spanish",
+     "th": "Thai", "tha": "Thai", "tiếng thái": "Thai", "thai": "Thai",
+     "id": "Indonesian", "ind": "Indonesian", "tiếng indonesia": "Indonesian", "indonesian": "Indonesian",
+     "ms": "Malay", "msa": "Malay", "tiếng malaysia": "Malay", "malay": "Malay",
+     "pt": "Portuguese", "por": "Portuguese", "tiếng bồ đào nha": "Portuguese", "portuguese": "Portuguese",
+     "ru": "Russian", "rus": "Russian", "tiếng nga": "Russian", "russian": "Russian",
+ }
+ LANG_CHOICES = sorted(set(LANG_ALIASES.values()))
+
+ def norm_lang(s: Optional[str]) -> Optional[str]:
+     if not s:
+         return None
+     k = s.strip().lower()
+     return LANG_ALIASES.get(k, s.strip())
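+ # Illustrative behavior: norm_lang("vi") and norm_lang("Tiếng Việt") both return
+ # "Vietnamese"; unknown names pass through stripped, e.g. norm_lang(" Czech ") -> "Czech".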
 
+ # ===== Split text into token-bounded chunks =====
+ def chunk_by_tokens(text: str, max_tokens: int) -> List[str]:
+     text = text.strip()
+     if not text:
+         return []
+     # Rough sentence split on Western and CJK sentence terminators
+     rough = re.split(r"(?<=[\.!?。!?])\s+", text)
+     chunks, buf = [], ""
+     def tok_len(s: str) -> int:
+         # Count input_ids directly; return_length=True yields a list, not an int
+         return len(tokenizer(s, add_special_tokens=False)["input_ids"])
+     for part in rough:
+         cand = (buf + " " + part).strip() if buf else part
+         if tok_len(cand) <= max_tokens:
+             buf = cand
          else:
+             if buf:
+                 chunks.append(buf)
+                 buf = ""
+             if tok_len(part) <= max_tokens:
+                 buf = part
+             else:
+                 # A single oversized sentence: hard-split on raw token ids
+                 ids = tokenizer(part, add_special_tokens=False)["input_ids"]
+                 for i in range(0, len(ids), max_tokens):
+                     piece = tokenizer.decode(ids[i:i+max_tokens], skip_special_tokens=True)
+                     if piece.strip():
+                         chunks.append(piece.strip())
+     if buf:
+         chunks.append(buf)
+     return [c for c in chunks if c.strip()]
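+ # Illustrative: a short input stays a single chunk, e.g.
+ # chunk_by_tokens("One. Two. Three.", 800) -> ["One. Two. Three."];
+ # only sentences that exceed the token budget are hard-split.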
 
+ # ===== Core translation (chat template) =====
+ @torch.inference_mode()
+ def translate_text(text: str, target_lang: str, source_lang: Optional[str] = None) -> str:
+     tgt = norm_lang(target_lang) or "Vietnamese"
+     src = norm_lang(source_lang)
+     sys_prompt = (f"Translate the following segment from {src} into {tgt}, without additional explanation."
+                   if src else
+                   f"Translate the following segment into {tgt}, without additional explanation.")
+     outs = []
+     for piece in chunk_by_tokens(text, MAX_INPUT_TOKENS):
+         msgs = [{"role": "user", "content": f"{sys_prompt}\n\n{piece}"}]
+         inputs = tokenizer.apply_chat_template(msgs, tokenize=True, add_generation_prompt=False, return_tensors="pt")
+         out_ids = model.generate(inputs.to(DEVICE), **GEN_KW)
+         # Decode only the newly generated tokens, not the echoed prompt
+         new_ids = out_ids[0][inputs.shape[-1]:]
+         outs.append(tokenizer.decode(new_ids, skip_special_tokens=True).strip())
+     return "\n".join(outs).strip()
+
+ def translate_batch(texts: List[str], target_lang: str, source_lang: Optional[str] = None) -> List[str]:
+     return [translate_text(t, target_lang, source_lang) for t in texts]
+
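+ # Illustrative: translate_text("Hello world", "vi") normalizes "vi" to
+ # "Vietnamese", prompts the model once per chunk, and joins the chunk
+ # translations with newlines.
+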
+ # ===== Gradio UI + API =====
+ with gr.Blocks() as demo:
+     gr.Markdown("## Hunyuan-MT 7B-fp8 — Multilingual Translation (HF Free CPU)\nToken-based chunking, UI + API (Gradio).")
+
+     with gr.Tab("Single"):
+         src = gr.Textbox(label="Source text", lines=10, placeholder="Paste the text to translate…")
+         with gr.Row():
+             src_lang = gr.Textbox(label="Source language (optional)", placeholder="e.g. Vietnamese/Chinese/English…")
+             tgt_lang = gr.Dropdown(label="Target language", choices=LANG_CHOICES, value="Vietnamese")
+         out = gr.Textbox(label="Translation", lines=10)
+         gr.Button("Translate").click(translate_text, inputs=[src, tgt_lang, src_lang], outputs=out, api_name="translate_text")
+
+     with gr.Tab("Batch"):
+         src_list = gr.Textbox(label="One sentence/paragraph per line", lines=10)
+         with gr.Row():
+             src_lang_b = gr.Textbox(label="Source language (optional)")
+             tgt_lang_b = gr.Dropdown(label="Target language", choices=LANG_CHOICES, value="Vietnamese")
+         out_list = gr.Textbox(label="Results (one line per input)", lines=10)
+         def _batch(txts_raw: str, tgt: str, src_: Optional[str]):
+             texts = [x for x in txts_raw.splitlines() if x.strip()]
+             return "\n".join(translate_batch(texts, tgt, src_))
+         gr.Button("Translate batch").click(_batch, inputs=[src_list, tgt_lang_b, src_lang_b], outputs=out_list, api_name="translate_batch")
+
+ demo.queue(default_concurrency_limit=1, max_size=2).launch()  # one worker, small queue (Gradio 4+ naming)
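+
+ # The named endpoints are also callable remotely with gradio_client
+ # (a sketch; the Space id below is a placeholder):
+ #     from gradio_client import Client
+ #     client = Client("jing-ju/SPACE_NAME")
+ #     client.predict("Hello world", "Vietnamese", "", api_name="/translate_text")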