jing-ju committed
Commit 6bb64e2 · verified · 1 Parent(s): 0fdedb6

Update app.py

Files changed (1)
  1. app.py +232 -349
app.py CHANGED
@@ -1,390 +1,273 @@
- import os
- import torch
- import re
- from transformers import AutoTokenizer, AutoModelForCausalLM
  import gradio as gr
-
- # Environment variables
- MODEL_NAME = os.getenv("MODEL_NAME", "tencent/Hunyuan-MT-7B-fp8")
- MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "800"))
-
- # Generation parameters optimized for CPU
- GEN_KW = dict(
-     max_new_tokens=256,
-     top_k=20,
-     top_p=0.6,
-     repetition_penalty=1.05,
-     temperature=0.7,
-     do_sample=True,
  )

- # Language mapping for normalization
- LANGUAGE_MAPPING = {
-     "vi": "Vietnamese",
-     "vietnamese": "Vietnamese",
-     "tiếng việt": "Vietnamese",
-     "zh": "Chinese",
-     "chinese": "Chinese",
-     "tiếng trung": "Chinese",
-     "中文": "Chinese",
-     "en": "English",
-     "english": "English",
-     "tiếng anh": "English",
-     "ja": "Japanese",
-     "japanese": "Japanese",
-     "tiếng nhật": "Japanese",
-     "日本語": "Japanese",
-     "ko": "Korean",
-     "korean": "Korean",
-     "tiếng hàn": "Korean",
-     "한국어": "Korean",
-     "fr": "French",
-     "french": "French",
-     "tiếng pháp": "French",
-     "de": "German",
-     "german": "German",
-     "tiếng đức": "German",
-     "es": "Spanish",
-     "spanish": "Spanish",
-     "tiếng tây ban nha": "Spanish",
-     "th": "Thai",
-     "thai": "Thai",
-     "tiếng thái": "Thai",
-     "id": "Indonesian",
-     "indonesian": "Indonesian",
-     "tiếng indonesia": "Indonesian",
-     "ms": "Malay",
-     "malay": "Malay",
-     "tiếng malaysia": "Malay",
-     "pt": "Portuguese",
-     "portuguese": "Portuguese",
-     "tiếng bồ đào nha": "Portuguese",
-     "ru": "Russian",
-     "russian": "Russian",
-     "tiếng nga": "Russian",
- }
-
- SUPPORTED_LANGUAGES = [
-     "Vietnamese", "Chinese", "English", "Japanese", "Korean",
-     "French", "German", "Spanish", "Thai", "Indonesian",
-     "Malay", "Portuguese", "Russian"
- ]
-
- def normalize_language(lang):
-     """Normalize language name"""
-     if not lang:
-         return None
-     lang_lower = lang.strip().lower()
-     return LANGUAGE_MAPPING.get(lang_lower, lang.strip())
-
- def load_model():
-     """Load model and tokenizer with fp8 quantization config"""
-     print(f"Loading model: {MODEL_NAME}")
-
-     # Load tokenizer
-     tokenizer = AutoTokenizer.from_pretrained(
-         MODEL_NAME,
-         trust_remote_code=True
-     )
-
-     # Create quantization config for fp8 - must use the actual class
      try:
-         from compressed_tensors import CompressedTensorsConfig
-         quantization_config = CompressedTensorsConfig(
-             quantization_method="fp8",
-             ignore=[]
          )
-         print("Using CompressedTensorsConfig")
-     except ImportError:
          try:
-             from transformers.quantizers import CompressedTensorsQuantizationConfig
-             quantization_config = CompressedTensorsQuantizationConfig(
-                 quantization_method="fp8",
-                 ignore=[]
              )
-             print("Using CompressedTensorsQuantizationConfig")
-         except ImportError:
-             # If both fail, load without custom quantization config
-             print("Loading model without custom quantization config")
-             quantization_config = None
-
-     # Load model with quantization config
-     model_kwargs = {
-         "trust_remote_code": True,
-         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
-     }
-
-     if quantization_config is not None:
-         model_kwargs["quantization_config"] = quantization_config
-
-     model = AutoModelForCausalLM.from_pretrained(
-         MODEL_NAME,
-         **model_kwargs
-     )
-
-     return tokenizer, model
-
- def chunk_text_by_tokens(text, tokenizer, max_tokens):
-     """Split text into chunks based on token count"""
-     if not text.strip():
-         return []
-
-     # First, try splitting by sentence delimiters
-     sentences = re.split(r'[.!?。!?]', text)
-     chunks = []
-     current_chunk = ""
-
-     for sentence in sentences:
-         sentence = sentence.strip()
-         if not sentence:
-             continue
-
-         test_chunk = current_chunk + " " + sentence if current_chunk else sentence
-
-         # Estimate token length
-         try:
-             token_count = len(tokenizer.encode(test_chunk, add_special_tokens=False))
-         except:
-             token_count = len(test_chunk.split()) * 1.3  # rough estimation
-
-         if token_count <= max_tokens:
-             current_chunk = test_chunk
-         else:
-             if current_chunk:
-                 chunks.append(current_chunk.strip())
-
-             # If single sentence is too long, split it forcefully
-             try:
-                 sentence_tokens = tokenizer.encode(sentence, add_special_tokens=False)
-                 if len(sentence_tokens) > max_tokens:
-                     for i in range(0, len(sentence_tokens), max_tokens):
-                         chunk_tokens = sentence_tokens[i:i + max_tokens]
-                         chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
-                         chunks.append(chunk_text)
-                     current_chunk = ""
-                 else:
-                     current_chunk = sentence
-             except:
-                 current_chunk = sentence
-
-     if current_chunk:
-         chunks.append(current_chunk.strip())
-
-     return chunks
-
- def translate_text_chunk(text, target_lang, source_lang, tokenizer, model):
-     """Translate a single chunk of text"""
-     target_lang = normalize_language(target_lang)
-     source_lang = normalize_language(source_lang) if source_lang else None
-
-     if not target_lang:
-         return "Error: Invalid target language"
-
-     # Create prompt
-     if source_lang:
-         prompt = f"Translate the following segment from {source_lang} into {target_lang}, without additional explanation.\n\n{text}"
-     else:
-         prompt = f"Translate the following segment into {target_lang}, without additional explanation.\n\n{text}"
-
-     # Apply chat template
      try:
-         messages = [{"role": "user", "content": prompt}]
-         input_text = tokenizer.apply_chat_template(
-             messages,
-             tokenize=False,
-             add_generation_prompt=True
          )
-     except:
-         # Fallback if chat template fails
-         input_text = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-
-     # Tokenize
-     inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
-
-     # Generate
-     with torch.no_grad():
-         outputs = model.generate(
-             **inputs,
-             **GEN_KW,
-             pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else tokenizer.pad_token_id
          )
-
-     # Decode
-     response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
-     return response.strip()
-
- def translate_single(text, target_lang, source_lang, tokenizer, model):
-     """Translate text with automatic chunking"""
-     if not text.strip():
-         return "Please enter text to translate."
-
-     if not target_lang:
-         return "Please select a target language."
-
-     try:
-         # Split into chunks
-         chunks = chunk_text_by_tokens(text, tokenizer, MAX_INPUT_TOKENS)
-
-         if not chunks:
-             return "No valid text to translate."
-
-         # Translate each chunk
-         translations = []
-         for chunk in chunks:
-             translation = translate_text_chunk(chunk, target_lang, source_lang, tokenizer, model)
-             translations.append(translation)
-
-         return " ".join(translations)
-
-     except Exception as e:
-         return f"Translation error: {str(e)}"
-
- def translate_batch(text_lines, target_lang, source_lang, tokenizer, model):
-     """Translate multiple lines of text"""
-     if not text_lines.strip():
-         return "Please enter text lines to translate."
-
-     if not target_lang:
-         return "Please select a target language."
-
-     lines = [line.strip() for line in text_lines.split('\n') if line.strip()]
-
-     if not lines:
-         return "No valid text lines to translate."
-
-     try:
-         results = []
-         for line in lines:
-             translation = translate_single(line, target_lang, source_lang, tokenizer, model)
-             results.append(translation)
-
-         return '\n'.join(results)
-
      except Exception as e:
-         return f"Batch translation error: {str(e)}"
-
- # Load model and tokenizer
- print("Initializing model...")
- try:
-     tokenizer, model = load_model()
-     device = model.device
-     print(f"Model loaded successfully on device: {device}")
- except Exception as e:
-     print(f"Error loading model: {e}")
-     # Create dummy functions for interface
-     tokenizer = None
-     model = None
-
-     def dummy_translate(text, target_lang, source_lang):
-         return f"Model loading failed: {e}"
-
-     translate_single = dummy_translate
-     translate_batch = lambda text_lines, target_lang, source_lang, *args: dummy_translate(text_lines, target_lang, source_lang)

  # Create Gradio interface
- with gr.Blocks(title="Hunyuan-MT Multi-language Translation") as demo:
-     gr.Markdown("# 🌍 Hunyuan-MT Multi-language Translation")
-     gr.Markdown(f"**Model**: {MODEL_NAME}")
-     gr.Markdown("⚠️ **Note**: Running on Free CPU - translation may be slow and length is limited.")
-
-     with gr.Tabs():
-         with gr.TabItem("Single Translation"):
-             with gr.Row():
-                 with gr.Column():
-                     input_text = gr.Textbox(
-                         label="Text to translate",
-                         placeholder="Enter your text here...",
-                         lines=5
-                     )
-                     target_lang = gr.Dropdown(
-                         choices=SUPPORTED_LANGUAGES,
-                         label="Target Language",
-                         value="Vietnamese"
-                     )
-                     source_lang = gr.Textbox(
-                         label="Source Language (optional)",
-                         placeholder="Leave empty for auto-detection"
-                     )
-                     translate_btn = gr.Button("Translate", variant="primary")
-
-                 with gr.Column():
-                     output_text = gr.Textbox(
-                         label="Translation",
-                         lines=5,
-                         interactive=False
-                     )
-
-             if tokenizer and model:
-                 translate_btn.click(
-                     fn=lambda text, tgt, src: translate_single(text, tgt, src, tokenizer, model),
-                     inputs=[input_text, target_lang, source_lang],
-                     outputs=output_text,
-                     api_name="translate_text"
                  )
-             else:
-                 translate_btn.click(
-                     fn=lambda text, tgt, src: translate_single(text, tgt, src),
-                     inputs=[input_text, target_lang, source_lang],
-                     outputs=output_text,
-                     api_name="translate_text"
                  )
-
-         with gr.TabItem("Batch Translation"):
-             with gr.Row():
-                 with gr.Column():
-                     batch_input = gr.Textbox(
-                         label="Text lines to translate (one per line)",
-                         placeholder="Line 1\nLine 2\nLine 3...",
-                         lines=8
-                     )
-                     batch_target_lang = gr.Dropdown(
-                         choices=SUPPORTED_LANGUAGES,
-                         label="Target Language",
-                         value="Vietnamese"
-                     )
-                     batch_source_lang = gr.Textbox(
-                         label="Source Language (optional)",
-                         placeholder="Leave empty for auto-detection"
-                     )
-                     batch_translate_btn = gr.Button("Translate Batch", variant="primary")
-
-                 with gr.Column():
-                     batch_output = gr.Textbox(
-                         label="Batch Translation Results",
-                         lines=8,
-                         interactive=False
-                     )
-
-             if tokenizer and model:
-                 batch_translate_btn.click(
-                     fn=lambda text, tgt, src: translate_batch(text, tgt, src, tokenizer, model),
-                     inputs=[batch_input, batch_target_lang, batch_source_lang],
-                     outputs=batch_output,
-                     api_name="translate_batch"
-                 )
-             else:
-                 batch_translate_btn.click(
-                     fn=lambda text, tgt, src: translate_batch(text, tgt, src),
-                     inputs=[batch_input, batch_target_lang, batch_source_lang],
-                     outputs=batch_output,
-                     api_name="translate_batch"
-                 )
-
-     gr.Markdown("### API Usage")
-     gr.Markdown("""
-     ```python
-     from gradio_client import Client
-
-     client = Client("YOUR_SPACE_URL")
-
-     # Single translation
-     result = client.predict("你好", "Vietnamese", None, api_name="/translate_text")
-
-     # Batch translation
-     result = client.predict("你好\\n再见", "Vietnamese", None, api_name="/translate_batch")
-     ```
-     """)
-
- # Launch the app
  if __name__ == "__main__":
-     demo.queue(concurrency_count=1, max_size=2).launch()
  import gradio as gr
+ import torch
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForSeq2SeqLM,
+     BitsAndBytesConfig
  )
+ import logging
+ import gc
+ import psutil
+ import os
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Global variables
+ tokenizer = None
+ model = None
+
+ def get_memory_usage():
+     """Get current memory usage"""
+     process = psutil.Process(os.getpid())
+     return process.memory_info().rss / 1024 / 1024 / 1024  # GB
+
+ def load_model_optimized():
+     """Load model with maximum optimization for CPU"""
+     global tokenizer, model
+
+     if model is not None:
+         return model, tokenizer
+
+     model_name = "tencent/Hunyuan-MT-7B-fp8"
+     logger.info(f"Loading {model_name} with optimizations...")
+     logger.info(f"Memory before loading: {get_memory_usage():.2f} GB")
+
      try:
+         # Load tokenizer first
+         tokenizer = AutoTokenizer.from_pretrained(
+             model_name,
+             trust_remote_code=True
          )
+
+         # Load model with aggressive optimizations
+         model = AutoModelForSeq2SeqLM.from_pretrained(
+             model_name,
+             torch_dtype=torch.float16,  # Half precision
+             device_map="cpu",
+             low_cpu_mem_usage=True,  # Reduce memory usage
+             trust_remote_code=True,
+             use_cache=False,  # Disable KV cache
+             offload_folder="./offload",  # Offload to disk if needed
+         )
+
+         # Additional optimizations
+         model.eval()  # Set to evaluation mode
+
+         # Enable torch optimizations
+         torch.set_num_threads(2)  # Limit threads
+
+         logger.info(f"Memory after loading: {get_memory_usage():.2f} GB")
+         logger.info("Model loaded successfully!")
+
+         return model, tokenizer
+
+     except Exception as e:
+         logger.error(f"Error loading model: {e}")
+         # Try fallback with 8-bit quantization
          try:
+             logger.info("Trying 8-bit quantization...")
+             quantization_config = BitsAndBytesConfig(
+                 load_in_8bit=True,
+                 llm_int8_enable_fp32_cpu_offload=True
              )
+
+             model = AutoModelForSeq2SeqLM.from_pretrained(
+                 model_name,
+                 quantization_config=quantization_config,
+                 device_map="auto",
+                 trust_remote_code=True,
+                 low_cpu_mem_usage=True
+             )
+
+             logger.info("8-bit model loaded!")
+             return model, tokenizer
+
+         except Exception as e2:
+             logger.error(f"8-bit loading also failed: {e2}")
+             raise e2
+
+ def translate_text_optimized(
+     text: str,
+     source_lang: str = "auto",
+     target_lang: str = "en"
+ ) -> str:
+     """Optimized translation function"""
+
+     if not text.strip():
+         return "Please enter text to translate"
+
+     # Memory cleanup before translation
+     gc.collect()
+     if torch.cuda.is_available(): torch.cuda.empty_cache()
+
      try:
+         model, tokenizer = load_model_optimized()
+
+         # Format input
+         if source_lang == "auto":
+             input_text = f"Translate to {target_lang}: {text}"
+         else:
+             input_text = f"Translate from {source_lang} to {target_lang}: {text}"
+
+         logger.info(f"Translating: {input_text[:50]}...")
+         start_memory = get_memory_usage()
+
+         # Tokenize with truncation
+         inputs = tokenizer(
+             input_text,
+             return_tensors="pt",
+             max_length=512,  # Limit input length
+             truncation=True,
+             padding=False  # No padding for single input
          )
+
+         # Generate with minimal settings
+         with torch.no_grad():
+             outputs = model.generate(
+                 **inputs,
+                 max_new_tokens=256,  # Limit output length
+                 min_length=1,
+                 num_beams=2,  # Reduce beams for speed
+                 early_stopping=True,
+                 do_sample=False,
+                 pad_token_id=tokenizer.pad_token_id,
+                 eos_token_id=tokenizer.eos_token_id,
+                 use_cache=False  # Disable cache
+             )
+
+         # Decode output
+         translated_text = tokenizer.decode(
+             outputs[0],
+             skip_special_tokens=True
          )
+
+         # Clean output
+         if ":" in translated_text:
+             translated_text = translated_text.split(":", 1)[-1].strip()
+
+         # Memory cleanup after translation
+         del inputs, outputs
+         gc.collect()
+
+         end_memory = get_memory_usage()
+         logger.info(f"Translation completed. Memory: {start_memory:.2f}GB -> {end_memory:.2f}GB")
+
+         return translated_text
+
      except Exception as e:
+         logger.error(f"Translation error: {e}")
+         gc.collect()  # Cleanup on error
+         return f"Translation failed: {str(e)}"
+
+ # Language mapping
+ LANGUAGES = {
+     "auto": "Auto Detect",
+     "en": "English",
+     "zh": "Chinese",
+     "vi": "Vietnamese",
+     "ja": "Japanese",
+     "ko": "Korean",
+     "th": "Thai",
+     "id": "Indonesian",
+     "ms": "Malay",
+     "fil": "Filipino"
+ }
+
  # Create Gradio interface
+ with gr.Blocks(
+     title="Hunyuan-MT Translation (CPU Optimized)",
+     theme=gr.themes.Monochrome(),
+ ) as demo:
+
+     gr.HTML("""
+     <div style="text-align: center; margin: 20px;">
+         <h1>🧠 Hunyuan-MT-7B Translation</h1>
+         <p><strong>CPU Optimized Version</strong></p>
+         <p><em>⚠️ First translation may take 1-2 minutes to load model</em></p>
+     </div>
+     """)
+
+     with gr.Row():
+         with gr.Column():
+             input_text = gr.Textbox(
+                 label="Input Text",
+                 placeholder="Enter text to translate (max 200 words for best performance)...",
+                 lines=4,
+                 max_lines=8
+             )
+
+             with gr.Row():
+                 source_lang = gr.Dropdown(
+                     choices=[(name, code) for code, name in LANGUAGES.items()],
+                     label="From",
+                     value="auto"
                  )
+                 target_lang = gr.Dropdown(
+                     choices=[(name, code) for code, name in LANGUAGES.items() if code != "auto"],
+                     label="To",
+                     value="en"
                  )
+
+             translate_btn = gr.Button(
+                 "🔄 Translate",
+                 variant="primary",
+                 size="lg"
+             )
+
+         with gr.Column():
+             output_text = gr.Textbox(
+                 label="Translation",
+                 lines=4,
+                 max_lines=8,
+                 interactive=False
+             )
+
+             memory_display = gr.Textbox(
+                 label="System Status",
+                 value="Ready",
+                 interactive=False
+             )
+
+     # Memory monitoring
+     def update_memory():
+         return f"Memory: {get_memory_usage():.1f}GB / 16GB"
+
+     def translate_with_status(text, src, tgt):
+         if len(text.split()) > 100:  # Limit word count
+             return "Please limit input to 100 words for optimal performance", update_memory()
+
+         result = translate_text_optimized(text, src, tgt)
+         return result, update_memory()
+
+     # Examples for testing
+     gr.Examples(
+         examples=[
+             ["Hello, how are you?", "en", "vi"],
+             ["Xin chào", "vi", "en"],
+             ["Good morning", "en", "zh"],
+             ["Thank you very much", "en", "ja"],
+         ],
+         inputs=[input_text, source_lang, target_lang],
+         outputs=[output_text, memory_display],
+         fn=translate_with_status
+     )
+
+     translate_btn.click(
+         fn=translate_with_status,
+         inputs=[input_text, source_lang, target_lang],
+         outputs=[output_text, memory_display]
+     )
+
+     # Auto-update memory display
+     demo.load(fn=update_memory, outputs=memory_display)
+
+ # Launch with specific settings for HF Spaces
  if __name__ == "__main__":
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         show_api=True,
+         enable_monitoring=False  # Disable to save resources
+     )
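
This commit removes the documented `/translate_text` and `/translate_batch` endpoints but keeps `show_api=True`, and the new click handler is registered without an explicit `api_name`, so Gradio exposes it under the handler's function name. A minimal client sketch, assuming the default `/translate_with_status` endpoint name and a placeholder Space URL:

```python
from gradio_client import Client

# Placeholder URL; substitute the actual Space ID.
client = Client("YOUR_SPACE_URL")

# With no explicit api_name on translate_btn.click, the endpoint defaults to
# the handler's function name. The handler returns two outputs:
# the translation and the memory-status string.
translation, status = client.predict(
    "Hello, how are you?",  # text to translate
    "en",                   # source language code ("auto" to let the model infer)
    "vi",                   # target language code
    api_name="/translate_with_status",
)
print(translation, status)
```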