LiamKhoaLe committed
Commit 53e751d · 1 Parent(s): e3a165a

Update dynamic translation NLPs
Dockerfile CHANGED
@@ -28,7 +28,7 @@ RUN pip install --upgrade pip && \
# Copy the application
COPY --chown=user . .

-# Download Vietnamese translation model
+# Download Vietnamese translation model (always needed for fallback)
RUN python vi/download.py

# Hugging Face cache setup
@@ -40,10 +40,14 @@ ENV MEDGEMMA_HOME="$HOME/.cache/huggingface/sentence-transformers"
RUN mkdir -p $HOME/app/logs $HOME/app/cache $HOME/app/cache/hf $HOME/app/cache/outputs $HOME/app/data && \
    chown -R user:user $HOME/app

-# Download MedAlpaca model if in local mode
+# Download models based on mode
RUN if [ "$IS_LOCAL" = "true" ]; then \
-        echo "Downloading MedAlpaca-13b model for local mode..."; \
+        echo "Downloading models for local mode..."; \
+        echo "Downloading MedAlpaca-13b model..."; \
        python -c "from huggingface_hub import snapshot_download; import os; snapshot_download('medalpaca/medalpaca-13b', token=os.getenv('HF_TOKEN'), cache_dir='$HOME/.cache/huggingface')"; \
+        echo "Vietnamese translation model already downloaded above"; \
+    else \
+        echo "Cloud mode: Only Vietnamese translation model downloaded"; \
    fi

USER user
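For orientation, here is a minimal Python sketch of what the updated conditional RUN step does at build time. It assumes IS_LOCAL reaches the build as an environment variable (its ARG/ENV declaration sits earlier in the Dockerfile and is not part of this diff) and that vi/download.py has already fetched the Vietnamese translation model:

# Hypothetical stand-alone equivalent of the conditional download step (illustration only).
import os
from huggingface_hub import snapshot_download

def download_models_for_mode() -> None:
    # The Vietnamese translation model is always downloaded earlier via vi/download.py.
    if os.getenv("IS_LOCAL") == "true":
        print("Downloading MedAlpaca-13b model...")
        snapshot_download(
            "medalpaca/medalpaca-13b",
            token=os.getenv("HF_TOKEN"),
            cache_dir=os.path.expanduser("~/.cache/huggingface"),
        )
    else:
        print("Cloud mode: Only Vietnamese translation model downloaded")

if __name__ == "__main__":
    download_models_for_mode()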
utils/local_llm.py CHANGED
@@ -291,10 +291,25 @@ class MedAlpacaClient:
        logger.info("[LOCAL_LLM] Model unloaded and memory freed")

class LocalParaphraser:
-    """Local paraphraser using MedAlpaca model"""
+    """Local paraphraser using MedAlpaca model with Vietnamese fallback translation"""

    def __init__(self, model_name: str = "medalpaca/medalpaca-13b", hf_token: str = None):
        self.client = MedAlpacaClient(model_name, hf_token)
+        self.vietnamese_translator = None
+        self._init_vietnamese_translator()
+
+    def _init_vietnamese_translator(self):
+        """Initialize Vietnamese translator for fallback translation"""
+        try:
+            from vi.translator import VietnameseTranslator
+            self.vietnamese_translator = VietnameseTranslator()
+            logger.info("[LOCAL_LLM] Vietnamese translator initialized for fallback")
+        except ImportError as e:
+            logger.warning(f"[LOCAL_LLM] Vietnamese translator not available: {e}")
+            self.vietnamese_translator = None
+        except Exception as e:
+            logger.warning(f"[LOCAL_LLM] Failed to initialize Vietnamese translator: {e}")
+            self.vietnamese_translator = None

    def paraphrase(self, text: str, difficulty: str = "easy", custom_prompt: str = None) -> str:
        """Paraphrase text using MedAlpaca with medical-specific optimization"""
@@ -321,50 +336,149 @@ class LocalParaphraser:
        result = self.client.generate(prompt, max_tokens=min(600, max(128, len(text)//2)), temperature=temperature)
        return result if result else text

-    def translate(self, text: str, target_lang: str = "vi") -> Optional[str]:
-        """Translate text using MedAlpaca with medical terminology preservation"""
+    def translate(self, text: str, target_lang: str = "vi", max_retries: int = 2) -> Optional[str]:
+        """Translate text using MedAlpaca with Vietnamese fallback mechanism"""
        if not text:
            return text

-        # Medical-specific translation prompt
-        if target_lang == "vi":
-            prompt = (
-                "Translate the following English medical text to Vietnamese while preserving all medical terminology, clinical facts, and professional medical language. Use appropriate Vietnamese medical terms. Return only the translation without any introduction or commentary.\n\n"
-                f"{text}"
-            )
-        else:
-            prompt = (
-                f"Translate the following medical text to {target_lang} while preserving all medical terminology, clinical facts, and professional medical language. Return only the translation without any introduction or commentary.\n\n"
-                f"{text}"
-            )
+        # Only implement fallback for Vietnamese translation
+        if target_lang != "vi":
+            return self._translate_other_language(text, target_lang)
+
+        # Try MedAlpaca translation with retries
+        for attempt in range(max_retries + 1):
+            try:
+                # Medical-specific Vietnamese translation prompt
+                prompt = (
+                    "Translate the following English medical text to Vietnamese while preserving all medical terminology, clinical facts, and professional medical language. Use appropriate Vietnamese medical terms. Return only the translation without any introduction or commentary.\n\n"
+                    f"{text}"
+                )
+
+                result = self.client.generate(prompt, max_tokens=min(800, len(text)+100), temperature=0.0)
+
+                if result and result.strip():
+                    # Validate the translation
+                    if self._is_valid_vietnamese_translation(text, result.strip()):
+                        logger.info(f"[LOCAL_LLM] Vietnamese translation successful (attempt {attempt + 1})")
+                        return result.strip()
+                    else:
+                        logger.warning(f"[LOCAL_LLM] Invalid Vietnamese translation (attempt {attempt + 1}): {result[:100]}...")
+                else:
+                    logger.warning(f"[LOCAL_LLM] Empty Vietnamese translation (attempt {attempt + 1})")
+
+            except Exception as e:
+                logger.warning(f"[LOCAL_LLM] Vietnamese translation attempt {attempt + 1} failed: {e}")
+
+        # Fallback: Use translation model to translate English answer
+        logger.info("[LOCAL_LLM] MedAlpaca Vietnamese translation failed, using fallback translation model")
+        return self._fallback_vietnamese_translation(text)
+
+    def _translate_other_language(self, text: str, target_lang: str) -> Optional[str]:
+        """Translate to languages other than Vietnamese using MedAlpaca"""
+        prompt = (
+            f"Translate the following medical text to {target_lang} while preserving all medical terminology, clinical facts, and professional medical language. Return only the translation without any introduction or commentary.\n\n"
+            f"{text}"
+        )

        result = self.client.generate(prompt, max_tokens=min(800, len(text)+100), temperature=0.0)
        return result.strip() if result else None

+    def _is_valid_vietnamese_translation(self, original: str, translation: str) -> bool:
+        """Check if the Vietnamese translation is valid"""
+        if not translation or not translation.strip():
+            return False
+
+        # Check if translation is too similar to original (likely failed)
+        if translation.strip().lower() == original.strip().lower():
+            return False
+
+        # Check if translation contains English words (likely failed)
+        english_words = ['the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'must']
+        translation_lower = translation.lower()
+        english_word_count = sum(1 for word in english_words if word in translation_lower)
+
+        # If more than 30% of common English words are present, likely failed
+        if english_word_count > len(translation.split()) * 0.3:
+            return False
+
+        # Check minimum length (should be reasonable)
+        if len(translation.strip()) < len(original.strip()) * 0.3:
+            return False
+
+        return True
+
+    def _fallback_vietnamese_translation(self, text: str) -> Optional[str]:
+        """Use translation model as fallback for Vietnamese translation"""
+        if not self.vietnamese_translator:
+            logger.warning("[LOCAL_LLM] Vietnamese translator not available for fallback")
+            return None
+
+        try:
+            result = self.vietnamese_translator.translate_text(text)
+            if result and result.strip() and result.strip() != text.strip():
+                logger.info("[LOCAL_LLM] Fallback Vietnamese translation successful")
+                return result.strip()
+            else:
+                logger.warning("[LOCAL_LLM] Fallback Vietnamese translation failed or returned identical text")
+                return None
+        except Exception as e:
+            logger.error(f"[LOCAL_LLM] Fallback Vietnamese translation error: {e}")
+            return None
+
    def backtranslate(self, text: str, via_lang: str = "vi") -> Optional[str]:
-        """Backtranslate text using MedAlpaca with medical accuracy"""
+        """Backtranslate text using MedAlpaca with Vietnamese fallback mechanism"""
        if not text:
            return text

-        # First translate to target language
+        # First translate to target language (this will use fallback if needed)
        translated = self.translate(text, target_lang=via_lang)
        if not translated:
            return None

        # Then translate back to English with medical focus
        if via_lang == "vi":
+            # Try MedAlpaca for back-translation first
            prompt = (
                "Translate the following Vietnamese medical text back to English while preserving all medical terminology, clinical facts, and professional medical language. Ensure the translation is medically accurate. Return only the translation without any introduction or commentary.\n\n"
                f"{translated}"
            )
+
+            result = self.client.generate(prompt, max_tokens=min(900, len(text)+150), temperature=0.0)
+            if result and result.strip():
+                return result.strip()
+
+            # Fallback: Use translation model for back-translation
+            logger.info("[LOCAL_LLM] MedAlpaca back-translation failed, using fallback translation model")
+            return self._fallback_english_translation(translated)
        else:
            prompt = (
                f"Translate the following {via_lang} medical text back to English while preserving all medical terminology, clinical facts, and professional medical language. Return only the translation without any introduction or commentary.\n\n"
                f"{translated}"
            )
+
+            result = self.client.generate(prompt, max_tokens=min(900, len(text)+150), temperature=0.0)
+            return result.strip() if result else None
+
+    def _fallback_english_translation(self, vietnamese_text: str) -> Optional[str]:
+        """Use translation model as fallback for English back-translation"""
+        if not self.vietnamese_translator:
+            logger.warning("[LOCAL_LLM] Vietnamese translator not available for back-translation fallback")
+            return None

-        result = self.client.generate(prompt, max_tokens=min(900, len(text)+150), temperature=0.0)
-        return result.strip() if result else None
+        try:
+            # Use the translator's back-translation capability
+            # Note: This would need to be implemented in the VietnameseTranslator class
+            # For now, we'll use a simple approach
+            result = self.vietnamese_translator.translate_text(vietnamese_text)
+            if result and result.strip() and result.strip() != vietnamese_text.strip():
+                logger.info("[LOCAL_LLM] Fallback English back-translation successful")
+                return result.strip()
+            else:
+                logger.warning("[LOCAL_LLM] Fallback English back-translation failed or returned identical text")
+                return None
+        except Exception as e:
+            logger.error(f"[LOCAL_LLM] Fallback English back-translation error: {e}")
+            return None

    def consistency_check(self, user: str, output: str) -> bool:
        """Check consistency using MedAlpaca with medical validation focus"""
@@ -475,6 +589,111 @@ class LocalParaphraser:

        return False

+    def create_vietnamese_training_data(self, question: str, answer: str, max_retries: int = 2) -> list:
+        """
+        Create Vietnamese training data with fallback mechanism.
+
+        This method tries to get Vietnamese translations from MedAlpaca first.
+        If MedAlpaca fails (max 2 retries), it allows MedAlpaca to answer in English
+        and uses translation models to create Vietnamese versions.
+
+        Args:
+            question: English question
+            answer: English answer
+            max_retries: Maximum retries for MedAlpaca Vietnamese translation
+
+        Returns:
+            List of training data tuples: [(question_vi, answer_vi), ...]
+        """
+        training_data = []
+
+        # Try to get Vietnamese translation from MedAlpaca
+        question_vi = self.translate(question, target_lang="vi", max_retries=max_retries)
+        answer_vi = self.translate(answer, target_lang="vi", max_retries=max_retries)
+
+        if question_vi and answer_vi:
+            # MedAlpaca successfully translated both
+            training_data.append((question_vi, answer_vi))
+            logger.info("[LOCAL_LLM] Created Vietnamese training data using MedAlpaca translation")
+        else:
+            # MedAlpaca failed, use fallback mechanism
+            logger.info("[LOCAL_LLM] MedAlpaca Vietnamese translation failed, using fallback mechanism")
+
+            # Allow MedAlpaca to answer in English (this should always work)
+            english_answer = self.client.generate(
+                f"Answer the following medical question: {question}",
+                max_tokens=min(800, len(answer)+100),
+                temperature=0.1
+            )
+
+            if english_answer and english_answer.strip():
+                # Use translation models to create Vietnamese versions
+                if self.vietnamese_translator:
+                    try:
+                        # Translate question using fallback
+                        question_vi_fallback = self._fallback_vietnamese_translation(question)
+                        # Translate answer using fallback
+                        answer_vi_fallback = self._fallback_vietnamese_translation(english_answer.strip())
+
+                        if question_vi_fallback and answer_vi_fallback:
+                            training_data.append((question_vi_fallback, answer_vi_fallback))
+                            logger.info("[LOCAL_LLM] Created Vietnamese training data using fallback translation")
+                        else:
+                            logger.warning("[LOCAL_LLM] Fallback translation failed, no Vietnamese training data created")
+                    except Exception as e:
+                        logger.error(f"[LOCAL_LLM] Fallback translation error: {e}")
+                else:
+                    logger.warning("[LOCAL_LLM] Vietnamese translator not available for fallback")
+            else:
+                logger.warning("[LOCAL_LLM] MedAlpaca failed to generate English answer for fallback")
+
+        return training_data
+
+    def create_vietnamese_augmented_data(self, question: str, answer: str) -> list:
+        """
+        Create multiple Vietnamese training data variations using different approaches.
+
+        This method creates:
+        1. Direct Vietnamese translation (if successful)
+        2. English answer + Vietnamese translation fallback
+        3. Paraphrased English + Vietnamese translation
+
+        Args:
+            question: English question
+            answer: English answer
+
+        Returns:
+            List of training data tuples: [(question_vi, answer_vi), ...]
+        """
+        training_data = []
+
+        # 1. Try direct Vietnamese translation
+        direct_data = self.create_vietnamese_training_data(question, answer)
+        training_data.extend(direct_data)
+
+        # 2. Create paraphrased English version and translate
+        try:
+            paraphrased_answer = self.paraphrase(answer, difficulty="easy")
+            if paraphrased_answer and paraphrased_answer != answer:
+                paraphrased_data = self.create_vietnamese_training_data(question, paraphrased_answer)
+                training_data.extend(paraphrased_data)
+                logger.info("[LOCAL_LLM] Created Vietnamese training data from paraphrased English")
+        except Exception as e:
+            logger.warning(f"[LOCAL_LLM] Failed to create paraphrased Vietnamese data: {e}")
+
+        # 3. Create back-translated version
+        try:
+            backtranslated_answer = self.backtranslate(answer, via_lang="vi")
+            if backtranslated_answer and backtranslated_answer != answer:
+                backtranslated_data = self.create_vietnamese_training_data(question, backtranslated_answer)
+                training_data.extend(backtranslated_data)
+                logger.info("[LOCAL_LLM] Created Vietnamese training data from back-translated English")
+        except Exception as e:
+            logger.warning(f"[LOCAL_LLM] Failed to create back-translated Vietnamese data: {e}")
+
+        logger.info(f"[LOCAL_LLM] Created {len(training_data)} Vietnamese training data variations")
+        return training_data
+
    def unload(self):
        """Unload the model"""
        self.client.unload_model()
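A minimal usage sketch of the new fallback-aware API. The method names come from the diff above; the import path, example strings, and construction with default arguments are assumptions for illustration:

# Illustrative only: exercises translate() with retries/fallback and the training-data helpers.
from utils.local_llm import LocalParaphraser  # assumed import path within this repo

paraphraser = LocalParaphraser()  # loads MedAlpaca and, if importable, the vi.translator fallback

# translate() now retries MedAlpaca up to max_retries times, validates the output,
# and falls back to the dedicated Vietnamese translation model if MedAlpaca fails.
answer_vi = paraphraser.translate(
    "Take the medication twice daily with food.", target_lang="vi", max_retries=2
)

# Builds (question_vi, answer_vi) pairs; on failure it lets MedAlpaca answer in English
# and translates the question and answer with the fallback model.
pairs = paraphraser.create_vietnamese_training_data(
    question="What are the common side effects of metformin?",
    answer="Common side effects include nausea, diarrhea, and abdominal discomfort.",
)
print(f"Created {len(pairs)} Vietnamese training pair(s)")

paraphraser.unload()  # free model memory when finished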
~/.cache/huggingface/.locks/models--medalpaca--medalpaca-13b/091214b5ef67577540895961dcef678657b46f45.lock ADDED
File without changes
~/.cache/huggingface/models--medalpaca--medalpaca-13b/blobs/091214b5ef67577540895961dcef678657b46f45 ADDED
@@ -0,0 +1,10 @@
+{
+  "bos_token": "</s>",
+  "eos_token": "</s>",
+  "model_max_length": 512,
+  "padding_side": "right",
+  "special_tokens_map_file": "/sc-projects/sc-proj-cc06-medbert/hfcache/hub/models--decapoda-research--llama-13b-hf/snapshots/438770a656712a5072229b62256521845d4de5ce/special_tokens_map.json",
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "</s>",
+  "pad_token": "[PAD]"
+}
~/.cache/huggingface/models--medalpaca--medalpaca-13b/refs/main ADDED
@@ -0,0 +1 @@
+3e5d80d26b0373167ab2daed45b7f9ef35f998e5