Spaces:

MedSwin
/

MedAI_Processing

Sleeping

App Files Community

LiamKhoaLe committed on Oct 3, 2025

Commit

a7fd3ba

1 Parent(s): 19d62ff

Enhance RAG conciseness and SFT aug

Browse files

Files changed (3) hide show

utils/augment.py +16 -1
utils/rag.py +12 -7
vi/processing.py +34 -2

utils/augment.py CHANGED Viewed

@@ -1,5 +1,6 @@
 # augmentation utility agent
 import re
 import random
 from typing import Dict, Tuple
 import ftfy
@@ -94,7 +95,21 @@ def maybe_backtranslate(text: str, ratio: float, paraphraser) -> Tuple[str, bool
     if ratio <= 0 or not text: return text, False
     if random.random() < ratio:
         bt = paraphraser.backtranslate(text, via_lang="vi")
-        return bt if bt else text, bool(bt)
     return text, False
 def consistency_ok(user: str, out: str, ratio: float, paraphraser) -> bool:

 # augmentation utility agent
 import re
+import difflib
 import random
 from typing import Dict, Tuple
 import ftfy
     if ratio <= 0 or not text: return text, False
     if random.random() < ratio:
         bt = paraphraser.backtranslate(text, via_lang="vi")
+        if not bt:
+            return text, False
+        # Guardrails: reject if too short/long or too dissimilar/similar
+        try:
+            orig_len = max(1, len(text))
+            len_delta = abs(len(bt) - len(text)) / orig_len
+            sim = difflib.SequenceMatcher(None, text, bt).ratio()
+            # Accept if moderate change and not excessive drift
+            if len_delta > 0.5:
+                return text, False
+            if sim < 0.45 or sim > 0.98:
+                return text, False
+        except Exception:
+            pass
+        return bt, True
     return text, False
 def consistency_ok(user: str, out: str, ratio: float, paraphraser) -> bool:

utils/rag.py CHANGED Viewed

@@ -44,7 +44,7 @@ class RAGProcessor:
         self.nvidia_client = NvidiaClient(KeyRotator("NVIDIA_API"), nvidia_model)
     def clean_conversational_content(self, text: str) -> str:
-        """Remove conversational elements and non-medical information using NVIDIA model"""
         if not text or len(text.strip()) < 10:
             return text
@@ -55,7 +55,7 @@ class RAGProcessor:
         3. Keep only medically relevant information
         4. Preserve clinical facts, symptoms, diagnoses, treatments, and medical advice
         5. Maintain professional medical language
-        6. Return only cleaned medical content, only plain text, no special characters, or formatting.
         Text to clean:
         {text}
@@ -74,11 +74,11 @@ class RAGProcessor:
             return text
     def generate_context_from_qa(self, question: str, answer: str) -> str:
-        """Generate synthetic context from question and answer using NVIDIA model"""
         if not question or not answer:
             return ""
-        prompt = f"""You are a medical knowledge expert. Given a medical question and its answer, generate a brief relevant medical context that would help someone understand the answer better. Write about 2 sentences that provide relevant background information. Use only plain text without any formatting or symbols.
         Question: {question}
@@ -92,16 +92,20 @@ class RAGProcessor:
                 temperature=0.2,
                 max_tokens=200
             )
-            return context.strip() if context else ""
         except Exception as e:
             logger.warning(f"[RAG] Error generating context: {e}")
             return ""
     def convert_to_qca_format(self, instruction: str, user_input: str, output: str) -> Tuple[str, str, str]:
-        """Convert SFT format to QCA (Question, Context, Answer) format"""
         # Clean the content to remove conversational elements
         cleaned_input = self.clean_conversational_content(user_input)
         cleaned_output = self.clean_conversational_content(output)
         # Extract question from user input
         question = self.extract_question(cleaned_input)
@@ -110,7 +114,8 @@ class RAGProcessor:
         context = self.extract_context(cleaned_input, question, cleaned_output)
         # Clean answer
-        answer = cleaned_output
         return question, context, answer

         self.nvidia_client = NvidiaClient(KeyRotator("NVIDIA_API"), nvidia_model)
     def clean_conversational_content(self, text: str) -> str:
+        """Remove conversational elements and non-medical information using NVIDIA model; keep concise for embeddings."""
         if not text or len(text.strip()) < 10:
             return text
         3. Keep only medically relevant information
         4. Preserve clinical facts, symptoms, diagnoses, treatments, and medical advice
         5. Maintain professional medical language
+        6. Return only cleaned medical content in 1-2 concise sentences suitable for dense retrieval embeddings. No lists, no headers.
         Text to clean:
         {text}
             return text
     def generate_context_from_qa(self, question: str, answer: str) -> str:
+        """Generate synthetic, concise context (<=2 sentences) from question and answer, embedding-friendly."""
         if not question or not answer:
             return ""
+        prompt = f"""You are a medical knowledge expert. Given a medical question and its answer, generate a brief relevant medical context that helps retrieval. Limit to 1–2 sentences, concise, avoid boilerplate, no enumerations.
         Question: {question}
                 temperature=0.2,
                 max_tokens=200
             )
+            # Trim to a single short paragraph
+            return (context or "").strip().split("\n")[0][:600]
         except Exception as e:
             logger.warning(f"[RAG] Error generating context: {e}")
             return ""
     def convert_to_qca_format(self, instruction: str, user_input: str, output: str) -> Tuple[str, str, str]:
+        """Convert SFT format to QCA (Question, Context, Answer) format, compressing for embedding suitability."""
         # Clean the content to remove conversational elements
         cleaned_input = self.clean_conversational_content(user_input)
         cleaned_output = self.clean_conversational_content(output)
+        # Hard caps for embedding friendliness
+        cleaned_input = (cleaned_input or "")[:1200]
+        cleaned_output = (cleaned_output or "")[:1200]
         # Extract question from user input
         question = self.extract_question(cleaned_input)
         context = self.extract_context(cleaned_input, question, cleaned_output)
         # Clean answer
+        # Prefer short, direct answers
+        answer = cleaned_output[:800]
         return question, context, answer

vi/processing.py CHANGED Viewed

@@ -7,6 +7,30 @@ from typing import Dict, Any, List, Optional, Callable
 logger = logging.getLogger(__name__)
 def translate_sft_row(row: Dict[str, Any], translator, text_fields: List[str] = None) -> Dict[str, Any]:
     """
     Translate specific text fields in an SFT row from English to Vietnamese.
@@ -29,6 +53,10 @@ def translate_sft_row(row: Dict[str, Any], translator, text_fields: List[str] =
     try:
         translated_row = translator.translate_dict(row, text_fields)
         logger.debug(f"Translated SFT row with fields: {text_fields}")
         return translated_row
     except Exception as e:
@@ -52,11 +80,15 @@ def translate_rag_row(row: Dict[str, Any], translator, text_fields: List[str] =
         return row
     if text_fields is None:
-        # Default fields to translate in RAG format
-        text_fields = ["instruction", "input", "output"]
     try:
         translated_row = translator.translate_dict(row, text_fields)
         logger.debug(f"Translated RAG row with fields: {text_fields}")
         return translated_row
     except Exception as e:

 logger = logging.getLogger(__name__)
+def _vi_sanitize_text(s: str) -> str:
+    """Lightly sanitize text for finetuning and RAG: trim, collapse whitespace, shorten long .?! runs, drop words that complete an already-seen word window; non-str input is returned unchanged."""
+    if not isinstance(s, str):
+        return s
+    t = s.strip()
+    # Collapse each whitespace run to a single space, then replace any run of 3+ '.', '?' or '!' with ".."
+    import re
+    t = re.sub(r"\s+", " ", t)
+    t = re.sub(r"([.?!]){3,}", r"..", t)
+    # Heuristic repetition filter (only for texts over 20 words): skip a word when the window of up to 7 words ending at it was already seen
+    parts = t.split()
+    if len(parts) > 20:
+        window = 6
+        seen = set()
+        filtered = []
+        for i in range(len(parts)):
+            ngram = " ".join(parts[max(0, i-window):i+1])
+            if ngram in seen:
+                continue
+            seen.add(ngram)
+            filtered.append(parts[i])
+        t = " ".join(filtered)
+    return t
 def translate_sft_row(row: Dict[str, Any], translator, text_fields: List[str] = None) -> Dict[str, Any]:
     """
     Translate specific text fields in an SFT row from English to Vietnamese.
     try:
         translated_row = translator.translate_dict(row, text_fields)
+        # Sanitize translated fields
+        for f in text_fields:
+            if f in translated_row.get("sft", {}):
+                translated_row["sft"][f] = _vi_sanitize_text(translated_row["sft"][f])
         logger.debug(f"Translated SFT row with fields: {text_fields}")
         return translated_row
     except Exception as e:
         return row
     if text_fields is None:
+        # Default fields to translate in RAG format (Q, A, C)
+        text_fields = ["question", "answer", "context"]
     try:
         translated_row = translator.translate_dict(row, text_fields)
+        # Sanitize translated fields
+        for f in text_fields:
+            if f in translated_row:
+                translated_row[f] = _vi_sanitize_text(translated_row[f])
         logger.debug(f"Translated RAG row with fields: {text_fields}")
         return translated_row
     except Exception as e: