LiamKhoaLe committed
Commit e76f718 · 1 Parent(s): dca816b

Upd local llm infer

Files changed (4)
  1. app.py +3 -1
  2. utils/augment.py +40 -27
  3. utils/local_llm.py +234 -29
  4. utils/rag.py +38 -15
app.py CHANGED
@@ -456,7 +456,9 @@ def _run_job(dataset_key: str, params: ProcessParams):
             seed=params.seed,
             progress_cb=lambda p, msg=None: set_state(progress=p, message=msg or STATE["message"]),
             translator=translator,
-            paraphraser=paraphraser
+            paraphraser=paraphraser,
+            is_local=IS_LOCAL,
+            hf_token=os.getenv("HF_TOKEN")
         )
     else:
         # Standard SFT processing mode
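Editor's note: a minimal, hypothetical sketch of how the two new arguments are presumably derived in app.py. The name IS_LOCAL appears in the diff; the "LOCAL_MODE" environment variable that feeds it is an assumption for illustration only.

import os

IS_LOCAL = os.getenv("LOCAL_MODE", "0") == "1"   # assumed env-var convention, not confirmed by the diff
HF_TOKEN = os.getenv("HF_TOKEN")                 # token for gated model access, as read at the call site

print(f"local inference: {IS_LOCAL}, token present: {HF_TOKEN is not None}")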
utils/augment.py CHANGED
@@ -252,8 +252,11 @@ def validate_medical_accuracy(question: str, answer: str, paraphraser) -> bool:
         return False
 
     try:
-        # Use the existing consistency check but with medical focus
-        return paraphraser.consistency_check(question, answer)
+        # Use medical accuracy check if available (local mode), otherwise fallback to consistency check
+        if hasattr(paraphraser, 'medical_accuracy_check'):
+            return paraphraser.medical_accuracy_check(question, answer)
+        else:
+            return paraphraser.consistency_check(question, answer)
     except Exception as e:
         logger.warning(f"Medical accuracy validation failed: {e}")
         return True  # Default to accepting if validation fails
@@ -264,15 +267,21 @@ def enhance_medical_terminology(text: str, paraphraser) -> str:
         return text
 
     try:
-        prompt = (
-            "Improve the medical terminology in this text while preserving all factual information:\n\n"
-            f"{text}\n\n"
-            "Return only the improved text with better medical terminology:"
-        )
-
-        enhanced = paraphraser.paraphrase(text, difficulty="hard", custom_prompt=prompt)
-        if enhanced and not is_invalid_response(enhanced):
-            return enhanced
+        # Use dedicated method if available (local mode), otherwise use paraphrase with custom prompt
+        if hasattr(paraphraser, 'enhance_medical_terminology'):
+            enhanced = paraphraser.enhance_medical_terminology(text)
+            if enhanced and not is_invalid_response(enhanced):
+                return enhanced
+        else:
+            prompt = (
+                "Improve the medical terminology in this text while preserving all factual information:\n\n"
+                f"{text}\n\n"
+                "Return only the improved text with better medical terminology:"
+            )
+
+            enhanced = paraphraser.paraphrase(text, difficulty="hard", custom_prompt=prompt)
+            if enhanced and not is_invalid_response(enhanced):
+                return enhanced
     except Exception as e:
         logger.warning(f"Medical terminology enhancement failed: {e}")
 
@@ -283,22 +292,26 @@ def create_clinical_scenarios(question: str, answer: str, paraphraser) -> list:
     scenarios = []
 
     try:
-        # Generate different clinical contexts
-        context_prompts = [
-            f"Rewrite this medical question as if asked by a patient in an emergency room:\n\n{question}",
-            f"Rewrite this medical question as if asked by a patient in a routine checkup:\n\n{question}",
-            f"Rewrite this medical question as if asked by a patient with chronic conditions:\n\n{question}",
-            f"Rewrite this medical question as if asked by a patient's family member:\n\n{question}"
-        ]
-
-        for i, prompt in enumerate(context_prompts):
-            try:
-                scenario_question = paraphraser.paraphrase(question, difficulty="hard", custom_prompt=prompt)
-                if scenario_question and not is_invalid_response(scenario_question):
-                    scenarios.append((scenario_question, answer, f"clinical_scenario_{i+1}"))
-            except Exception as e:
-                logger.warning(f"Failed to create clinical scenario {i+1}: {e}")
-                continue
+        # Use dedicated method if available (local mode), otherwise use paraphrase with custom prompts
+        if hasattr(paraphraser, 'create_clinical_scenarios'):
+            scenarios = paraphraser.create_clinical_scenarios(question, answer)
+        else:
+            # Fallback to original implementation
+            context_prompts = [
+                f"Rewrite this medical question as if asked by a patient in an emergency room:\n\n{question}",
+                f"Rewrite this medical question as if asked by a patient in a routine checkup:\n\n{question}",
+                f"Rewrite this medical question as if asked by a patient with chronic conditions:\n\n{question}",
+                f"Rewrite this medical question as if asked by a patient's family member:\n\n{question}"
+            ]
+
+            for i, prompt in enumerate(context_prompts):
+                try:
+                    scenario_question = paraphraser.paraphrase(question, difficulty="hard", custom_prompt=prompt)
+                    if scenario_question and not is_invalid_response(scenario_question):
+                        scenarios.append((scenario_question, answer, f"clinical_scenario_{i+1}"))
+                except Exception as e:
+                    logger.warning(f"Failed to create clinical scenario {i+1}: {e}")
+                    continue
 
     except Exception as e:
         logger.warning(f"Clinical scenario creation failed: {e}")
utils/local_llm.py CHANGED
@@ -94,16 +94,20 @@ class MedAlpacaClient:
                 max_length=2048
             ).to(self.device)
 
-            # Generate
+            # Generate with optimized parameters for MedAlpaca
             with torch.no_grad():
                 outputs = self.model.generate(
                     **inputs,
                     max_new_tokens=max_tokens,
                     temperature=temperature,
-                    do_sample=True,
+                    do_sample=True if temperature > 0 else False,
                     pad_token_id=self.tokenizer.eos_token_id,
                     eos_token_id=self.tokenizer.eos_token_id,
-                    repetition_penalty=1.1
+                    repetition_penalty=1.1,
+                    top_p=0.9 if temperature > 0 else 1.0,
+                    top_k=50 if temperature > 0 else 0,
+                    num_beams=1 if temperature > 0 else 4,
+                    early_stopping=True
                 )
 
             # Decode output
@@ -123,28 +127,36 @@ class MedAlpacaClient:
             return None
 
     def _format_prompt(self, prompt: str) -> str:
-        """Format prompt for MedAlpaca model"""
-        # MedAlpaca uses a specific format for medical Q&A
+        """Format prompt for MedAlpaca model with medical-specific formatting"""
+        # MedAlpaca was trained on medical Q&A pairs, so we use its expected format
         if "Question:" in prompt and "Answer:" in prompt:
             return prompt
         elif "Context:" in prompt and "Question:" in prompt:
             return prompt
+        elif "You are a" in prompt or "medical" in prompt.lower():
+            # For medical instructions, use Alpaca format
+            return f"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{prompt}\n\n### Response:"
         else:
-            # Simple medical Q&A format
-            return f"Question: {prompt}\n\nAnswer:"
+            # Default medical Q&A format for MedAlpaca
+            return f"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nAnswer the following medical question accurately and professionally.\n\n### Input:\n{prompt}\n\n### Response:"
 
     def _clean_response(self, text: str) -> str:
-        """Clean generated response"""
+        """Clean generated response with medical-specific cleaning"""
         if not text:
             return text
 
-        # Remove common prefixes
+        # Remove common prefixes and Alpaca format artifacts
        prefixes_to_remove = [
             "Answer:",
             "The answer is:",
             "Based on the information provided:",
             "Here's the answer:",
             "Here is the answer:",
+            "### Response:",
+            "Response:",
+            "Below is an instruction",
+            "### Instruction:",
+            "Instruction:",
         ]
 
         text = text.strip()
@@ -152,7 +164,13 @@ class MedAlpacaClient:
             if text.startswith(prefix):
                 text = text[len(prefix):].strip()
                 break
-
+
+        # Remove any remaining Alpaca format artifacts
+        if "### Response:" in text:
+            text = text.split("### Response:")[-1].strip()
+        if "### Input:" in text:
+            text = text.split("### Input:")[0].strip()
+
         return text
 
     def _snip(self, text: str, max_words: int = 12) -> str:
@@ -162,6 +180,61 @@ class MedAlpacaClient:
         words = text.strip().split()
         return " ".join(words[:max_words]) + (" …" if len(words) > max_words else "")
 
+    def generate_batch(self, prompts: list, max_tokens: int = 512, temperature: float = 0.2) -> list:
+        """Generate text for multiple prompts in batch for better efficiency"""
+        if not self.is_loaded:
+            self.load_model()
+
+        if not prompts:
+            return []
+
+        try:
+            # Format all prompts
+            formatted_prompts = [self._format_prompt(prompt) for prompt in prompts]
+
+            # Tokenize all inputs
+            inputs = self.tokenizer(
+                formatted_prompts,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=2048
+            ).to(self.device)
+
+            # Generate for all prompts
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=max_tokens,
+                    temperature=temperature,
+                    do_sample=True if temperature > 0 else False,
+                    pad_token_id=self.tokenizer.eos_token_id,
+                    eos_token_id=self.tokenizer.eos_token_id,
+                    repetition_penalty=1.1,
+                    top_p=0.9 if temperature > 0 else 1.0,
+                    top_k=50 if temperature > 0 else 0,
+                    num_beams=1 if temperature > 0 else 4,
+                    early_stopping=True
+                )
+
+            # Decode all outputs
+            results = []
+            input_length = inputs['input_ids'].shape[1]
+            for i, output in enumerate(outputs):
+                generated_text = self.tokenizer.decode(
+                    output[input_length:],
+                    skip_special_tokens=True
+                ).strip()
+                cleaned_text = self._clean_response(generated_text)
+                results.append(cleaned_text)
+
+            logger.info(f"[LOCAL_LLM] Generated batch of {len(prompts)} texts")
+            return results
+
+        except Exception as e:
+            logger.error(f"[LOCAL_LLM] Batch generation failed: {e}")
+            return [None] * len(prompts)
+
     def unload_model(self):
         """Unload model to free memory"""
         if self.model is not None:
@@ -185,34 +258,56 @@ class LocalParaphraser:
         self.client = MedAlpacaClient(model_name, hf_token)
 
     def paraphrase(self, text: str, difficulty: str = "easy", custom_prompt: str = None) -> str:
-        """Paraphrase text using MedAlpaca"""
+        """Paraphrase text using MedAlpaca with medical-specific optimization"""
         if not text or len(text) < 12:
             return text
 
         if custom_prompt:
             prompt = custom_prompt
         else:
-            prompt = (
-                "Paraphrase the following medical text concisely, preserve meaning and clinical terms.\n"
-                "Do not fabricate or remove factual claims.\n"
-                "Return ONLY the rewritten text, without any introduction, commentary.\n\n"
-                f"Original text: {text}"
-            )
+            # Medical-specific paraphrasing prompts based on difficulty
+            if difficulty == "easy":
+                prompt = (
+                    "You are a medical professional. Rewrite the following medical text using different words while preserving all medical facts, clinical terms, and meaning. Keep the same level of detail and accuracy.\n\n"
+                    f"Original medical text: {text}\n\n"
+                    "Rewritten medical text:"
+                )
+            else: # hard difficulty
+                prompt = (
+                    "You are a medical expert. Rewrite the following medical text using more sophisticated medical language and different sentence structures while preserving all clinical facts, medical terminology, and diagnostic information. Maintain professional medical tone.\n\n"
+                    f"Original medical text: {text}\n\n"
+                    "Enhanced medical text:"
+                )
 
-        result = self.client.generate(prompt, max_tokens=min(600, max(128, len(text)//2)), temperature=0.1)
+        # Adjust temperature based on difficulty
+        temperature = 0.1 if difficulty == "easy" else 0.3
+        result = self.client.generate(prompt, max_tokens=min(600, max(128, len(text)//2)), temperature=temperature)
         return result if result else text
 
     def translate(self, text: str, target_lang: str = "vi") -> Optional[str]:
-        """Translate text using MedAlpaca"""
+        """Translate text using MedAlpaca with medical terminology preservation"""
         if not text:
             return text
 
-        prompt = f"Translate the following medical text to {target_lang}. Keep meaning exact, preserve medical terms:\n\n{text}"
+        # Medical-specific translation prompt
+        if target_lang == "vi":
+            prompt = (
+                "You are a medical translator. Translate the following English medical text to Vietnamese while preserving all medical terminology, clinical facts, and professional medical language. Use appropriate Vietnamese medical terms.\n\n"
+                f"English medical text: {text}\n\n"
+                "Vietnamese medical translation:"
+            )
+        else:
+            prompt = (
+                f"You are a medical translator. Translate the following medical text to {target_lang} while preserving all medical terminology, clinical facts, and professional medical language.\n\n"
+                f"Original medical text: {text}\n\n"
+                f"{target_lang} medical translation:"
+            )
+
         result = self.client.generate(prompt, max_tokens=min(800, len(text)+100), temperature=0.0)
         return result.strip() if result else None
 
     def backtranslate(self, text: str, via_lang: str = "vi") -> Optional[str]:
-        """Backtranslate text using MedAlpaca"""
+        """Backtranslate text using MedAlpaca with medical accuracy"""
         if not text:
             return text
 
@@ -221,23 +316,133 @@ class LocalParaphraser:
         if not translated:
             return None
 
-        # Then translate back to English
-        prompt = f"Translate the following {via_lang} text back to English, preserving the exact meaning:\n\n{translated}"
+        # Then translate back to English with medical focus
+        if via_lang == "vi":
+            prompt = (
+                "You are a medical translator. Translate the following Vietnamese medical text back to English while preserving all medical terminology, clinical facts, and professional medical language. Ensure the translation is medically accurate.\n\n"
+                f"Vietnamese medical text: {translated}\n\n"
+                "English medical translation:"
+            )
+        else:
+            prompt = (
+                f"You are a medical translator. Translate the following {via_lang} medical text back to English while preserving all medical terminology, clinical facts, and professional medical language.\n\n"
+                f"{via_lang} medical text: {translated}\n\n"
+                "English medical translation:"
+            )
+
         result = self.client.generate(prompt, max_tokens=min(900, len(text)+150), temperature=0.0)
         return result.strip() if result else None
 
     def consistency_check(self, user: str, output: str) -> bool:
-        """Check consistency using MedAlpaca"""
+        """Check consistency using MedAlpaca with medical validation focus"""
         prompt = (
-            "You are a strict medical QA validator. Given the USER input (question+context) "
-            "and the MODEL ANSWER, reply with exactly 'PASS' if the answer is supported and safe, "
-            "otherwise 'FAIL'. No extra text.\n\n"
-            f"USER:\n{user}\n\nANSWER:\n{output}"
+            "You are a medical quality assurance expert. Evaluate if the medical answer is consistent with the question/context and medically accurate. Consider:\n"
+            "1. Medical accuracy and clinical appropriateness\n"
+            "2. Consistency with the question asked\n"
+            "3. Safety and professional medical standards\n"
+            "4. Completeness of the medical information\n\n"
+            "Reply with exactly 'PASS' if the answer is medically sound and consistent, otherwise 'FAIL'.\n\n"
+            f"Question/Context: {user}\n\n"
+            f"Medical Answer: {output}\n\n"
+            "Evaluation:"
         )
 
-        result = self.client.generate(prompt, max_tokens=3, temperature=0.0)
+        result = self.client.generate(prompt, max_tokens=5, temperature=0.0)
         return isinstance(result, str) and "PASS" in result.upper()
 
+    def medical_accuracy_check(self, question: str, answer: str) -> bool:
+        """Check medical accuracy of Q&A pairs using MedAlpaca"""
+        if not question or not answer:
+            return False
+
+        prompt = (
+            "You are a medical accuracy validator. Evaluate if the medical answer is accurate and appropriate for the question. Consider:\n"
+            "1. Medical facts and clinical knowledge\n"
+            "2. Appropriate medical terminology\n"
+            "3. Clinical reasoning and logic\n"
+            "4. Safety considerations\n\n"
+            "Reply with exactly 'ACCURATE' if the answer is medically correct, otherwise 'INACCURATE'.\n\n"
+            f"Medical Question: {question}\n\n"
+            f"Medical Answer: {answer}\n\n"
+            "Medical Accuracy Assessment:"
+        )
+
+        result = self.client.generate(prompt, max_tokens=5, temperature=0.0)
+        return isinstance(result, str) and "ACCURATE" in result.upper()
+
+    def enhance_medical_terminology(self, text: str) -> str:
+        """Enhance medical terminology in text using MedAlpaca"""
+        if not text or len(text) < 20:
+            return text
+
+        prompt = (
+            "You are a medical terminology expert. Improve the medical terminology in the following text while preserving all factual information and clinical accuracy. Use more precise medical terms where appropriate.\n\n"
+            f"Original text: {text}\n\n"
+            "Enhanced medical text:"
+        )
+
+        result = self.client.generate(prompt, max_tokens=min(800, len(text)+100), temperature=0.1)
+        return result if result else text
+
+    def create_clinical_scenarios(self, question: str, answer: str) -> list:
+        """Create different clinical scenarios from Q&A pairs using MedAlpaca"""
+        scenarios = []
+
+        # Different clinical context prompts
+        context_prompts = [
+            (
+                "Rewrite this medical question as if asked by a patient in an emergency room setting:",
+                "emergency_room"
+            ),
+            (
+                "Rewrite this medical question as if asked by a patient during a routine checkup:",
+                "routine_checkup"
+            ),
+            (
+                "Rewrite this medical question as if asked by a patient with chronic conditions:",
+                "chronic_care"
+            ),
+            (
+                "Rewrite this medical question as if asked by a patient's family member:",
+                "family_inquiry"
+            )
+        ]
+
+        for prompt_template, scenario_type in context_prompts:
+            try:
+                prompt = f"{prompt_template}\n\nOriginal question: {question}\n\nRewritten question:"
+                scenario_question = self.client.generate(prompt, max_tokens=min(400, len(question)+50), temperature=0.2)
+
+                if scenario_question and not self._is_invalid_response(scenario_question):
+                    scenarios.append((scenario_question, answer, scenario_type))
+            except Exception as e:
+                logger.warning(f"Failed to create clinical scenario {scenario_type}: {e}")
+                continue
+
+        return scenarios
+
+    def _is_invalid_response(self, text: str) -> bool:
+        """Check if response is invalid (similar to augment.py)"""
+        if not text or not isinstance(text, str):
+            return True
+
+        text_lower = text.lower().strip()
+        invalid_patterns = [
+            "fail", "invalid", "i couldn't", "i can't", "i cannot", "unable to",
+            "sorry", "error", "not available", "no answer", "insufficient",
+            "don't know", "do not know", "not sure", "cannot determine",
+            "unable to provide", "not possible", "not applicable", "n/a"
+        ]
+
+        if len(text_lower) < 3:
+            return True
+
+        for pattern in invalid_patterns:
+            if pattern in text_lower:
+                return True
+
+        return False
+
     def unload(self):
         """Unload the model"""
         self.client.unload_model()
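Editor's note: a hedged usage sketch for the new generate_batch API. Running this actually loads the MedAlpaca weights, so it assumes a host with enough memory and an HF_TOKEN that can access the model; the prompts are illustrative, and the constructor keyword follows the MedAlpacaClient(hf_token=...) call shown in the rag.py diff below.

import os
from utils.local_llm import MedAlpacaClient

client = MedAlpacaClient(hf_token=os.getenv("HF_TOKEN"))
prompts = [
    "What are common symptoms of iron-deficiency anemia?",
    "How is type 2 diabetes usually diagnosed?",
]
# Per the generation parameters in the diff above, temperature=0.0 disables
# sampling and switches to 4-beam search; temperature>0 samples with
# top_p=0.9 and top_k=50 instead.
answers = client.generate_batch(prompts, max_tokens=256, temperature=0.0)
for q, a in zip(prompts, answers):
    print(q, "->", a)
client.unload_model()  # free GPU/CPU memory when done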
utils/rag.py CHANGED
@@ -7,6 +7,7 @@ from typing import Dict, List, Tuple, Optional, Callable
 
 from utils.schema import sft_row, rag_row
 from utils.cloud_llm import NvidiaClient, KeyRotator
+from utils.local_llm import MedAlpacaClient
 from vi.processing import should_translate, translate_rag_row
 from utils import augment as A
 
@@ -41,11 +42,17 @@ def _iter_json_or_jsonl(path: str):
 class RAGProcessor:
     """Processes medical datasets into RAG-specific QCA (Question, Context, Answer) format"""
 
-    def __init__(self, nvidia_model: str):
-        self.nvidia_client = NvidiaClient(KeyRotator("NVIDIA_API"), nvidia_model)
+    def __init__(self, nvidia_model: str, is_local: bool = False, hf_token: str = None):
+        self.is_local = is_local
+        if is_local:
+            self.medalpaca_client = MedAlpacaClient(hf_token=hf_token)
+            self.nvidia_client = None
+        else:
+            self.nvidia_client = NvidiaClient(KeyRotator("NVIDIA_API"), nvidia_model)
+            self.medalpaca_client = None
 
     def clean_conversational_content(self, text: str) -> str:
-        """Remove conversational elements and non-medical information using NVIDIA model; keep concise for embeddings."""
+        """Remove conversational elements and non-medical information using MedAlpaca or NVIDIA model; keep concise for embeddings."""
         if not text or len(text.strip()) < 10:
             return text
 
@@ -64,11 +71,18 @@ class RAGProcessor:
         Cleaned medical content:"""
 
         try:
-            cleaned = self.nvidia_client.generate(
-                prompt,
-                temperature=0.1,
-                max_tokens=min(1000, len(text) + 200)
-            )
+            if self.is_local and self.medalpaca_client:
+                cleaned = self.medalpaca_client.generate(
+                    prompt,
+                    temperature=0.1,
+                    max_tokens=min(1000, len(text) + 200)
+                )
+            else:
+                cleaned = self.nvidia_client.generate(
+                    prompt,
+                    temperature=0.1,
+                    max_tokens=min(1000, len(text) + 200)
+                )
             return cleaned.strip() if cleaned else text
         except Exception as e:
             logger.warning(f"[RAG] Error cleaning text: {e}")
@@ -88,11 +102,18 @@ class RAGProcessor:
         Generate a concise medical context:"""
 
         try:
-            context = self.nvidia_client.generate(
-                prompt,
-                temperature=0.2,
-                max_tokens=200
-            )
+            if self.is_local and self.medalpaca_client:
+                context = self.medalpaca_client.generate(
+                    prompt,
+                    temperature=0.2,
+                    max_tokens=200
+                )
+            else:
+                context = self.nvidia_client.generate(
+                    prompt,
+                    temperature=0.2,
+                    max_tokens=200
+                )
             # Trim to a single short paragraph
             return (context or "").strip().split("\n")[0][:600]
         except Exception as e:
@@ -330,7 +351,9 @@ def process_file_into_rag(
     seed: int,
     progress_cb: Optional[Callable[[float, str], None]],
     translator=None,
-    paraphraser=None
+    paraphraser=None,
+    is_local: bool = False,
+    hf_token: str = None
 ) -> Tuple[int, Dict]:
     """Main entry point for RAG processing"""
     random.seed(seed)
@@ -342,7 +365,7 @@ def process_file_into_rag(
     logger.info(f"[RAG] Begin RAG processing dataset={dataset_key} sample_limit={sample_limit}")
 
     # Initialize RAG processor
-    rag_processor = RAGProcessor(nvidia_model)
+    rag_processor = RAGProcessor(nvidia_model, is_local=is_local, hf_token=hf_token)
     dedupe_seen = set()
 
     key = dataset_key.lower()
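Editor's note: a minimal sketch of selecting the backend when constructing RAGProcessor, assuming the flag convention from app.py. The cloud model identifier is hypothetical; in local mode the diff shows that nvidia_model is ignored and a MedAlpacaClient is built instead.

import os
from utils.rag import RAGProcessor

if os.getenv("IS_LOCAL") == "1":  # assumed flag convention, mirrors app.py's IS_LOCAL
    processor = RAGProcessor("unused-model-name", is_local=True,
                             hf_token=os.getenv("HF_TOKEN"))
else:
    processor = RAGProcessor("meta/llama3-70b-instruct")  # hypothetical NVIDIA model id

noisy = "hi doc!! so um, my blood sugar was 180 after lunch, is that bad??"
print(processor.clean_conversational_content(noisy))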