LiamKhoaLe committed
Commit a89888b · 1 Parent(s): 235b116

Update local setups with dynamic mode setter
.dockerignore ADDED
@@ -0,0 +1,3 @@
+*.md
+*.json
+LICENSE.txt
Dockerfile CHANGED
@@ -9,9 +9,18 @@ RUN useradd -m -u 1000 user
 ENV HOME=/home/user
 WORKDIR $HOME/app
 
+# Dynamic mode switch (build-arg default is local mode; pass --build-arg IS_LOCAL=false for cloud builds)
+ARG IS_LOCAL=true
+ENV IS_LOCAL=${IS_LOCAL}
+
 # Install Python dependencies first (better layer caching)
 COPY --chown=user requirements.txt .
-RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
+# Install local mode dependencies if IS_LOCAL is true
+COPY --chown=user requirements-dev.txt .
+RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt && \
+    if [ "$IS_LOCAL" = "true" ]; then \
+        pip install --no-cache-dir -r requirements-dev.txt; \
+    fi
 
 # Copy the application
 COPY --chown=user . .
@@ -25,9 +34,15 @@ ENV SENTENCE_TRANSFORMERS_HOME="$HOME/.cache/huggingface/sentence-transformers"
 ENV MEDGEMMA_HOME="$HOME/.cache/huggingface/sentence-transformers"
 
 # Prepare runtime dirs
-RUN mkdir -p $HOME/app/logs $HOME/app/cache $HOME/app/cache/hf $HOME/app/cache/outputs && \
+RUN mkdir -p $HOME/app/logs $HOME/app/cache $HOME/app/cache/hf $HOME/app/cache/outputs $HOME/app/data && \
     chown -R user:user $HOME/app
 
+# Download MedAlpaca model if in local mode
+RUN if [ "$IS_LOCAL" = "true" ]; then \
+        echo "Downloading MedAlpaca-13b model for local mode..."; \
+        python -c "from huggingface_hub import snapshot_download; import os; snapshot_download('medalpaca/medalpaca-13b', token=os.getenv('HF_TOKEN'), cache_dir='$HOME/.cache/huggingface')"; \
+    fi
+
 USER user
 
 EXPOSE 7860
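For local development outside Docker, the same MedAlpaca prefetch can be run directly. A minimal sketch mirroring the `RUN` step above (assumes `huggingface_hub` is installed and `HF_TOKEN` is exported):

```python
# Prefetch MedAlpaca-13b into the local HF cache (mirrors the Dockerfile step above).
import os
from huggingface_hub import snapshot_download

snapshot_download(
    "medalpaca/medalpaca-13b",
    token=os.getenv("HF_TOKEN"),  # a read-access token is sufficient
    cache_dir=os.path.expanduser("~/.cache/huggingface"),
)
```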
README.md CHANGED
@@ -25,6 +25,12 @@ short_description: Data processing with en-vi translation. Derived from 500k mi
 
 ## 🎯 Features
 
+### 🏠 Dual Mode Operation
+- **Local Mode**: MedAlpaca-13b model running locally for privacy and cost efficiency
+- **Cloud Mode**: NVIDIA + Gemini API integration for scalable processing
+- **Dynamic Switching**: Toggle between modes via environment variables
+- **Medical Specialization**: MedAlpaca-13b, fine-tuned specifically for medical tasks
+
 ### 🔄 Advanced Data Augmentation
 - **Paraphrasing**: Multi-model rotation (NVIDIA + Gemini) with easy/hard difficulty levels
 - **Backtranslation**: Vietnamese pivot language for semantic preservation
@@ -73,6 +79,18 @@ short_description: Data processing with en-vi translation. Derived from 500k mi
 
 ## ⚙️ Configuration
 
+### Mode Selection
+```bash
+# Local Mode (MedAlpaca-13b)
+IS_LOCAL=true
+HF_TOKEN=your_huggingface_token
+
+# Cloud Mode (NVIDIA/Gemini APIs)
+IS_LOCAL=false
+NVIDIA_API_1=your_nvidia_key
+GEMINI_API_1=your_gemini_key
+```
+
 ### Augmentation Parameters
 ```python
 class AugmentOptions:
@@ -140,10 +158,11 @@ curl -X POST "https://huggingface.co/spaces/MedVietAI/processing/rag/healthcarem
 
 ## 📚 Documentation
 
-- [Request Documentation](https://huggingface.co/spaces/MedVietAI/processing/blob/main/REQUEST.md)
-- [Data Processing Guide](https://huggingface.co/spaces/MedVietAI/processing/blob/main/DATA_PROCESSING.md)
+- [Request Documentation](docs/REQUEST.md)
+- [Data Processing Guide](docs/DATA_PROCESSING.md)
+- [Local Mode Guide](docs/LOCAL_MODE.md)
 
 ## 📄 License
 
-[Apache-2.0 LICENSE](https://huggingface.co/spaces/MedVietAI/processing/blob/main/LICENSE.txt)
+[Apache-2.0 LICENSE](docs/LICENSE.txt)
 
app.py CHANGED
@@ -15,7 +15,8 @@ from utils.datasets import resolve_dataset, hf_download_dataset
 from utils.processor import process_file_into_sft
 from utils.rag import process_file_into_rag
 from utils.drive_saver import DriveSaver
-from utils.llm import Paraphraser
+from utils.cloud_llm import Paraphraser
+from utils.local_llm import LocalParaphraser
 from utils.schema import CentralisedWriter, RAGWriter
 from utils.token import get_credentials, exchange_code, build_auth_url
 from vi.translator import VietnameseTranslator
@@ -30,29 +31,54 @@ if not logger.handlers:
 # ────────── Boot ──────────
 load_dotenv(override=True)
 
+# Check if running in local mode
+IS_LOCAL = os.getenv("IS_LOCAL", "false").lower() == "true"
+
 SPACE_NAME = os.getenv("SPACE_NAME", "MedAI Processor")
 OUTPUT_DIR = os.path.abspath(os.getenv("OUTPUT_DIR", "cache/outputs"))
 LOG_DIR = os.path.abspath(os.getenv("LOG_DIR", "logs"))
+
+# In local mode, use data/ folder instead of cache/outputs
+if IS_LOCAL:
+    OUTPUT_DIR = os.path.abspath("data")
+    logger.info(f"[MODE] Running in LOCAL mode - outputs will be saved to: {OUTPUT_DIR}")
+else:
+    logger.info(f"[MODE] Running in CLOUD mode - outputs will be saved to: {OUTPUT_DIR}")
+
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 os.makedirs(LOG_DIR, exist_ok=True)
 
-# --- Bootstrap Google OAuth ---
-try:
-    creds = get_credentials()
-    if creds:
-        logger.info("✅ OAuth credentials loaded and valid")
-except Exception as e:
-    logger.warning(f"⚠️ OAuth not initialized yet: {e}")
+# --- Bootstrap Google OAuth (only in cloud mode) ---
+if not IS_LOCAL:
+    try:
+        creds = get_credentials()
+        if creds:
+            logger.info("✅ OAuth credentials loaded and valid")
+    except Exception as e:
+        logger.warning(f"⚠️ OAuth not initialized yet: {e}")
 
-# --- Bootstrap Google Drive ---
-drive = DriveSaver(default_folder_id=os.getenv("GDRIVE_FOLDER_ID"))
+    # --- Bootstrap Google Drive (only in cloud mode) ---
+    drive = DriveSaver(default_folder_id=os.getenv("GDRIVE_FOLDER_ID"))
+else:
+    drive = None
+    logger.info("🚀 Local mode: Skipping Google Drive setup")
 
-# LLM rotator with paraphraser nodes
-paraphraser = Paraphraser(
-    nvidia_model=os.getenv("NVIDIA_MODEL", "meta/llama-3.1-8b-instruct"),
-    gemini_model_easy=os.getenv("GEMINI_MODEL_EASY", "gemini-2.5-flash-lite"),
-    gemini_model_hard=os.getenv("GEMINI_MODEL_HARD", "gemini-2.5-flash"),
-)
+# Initialize paraphraser based on mode
+if IS_LOCAL:
+    # Local mode: Use MedAlpaca model
+    logger.info("🏠 Initializing local MedAlpaca paraphraser...")
+    paraphraser = LocalParaphraser(
+        model_name="medalpaca/medalpaca-13b",
+        hf_token=os.getenv("HF_TOKEN")
+    )
+else:
+    # Cloud mode: Use existing NVIDIA/Gemini setup
+    logger.info("☁️ Initializing cloud paraphraser (NVIDIA/Gemini)...")
+    paraphraser = Paraphraser(
+        nvidia_model=os.getenv("NVIDIA_MODEL", "meta/llama-3.1-8b-instruct"),
+        gemini_model_easy=os.getenv("GEMINI_MODEL_EASY", "gemini-2.5-flash-lite"),
+        gemini_model_hard=os.getenv("GEMINI_MODEL_HARD", "gemini-2.5-flash"),
    )
 
 # Vietnamese translator (currently using Helsinki-NLP/opus-mt-en-vi)
 vietnamese_translator = VietnameseTranslator()
@@ -123,6 +149,11 @@ def root():
     <h1>📊 {SPACE_NAME} – Medical Dataset Augmenter</h1>
     <p>This Hugging Face Space processes medical datasets into a <b>centralised fine-tuning format</b>
     (JSONL + CSV), with optional <i>data augmentation</i>.</p>
+
+    <div style="margin-bottom: 15px; padding: 10px; background: {'#e8f5e8' if IS_LOCAL else '#e8f0ff'}; border-radius: 5px; border-left: 4px solid {'#28a745' if IS_LOCAL else '#007bff'};">
+      <strong>🔧 Current Mode:</strong> {'🏠 LOCAL (MedAlpaca-13b)' if IS_LOCAL else '☁️ CLOUD (NVIDIA/Gemini APIs)'}
+      <br><small>Outputs will be saved to: {OUTPUT_DIR}</small>
+    </div>
 
     <div class="section">
       <h2>⚡ Quick Actions</h2>
@@ -155,7 +186,7 @@ def root():
       <ul>
        <li><a href="/status" target="_blank">Check current job status</a></li>
        <li><a href="/files" target="_blank">List generated artifacts</a></li>
-       <li><a href="https://medvietai-processing.hf.space/oauth2/start" target="_blank">Authorize your GCS credential</a></li>
+       {'<li><a href="https://medvietai-processing.hf.space/oauth2/start" target="_blank">Authorize your GCS credential</a></li>' if not IS_LOCAL else ''}
        <li><a href="https://huggingface.co/spaces/BinKhoaLe1812/MedAI_Processing/blob/main/REQUEST.md" target="_blank">📑 Request Doc (all curl examples)</a></li>
      </ul>
    </div>
@@ -242,9 +273,12 @@ def status():
     with STATE_LOCK:
         return JSONResponse(STATE)
 
-# ──────── GCS token ────────
+# ──────── GCS token (only in cloud mode) ────────
 @app.get("/oauth2/start")
 def oauth2_start(request: Request):
+    if IS_LOCAL:
+        raise HTTPException(400, "OAuth is not available in local mode. Google Drive integration is disabled.")
+
     # Compute redirect URI dynamically from the actual host the Space is using
     host = request.headers.get("x-forwarded-host") or request.headers.get("host")
     scheme = "https"  # Spaces are HTTPS at the edge
@@ -256,9 +290,12 @@ def oauth2_start(request: Request):
     except Exception as e:
         raise HTTPException(500, f"OAuth init failed: {e}")
 
-# Display your token
+# Display your token (only in cloud mode)
 @app.get("/oauth2/callback")
 def oauth2_callback(request: Request, code: str = "", state: str = ""):
+    if IS_LOCAL:
+        raise HTTPException(400, "OAuth is not available in local mode. Google Drive integration is disabled.")
+
     if not code:
         raise HTTPException(400, "Missing 'code'")
     # Send req
@@ -448,14 +485,19 @@ def _run_job(dataset_key: str, params: ProcessParams):
     logger.info(f"[JOB] Processed dataset={dataset_key} rows={count} stats={stats}")
     writer.close()
 
-    # Upload to GDrive
-    set_state(message="uploading to Google Drive", progress=0.95)
-    up1 = drive.upload_file_to_drive(jsonl_path, mimetype="application/json")
-    up2 = drive.upload_file_to_drive(csv_path, mimetype="text/csv")
-    logger.info(
-        f"[JOB] Uploads complete uploaded={bool(up1 and up2)} "
-        f"jsonl={jsonl_path} csv={csv_path}"
-    )
+    # Upload to GDrive (only in cloud mode) or save locally
+    if IS_LOCAL:
+        set_state(message="saving files locally", progress=0.95)
+        logger.info(f"[JOB] Files saved locally: jsonl={jsonl_path} csv={csv_path}")
+        up1 = up2 = True  # Local mode always "succeeds"
+    else:
+        set_state(message="uploading to Google Drive", progress=0.95)
+        up1 = drive.upload_file_to_drive(jsonl_path, mimetype="application/json")
+        up2 = drive.upload_file_to_drive(csv_path, mimetype="text/csv")
+        logger.info(
+            f"[JOB] Uploads complete uploaded={bool(up1 and up2)} "
+            f"jsonl={jsonl_path} csv={csv_path}"
+        )
 
     # Finalize a task
     result = {
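To confirm which mode a running container picked up, probe the endpoints wired above. A minimal sketch, assuming the app listens on localhost:7860 and the third-party `requests` package is available:

```python
# Probe the mode-dependent endpoints of a running container.
import requests

BASE = "http://localhost:7860"

# /status returns the shared job STATE as JSON in both modes.
print(requests.get(f"{BASE}/status").json())

# /oauth2/start responds with HTTP 400 in local mode (OAuth disabled);
# in cloud mode it proceeds with the Google OAuth flow instead.
r = requests.get(f"{BASE}/oauth2/start", allow_redirects=False)
print(r.status_code)
```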
build.sh ADDED
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -e  # exit immediately if a command (e.g. docker build) fails
+
+# Build script for MedAI Processing with dynamic local/cloud mode support
+
+echo "🏗️ MedAI Processing Build Script"
+echo "=================================="
+
+# Check if mode is specified
+if [ "$1" = "local" ]; then
+    echo "🏠 Building in LOCAL mode (MedAlpaca-13b)"
+    docker build --build-arg IS_LOCAL=true -t medai-processing:local .
+elif [ "$1" = "cloud" ]; then
+    echo "☁️ Building in CLOUD mode (NVIDIA/Gemini APIs)"
+    docker build --build-arg IS_LOCAL=false -t medai-processing:cloud .
+else
+    echo "Usage: $0 [local|cloud]"
+    echo ""
+    echo "  local - Build with MedAlpaca-13b model for local inference"
+    echo "  cloud - Build with NVIDIA/Gemini API integration"
+    echo ""
+    echo "Examples:"
+    echo "  $0 local   # Build for local mode"
+    echo "  $0 cloud   # Build for cloud mode"
+    exit 1
+fi
+
+echo ""
+echo "✅ Build completed successfully!"
+echo ""
+echo "To run the container:"
+if [ "$1" = "local" ]; then
+    echo "  docker run -p 7860:7860 -e HF_TOKEN=your_token_here medai-processing:local"
+else
+    echo "  docker run -p 7860:7860 -e NVIDIA_API_1=your_key -e GEMINI_API_1=your_key medai-processing:cloud"
+fi
DATA_PROCESSING.md → docs/DATA_PROCESSING.md RENAMED
File without changes
LICENSE.txt → docs/LICENSE.txt RENAMED
File without changes
docs/LOCAL_MODE.md ADDED
@@ -0,0 +1,128 @@
+# Local Mode Documentation
+
+## Overview
+
+The MedAI Processing system supports two modes of operation:
+
+- **Cloud Mode** (default): Uses the NVIDIA and Gemini APIs for processing
+- **Local Mode**: Uses the MedAlpaca-13b model running locally for processing
+
+## Local Mode Features
+
+### Benefits
+- **No API costs**: Process data without external API calls
+- **Privacy**: All processing happens locally
+- **Offline capability**: Works without an internet connection (after the initial model download)
+- **Medical specialization**: Uses MedAlpaca-13b, a model fine-tuned specifically for medical tasks
+
+### Technical Details
+- **Model**: [MedAlpaca-13b](https://huggingface.co/medalpaca/medalpaca-13b)
+- **Quantization**: 4-bit (NF4) quantization on GPU for memory efficiency
+- **CUDA Support**: Automatic GPU acceleration when available
+- **Memory Management**: an `unload()` helper frees GPU/CPU memory when the model is no longer needed
+
+## Building and Running
+
+### Build Script
+Use the provided build script:
+
+```bash
+# Build for local mode
+./build.sh local
+
+# Build for cloud mode
+./build.sh cloud
+```
+
+### Manual Docker Build
+
+#### Local Mode
+```bash
+docker build --build-arg IS_LOCAL=true -t medai-processing:local .
+```
+
+#### Cloud Mode
+```bash
+docker build --build-arg IS_LOCAL=false -t medai-processing:cloud .
+```
+
+## Environment Variables
+
+### Local Mode Required
+- `IS_LOCAL=true`: Enables local mode
+- `HF_TOKEN`: Hugging Face token used to download the model
+
+### Local Mode Optional
+- `HF_HOME`: Hugging Face cache directory (default: ~/.cache/huggingface)
+
+### Cloud Mode Required
+- `IS_LOCAL=false`: Enables cloud mode (the default)
+- `NVIDIA_API_1`: NVIDIA API key
+- `GEMINI_API_1`: Gemini API key
+
+## Output Differences
+
+### Local Mode
+- **Output Location**: `data/` folder (local filesystem)
+- **No Google Drive**: Files are saved locally only
+- **No OAuth**: Google Drive authentication is disabled
+
+### Cloud Mode
+- **Output Location**: `cache/outputs/` folder
+- **Google Drive**: Files are uploaded to Google Drive
+- **OAuth**: Google Drive authentication is available
+
+## Model Information
+
+### MedAlpaca-13b
+- **Size**: 13 billion parameters
+- **Specialization**: Medical domain tasks
+- **Training Data**:
+  - ChatDoctor (200k Q&A pairs)
+  - WikiDoc (67k items)
+  - StackExchange (academia, biology, fitness, health)
+  - Anki flashcards (33k items)
+
+### Performance Considerations
+- **Memory**: roughly 8 GB of GPU memory with 4-bit quantization; the CPU fallback loads full precision and needs substantially more RAM
+- **GPU**: CUDA acceleration recommended for faster inference
+- **Storage**: the download is the full-precision checkpoint (tens of GB); quantization happens at load time, not on disk
+
+## Usage Examples
+
+### Processing with Local Mode
+1. Set `IS_LOCAL=true` in the environment
+2. Provide `HF_TOKEN` for model access
+3. Run processing jobs - they will use MedAlpaca locally
+4. Output files will be saved to the `data/` folder
+
+### Processing with Cloud Mode
+1. Set `IS_LOCAL=false` (or omit it)
+2. Provide NVIDIA and Gemini API keys
+3. Run processing jobs - they will use the external APIs
+4. Output files will be uploaded to Google Drive
+
+## Troubleshooting
+
+### Local Mode Issues
+- **Model download fails**: Check `HF_TOKEN` and the internet connection
+- **Out of memory**: Ensure sufficient GPU memory/RAM (8 GB+ recommended)
+- **Slow inference**: Enable CUDA if available
+
+### Cloud Mode Issues
+- **API errors**: Check API keys and quotas
+- **Upload failures**: Verify Google Drive authentication
+
+## Migration Guide
+
+### From Cloud to Local
+1. Update the environment: `IS_LOCAL=true`
+2. Add `HF_TOKEN`
+3. Rebuild the container in local mode (`./build.sh local`)
+4. Output switches from Google Drive to the local `data/` folder
+
+### From Local to Cloud
+1. Update the environment: `IS_LOCAL=false`
+2. Add the NVIDIA and Gemini API keys
+3. Rebuild the container in cloud mode (`./build.sh cloud`)
+4. Output switches from the local folder to Google Drive
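As a sanity check on the memory figure above, a back-of-envelope estimate (assuming NF4 weights at roughly 0.5 bytes per parameter; the overhead number is an assumption, not a measurement):

```python
# Rough memory estimate for MedAlpaca-13b under 4-bit quantization.
params = 13e9                     # 13 billion parameters
weights_gb = params * 0.5 / 1e9   # NF4 ~ 0.5 bytes/param -> ~6.5 GB
overhead_gb = 1.5                 # activations, KV cache, CUDA context (assumed)
print(f"~{weights_gb + overhead_gb:.0f} GB total")  # ~8 GB
```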
REQUEST.md → docs/REQUEST.md RENAMED
File without changes
review.md → docs/REVIEW.md RENAMED
File without changes
requirements-dev.txt ADDED
@@ -0,0 +1,10 @@
+# Local mode dependencies for MedAlpaca-13b inference
+# Note: CUDA support ships inside the torch wheel; there is no separate torch-cuda package
+torch>=2.0.0
+accelerate>=0.20.0
+bitsandbytes>=0.41.0
+peft>=0.4.0
+datasets>=2.14.0
+evaluate>=0.4.0
+scipy>=1.10.0
+scikit-learn>=1.3.0
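A quick import check can confirm the local-mode stack installed correctly; a minimal sketch using only the packages pinned above:

```python
# Verify the local-mode dependencies and CUDA visibility.
import torch
import accelerate
import bitsandbytes
import peft

print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("accelerate", accelerate.__version__, "| peft", peft.__version__)
```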
utils/__init__.py CHANGED
@@ -3,7 +3,8 @@ Utility package for the Medical Dataset Augmenter Space.
 
 This package provides:
 - drive_saver: Google Drive upload helper
-- llm: API key rotation, paraphraser, translation/backtranslation
+- cloud_llm: API key rotation, paraphraser, translation/backtranslation
+- local_llm: MedAlpaca-13b loader for augmentation, processing and translation
 - datasets: Hugging Face dataset resolver & downloader
 - processor: dataset-specific processing pipeline with augmentation
 - schema: centralised SFT writer (JSONL + CSV)
@@ -12,11 +13,12 @@ This package provides:
 """
 
 from . import drive_saver
-from . import llm
+from . import cloud_llm
+from . import local_llm
 from . import datasets
 from . import processor
 from . import schema
 from . import augment
 from . import token
 
-__all__ = ["drive_saver", "llm", "datasets", "processor", "schema", "augment"]
+__all__ = ["drive_saver", "cloud_llm", "local_llm", "datasets", "processor", "schema", "augment"]
utils/{llm.py → cloud_llm.py} RENAMED
File without changes
utils/local_llm.py ADDED
@@ -0,0 +1,229 @@
+# Local MedAlpaca-13b inference client
+import os
+import gc
+import logging
+from typing import Optional
+
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
+logger = logging.getLogger("local_llm")
+if not logger.handlers:
+    logger.setLevel(logging.INFO)
+    handler = logging.StreamHandler()
+    logger.addHandler(handler)
+
+class MedAlpacaClient:
+    """Local MedAlpaca-13b client for medical text generation"""
+
+    def __init__(self, model_name: str = "medalpaca/medalpaca-13b", hf_token: Optional[str] = None):
+        self.model_name = model_name
+        self.hf_token = hf_token or os.getenv("HF_TOKEN")
+        self.model = None
+        self.tokenizer = None
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.is_loaded = False
+        logger.info(f"[LOCAL_LLM] Initializing MedAlpaca client on device: {self.device}")
+
+    def load_model(self):
+        """Load the MedAlpaca model and tokenizer"""
+        if self.is_loaded:
+            return
+        try:
+            logger.info(f"[LOCAL_LLM] Loading MedAlpaca model: {self.model_name}")
+            cache_dir = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface"))
+
+            # Configure 4-bit quantization for memory efficiency (GPU only)
+            if self.device == "cuda":
+                quantization_config = BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_compute_dtype=torch.float16,
+                    bnb_4bit_use_double_quant=True,
+                    bnb_4bit_quant_type="nf4"
+                )
+            else:
+                quantization_config = None
+
+            # Load tokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                self.model_name,
+                token=self.hf_token,
+                cache_dir=cache_dir
+            )
+
+            # Add padding token if not present
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+
+            # Load model
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.model_name,
+                token=self.hf_token,
+                cache_dir=cache_dir,
+                quantization_config=quantization_config,
+                device_map="auto" if self.device == "cuda" else None,
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
+                trust_remote_code=True
+            )
+            if self.device == "cpu":
+                self.model = self.model.to(self.device)
+
+            self.is_loaded = True
+            logger.info("[LOCAL_LLM] MedAlpaca model loaded successfully")
+        except Exception as e:
+            logger.error(f"[LOCAL_LLM] Failed to load model: {e}")
+            raise
+
+    def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.2) -> Optional[str]:
+        """Generate text using the MedAlpaca model"""
+        if not self.is_loaded:
+            self.load_model()
+        try:
+            # Format prompt for MedAlpaca
+            formatted_prompt = self._format_prompt(prompt)
+
+            # Tokenize input
+            inputs = self.tokenizer(
+                formatted_prompt,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=2048
+            ).to(self.device)
+
+            # Greedy decoding when temperature is 0; sampling otherwise
+            # (do_sample=True with temperature=0.0 is rejected by transformers)
+            gen_kwargs = dict(
+                max_new_tokens=max_tokens,
+                do_sample=temperature > 0,
+                pad_token_id=self.tokenizer.eos_token_id,
+                eos_token_id=self.tokenizer.eos_token_id,
+                repetition_penalty=1.1
+            )
+            if temperature > 0:
+                gen_kwargs["temperature"] = temperature
+
+            with torch.no_grad():
+                outputs = self.model.generate(**inputs, **gen_kwargs)
+
+            # Decode only the newly generated tokens
+            generated_text = self.tokenizer.decode(
+                outputs[0][inputs['input_ids'].shape[1]:],
+                skip_special_tokens=True
+            ).strip()
+
+            # Clean up response
+            cleaned_text = self._clean_response(generated_text)
+            logger.info(f"[LOCAL_LLM] Generated: {self._snip(cleaned_text)}")
+            return cleaned_text
+        except Exception as e:
+            logger.error(f"[LOCAL_LLM] Generation failed: {e}")
+            return None
+
+    def _format_prompt(self, prompt: str) -> str:
+        """Format prompt for the MedAlpaca model"""
+        # MedAlpaca uses a specific format for medical Q&A
+        if "Question:" in prompt and "Answer:" in prompt:
+            return prompt
+        elif "Context:" in prompt and "Question:" in prompt:
+            return prompt
+        else:
+            # Simple medical Q&A format
+            return f"Question: {prompt}\n\nAnswer:"
+
+    def _clean_response(self, text: str) -> str:
+        """Strip boilerplate prefixes from a generated response"""
+        if not text:
+            return text
+        prefixes_to_remove = [
+            "Answer:",
+            "The answer is:",
+            "Based on the information provided:",
+            "Here's the answer:",
+            "Here is the answer:",
+        ]
+        text = text.strip()
+        for prefix in prefixes_to_remove:
+            if text.startswith(prefix):
+                text = text[len(prefix):].strip()
+                break
+        return text
+
+    def _snip(self, text: str, max_words: int = 12) -> str:
+        """Truncate text for logging"""
+        if not text:
+            return "∅"
+        words = text.strip().split()
+        return " ".join(words[:max_words]) + (" …" if len(words) > max_words else "")
+
+    def unload_model(self):
+        """Unload model to free memory"""
+        if self.model is not None:
+            del self.model
+            self.model = None
+        if self.tokenizer is not None:
+            del self.tokenizer
+            self.tokenizer = None
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        gc.collect()
+        self.is_loaded = False
+        logger.info("[LOCAL_LLM] Model unloaded and memory freed")
+
+class LocalParaphraser:
+    """Local paraphraser using the MedAlpaca model"""
+
+    def __init__(self, model_name: str = "medalpaca/medalpaca-13b", hf_token: Optional[str] = None):
+        self.client = MedAlpacaClient(model_name, hf_token)
+
+    def paraphrase(self, text: str, difficulty: str = "easy", custom_prompt: Optional[str] = None) -> str:
+        """Paraphrase text using MedAlpaca"""
+        if not text or len(text) < 12:
+            return text
+        if custom_prompt:
+            prompt = custom_prompt
+        else:
+            prompt = (
+                "Paraphrase the following medical text concisely; preserve meaning and clinical terms.\n"
+                "Do not fabricate or remove factual claims.\n"
+                "Return ONLY the rewritten text, without any introduction or commentary.\n\n"
+                f"Original text: {text}"
+            )
+        result = self.client.generate(prompt, max_tokens=min(600, max(128, len(text)//2)), temperature=0.1)
+        return result if result else text
+
+    def translate(self, text: str, target_lang: str = "vi") -> Optional[str]:
+        """Translate text using MedAlpaca"""
+        if not text:
+            return text
+        prompt = f"Translate the following medical text to {target_lang}. Keep meaning exact, preserve medical terms:\n\n{text}"
+        result = self.client.generate(prompt, max_tokens=min(800, len(text)+100), temperature=0.0)
+        return result.strip() if result else None
+
+    def backtranslate(self, text: str, via_lang: str = "vi") -> Optional[str]:
+        """Backtranslate text (round-trip through via_lang) using MedAlpaca"""
+        if not text:
+            return text
+        # First translate to the pivot language
+        translated = self.translate(text, target_lang=via_lang)
+        if not translated:
+            return None
+        # Then translate back to English
+        prompt = f"Translate the following {via_lang} text back to English, preserving the exact meaning:\n\n{translated}"
+        result = self.client.generate(prompt, max_tokens=min(900, len(text)+150), temperature=0.0)
+        return result.strip() if result else None
+
+    def consistency_check(self, user: str, output: str) -> bool:
+        """Check answer consistency using MedAlpaca"""
+        prompt = (
+            "You are a strict medical QA validator. Given the USER input (question+context) "
+            "and the MODEL ANSWER, reply with exactly 'PASS' if the answer is supported and safe, "
+            "otherwise 'FAIL'. No extra text.\n\n"
+            f"USER:\n{user}\n\nANSWER:\n{output}"
+        )
+        result = self.client.generate(prompt, max_tokens=3, temperature=0.0)
+        return isinstance(result, str) and "PASS" in result.upper()
+
+    def unload(self):
+        """Unload the model"""
+        self.client.unload_model()
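A hypothetical usage sketch of the class above (assumes `HF_TOKEN` is set and the local-mode dependencies are installed; the model loads lazily on the first call):

```python
# Hypothetical smoke test for utils/local_llm.py
import os
from utils.local_llm import LocalParaphraser

p = LocalParaphraser(hf_token=os.getenv("HF_TOKEN"))
print(p.paraphrase("Hypertension is a major risk factor for stroke.", difficulty="easy"))

round_trip = p.backtranslate("Aspirin inhibits platelet aggregation.", via_lang="vi")
print(round_trip)
print(p.consistency_check(user="What does aspirin do?", output=round_trip or ""))

p.unload()  # free GPU/CPU memory once done
```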
utils/rag.py CHANGED
@@ -6,7 +6,7 @@ import random
 from typing import Dict, List, Tuple, Optional, Callable
 
 from utils.schema import sft_row, rag_row
-from utils.llm import NvidiaClient, KeyRotator
+from utils.cloud_llm import NvidiaClient, KeyRotator
 from vi.processing import should_translate, translate_rag_row
 from utils import augment as A