Commit a89888b
Parent(s): 235b116

Update local setups with dynamic mode setter
Changed files:
- .dockerignore +3 -0
- Dockerfile +17 -2
- README.md +22 -3
- app.py +69 -27
- build.sh +35 -0
- DATA_PROCESSING.md → docs/DATA_PROCESSING.md +0 -0
- LICENSE.txt → docs/LICENSE.txt +0 -0
- docs/LOCAL_MODE.md +128 -0
- REQUEST.md → docs/REQUEST.md +0 -0
- review.md → docs/REVIEW.md +0 -0
- requirements-dev.txt +10 -0
- utils/__init__.py +5 -3
- utils/{llm.py → cloud_llm.py} +0 -0
- utils/local_llm.py +243 -0
- utils/rag.py +1 -1
.dockerignore
ADDED
@@ -0,0 +1,3 @@
+*.md
+*.json
+LICENSE.txt
Dockerfile
CHANGED
@@ -9,9 +9,18 @@ RUN useradd -m -u 1000 user
 ENV HOME=/home/user
 WORKDIR $HOME/app
 
+# Set dynamic mode environment variable (default to cloud mode)
+ARG IS_LOCAL=true
+ENV IS_LOCAL=${IS_LOCAL}
+
 # Install Python dependencies first (better layer caching)
 COPY --chown=user requirements.txt .
-RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
+# Install local mode dependencies if IS_LOCAL is true
+COPY --chown=user requirements-dev.txt .
+RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt && \
+    if [ "$IS_LOCAL" = "true" ]; then \
+        pip install --no-cache-dir -r requirements-dev.txt; \
+    fi
 
 # Copy the application
 COPY --chown=user . .
@@ -25,9 +34,15 @@ ENV SENTENCE_TRANSFORMERS_HOME="$HOME/.cache/huggingface/sentence-transformers"
 ENV MEDGEMMA_HOME="$HOME/.cache/huggingface/sentence-transformers"
 
 # Prepare runtime dirs
-RUN mkdir -p $HOME/app/logs $HOME/app/cache $HOME/app/cache/hf $HOME/app/cache/outputs && \
+RUN mkdir -p $HOME/app/logs $HOME/app/cache $HOME/app/cache/hf $HOME/app/cache/outputs $HOME/app/data && \
    chown -R user:user $HOME/app
 
+# Download MedAlpaca model if in local mode
+RUN if [ "$IS_LOCAL" = "true" ]; then \
+        echo "Downloading MedAlpaca-13b model for local mode..."; \
+        python -c "from huggingface_hub import snapshot_download; import os; snapshot_download('medalpaca/medalpaca-13b', token=os.getenv('HF_TOKEN'), cache_dir='$HOME/.cache/huggingface')"; \
+    fi
+
 USER user
 
 EXPOSE 7860
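
The conditional model download above can also be exercised outside the image, for example to warm a shared cache before building. A minimal sketch of the same `snapshot_download` call the Dockerfile issues, assuming `huggingface_hub` is installed and `HF_TOKEN` is exported:

```python
# Sketch: pre-fetch the MedAlpaca weights the Dockerfile downloads at build time.
# Assumes huggingface_hub is installed and HF_TOKEN is set in the environment.
import os
from huggingface_hub import snapshot_download

path = snapshot_download(
    "medalpaca/medalpaca-13b",
    token=os.getenv("HF_TOKEN"),
    cache_dir=os.path.expanduser("~/.cache/huggingface"),
)
print(f"Weights cached at: {path}")
```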
README.md
CHANGED
@@ -25,6 +25,12 @@ short_description: Data processing with en-vi translation. Derived from 500k mi
 
 ## 🎯 Features
 
+### 🏠 Dual Mode Operation
+- **Local Mode**: MedAlpaca-13b model running locally for privacy and cost efficiency
+- **Cloud Mode**: NVIDIA + Gemini API integration for scalable processing
+- **Dynamic Switching**: Toggle between modes via environment variables
+- **Medical Specialization**: MedAlpaca-13b specifically fine-tuned for medical tasks
+
 ### 🔄 Advanced Data Augmentation
 - **Paraphrasing**: Multi-model rotation (NVIDIA + Gemini) with easy/hard difficulty levels
 - **Backtranslation**: Vietnamese pivot language for semantic preservation
@@ -73,6 +79,18 @@ short_description: Data processing with en-vi translation. Derived from 500k mi
 
 ## ⚙️ Configuration
 
+### Mode Selection
+```bash
+# Local Mode (MedAlpaca-13b)
+IS_LOCAL=true
+HF_TOKEN=your_huggingface_token
+
+# Cloud Mode (NVIDIA/Gemini APIs)
+IS_LOCAL=false
+NVIDIA_API_1=your_nvidia_key
+GEMINI_API_1=your_gemini_key
+```
+
 ### Augmentation Parameters
 ```python
 class AugmentOptions:
@@ -140,10 +158,11 @@ curl -X POST "https://huggingface.co/spaces/MedVietAI/processing/rag/healthcarem
 
 ## 📚 Documentation
 
-- [Request Documentation](REQUEST.md)
-- [Data Processing Guide](DATA_PROCESSING.md)
+- [Request Documentation](docs/REQUEST.md)
+- [Data Processing Guide](docs/DATA_PROCESSING.md)
+- [Local Mode Guide](docs/LOCAL_MODE.md)
 
 ## 📄 License
 
-[Apache-2.0 LICENSE](LICENSE.txt)
+[Apache-2.0 LICENSE](docs/LICENSE.txt)
 
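
For a quick smoke test after configuring either mode, the `/status` endpoint shown in the app.py diff below can be polled. A minimal sketch; the base URL is an assumption, adjust it to your deployment:

```python
# Sketch: query the running Space's /status endpoint.
# BASE_URL is an assumption; point it at your local container or hosted Space.
import requests

BASE_URL = "http://localhost:7860"

resp = requests.get(f"{BASE_URL}/status", timeout=10)
resp.raise_for_status()
print(resp.json())  # raw STATE dict returned via JSONResponse(STATE)
```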
app.py
CHANGED
@@ -15,7 +15,8 @@ from utils.datasets import resolve_dataset, hf_download_dataset
 from utils.processor import process_file_into_sft
 from utils.rag import process_file_into_rag
 from utils.drive_saver import DriveSaver
-from utils.llm import Paraphraser
+from utils.cloud_llm import Paraphraser
+from utils.local_llm import LocalParaphraser
 from utils.schema import CentralisedWriter, RAGWriter
 from utils.token import get_credentials, exchange_code, build_auth_url
 from vi.translator import VietnameseTranslator
@@ -30,29 +31,54 @@ if not logger.handlers:
 # ────────── Boot ──────────
 load_dotenv(override=True)
 
+# Check if running in local mode
+IS_LOCAL = os.getenv("IS_LOCAL", "false").lower() == "true"
+
 SPACE_NAME = os.getenv("SPACE_NAME", "MedAI Processor")
 OUTPUT_DIR = os.path.abspath(os.getenv("OUTPUT_DIR", "cache/outputs"))
 LOG_DIR = os.path.abspath(os.getenv("LOG_DIR", "logs"))
+
+# In local mode, use data/ folder instead of cache/outputs
+if IS_LOCAL:
+    OUTPUT_DIR = os.path.abspath("data")
+    logger.info(f"[MODE] Running in LOCAL mode - outputs will be saved to: {OUTPUT_DIR}")
+else:
+    logger.info(f"[MODE] Running in CLOUD mode - outputs will be saved to: {OUTPUT_DIR}")
+
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 os.makedirs(LOG_DIR, exist_ok=True)
 
-# --- Bootstrap Google OAuth ---
-try:
-    creds = get_credentials()
-    if creds:
-        logger.info("✅ OAuth credentials loaded and valid")
-except Exception as e:
-    logger.warning(f"⚠️ OAuth not initialized yet: {e}")
+# --- Bootstrap Google OAuth (only in cloud mode) ---
+if not IS_LOCAL:
+    try:
+        creds = get_credentials()
+        if creds:
+            logger.info("✅ OAuth credentials loaded and valid")
+    except Exception as e:
+        logger.warning(f"⚠️ OAuth not initialized yet: {e}")
 
-# --- Bootstrap Google Drive ---
-drive = DriveSaver(default_folder_id=os.getenv("GDRIVE_FOLDER_ID"))
+    # --- Bootstrap Google Drive (only in cloud mode) ---
+    drive = DriveSaver(default_folder_id=os.getenv("GDRIVE_FOLDER_ID"))
+else:
+    drive = None
+    logger.info("🚀 Local mode: Skipping Google Drive setup")
 
-# Initialize paraphraser
-paraphraser = Paraphraser(
-    nvidia_model=os.getenv("NVIDIA_MODEL", "meta/llama-3.1-8b-instruct"),
-    gemini_model_easy=os.getenv("GEMINI_MODEL_EASY", "gemini-2.5-flash-lite"),
-    gemini_model_hard=os.getenv("GEMINI_MODEL_HARD", "gemini-2.5-flash"),
-)
+# Initialize paraphraser based on mode
+if IS_LOCAL:
+    # Local mode: Use MedAlpaca model
+    logger.info("🏠 Initializing local MedAlpaca paraphraser...")
+    paraphraser = LocalParaphraser(
+        model_name="medalpaca/medalpaca-13b",
+        hf_token=os.getenv("HF_TOKEN")
+    )
+else:
+    # Cloud mode: Use existing NVIDIA/Gemini setup
+    logger.info("☁️ Initializing cloud paraphraser (NVIDIA/Gemini)...")
+    paraphraser = Paraphraser(
+        nvidia_model=os.getenv("NVIDIA_MODEL", "meta/llama-3.1-8b-instruct"),
+        gemini_model_easy=os.getenv("GEMINI_MODEL_EASY", "gemini-2.5-flash-lite"),
+        gemini_model_hard=os.getenv("GEMINI_MODEL_HARD", "gemini-2.5-flash"),
+    )
 
 # Vietnamese translator (currently using Helsinki-NLP/opus-mt-en-vi)
 vietnamese_translator = VietnameseTranslator()
@@ -123,6 +149,11 @@ def root():
     <h1>📊 {SPACE_NAME} – Medical Dataset Augmenter</h1>
     <p>This Hugging Face Space processes medical datasets into a <b>centralised fine-tuning format</b>
     (JSONL + CSV), with optional <i>data augmentation</i>.</p>
+
+    <div style="margin-bottom: 15px; padding: 10px; background: {'#e8f5e8' if IS_LOCAL else '#e8f0ff'}; border-radius: 5px; border-left: 4px solid {'#28a745' if IS_LOCAL else '#007bff'};">
+        <strong>🔧 Current Mode:</strong> {'🏠 LOCAL (MedAlpaca-13b)' if IS_LOCAL else '☁️ CLOUD (NVIDIA/Gemini APIs)'}
+        <br><small>Outputs will be saved to: {OUTPUT_DIR}</small>
+    </div>
 
     <div class="section">
         <h2>⚡ Quick Actions</h2>
@@ -155,7 +186,7 @@
     <ul>
         <li><a href="/status" target="_blank">Check current job status</a></li>
        <li><a href="/files" target="_blank">List generated artifacts</a></li>
-        <li><a href="https://medvietai-processing.hf.space/oauth2/start" target="_blank">Authorize your GCS credential</a></li>
+        {'<li><a href="https://medvietai-processing.hf.space/oauth2/start" target="_blank">Authorize your GCS credential</a></li>' if not IS_LOCAL else ''}
        <li><a href="https://huggingface.co/spaces/BinKhoaLe1812/MedAI_Processing/blob/main/REQUEST.md" target="_blank">📑 Request Doc (all curl examples)</a></li>
    </ul>
    </div>
@@ -242,9 +273,12 @@
     with STATE_LOCK:
         return JSONResponse(STATE)
 
-# ──────── GCS token ────────
+# ──────── GCS token (only in cloud mode) ────────
 @app.get("/oauth2/start")
 def oauth2_start(request: Request):
+    if IS_LOCAL:
+        raise HTTPException(400, "OAuth is not available in local mode. Google Drive integration is disabled.")
+
     # Compute redirect URI dynamically from the actual host the Space is using
     host = request.headers.get("x-forwarded-host") or request.headers.get("host")
     scheme = "https"  # Spaces are HTTPS at the edge
@@ -256,9 +290,12 @@
     except Exception as e:
         raise HTTPException(500, f"OAuth init failed: {e}")
 
-# Display your token
+# Display your token (only in cloud mode)
 @app.get("/oauth2/callback")
 def oauth2_callback(request: Request, code: str = "", state: str = ""):
+    if IS_LOCAL:
+        raise HTTPException(400, "OAuth is not available in local mode. Google Drive integration is disabled.")
+
     if not code:
         raise HTTPException(400, "Missing 'code'")
     # Send req
@@ -448,14 +485,19 @@
     logger.info(f"[JOB] Processed dataset={dataset_key} rows={count} stats={stats}")
     writer.close()
 
-    # Upload to GDrive
-    set_state(message="uploading to Google Drive", progress=0.95)
-    up1 = drive.upload_file_to_drive(jsonl_path, mimetype="application/json")
-    up2 = drive.upload_file_to_drive(csv_path, mimetype="text/csv")
-    logger.info(
-        f"[JOB] Uploads complete uploaded={bool(up1 and up2)} "
-        f"jsonl={jsonl_path} csv={csv_path}"
-    )
+    # Upload to GDrive (only in cloud mode) or save locally
+    if IS_LOCAL:
+        set_state(message="saving files locally", progress=0.95)
+        logger.info(f"[JOB] Files saved locally: jsonl={jsonl_path} csv={csv_path}")
+        up1 = up2 = True  # Local mode always "succeeds"
+    else:
+        set_state(message="uploading to Google Drive", progress=0.95)
+        up1 = drive.upload_file_to_drive(jsonl_path, mimetype="application/json")
+        up2 = drive.upload_file_to_drive(csv_path, mimetype="text/csv")
+        logger.info(
+            f"[JOB] Uploads complete uploaded={bool(up1 and up2)} "
+            f"jsonl={jsonl_path} csv={csv_path}"
+        )
 
     # Finalize a task
     result = {
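
Since both branches above bind the same `paraphraser` name, the job pipeline can stay mode-agnostic. A minimal sketch of that duck-typed call site; `LocalParaphraser.paraphrase(text, difficulty=...)` is confirmed by the utils/local_llm.py diff below, while the cloud `Paraphraser` exposing the same signature is an assumption here:

```python
# Sketch: mode-agnostic call site for either paraphraser bound above.
# paraphrase(text, difficulty=...) is confirmed for LocalParaphraser;
# the cloud Paraphraser sharing that signature is an assumption.
def augment_text(paraphraser, text: str, hard: bool = False) -> str:
    difficulty = "hard" if hard else "easy"
    try:
        return paraphraser.paraphrase(text, difficulty=difficulty)
    except Exception:
        return text  # fall back to the original text on any failure

# usage: augment_text(paraphraser, "Aspirin inhibits platelet aggregation.")
```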
build.sh
ADDED
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# Build script for MedAI Processing with dynamic local/cloud mode support
+
+echo "🏗️ MedAI Processing Build Script"
+echo "=================================="
+
+# Check if mode is specified
+if [ "$1" = "local" ]; then
+    echo "🏠 Building in LOCAL mode (MedAlpaca-13b)"
+    docker build --build-arg IS_LOCAL=true -t medai-processing:local .
+elif [ "$1" = "cloud" ]; then
+    echo "☁️ Building in CLOUD mode (NVIDIA/Gemini APIs)"
+    docker build --build-arg IS_LOCAL=false -t medai-processing:cloud .
+else
+    echo "Usage: $0 [local|cloud]"
+    echo ""
+    echo "  local  - Build with MedAlpaca-13b model for local inference"
+    echo "  cloud  - Build with NVIDIA/Gemini API integration"
+    echo ""
+    echo "Examples:"
+    echo "  $0 local   # Build for local mode"
+    echo "  $0 cloud   # Build for cloud mode"
+    exit 1
+fi
+
+echo ""
+echo "✅ Build completed successfully!"
+echo ""
+echo "To run the container:"
+if [ "$1" = "local" ]; then
+    echo "  docker run -p 7860:7860 -e HF_TOKEN=your_token_here medai-processing:local"
+else
+    echo "  docker run -p 7860:7860 -e NVIDIA_API_1=your_key -e GEMINI_API_1=your_key medai-processing:cloud"
+fi
DATA_PROCESSING.md → docs/DATA_PROCESSING.md
RENAMED
File without changes

LICENSE.txt → docs/LICENSE.txt
RENAMED
File without changes
docs/LOCAL_MODE.md
ADDED
@@ -0,0 +1,128 @@
+# Local Mode Documentation
+
+## Overview
+
+The MedAI Processing system now supports two modes of operation:
+
+- **Cloud Mode** (default): Uses NVIDIA and Gemini APIs for processing
+- **Local Mode**: Uses MedAlpaca-13b model running locally for processing
+
+## Local Mode Features
+
+### Local Mode Benefits
+- **No API costs**: Process data without external API calls
+- **Privacy**: All processing happens locally
+- **Offline capability**: Works without internet connection (after model download)
+- **Medical specialization**: Uses MedAlpaca-13b, a model specifically fine-tuned for medical tasks
+
+### Technical Details
+- **Model**: [MedAlpaca-13b](https://huggingface.co/medalpaca/medalpaca-13b)
+- **Quantization**: 4-bit quantization for memory efficiency
+- **CUDA Support**: Automatic GPU acceleration when available
+- **Memory Management**: Automatic model unloading to free memory
+
+## Building and Running
+
+### Build Script
+Use the provided build script for easy building:
+
+```bash
+# Build for local mode
+./build.sh local
+
+# Build for cloud mode
+./build.sh cloud
+```
+
+### Manual Docker Build
+
+#### Local Mode
+```bash
+docker build --build-arg IS_LOCAL=true -t medai-processing:local .
+```
+
+#### Cloud Mode
+```bash
+docker build --build-arg IS_LOCAL=false -t medai-processing:cloud .
+```
+
+## Environment Variables
+
+### Local Mode Required
+- `IS_LOCAL=true`: Enables local mode
+- `HF_TOKEN`: Hugging Face token for model download (default: provided token)
+
+### Local Mode Optional
+- `HF_HOME`: Hugging Face cache directory (default: ~/.cache/huggingface)
+
+### Cloud Mode Required
+- `IS_LOCAL=false`: Enables cloud mode (default)
+- `NVIDIA_API_1`: NVIDIA API key
+- `GEMINI_API_1`: Gemini API key
+
+## Output Differences
+
+### Local Mode
+- **Output Location**: `data/` folder (local filesystem)
+- **No Google Drive**: Files are saved locally only
+- **No OAuth**: Google Drive authentication is disabled
+
+### Cloud Mode
+- **Output Location**: `cache/outputs/` folder
+- **Google Drive**: Files are uploaded to Google Drive
+- **OAuth**: Google Drive authentication is available
+
+## Model Information
+
+### MedAlpaca-13b
+- **Size**: 13 billion parameters
+- **Specialization**: Medical domain tasks
+- **Training Data**:
+  - ChatDoctor (200k Q&A pairs)
+  - WikiDoc (67k items)
+  - StackExchange (academia, biology, fitness, health)
+  - Anki flashcards (33k items)
+
+### Performance Considerations
+- **Memory**: Requires ~8GB RAM (with 4-bit quantization)
+- **GPU**: CUDA acceleration recommended for faster inference
+- **Storage**: Model download requires ~7GB disk space
+
+## Usage Examples
+
+### Processing with Local Mode
+1. Set `IS_LOCAL=true` in environment
+2. Provide `HF_TOKEN` for model access
+3. Run processing jobs - they will use MedAlpaca locally
+4. Output files will be saved to `data/` folder
+
+### Processing with Cloud Mode
+1. Set `IS_LOCAL=false` (or omit)
+2. Provide NVIDIA and Gemini API keys
+3. Run processing jobs - they will use external APIs
+4. Output files will be uploaded to Google Drive
+
+## Troubleshooting
+
+### Local Mode Issues
+- **Model download fails**: Check HF_TOKEN and internet connection
+- **Out of memory**: Ensure sufficient RAM (8GB+ recommended)
+- **Slow inference**: Enable CUDA if available
+
+### Cloud Mode Issues
+- **API errors**: Check API keys and quotas
+- **Upload failures**: Verify Google Drive authentication
+
+## Migration Guide
+
+### From Cloud to Local
+1. Update environment: `IS_LOCAL=true`
+2. Add HF_TOKEN
+3. Rebuild container with local mode
+4. Output will switch from Google Drive to local `data/` folder
+
+### From Local to Cloud
+1. Update environment: `IS_LOCAL=false`
+2. Add NVIDIA and Gemini API keys
+3. Rebuild container with cloud mode
+4. Output will switch from local to Google Drive
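
The environment tables above map directly onto a startup preflight check. A minimal sketch; the variable names come from the doc, while the helper itself is illustrative and not part of this commit:

```python
# Sketch: validate the per-mode environment described in docs/LOCAL_MODE.md.
# Variable names are from the doc; this helper is not committed code.
import os

def check_mode_env() -> str:
    is_local = os.getenv("IS_LOCAL", "false").lower() == "true"
    required = ["HF_TOKEN"] if is_local else ["NVIDIA_API_1", "GEMINI_API_1"]
    missing = [name for name in required if not os.getenv(name)]
    if missing:
        raise RuntimeError(f"Missing env vars for this mode: {missing}")
    return "local" if is_local else "cloud"

print(f"Configured for {check_mode_env()} mode")
```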
REQUEST.md → docs/REQUEST.md
RENAMED
File without changes

review.md → docs/REVIEW.md
RENAMED
File without changes
requirements-dev.txt
ADDED
@@ -0,0 +1,10 @@
+# Local mode dependencies for MedAlpaca-13b inference
+torch>=2.0.0
+torch-cuda>=2.0.0
+accelerate>=0.20.0
+bitsandbytes>=0.41.0
+peft>=0.4.0
+datasets>=2.14.0
+evaluate>=0.4.0
+scipy>=1.10.0
+scikit-learn>=1.3.0
utils/__init__.py
CHANGED
@@ -3,7 +3,8 @@ Utility package for the Medical Dataset Augmenter Space.
 
 This package provides:
 - drive_saver: Google Drive upload helper
-- llm: API key rotation, paraphraser, translation/backtranslation
+- cloud_llm: API key rotation, paraphraser, translation/backtranslation
+- local_llm: Load medalpaca-13B for augmentation, processing and translation
 - datasets: Hugging Face dataset resolver & downloader
 - processor: dataset-specific processing pipeline with augmentation
 - schema: centralised SFT writer (JSONL + CSV)
@@ -12,11 +13,12 @@ This package provides:
 """
 
 from . import drive_saver
-from . import llm
+from . import cloud_llm
+from . import local_llm
 from . import datasets
 from . import processor
 from . import schema
 from . import augment
 from . import token
 
-__all__ = ["drive_saver", "llm", "datasets", "processor", "schema", "augment"]
+__all__ = ["drive_saver", "cloud_llm", "local_llm", "datasets", "processor", "schema", "augment"]
utils/{llm.py → cloud_llm.py}
RENAMED
File without changes
utils/local_llm.py
ADDED
@@ -0,0 +1,243 @@
+# Local MedAlpaca-13b inference client
+import os
+import logging
+import torch
+from typing import Optional
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+import gc
+
+logger = logging.getLogger("local_llm")
+if not logger.handlers:
+    logger.setLevel(logging.INFO)
+    handler = logging.StreamHandler()
+    logger.addHandler(handler)
+
+class MedAlpacaClient:
+    """Local MedAlpaca-13b client for medical text generation"""
+
+    def __init__(self, model_name: str = "medalpaca/medalpaca-13b", hf_token: str = None):
+        self.model_name = model_name
+        self.hf_token = hf_token or os.getenv("HF_TOKEN")
+        self.model = None
+        self.tokenizer = None
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.is_loaded = False
+
+        logger.info(f"[LOCAL_LLM] Initializing MedAlpaca client on device: {self.device}")
+
+    def load_model(self):
+        """Load the MedAlpaca model and tokenizer"""
+        if self.is_loaded:
+            return
+
+        try:
+            logger.info(f"[LOCAL_LLM] Loading MedAlpaca model: {self.model_name}")
+
+            # Configure quantization for memory efficiency
+            if self.device == "cuda":
+                quantization_config = BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_compute_dtype=torch.float16,
+                    bnb_4bit_use_double_quant=True,
+                    bnb_4bit_quant_type="nf4"
+                )
+            else:
+                quantization_config = None
+
+            # Load tokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                self.model_name,
+                token=self.hf_token,
+                cache_dir=os.getenv("HF_HOME", "~/.cache/huggingface")
+            )
+
+            # Add padding token if not present
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+
+            # Load model
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.model_name,
+                token=self.hf_token,
+                cache_dir=os.getenv("HF_HOME", "~/.cache/huggingface"),
+                quantization_config=quantization_config,
+                device_map="auto" if self.device == "cuda" else None,
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
+                trust_remote_code=True
+            )
+
+            if self.device == "cpu":
+                self.model = self.model.to(self.device)
+
+            self.is_loaded = True
+            logger.info("[LOCAL_LLM] MedAlpaca model loaded successfully")
+
+        except Exception as e:
+            logger.error(f"[LOCAL_LLM] Failed to load model: {e}")
+            raise
+
+    def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.2) -> Optional[str]:
+        """Generate text using MedAlpaca model"""
+        if not self.is_loaded:
+            self.load_model()
+
+        try:
+            # Format prompt for MedAlpaca
+            formatted_prompt = self._format_prompt(prompt)
+
+            # Tokenize input
+            inputs = self.tokenizer(
+                formatted_prompt,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=2048
+            ).to(self.device)
+
+            # Generate
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=max_tokens,
+                    temperature=temperature,
+                    do_sample=True,
+                    pad_token_id=self.tokenizer.eos_token_id,
+                    eos_token_id=self.tokenizer.eos_token_id,
+                    repetition_penalty=1.1
+                )
+
+            # Decode output
+            generated_text = self.tokenizer.decode(
+                outputs[0][inputs['input_ids'].shape[1]:],
+                skip_special_tokens=True
+            ).strip()
+
+            # Clean up response
+            cleaned_text = self._clean_response(generated_text)
+
+            logger.info(f"[LOCAL_LLM] Generated: {self._snip(cleaned_text)}")
+            return cleaned_text
+
+        except Exception as e:
+            logger.error(f"[LOCAL_LLM] Generation failed: {e}")
+            return None
+
+    def _format_prompt(self, prompt: str) -> str:
+        """Format prompt for MedAlpaca model"""
+        # MedAlpaca uses a specific format for medical Q&A
+        if "Question:" in prompt and "Answer:" in prompt:
+            return prompt
+        elif "Context:" in prompt and "Question:" in prompt:
+            return prompt
+        else:
+            # Simple medical Q&A format
+            return f"Question: {prompt}\n\nAnswer:"
+
+    def _clean_response(self, text: str) -> str:
+        """Clean generated response"""
+        if not text:
+            return text
+
+        # Remove common prefixes
+        prefixes_to_remove = [
+            "Answer:",
+            "The answer is:",
+            "Based on the information provided:",
+            "Here's the answer:",
+            "Here is the answer:",
+        ]
+
+        text = text.strip()
+        for prefix in prefixes_to_remove:
+            if text.startswith(prefix):
+                text = text[len(prefix):].strip()
+                break
+
+        return text
+
+    def _snip(self, text: str, max_words: int = 12) -> str:
+        """Truncate text for logging"""
+        if not text:
+            return "∅"
+        words = text.strip().split()
+        return " ".join(words[:max_words]) + (" …" if len(words) > max_words else "")
+
+    def unload_model(self):
+        """Unload model to free memory"""
+        if self.model is not None:
+            del self.model
+            self.model = None
+        if self.tokenizer is not None:
+            del self.tokenizer
+            self.tokenizer = None
+
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        gc.collect()
+
+        self.is_loaded = False
+        logger.info("[LOCAL_LLM] Model unloaded and memory freed")
+
+class LocalParaphraser:
+    """Local paraphraser using MedAlpaca model"""
+
+    def __init__(self, model_name: str = "medalpaca/medalpaca-13b", hf_token: str = None):
+        self.client = MedAlpacaClient(model_name, hf_token)
+
+    def paraphrase(self, text: str, difficulty: str = "easy", custom_prompt: str = None) -> str:
+        """Paraphrase text using MedAlpaca"""
+        if not text or len(text) < 12:
+            return text
+
+        if custom_prompt:
+            prompt = custom_prompt
+        else:
+            prompt = (
+                "Paraphrase the following medical text concisely, preserve meaning and clinical terms.\n"
+                "Do not fabricate or remove factual claims.\n"
+                "Return ONLY the rewritten text, without any introduction, commentary.\n\n"
+                f"Original text: {text}"
+            )
+
+        result = self.client.generate(prompt, max_tokens=min(600, max(128, len(text)//2)), temperature=0.1)
+        return result if result else text
+
+    def translate(self, text: str, target_lang: str = "vi") -> Optional[str]:
+        """Translate text using MedAlpaca"""
+        if not text:
+            return text
+
+        prompt = f"Translate the following medical text to {target_lang}. Keep meaning exact, preserve medical terms:\n\n{text}"
+        result = self.client.generate(prompt, max_tokens=min(800, len(text)+100), temperature=0.0)
+        return result.strip() if result else None
+
+    def backtranslate(self, text: str, via_lang: str = "vi") -> Optional[str]:
+        """Backtranslate text using MedAlpaca"""
+        if not text:
+            return text
+
+        # First translate to target language
+        translated = self.translate(text, target_lang=via_lang)
+        if not translated:
+            return None
+
+        # Then translate back to English
+        prompt = f"Translate the following {via_lang} text back to English, preserving the exact meaning:\n\n{translated}"
+        result = self.client.generate(prompt, max_tokens=min(900, len(text)+150), temperature=0.0)
+        return result.strip() if result else None
+
+    def consistency_check(self, user: str, output: str) -> bool:
+        """Check consistency using MedAlpaca"""
+        prompt = (
+            "You are a strict medical QA validator. Given the USER input (question+context) "
+            "and the MODEL ANSWER, reply with exactly 'PASS' if the answer is supported and safe, "
+            "otherwise 'FAIL'. No extra text.\n\n"
+            f"USER:\n{user}\n\nANSWER:\n{output}"
+        )
+
+        result = self.client.generate(prompt, max_tokens=3, temperature=0.0)
+        return isinstance(result, str) and "PASS" in result.upper()
+
+    def unload(self):
+        """Unload the model"""
+        self.client.unload_model()
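
Putting the new module to work end to end, a short usage sketch; the method names come straight from the file above, though actually running it needs the ~8GB RAM budget noted in docs/LOCAL_MODE.md:

```python
# Sketch: exercising LocalParaphraser using only the methods defined above.
from utils.local_llm import LocalParaphraser

p = LocalParaphraser()  # defaults: medalpaca/medalpaca-13b, HF_TOKEN from env

rewritten = p.paraphrase("The patient presents with acute chest pain.")
roundtrip = p.backtranslate("Hypertension increases cardiovascular risk.", via_lang="vi")
ok = p.consistency_check(user="What does aspirin do?", output=rewritten)

print(rewritten, roundtrip, ok, sep="\n")
p.unload()  # release GPU/CPU memory when done
```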
utils/rag.py
CHANGED
@@ -6,7 +6,7 @@ import random
 from typing import Dict, List, Tuple, Optional, Callable
 
 from utils.schema import sft_row, rag_row
-from utils.llm import NvidiaClient, KeyRotator
+from utils.cloud_llm import NvidiaClient, KeyRotator
 from vi.processing import should_translate, translate_rag_row
 from utils import augment as A
 