LiamKhoaLe committed
Commit a89888b · 1 Parent(s): 235b116

Update local setups with dynamic mode setter
.dockerignore ADDED
@@ -0,0 +1,3 @@
+*.md
+*.json
+LICENSE.txt
Dockerfile CHANGED
@@ -9,9 +9,18 @@ RUN useradd -m -u 1000 user
 ENV HOME=/home/user
 WORKDIR $HOME/app
 
+# Dynamic mode switch (build-arg default is local mode; pass --build-arg IS_LOCAL=false for cloud builds)
+ARG IS_LOCAL=true
+ENV IS_LOCAL=${IS_LOCAL}
+
 # Install Python dependencies first (better layer caching)
 COPY --chown=user requirements.txt .
-RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt
+# Install local mode dependencies if IS_LOCAL is true
+COPY --chown=user requirements-dev.txt .
+RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt && \
+    if [ "$IS_LOCAL" = "true" ]; then \
+        pip install --no-cache-dir -r requirements-dev.txt; \
+    fi
 
 # Copy the application
 COPY --chown=user . .
@@ -25,9 +34,15 @@ ENV SENTENCE_TRANSFORMERS_HOME="$HOME/.cache/huggingface/sentence-transformers"
 ENV MEDGEMMA_HOME="$HOME/.cache/huggingface/sentence-transformers"
 
 # Prepare runtime dirs
-RUN mkdir -p $HOME/app/logs $HOME/app/cache $HOME/app/cache/hf $HOME/app/cache/outputs && \
+RUN mkdir -p $HOME/app/logs $HOME/app/cache $HOME/app/cache/hf $HOME/app/cache/outputs $HOME/app/data && \
     chown -R user:user $HOME/app
 
+# Download MedAlpaca model if in local mode
+RUN if [ "$IS_LOCAL" = "true" ]; then \
+        echo "Downloading MedAlpaca-13b model for local mode..."; \
+        python -c "from huggingface_hub import snapshot_download; import os; snapshot_download('medalpaca/medalpaca-13b', token=os.getenv('HF_TOKEN'), cache_dir='$HOME/.cache/huggingface')"; \
+    fi
+
 USER user
 
 EXPOSE 7860
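For local development outside Docker, the same MedAlpaca prefetch can be run directly. A minimal sketch mirroring the `RUN` step above (assumes `huggingface_hub` is installed and `HF_TOKEN` is exported):

```python
# Prefetch MedAlpaca-13b into the local HF cache (mirrors the Dockerfile step above).
import os
from huggingface_hub import snapshot_download

snapshot_download(
    "medalpaca/medalpaca-13b",
    token=os.getenv("HF_TOKEN"),  # a read-access token is sufficient
    cache_dir=os.path.expanduser("~/.cache/huggingface"),
)
```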
README.md CHANGED
@@ -25,6 +25,12 @@ short_description: Data processing with en-vi translation. Derived from 500k mi
 
 ## 🎯 Features
 
+### 🏠 Dual Mode Operation
+- **Local Mode**: MedAlpaca-13b model running locally for privacy and cost efficiency
+- **Cloud Mode**: NVIDIA + Gemini API integration for scalable processing
+- **Dynamic Switching**: Toggle between modes via environment variables
+- **Medical Specialization**: MedAlpaca-13b, fine-tuned specifically for medical tasks
+
 ### 🔄 Advanced Data Augmentation
 - **Paraphrasing**: Multi-model rotation (NVIDIA + Gemini) with easy/hard difficulty levels
 - **Backtranslation**: Vietnamese pivot language for semantic preservation
@@ -73,6 +79,18 @@ short_description: Data processing with en-vi translation. Derived from 500k mi
 
 ## ⚙️ Configuration
 
+### Mode Selection
+```bash
+# Local Mode (MedAlpaca-13b)
+IS_LOCAL=true
+HF_TOKEN=your_huggingface_token
+
+# Cloud Mode (NVIDIA/Gemini APIs)
+IS_LOCAL=false
+NVIDIA_API_1=your_nvidia_key
+GEMINI_API_1=your_gemini_key
+```
+
 ### Augmentation Parameters
 ```python
 class AugmentOptions:
@@ -140,10 +158,11 @@ curl -X POST "https://huggingface.co/spaces/MedVietAI/processing/rag/healthcarem
 
 ## 📚 Documentation
 
-- [Request Documentation](https://huggingface.co/spaces/MedVietAI/processing/blob/main/REQUEST.md)
-- [Data Processing Guide](https://huggingface.co/spaces/MedVietAI/processing/blob/main/DATA_PROCESSING.md)
+- [Request Documentation](docs/REQUEST.md)
+- [Data Processing Guide](docs/DATA_PROCESSING.md)
+- [Local Mode Guide](docs/LOCAL_MODE.md)
 
 ## 📄 License
 
-[Apache-2.0 LICENSE](https://huggingface.co/spaces/MedVietAI/processing/blob/main/LICENSE.txt)
+[Apache-2.0 LICENSE](docs/LICENSE.txt)
 
app.py CHANGED
@@ -15,7 +15,8 @@ from utils.datasets import resolve_dataset, hf_download_dataset
 from utils.processor import process_file_into_sft
 from utils.rag import process_file_into_rag
 from utils.drive_saver import DriveSaver
-from utils.llm import Paraphraser
+from utils.cloud_llm import Paraphraser
+from utils.local_llm import LocalParaphraser
 from utils.schema import CentralisedWriter, RAGWriter
 from utils.token import get_credentials, exchange_code, build_auth_url
 from vi.translator import VietnameseTranslator
@@ -30,29 +31,54 @@ if not logger.handlers:
 # ────────── Boot ──────────
 load_dotenv(override=True)
 
+# Check if running in local mode
+IS_LOCAL = os.getenv("IS_LOCAL", "false").lower() == "true"
+
 SPACE_NAME = os.getenv("SPACE_NAME", "MedAI Processor")
 OUTPUT_DIR = os.path.abspath(os.getenv("OUTPUT_DIR", "cache/outputs"))
 LOG_DIR = os.path.abspath(os.getenv("LOG_DIR", "logs"))
+
+# In local mode, use data/ folder instead of cache/outputs
+if IS_LOCAL:
+    OUTPUT_DIR = os.path.abspath("data")
+    logger.info(f"[MODE] Running in LOCAL mode - outputs will be saved to: {OUTPUT_DIR}")
+else:
+    logger.info(f"[MODE] Running in CLOUD mode - outputs will be saved to: {OUTPUT_DIR}")
+
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 os.makedirs(LOG_DIR, exist_ok=True)
 
-# --- Bootstrap Google OAuth ---
-try:
-    creds = get_credentials()
-    if creds:
-        logger.info("✅ OAuth credentials loaded and valid")
-except Exception as e:
-    logger.warning(f"⚠️ OAuth not initialized yet: {e}")
+# --- Bootstrap Google OAuth (only in cloud mode) ---
+if not IS_LOCAL:
+    try:
+        creds = get_credentials()
+        if creds:
+            logger.info("✅ OAuth credentials loaded and valid")
+    except Exception as e:
+        logger.warning(f"⚠️ OAuth not initialized yet: {e}")
 
-# --- Bootstrap Google Drive ---
-drive = DriveSaver(default_folder_id=os.getenv("GDRIVE_FOLDER_ID"))
+    # --- Bootstrap Google Drive (only in cloud mode) ---
+    drive = DriveSaver(default_folder_id=os.getenv("GDRIVE_FOLDER_ID"))
+else:
+    drive = None
+    logger.info("🚀 Local mode: Skipping Google Drive setup")
 
-# LLM rotator with paraphraser nodes
-paraphraser = Paraphraser(
-    nvidia_model=os.getenv("NVIDIA_MODEL", "meta/llama-3.1-8b-instruct"),
-    gemini_model_easy=os.getenv("GEMINI_MODEL_EASY", "gemini-2.5-flash-lite"),
-    gemini_model_hard=os.getenv("GEMINI_MODEL_HARD", "gemini-2.5-flash"),
-)
+# Initialize paraphraser based on mode
+if IS_LOCAL:
+    # Local mode: Use MedAlpaca model
+    logger.info("🏠 Initializing local MedAlpaca paraphraser...")
+    paraphraser = LocalParaphraser(
+        model_name="medalpaca/medalpaca-13b",
+        hf_token=os.getenv("HF_TOKEN")
+    )
+else:
+    # Cloud mode: Use existing NVIDIA/Gemini setup
+    logger.info("☁️ Initializing cloud paraphraser (NVIDIA/Gemini)...")
+    paraphraser = Paraphraser(
+        nvidia_model=os.getenv("NVIDIA_MODEL", "meta/llama-3.1-8b-instruct"),
+        gemini_model_easy=os.getenv("GEMINI_MODEL_EASY", "gemini-2.5-flash-lite"),
+        gemini_model_hard=os.getenv("GEMINI_MODEL_HARD", "gemini-2.5-flash"),
    )
 
 # Vietnamese translator (currently using Helsinki-NLP/opus-mt-en-vi)
 vietnamese_translator = VietnameseTranslator()
@@ -123,6 +149,11 @@ def root():
     <h1>📊 {SPACE_NAME} – Medical Dataset Augmenter</h1>
     <p>This Hugging Face Space processes medical datasets into a <b>centralised fine-tuning format</b>
     (JSONL + CSV), with optional <i>data augmentation</i>.</p>
+
+    <div style="margin-bottom: 15px; padding: 10px; background: {'#e8f5e8' if IS_LOCAL else '#e8f0ff'}; border-radius: 5px; border-left: 4px solid {'#28a745' if IS_LOCAL else '#007bff'};">
+      <strong>🔧 Current Mode:</strong> {'🏠 LOCAL (MedAlpaca-13b)' if IS_LOCAL else '☁️ CLOUD (NVIDIA/Gemini APIs)'}
+      <br><small>Outputs will be saved to: {OUTPUT_DIR}</small>
+    </div>
 
     <div class="section">
       <h2>⚡ Quick Actions</h2>
@@ -155,7 +186,7 @@ def root():
       <ul>
        <li><a href="/status" target="_blank">Check current job status</a></li>
        <li><a href="/files" target="_blank">List generated artifacts</a></li>
-       <li><a href="https://medvietai-processing.hf.space/oauth2/start" target="_blank">Authorize your GCS credential</a></li>
+       {'<li><a href="https://medvietai-processing.hf.space/oauth2/start" target="_blank">Authorize your GCS credential</a></li>' if not IS_LOCAL else ''}
        <li><a href="https://huggingface.co/spaces/BinKhoaLe1812/MedAI_Processing/blob/main/REQUEST.md" target="_blank">📑 Request Doc (all curl examples)</a></li>
      </ul>
    </div>
@@ -242,9 +273,12 @@ def status():
     with STATE_LOCK:
         return JSONResponse(STATE)
 
-# ──────── GCS token ────────
+# ──────── GCS token (only in cloud mode) ────────
 @app.get("/oauth2/start")
 def oauth2_start(request: Request):
+    if IS_LOCAL:
+        raise HTTPException(400, "OAuth is not available in local mode. Google Drive integration is disabled.")
+
     # Compute redirect URI dynamically from the actual host the Space is using
     host = request.headers.get("x-forwarded-host") or request.headers.get("host")
     scheme = "https"  # Spaces are HTTPS at the edge
@@ -256,9 +290,12 @@ def oauth2_start(request: Request):
     except Exception as e:
         raise HTTPException(500, f"OAuth init failed: {e}")
 
-# Display your token
+# Display your token (only in cloud mode)
 @app.get("/oauth2/callback")
 def oauth2_callback(request: Request, code: str = "", state: str = ""):
+    if IS_LOCAL:
+        raise HTTPException(400, "OAuth is not available in local mode. Google Drive integration is disabled.")
+
     if not code:
         raise HTTPException(400, "Missing 'code'")
     # Send req
@@ -448,14 +485,19 @@ def _run_job(dataset_key: str, params: ProcessParams):
     logger.info(f"[JOB] Processed dataset={dataset_key} rows={count} stats={stats}")
     writer.close()
 
-    # Upload to GDrive
-    set_state(message="uploading to Google Drive", progress=0.95)
-    up1 = drive.upload_file_to_drive(jsonl_path, mimetype="application/json")
-    up2 = drive.upload_file_to_drive(csv_path, mimetype="text/csv")
-    logger.info(
-        f"[JOB] Uploads complete uploaded={bool(up1 and up2)} "
-        f"jsonl={jsonl_path} csv={csv_path}"
-    )
+    # Upload to GDrive (only in cloud mode) or save locally
+    if IS_LOCAL:
+        set_state(message="saving files locally", progress=0.95)
+        logger.info(f"[JOB] Files saved locally: jsonl={jsonl_path} csv={csv_path}")
+        up1 = up2 = True  # Local mode always "succeeds"
+    else:
+        set_state(message="uploading to Google Drive", progress=0.95)
+        up1 = drive.upload_file_to_drive(jsonl_path, mimetype="application/json")
+        up2 = drive.upload_file_to_drive(csv_path, mimetype="text/csv")
+        logger.info(
+            f"[JOB] Uploads complete uploaded={bool(up1 and up2)} "
+            f"jsonl={jsonl_path} csv={csv_path}"
+        )
 
     # Finalize a task
     result = {
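To confirm which mode a running container picked up, probe the endpoints wired above. A minimal sketch, assuming the app listens on localhost:7860 and the third-party `requests` package is available:

```python
# Probe the mode-dependent endpoints of a running container.
import requests

BASE = "http://localhost:7860"

# /status returns the shared job STATE as JSON in both modes.
print(requests.get(f"{BASE}/status").json())

# /oauth2/start responds with HTTP 400 in local mode (OAuth disabled);
# in cloud mode it proceeds with the Google OAuth flow instead.
r = requests.get(f"{BASE}/oauth2/start", allow_redirects=False)
print(r.status_code)
```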
build.sh ADDED
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -e  # exit immediately if a command (e.g. docker build) fails
+
+# Build script for MedAI Processing with dynamic local/cloud mode support
+
+echo "🏗️ MedAI Processing Build Script"
+echo "=================================="
+
+# Check if mode is specified
+if [ "$1" = "local" ]; then
+    echo "🏠 Building in LOCAL mode (MedAlpaca-13b)"
+    docker build --build-arg IS_LOCAL=true -t medai-processing:local .
+elif [ "$1" = "cloud" ]; then
+    echo "☁️ Building in CLOUD mode (NVIDIA/Gemini APIs)"
+    docker build --build-arg IS_LOCAL=false -t medai-processing:cloud .
+else
+    echo "Usage: $0 [local|cloud]"
+    echo ""
+    echo "  local - Build with MedAlpaca-13b model for local inference"
+    echo "  cloud - Build with NVIDIA/Gemini API integration"
+    echo ""
+    echo "Examples:"
+    echo "  $0 local   # Build for local mode"
+    echo "  $0 cloud   # Build for cloud mode"
+    exit 1
+fi
+
+echo ""
+echo "✅ Build completed successfully!"
+echo ""
+echo "To run the container:"
+if [ "$1" = "local" ]; then
+    echo "  docker run -p 7860:7860 -e HF_TOKEN=your_token_here medai-processing:local"
+else
+    echo "  docker run -p 7860:7860 -e NVIDIA_API_1=your_key -e GEMINI_API_1=your_key medai-processing:cloud"
+fi
DATA_PROCESSING.md → docs/DATA_PROCESSING.md RENAMED
File without changes
LICENSE.txt → docs/LICENSE.txt RENAMED
File without changes
docs/LOCAL_MODE.md ADDED
@@ -0,0 +1,128 @@
+# Local Mode Documentation
+
+## Overview
+
+The MedAI Processing system supports two modes of operation:
+
+- **Cloud Mode** (default): Uses the NVIDIA and Gemini APIs for processing
+- **Local Mode**: Uses the MedAlpaca-13b model running locally for processing
+
+## Local Mode Features
+
+### Benefits
+- **No API costs**: Process data without external API calls
+- **Privacy**: All processing happens locally
+- **Offline capability**: Works without an internet connection (after the initial model download)
+- **Medical specialization**: Uses MedAlpaca-13b, a model fine-tuned specifically for medical tasks
+
+### Technical Details
+- **Model**: [MedAlpaca-13b](https://huggingface.co/medalpaca/medalpaca-13b)
+- **Quantization**: 4-bit (NF4) quantization on GPU for memory efficiency
+- **CUDA Support**: Automatic GPU acceleration when available
+- **Memory Management**: an `unload()` helper frees GPU/CPU memory when the model is no longer needed
+
+## Building and Running
+
+### Build Script
+Use the provided build script:
+
+```bash
+# Build for local mode
+./build.sh local
+
+# Build for cloud mode
+./build.sh cloud
+```
+
+### Manual Docker Build
+
+#### Local Mode
+```bash
+docker build --build-arg IS_LOCAL=true -t medai-processing:local .
+```
+
+#### Cloud Mode
+```bash
+docker build --build-arg IS_LOCAL=false -t medai-processing:cloud .
+```
+
+## Environment Variables
+
+### Local Mode Required
+- `IS_LOCAL=true`: Enables local mode
+- `HF_TOKEN`: Hugging Face token used to download the model
+
+### Local Mode Optional
+- `HF_HOME`: Hugging Face cache directory (default: ~/.cache/huggingface)
+
+### Cloud Mode Required
+- `IS_LOCAL=false`: Enables cloud mode (the default)
+- `NVIDIA_API_1`: NVIDIA API key
+- `GEMINI_API_1`: Gemini API key
+
+## Output Differences
+
+### Local Mode
+- **Output Location**: `data/` folder (local filesystem)
+- **No Google Drive**: Files are saved locally only
+- **No OAuth**: Google Drive authentication is disabled
+
+### Cloud Mode
+- **Output Location**: `cache/outputs/` folder
+- **Google Drive**: Files are uploaded to Google Drive
+- **OAuth**: Google Drive authentication is available
+
+## Model Information
+
+### MedAlpaca-13b
+- **Size**: 13 billion parameters
+- **Specialization**: Medical domain tasks
+- **Training Data**:
+  - ChatDoctor (200k Q&A pairs)
+  - WikiDoc (67k items)
+  - StackExchange (academia, biology, fitness, health)
+  - Anki flashcards (33k items)
+
+### Performance Considerations
+- **Memory**: roughly 8 GB of GPU memory with 4-bit quantization; the CPU fallback loads full precision and needs substantially more RAM
+- **GPU**: CUDA acceleration recommended for faster inference
+- **Storage**: the download is the full-precision checkpoint (tens of GB); quantization happens at load time, not on disk
+
+## Usage Examples
+
+### Processing with Local Mode
+1. Set `IS_LOCAL=true` in the environment
+2. Provide `HF_TOKEN` for model access
+3. Run processing jobs - they will use MedAlpaca locally
+4. Output files will be saved to the `data/` folder
+
+### Processing with Cloud Mode
+1. Set `IS_LOCAL=false` (or omit it)
+2. Provide NVIDIA and Gemini API keys
+3. Run processing jobs - they will use the external APIs
+4. Output files will be uploaded to Google Drive
+
+## Troubleshooting
+
+### Local Mode Issues
+- **Model download fails**: Check `HF_TOKEN` and the internet connection
+- **Out of memory**: Ensure sufficient GPU memory/RAM (8 GB+ recommended)
+- **Slow inference**: Enable CUDA if available
+
+### Cloud Mode Issues
+- **API errors**: Check API keys and quotas
+- **Upload failures**: Verify Google Drive authentication
+
+## Migration Guide
+
+### From Cloud to Local
+1. Update the environment: `IS_LOCAL=true`
+2. Add `HF_TOKEN`
+3. Rebuild the container in local mode (`./build.sh local`)
+4. Output switches from Google Drive to the local `data/` folder
+
+### From Local to Cloud
+1. Update the environment: `IS_LOCAL=false`
+2. Add the NVIDIA and Gemini API keys
+3. Rebuild the container in cloud mode (`./build.sh cloud`)
+4. Output switches from the local folder to Google Drive
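As a sanity check on the memory figure above, a back-of-envelope estimate (assuming NF4 weights at roughly 0.5 bytes per parameter; the overhead number is an assumption, not a measurement):

```python
# Rough memory estimate for MedAlpaca-13b under 4-bit quantization.
params = 13e9                     # 13 billion parameters
weights_gb = params * 0.5 / 1e9   # NF4 ~ 0.5 bytes/param -> ~6.5 GB
overhead_gb = 1.5                 # activations, KV cache, CUDA context (assumed)
print(f"~{weights_gb + overhead_gb:.0f} GB total")  # ~8 GB
```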
REQUEST.md → docs/REQUEST.md RENAMED
File without changes
review.md → docs/REVIEW.md RENAMED
File without changes
requirements-dev.txt ADDED
@@ -0,0 +1,10 @@
+# Local mode dependencies for MedAlpaca-13b inference
+# Note: CUDA support ships inside the torch wheel; there is no separate torch-cuda package
+torch>=2.0.0
+accelerate>=0.20.0
+bitsandbytes>=0.41.0
+peft>=0.4.0
+datasets>=2.14.0
+evaluate>=0.4.0
+scipy>=1.10.0
+scikit-learn>=1.3.0
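A quick import check can confirm the local-mode stack installed correctly; a minimal sketch using only the packages pinned above:

```python
# Verify the local-mode dependencies and CUDA visibility.
import torch
import accelerate
import bitsandbytes
import peft

print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("accelerate", accelerate.__version__, "| peft", peft.__version__)
```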
utils/__init__.py CHANGED
@@ -3,7 +3,8 @@ Utility package for the Medical Dataset Augmenter Space.
 
 This package provides:
 - drive_saver: Google Drive upload helper
-- llm: API key rotation, paraphraser, translation/backtranslation
+- cloud_llm: API key rotation, paraphraser, translation/backtranslation
+- local_llm: MedAlpaca-13b loader for augmentation, processing and translation
 - datasets: Hugging Face dataset resolver & downloader
 - processor: dataset-specific processing pipeline with augmentation
 - schema: centralised SFT writer (JSONL + CSV)
@@ -12,11 +13,12 @@ This package provides:
 """
 
 from . import drive_saver
-from . import llm
+from . import cloud_llm
+from . import local_llm
 from . import datasets
 from . import processor
 from . import schema
 from . import augment
 from . import token
 
-__all__ = ["drive_saver", "llm", "datasets", "processor", "schema", "augment"]
+__all__ = ["drive_saver", "cloud_llm", "local_llm", "datasets", "processor", "schema", "augment"]
utils/{llm.py → cloud_llm.py} RENAMED
File without changes
utils/local_llm.py ADDED
@@ -0,0 +1,229 @@
+# Local MedAlpaca-13b inference client
+import os
+import gc
+import logging
+from typing import Optional
+
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
+logger = logging.getLogger("local_llm")
+if not logger.handlers:
+    logger.setLevel(logging.INFO)
+    handler = logging.StreamHandler()
+    logger.addHandler(handler)
+
+class MedAlpacaClient:
+    """Local MedAlpaca-13b client for medical text generation"""
+
+    def __init__(self, model_name: str = "medalpaca/medalpaca-13b", hf_token: Optional[str] = None):
+        self.model_name = model_name
+        self.hf_token = hf_token or os.getenv("HF_TOKEN")
+        self.model = None
+        self.tokenizer = None
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.is_loaded = False
+        logger.info(f"[LOCAL_LLM] Initializing MedAlpaca client on device: {self.device}")
+
+    def load_model(self):
+        """Load the MedAlpaca model and tokenizer"""
+        if self.is_loaded:
+            return
+        try:
+            logger.info(f"[LOCAL_LLM] Loading MedAlpaca model: {self.model_name}")
+            cache_dir = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface"))
+
+            # Configure 4-bit quantization for memory efficiency (GPU only)
+            if self.device == "cuda":
+                quantization_config = BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_compute_dtype=torch.float16,
+                    bnb_4bit_use_double_quant=True,
+                    bnb_4bit_quant_type="nf4"
+                )
+            else:
+                quantization_config = None
+
+            # Load tokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                self.model_name,
+                token=self.hf_token,
+                cache_dir=cache_dir
+            )
+
+            # Add padding token if not present
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+
+            # Load model
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.model_name,
+                token=self.hf_token,
+                cache_dir=cache_dir,
+                quantization_config=quantization_config,
+                device_map="auto" if self.device == "cuda" else None,
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
+                trust_remote_code=True
+            )
+            if self.device == "cpu":
+                self.model = self.model.to(self.device)
+
+            self.is_loaded = True
+            logger.info("[LOCAL_LLM] MedAlpaca model loaded successfully")
+        except Exception as e:
+            logger.error(f"[LOCAL_LLM] Failed to load model: {e}")
+            raise
+
+    def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.2) -> Optional[str]:
+        """Generate text using the MedAlpaca model"""
+        if not self.is_loaded:
+            self.load_model()
+        try:
+            # Format prompt for MedAlpaca
+            formatted_prompt = self._format_prompt(prompt)
+
+            # Tokenize input
+            inputs = self.tokenizer(
+                formatted_prompt,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=2048
+            ).to(self.device)
+
+            # Greedy decoding when temperature is 0; sampling otherwise
+            # (do_sample=True with temperature=0.0 is rejected by transformers)
+            gen_kwargs = dict(
+                max_new_tokens=max_tokens,
+                do_sample=temperature > 0,
+                pad_token_id=self.tokenizer.eos_token_id,
+                eos_token_id=self.tokenizer.eos_token_id,
+                repetition_penalty=1.1
+            )
+            if temperature > 0:
+                gen_kwargs["temperature"] = temperature
+
+            with torch.no_grad():
+                outputs = self.model.generate(**inputs, **gen_kwargs)
+
+            # Decode only the newly generated tokens
+            generated_text = self.tokenizer.decode(
+                outputs[0][inputs['input_ids'].shape[1]:],
+                skip_special_tokens=True
+            ).strip()
+
+            # Clean up response
+            cleaned_text = self._clean_response(generated_text)
+            logger.info(f"[LOCAL_LLM] Generated: {self._snip(cleaned_text)}")
+            return cleaned_text
+        except Exception as e:
+            logger.error(f"[LOCAL_LLM] Generation failed: {e}")
+            return None
+
+    def _format_prompt(self, prompt: str) -> str:
+        """Format prompt for the MedAlpaca model"""
+        # MedAlpaca uses a specific format for medical Q&A
+        if "Question:" in prompt and "Answer:" in prompt:
+            return prompt
+        elif "Context:" in prompt and "Question:" in prompt:
+            return prompt
+        else:
+            # Simple medical Q&A format
+            return f"Question: {prompt}\n\nAnswer:"
+
+    def _clean_response(self, text: str) -> str:
+        """Strip boilerplate prefixes from a generated response"""
+        if not text:
+            return text
+        prefixes_to_remove = [
+            "Answer:",
+            "The answer is:",
+            "Based on the information provided:",
+            "Here's the answer:",
+            "Here is the answer:",
+        ]
+        text = text.strip()
+        for prefix in prefixes_to_remove:
+            if text.startswith(prefix):
+                text = text[len(prefix):].strip()
+                break
+        return text
+
+    def _snip(self, text: str, max_words: int = 12) -> str:
+        """Truncate text for logging"""
+        if not text:
+            return "∅"
+        words = text.strip().split()
+        return " ".join(words[:max_words]) + (" …" if len(words) > max_words else "")
+
+    def unload_model(self):
+        """Unload model to free memory"""
+        if self.model is not None:
+            del self.model
+            self.model = None
+        if self.tokenizer is not None:
+            del self.tokenizer
+            self.tokenizer = None
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        gc.collect()
+        self.is_loaded = False
+        logger.info("[LOCAL_LLM] Model unloaded and memory freed")
+
+class LocalParaphraser:
+    """Local paraphraser using the MedAlpaca model"""
+
+    def __init__(self, model_name: str = "medalpaca/medalpaca-13b", hf_token: Optional[str] = None):
+        self.client = MedAlpacaClient(model_name, hf_token)
+
+    def paraphrase(self, text: str, difficulty: str = "easy", custom_prompt: Optional[str] = None) -> str:
+        """Paraphrase text using MedAlpaca"""
+        if not text or len(text) < 12:
+            return text
+        if custom_prompt:
+            prompt = custom_prompt
+        else:
+            prompt = (
+                "Paraphrase the following medical text concisely; preserve meaning and clinical terms.\n"
+                "Do not fabricate or remove factual claims.\n"
+                "Return ONLY the rewritten text, without any introduction or commentary.\n\n"
+                f"Original text: {text}"
+            )
+        result = self.client.generate(prompt, max_tokens=min(600, max(128, len(text)//2)), temperature=0.1)
+        return result if result else text
+
+    def translate(self, text: str, target_lang: str = "vi") -> Optional[str]:
+        """Translate text using MedAlpaca"""
+        if not text:
+            return text
+        prompt = f"Translate the following medical text to {target_lang}. Keep meaning exact, preserve medical terms:\n\n{text}"
+        result = self.client.generate(prompt, max_tokens=min(800, len(text)+100), temperature=0.0)
+        return result.strip() if result else None
+
+    def backtranslate(self, text: str, via_lang: str = "vi") -> Optional[str]:
+        """Backtranslate text (round-trip through via_lang) using MedAlpaca"""
+        if not text:
+            return text
+        # First translate to the pivot language
+        translated = self.translate(text, target_lang=via_lang)
+        if not translated:
+            return None
+        # Then translate back to English
+        prompt = f"Translate the following {via_lang} text back to English, preserving the exact meaning:\n\n{translated}"
+        result = self.client.generate(prompt, max_tokens=min(900, len(text)+150), temperature=0.0)
+        return result.strip() if result else None
+
+    def consistency_check(self, user: str, output: str) -> bool:
+        """Check answer consistency using MedAlpaca"""
+        prompt = (
+            "You are a strict medical QA validator. Given the USER input (question+context) "
+            "and the MODEL ANSWER, reply with exactly 'PASS' if the answer is supported and safe, "
+            "otherwise 'FAIL'. No extra text.\n\n"
+            f"USER:\n{user}\n\nANSWER:\n{output}"
+        )
+        result = self.client.generate(prompt, max_tokens=3, temperature=0.0)
+        return isinstance(result, str) and "PASS" in result.upper()
+
+    def unload(self):
+        """Unload the model"""
+        self.client.unload_model()
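A hypothetical usage sketch of the class above (assumes `HF_TOKEN` is set and the local-mode dependencies are installed; the model loads lazily on the first call):

```python
# Hypothetical smoke test for utils/local_llm.py
import os
from utils.local_llm import LocalParaphraser

p = LocalParaphraser(hf_token=os.getenv("HF_TOKEN"))
print(p.paraphrase("Hypertension is a major risk factor for stroke.", difficulty="easy"))

round_trip = p.backtranslate("Aspirin inhibits platelet aggregation.", via_lang="vi")
print(round_trip)
print(p.consistency_check(user="What does aspirin do?", output=round_trip or ""))

p.unload()  # free GPU/CPU memory once done
```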
utils/rag.py CHANGED
@@ -6,7 +6,7 @@ import random
 from typing import Dict, List, Tuple, Optional, Callable
 
 from utils.schema import sft_row, rag_row
-from utils.llm import NvidiaClient, KeyRotator
+from utils.cloud_llm import NvidiaClient, KeyRotator
 from vi.processing import should_translate, translate_rag_row
 from utils import augment as A