Commit ccb8f07
Initial commit of model training and inference backend
Files changed:
- app.py (+91, -0)
- requirements.txt (+9, -0)
- train_multimodal.py (+115, -0)
app.py
ADDED
@@ -0,0 +1,91 @@
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image

# 1. Hugging Face Space deployment settings
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"  # Base model
# To use your fine-tuned model from Kaggle:
# 1. model.push_to_hub("your-name/med-qwen-vl-adapter")
# 2. Set ADAPTER_ID below so the PEFT adapter is loaded at startup
ADAPTER_ID = None

# Initialize model and processor globally
print("Starting App Engine...")
print(f"Loading {MODEL_ID}...")

device = "cuda" if torch.cuda.is_available() else "cpu"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)

if ADAPTER_ID:
    print(f"Loading custom fine-tuned LoRA weights: {ADAPTER_ID}")
    model.load_adapter(ADAPTER_ID)

# 2. Main API function called by our Next.js app
def diagnose_api(history: str, examination: str, image: Image.Image = None, audio_path: str = None):
    try:
        if image is None:
            # Fail fast if no image is passed
            return "Error: Qwen-VL requires at least one image/diagnostic input to function accurately."

        # Reconstruct the structured prompt our diagnostic copilot uses
        system_prompt = "You are a highly advanced Multi-Modal Diagnostic Co-Pilot Medical AI. Provide ## Integrated Analysis, ## Decision Making, and ## Management & Treatment Plan."
        user_prompt = f"History: {history}\nExamination: {examination}\nAnalyze the provided scan and history."

        messages = [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": user_prompt}
                ]
            }
        ]

        text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        inputs = processor(
            text=[text_input],
            images=[image],
            padding=True,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            # do_sample=True so the temperature setting actually takes effect
            generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=True, temperature=0.2)

        # Strip the prompt tokens so only the newly generated answer is decoded
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

        return output_text

    except Exception as e:
        return f"Model Error: {str(e)}"

# 3. Create the Gradio interface
# This acts as the visual UI for the HF Space and, more importantly,
# exposes a programmatic API endpoint (e.g. via the gradio_client library)
# that our React app can connect to securely.
demo = gr.Interface(
    fn=diagnose_api,
    inputs=[
        gr.Textbox(lines=5, label="Patient History (String)", placeholder="Age, symptoms, past medical history..."),
        gr.Textbox(lines=5, label="Examination Findings (String)", placeholder="Vitals, systemic exam..."),
        gr.Image(type="pil", label="Diagnostic Scan / Image"),
        gr.Audio(type="filepath", label="Optional Dictation Audio", visible=False)  # Qwen-VL does not natively support audio; handled externally or ignored
    ],
    outputs=gr.Markdown(label="Clinical Report Output"),
    title="Multi-Modal Diagnostic Co-Pilot API (Trained via Kaggle)",
    description="This Space hosts the fine-tuned medical vision-language model for the Diagnostic Co-Pilot ecosystem."
)

if __name__ == "__main__":
    demo.launch(share=False)  # HF Spaces launches the app automatically; share=True is not needed
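For reference, a minimal client-side sketch of how the Next.js backend could call this endpoint using the gradio_client library. The Space id your-name/med-copilot and the file path are placeholders; gr.Interface exposes its function under api_name="/predict" by default.

from gradio_client import Client, handle_file

# Hypothetical Space id; replace with wherever this app is actually deployed
client = Client("your-name/med-copilot")

report = client.predict(
    "58M, two weeks of productive cough and fever",   # history
    "Temp 38.4 C, crackles in the right lower zone",  # examination
    handle_file("chest_xray.png"),                    # diagnostic image
    None,                                             # audio input (hidden/unused)
    api_name="/predict",
)
print(report)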
requirements.txt
ADDED
@@ -0,0 +1,9 @@
torch>=2.0
transformers>=4.40.0
accelerate
peft
bitsandbytes
trl
datasets
gradio>=4.0.0
Pillow
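Note that bitsandbytes appears in the requirements but the scripts below load the model in plain fp16. A minimal sketch of the 4-bit (QLoRA-style) loading it enables, assuming a CUDA GPU is available:

import torch
from transformers import AutoModelForVision2Seq, BitsAndBytesConfig

# NF4 4-bit quantization with fp16 compute: cuts weight memory roughly 4x
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForVision2Seq.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",
)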
train_multimodal.py
ADDED
@@ -0,0 +1,115 @@
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from trl import SFTTrainer

# 1. Configuration for Kaggle/Hugging Face fine-tuning
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"  # Small, highly capable multimodal model well suited to medical VQA
DATASET_ID = "flaviagiammarino/vqa-rad"  # Example medical VQA dataset (radiology)
OUTPUT_DIR = "./med-qwen-vl-adapter"

def main():
    print(f"Loading processor and model: {MODEL_ID}")

    # Load processor and model in half precision (for true 4-bit quantization,
    # pass a BitsAndBytesConfig, as sketched after requirements.txt above)
    processor = AutoProcessor.from_pretrained(MODEL_ID)

    model = AutoModelForVision2Seq.from_pretrained(
        MODEL_ID,
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )

    # Apply LoRA (Low-Rank Adaptation)
    print("Applying LoRA parameters...")
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Attention projections
        bias="none",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Load and format the dataset
    print(f"Loading dataset: {DATASET_ID}")
    dataset = load_dataset(DATASET_ID, split="train[:50%]")  # Use a subset for demonstration

    def format_data(example):
        # Format inputs as the Qwen2-VL chat template expects: a user turn with
        # an image placeholder plus the question, then an assistant turn with the answer
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": example["question"]}
                ]
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": example["answer"]}
                ]
            }
        ]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
        return {"text": text, "image": example["image"]}

    formatted_dataset = dataset.map(format_data, remove_columns=dataset.column_names)

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        logging_steps=10,
        max_steps=100,  # Kept low for a quick Kaggle demonstration
        save_strategy="steps",
        save_steps=50,
        fp16=True,
        optim="paged_adamw_8bit",
        remove_unused_columns=False,
        report_to="none"  # Disable wandb for seamless Kaggle runs
    )

    # Custom data collator for vision-language models
    def collate_fn(examples):
        texts = [ex["text"] for ex in examples]
        images = [ex["image"] for ex in examples]

        batch = processor(
            text=texts,
            images=images,
            padding=True,
            return_tensors="pt"
        )
        # Labels mirror input_ids for causal LM training, with padding masked
        # out (-100) so it does not contribute to the loss
        labels = batch["input_ids"].clone()
        labels[labels == processor.tokenizer.pad_token_id] = -100
        batch["labels"] = labels
        return batch

    # Train using TRL's SFT Trainer
    print("Starting fine-tuning...")
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=formatted_dataset,
        data_collator=collate_fn,
        dataset_text_field="text"  # Accepted here by older TRL releases (newer ones take it via SFTConfig); the custom collator does the actual tokenization
    )

    trainer.train()

    # Save the adapter
    print(f"Saving fine-tuned adapter to {OUTPUT_DIR}")
    trainer.save_model(OUTPUT_DIR)
    processor.save_pretrained(OUTPUT_DIR)

    print("Done! You can now merge this adapter or upload it directly to the Hugging Face Hub (e.g. via model.push_to_hub()).")

if __name__ == "__main__":
    main()
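The closing message mentions merging the adapter or uploading it to the Hub. A short sketch of both steps with PEFT, where the Hub repo id your-name/med-qwen-vl-merged is a placeholder and a write-scoped HF token is assumed:

import torch
from peft import PeftModel
from transformers import AutoModelForVision2Seq, AutoProcessor

BASE_ID = "Qwen/Qwen2-VL-2B-Instruct"
ADAPTER_DIR = "./med-qwen-vl-adapter"  # OUTPUT_DIR from the training script

# Re-attach the LoRA adapter to the base model, then fold the deltas into the weights
base = AutoModelForVision2Seq.from_pretrained(BASE_ID, torch_dtype=torch.float16, device_map="auto")
merged = PeftModel.from_pretrained(base, ADAPTER_DIR).merge_and_unload()

# Placeholder repo id; pushing the processor alongside keeps the repo self-contained
merged.push_to_hub("your-name/med-qwen-vl-merged")
AutoProcessor.from_pretrained(ADAPTER_DIR).push_to_hub("your-name/med-qwen-vl-merged")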