arpitingle committed on
Commit
9aba698
·
1 Parent(s): bd3da16

v2: retrained on full dataset (6265 samples, 2 epochs, loss 4.7)

Browse files

- Retrained LoRA on complete Sanskrit_OCR_Parallel_Corpus (previously only 55%)
- Added train_v2.py: simplified training script without Unsloth dependency
- Fixed inference.py: use absolute model paths
- Updated run.py: added --local_dataset flag, fixed model_dir references

adapter_config.json CHANGED
@@ -4,10 +4,9 @@
4
  "arrow_config": null,
5
  "auto_mapping": {
6
  "base_model_class": "DeepseekOCRForCausalLM",
7
- "parent_library": "transformers_modules.deepseek_ocr.modeling_deepseekocr",
8
- "unsloth_fixed": true
9
  },
10
- "base_model_name_or_path": "deepseek_ocr",
11
  "bias": "none",
12
  "corda_config": null,
13
  "ensure_weight_tying": false,
@@ -33,16 +32,16 @@
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
 
 
 
36
  "v_proj",
37
  "gate_proj",
38
- "q_proj",
39
  "k_proj",
40
- "o_proj",
41
- "down_proj",
42
- "up_proj"
43
  ],
44
  "target_parameters": null,
45
- "task_type": "CAUSAL_LM",
46
  "trainable_token_indices": null,
47
  "use_dora": false,
48
  "use_qalora": false,
 
4
  "arrow_config": null,
5
  "auto_mapping": {
6
  "base_model_class": "DeepseekOCRForCausalLM",
7
+ "parent_library": "transformers_modules.deepseek_ocr.modeling_deepseekocr"
 
8
  },
9
+ "base_model_name_or_path": "/home/ubuntu/deepseek_ocr",
10
  "bias": "none",
11
  "corda_config": null,
12
  "ensure_weight_tying": false,
 
32
  "rank_pattern": {},
33
  "revision": null,
34
  "target_modules": [
35
+ "up_proj",
36
+ "q_proj",
37
+ "o_proj",
38
  "v_proj",
39
  "gate_proj",
 
40
  "k_proj",
41
+ "down_proj"
 
 
42
  ],
43
  "target_parameters": null,
44
+ "task_type": null,
45
  "trainable_token_indices": null,
46
  "use_dora": false,
47
  "use_qalora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:997072bf68d91539c958713abe5e5a3b1baf2cbd5af3749919256d2b0c34bbcf
3
  size 310662536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f95dfcbb52e9a0e95dfdc7754457c66b117e2f486ec214554b102aea6b78b9c
3
  size 310662536
inference.py CHANGED
@@ -58,11 +58,11 @@ def load_model_with_lora(base_model_path="deepseek_ocr", lora_path="./lora_model
58
  return model
59
 
60
 
61
- def run_inference(model, image_path, prompt="<image>\nFree OCR. "):
62
  print(f"Running inference on: {image_path}")
63
 
64
  processor = AutoProcessor.from_pretrained(
65
- "deepseek_ocr",
66
  trust_remote_code=True,
67
  )
68
 
@@ -99,7 +99,7 @@ if __name__ == "__main__":
99
 
100
  model = load_model_with_lora(args.base_model, args.lora)
101
 
102
- raw = run_inference(model, args.image)
103
 
104
  cleaned = clean_text(raw)
105
 
 
58
  return model
59
 
60
 
61
+ def run_inference(model, image_path, base_model_path="deepseek_ocr", prompt="<image>\nFree OCR. "):
62
  print(f"Running inference on: {image_path}")
63
 
64
  processor = AutoProcessor.from_pretrained(
65
+ base_model_path,
66
  trust_remote_code=True,
67
  )
68
 
 
99
 
100
  model = load_model_with_lora(args.base_model, args.lora)
101
 
102
+ raw = run_inference(model, args.image, args.base_model)
103
 
104
  cleaned = clean_text(raw)
105
 
run.py CHANGED
@@ -61,6 +61,7 @@ def load_model(model_path="deepseek_ocr", load_in_4bit=False):
61
  trust_remote_code=True,
62
  unsloth_force_compile=True,
63
  use_gradient_checkpointing="unsloth",
 
64
  )
65
 
66
  print("Model and tokenizer loaded successfully!")
@@ -88,36 +89,42 @@ def setup_lora(model):
88
  return model
89
 
90
  def load_and_prepare_dataset(dataset_name="snskrt/Sanskrit_OCR_Parallel_Corpus",
91
- train_size=0.8, val_size=0.1, max_samples=None, token=None):
 
92
  """
93
  Load and prepare the Sanskrit OCR dataset.
94
- This function downloads the entire repo, reads 'LABELS/labels.csv',
95
- and pairs images from the 'IMAGES/' folder.
96
  """
97
  import time
98
  print(f"Loading dataset: {dataset_name}")
99
 
100
  try:
101
- # 1. Download the entire dataset as a snapshot first with retry logic
102
- print("Downloading dataset snapshot (this may take a while)...")
103
- max_retries = 5
104
- for attempt in range(max_retries):
105
- try:
106
- dataset_path = snapshot_download(
107
- repo_id=dataset_name,
108
- repo_type="dataset",
109
- token=token,
110
- max_workers=1
111
- )
112
- break
113
- except Exception as e:
114
- if "429" in str(e) and attempt < max_retries - 1:
115
- wait_time = 60 * (attempt + 1)
116
- print(f"Rate limited. Waiting {wait_time} seconds before retry {attempt + 2}/{max_retries}...")
117
- time.sleep(wait_time)
118
- else:
119
- raise
120
- print(f"Dataset downloaded to: {dataset_path}")
 
 
 
 
 
121
 
122
  # 2. Read the labels.csv file from the LABELS directory
123
  labels_csv_path = os.path.join(dataset_path, "LABELS", "labels.csv")
@@ -294,7 +301,8 @@ def train_model(model, tokenizer, train_data, val_data,
294
  per_device_train_batch_size=2,
295
  gradient_accumulation_steps=4,
296
  learning_rate=2e-4,
297
- max_steps=None):
 
298
  """Train the model"""
299
  print("Starting training...")
300
 
@@ -337,7 +345,7 @@ def train_model(model, tokenizer, train_data, val_data,
337
  )
338
 
339
  # Load tokenizer for the data collator
340
- tokenizer_for_collator = AutoProcessor.from_pretrained("deepseek_ocr", trust_remote_code=True)
341
 
342
  # Import preprocessing functions from the cached model
343
  import sys
@@ -657,6 +665,8 @@ def main():
657
  help="HuggingFace token for authenticated access")
658
  parser.add_argument("--inspect_only", action="store_true",
659
  help="Only inspect dataset structure without training")
 
 
660
 
661
  args = parser.parse_args()
662
 
@@ -683,7 +693,8 @@ def main():
683
  train_size=args.train_size,
684
  val_size=args.val_size,
685
  max_samples=args.max_samples,
686
- token=hf_token
 
687
  )
688
 
689
  # If inspect only, exit here
@@ -714,7 +725,8 @@ def main():
714
  per_device_train_batch_size=args.batch_size,
715
  gradient_accumulation_steps=args.gradient_accumulation,
716
  learning_rate=args.learning_rate,
717
- max_steps=args.max_steps
 
718
  )
719
 
720
  # Step 7: Save model
 
61
  trust_remote_code=True,
62
  unsloth_force_compile=True,
63
  use_gradient_checkpointing="unsloth",
64
+ attn_implementation="eager",
65
  )
66
 
67
  print("Model and tokenizer loaded successfully!")
 
89
  return model
90
 
91
  def load_and_prepare_dataset(dataset_name="snskrt/Sanskrit_OCR_Parallel_Corpus",
92
+ train_size=0.8, val_size=0.1, max_samples=None, token=None,
93
+ local_path=None):
94
  """
95
  Load and prepare the Sanskrit OCR dataset.
96
+ This function reads 'LABELS/labels.csv' and pairs images from the 'IMAGES/' folder.
97
+ If local_path is provided, uses local dataset instead of downloading.
98
  """
99
  import time
100
  print(f"Loading dataset: {dataset_name}")
101
 
102
  try:
103
+ if local_path and os.path.exists(local_path):
104
+ # Use local dataset
105
+ dataset_path = local_path
106
+ print(f"Using local dataset from: {dataset_path}")
107
+ else:
108
+ # Download the entire dataset as a snapshot first with retry logic
109
+ print("Downloading dataset snapshot (this may take a while)...")
110
+ max_retries = 5
111
+ for attempt in range(max_retries):
112
+ try:
113
+ dataset_path = snapshot_download(
114
+ repo_id=dataset_name,
115
+ repo_type="dataset",
116
+ token=token,
117
+ max_workers=1
118
+ )
119
+ break
120
+ except Exception as e:
121
+ if "429" in str(e) and attempt < max_retries - 1:
122
+ wait_time = 60 * (attempt + 1)
123
+ print(f"Rate limited. Waiting {wait_time} seconds before retry {attempt + 2}/{max_retries}...")
124
+ time.sleep(wait_time)
125
+ else:
126
+ raise
127
+ print(f"Dataset downloaded to: {dataset_path}")
128
 
129
  # 2. Read the labels.csv file from the LABELS directory
130
  labels_csv_path = os.path.join(dataset_path, "LABELS", "labels.csv")
 
301
  per_device_train_batch_size=2,
302
  gradient_accumulation_steps=4,
303
  learning_rate=2e-4,
304
+ max_steps=None,
305
+ model_dir="deepseek_ocr"):
306
  """Train the model"""
307
  print("Starting training...")
308
 
 
345
  )
346
 
347
  # Load tokenizer for the data collator
348
+ tokenizer_for_collator = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True)
349
 
350
  # Import preprocessing functions from the cached model
351
  import sys
 
665
  help="HuggingFace token for authenticated access")
666
  parser.add_argument("--inspect_only", action="store_true",
667
  help="Only inspect dataset structure without training")
668
+ parser.add_argument("--local_dataset", type=str, default=None,
669
+ help="Path to local dataset directory (avoids re-downloading)")
670
 
671
  args = parser.parse_args()
672
 
 
693
  train_size=args.train_size,
694
  val_size=args.val_size,
695
  max_samples=args.max_samples,
696
+ token=hf_token,
697
+ local_path=args.local_dataset
698
  )
699
 
700
  # If inspect only, exit here
 
725
  per_device_train_batch_size=args.batch_size,
726
  gradient_accumulation_steps=args.gradient_accumulation,
727
  learning_rate=args.learning_rate,
728
+ max_steps=args.max_steps,
729
+ model_dir=model_path
730
  )
731
 
732
  # Step 7: Save model
tokenizer_config.json CHANGED
@@ -6655,7 +6655,7 @@
6655
  "legacy": true,
6656
  "model_max_length": 1000000000000000019884624838656,
6657
  "pad_token": "<|▁pad▁|>",
6658
- "tokenizer_class": "LlamaTokenizerFast",
6659
  "unk_token": null,
6660
  "use_default_system_prompt": false
6661
  }
 
6655
  "legacy": true,
6656
  "model_max_length": 1000000000000000019884624838656,
6657
  "pad_token": "<|▁pad▁|>",
6658
+ "tokenizer_class": "LlamaTokenizer",
6659
  "unk_token": null,
6660
  "use_default_system_prompt": false
6661
  }
train_v2.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DeepSeek OCR Fine-tuning for Sanskrit - Simplified Version
3
+ Works with transformers 4.45.0, peft, accelerate
4
+ """
5
+
6
+ import os
7
+ import csv
8
+ import torch
9
+ import torchvision.transforms as T
10
+ from glob import glob
11
+ from pathlib import Path
12
+ from PIL import Image, ImageOps
13
+ from io import BytesIO
14
+ from dataclasses import dataclass
15
+ from typing import Any, Dict, List
16
+
17
+ from peft import LoraConfig, get_peft_model
18
+ from transformers import AutoModel, AutoProcessor, Trainer, TrainingArguments
19
+ from datasets import Dataset, DatasetDict
20
+ import argparse
21
+
22
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
23
+
24
+
25
def load_dataset_local(dataset_path, train_size=0.8, val_size=0.1, max_samples=None):
    """Load the Sanskrit OCR dataset from a local directory.

    Expects the Hugging Face snapshot layout:
        <dataset_path>/LABELS/labels.csv   (rows of: image_filename, label_text)
        <dataset_path>/IMAGES/*.jpg

    Args:
        dataset_path: Root directory of the dataset.
        train_size: Fraction of paired samples used for the train split.
        val_size: Fraction used for the validation split (the remainder
            of the holdout becomes the test split).
        max_samples: Optional cap on the number of paired samples.

    Returns:
        DatasetDict with 'train', 'validation' and 'test' splits.
    """
    print(f"Loading dataset from: {dataset_path}")

    labels_csv = os.path.join(dataset_path, "LABELS", "labels.csv")
    labels_dict = {}

    with open(labels_csv, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        # Skip the header row; next(reader, None) tolerates an empty file
        # (bare next(reader) raised StopIteration on an empty CSV).
        next(reader, None)
        for row in reader:
            # Guard against blank or malformed rows: the original checked
            # only `if row:`, so a single-column row raised IndexError.
            if len(row) >= 2:
                labels_dict[row[0]] = row[1]

    print(f"Loaded {len(labels_dict)} labels")

    image_paths = sorted(glob(os.path.join(dataset_path, "IMAGES", "*.jpg")))
    print(f"Found {len(image_paths)} images")

    data = []
    for img_path in image_paths:
        img_name = Path(img_path).name
        if img_name in labels_dict:
            text = labels_dict[img_name].strip()
            if text:  # drop samples whose label is empty/whitespace-only
                data.append({"image_path": img_path, "text": text})

    print(f"Paired {len(data)} samples")

    if max_samples and max_samples < len(data):
        data = data[:max_samples]

    dataset = Dataset.from_list(data)

    # Carve off the train split first, then split the holdout into
    # validation and test. val_test_ratio is the share of the holdout
    # that becomes validation data (0.5 with the defaults: 10%/10%).
    train_test = dataset.train_test_split(test_size=(1 - train_size), seed=42)
    val_test_ratio = val_size / (1 - train_size)
    val_test = train_test['test'].train_test_split(test_size=(1 - val_test_ratio), seed=42)

    return DatasetDict({
        'train': train_test['train'],
        'validation': val_test['train'],
        'test': val_test['test']
    })
69
+
70
+
71
class ImageTransform:
    """Convert a PIL image into a normalized float tensor.

    Applies ToTensor followed by per-channel normalization with the
    given mean/std; the (0.5, 0.5, 0.5) defaults map pixel values into
    roughly the [-1, 1] range.
    """

    def __init__(self, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)):
        self.mean = mean
        self.std = std
        steps = [
            T.ToTensor(),
            T.Normalize(mean=mean, std=std),
        ]
        self.transform = T.Compose(steps)

    def __call__(self, image):
        tensor = self.transform(image)
        return tensor.float()
83
+
84
+
85
@dataclass
class DeepSeekOCRDataCollator:
    """Custom data collator for DeepSeek-OCR LoRA training.

    Builds one padded batch from raw (image_path, text) samples: loads
    each image, pads it to a square "global view", tokenizes the
    prompt+target text, and inserts the placeholder image tokens that
    the vision encoder output is spliced into by the model.
    """
    tokenizer: Any              # processor/tokenizer exposing .encode()
    image_size: int = 640       # side length of the (empty) local-crop patches
    base_size: int = 1024       # side length of the padded global view
    prompt: str = "<image>\nFree OCR. "

    def __post_init__(self):
        self.image_transform = ImageTransform()
        # NOTE(review): 128815 is assumed to be the model's <image>
        # placeholder token id — confirm against the tokenizer config.
        self.image_token_id = 128815
        self.patch_size = 16
        self.downsample_ratio = 4

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        from torch.nn.utils.rnn import pad_sequence
        import math

        batch_input_ids = []
        batch_labels = []
        batch_images = []
        batch_images_seq_mask = []
        batch_images_spatial_crop = []

        for feature in features:
            image_path = feature["image_path"]
            text = feature["text"]

            # Load and process image
            image = Image.open(image_path).convert("RGB")

            # Pad to a square global view on a neutral gray background.
            global_view = ImageOps.pad(
                image,
                (self.base_size, self.base_size),
                color=(128, 128, 128)
            )
            image_tensor = self.image_transform(global_view)

            # Create empty patches tensor (no local crops for simplicity)
            empty_patches = torch.zeros(1, 3, self.image_size, self.image_size)

            # Build prompt
            full_text = f"<|User|>{self.prompt}<|Assistant|>{text}"

            # Tokenize
            tokens = self.tokenizer.encode(full_text, add_special_tokens=False)

            # Vision-query positions per side after downsampling, plus one
            # extra token per row and a trailing separator (273 total for
            # base_size=1024, patch_size=16, downsample_ratio=4).
            num_queries = math.ceil((self.base_size // self.patch_size) / self.downsample_ratio)
            num_image_tokens = (num_queries + 1) * num_queries + 1

            # Build input_ids with image tokens
            input_ids = [0]  # BOS (assumed id 0 — confirm with tokenizer)
            images_seq_mask = [False]

            # Add image tokens
            input_ids.extend([self.image_token_id] * num_image_tokens)
            images_seq_mask.extend([True] * num_image_tokens)

            # Add text tokens
            input_ids.extend(tokens)
            images_seq_mask.extend([False] * len(tokens))

            # Add EOS
            input_ids.append(1)
            images_seq_mask.append(False)

            batch_input_ids.append(torch.tensor(input_ids, dtype=torch.long))
            # NOTE(review): labels currently include the prompt and image
            # placeholder tokens, so loss is computed over them as well.
            # Masking them to -100 would train on the answer only —
            # confirm which behavior is intended.
            batch_labels.append(torch.tensor(input_ids, dtype=torch.long))
            # Model expects (patches, original) tuple
            batch_images.append((empty_patches, image_tensor.unsqueeze(0)))
            batch_images_seq_mask.append(torch.tensor(images_seq_mask, dtype=torch.bool))
            # Spatial crop shape: (height_crops, width_crops)
            batch_images_spatial_crop.append(torch.tensor([1, 1], dtype=torch.long))

        # Pad sequences
        lengths = [ids.size(0) for ids in batch_input_ids]
        input_ids = pad_sequence(batch_input_ids, batch_first=True, padding_value=0)
        labels = pad_sequence(batch_labels, batch_first=True, padding_value=-100)
        # BUGFIX: the original computed attention_mask as (input_ids != 0),
        # which also masked the BOS token (id 0) at position 0 of every
        # sample, since BOS shares its id with the padding value. Build the
        # mask from the true, pre-padding sequence lengths instead.
        attention_mask = torch.zeros_like(input_ids)
        for i, seq_len in enumerate(lengths):
            attention_mask[i, :seq_len] = 1
        images_seq_mask = pad_sequence(batch_images_seq_mask, batch_first=True, padding_value=False)
        images_spatial_crop = torch.stack(batch_images_spatial_crop)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
            "images": batch_images,
            "images_seq_mask": images_seq_mask,
            "images_spatial_crop": images_spatial_crop,
        }
176
+
177
+
178
def main():
    """CLI entry point: LoRA fine-tuning of DeepSeek-OCR on a local Sanskrit OCR dataset."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", type=str, default="deepseek_ocr")
    parser.add_argument("--dataset_path", type=str, required=True)
    parser.add_argument("--output_dir", type=str, default="./results")
    parser.add_argument("--lora_output", type=str, default="./lora_model_v2")
    parser.add_argument("--epochs", type=int, default=2)
    parser.add_argument("--batch_size", type=int, default=2)
    parser.add_argument("--gradient_accumulation", type=int, default=4)
    parser.add_argument("--learning_rate", type=float, default=2e-4)
    parser.add_argument("--max_samples", type=int, default=None)
    args = parser.parse_args()

    # Dataset: local LABELS/IMAGES layout, optionally capped.
    dataset = load_dataset_local(args.dataset_path, max_samples=args.max_samples)
    print(f"Train: {len(dataset['train'])}, Val: {len(dataset['validation'])}")

    # Base model + processor (custom code in the model repo).
    print("Loading model...")
    model = AutoModel.from_pretrained(
        args.model_dir,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(args.model_dir, trust_remote_code=True)

    # Attach LoRA adapters to all attention and MLP projections.
    print("Setting up LoRA...")
    adapter_targets = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
    peft_config = LoraConfig(
        r=16,
        lora_alpha=16,
        target_modules=adapter_targets,
        lora_dropout=0,
        bias="none",
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    model.train()

    # Freeze everything except the LoRA adapter weights.
    for param_name, param in model.named_parameters():
        param.requires_grad = 'lora' in param_name.lower()

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        per_device_train_batch_size=args.batch_size,
        gradient_accumulation_steps=args.gradient_accumulation,
        num_train_epochs=args.epochs,
        learning_rate=args.learning_rate,
        bf16=True,
        logging_steps=10,
        save_strategy="epoch",
        eval_strategy="epoch",
        warmup_steps=50,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        remove_unused_columns=False,
        dataloader_num_workers=0,  # Avoid multiprocessing issues
        gradient_checkpointing=False,  # Disable - causes issues with this model
    )

    # The processor doubles as the tokenizer for DeepSeek-OCR.
    collator = DeepSeekOCRDataCollator(processor)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['validation'],
        data_collator=collator,
    )

    print("Starting training...")
    trainer.train()

    print(f"Saving to {args.lora_output}...")
    model.save_pretrained(args.lora_output)
    processor.save_pretrained(args.lora_output)

    print("Done!")
276
+
277
+
278
# Script entry point: only run training when executed directly, not on import.
if __name__ == "__main__":
    main()