Khrawsynth committed on
Commit
19343bf
·
verified ·
1 Parent(s): b49cb90

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +140 -12
README.md CHANGED
@@ -1,3 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # AssameseOCR
2
 
3
  **AssameseOCR** is a vision-language model for Optical Character Recognition (OCR) of printed Assamese text. Built on Microsoft's Florence-2-large foundation model with a custom character-level decoder, it achieves 94.67% character accuracy on the Mozhi dataset.
@@ -98,11 +143,13 @@ pip install torch torchvision transformers pillow
98
 
99
  ```python
100
  import torch
 
101
  from PIL import Image
102
  from transformers import AutoModelForCausalLM, CLIPImageProcessor
 
103
  import json
104
 
105
- # Load tokenizer
106
  class CharTokenizer:
107
  def __init__(self, vocab):
108
  self.vocab = vocab
@@ -112,6 +159,18 @@ class CharTokenizer:
112
  self.bos_token_id = self.char2id["<s>"]
113
  self.eos_token_id = self.char2id["</s>"]
114
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  def decode(self, ids, skip_special_tokens=True):
116
  chars = []
117
  for i in ids:
@@ -127,9 +186,46 @@ class CharTokenizer:
127
  vocab = json.load(f)
128
  return cls(vocab)
129
 
130
- # Load model components
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  device = "cuda" if torch.cuda.is_available() else "cpu"
132
 
 
 
 
 
 
 
 
133
  # Load Florence base model
134
  florence_model = AutoModelForCausalLM.from_pretrained(
135
  "microsoft/Florence-2-large-ft",
@@ -139,17 +235,50 @@ florence_model = AutoModelForCausalLM.from_pretrained(
139
  # Load image processor
140
  image_processor = CLIPImageProcessor.from_pretrained("microsoft/Florence-2-large-ft")
141
 
142
- # Load tokenizer
143
- char_tokenizer = CharTokenizer.load("assamese_char_tokenizer.json")
 
 
 
 
 
 
144
 
145
- # Load AssameseOCR weights
146
- # (Note: You'll need to define the FlorenceCharOCR class as in training)
147
- checkpoint = torch.load("assamese_ocr_best.pt", map_location=device)
148
- # ocr_model.load_state_dict(checkpoint['model_state_dict'])
149
 
150
- # Inference
151
- image = Image.open("assamese_text.jpg")
152
- # Process and predict...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  ```
154
 
155
  ## Vocabulary
@@ -211,5 +340,4 @@ If you use AssameseOCR in your research, please cite:
211
  - [KhasiBERT](https://huggingface.co/MWirelabs/KhasiBERT-110M) - Khasi language model
212
  - [NE-BERT](https://huggingface.co/MWirelabs/NE-BERT) - 9 Northeast languages
213
  - [Kren-M](https://huggingface.co/MWirelabs/Kren-M) - Khasi-English conversational AI
214
-
215
  - **AssameseOCR** - Assamese text recognition
 
1
+ ---
2
+ language:
3
+ - asm # Assamese ISO 639-3 code
4
+ license: apache-2.0
5
+ base_model: microsoft/Florence-2-large-ft
6
+ tags:
7
+ - vision
8
+ - ocr
9
+ - assamese
10
+ - northeast-india
11
+ - indic-languages
12
+ - character-recognition
13
+ - florence-2
14
+ - vision-language
15
+ datasets:
16
+ - darknight054/indic-mozhi-ocr
17
+ metrics:
18
+ - accuracy
19
+ - character_error_rate
20
+ library_name: transformers
21
+ pipeline_tag: image-to-text
22
+
23
+ model-index:
24
+ - name: AssameseOCR
25
+ results:
26
+ - task:
27
+ type: image-to-text
28
+ name: Optical Character Recognition
29
+ dataset:
30
+ name: Mozhi Indic OCR (Assamese)
31
+ type: darknight054/indic-mozhi-ocr
32
+ config: assamese
33
+ split: test
34
+ metrics:
35
+ - type: accuracy
36
+ value: 94.67
37
+ name: Character Accuracy
38
+ verified: false
39
+ - type: character_error_rate
40
+ value: 5.33
41
+ name: Character Error Rate (CER)
42
+ verified: false
43
+
44
+ ---
45
+
46
  # AssameseOCR
47
 
48
  **AssameseOCR** is a vision-language model for Optical Character Recognition (OCR) of printed Assamese text. Built on Microsoft's Florence-2-large foundation model with a custom character-level decoder, it achieves 94.67% character accuracy on the Mozhi dataset.
 
143
 
144
  ```python
145
  import torch
146
+ import torch.nn as nn
147
  from PIL import Image
148
  from transformers import AutoModelForCausalLM, CLIPImageProcessor
149
+ from huggingface_hub import hf_hub_download
150
  import json
151
 
152
+ # CharTokenizer class
153
  class CharTokenizer:
154
  def __init__(self, vocab):
155
  self.vocab = vocab
 
159
  self.bos_token_id = self.char2id["<s>"]
160
  self.eos_token_id = self.char2id["</s>"]
161
 
162
def encode(self, text, max_length=None, add_special_tokens=True):
    """Convert *text* to a list of character token ids.

    Args:
        text: Input string; each character becomes one token.
        max_length: If given, truncate or pad the result to exactly
            this many ids.
        add_special_tokens: Prepend ``<s>`` and append ``</s>`` when True.

    Returns:
        list[int]: Token ids, padded with ``<pad>`` up to ``max_length``
        when shorter, truncated when longer.
    """
    ids = [self.bos_token_id] if add_special_tokens else []
    unk_id = self.char2id["<unk>"]  # hoisted: looked up once, not per char
    for ch in text:
        ids.append(self.char2id.get(ch, unk_id))
    if add_special_tokens:
        ids.append(self.eos_token_id)
    if max_length:
        if len(ids) > max_length:
            ids = ids[:max_length]
            # BUGFIX: plain truncation silently dropped </s>, so a
            # truncated sequence carried no end-of-sequence marker and
            # decoding had no stop signal. Keep EOS as the last token.
            if add_special_tokens:
                ids[-1] = self.eos_token_id
        elif len(ids) < max_length:
            ids += [self.pad_token_id] * (max_length - len(ids))
    return ids
173
+
174
  def decode(self, ids, skip_special_tokens=True):
175
  chars = []
176
  for i in ids:
 
186
  vocab = json.load(f)
187
  return cls(vocab)
188
 
189
# Character-level OCR head on top of a frozen Florence-2 vision encoder.
class FlorenceCharOCR(nn.Module):
    """Transformer decoder that emits character-id logits from Florence-2
    image features.

    The Florence-2 backbone is frozen; only the vision projection, the
    character embedding, the transformer decoder and the output layer
    receive gradients.
    """

    def __init__(self, florence_model, vocab_size, vision_hidden_dim, decoder_hidden_dim=512, num_layers=4):
        super().__init__()
        self.florence_model = florence_model

        # Freeze every backbone weight — only the OCR head below trains.
        for backbone_param in self.florence_model.parameters():
            backbone_param.requires_grad = False

        # Project vision features into the decoder's hidden size.
        self.vision_proj = nn.Linear(vision_hidden_dim, decoder_hidden_dim)
        self.embedding = nn.Embedding(vocab_size, decoder_hidden_dim)
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(
                d_model=decoder_hidden_dim,
                nhead=8,
                batch_first=True,
            ),
            num_layers=num_layers,
        )
        self.fc_out = nn.Linear(decoder_hidden_dim, vocab_size)

    def forward(self, pixel_values, tgt_ids, tgt_mask=None):
        """Return per-position character logits.

        Args:
            pixel_values: Preprocessed image batch for the Florence encoder.
            tgt_ids: (batch, tgt_len) character ids decoded so far.
            tgt_mask: Optional attention mask for the target sequence.

        Returns:
            Tensor of shape (batch, tgt_len, vocab_size).
        """
        # NOTE(review): _encode_image is a private Florence-2 API — confirm
        # it is still exposed by the pinned transformers version.
        with torch.no_grad():
            memory = self.florence_model._encode_image(pixel_values)

        memory = self.vision_proj(memory)
        hidden = self.decoder(self.embedding(tgt_ids), memory, tgt_mask=tgt_mask)
        return self.fc_out(hidden)
+
219
+ # Load components
220
  device = "cuda" if torch.cuda.is_available() else "cpu"
221
 
222
+ # Download files from HuggingFace
223
+ tokenizer_path = hf_hub_download(repo_id="MWirelabs/assamese-ocr", filename="assamese_char_tokenizer.json")
224
+ model_path = hf_hub_download(repo_id="MWirelabs/assamese-ocr", filename="assamese_ocr_best.pt")
225
+
226
+ # Load tokenizer
227
+ char_tokenizer = CharTokenizer.load(tokenizer_path)
228
+
229
  # Load Florence base model
230
  florence_model = AutoModelForCausalLM.from_pretrained(
231
  "microsoft/Florence-2-large-ft",
 
235
  # Load image processor
236
  image_processor = CLIPImageProcessor.from_pretrained("microsoft/Florence-2-large-ft")
237
 
238
+ # Initialize OCR model
239
+ ocr_model = FlorenceCharOCR(
240
+ florence_model=florence_model,
241
+ vocab_size=len(char_tokenizer.vocab),
242
+ vision_hidden_dim=1024,
243
+ decoder_hidden_dim=512,
244
+ num_layers=4
245
+ ).to(device)
246
 
247
+ # Load trained weights
248
+ checkpoint = torch.load(model_path, map_location=device)
249
+ ocr_model.load_state_dict(checkpoint['model_state_dict'])
250
+ ocr_model.eval()
251
 
252
# Greedy character-by-character decoding for a single image file.
def recognize_text(image_path):
    """Run OCR on the image at *image_path* and return the recognized string.

    Uses the module-level ``image_processor``, ``ocr_model`` and
    ``char_tokenizer`` loaded above; decoding is greedy (argmax) with a
    hard cap of 128 generated tokens.
    """
    img = Image.open(image_path).convert("RGB")
    pixel_values = image_processor(images=[img], return_tensors="pt")['pixel_values'].to(device)

    with torch.no_grad():
        # Seed the sequence with the BOS token and extend one id at a time.
        ids = [char_tokenizer.bos_token_id]

        for _ in range(128):  # upper bound on output length
            step_input = torch.tensor([ids], device=device)
            logits = ocr_model(pixel_values, step_input)

            # Greedy choice: most likely character at the last position.
            next_id = logits[0, -1].argmax().item()
            ids.append(next_id)

            # End-of-sequence terminates decoding early.
            if next_id == char_tokenizer.eos_token_id:
                break

    return char_tokenizer.decode(ids, skip_special_tokens=True)

# Example usage
result = recognize_text("assamese_text.jpg")
print(f"Recognized text: {result}")
282
  ```
283
 
284
  ## Vocabulary
 
340
  - [KhasiBERT](https://huggingface.co/MWirelabs/KhasiBERT-110M) - Khasi language model
341
  - [NE-BERT](https://huggingface.co/MWirelabs/NE-BERT) - 9 Northeast languages
342
  - [Kren-M](https://huggingface.co/MWirelabs/Kren-M) - Khasi-English conversational AI
 
343
  - **AssameseOCR** - Assamese text recognition