Upload folder using huggingface_hub
- label_to_id.json +25 -0
- lc_infer.py +287 -0
- model/config.json +78 -0
- model/model.safetensors +3 -0
- model/special_tokens_map.json +37 -0
- model/tokenizer.json +0 -0
- model/tokenizer_config.json +245 -0
- model/trainer_state.json +0 -0
- model/training_args.bin +3 -0
label_to_id.json ADDED
@@ -0,0 +1,25 @@
+{
+  "assamese": 0,
+  "bengali": 1,
+  "bodo": 2,
+  "dogri": 3,
+  "english": 4,
+  "gujarati": 5,
+  "hindi": 6,
+  "kannada": 7,
+  "kashmiri": 8,
+  "kokani": 9,
+  "maithili": 10,
+  "malayalam": 11,
+  "manipuri": 12,
+  "marathi": 13,
+  "nepali": 14,
+  "oriya": 15,
+  "punjabi": 16,
+  "sanskrit": 17,
+  "santali": 18,
+  "sindhi": 19,
+  "tamil": 20,
+  "telugu": 21,
+  "urdu": 22
+}
lc_infer.py ADDED
@@ -0,0 +1,287 @@
+import os
+import json
+import argparse
+import logging
+from tqdm import tqdm
+from typing import List, Dict
+
+import torch
+import torch.distributed as dist
+from torch.utils.data import DataLoader
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+
+# ===========================
+# PATH RESOLUTION (NO HARDCODE)
+# ===========================
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+MODEL_PATH = os.path.join(SCRIPT_DIR, "model")
+LABEL_MAP_PATH = os.path.join(SCRIPT_DIR, "label_to_id.json")
+
+
+# ===========================
+# Logging
+# ===========================
+
+def setup_logging(output_dir):
+    os.makedirs(output_dir, exist_ok=True)
+    log_path = os.path.join(output_dir, "language_classifier.log")
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s | %(levelname)s | %(message)s",
+        handlers=[
+            logging.FileHandler(log_path),
+            logging.StreamHandler()
+        ],
+    )
+
+    logging.info(f"Logging to: {log_path}")
+
+
+# ===========================
+# DDP SETUP
+# ===========================
+
+def setup_distributed():
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+        dist.init_process_group(backend="nccl")
+        rank = int(os.environ["RANK"])
+        world_size = int(os.environ["WORLD_SIZE"])
+        local_rank = int(os.environ.get("LOCAL_RANK", 0))
+        torch.cuda.set_device(local_rank)
+        return True, rank, world_size, local_rank
+    return False, 0, 1, 0
+
+
+def is_main_process():
+    return (
+        not dist.is_available()
+        or not dist.is_initialized()
+        or dist.get_rank() == 0
+    )
+
+
+# ===========================
+# Input Discovery
+# ===========================
+
+def find_all_jsonl_files(path: str) -> List[str]:
+    if os.path.isfile(path):
+        if not path.endswith(".jsonl"):
+            raise ValueError(f"Input file must be .jsonl: {path}")
+        return [path]
+
+    if not os.path.isdir(path):
+        raise ValueError(f"Input path does not exist: {path}")
+
+    files = []
+    for root, _, filenames in os.walk(path):
+        for fn in filenames:
+            if fn.endswith(".jsonl"):
+                files.append(os.path.join(root, fn))
+
+    if not files:
+        raise RuntimeError(f"No .jsonl files found inside: {path}")
+
+    return sorted(files)
+
+
+# ===========================
+# Dataset (Streaming, DDP-safe)
+# ===========================
+
+class JsonlIterableDataset(torch.utils.data.IterableDataset):
+    def __init__(self, input_path: str, text_key: str, rank: int, world_size: int):
+        self.files = find_all_jsonl_files(input_path)
+        self.text_key = text_key
+        self.rank = rank
+        self.world_size = world_size
+
+    def __iter__(self):
+        worker_info = torch.utils.data.get_worker_info()
+        worker_id = worker_info.id if worker_info else 0
+        num_workers = worker_info.num_workers if worker_info else 1
+
+        global_worker_id = self.rank * num_workers + worker_id
+        global_num_workers = self.world_size * num_workers
+
+        json_loads = json.loads
+        text_key = self.text_key
+
+        for path in self.files:
+            with open(path, "r", encoding="utf-8", errors="ignore") as f:
+                i = 0
+                for line in f:
+                    if i == global_worker_id:
+                        try:
+                            obj = json_loads(line)
+                        except json.JSONDecodeError:
+                            pass
+                        else:
+                            text = obj.get(text_key)
+                            if isinstance(text, str) and text.strip():
+                                obj["__lc_text"] = text
+                                yield obj
+
+                    i += 1
+                    if i == global_num_workers:
+                        i = 0
+
+
+# ===========================
+# Collator
+# ===========================
+
+class Collator:
+    def __init__(self, tokenizer, max_length=512):
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+
+    def __call__(self, batch):
+        if not batch:
+            return None
+
+        texts = [x["__lc_text"] for x in batch]
+
+        enc = self.tokenizer(
+            texts,
+            padding=True,
+            truncation=True,
+            max_length=self.max_length,
+            return_tensors="pt",
+        )
+
+        return {"enc": enc, "raw": batch}
+
+
+# ===========================
+# Main
+# ===========================
+
+def main():
+    parser = argparse.ArgumentParser("Language Classifier Inference")
+
+    parser.add_argument("--input_path", required=True)
+    parser.add_argument("--output_path", required=True)
+    parser.add_argument("--text_key", required=True)
+
+    parser.add_argument("--batch_size", type=int, default=2048)
+    parser.add_argument("--max_length", type=int, default=512)
+    parser.add_argument("--num_workers", type=int, default=8)
+
+    args = parser.parse_args()
+
+    setup_logging(args.output_path)
+
+    # --------------------
+    # DDP
+    # --------------------
+    distributed, rank, world_size, local_rank = setup_distributed()
+    device = f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu"
+
+    logging.info(f"Distributed={distributed} | World size={world_size}")
+
+    # --------------------
+    # Load label map
+    # --------------------
+    if not os.path.isfile(LABEL_MAP_PATH):
+        raise RuntimeError(f"Missing label map: {LABEL_MAP_PATH}")
+
+    with open(LABEL_MAP_PATH, "r", encoding="utf-8") as f:
+        label_map = json.load(f)
+
+    id_to_label = {v: k for k, v in label_map.items()}
+
+    # --------------------
+    # Load model
+    # --------------------
+    if not os.path.isdir(MODEL_PATH):
+        raise RuntimeError(f"Model directory not found: {MODEL_PATH}")
+
+    logging.info(f"Loading model from {MODEL_PATH}")
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
+    model.to(device)
+    model.eval()
+
+    # --------------------
+    # Dataset & Loader
+    # --------------------
+    dataset = JsonlIterableDataset(
+        args.input_path,
+        args.text_key,
+        rank=rank,
+        world_size=world_size,
+    )
+
+    dataloader = DataLoader(
+        dataset,
+        batch_size=args.batch_size,
+        num_workers=args.num_workers,
+        collate_fn=Collator(tokenizer, args.max_length),
+        pin_memory=True,
+        persistent_workers=True,
+        prefetch_factor=4,
+    )
+
+    # --------------------
+    # Accumulators
+    # --------------------
+    outputs: Dict[int, List[dict]] = {k: [] for k in id_to_label.keys()}
+
+    # --------------------
+    # Inference
+    # --------------------
+    iterator = tqdm(dataloader, desc="Classifying") if is_main_process() else dataloader
+
+    with torch.no_grad():
+        for batch in iterator:
+            if batch is None:
+                continue
+
+            try:
+                enc = {k: v.to(device) for k, v in batch["enc"].items()}
+                raw = batch["raw"]
+
+                logits = model(**enc).logits
+                preds = torch.argmax(logits, dim=-1).cpu().tolist()
+
+                for obj, pred in zip(raw, preds):
+                    obj = dict(obj)
+                    obj.pop("__lc_text", None)
+                    obj["predicted_id"] = pred
+                    obj["predicted_language"] = id_to_label[pred]
+                    outputs[pred].append(obj)
+
+            except Exception as e:
+                logging.exception(f"Batch failed: {e}")
+
+    # --------------------
+    # Write outputs
+    # --------------------
+    os.makedirs(args.output_path, exist_ok=True)
+
+    for cls_id, cls_name in id_to_label.items():
+        out_path = os.path.join(
+            args.output_path,
+            f"{cls_name}.rank{rank}.jsonl"
+        )
+
+        logging.info(f"Writing {len(outputs[cls_id])} samples to {out_path}")
+
+        with open(out_path, "w", encoding="utf-8") as f:
+            for obj in outputs[cls_id]:
+                f.write(json.dumps(obj, ensure_ascii=False) + "\n")
+
+    if distributed:
+        dist.barrier()
+        dist.destroy_process_group()
+
+    logging.info("Language classification completed successfully.")
+
+
+if __name__ == "__main__":
+    main()
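A quick way to exercise the script above without a cluster is a single-process smoke test (a minimal sketch, not part of the commit; it assumes the repo root as the working directory so that ./model and ./label_to_id.json resolve, and the sample sentence is illustrative):

import json
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("model")
model = AutoModelForSequenceClassification.from_pretrained("model").eval()

with open("label_to_id.json", encoding="utf-8") as f:
    id_to_label = {v: k for k, v in json.load(f).items()}

enc = tokenizer(["यह एक परीक्षण वाक्य है।"], return_tensors="pt",
                truncation=True, max_length=512)
with torch.no_grad():
    pred = model(**enc).logits.argmax(dim=-1).item()

# ids 0-22 are named in label_to_id.json; the head's 24th output (id 23) is not
print(pred, id_to_label.get(pred, f"LABEL_{pred}"))

For multi-GPU runs, setup_distributed() keys off the RANK / WORLD_SIZE / LOCAL_RANK environment variables, which torchrun sets, e.g. torchrun --nproc_per_node=8 lc_infer.py --input_path data/ --output_path out/ --text_key text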
model/config.json ADDED
@@ -0,0 +1,78 @@
+{
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "dtype": "float32",
+  "embedding_size": 768,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2",
+    "3": "LABEL_3",
+    "4": "LABEL_4",
+    "5": "LABEL_5",
+    "6": "LABEL_6",
+    "7": "LABEL_7",
+    "8": "LABEL_8",
+    "9": "LABEL_9",
+    "10": "LABEL_10",
+    "11": "LABEL_11",
+    "12": "LABEL_12",
+    "13": "LABEL_13",
+    "14": "LABEL_14",
+    "15": "LABEL_15",
+    "16": "LABEL_16",
+    "17": "LABEL_17",
+    "18": "LABEL_18",
+    "19": "LABEL_19",
+    "20": "LABEL_20",
+    "21": "LABEL_21",
+    "22": "LABEL_22",
+    "23": "LABEL_23"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_10": 10,
+    "LABEL_11": 11,
+    "LABEL_12": 12,
+    "LABEL_13": 13,
+    "LABEL_14": 14,
+    "LABEL_15": 15,
+    "LABEL_16": 16,
+    "LABEL_17": 17,
+    "LABEL_18": 18,
+    "LABEL_19": 19,
+    "LABEL_2": 2,
+    "LABEL_20": 20,
+    "LABEL_21": 21,
+    "LABEL_22": 22,
+    "LABEL_23": 23,
+    "LABEL_3": 3,
+    "LABEL_4": 4,
+    "LABEL_5": 5,
+    "LABEL_6": 6,
+    "LABEL_7": 7,
+    "LABEL_8": 8,
+    "LABEL_9": 9
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 3,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "transformers_version": "4.56.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 250000
+}
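The id2label / label2id maps in this config are the generic placeholders transformers writes when no label names are supplied at training time; the human-readable names live in label_to_id.json, which covers ids 0-22 of the 24-way head. If named predictions are wanted straight from the model config, one option is to patch it at load time (a sketch under those assumptions, not part of the commit):

import json
from transformers import AutoConfig, AutoModelForSequenceClassification

with open("label_to_id.json", encoding="utf-8") as f:
    label_to_id = json.load(f)          # 23 named classes, ids 0-22

id2label = {i: name for name, i in label_to_id.items()}
id2label[23] = "LABEL_23"               # the 24th head output has no name in this repo

config = AutoConfig.from_pretrained("model")
config.id2label = id2label
config.label2id = {name: i for i, name in id2label.items()}

# num_labels stays 24, so the 24-way classifier weights load without resizing
model = AutoModelForSequenceClassification.from_pretrained("model", config=config)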
model/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37cf7198a098b1fa6825ba86a1bb80c950ffc48d83e030abc72d1f939fe75180
+size 1112262872
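As a back-of-the-envelope consistency check (not part of the commit), the pointer's 1,112,262,872 bytes line up with the config: a BERT encoder with vocab_size 250000, hidden_size 768, 12 layers and a 24-way head comes to roughly 277M float32 parameters before biases, LayerNorms and the pooler, i.e. about 1.11 GB:

# rough float32 size from model/config.json; ignores biases, LayerNorms, pooler
V, H, I, L, P, C = 250_000, 768, 3072, 12, 512, 24

embeddings = V * H + P * H + 2 * H       # token + position + token-type tables
per_layer = 4 * H * H + 2 * H * I        # QKV/output projections + FFN
head = H * C                             # 24-way classification layer
params = embeddings + L * per_layer + head
print(params, params * 4 / 1e9)          # ~277M params, ~1.11 GB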
model/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
model/tokenizer.json ADDED
The diff for this file is too large to render.
model/tokenizer_config.json ADDED
@@ -0,0 +1,245 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<as>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<bd>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<bn>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<dg>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<en>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<gom>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<gu>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<hi>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<kha>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<kn>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<ks>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<mai>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<ml>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<mni>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "19": {
+      "content": "<mr>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "20": {
+      "content": "<ne>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "21": {
+      "content": "<or>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "22": {
+      "content": "<pa>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "23": {
+      "content": "<sa>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "24": {
+      "content": "<sd>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "25": {
+      "content": "<sat>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "26": {
+      "content": "<ta>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "27": {
+      "content": "<te>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "28": {
+      "content": "<ur>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "[UNK]"
+}
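Besides the five standard BERT specials, added_tokens_decoder registers 24 language-tag tokens, <as> through <ur> at ids 5-28; that is one more tag than the 23 names in label_to_id.json (the <kha> tag has no entry there). A quick round-trip check against the ids above (a sketch, assuming the repo root as the working directory):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("model")

# ids taken directly from added_tokens_decoder
assert tok.convert_tokens_to_ids("[PAD]") == 3
assert tok.convert_tokens_to_ids("<hi>") == 12
assert tok.convert_tokens_to_ids("<ur>") == 28
print(tok.convert_ids_to_tokens([5, 13, 26]))  # ['<as>', '<kha>', '<ta>']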
model/trainer_state.json ADDED
The diff for this file is too large to render.
model/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60fb1107c5f308b237d9eccfa3a3329e204ba8a411f4504ef68e9fb6c6614542
+size 5841