from transformers import pipeline, AutoTokenizer
import bz2
import json

MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual-light"

# Load the tokenizer and the custom NER pipeline shipped with the model
ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
ner_pipeline = pipeline(
    "generic-ner",
    model=MODEL_NAME,
    tokenizer=ner_tokenizer,
    trust_remote_code=True,
    device="cpu",
)


def process_archive(lingproc_path):
    """
    Processes a linguistic-processing archive to reconstruct each document's
    full text and collect per-sentence token offsets.

    Args:
        lingproc_path (str): Path to the linguistic-processing .jsonl.bz2 archive.

    Returns:
        List of tuples: (doc_id, full_text, sentences), where sentences is a
        list of {"start", "end", "tokens"} dicts whose character offsets index
        into full_text.
    """
    results = []
    with bz2.open(lingproc_path, mode="rt", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            doc_id = data.get("id")

            # Map each token's character offset to its surface form
            offset_token_map = {}
            for sent in data.get("sents", []):
                for token in sent.get("tok", []):
                    offset_token_map[token["o"]] = token["t"]

            # Rebuild the full text from sorted offsets, padding gaps with
            # spaces so every character offset stays valid. Do not strip the
            # result: removing leading padding would shift all offsets.
            full_text_parts = []
            last_end = 0
            for offset in sorted(offset_token_map.keys()):
                token = offset_token_map[offset]
                if offset > last_end:
                    full_text_parts.append(" " * (offset - last_end))
                full_text_parts.append(token)
                last_end = offset + len(token)
            full_text = "".join(full_text_parts)

            # Record each sentence's character span and its tokens
            sentences = []
            for sent in data.get("sents", []):
                tokens = sent.get("tok", [])
                if not tokens:
                    continue
                start = tokens[0]["o"]
                end = tokens[-1]["o"] + len(tokens[-1]["t"])
                newtokens = [
                    {"t": token["t"], "o": token["o"], "l": len(token["t"])}
                    for token in tokens
                ]
                sentences.append({"start": start, "end": end, "tokens": newtokens})
            results.append((doc_id, full_text, sentences))
    return results


processed_cis = process_archive("../../data/lematin-1885.jsonl.bz2")

for doc_id, full_text, sentences in processed_cis:
    print(f"Document ID: {doc_id}")
    for sentence in sentences:
        start = sentence["start"]
        end = sentence["end"]
        tokens = sentence["tokens"]
        sentence_text = full_text[start:end]
        # Pass the pre-tokenized surface forms alongside the sentence text
        tokens_texts = [full_text[token["o"]:token["o"] + token["l"]] for token in tokens]
        entities = ner_pipeline(sentence_text, tokens=tokens_texts)
        for entity in entities:
            # Entity offsets are relative to the sentence; convert them to
            # absolute offsets into the full text.
            abs_start = start + entity["lOffset"]
            abs_end = start + entity["rOffset"]
            entity_text = full_text[abs_start:abs_end]
            entity_surface = entity["surface"]
            # Sanity check: the slice of the full text must match the surface
            # form returned by the pipeline.
            assert entity_text == entity_surface, (
                f"Entity text mismatch: {entity_text} != {entity_surface}"
            )
            print(
                f"{doc_id}: {entity_text} -- surface: {entity_surface} -- "
                f"{entity['type']} -- {abs_start} - {abs_end}"
            )
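
# A minimal usage sketch, assuming the generic-ner pipeline also accepts a
# plain string without the tokens= argument (suggested by the script's earlier
# draft, but not confirmed here). The sample sentence is a hypothetical
# illustration; the returned lOffset/rOffset are then relative to the input
# string itself rather than to a reconstructed full text.
sample_sentence = "Le Matin est un quotidien publié à Paris."  # hypothetical input
sample_entities = ner_pipeline(sample_sentence)
for ent in sample_entities:
    print(f"{ent['surface']} -- {ent['type']} -- {ent['lOffset']}-{ent['rOffset']}")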