File size: 1,909 Bytes
286a29c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import spacy
from spacy.tokens import DocBin
import json

def compare_ner_pipelines(binary_file_path, pipeline_names, output_file_path):
    """Compare the entities predicted by several spaCy pipelines on the same texts.

    Each document stored in the ``DocBin`` at *binary_file_path* is re-processed
    from its raw text by every pipeline. Per document, the entities predicted
    by all pipelines ("common") and the remaining entities of each pipeline
    ("unique") are collected, and the full report is written to
    *output_file_path* as JSON.

    Only ``doc.text`` of the stored documents is used; any annotations already
    present in the ``DocBin`` do not take part in the comparison.

    An entity is identified by ``(text, start_char, end_char, label)``, so two
    pipelines agree on an entity only if both the span and the label match.

    Args:
        binary_file_path: Path of a serialized ``DocBin`` file.
        pipeline_names: Non-empty iterable of names or paths accepted by
            ``spacy.load``. The position of a pipeline in this iterable is
            its key in ``"unique_entities_per_pipeline"`` (written as a
            string key by ``json.dump``).
        output_file_path: Destination of the JSON report. The file is
            written with UTF-16 encoding.

    Raises:
        ValueError: If *pipeline_names* is empty.
    """
    # Validate before loading anything: loading models is expensive and an
    # empty list would otherwise fail later with an opaque IndexError.
    pipeline_names = list(pipeline_names)
    if not pipeline_names:
        raise ValueError("pipeline_names must contain at least one pipeline")

    nlp_pipelines = [spacy.load(name) for name in pipeline_names]

    # Only the raw text of each stored doc is read below, so the vocab of the
    # first pipeline is sufficient for deserialization.
    doc_bin = DocBin().from_disk(binary_file_path)
    docs = doc_bin.get_docs(nlp_pipelines[0].vocab)

    def extract_entities(doc):
        """Return the entities of *doc* as a set of hashable tuples."""
        return {(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}

    def in_document_order(entities):
        """Return *entities* as a list sorted by position, then label and text.

        Set iteration order is arbitrary and differs between runs; sorting
        makes the JSON report reproducible.
        """
        return sorted(entities, key=lambda ent: (ent[1], ent[2], ent[3], ent[0]))

    all_entities_comparison = []
    for doc in docs:
        text = doc.text
        # One entity set per pipeline, in the same order as pipeline_names.
        entities_per_pipeline = [extract_entities(nlp(text)) for nlp in nlp_pipelines]

        # Entities every pipeline produced; the rest is unique per pipeline.
        common_entities = set.intersection(*entities_per_pipeline)

        all_entities_comparison.append({
            "document_text": text,
            "common_entities": in_document_order(common_entities),
            "unique_entities_per_pipeline": {
                i: in_document_order(ents - common_entities)
                for i, ents in enumerate(entities_per_pipeline)
            },
        })

    # NOTE: the report is UTF-16 encoded; readers must open it with
    # encoding="utf-16". Kept as-is so existing consumers keep working.
    with open(output_file_path, 'w', encoding="utf-16") as f:
        json.dump(all_entities_comparison, f, indent=4, ensure_ascii=False)

    print(f"Comparison results saved to {output_file_path}")

# Example usage
def main():
    """Run the comparison for two trained models on one sample corpus.

    The model directories and file names are hard-coded for the author's
    local Windows setup; adjust them before running elsewhere.
    """
    base_path = r"E:\ICIST-2024-models\spacy-tr\spacy-tr"
    model_suffixes = (r"\output20\model-best", r"\output17\model-best")
    # Plain string concatenation, so the paths passed on are exactly
    # base_path followed by each suffix.
    model_dirs = [base_path + suffix for suffix in model_suffixes]
    compare_ner_pipelines("SRP19101_1.spacy", model_dirs, "compare_results_nk.json")

# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()