File size: 4,902 Bytes
ee710a2
3e35703
2312221
3cf6a99
3e35703
3cf6a99
3e35703
 
524cb91
3e35703
3cf6a99
3e35703
2312221
3e35703
 
 
524cb91
3e35703
3cf6a99
3e35703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3cf6a99
524cb91
3cf6a99
524cb91
3e35703
 
 
 
 
 
 
 
 
 
 
3cf6a99
524cb91
3cf6a99
 
 
3e35703
3cf6a99
 
524cb91
3e35703
 
 
 
 
 
3cf6a99
 
3e35703
 
 
 
3cf6a99
 
 
3e35703
 
 
 
 
 
 
 
3cf6a99
524cb91
3cf6a99
 
3e35703
 
 
524cb91
3cf6a99
3e35703
524cb91
2312221
 
3e35703
3cf6a99
 
2312221
 
 
 
524cb91
2312221
3e35703
 
2312221
 
3cf6a99
3e35703
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127


import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sentence_transformers import SentenceTransformer, util
import fitz  # PyMuPDF for PDF handling
import torch
import docx  # For DOCX handling

# Load pre-trained models at import time.
# NOTE(review): both calls download weights on first run via the Hugging Face
# and sentence-transformers caches — expect a slow first start.
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
ner_model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# "simple" aggregation merges word-piece tokens into whole-entity spans,
# so results carry an 'entity_group' key (PER/ORG/LOC/MISC) used below.
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=tokenizer, aggregation_strategy="simple")

# Sentence-embedding model used to match user-typed labels to known ones.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to extract text from a PDF file with error handling
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        The concatenated page text, stripped of surrounding whitespace,
        or an "Error ..." message string if extraction fails (callers
        detect failure via the "Error" prefix).
    """
    try:
        # Context manager guarantees the document handle is closed even if
        # a page raises mid-iteration (the original leaked the handle), and
        # join() avoids quadratic '+=' concatenation on large PDFs.
        with fitz.open(file_path) as doc:
            text = "".join(page.get_text() for page in doc)
        return text.strip()
    except Exception as e:
        # Keep the exact message format: process_text checks startswith("Error").
        return f"Error extracting text from PDF: {str(e)}"

# Function to extract text from a DOCX file
def extract_text_from_docx(file_path):
    """Read a DOCX file and return its paragraphs joined by newlines.

    Args:
        file_path: Path to the DOCX file on disk.

    Returns:
        The stripped paragraph text, or an "Error ..." message string
        when the file cannot be parsed.
    """
    try:
        document = docx.Document(file_path)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        return "\n".join(paragraph_texts).strip()
    except Exception as e:
        return f"Error extracting text from DOCX: {str(e)}"

# Function to calculate cosine similarity
def calculate_similarity(input_label, predefined_labels):
    """Find the predefined label most similar to *input_label*.

    Args:
        input_label: The user-provided label string.
        predefined_labels: List of candidate label strings.

    Returns:
        A (best_label, score) tuple, where score is the cosine similarity
        between the two sentence embeddings.
    """
    query_vec = embedding_model.encode(input_label, convert_to_tensor=True)
    candidate_vecs = embedding_model.encode(predefined_labels, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_vec, candidate_vecs)
    best_idx = int(torch.argmax(scores))
    return predefined_labels[best_idx], scores[0][best_idx].item()

# Function to map recognized entities to custom labels with cosine similarity
def map_labels_with_similarity(input_label, label_map):
    """Resolve *input_label* to a key of *label_map* via embedding similarity.

    Args:
        input_label: The user-provided label string.
        label_map: Mapping of known label names to NER entity groups.

    Returns:
        The best-matching key when its similarity exceeds 0.7, else None.
    """
    candidates = list(label_map.keys())
    best_label, score = calculate_similarity(input_label, candidates)
    # 0.7 threshold: below it the match is considered too weak to trust.
    return best_label if score > 0.7 else None

# Function to process the text and extract entities based on custom labels
def process_text(file, labels):
    """Extract named entities from an uploaded PDF/DOCX under user labels.

    Args:
        file: Uploaded file object exposing a ``.name`` path attribute.
        labels: Comma-separated label names typed by the user.

    Returns:
        A human-readable string with one "Label: entities" line per
        requested label, or an error/usage message string.
    """
    # Determine the file type and extract text accordingly. Compare the
    # extension case-insensitively so ".PDF" / ".Docx" uploads also work
    # (the original check silently rejected upper-case extensions).
    lowered_name = file.name.lower()
    if lowered_name.endswith(".pdf"):
        text = extract_text_from_pdf(file.name)
    elif lowered_name.endswith(".docx"):
        text = extract_text_from_docx(file.name)
    else:
        return "Unsupported file type. Please upload a PDF or DOCX file."

    if text.startswith("Error"):
        return text  # Propagate the extraction error message as-is

    # Custom label -> NER entity groups produced by the pipeline
    label_map = {
        "Name": ["PER"],
        "Organization": ["ORG"],
        "Location": ["LOC"],
        "Address": ["LOC"],  # Address mapped to Location
        "Project": ["MISC"],
        "Education": ["MISC"],
    }

    # Parse the user-supplied labels, dropping empty entries
    requested_labels = [label.strip().capitalize() for label in labels.split(",") if label.strip()]
    if not requested_labels:
        return "No valid labels provided. Please enter valid labels to extract."

    # Holds the extracted entity strings per requested label
    extracted_info = {label: [] for label in requested_labels}

    # Resolve each requested label to its best label_map key ONCE.
    # The original re-ran this embedding comparison (two model encodes)
    # inside the entity loop — O(entities * labels) redundant work.
    resolved_labels = {
        input_label: map_labels_with_similarity(input_label, label_map)
        for input_label in requested_labels
    }

    # Perform NER on the extracted text
    ner_results = ner_pipeline(text)

    # Collect each entity under every requested label whose resolved
    # entity groups include the entity's group.
    for entity in ner_results:
        entity_text = entity['word'].replace("##", "")
        entity_group = entity['entity_group']
        for input_label, best_match_label in resolved_labels.items():
            if best_match_label and entity_group in label_map[best_match_label]:
                extracted_info[input_label].append(entity_text)

    # Format the output: de-duplicated, sorted entities per label
    lines = []
    for label, entities in extracted_info.items():
        if entities:
            unique_entities = sorted(set(entities))
            lines.append(f"{label}: {', '.join(unique_entities)}")
        else:
            lines.append(f"{label}: No information found.")
    return "\n".join(lines)

# Create Gradio components (module-level so they exist before the Interface)
file_input = gr.File(label="Upload a PDF or DOCX file")
label_input = gr.Textbox(label="Enter labels to extract (comma-separated)")
output_text = gr.Textbox(label="Extracted Information")

# Create the Gradio interface.
# Inputs are passed positionally to process_text(file, labels), so the
# order of the `inputs` list must match the function signature.
iface = gr.Interface(
    fn=process_text,
    inputs=[file_input, label_input],
    outputs=output_text,
    title="NER with Custom Labels from PDF or DOCX",
    description="Upload a PDF or DOCX file and extract entities based on custom labels."
)

# Launch the Gradio interface (blocks until the server is stopped)
iface.launch()