File size: 4,902 Bytes
ee710a2 3e35703 2312221 3cf6a99 3e35703 3cf6a99 3e35703 524cb91 3e35703 3cf6a99 3e35703 2312221 3e35703 524cb91 3e35703 3cf6a99 3e35703 3cf6a99 524cb91 3cf6a99 524cb91 3e35703 3cf6a99 524cb91 3cf6a99 3e35703 3cf6a99 524cb91 3e35703 3cf6a99 3e35703 3cf6a99 3e35703 3cf6a99 524cb91 3cf6a99 3e35703 524cb91 3cf6a99 3e35703 524cb91 2312221 3e35703 3cf6a99 2312221 524cb91 2312221 3e35703 2312221 3cf6a99 3e35703 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sentence_transformers import SentenceTransformer, util
import fitz # PyMuPDF for PDF handling
import torch
import docx # For DOCX handling
# Load pre-trained models (done once at import time; both downloads are
# cached locally by their respective libraries after the first run).
# NER model: BERT-large fine-tuned on CoNLL-03 (labels PER/ORG/LOC/MISC).
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
ner_model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# aggregation_strategy="simple" merges word-piece tokens into whole-word
# entities, each carrying an 'entity_group' key consumed by process_text().
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=tokenizer, aggregation_strategy="simple")
# Small sentence-embedding model used to fuzzy-match user-typed labels
# against the predefined label names via cosine similarity.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Function to extract text from a PDF file with error handling
def extract_text_from_pdf(file_path):
    """Extract all text from a PDF file.

    Parameters
    ----------
    file_path : str
        Path to the PDF file on disk.

    Returns
    -------
    str
        The concatenated text of every page, stripped of surrounding
        whitespace — or a message starting with "Error" on failure
        (process_text() checks for that prefix).
    """
    try:
        # Context manager guarantees the document handle is closed even
        # if a page's get_text() raises — the original leaked the handle.
        with fitz.open(file_path) as doc:
            # join() is linear; repeated += on a string is quadratic.
            text = "".join(page.get_text() for page in doc)
        return text.strip()
    except Exception as e:
        # Errors are surfaced as a string, not raised: the Gradio handler
        # displays them directly to the user.
        return f"Error extracting text from PDF: {str(e)}"
# Function to extract text from a DOCX file
def extract_text_from_docx(file_path):
    """Return the plain text of a DOCX file, paragraphs joined by newlines.

    On any failure, returns a message beginning with "Error" instead of
    raising, matching the PDF extractor's contract.
    """
    try:
        document = docx.Document(file_path)
        paragraph_texts = (para.text for para in document.paragraphs)
        return "\n".join(paragraph_texts).strip()
    except Exception as e:
        return f"Error extracting text from DOCX: {str(e)}"
# Function to calculate cosine similarity
def calculate_similarity(input_label, predefined_labels):
    """Find the predefined label most similar to *input_label*.

    Both sides are encoded with the module-level sentence-transformer
    model and compared by cosine similarity.

    Returns
    -------
    tuple
        (best_matching_label, similarity_score) where the score is the
        cosine similarity of the winning candidate.
    """
    query_vec = embedding_model.encode(input_label, convert_to_tensor=True)
    candidate_vecs = embedding_model.encode(predefined_labels, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_vec, candidate_vecs)
    best_idx = torch.argmax(scores).item()
    best_score = scores[0][best_idx].item()
    return predefined_labels[best_idx], best_score
# Function to map recognized entities to custom labels with cosine similarity
def map_labels_with_similarity(input_label, label_map):
    """Resolve a user-supplied label to one of *label_map*'s keys.

    Returns the closest predefined label when its cosine similarity with
    *input_label* exceeds 0.7, otherwise None (no confident match).
    """
    candidates = list(label_map)
    best_label, score = calculate_similarity(input_label, candidates)
    # 0.7 is the cut-off below which a fuzzy match is rejected.
    return best_label if score > 0.7 else None
# Function to process the text and extract entities based on custom labels
def process_text(file, labels):
    """Run NER over an uploaded document and group entities by custom labels.

    Parameters
    ----------
    file : gradio file object
        The uploaded file; only .pdf and .docx are supported.
    labels : str
        Comma-separated user labels, e.g. "Name, Organization".

    Returns
    -------
    str
        One report line per requested label, or a human-readable error
        message when extraction or input validation fails.
    """
    # Dispatch on file extension to the matching text extractor.
    if file.name.endswith(".pdf"):
        text = extract_text_from_pdf(file.name)
    elif file.name.endswith(".docx"):
        text = extract_text_from_docx(file.name)
    else:
        return "Unsupported file type. Please upload a PDF or DOCX file."
    if text.startswith("Error"):
        return text  # Propagate the extractor's error message verbatim.
    # Define the custom label mapping (friendly label -> NER entity groups).
    label_map = {
        "Name": ["PER"],
        "Organization": ["ORG"],
        "Location": ["LOC"],
        "Address": ["LOC"],  # Address mapped to Location
        "Project": ["MISC"],
        "Education": ["MISC"],
    }
    # Split the custom labels provided by the user and handle potential input issues
    requested_labels = [label.strip().capitalize() for label in labels.split(",") if label.strip()]
    if not requested_labels:
        return "No valid labels provided. Please enter valid labels to extract."
    # Resolve each requested label to its best predefined match ONCE.
    # The original called map_labels_with_similarity inside the per-entity
    # loop, re-encoding the labels (two embedding passes) for every entity
    # even though the result depends only on the label.
    resolved = {
        input_label: map_labels_with_similarity(input_label, label_map)
        for input_label in requested_labels
    }
    # Initialize a dictionary to hold the extracted information
    extracted_info = {label: [] for label in requested_labels}
    # Perform NER on the extracted text
    ner_results = ner_pipeline(text)
    # Process the NER results
    for entity in ner_results:
        entity_text = entity['word'].replace("##", "")
        entity_group = entity['entity_group']
        for input_label, best_match_label in resolved.items():
            if best_match_label and entity_group in label_map[best_match_label]:
                extracted_info[input_label].append(entity_text)
    # Format the output: one line per label, duplicates removed and sorted.
    lines = []
    for label, entities in extracted_info.items():
        if entities:
            unique_entities = sorted(set(entities))
            lines.append(f"{label}: {', '.join(unique_entities)}")
        else:
            lines.append(f"{label}: No information found.")
    return "\n".join(lines)
# Create Gradio components: file upload, comma-separated label input,
# and a read-only textbox for the extraction report.
file_input = gr.File(label="Upload a PDF or DOCX file")
label_input = gr.Textbox(label="Enter labels to extract (comma-separated)")
output_text = gr.Textbox(label="Extracted Information")
# Create the Gradio interface: process_text(file, labels) -> report string.
iface = gr.Interface(
    fn=process_text,
    inputs=[file_input, label_input],
    outputs=output_text,
    title="NER with Custom Labels from PDF or DOCX",
    description="Upload a PDF or DOCX file and extract entities based on custom labels."
)
# Launch the Gradio interface (blocks; serves the web UI).
iface.launch()
|